diff --git a/Demos/AllBulletDemos/CMakeLists.txt b/Demos/AllBulletDemos/CMakeLists.txt
index b1ab5b9fe..0223818cc 100644
--- a/Demos/AllBulletDemos/CMakeLists.txt
+++ b/Demos/AllBulletDemos/CMakeLists.txt
@@ -57,7 +57,7 @@ SET(AllBulletDemos_SRCS
 IF (WIN32)
 	ADD_EXECUTABLE(AppAllBulletDemos
 		${AllBulletDemos_SRCS}
-		${BULLET_PHYSICS_SOURCE_DIR}/msvc/bullet.rc
+		${BULLET_PHYSICS_SOURCE_DIR}/build/bullet.rc
 	)
 ELSE()
 	ADD_EXECUTABLE(AppAllBulletDemos
diff --git a/Demos/BasicDemo/CMakeLists.txt b/Demos/BasicDemo/CMakeLists.txt
index 09568037b..253b8b47c 100644
--- a/Demos/BasicDemo/CMakeLists.txt
+++ b/Demos/BasicDemo/CMakeLists.txt
@@ -27,7 +27,7 @@ ADD_EXECUTABLE(AppBasicDemo
 		main.cpp
 		BasicDemo.cpp 
 		BasicDemo.h
-		${BULLET_PHYSICS_SOURCE_DIR}/msvc/bullet.rc
+		${BULLET_PHYSICS_SOURCE_DIR}/build/bullet.rc
 	)
 ELSE()
 	ADD_EXECUTABLE(AppBasicDemo
@@ -74,7 +74,7 @@ ELSE (USE_GLUT)
 		Win32BasicDemo.cpp
 		BasicDemo.cpp 
 		BasicDemo.h
-		${BULLET_PHYSICS_SOURCE_DIR}/msvc/bullet.rc
+		${BULLET_PHYSICS_SOURCE_DIR}/build/bullet.rc
 	)
 	
 	
diff --git a/Demos/Benchmarks/BenchmarkDemo.cpp b/Demos/Benchmarks/BenchmarkDemo.cpp
index 7c01e883e..b65a341cc 100644
--- a/Demos/Benchmarks/BenchmarkDemo.cpp
+++ b/Demos/Benchmarks/BenchmarkDemo.cpp
@@ -1252,21 +1252,24 @@ void	BenchmarkDemo::exitPhysics()
 		RagDoll* doll = m_ragdolls[i];
 		delete doll;
 	}
+    m_ragdolls.clear();
 
 	//cleanup in the reverse order of creation/initialization
-
-	//remove the rigidbodies from the dynamics world and delete them
-	for (i=m_dynamicsWorld->getNumCollisionObjects()-1; i>=0 ;i--)
-	{
-		btCollisionObject* obj = m_dynamicsWorld->getCollisionObjectArray()[i];
-		btRigidBody* body = btRigidBody::upcast(obj);
-		if (body && body->getMotionState())
-		{
-			delete body->getMotionState();
-		}
-		m_dynamicsWorld->removeCollisionObject( obj );
-		delete obj;
-	}
+    if (m_dynamicsWorld)
+    {
+        //remove the rigidbodies from the dynamics world and delete them
+        for (i=m_dynamicsWorld->getNumCollisionObjects()-1; i>=0 ;i--)
+        {
+            btCollisionObject* obj = m_dynamicsWorld->getCollisionObjectArray()[i];
+            btRigidBody* body = btRigidBody::upcast(obj);
+            if (body && body->getMotionState())
+            {
+                delete body->getMotionState();
+            }
+            m_dynamicsWorld->removeCollisionObject( obj );
+            delete obj;
+        }
+    }
 
 	//delete collision shapes
 	for (int j=0;j<m_collisionShapes.size();j++)
@@ -1274,20 +1277,26 @@ void	BenchmarkDemo::exitPhysics()
 		btCollisionShape* shape = m_collisionShapes[j];
 		delete shape;
 	}
+    m_collisionShapes.clear();
 
 	//delete dynamics world
 	delete m_dynamicsWorld;
+    m_dynamicsWorld=0;
 
 	//delete solver
 	delete m_solver;
+    m_solver=0;
 
 	//delete broadphase
 	delete m_overlappingPairCache;
+    m_overlappingPairCache=0;
 
 	//delete dispatcher
 	delete m_dispatcher;
+    m_dispatcher=0;
 
 	delete m_collisionConfiguration;
+    m_collisionConfiguration=0;
 
 	
 }
diff --git a/Demos/Benchmarks/CMakeLists.txt b/Demos/Benchmarks/CMakeLists.txt
index bcd771fef..ce6671923 100644
--- a/Demos/Benchmarks/CMakeLists.txt
+++ b/Demos/Benchmarks/CMakeLists.txt
@@ -27,7 +27,7 @@ IF (USE_GLUT)
 			main.cpp
 			BenchmarkDemo.cpp 
 			BenchmarkDemo.h
-			${BULLET_PHYSICS_SOURCE_DIR}/msvc/bullet.rc
+			${BULLET_PHYSICS_SOURCE_DIR}/build/bullet.rc
 		)
 	ELSE()
 		ADD_EXECUTABLE(AppBenchmarks
@@ -83,7 +83,7 @@ ELSE (USE_GLUT)
 		BenchmarkDemo.cpp 
 		BenchmarkDemo.h
 		Win32BenchmarkDemo.cpp
-		${BULLET_PHYSICS_SOURCE_DIR}/msvc/bullet.rc
+		${BULLET_PHYSICS_SOURCE_DIR}/build/bullet.rc
 	)
 ENDIF (USE_GLUT)
 ELSE (USE_GRAPHICAL_BENCHMARK)
diff --git a/Demos/Benchmarks/main.cpp b/Demos/Benchmarks/main.cpp
index 48e23bbdd..78c05e5bd 100644
--- a/Demos/Benchmarks/main.cpp
+++ b/Demos/Benchmarks/main.cpp
@@ -22,12 +22,12 @@ subject to the following restrictions:
 	#include "GlutStuff.h"
 	#include "GLDebugDrawer.h"
 	GLDebugDrawer	gDebugDrawer;
-#define benchmarkDemo benchmarkDemo4
+#define benchmarkDemo benchmarkDemo2
 #endif //USE_GRAPHICAL_BENCHMARK
 
 
 #define NUM_DEMOS 7
-#define NUM_TESTS 650
+#define NUM_TESTS 200
 
 extern bool gDisableDeactivation;
 
@@ -75,6 +75,7 @@ int main(int argc,char** argv)
 
 			
 		}
+        demoArray[d]->exitPhysics();
 	}
 
 	for (d=0;d<NUM_DEMOS;d++)
diff --git a/Demos/Benchmarks/premake4.lua b/Demos/Benchmarks/premake4.lua
new file mode 100644
index 000000000..a89d69e66
--- /dev/null
+++ b/Demos/Benchmarks/premake4.lua
@@ -0,0 +1,22 @@
+
+project "AppBenchmarks"
+
+if _OPTIONS["ios"] then
+	kind "WindowedApp"
+else	
+	kind "ConsoleApp"
+end
+
+includedirs {"../../src"}
+
+links {
+	"BulletDynamics","BulletCollision", "LinearMath"
+}
+
+language "C++"
+
+files {
+	"**.cpp",
+	"**.h",
+}
+
diff --git a/Demos/Box2dDemo/CMakeLists.txt b/Demos/Box2dDemo/CMakeLists.txt
index 0b434f652..0d40f7fea 100644
--- a/Demos/Box2dDemo/CMakeLists.txt
+++ b/Demos/Box2dDemo/CMakeLists.txt
@@ -25,7 +25,7 @@ IF (USE_GLUT)
 			main.cpp
 			Box2dDemo.cpp
 			Box2dDemo.h
-			${BULLET_PHYSICS_SOURCE_DIR}/msvc/bullet.rc
+			${BULLET_PHYSICS_SOURCE_DIR}/build/bullet.rc
 		)
 	ELSE()
 		ADD_EXECUTABLE(AppBox2dDemo
@@ -63,7 +63,7 @@ ELSE (USE_GLUT)
 		Win32Box2dDemo.cpp
 		Box2dDemo.cpp
 		Box2dDemo.h
-		${BULLET_PHYSICS_SOURCE_DIR}/msvc/bullet.rc
+		${BULLET_PHYSICS_SOURCE_DIR}/build/bullet.rc
 	)
 ENDIF (USE_GLUT)
 
diff --git a/Demos/CcdPhysicsDemo/CMakeLists.txt b/Demos/CcdPhysicsDemo/CMakeLists.txt
index 515d83c71..7324593fb 100644
--- a/Demos/CcdPhysicsDemo/CMakeLists.txt
+++ b/Demos/CcdPhysicsDemo/CMakeLists.txt
@@ -23,7 +23,7 @@ IF (WIN32)
 	ADD_EXECUTABLE(AppCcdPhysicsDemo
 		main.cpp
 		CcdPhysicsDemo.cpp
-		${BULLET_PHYSICS_SOURCE_DIR}/msvc/bullet.rc
+		${BULLET_PHYSICS_SOURCE_DIR}/build/bullet.rc
 	)
 ELSE()
 	ADD_EXECUTABLE(AppCcdPhysicsDemo
diff --git a/Demos/CharacterDemo/CMakeLists.txt b/Demos/CharacterDemo/CMakeLists.txt
index 175e11b56..d0cda1b4a 100644
--- a/Demos/CharacterDemo/CMakeLists.txt
+++ b/Demos/CharacterDemo/CMakeLists.txt
@@ -32,7 +32,7 @@ SET(CharacterDemo_SRCS
 IF (WIN32)
 	ADD_EXECUTABLE(AppCharacterDemo
 		${CharacterDemo_SRCS}
-		${BULLET_PHYSICS_SOURCE_DIR}/msvc/bullet.rc
+		${BULLET_PHYSICS_SOURCE_DIR}/build/bullet.rc
 	)
 ELSE()
 	ADD_EXECUTABLE(AppCharacterDemo
diff --git a/Demos/CollisionDemo/CMakeLists.txt b/Demos/CollisionDemo/CMakeLists.txt
index 6fbc03c28..0d6843f80 100644
--- a/Demos/CollisionDemo/CMakeLists.txt
+++ b/Demos/CollisionDemo/CMakeLists.txt
@@ -21,7 +21,7 @@ OpenGLSupport BulletDynamics  BulletCollision LinearMath    ${GLUT_glut_LIBRARY}
 IF (WIN32)
 	ADD_EXECUTABLE(AppCollisionDemo
 			CollisionDemo.cpp
-			${BULLET_PHYSICS_SOURCE_DIR}/msvc/bullet.rc
+			${BULLET_PHYSICS_SOURCE_DIR}/build/bullet.rc
 	)
 ELSE()
 	ADD_EXECUTABLE(AppCollisionDemo
diff --git a/Demos/CollisionDemo/CollisionDemo.cpp b/Demos/CollisionDemo/CollisionDemo.cpp
index 124cba30f..3f4b0504f 100644
--- a/Demos/CollisionDemo/CollisionDemo.cpp
+++ b/Demos/CollisionDemo/CollisionDemo.cpp
@@ -167,7 +167,7 @@ void CollisionDemo::displayCallback(void) {
 	   convexConvex.getClosestPoints(input, gjkOutput, 0); 
    }
     
-   	btScalar m[16];
+   	ATTRIBUTE_ALIGNED16(btScalar) m[16];
 	int i;
 
 	//m_ele = 21.2;
@@ -257,7 +257,7 @@ void CollisionDemo::displayCallback(void) {
 			if (mystate!=2 || i==myiter)
 			
 			{
-				btScalar m[16];
+				ATTRIBUTE_ALIGNED16(btScalar) m[16];
 						
 				input.m_transformA.getOpenGLMatrix( m );
 				//m_shapeDrawer->drawOpenGL(m,shapePtr[0],btVector3(108./255.,131./255.,158./255),btIDebugDraw::DBG_FastWireframe,worldBoundsMin,worldBoundsMax);
diff --git a/Demos/CollisionInterfaceDemo/CMakeLists.txt b/Demos/CollisionInterfaceDemo/CMakeLists.txt
index 2848f525e..82e2ca6ff 100644
--- a/Demos/CollisionInterfaceDemo/CMakeLists.txt
+++ b/Demos/CollisionInterfaceDemo/CMakeLists.txt
@@ -24,7 +24,7 @@ IF (USE_GLUT)
 				CollisionInterfaceDemo.cpp
 				CollisionInterfaceDemo.h
 				main.cpp
-				${BULLET_PHYSICS_SOURCE_DIR}/msvc/bullet.rc
+				${BULLET_PHYSICS_SOURCE_DIR}/build/bullet.rc
 		)
 	ELSE()
 		ADD_EXECUTABLE(AppCollisionInterfaceDemo
@@ -65,7 +65,7 @@ ELSE (USE_GLUT)
 			CollisionInterfaceDemo.h
 			Win32CollisionInterfaceDemo.cpp
 			../OpenGL/Win32AppMain.cpp
-			${BULLET_PHYSICS_SOURCE_DIR}/msvc/bullet.rc
+			${BULLET_PHYSICS_SOURCE_DIR}/build/bullet.rc
 	)
 ENDIF (USE_GLUT)
 
diff --git a/Demos/CollisionInterfaceDemo/CollisionInterfaceDemo.cpp b/Demos/CollisionInterfaceDemo/CollisionInterfaceDemo.cpp
index c21ebdf8c..ed8c2902f 100644
--- a/Demos/CollisionInterfaceDemo/CollisionInterfaceDemo.cpp
+++ b/Demos/CollisionInterfaceDemo/CollisionInterfaceDemo.cpp
@@ -104,7 +104,7 @@ btSimplexSolverInterface& gGjkSimplexSolver = sGjkSimplexSolver;
 
 struct btDrawingResult : public btCollisionWorld::ContactResultCallback
 {
-	virtual	btScalar	addSingleResult(btManifoldPoint& cp,	const btCollisionObject* colObj0,int partId0,int index0,const btCollisionObject* colObj1,int partId1,int index1)
+	virtual	btScalar	addSingleResult(btManifoldPoint& cp,	const btCollisionObjectWrapper* colObj0Wrap,int partId0,int index0,const btCollisionObjectWrapper* colObj1Wrap,int partId1,int index1)
 	{
 
 		glBegin(GL_LINES);
diff --git a/Demos/ConcaveConvexcastDemo/ConcaveConvexcastDemo.cpp b/Demos/ConcaveConvexcastDemo/ConcaveConvexcastDemo.cpp
index 95094829a..867832556 100644
--- a/Demos/ConcaveConvexcastDemo/ConcaveConvexcastDemo.cpp
+++ b/Demos/ConcaveConvexcastDemo/ConcaveConvexcastDemo.cpp
@@ -198,7 +198,7 @@ public:
 
 	void drawCube (const btTransform& T)
 	{
-		btScalar m[16];
+		ATTRIBUTE_ALIGNED16(btScalar) m[16];
 		T.getOpenGLMatrix (&m[0]);
 		glPushMatrix ();
 #ifdef BT_USE_DOUBLE_PRECISION
diff --git a/Demos/ConcaveDemo/CMakeLists.txt b/Demos/ConcaveDemo/CMakeLists.txt
index 4de3e42dc..a269bbde6 100644
--- a/Demos/ConcaveDemo/CMakeLists.txt
+++ b/Demos/ConcaveDemo/CMakeLists.txt
@@ -25,7 +25,7 @@ IF (USE_GLUT)
 		ADD_EXECUTABLE(AppConcaveDemo
         		ConcavePhysicsDemo.cpp
         		main.cpp
-        		${BULLET_PHYSICS_SOURCE_DIR}/msvc/bullet.rc
+        		${BULLET_PHYSICS_SOURCE_DIR}/build/bullet.rc
 		)
 	ELSE()
 	ADD_EXECUTABLE(AppConcaveDemo
@@ -68,7 +68,7 @@ ELSE (USE_GLUT)
         	ConcavePhysicsDemo.cpp
         	ConcaveDemo.h
         	Win32ConcaveDemo.cpp
-        	${BULLET_PHYSICS_SOURCE_DIR}/msvc/bullet.rc
+        	${BULLET_PHYSICS_SOURCE_DIR}/build/bullet.rc
 	)
 
 ENDIF (USE_GLUT)
diff --git a/Demos/ConcaveDemo/ConcavePhysicsDemo.cpp b/Demos/ConcaveDemo/ConcavePhysicsDemo.cpp
index d4f40df41..e261662b0 100644
--- a/Demos/ConcaveDemo/ConcavePhysicsDemo.cpp
+++ b/Demos/ConcaveDemo/ConcavePhysicsDemo.cpp
@@ -75,20 +75,20 @@ inline btScalar	calculateCombinedRestitution(float restitution0,float restitutio
 
 
 
-static bool CustomMaterialCombinerCallback(btManifoldPoint& cp,	const btCollisionObject* colObj0,int partId0,int index0,const btCollisionObject* colObj1,int partId1,int index1)
+static bool CustomMaterialCombinerCallback(btManifoldPoint& cp,	const btCollisionObjectWrapper* colObj0Wrap,int partId0,int index0,const btCollisionObjectWrapper* colObj1Wrap,int partId1,int index1)
 {
 
-	float friction0 = colObj0->getFriction();
-	float friction1 = colObj1->getFriction();
-	float restitution0 = colObj0->getRestitution();
-	float restitution1 = colObj1->getRestitution();
+	float friction0 = colObj0Wrap->getCollisionObject()->getFriction();
+	float friction1 = colObj1Wrap->getCollisionObject()->getFriction();
+	float restitution0 = colObj0Wrap->getCollisionObject()->getRestitution();
+	float restitution1 = colObj1Wrap->getCollisionObject()->getRestitution();
 
-	if (colObj0->getCollisionFlags() & btCollisionObject::CF_CUSTOM_MATERIAL_CALLBACK)
+	if (colObj0Wrap->getCollisionObject()->getCollisionFlags() & btCollisionObject::CF_CUSTOM_MATERIAL_CALLBACK)
 	{
 		friction0 = 1.0;//partId0,index0
 		restitution0 = 0.f;
 	}
-	if (colObj1->getCollisionFlags() & btCollisionObject::CF_CUSTOM_MATERIAL_CALLBACK)
+	if (colObj1Wrap->getCollisionObject()->getCollisionFlags() & btCollisionObject::CF_CUSTOM_MATERIAL_CALLBACK)
 	{
 		if (index1&1)
 		{
diff --git a/Demos/ConcaveRaycastDemo/CMakeLists.txt b/Demos/ConcaveRaycastDemo/CMakeLists.txt
index 71bb8ce6b..ae1a458e8 100644
--- a/Demos/ConcaveRaycastDemo/CMakeLists.txt
+++ b/Demos/ConcaveRaycastDemo/CMakeLists.txt
@@ -22,7 +22,7 @@ IF (WIN32)
 	ADD_EXECUTABLE(AppConcaveRayCastDemo
 			ConcaveRaycastDemo.cpp
 			main.cpp
-			${BULLET_PHYSICS_SOURCE_DIR}/msvc/bullet.rc
+			${BULLET_PHYSICS_SOURCE_DIR}/build/bullet.rc
 	)
 ELSE()
 	ADD_EXECUTABLE(AppConcaveRayCastDemo
diff --git a/Demos/ConvexDecompositionDemo/CMakeLists.txt b/Demos/ConvexDecompositionDemo/CMakeLists.txt
index 529794941..ba8ee3be5 100644
--- a/Demos/ConvexDecompositionDemo/CMakeLists.txt
+++ b/Demos/ConvexDecompositionDemo/CMakeLists.txt
@@ -60,7 +60,7 @@ LINK_LIBRARIES(
 		ConvexDecompositionDemo.cpp 
 		ConvexDecompositionDemo.h
 		Win32ConvexDecompositionDemo.cpp
-		${BULLET_PHYSICS_SOURCE_DIR}/msvc/bullet.rc
+		${BULLET_PHYSICS_SOURCE_DIR}/build/bullet.rc
 	)
 	
 ENDIF (USE_GLUT)
diff --git a/Demos/ConvexDecompositionDemo/ConvexDecompositionDemo.cpp b/Demos/ConvexDecompositionDemo/ConvexDecompositionDemo.cpp
index f62b9459a..58b6f8c6b 100644
--- a/Demos/ConvexDecompositionDemo/ConvexDecompositionDemo.cpp
+++ b/Demos/ConvexDecompositionDemo/ConvexDecompositionDemo.cpp
@@ -80,24 +80,24 @@ void ConvexDecompositionDemo::initPhysics()
 ///MyContactCallback is just an example to show how to get access to the child shape that collided
 bool MyContactCallback (
     btManifoldPoint& cp,
-    const btCollisionObject* colObj0,
+    const btCollisionObjectWrapper* colObj0Wrap,
     int partId0,
     int index0,
-    const btCollisionObject* colObj1,
+    const btCollisionObjectWrapper* colObj1Wrap,
     int partId1,
     int index1)
 {
 
-	if (colObj0->getRootCollisionShape()->getShapeType()==COMPOUND_SHAPE_PROXYTYPE)
+	if (colObj0Wrap->getCollisionObject()->getCollisionShape()->getShapeType()==COMPOUND_SHAPE_PROXYTYPE)
 	{
-		btCompoundShape* compound = (btCompoundShape*)colObj0->getRootCollisionShape();
+		btCompoundShape* compound = (btCompoundShape*)colObj0Wrap->getCollisionObject()->getCollisionShape();
 		btCollisionShape* childShape;
 		childShape = compound->getChildShape(index0);
 	}
 
-	if (colObj1->getRootCollisionShape()->getShapeType()==COMPOUND_SHAPE_PROXYTYPE)
+	if (colObj1Wrap->getCollisionObject()->getCollisionShape()->getShapeType()==COMPOUND_SHAPE_PROXYTYPE)
 	{
-		btCompoundShape* compound = (btCompoundShape*)colObj1->getRootCollisionShape();
+		btCompoundShape* compound = (btCompoundShape*)colObj1Wrap->getCollisionObject()->getCollisionShape();
 		btCollisionShape* childShape;
 		childShape = compound->getChildShape(index1);
 	}
diff --git a/Demos/ConvexDecompositionDemo/ConvexDecompositionDemo.h b/Demos/ConvexDecompositionDemo/ConvexDecompositionDemo.h
index 0f174677b..b0db79a92 100644
--- a/Demos/ConvexDecompositionDemo/ConvexDecompositionDemo.h
+++ b/Demos/ConvexDecompositionDemo/ConvexDecompositionDemo.h
@@ -36,13 +36,14 @@ class btDefaultCollisionConfiguration;
 class btTriangleMesh;
 
 ///ConvexDecompositionDemo shows automatic convex decomposition of a concave mesh
-class ConvexDecompositionDemo : public PlatformDemoApplication
+ATTRIBUTE_ALIGNED16(class) ConvexDecompositionDemo : public PlatformDemoApplication
 {
 
 	void setupEmptyDynamicsWorld();
 public:
-
-
+	
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+	
 	//keep the collision shapes, for deletion/cleanup
 	btAlignedObjectArray<btCollisionShape*>	m_collisionShapes;
 
diff --git a/Demos/ConvexHullDistance/CMakeLists.txt b/Demos/ConvexHullDistance/CMakeLists.txt
index 872063bda..50de83c76 100644
--- a/Demos/ConvexHullDistance/CMakeLists.txt
+++ b/Demos/ConvexHullDistance/CMakeLists.txt
@@ -26,7 +26,7 @@ IF (WIN32)
 ADD_EXECUTABLE(AppConvexHullDistanceDemo
 
 		ConvexHullDistanceDemo.cpp
-		${BULLET_PHYSICS_SOURCE_DIR}/msvc/bullet.rc
+		${BULLET_PHYSICS_SOURCE_DIR}/build/bullet.rc
 	)
 ELSE()
 	ADD_EXECUTABLE(AppConvexHullDistanceDemo
@@ -69,7 +69,7 @@ ELSE (USE_GLUT)
 		WIN32
 		../OpenGL/Win32AppMain.cpp
 		ConvexHullDistanceDemo.cpp
-		${BULLET_PHYSICS_SOURCE_DIR}/msvc/bullet.rc
+		${BULLET_PHYSICS_SOURCE_DIR}/build/bullet.rc
 	)
 	
 	
diff --git a/Demos/ConvexHullDistance/ConvexHullDistanceDemo.cpp b/Demos/ConvexHullDistance/ConvexHullDistanceDemo.cpp
index cef9e8567..e5b87fec9 100644
--- a/Demos/ConvexHullDistance/ConvexHullDistanceDemo.cpp
+++ b/Demos/ConvexHullDistance/ConvexHullDistanceDemo.cpp
@@ -276,7 +276,7 @@ void clientDisplay(void) {
 
 	//GL_ShapeDrawer::drawCoordSystem();
 
-	btScalar m[16];
+	ATTRIBUTE_ALIGNED16(btScalar) m[16];
 	int i;
 #ifdef USE_GJK
 	btGjkEpaPenetrationDepthSolver epa;
@@ -309,7 +309,7 @@ void clientDisplay(void) {
 	
 	struct	MyContactResultCallback : public btCollisionWorld::ContactResultCallback
 	{
-		virtual	btScalar	addSingleResult(btManifoldPoint& cp,	const btCollisionObject* colObj0,int partId0,int index0,const btCollisionObject* colObj1,int partId1,int index1)
+		virtual	btScalar	addSingleResult(btManifoldPoint& cp,	const btCollisionObjectWrapper* colObj0Wrap,int partId0,int index0,const btCollisionObjectWrapper* colObj1Wrap,int partId1,int index1)
 		{
 			 glBegin(GL_LINES);
 			glColor3f(1, 0, 0);
diff --git a/Demos/DoublePrecisionDemo/DoublePrecisionDemo.cpp b/Demos/DoublePrecisionDemo/DoublePrecisionDemo.cpp
index 6ba380dc6..6120e1b04 100644
--- a/Demos/DoublePrecisionDemo/DoublePrecisionDemo.cpp
+++ b/Demos/DoublePrecisionDemo/DoublePrecisionDemo.cpp
@@ -132,8 +132,8 @@ void DoublePrecisionDemo::displayCallback(void)
   for (i=0;i<numManifolds;i++)
   {
     btPersistentManifold* contactManifold = collisionWorld->getDispatcher()->getManifoldByIndexInternal(i);
-    btCollisionObject* obA = static_cast<btCollisionObject*>(contactManifold->getBody0());
-    btCollisionObject* obB = static_cast<btCollisionObject*>(contactManifold->getBody1());
+    const btCollisionObject* obA = static_cast<const btCollisionObject*>(contactManifold->getBody0());
+    const btCollisionObject* obB = static_cast<const btCollisionObject*>(contactManifold->getBody1());
     contactManifold->refreshContactPoints(obA->getWorldTransform(),obB->getWorldTransform());
 
     int numContacts = contactManifold->getNumContacts();
diff --git a/Demos/EPAPenDepthDemo/PenetrationTestBullet.cpp b/Demos/EPAPenDepthDemo/PenetrationTestBullet.cpp
index 319e01bff..9c4302ee8 100644
--- a/Demos/EPAPenDepthDemo/PenetrationTestBullet.cpp
+++ b/Demos/EPAPenDepthDemo/PenetrationTestBullet.cpp
@@ -210,7 +210,8 @@ void MyConvex::Render(bool only_wireframe, const btVector3& wire_color) const
 	const float Scale = 1.0f;
 	glPushMatrix();
 
-	btScalar glmat[16];	//4x4 column major matrix for OpenGL.
+	ATTRIBUTE_ALIGNED16(btScalar) glmat[16];	//4x4 column major matrix for OpenGL.
+
 	mTransform.getOpenGLMatrix(glmat);
 #ifndef BT_USE_DOUBLE_PRECISION
 	glMultMatrixf(&(glmat[0]));
diff --git a/Demos/ForkLiftDemo/CMakeLists.txt b/Demos/ForkLiftDemo/CMakeLists.txt
index f0c566738..9098ad8d7 100644
--- a/Demos/ForkLiftDemo/CMakeLists.txt
+++ b/Demos/ForkLiftDemo/CMakeLists.txt
@@ -23,7 +23,7 @@ IF (WIN32)
 	ADD_EXECUTABLE(AppForkLiftDemo
 		ForkLiftDemo.cpp
 		main.cpp
-		${BULLET_PHYSICS_SOURCE_DIR}/msvc/bullet.rc
+		${BULLET_PHYSICS_SOURCE_DIR}/build/bullet.rc
 	)
 	
 	IF (WIN32)
diff --git a/Demos/FractureDemo/CMakeLists.txt b/Demos/FractureDemo/CMakeLists.txt
index eae9acd9a..487a207da 100644
--- a/Demos/FractureDemo/CMakeLists.txt
+++ b/Demos/FractureDemo/CMakeLists.txt
@@ -31,7 +31,7 @@ ADD_EXECUTABLE(AppFractureDemo
 		btFractureBody.cpp
 		btFractureDynamicsWorld.cpp
 		btFractureDynamicsWorld.h
-		${BULLET_PHYSICS_SOURCE_DIR}/msvc/bullet.rc
+		${BULLET_PHYSICS_SOURCE_DIR}/build/bullet.rc
 	)
 ELSE()
 	ADD_EXECUTABLE(AppFractureDemo
@@ -82,7 +82,7 @@ ELSE (USE_GLUT)
 		Win32FractureDemo.cpp
 		FractureDemo.cpp 
 		FractureDemo.h
-		${BULLET_PHYSICS_SOURCE_DIR}/msvc/bullet.rc
+		${BULLET_PHYSICS_SOURCE_DIR}/build/bullet.rc
 	)
 	
 	
diff --git a/Demos/FractureDemo/btFractureBody.cpp b/Demos/FractureDemo/btFractureBody.cpp
index b7a52896b..1c5065b77 100644
--- a/Demos/FractureDemo/btFractureBody.cpp
+++ b/Demos/FractureDemo/btFractureBody.cpp
@@ -25,7 +25,7 @@ void	btFractureBody::recomputeConnectivity(btCollisionWorld* world)
 					MyContactResultCallback() :m_connected(false)
 					{
 					}
-					virtual   btScalar   addSingleResult(btManifoldPoint& cp,   const btCollisionObject* colObj0,int partId0,int index0,const btCollisionObject* colObj1,int partId1,int index1)
+					virtual   btScalar   addSingleResult(btManifoldPoint& cp,   const btCollisionObjectWrapper* colObj0Wrap,int partId0,int index0,const btCollisionObjectWrapper* colObj1Wrap,int partId1,int index1)
 					{
 						if (cp.getDistance()<=0)
 							m_connected = true;
diff --git a/Demos/GenericJointDemo/CMakeLists.txt b/Demos/GenericJointDemo/CMakeLists.txt
index 02d4cfb95..d664fab23 100644
--- a/Demos/GenericJointDemo/CMakeLists.txt
+++ b/Demos/GenericJointDemo/CMakeLists.txt
@@ -25,7 +25,7 @@ IF (USE_GLUT)
 			GenericJointDemo.h
 			Ragdoll.cpp
 			main.cpp
-			${BULLET_PHYSICS_SOURCE_DIR}/msvc/bullet.rc
+			${BULLET_PHYSICS_SOURCE_DIR}/build/bullet.rc
 		)
 	ELSE()
 		ADD_EXECUTABLE(AppGenericJointDemo
@@ -70,7 +70,7 @@ ELSE(USE_GLUT)
 		GenericJointDemo.cpp
 		GenericJointDemo.h
 		Ragdoll.cpp
-		${BULLET_PHYSICS_SOURCE_DIR}/msvc/bullet.rc
+		${BULLET_PHYSICS_SOURCE_DIR}/build/bullet.rc
 		
 	)
 ENDIF (USE_GLUT)
diff --git a/Demos/GjkConvexCastDemo/LinearConvexCastDemo.cpp b/Demos/GjkConvexCastDemo/LinearConvexCastDemo.cpp
index a1733d087..29fbb091c 100644
--- a/Demos/GjkConvexCastDemo/LinearConvexCastDemo.cpp
+++ b/Demos/GjkConvexCastDemo/LinearConvexCastDemo.cpp
@@ -149,7 +149,10 @@ void LinearConvexCastDemo::displayCallback(void)
 
 	convexCaster.calcTimeOfImpact( tr[ 0 ], toA, tr[ 1 ], toB, result );
 
-	btScalar m1[16], m2[16],m3[16];
+	ATTRIBUTE_ALIGNED16(btScalar) m1[16];
+	ATTRIBUTE_ALIGNED16(btScalar) m2[16];
+	ATTRIBUTE_ALIGNED16(btScalar) m3[16];
+	
 	tr[ 0 ].getOpenGLMatrix( m1 );
 	tr[ 1 ].getOpenGLMatrix( m2 );
 
diff --git a/Demos/HeightFieldFluidDemo/BulletHfFluid/btHfFluidBuoyantConvexShape.h b/Demos/HeightFieldFluidDemo/BulletHfFluid/btHfFluidBuoyantConvexShape.h
index 13948caa5..3e25a94fd 100644
--- a/Demos/HeightFieldFluidDemo/BulletHfFluid/btHfFluidBuoyantConvexShape.h
+++ b/Demos/HeightFieldFluidDemo/BulletHfFluid/btHfFluidBuoyantConvexShape.h
@@ -31,7 +31,10 @@ public:
 	~btHfFluidBuoyantConvexShape ();
 	void generateShape (btScalar radius, btScalar gap);
 
-	btConvexShape* getConvexShape () { return m_convexShape; }
+	const btConvexShape* getConvexShape () const
+	{ 
+		return m_convexShape; 
+	}
 
 	virtual void getAabb(const btTransform& t,btVector3& aabbMin,btVector3& aabbMax) const;
 	virtual void	setMargin(btScalar margin);
diff --git a/Demos/HeightFieldFluidDemo/BulletHfFluid/btHfFluidBuoyantShapeCollisionAlgorithm.cpp b/Demos/HeightFieldFluidDemo/BulletHfFluid/btHfFluidBuoyantShapeCollisionAlgorithm.cpp
index 6273b4607..b1a97487d 100644
--- a/Demos/HeightFieldFluidDemo/BulletHfFluid/btHfFluidBuoyantShapeCollisionAlgorithm.cpp
+++ b/Demos/HeightFieldFluidDemo/BulletHfFluid/btHfFluidBuoyantShapeCollisionAlgorithm.cpp
@@ -25,50 +25,50 @@ Experimental Buoyancy fluid demo written by John McCutchan
 #include "BulletCollision/CollisionDispatch/btCollisionObject.h"
 #include "BulletDynamics/Dynamics/btRigidBody.h"
 #include "btHfFluid.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h"
 
-btHfFluidBuoyantShapeCollisionAlgorithm::btHfFluidBuoyantShapeCollisionAlgorithm(const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* col0,btCollisionObject* col1,btSimplexSolverInterface* simplexSolver, btConvexPenetrationDepthSolver* pdSolver)
-: btCollisionAlgorithm(ci), m_convexConvexAlgorithm(NULL, ci, col0, col1, simplexSolver, pdSolver,0,0) 
+btHfFluidBuoyantShapeCollisionAlgorithm::btHfFluidBuoyantShapeCollisionAlgorithm(const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* col0Wrap,const btCollisionObjectWrapper* col1Wrap,btSimplexSolverInterface* simplexSolver, btConvexPenetrationDepthSolver* pdSolver)
+: btCollisionAlgorithm(ci), m_convexConvexAlgorithm(NULL, ci, col0Wrap, col1Wrap, simplexSolver, pdSolver,0,0) 
 {
-	m_collisionObject0 = col0;
-	m_collisionObject1 = col1;
+	m_collisionObject0 = col0Wrap->getCollisionObject();
+	m_collisionObject1 = col1Wrap->getCollisionObject();
 }
 
 btHfFluidBuoyantShapeCollisionAlgorithm::~btHfFluidBuoyantShapeCollisionAlgorithm()
 {
 }
 
-void btHfFluidBuoyantShapeCollisionAlgorithm::processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
+void btHfFluidBuoyantShapeCollisionAlgorithm::processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
 {
-	btHfFluidBuoyantConvexShape* tmpShape0 = (btHfFluidBuoyantConvexShape*)body0->getCollisionShape();
-	btHfFluidBuoyantConvexShape* tmpShape1 = (btHfFluidBuoyantConvexShape*)body1->getCollisionShape();
-	btConvexShape* convexShape0 = tmpShape0->getConvexShape();
-	btConvexShape* convexShape1 = tmpShape1->getConvexShape();
+	const btHfFluidBuoyantConvexShape* tmpShape0 = (const btHfFluidBuoyantConvexShape*)body0Wrap->getCollisionShape();
+	const btHfFluidBuoyantConvexShape* tmpShape1 = (const btHfFluidBuoyantConvexShape*)body1Wrap->getCollisionShape();
+	const btConvexShape* convexShape0 = tmpShape0->getConvexShape();
+	const btConvexShape* convexShape1 = tmpShape1->getConvexShape();
 
-	body0->setCollisionShape (convexShape0);
-	body1->setCollisionShape (convexShape1);
+	//body0->setCollisionShape (convexShape0);
+	//body1->setCollisionShape (convexShape1);
 
-	m_convexConvexAlgorithm.processCollision (body0, body1, dispatchInfo,resultOut);
+	btCollisionObjectWrapper ob0(body0Wrap,convexShape0,body0Wrap->getCollisionObject(),body0Wrap->getWorldTransform());
+	btCollisionObjectWrapper ob1(body1Wrap,convexShape1,body1Wrap->getCollisionObject(),body1Wrap->getWorldTransform());
+	m_convexConvexAlgorithm.processCollision (&ob0, &ob1, dispatchInfo,resultOut);
 
-	body0->setCollisionShape (tmpShape0);
-	body1->setCollisionShape (tmpShape1);
+	
 }
 
 btScalar btHfFluidBuoyantShapeCollisionAlgorithm::calculateTimeOfImpact(btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
 {
+	btAssert(0);
+
 	btHfFluidBuoyantConvexShape* tmpShape0 = (btHfFluidBuoyantConvexShape*)body0->getCollisionShape();
 	btHfFluidBuoyantConvexShape* tmpShape1 = (btHfFluidBuoyantConvexShape*)body1->getCollisionShape();
-	btConvexShape* convexShape0 = tmpShape0->getConvexShape();
-	btConvexShape* convexShape1 = tmpShape1->getConvexShape();
-
-	body0->setCollisionShape (convexShape0);
-	body1->setCollisionShape (convexShape1);
+	const btConvexShape* convexShape0 = tmpShape0->getConvexShape();
+	const btConvexShape* convexShape1 = tmpShape1->getConvexShape();
 
+	
 	btScalar toi = btScalar(0.0f);
 
 	toi = m_convexConvexAlgorithm.calculateTimeOfImpact (body0, body1, dispatchInfo, resultOut);
 
-	body0->setCollisionShape (tmpShape0);
-	body1->setCollisionShape (tmpShape1);
-
+	
 	return toi;
 }
diff --git a/Demos/HeightFieldFluidDemo/BulletHfFluid/btHfFluidBuoyantShapeCollisionAlgorithm.h b/Demos/HeightFieldFluidDemo/BulletHfFluid/btHfFluidBuoyantShapeCollisionAlgorithm.h
index a744dc0ac..853f07239 100644
--- a/Demos/HeightFieldFluidDemo/BulletHfFluid/btHfFluidBuoyantShapeCollisionAlgorithm.h
+++ b/Demos/HeightFieldFluidDemo/BulletHfFluid/btHfFluidBuoyantShapeCollisionAlgorithm.h
@@ -38,17 +38,17 @@ class btSimplexSolverInterface;
 /// btHfFluidBuoyantShapeCollisionAlgorithm  provides collision detection between btHfFluidBuoyantConvexShape and btHfFluidBuoyantConvexShape
 class btHfFluidBuoyantShapeCollisionAlgorithm : public btCollisionAlgorithm
 {
-	btCollisionObject*		m_collisionObject0;
-	btCollisionObject*		m_collisionObject1;
+	const btCollisionObject*		m_collisionObject0;
+	const btCollisionObject*		m_collisionObject1;
 
 	btConvexConvexAlgorithm m_convexConvexAlgorithm;
 public:
 
-	btHfFluidBuoyantShapeCollisionAlgorithm(const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* col0,btCollisionObject* col1, btSimplexSolverInterface* simplexSolver, btConvexPenetrationDepthSolver* pdSolver);
+	btHfFluidBuoyantShapeCollisionAlgorithm(const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* col0Wrap,const btCollisionObjectWrapper* col1Wrap, btSimplexSolverInterface* simplexSolver, btConvexPenetrationDepthSolver* pdSolver);
 
 	virtual ~btHfFluidBuoyantShapeCollisionAlgorithm();
 
-	virtual void processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
+	virtual void processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 
 	virtual btScalar calculateTimeOfImpact(btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 
@@ -70,15 +70,15 @@ public:
 		}
 		
 		virtual ~CreateFunc() {}
-		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, btCollisionObject* body0,btCollisionObject* body1)
+		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
 		{
 			void* mem = ci.m_dispatcher1->allocateCollisionAlgorithm(sizeof(btHfFluidBuoyantShapeCollisionAlgorithm));
 			if (!m_swapped)
 			{
-				return new(mem) btHfFluidBuoyantShapeCollisionAlgorithm(ci,body0,body1, m_simplexSolver, m_pdSolver);
+				return new(mem) btHfFluidBuoyantShapeCollisionAlgorithm(ci,body0Wrap,body1Wrap, m_simplexSolver, m_pdSolver);
 			} else
 			{
-				return new(mem) btHfFluidBuoyantShapeCollisionAlgorithm(ci,body0,body1, m_simplexSolver, m_pdSolver);
+				return new(mem) btHfFluidBuoyantShapeCollisionAlgorithm(ci,body0Wrap,body1Wrap, m_simplexSolver, m_pdSolver);
 			}
 		}
 	};
diff --git a/Demos/HeightFieldFluidDemo/BulletHfFluid/btHfFluidRigidCollisionAlgorithm.cpp b/Demos/HeightFieldFluidDemo/BulletHfFluid/btHfFluidRigidCollisionAlgorithm.cpp
index 7584fff2a..d3497cee6 100644
--- a/Demos/HeightFieldFluidDemo/BulletHfFluid/btHfFluidRigidCollisionAlgorithm.cpp
+++ b/Demos/HeightFieldFluidDemo/BulletHfFluid/btHfFluidRigidCollisionAlgorithm.cpp
@@ -25,54 +25,69 @@ Experimental Buoyancy fluid demo written by John McCutchan
 #include "BulletCollision/CollisionDispatch/btCollisionObject.h"
 #include "BulletDynamics/Dynamics/btRigidBody.h"
 #include "btHfFluid.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h"
 
 btHfFluidRigidCollisionAlgorithm::~btHfFluidRigidCollisionAlgorithm()
 {
 }
 
-btHfFluidRigidCollisionAlgorithm::btHfFluidRigidCollisionAlgorithm(const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* col0,btCollisionObject* col1, bool isSwapped)
+btHfFluidRigidCollisionAlgorithm::btHfFluidRigidCollisionAlgorithm(const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* col0Wrap,const btCollisionObjectWrapper* col1Wrap, bool isSwapped)
 : btCollisionAlgorithm(ci), m_isSwapped(isSwapped), 
-	m_convexTrianglecallback(ci.m_dispatcher1, col0, col1, !isSwapped) // we flip the isSwapped because we are hf fluid vs. convex and callback expects convex vs. concave
+	m_convexTrianglecallback(ci.m_dispatcher1, col0Wrap, col1Wrap, !isSwapped) // we flip the isSwapped because we are hf fluid vs. convex and callback expects convex vs. concave
 {
 	m_manifoldPtr = m_convexTrianglecallback.m_manifoldPtr;
 	if (m_isSwapped)
 	{
-		m_hfFluid = static_cast<btHfFluid*>(col1);
-		m_rigidCollisionObject = static_cast<btCollisionObject*>(col0);
+		m_hfFluid = static_cast<const btHfFluid*>(col1Wrap->getCollisionObject());
+		m_rigidCollisionObject = static_cast<const btCollisionObject*>(col0Wrap->getCollisionObject());
 		m_manifoldPtr->setBodies(m_hfFluid,m_rigidCollisionObject);
 	} else {
-		m_hfFluid = static_cast<btHfFluid*>(col0);
-		m_rigidCollisionObject = static_cast<btCollisionObject*>(col1);
+		m_hfFluid = static_cast<const btHfFluid*>(col0Wrap->getCollisionObject());
+		m_rigidCollisionObject = static_cast<const btCollisionObject*>(col1Wrap->getCollisionObject());
 		m_manifoldPtr->setBodies(m_rigidCollisionObject,m_hfFluid);
 	}
 }
 
 void btHfFluidRigidCollisionAlgorithm::processGround (const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
 {
+	btAssert(0);
+	//needs fixing after btCollisionObjectWrapper introduction
+#if 0
 	btScalar triangleMargin = m_rigidCollisionObject->getCollisionShape()->getMargin();
 	resultOut->setPersistentManifold(m_manifoldPtr);
 	// to perform the convex shape vs. ground terrain:
 	// we pull the convex shape out of the buoyant shape and replace it temporarily
 	btHfFluidBuoyantConvexShape* tmpShape = (btHfFluidBuoyantConvexShape*)m_rigidCollisionObject->getCollisionShape();
-	btConvexShape* convexShape = ((btHfFluidBuoyantConvexShape*)tmpShape)->getConvexShape();
-	m_rigidCollisionObject->setCollisionShape (convexShape);
+	const btConvexShape* convexShape = ((const btHfFluidBuoyantConvexShape*)tmpShape)->getConvexShape();
+	//m_rigidCollisionObject->setCollisionShape (convexShape);
 	m_convexTrianglecallback.setTimeStepAndCounters (triangleMargin, dispatchInfo, resultOut);
 	m_hfFluid->foreachGroundTriangle (&m_convexTrianglecallback, m_convexTrianglecallback.getAabbMin(),m_convexTrianglecallback.getAabbMax());
 	resultOut->refreshContactPoints();
+#endif
 	// restore the buoyant shape 
-	m_rigidCollisionObject->setCollisionShape (tmpShape);
+	//m_rigidCollisionObject->setCollisionShape (tmpShape);
 }
 
 btScalar btHfFluidRigidCollisionAlgorithm::processFluid (const btDispatcherInfo& dispatchInfo, btScalar density, btScalar floatyness)
 {
+		btAssert(0);
+	//needs fixing after btCollisionObjectWrapper introduction
+#if 0
 	btRigidBody* rb = btRigidBody::upcast(m_rigidCollisionObject);
 	btHfFluidColumnRigidBodyCallback columnCallback (rb, dispatchInfo.m_debugDraw, density, floatyness);
 	m_hfFluid->foreachFluidColumn (&columnCallback, m_convexTrianglecallback.getAabbMin(), m_convexTrianglecallback.getAabbMax());
 	return columnCallback.getVolume ();
+#endif
+	return 0.f;
+
 }
 
 void btHfFluidRigidCollisionAlgorithm::applyFluidFriction (btScalar mu, btScalar submerged_percentage)
 {
+		btAssert(0);
+	//needs fixing after btCollisionObjectWrapper introduction
+#if 0
+
 	btRigidBody* rb = btRigidBody::upcast(m_rigidCollisionObject);
 	btScalar dt = btScalar(1.0f/60.0f);
 
@@ -103,10 +118,15 @@ void btHfFluidRigidCollisionAlgorithm::applyFluidFriction (btScalar mu, btScalar
 	rb->applyCentralImpulse (dt * scaled_mu * -rb->getLinearVelocity());
 	rb->applyTorqueImpulse (dt * scaled_mu * -rb->getAngularVelocity());
 #endif
+#endif 
+	
 }
 
-void btHfFluidRigidCollisionAlgorithm::processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
+void btHfFluidRigidCollisionAlgorithm::processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
 {
+	btAssert(0);
+	//needs fixing after btCollisionObjectWrapper introduction
+#if 0
 	processGround (dispatchInfo, resultOut);
 	btHfFluidBuoyantConvexShape* buoyantShape = (btHfFluidBuoyantConvexShape*)m_rigidCollisionObject->getCollisionShape();
 	btRigidBody* rb = btRigidBody::upcast(m_rigidCollisionObject);
@@ -125,6 +145,8 @@ void btHfFluidRigidCollisionAlgorithm::processCollision (btCollisionObject* body
 			applyFluidFriction (mu, submerged_percentage);
 		}
 	}
+#endif
+
 }
 
 btScalar btHfFluidRigidCollisionAlgorithm::calculateTimeOfImpact(btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
diff --git a/Demos/HeightFieldFluidDemo/BulletHfFluid/btHfFluidRigidCollisionAlgorithm.h b/Demos/HeightFieldFluidDemo/BulletHfFluid/btHfFluidRigidCollisionAlgorithm.h
index a72eb34e0..b03051a04 100644
--- a/Demos/HeightFieldFluidDemo/BulletHfFluid/btHfFluidRigidCollisionAlgorithm.h
+++ b/Demos/HeightFieldFluidDemo/BulletHfFluid/btHfFluidRigidCollisionAlgorithm.h
@@ -35,8 +35,8 @@ class btHfFluidRigidCollisionAlgorithm : public btCollisionAlgorithm
 {
 	btPersistentManifold*	m_manifoldPtr;
 
-	btHfFluid*				m_hfFluid;
-	btCollisionObject*		m_rigidCollisionObject;
+	const btHfFluid*				m_hfFluid;
+	const btCollisionObject*		m_rigidCollisionObject;
 
 	///for rigid versus fluid (instead of fluid versus rigid), we use this swapped boolean
 	bool	m_isSwapped;
@@ -48,11 +48,11 @@ class btHfFluidRigidCollisionAlgorithm : public btCollisionAlgorithm
 	btScalar processFluid (const btDispatcherInfo& dispatchInfo, btScalar density, btScalar floatyness);
 public:
 
-	btHfFluidRigidCollisionAlgorithm(const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* col0,btCollisionObject* col1, bool isSwapped);
+	btHfFluidRigidCollisionAlgorithm(const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* col0Wrap,const btCollisionObjectWrapper* col1Wrap, bool isSwapped);
 
 	virtual ~btHfFluidRigidCollisionAlgorithm();
 
-	virtual void processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
+	virtual void processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 
 	virtual btScalar calculateTimeOfImpact(btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 
@@ -64,15 +64,15 @@ public:
 
 	struct CreateFunc :public 	btCollisionAlgorithmCreateFunc
 	{
-		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, btCollisionObject* body0,btCollisionObject* body1)
+		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
 		{
 			void* mem = ci.m_dispatcher1->allocateCollisionAlgorithm(sizeof(btHfFluidRigidCollisionAlgorithm));
 			if (!m_swapped)
 			{
-				return new(mem) btHfFluidRigidCollisionAlgorithm(ci,body0,body1,false);
+				return new(mem) btHfFluidRigidCollisionAlgorithm(ci,body0Wrap,body1Wrap,false);
 			} else
 			{
-				return new(mem) btHfFluidRigidCollisionAlgorithm(ci,body0,body1,true);
+				return new(mem) btHfFluidRigidCollisionAlgorithm(ci,body0Wrap,body1Wrap,true);
 			}
 		}
 	};
diff --git a/Demos/HeightFieldFluidDemo/BulletHfFluid/btHfFluidRigidDynamicsWorld.cpp b/Demos/HeightFieldFluidDemo/BulletHfFluid/btHfFluidRigidDynamicsWorld.cpp
index 79d44b820..f11438418 100644
--- a/Demos/HeightFieldFluidDemo/BulletHfFluid/btHfFluidRigidDynamicsWorld.cpp
+++ b/Demos/HeightFieldFluidDemo/BulletHfFluid/btHfFluidRigidDynamicsWorld.cpp
@@ -190,7 +190,7 @@ void btHfFluidRigidDynamicsWorld::drawHfFluidBuoyantConvexShape (btIDebugDraw* d
 		}
 		};
 
-		btConvexShape* convexShape = ((btHfFluidBuoyantConvexShape*)object->getCollisionShape())->getConvexShape();
+		const btConvexShape* convexShape = ((const btHfFluidBuoyantConvexShape*)object->getCollisionShape())->getConvexShape();
 		debugDrawObject(object->getWorldTransform(),(btCollisionShape*)convexShape,color);
 	}
 }
diff --git a/Demos/HeightFieldFluidDemo/HfFluidDemo_GL_ShapeDrawer.cpp b/Demos/HeightFieldFluidDemo/HfFluidDemo_GL_ShapeDrawer.cpp
index 33f2e032e..0f7087776 100644
--- a/Demos/HeightFieldFluidDemo/HfFluidDemo_GL_ShapeDrawer.cpp
+++ b/Demos/HeightFieldFluidDemo/HfFluidDemo_GL_ShapeDrawer.cpp
@@ -162,7 +162,7 @@ void HfFluidDemo_GL_ShapeDrawer::drawOpenGL(btScalar* m, const btCollisionShape*
 
 	if (shape->getShapeType() == HFFLUID_BUOYANT_CONVEX_SHAPE_PROXYTYPE)
 	{
-		btConvexShape* convexShape = ((btHfFluidBuoyantConvexShape*)shape)->getConvexShape();
+		const btConvexShape* convexShape = ((btHfFluidBuoyantConvexShape*)shape)->getConvexShape();
 		btTransform I;
 		I.setIdentity();
 		btScalar mat[16];
diff --git a/Demos/HelloWorld/CMakeLists.txt b/Demos/HelloWorld/CMakeLists.txt
index 59eee2d00..a0ed46092 100644
--- a/Demos/HelloWorld/CMakeLists.txt
+++ b/Demos/HelloWorld/CMakeLists.txt
@@ -11,7 +11,7 @@ LINK_LIBRARIES(
 IF (WIN32)
 	ADD_EXECUTABLE(AppHelloWorld
 		HelloWorld.cpp 
-		${BULLET_PHYSICS_SOURCE_DIR}/msvc/bullet.rc
+		${BULLET_PHYSICS_SOURCE_DIR}/build/bullet.rc
 	)
 ELSE()
 	ADD_EXECUTABLE(AppHelloWorld
@@ -22,8 +22,8 @@ ENDIF()
 
 
 
-IF (INTERNAL_ADD_POSTFIX_EXECUTABLE_NAMES)
-			SET_TARGET_PROPERTIES(AppHelloWorld PROPERTIES  DEBUG_POSTFIX "_Debug")
-			SET_TARGET_PROPERTIES(AppHelloWorld PROPERTIES  MINSIZEREL_POSTFIX "_MinsizeRel")
-			SET_TARGET_PROPERTIES(AppHelloWorld PROPERTIES  RELWITHDEBINFO_POSTFIX "_RelWithDebugInfo")
+IF (INTERNAL_ADD_POSTFIX_EXECUTABLE_NAMES)
+			SET_TARGET_PROPERTIES(AppHelloWorld PROPERTIES  DEBUG_POSTFIX "_Debug")
+			SET_TARGET_PROPERTIES(AppHelloWorld PROPERTIES  MINSIZEREL_POSTFIX "_MinsizeRel")
+			SET_TARGET_PROPERTIES(AppHelloWorld PROPERTIES  RELWITHDEBINFO_POSTFIX "_RelWithDebugInfo")
 ENDIF(INTERNAL_ADD_POSTFIX_EXECUTABLE_NAMES)
\ No newline at end of file
diff --git a/Demos/HelloWorld/premake4.lua b/Demos/HelloWorld/premake4.lua
new file mode 100644
index 000000000..b88aa1746
--- /dev/null
+++ b/Demos/HelloWorld/premake4.lua
@@ -0,0 +1,22 @@
+
+project "AppHelloWorld"
+
+if _OPTIONS["ios"] then
+	kind "WindowedApp"
+else	
+	kind "ConsoleApp"
+end
+
+includedirs {"../../src"}
+
+links {
+	"BulletDynamics","BulletCollision", "LinearMath"
+}
+
+language "C++"
+
+files {
+	"**.cpp",
+	"**.h",
+}
+
diff --git a/Demos/InternalEdgeDemo/InternalEdgeDemo.cpp b/Demos/InternalEdgeDemo/InternalEdgeDemo.cpp
index d5084d384..628849d28 100644
--- a/Demos/InternalEdgeDemo/InternalEdgeDemo.cpp
+++ b/Demos/InternalEdgeDemo/InternalEdgeDemo.cpp
@@ -73,27 +73,27 @@ inline btScalar	calculateCombinedRestitution(float restitution0,float restitutio
 ///////////////////////////////////////////////////////////////
 
 
-static bool CustomMaterialCombinerCallback(btManifoldPoint& cp,	const btCollisionObject* colObj0,int partId0,int index0,const btCollisionObject* colObj1,int partId1,int index1)
+static bool CustomMaterialCombinerCallback(btManifoldPoint& cp,	const btCollisionObjectWrapper* colObj0Wrap,int partId0,int index0,const btCollisionObjectWrapper* colObj1Wrap,int partId1,int index1)
 {
 
 	if (enable)
 	{
-		btAdjustInternalEdgeContacts(cp,colObj1,colObj0, partId1,index1);
+		btAdjustInternalEdgeContacts(cp,colObj1Wrap,colObj0Wrap, partId1,index1);
 		//btAdjustInternalEdgeContacts(cp,colObj1,colObj0, partId1,index1, BT_TRIANGLE_CONVEX_BACKFACE_MODE);
 		//btAdjustInternalEdgeContacts(cp,colObj1,colObj0, partId1,index1, BT_TRIANGLE_CONVEX_DOUBLE_SIDED+BT_TRIANGLE_CONCAVE_DOUBLE_SIDED);
 	}
 
-	float friction0 = colObj0->getFriction();
-	float friction1 = colObj1->getFriction();
-	float restitution0 = colObj0->getRestitution();
-	float restitution1 = colObj1->getRestitution();
+	float friction0 = colObj0Wrap->getCollisionObject()->getFriction();
+	float friction1 = colObj1Wrap->getCollisionObject()->getFriction();
+	float restitution0 = colObj0Wrap->getCollisionObject()->getRestitution();
+	float restitution1 = colObj1Wrap->getCollisionObject()->getRestitution();
 
-	if (colObj0->getCollisionFlags() & btCollisionObject::CF_CUSTOM_MATERIAL_CALLBACK)
+	if (colObj0Wrap->getCollisionObject()->getCollisionFlags() & btCollisionObject::CF_CUSTOM_MATERIAL_CALLBACK)
 	{
 		friction0 = 1.0;//partId0,index0
 		restitution0 = 0.f;
 	}
-	if (colObj1->getCollisionFlags() & btCollisionObject::CF_CUSTOM_MATERIAL_CALLBACK)
+	if (colObj1Wrap->getCollisionObject()->getCollisionFlags() & btCollisionObject::CF_CUSTOM_MATERIAL_CALLBACK)
 	{
 		if (index1&1)
 		{
diff --git a/Demos/MovingConcaveDemo/CMakeLists.txt b/Demos/MovingConcaveDemo/CMakeLists.txt
index 02f07683a..d6bcaf5f7 100644
--- a/Demos/MovingConcaveDemo/CMakeLists.txt
+++ b/Demos/MovingConcaveDemo/CMakeLists.txt
@@ -1,29 +1,48 @@
-# This is basically the overall name of the project in Visual Studio this is the name of the Solution File
-
-
-# For every executable you have with a main method you should have an add_executable line below.
-# For every add executable line you should list every .cpp and .h file you have associated with that executable.  
-
-
-# This is the variable for Windows.  I use this to define the root of my directory structure.
-SET(GLUT_ROOT ${BULLET_PHYSICS_SOURCE_DIR}/Glut)
-
-# You shouldn't have to modify anything below this line 
-########################################################
-
-
-INCLUDE_DIRECTORIES(
- ${BULLET_PHYSICS_SOURCE_DIR}/src ${BULLET_PHYSICS_SOURCE_DIR}/Demos/OpenGL 
-)
-
-LINK_LIBRARIES(
-OpenGLSupport  BulletDynamics  BulletCollision LinearMath    ${GLUT_glut_LIBRARY} ${OPENGL_gl_LIBRARY} ${OPENGL_glu_LIBRARY}
-)
-
-ADD_EXECUTABLE(AppMovingConcaveDemo
-ConcavePhysicsDemo.cpp
-)
-
+# This is basically the overall name of the project in Visual Studio this is the name of the Solution File
+
+
+# For every executable you have with a main method you should have an add_executable line below.
+# For every add executable line you should list every .cpp and .h file you have associated with that executable.  
+
+
+# This is the variable for Windows.  I use this to define the root of my directory structure.
+SET(GLUT_ROOT ${BULLET_PHYSICS_SOURCE_DIR}/Glut)
+
+# You shouldn't have to modify anything below this line 
+########################################################
+
+
+INCLUDE_DIRECTORIES(
+ ${BULLET_PHYSICS_SOURCE_DIR}/src ${BULLET_PHYSICS_SOURCE_DIR}/Demos/OpenGL 
+)
+
+LINK_LIBRARIES(
+OpenGLSupport  BulletDynamics  BulletCollision LinearMath    ${GLUT_glut_LIBRARY} ${OPENGL_gl_LIBRARY} ${OPENGL_glu_LIBRARY}
+)
+
+
+
+ADD_EXECUTABLE(AppMovingConcaveDemo
+ConcavePhysicsDemo.cpp
+)
+
+IF (WIN32)
+		IF (NOT INTERNAL_CREATE_MSVC_RELATIVE_PATH_PROJECTFILES)
+			IF (CMAKE_CL_64)
+				ADD_CUSTOM_COMMAND(
+					TARGET AppMovingConcaveDemo
+					POST_BUILD
+					COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/glut64.dll ${CMAKE_CURRENT_BINARY_DIR}
+				)
+			ELSE(CMAKE_CL_64)
+				ADD_CUSTOM_COMMAND(
+					TARGET AppMovingConcaveDemo
+					POST_BUILD
+					COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/GLUT32.DLL ${CMAKE_CURRENT_BINARY_DIR}
+				)
+			ENDIF(CMAKE_CL_64)
+		ENDIF (NOT INTERNAL_CREATE_MSVC_RELATIVE_PATH_PROJECTFILES)
+ENDIF(WIN32)
 
 IF (INTERNAL_ADD_POSTFIX_EXECUTABLE_NAMES)
 			SET_TARGET_PROPERTIES(AppMovingConcaveDemo PROPERTIES  DEBUG_POSTFIX "_Debug")
diff --git a/Demos/MovingConcaveDemo/ConcavePhysicsDemo.cpp b/Demos/MovingConcaveDemo/ConcavePhysicsDemo.cpp
index 40cc3e1e0..b04c61885 100644
--- a/Demos/MovingConcaveDemo/ConcavePhysicsDemo.cpp
+++ b/Demos/MovingConcaveDemo/ConcavePhysicsDemo.cpp
@@ -16,6 +16,7 @@ subject to the following restrictions:
 #include "btBulletDynamicsCommon.h"
 #include "ConcaveDemo.h"
 
+
 #include "LinearMath/btDefaultMotionState.h"
 #include "LinearMath/btIDebugDraw.h"
 #include "LinearMath/btQuickprof.h"
@@ -32,7 +33,7 @@ subject to the following restrictions:
 #include "GlutStuff.h"
 
 
-GLDebugDrawer	debugDrawer;
+GLDebugDrawer	debugDrawer1;
 
 //***************************THE FAMOUS BUNNY TRIMESH********************************************//
 
@@ -1431,20 +1432,20 @@ inline btScalar	calculateCombinedRestitution(float restitution0,float restitutio
 
 
 
-bool CustomMaterialCombinerCallback(btManifoldPoint& cp,	const btCollisionObject* colObj0,int partId0,int index0,const btCollisionObject* colObj1,int partId1,int index1)
+bool CustomMaterialCombinerCallback(btManifoldPoint& cp,	const btCollisionObjectWrapper* colObj0Wrap,int partId0,int index0,const btCollisionObjectWrapper* colObj1Wrap,int partId1,int index1)
 {
 
-	float friction0 = colObj0->getFriction();
-	float friction1 = colObj1->getFriction();
-	float restitution0 = colObj0->getRestitution();
-	float restitution1 = colObj1->getRestitution();
+	float friction0 = colObj0Wrap->getCollisionObject()->getFriction();
+	float friction1 = colObj1Wrap->getCollisionObject()->getFriction();
+	float restitution0 = colObj0Wrap->getCollisionObject()->getRestitution();
+	float restitution1 = colObj1Wrap->getCollisionObject()->getRestitution();
 
-	if (colObj0->getCollisionFlags() & btCollisionObject::CF_CUSTOM_MATERIAL_CALLBACK)
+	if (colObj0Wrap->getCollisionObject()->getCollisionFlags() & btCollisionObject::CF_CUSTOM_MATERIAL_CALLBACK)
 	{
 		friction0 = 1.0;//partId0,index0
 		restitution0 = 0.f;
 	}
-	if (colObj1->getCollisionFlags() & btCollisionObject::CF_CUSTOM_MATERIAL_CALLBACK)
+	if (colObj1Wrap->getCollisionObject()->getCollisionFlags() & btCollisionObject::CF_CUSTOM_MATERIAL_CALLBACK)
 	{
 		if (index1&1)
 		{
diff --git a/Demos/MultiMaterialDemo/MultiMaterialDemo.cpp b/Demos/MultiMaterialDemo/MultiMaterialDemo.cpp
index 6c748d8ca..dee265683 100644
--- a/Demos/MultiMaterialDemo/MultiMaterialDemo.cpp
+++ b/Demos/MultiMaterialDemo/MultiMaterialDemo.cpp
@@ -24,6 +24,7 @@ subject to the following restrictions:
 #include "BulletCollision/CollisionShapes/btTriangleIndexVertexMaterialArray.h"
 #include "BulletCollision/CollisionShapes/btMultimaterialTriangleMeshShape.h"
 #include "BulletCollision/CollisionShapes/btMaterial.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h"
 
 // Create a custom material, just because we can
 class CustomMaterial : public btMaterial
@@ -72,30 +73,30 @@ inline btScalar	calculateCombinedRestitution(float restitution0,float restitutio
 
 
 
-static bool CustomMaterialCombinerCallback(btManifoldPoint& cp,	const btCollisionObject* colObj0,int partId0,int index0,const btCollisionObject* colObj1,int partId1,int index1)
+static bool CustomMaterialCombinerCallback(btManifoldPoint& cp,	const btCollisionObjectWrapper* colObj0Wrap,int partId0,int index0,const btCollisionObjectWrapper* colObj1Wrap,int partId1,int index1)
 {
     
     // Apply material properties
-    if (colObj0->getCollisionShape()->getShapeType() == TRIANGLE_SHAPE_PROXYTYPE)
+    if (colObj0Wrap->getCollisionShape()->getShapeType() == TRIANGLE_SHAPE_PROXYTYPE)
     {
-        const btCollisionShape* parent0 = colObj0->getRootCollisionShape();
+		const btCollisionShape* parent0 = colObj0Wrap->getCollisionObject()->getCollisionShape();
         if(parent0 != 0 && parent0->getShapeType() == MULTIMATERIAL_TRIANGLE_MESH_PROXYTYPE)
         {
             btMultimaterialTriangleMeshShape* shape = (btMultimaterialTriangleMeshShape*)parent0;
             const btMaterial * props = shape->getMaterialProperties(partId0, index0);
-            cp.m_combinedFriction = calculateCombinedFriction(props->m_friction, colObj1->getFriction());
-            cp.m_combinedRestitution = props->m_restitution * colObj1->getRestitution();
+            cp.m_combinedFriction = calculateCombinedFriction(props->m_friction, colObj1Wrap->getCollisionObject()->getFriction());
+            cp.m_combinedRestitution = props->m_restitution * colObj1Wrap->getCollisionObject()->getRestitution();
         }
     }
-    else if (colObj1->getCollisionShape()->getShapeType() == TRIANGLE_SHAPE_PROXYTYPE)
+    else if (colObj1Wrap->getCollisionShape()->getShapeType() == TRIANGLE_SHAPE_PROXYTYPE)
     {
-        const btCollisionShape* parent1 = colObj1->getRootCollisionShape();
+        const btCollisionShape* parent1 = colObj1Wrap->getCollisionObject()->getCollisionShape();
         if(parent1 != 0 && parent1->getShapeType() == MULTIMATERIAL_TRIANGLE_MESH_PROXYTYPE)
         {
             btMultimaterialTriangleMeshShape* shape = (btMultimaterialTriangleMeshShape*)parent1;
             const btMaterial * props = shape->getMaterialProperties(partId1, index1);
-            cp.m_combinedFriction = calculateCombinedFriction(props->m_friction, colObj0->getFriction());
-            cp.m_combinedRestitution = props->m_restitution * colObj0->getRestitution();
+            cp.m_combinedFriction = calculateCombinedFriction(props->m_friction, colObj0Wrap->getCollisionObject()->getFriction());
+            cp.m_combinedRestitution = props->m_restitution * colObj0Wrap->getCollisionObject()->getRestitution();
         }
     }
 
diff --git a/Demos/OpenGL/DebugCastResult.h b/Demos/OpenGL/DebugCastResult.h
index ef3befe44..ee476bf9f 100644
--- a/Demos/OpenGL/DebugCastResult.h
+++ b/Demos/OpenGL/DebugCastResult.h
@@ -75,7 +75,7 @@ struct btDebugCastResult : public btConvexCast::CastResult
 		btVector3 worldBoundsMax(1000,1000,1000);
 
 	
-		btScalar m[16];
+		ATTRIBUTE_ALIGNED16(btScalar) m[16];
 		btTransform hitTrans;
 		btTransformUtil::integrateTransform(m_fromTrans,m_linVel,m_angVel,fraction,hitTrans);
 		hitTrans.getOpenGLMatrix(m);
diff --git a/Demos/OpenGL/DemoApplication.cpp b/Demos/OpenGL/DemoApplication.cpp
index 8b07cf91f..d8b8f17ef 100644
--- a/Demos/OpenGL/DemoApplication.cpp
+++ b/Demos/OpenGL/DemoApplication.cpp
@@ -755,7 +755,7 @@ void DemoApplication::mouseFunc(int button, int state, int x, int y)
 					{
 
 
-						btRigidBody* body = btRigidBody::upcast(rayCallback.m_collisionObject);
+						btRigidBody* body = (btRigidBody*)btRigidBody::upcast(rayCallback.m_collisionObject);
 						if (body)
 						{
 							//other exclusions?
diff --git a/Demos/OpenGL/GL_DialogDynamicsWorld.cpp b/Demos/OpenGL/GL_DialogDynamicsWorld.cpp
index 0fa5b46f7..08948d273 100644
--- a/Demos/OpenGL/GL_DialogDynamicsWorld.cpp
+++ b/Demos/OpenGL/GL_DialogDynamicsWorld.cpp
@@ -460,7 +460,7 @@ bool GL_DialogDynamicsWorld::mouseFunc(int button, int state, int x, int y)
 
 						btScalar maxPickingClamp = mousePickClamping;
 
-						btRigidBody* body = btRigidBody::upcast(rayCallback.m_collisionObject);
+						btRigidBody* body = (btRigidBody*)btRigidBody::upcast(rayCallback.m_collisionObject);
 						if (body)
 						{
 							bool doPick = true;
diff --git a/Demos/OpenGL/GL_ShapeDrawer.cpp b/Demos/OpenGL/GL_ShapeDrawer.cpp
index bfcb5f735..37e01e7b4 100644
--- a/Demos/OpenGL/GL_ShapeDrawer.cpp
+++ b/Demos/OpenGL/GL_ShapeDrawer.cpp
@@ -719,7 +719,7 @@ void GL_ShapeDrawer::drawOpenGL(btScalar* m, const btCollisionShape* shape, cons
 				{
 					btSphereShape sc(multiSphereShape->getSphereRadius(i));
 					childTransform.setOrigin(multiSphereShape->getSpherePosition(i));
-					btScalar childMat[16];
+					ATTRIBUTE_ALIGNED16(btScalar) childMat[16];
 					childTransform.getOpenGLMatrix(childMat);
 					drawOpenGL(childMat,&sc,color,debugMode,worldBoundsMin,worldBoundsMax);
 				}
diff --git a/Demos/OpenGL/GlutDemoApplication.h b/Demos/OpenGL/GlutDemoApplication.h
index e2727a777..9d3a72178 100644
--- a/Demos/OpenGL/GlutDemoApplication.h
+++ b/Demos/OpenGL/GlutDemoApplication.h
@@ -19,10 +19,12 @@ subject to the following restrictions:
 
 #include "DemoApplication.h"
 
-class GlutDemoApplication : public DemoApplication
+ATTRIBUTE_ALIGNED16(class) GlutDemoApplication : public DemoApplication
 {
 public:
-	
+
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+
 	void specialKeyboard(int key, int x, int y);
 
 	virtual void swapBuffers();
diff --git a/Demos/OpenGL/Win32DemoApplication.h b/Demos/OpenGL/Win32DemoApplication.h
index 0c2a1ee49..af3eec90e 100644
--- a/Demos/OpenGL/Win32DemoApplication.h
+++ b/Demos/OpenGL/Win32DemoApplication.h
@@ -20,13 +20,14 @@ subject to the following restrictions:
 
 #include "DemoApplication.h"
 
-class Win32DemoApplication : public DemoApplication
+ATTRIBUTE_ALIGNED16(class) Win32DemoApplication : public DemoApplication
 {
 protected:
 
 
 public:
-
+	
+	BT_DECLARE_ALIGNED_ALLOCATOR();
 	
 	virtual void	swapBuffers();
 		
diff --git a/Demos/SerializeDemo/AMD/CMakeLists.txt b/Demos/SerializeDemo/AMD/CMakeLists.txt
index ab13f6ebd..d154a13c8 100644
--- a/Demos/SerializeDemo/AMD/CMakeLists.txt
+++ b/Demos/SerializeDemo/AMD/CMakeLists.txt
@@ -48,7 +48,7 @@ IF (USE_GLUT)
 			../main.cpp
 			../SerializeDemo.cpp 
 			../SerializeDemo.h
-			${BULLET_PHYSICS_SOURCE_DIR}/msvc/bullet.rc
+			${BULLET_PHYSICS_SOURCE_DIR}/build/bullet.rc
 		${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOpenCLUtils.cpp
 		${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOpenCLUtils.h
 		${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOpenCLInclude.h
@@ -109,7 +109,7 @@ ELSE (USE_GLUT)
 		../Win32SerializeDemo.cpp
 		../SerializeDemo.cpp 
 		../SerializeDemo.h
-		${BULLET_PHYSICS_SOURCE_DIR}/msvc/bullet.rc
+		${BULLET_PHYSICS_SOURCE_DIR}/build/bullet.rc
 		${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOpenCLUtils.cpp
 		${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOpenCLUtils.h
 		${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOpenCLInclude.h
diff --git a/Demos/SerializeDemo/CMakeLists.txt b/Demos/SerializeDemo/CMakeLists.txt
index b03175073..045acb409 100644
--- a/Demos/SerializeDemo/CMakeLists.txt
+++ b/Demos/SerializeDemo/CMakeLists.txt
@@ -34,7 +34,7 @@ IF (USE_GLUT)
 			main.cpp
 			SerializeDemo.cpp 
 			SerializeDemo.h
-			${BULLET_PHYSICS_SOURCE_DIR}/msvc/bullet.rc
+			${BULLET_PHYSICS_SOURCE_DIR}/build/bullet.rc
 		)
 	ELSE()
 		ADD_EXECUTABLE(AppSerializeDemo
@@ -73,7 +73,7 @@ ELSE (USE_GLUT)
 		Win32SerializeDemo.cpp
 		SerializeDemo.cpp 
 		SerializeDemo.h
-		${BULLET_PHYSICS_SOURCE_DIR}/msvc/bullet.rc
+		${BULLET_PHYSICS_SOURCE_DIR}/build/bullet.rc
 	)
 ENDIF (USE_GLUT)
 
diff --git a/Demos/SoftDemo/CMakeLists.txt b/Demos/SoftDemo/CMakeLists.txt
index c92199df8..e819e4625 100644
--- a/Demos/SoftDemo/CMakeLists.txt
+++ b/Demos/SoftDemo/CMakeLists.txt
@@ -24,7 +24,7 @@ IF (USE_GLUT)
 		ADD_EXECUTABLE(AppSoftBodyDemo
 			main.cpp
 			SoftDemo.cpp
-			${BULLET_PHYSICS_SOURCE_DIR}/msvc/bullet.rc
+			${BULLET_PHYSICS_SOURCE_DIR}/build/bullet.rc
 		)
 	ELSE()
 		ADD_EXECUTABLE(AppSoftBodyDemo
diff --git a/Demos/SoftDemo/SoftDemo.cpp b/Demos/SoftDemo/SoftDemo.cpp
index 0a370c1e9..d25acfc72 100644
--- a/Demos/SoftDemo/SoftDemo.cpp
+++ b/Demos/SoftDemo/SoftDemo.cpp
@@ -52,7 +52,7 @@ static btRigidBody* staticBody = 0;
 static float waveheight = 5.f;
 
 const float TRIANGLE_SIZE=8.f;
-unsigned int		current_demo=7;
+unsigned int		current_demo=20;
 #define DEMO_MODE_TIMEOUT 15.f //15 seconds for each demo
 
 
diff --git a/Demos/ThreadingDemo/CMakeLists.txt b/Demos/ThreadingDemo/CMakeLists.txt
index b27cce4cd..54bdf314d 100644
--- a/Demos/ThreadingDemo/CMakeLists.txt
+++ b/Demos/ThreadingDemo/CMakeLists.txt
@@ -26,7 +26,7 @@ BulletMultiThreaded BulletDynamics  BulletCollision LinearMath
 IF (WIN32)
 ADD_EXECUTABLE(AppThreadingDemo
 		main.cpp
-		${BULLET_PHYSICS_SOURCE_DIR}/msvc/bullet.rc
+		${BULLET_PHYSICS_SOURCE_DIR}/build/bullet.rc
 	)
 ELSE()
 	ADD_EXECUTABLE(AppThreadingDemo
diff --git a/Demos/ThreadingDemo/main.cpp b/Demos/ThreadingDemo/main.cpp
index 0bd6d6838..ac504ecd6 100644
--- a/Demos/ThreadingDemo/main.cpp
+++ b/Demos/ThreadingDemo/main.cpp
@@ -59,7 +59,12 @@ btThreadSupportInterface* createThreadSupport(int numThreads)
 
 struct	SampleArgs
 {
+	SampleArgs()
+		:m_fakeWork(1)
+	{
+	}
 	btCriticalSection* m_cs;
+	float m_fakeWork;
 };
 
 struct SampleThreadLocalStorage
@@ -86,6 +91,9 @@ void	SampleThreadFunc(void* userPtr,void* lsMemory)
 		{
 			printf("thread %d processed number %d\n",localStorage->threadId, count);
 		}
+		//do some fake work
+		for (int i=0;i<1000000;i++)
+			args->m_fakeWork = 1.21*args->m_fakeWork;
 		workLeft = count>0;
 	}
 	printf("finished\n");
@@ -110,7 +118,7 @@ void*	SamplelsMemoryFunc()
 
 int main(int argc,char** argv)
 {
-	int numThreads = 4;
+	int numThreads = 8;
 
 	btThreadSupportInterface* threadSupport = createThreadSupport(numThreads);
 
diff --git a/Demos/UserCollisionAlgorithm/CMakeLists.txt b/Demos/UserCollisionAlgorithm/CMakeLists.txt
index 47dc74f89..cfe9e4a17 100644
--- a/Demos/UserCollisionAlgorithm/CMakeLists.txt
+++ b/Demos/UserCollisionAlgorithm/CMakeLists.txt
@@ -22,7 +22,7 @@ LINK_LIBRARIES(
 IF (WIN32)
 	ADD_EXECUTABLE(AppUserCollisionAlgorithm
 		UserCollisionAlgorithm.cpp
-		${BULLET_PHYSICS_SOURCE_DIR}/msvc/bullet.rc
+		${BULLET_PHYSICS_SOURCE_DIR}/build/bullet.rc
 	)
 ELSE()
 	ADD_EXECUTABLE(AppUserCollisionAlgorithm
diff --git a/Demos/VoronoiFractureDemo/CMakeLists.txt b/Demos/VoronoiFractureDemo/CMakeLists.txt
index 7c06df228..478925439 100644
--- a/Demos/VoronoiFractureDemo/CMakeLists.txt
+++ b/Demos/VoronoiFractureDemo/CMakeLists.txt
@@ -27,7 +27,7 @@ ADD_EXECUTABLE(AppVoronoiFractureDemo
 		main.cpp
 		VoronoiFractureDemo.cpp 
 		VoronoiFractureDemo.h
-		${BULLET_PHYSICS_SOURCE_DIR}/msvc/bullet.rc
+		${BULLET_PHYSICS_SOURCE_DIR}/build/bullet.rc
 	)
 ELSE()
 	ADD_EXECUTABLE(AppVoronoiFractureDemo
@@ -74,7 +74,7 @@ ELSE (USE_GLUT)
 		Win32VoronoiFractureDemo.cpp
 		VoronoiFractureDemo.cpp 
 		VoronoiFractureDemo.h
-		${BULLET_PHYSICS_SOURCE_DIR}/msvc/bullet.rc
+		${BULLET_PHYSICS_SOURCE_DIR}/build/bullet.rc
 	)
 	
 	
diff --git a/Demos/premake4.lua b/Demos/premake4.lua
index 4900db675..b0d874e13 100644
--- a/Demos/premake4.lua
+++ b/Demos/premake4.lua
@@ -16,7 +16,7 @@ function createDemos( demos, incdirs, linknames)
 	 		links { "opengl32" }
 			includedirs{	"../Glut"	}
 	 		libdirs {"../Glut"}
-	 		files   { "../msvc/bullet.rc" }
+	 		files   { "../build/bullet.rc" }
 	 		
 	 		configuration {"Windows", "x32"}
 				links {"glew32s","glut32"}
@@ -65,7 +65,6 @@ end
     "GenericJointDemo",
     "GimpactTestDemo",
     "GjkConvexCastDemo",
-    "HelloWorld",
     "InternalEdgeDemo",
     "MovingConcaveDemo",
     "MultiMaterialDemo",
diff --git a/Extras/GIMPACTUtils/btGImpactConvexDecompositionShape.cpp b/Extras/GIMPACTUtils/btGImpactConvexDecompositionShape.cpp
index 44d0a6de8..f7022ed2f 100644
--- a/Extras/GIMPACTUtils/btGImpactConvexDecompositionShape.cpp
+++ b/Extras/GIMPACTUtils/btGImpactConvexDecompositionShape.cpp
@@ -164,7 +164,7 @@ public:
 		float cpercent     = 5;
 		float ppercent     = 15;
 		unsigned int maxv  = 16;
-		float skinWidth    = 0.0;
+		float skinWidth    = 0.0f;
 
 
 		ConvexDecomposition::DecompDesc desc;
diff --git a/Extras/RigidBodyGpuPipeline/build/vs2008.bat b/Extras/RigidBodyGpuPipeline/build/vs2008.bat
index 02665f970..50c8617d7 100644
--- a/Extras/RigidBodyGpuPipeline/build/vs2008.bat
+++ b/Extras/RigidBodyGpuPipeline/build/vs2008.bat
@@ -4,7 +4,7 @@ rem premake4 --no-pedemos vs2008
 rem premake4 --no-bulletlibs --no-pelibs vs2008
 rem premake4 --with-nacl vs2008
 
-..\..\..\msvc\premake4 vs2008
+..\..\..\build\premake4 vs2008
 mkdir vs2008\cache
 
 pause
\ No newline at end of file
diff --git a/Extras/RigidBodyGpuPipeline/build/vs2010.bat b/Extras/RigidBodyGpuPipeline/build/vs2010.bat
index 9122bab9a..6b77748fb 100644
--- a/Extras/RigidBodyGpuPipeline/build/vs2010.bat
+++ b/Extras/RigidBodyGpuPipeline/build/vs2010.bat
@@ -1,5 +1,5 @@
 
-..\..\..\msvc\premake4 vs2010
+..\..\..\build\premake4 vs2010
 
 mkdir vs2010\cache
 pause
\ No newline at end of file
diff --git a/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.cpp b/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.cpp
index 7229b8e95..ca08ef5a6 100644
--- a/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.cpp
+++ b/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.cpp
@@ -267,9 +267,7 @@ void bt3dGridBroadphaseOCL::allocateBuffers()
 	m_dPairsChanged = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, memSize, NULL, &ciErrNum);
 	GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
 
-	m_dPairsContiguous = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, memSize, NULL, &ciErrNum);
-	GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
-
+	
 	memSize = 3 * 4 * sizeof(float);
 	m_dBpParams = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, memSize, NULL, &ciErrNum);
 	GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
diff --git a/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.h b/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.h
index dee297c29..c12efa37a 100644
--- a/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.h
@@ -92,7 +92,6 @@ public:
 protected:
 	cl_mem					m_dPairScanChanged;
 	cl_mem					m_dPairsChanged;
-	cl_mem					m_dPairsContiguous;
 	cl_mem					m_dBpParams;
 
 	adl::Device*			m_deviceHost;
diff --git a/Extras/RigidBodyGpuPipeline/opencl/broadphase_benchmark/btGridBroadphaseCL.cpp b/Extras/RigidBodyGpuPipeline/opencl/broadphase_benchmark/btGridBroadphaseCL.cpp
index 9e0ea62cc..4cd9264a7 100644
--- a/Extras/RigidBodyGpuPipeline/opencl/broadphase_benchmark/btGridBroadphaseCL.cpp
+++ b/Extras/RigidBodyGpuPipeline/opencl/broadphase_benchmark/btGridBroadphaseCL.cpp
@@ -63,8 +63,7 @@ btGridBroadphaseCl::btGridBroadphaseCl(	btOverlappingPairCache* overlappingPairC
 						maxSmallProxySize,maxSmallProxiesPerCell,
 						context,device,queue,deviceCL)			
 {
-	m_computeAabbKernel = m_deviceCL->getKernel(COMPUTE_AABB_KERNEL_PATH,"computeAabb","",spComputeAabbSource);
-
+	
 	m_countOverlappingPairs = m_deviceCL->getKernel(COMPUTE_AABB_KERNEL_PATH,"countOverlappingpairs","",spComputeAabbSource);
 
 	m_squeezePairCaches = m_deviceCL->getKernel(COMPUTE_AABB_KERNEL_PATH,"squeezePairCaches","",spComputeAabbSource);
diff --git a/Extras/RigidBodyGpuPipeline/opencl/broadphase_benchmark/btGridBroadphaseCL.h b/Extras/RigidBodyGpuPipeline/opencl/broadphase_benchmark/btGridBroadphaseCL.h
index 7f064488d..51a7d9f76 100644
--- a/Extras/RigidBodyGpuPipeline/opencl/broadphase_benchmark/btGridBroadphaseCL.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/broadphase_benchmark/btGridBroadphaseCL.h
@@ -35,7 +35,6 @@ class btGridBroadphaseCl : public bt3dGridBroadphaseOCL
 {
 protected:
 
-	adl::Kernel*			m_computeAabbKernel;
 	adl::Kernel*			m_countOverlappingPairs;
 	adl::Kernel*			m_squeezePairCaches;
 
diff --git a/Extras/Serialize/BulletWorldImporter/btBulletWorldImporter.cpp b/Extras/Serialize/BulletWorldImporter/btBulletWorldImporter.cpp
index 7e58f3c53..dab5d6764 100644
--- a/Extras/Serialize/BulletWorldImporter/btBulletWorldImporter.cpp
+++ b/Extras/Serialize/BulletWorldImporter/btBulletWorldImporter.cpp
@@ -881,8 +881,8 @@ bool	btBulletWorldImporter::convertAllObjects(  bParse::btBulletFile* bulletFile
 				startTransform.deSerializeDouble(colObjData->m_worldTransform);
 				btCollisionShape* shape = (btCollisionShape*)*shapePtr;
 				btCollisionObject* body = createCollisionObject(startTransform,shape,colObjData->m_name);
-				body->setFriction(colObjData->m_friction);
-				body->setRestitution(colObjData->m_restitution);
+				body->setFriction(btScalar(colObjData->m_friction));
+				body->setRestitution(btScalar(colObjData->m_restitution));
 				
 #ifdef USE_INTERNAL_EDGE_UTILITY
 				if (shape->getShapeType() == TRIANGLE_MESH_SHAPE_PROXYTYPE)
diff --git a/Extras/Serialize/ReadBulletSample/CMakeLists.txt b/Extras/Serialize/ReadBulletSample/CMakeLists.txt
index eca415d76..e819b9e5e 100644
--- a/Extras/Serialize/ReadBulletSample/CMakeLists.txt
+++ b/Extras/Serialize/ReadBulletSample/CMakeLists.txt
@@ -8,7 +8,7 @@ LINK_LIBRARIES(
 )
 IF (WIN32)
 	SET(ADDITIONAL_SRC
-		${BULLET_PHYSICS_SOURCE_DIR}/msvc/bullet.rc
+		${BULLET_PHYSICS_SOURCE_DIR}/build/bullet.rc
 	)
 ENDIF()
 
diff --git a/Test/Info.plist b/Test/Info.plist
new file mode 100644
index 000000000..4785b60c5
--- /dev/null
+++ b/Test/Info.plist
@@ -0,0 +1,47 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>CFBundleDevelopmentRegion</key>
+	<string>en</string>
+	<key>CFBundleDisplayName</key>
+	<string>${PRODUCT_NAME}</string>
+	<key>CFBundleExecutable</key>
+	<string>${EXECUTABLE_NAME}</string>
+	<key>CFBundleIconFiles</key>
+	<array/>
+	<key>CFBundleIdentifier</key>
+	<string>Apple.${PRODUCT_NAME:rfc1034identifier}</string>
+	<key>CFBundleInfoDictionaryVersion</key>
+	<string>6.0</string>
+	<key>CFBundleName</key>
+	<string>${PRODUCT_NAME}</string>
+	<key>CFBundlePackageType</key>
+	<string>APPL</string>
+	<key>CFBundleShortVersionString</key>
+	<string>1.0</string>
+	<key>CFBundleSignature</key>
+	<string>????</string>
+	<key>CFBundleVersion</key>
+	<string>1.0</string>
+	<key>LSRequiresIPhoneOS</key>
+	<true/>
+	<key>UIRequiredDeviceCapabilities</key>
+	<array>
+		<string>armv7</string>
+	</array>
+	<key>UISupportedInterfaceOrientations</key>
+	<array>
+		<string>UIInterfaceOrientationPortrait</string>
+		<string>UIInterfaceOrientationLandscapeLeft</string>
+		<string>UIInterfaceOrientationLandscapeRight</string>
+	</array>
+	<key>UISupportedInterfaceOrientations~ipad</key>
+	<array>
+		<string>UIInterfaceOrientationPortrait</string>
+		<string>UIInterfaceOrientationPortraitUpsideDown</string>
+		<string>UIInterfaceOrientationLandscapeLeft</string>
+		<string>UIInterfaceOrientationLandscapeRight</string>
+	</array>
+</dict>
+</plist>
diff --git a/Test/README.txt b/Test/README.txt
new file mode 100644
index 000000000..c231a5491
--- /dev/null
+++ b/Test/README.txt
@@ -0,0 +1,28 @@
+1) Add a .cpp and .h file for your test function.  The function should conform to:
+
+    #ifdef __cplusplus
+        extern "C" {
+    #endif
+
+        #include "Utils.h"
+        #include "main.h"
+        #include "vector.h"
+
+        // Your test function
+        int MyTestFunc(void);
+
+    #ifdef __cplusplus
+        }
+    #endif
+
+    The rest of the program doesn't care or know what you do in MyTestFunc, except that MyTestFunc should return non-zero in case of failure in MyTestFunc. There are some handy functions in Utils.h that you might want to use. Please use vlog instead of printf to print stuff, and random_number32/64() in place of rand(), so I can multithread later if it comes to that.  There are some read-only globals that you may wish to respond to, declared in Utils.h:
+
+        gReportAverageTimes	if you do timing, report times as averages instead of best times if non-zero
+        gExitOnError		if non-zero, return non-zero immediately if you encounter an error
+        gAppName			(const char*) the name of the application
+
+    As a convenience, vector.h has some cross platform vector types declared and will correctly include various vector headers according to compiler flag.
+
+
+2) Add an entry to gTestList in TestList.cpp for your test function, so the rest of the app knows to call it
+
diff --git a/Test/Source/TestList.cpp b/Test/Source/TestList.cpp
new file mode 100644
index 000000000..7e342b1bf
--- /dev/null
+++ b/Test/Source/TestList.cpp
@@ -0,0 +1,97 @@
+//
+//  TestList.c
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#include <stdlib.h>
+#include "TestList.h"
+
+#include "Test_qtmul.h"
+#include "Test_qtmulQV3.h"
+#include "Test_qtmulV3Q.h"
+#include "Test_qtdot.h"
+#include "Test_qtnorm.h"
+
+#include "Test_v3dot.h"
+#include "Test_v3sdiv.h"
+#include "Test_v3norm.h"
+#include "Test_v3cross.h"
+#include "Test_v3triple.h"
+#include "Test_v3interp.h"
+#include "Test_v3lerp.h"
+#include "Test_v3skew.h"
+#include "Test_v3div.h"
+#include "Test_v3rotate.h"
+
+#include "Test_maxdot.h"
+#include "Test_mindot.h"
+#include "Test_dot3.h"
+#include "Test_3x3transpose.h"
+#include "Test_3x3transposeTimes.h"
+#include "Test_3x3timesTranspose.h"
+#include "Test_3x3mulM.h"
+#include "Test_3x3mulM1M2.h"
+#include "Test_3x3mulMV.h"
+#include "Test_3x3mulVM.h"
+#include "Test_3x3setRot.h"
+#include "Test_3x3getRot.h"
+
+#include "Test_btDbvt.h"
+#include "Test_quat_aos_neon.h"
+
+#include "LinearMath/btScalar.h"
+#define ENTRY( _name, _func )       { _name, _func }
+
+//
+// Test functions have the form  int (*TestFunc)( void )
+// They return a non-zero result in case of failure.
+//
+// Please see handy stuff in Utils.h, vector.h when writing your test code.
+//
+#if defined (BT_USE_NEON) || defined (BT_USE_SSE_IN_API)
+
+TestDesc  gTestList[] = 
+{
+    ENTRY( "maxdot", Test_maxdot ),
+    ENTRY( "mindot", Test_mindot ),
+
+    ENTRY( "qtmul", Test_qtmul ),
+    ENTRY( "qtmulQV3", Test_qtmulQV3 ),
+    ENTRY( "qtmulV3Q", Test_qtmulV3Q ),
+    ENTRY( "qtdot", Test_qtdot ),
+    ENTRY( "qtnorm", Test_qtnorm ),
+
+    ENTRY( "v3dot", Test_v3dot ),
+    ENTRY( "v3sdiv", Test_v3sdiv ),
+    ENTRY( "v3norm", Test_v3norm ),
+    ENTRY( "v3cross", Test_v3cross ),
+    ENTRY( "v3triple", Test_v3triple ),
+    ENTRY( "v3interp", Test_v3interp ),
+    ENTRY( "v3lerp", Test_v3lerp ),
+    ENTRY( "v3skew", Test_v3skew ),
+    ENTRY( "v3div", Test_v3div ),
+    ENTRY( "v3rotate", Test_v3rotate ),
+
+    ENTRY( "dot3", Test_dot3 ),
+    ENTRY( "3x3transpose", Test_3x3transpose ),
+    ENTRY( "3x3transposeTimes", Test_3x3transposeTimes ),
+    ENTRY( "3x3timesTranspose", Test_3x3timesTranspose ),
+    ENTRY( "3x3mulM", Test_3x3mulM ),
+    ENTRY( "3x3mulM1M2", Test_3x3mulM1M2 ),
+    ENTRY( "3x3mulMV", Test_3x3mulMV ),
+    ENTRY( "3x3mulVM", Test_3x3mulMV ),
+    ENTRY( "3x3setRot", Test_3x3setRot ),
+    ENTRY( "3x3getRot", Test_3x3getRot ),
+  
+    ENTRY( "btDbvt", Test_btDbvt ),
+    ENTRY("quat_aos_neon", Test_quat_aos_neon),
+    
+    { NULL, NULL }
+};
+#else
+TestDesc  gTestList[]={{NULL,NULL}};
+
+#endif
+
diff --git a/Test/Source/TestList.h b/Test/Source/TestList.h
new file mode 100644
index 000000000..f66d4c67c
--- /dev/null
+++ b/Test/Source/TestList.h
@@ -0,0 +1,28 @@
+//
+//  TestList.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_TestList_h
+#define BulletTest_TestList_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+
+    
+typedef struct TestDesc
+{
+    const char *name;
+    int (*test_func)(void);     // return 0 for success, non-zero for failure
+}TestDesc;
+
+extern TestDesc  gTestList[];
+    
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/Test/Source/Tests/Test_3x3getRot.cpp b/Test/Source/Tests/Test_3x3getRot.cpp
new file mode 100644
index 000000000..dbf2241e7
--- /dev/null
+++ b/Test/Source/Tests/Test_3x3getRot.cpp
@@ -0,0 +1,158 @@
+//
+//  Test_3x3getRot.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+#include "Test_3x3getRot.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btMatrix3x3.h>
+
+#define LOOPCOUNT 1000
+#define ARRAY_SIZE 128
+
+static inline btSimdFloat4 rand_f4(void)
+{
+    return btAssign128( RANDF_m1p1, RANDF_m1p1, RANDF_m1p1, BT_NAN );      // w channel NaN
+}
+
+static inline btSimdFloat4 qtNAN_f4(void)
+{
+    return btAssign128( BT_NAN, BT_NAN, BT_NAN, BT_NAN );
+}
+
+static void M3x3getRot_ref( const btMatrix3x3 &m, btQuaternion &q )
+{
+    btVector3 m_el[3] = { m[0], m[1], m[2] };
+
+    btScalar trace = m_el[0].x() + m_el[1].y() + m_el[2].z();
+
+    btScalar temp[4];
+
+    if (trace > btScalar(0.0)) 
+    {
+        btScalar s = btSqrt(trace + btScalar(1.0));
+        temp[3]=(s * btScalar(0.5));
+        s = btScalar(0.5) / s;
+
+        temp[0]=((m_el[2].y() - m_el[1].z()) * s);
+        temp[1]=((m_el[0].z() - m_el[2].x()) * s);
+        temp[2]=((m_el[1].x() - m_el[0].y()) * s);
+    } 
+    else 
+    {
+        int i = m_el[0].x() < m_el[1].y() ? 
+            (m_el[1].y() < m_el[2].z() ? 2 : 1) :
+            (m_el[0].x() < m_el[2].z() ? 2 : 0); 
+        int j = (i + 1) % 3;  
+        int k = (i + 2) % 3;
+
+        btScalar s = btSqrt(m_el[i][i] - m_el[j][j] - m_el[k][k] + btScalar(1.0));
+        temp[i] = s * btScalar(0.5);
+        s = btScalar(0.5) / s;
+
+        temp[3] = (m_el[k][j] - m_el[j][k]) * s;
+        temp[j] = (m_el[j][i] + m_el[i][j]) * s;
+        temp[k] = (m_el[k][i] + m_el[i][k]) * s;
+    }
+    q.setValue(temp[0],temp[1],temp[2],temp[3]);
+}
+
+static int operator!= ( const btQuaternion &a, const btQuaternion &b )
+{
+    if( fabs(a.x() - b.x()) + 
+        fabs(a.y() - b.y()) +
+        fabs(a.z() - b.z()) +
+        fabs(a.w() - b.w()) > FLT_EPSILON * 4)
+        return 1;
+    
+    return 0;
+}
+
+int Test_3x3getRot(void)
+{
+    // Init an array flanked by guard pages
+    btMatrix3x3     in1[ARRAY_SIZE];
+    btQuaternion    out[ARRAY_SIZE];
+    btQuaternion    out2[ARRAY_SIZE];
+    
+    // Init the data
+    size_t i, j;
+    for( i = 0; i < ARRAY_SIZE; i++ )
+    {
+        in1[i] = btMatrix3x3(rand_f4(), rand_f4(), rand_f4() );   
+        out[i] = btQuaternion(qtNAN_f4());   
+        out2[i] = btQuaternion(qtNAN_f4());
+        
+        M3x3getRot_ref(in1[i], out[i]);
+        in1[i].getRotation(out2[i]);
+
+        if( out[i] != out2[i] )
+        {
+            vlog( "Error - M3x3getRot result error! ");
+            vlog( "failure @ %ld\n", i);
+         	vlog( 	"\ncorrect = (%10.7f, %10.7f, %10.7f, %10.7f) "
+					"\ntested  = (%10.7f, %10.7f, %10.7f, %10.7f) \n", 
+					out[i].x(), out[i].y(), out[i].z(), out[i].w(),
+					out2[i].x(), out2[i].y(), out2[i].z(), out2[i].w());
+		
+            return -1;
+        }
+    }
+    
+    uint64_t scalarTime, vectorTime;
+    uint64_t startTime, bestTime, currentTime;
+    bestTime = ~(bestTime&0);//-1ULL;
+    scalarTime = 0;
+    for (j = 0; j < LOOPCOUNT; j++) 
+    {
+        startTime = ReadTicks();
+        for( i = 0; i < ARRAY_SIZE; i++ )
+            M3x3getRot_ref(in1[i], out[i]);
+        currentTime = ReadTicks() - startTime;
+        scalarTime += currentTime;
+        if( currentTime < bestTime )
+            bestTime = currentTime;
+    }
+    if( 0 == gReportAverageTimes )
+        scalarTime = bestTime;        
+    else
+        scalarTime /= LOOPCOUNT;
+    
+    bestTime = ~(bestTime&0);//-1ULL;
+    vectorTime = 0;
+    for (j = 0; j < LOOPCOUNT; j++) 
+    {
+        startTime = ReadTicks();
+        for( i = 0; i < ARRAY_SIZE; i++ )
+        {
+            in1[i].getRotation(out2[i]);
+        }
+        currentTime = ReadTicks() - startTime;
+        vectorTime += currentTime;
+        if( currentTime < bestTime )
+            bestTime = currentTime;
+    }
+    if( 0 == gReportAverageTimes )
+        vectorTime = bestTime;        
+    else
+        vectorTime /= LOOPCOUNT;
+    
+    vlog( "Timing:\n" );
+    vlog( "\t    scalar\t    vector\n" );
+    vlog( "\t%10.2f\t%10.2f\n", TicksToCycles( scalarTime ) / ARRAY_SIZE, TicksToCycles( vectorTime ) / ARRAY_SIZE );
+    
+    return 0;
+}
+
+#endif//BT_USE_SSE
diff --git a/Test/Source/Tests/Test_3x3getRot.h b/Test/Source/Tests/Test_3x3getRot.h
new file mode 100644
index 000000000..1998763bb
--- /dev/null
+++ b/Test/Source/Tests/Test_3x3getRot.h
@@ -0,0 +1,22 @@
+//
+//  Test_3x3getRot.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_3x3getRot_h
+#define BulletTest_Test_3x3getRot_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+
+int Test_3x3getRot(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+    
+#endif
diff --git a/Test/Source/Tests/Test_3x3mulM.cpp b/Test/Source/Tests/Test_3x3mulM.cpp
new file mode 100644
index 000000000..51a013e1f
--- /dev/null
+++ b/Test/Source/Tests/Test_3x3mulM.cpp
@@ -0,0 +1,169 @@
+//
+//  Test_3x3mulM.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+#include "Test_3x3mulM.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btMatrix3x3.h>
+
+#define LOOPCOUNT 1000
+#define ARRAY_SIZE 128
+
+static inline btSimdFloat4 rand_f4(void)
+{
+    return btAssign128( RANDF_01, RANDF_01, RANDF_01, BT_NAN );      // w channel NaN
+}
+
+static btMatrix3x3 M3x3mulM_ref( btMatrix3x3 &in, const btMatrix3x3 &m )
+{
+    btVector3 m_el[3] = { in[0], in[1], in[2] };
+
+	in.setValue(
+        m.tdotx(m_el[0]), m.tdoty(m_el[0]), m.tdotz(m_el[0]),
+		m.tdotx(m_el[1]), m.tdoty(m_el[1]), m.tdotz(m_el[1]),
+		m.tdotx(m_el[2]), m.tdoty(m_el[2]), m.tdotz(m_el[2]));
+
+    return in;
+}
+
+static SIMD_FORCE_INLINE	bool	fuzzyEqualSlow(const btVector3& ref, const btVector3& other)
+{
+	const btScalar epsilon = SIMD_EPSILON;
+	return ((btFabs(ref.m_floats[3]-other.m_floats[3])<=epsilon) &&
+            (btFabs(ref.m_floats[2]-other.m_floats[2])<=epsilon) && 
+            (btFabs(ref.m_floats[1]-other.m_floats[1])<=epsilon) && 
+            (btFabs(ref.m_floats[0]-other.m_floats[0])<=epsilon));
+}
+	
+
+static int operator!= ( const btMatrix3x3 &a, const btMatrix3x3 &b )
+{
+    if( a.getRow(0) != b.getRow(0) )
+	{
+		if (!fuzzyEqualSlow(a.getRow(0),b.getRow(0)))
+		{
+			return 1;
+		}
+	}
+    if( a.getRow(1) != b.getRow(1) )
+	{
+	    if( !fuzzyEqualSlow(a.getRow(1),b.getRow(1)) )
+	        return 1;
+	}
+    if( a.getRow(2) != b.getRow(2) )
+	{
+		if( !fuzzyEqualSlow(a.getRow(2),b.getRow(2)) )
+		{
+			return 1;
+		}
+	}
+    return 0;
+}
+
+int Test_3x3mulM(void)
+{
+    // Init an array flanked by guard pages
+    btMatrix3x3 in1[ARRAY_SIZE];
+    btMatrix3x3 in2[ARRAY_SIZE];
+    btMatrix3x3 in3[ARRAY_SIZE];
+    btMatrix3x3 out[ARRAY_SIZE];
+    btMatrix3x3 out2[ARRAY_SIZE];
+    
+    // Init the data
+    size_t i, j;
+    for( i = 0; i < ARRAY_SIZE; i++ )
+    {
+        in1[i] = btMatrix3x3(rand_f4(), rand_f4(), rand_f4() );   
+        in2[i] = btMatrix3x3(rand_f4(), rand_f4(), rand_f4() );   
+        in3[i] = in1[i];
+        
+        out[i] = M3x3mulM_ref(in1[i], in2[i]);
+        out2[i] = (in3[i] *= in2[i]);
+        
+        if( out[i] != out2[i] )
+        {
+ 			vlog( "Error - M3x3mulM result error! ");
+            vlog( "failure @ %ld\n", i);
+            btVector3 m0, m1, m2;
+            m0 = out[i].getRow(0);
+            m1 = out[i].getRow(1);
+            m2 = out[i].getRow(2);
+            
+            vlog(   "\ncorrect = (%10.4f, %10.4f, %10.4f, %10.4f) "
+					"\n          (%10.4f, %10.4f, %10.4f, %10.4f) "
+                    "\n          (%10.4f, %10.4f, %10.4f, %10.4f) \n",
+                    m0.m_floats[0], m0.m_floats[1], m0.m_floats[2], m0.m_floats[3], 
+                    m1.m_floats[0], m1.m_floats[1], m1.m_floats[2], m1.m_floats[3],
+                    m2.m_floats[0], m2.m_floats[1], m2.m_floats[2], m2.m_floats[3]); 
+
+            m0 = out2[i].getRow(0);
+            m1 = out2[i].getRow(1);
+            m2 = out2[i].getRow(2);
+					
+            vlog(   "\ntested  = (%10.4f, %10.4f, %10.4f, %10.4f) "
+					"\n          (%10.4f, %10.4f, %10.4f, %10.4f) " 
+					"\n          (%10.4f, %10.4f, %10.4f, %10.4f) \n", 
+					m0.m_floats[0], m0.m_floats[1], m0.m_floats[2], m0.m_floats[3], 
+                    m1.m_floats[0], m1.m_floats[1], m1.m_floats[2], m1.m_floats[3],
+                    m2.m_floats[0], m2.m_floats[1], m2.m_floats[2], m2.m_floats[3]); 
+
+            return -1;
+        }
+    }
+    
+    uint64_t scalarTime, vectorTime;
+    uint64_t startTime, bestTime, currentTime;
+    bestTime = -1LL;
+    scalarTime = 0;
+    for (j = 0; j < LOOPCOUNT; j++) 
+    {
+        startTime = ReadTicks();
+        for( i = 0; i < ARRAY_SIZE; i++ )
+            out[i] = M3x3mulM_ref(in1[i], in2[i]);
+        currentTime = ReadTicks() - startTime;
+        scalarTime += currentTime;
+        if( currentTime < bestTime )
+            bestTime = currentTime;
+    }
+    if( 0 == gReportAverageTimes )
+        scalarTime = bestTime;        
+    else
+        scalarTime /= LOOPCOUNT;
+    
+    bestTime = -1LL;
+    vectorTime = 0;
+    for (j = 0; j < LOOPCOUNT; j++) 
+    {
+        startTime = ReadTicks();
+        for( i = 0; i < ARRAY_SIZE; i++ )
+            out2[i] = (in3[i] *= in2[i]);
+        currentTime = ReadTicks() - startTime;
+        vectorTime += currentTime;
+        if( currentTime < bestTime )
+            bestTime = currentTime;
+    }
+    if( 0 == gReportAverageTimes )
+        vectorTime = bestTime;        
+    else
+        vectorTime /= LOOPCOUNT;
+    
+    vlog( "Timing:\n" );
+    vlog( "\t    scalar\t    vector\n" );
+    vlog( "\t%10.2f\t%10.2f\n", TicksToCycles( scalarTime ) / ARRAY_SIZE, TicksToCycles( vectorTime ) / ARRAY_SIZE );
+    
+    return 0;
+}
+
+#endif //BT_USE_SSE
diff --git a/Test/Source/Tests/Test_3x3mulM.h b/Test/Source/Tests/Test_3x3mulM.h
new file mode 100644
index 000000000..fb5128e66
--- /dev/null
+++ b/Test/Source/Tests/Test_3x3mulM.h
@@ -0,0 +1,22 @@
+//
+//  Test_3x3mulM.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_3x3mulM_h
+#define BulletTest_Test_3x3mulM_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+
+int Test_3x3mulM(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+    
+#endif
diff --git a/Test/Source/Tests/Test_3x3mulM1M2.cpp b/Test/Source/Tests/Test_3x3mulM1M2.cpp
new file mode 100644
index 000000000..c435c546c
--- /dev/null
+++ b/Test/Source/Tests/Test_3x3mulM1M2.cpp
@@ -0,0 +1,164 @@
+//
+//  Test_3x3mulM1M2.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+
+#include "Test_3x3mulM1M2.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btMatrix3x3.h>
+
+#define LOOPCOUNT 1000
+#define ARRAY_SIZE 128
+
+static inline btSimdFloat4 rand_f4(void)
+{
+    return btAssign128( RANDF_01, RANDF_01, RANDF_01, BT_NAN );      // w channel NaN
+}
+
+static btMatrix3x3 M3x3mulM1M2_ref( const btMatrix3x3 &m1, const btMatrix3x3 &m2 )
+{
+	return btMatrix3x3(
+        m2.tdotx(m1[0]), m2.tdoty(m1[0]), m2.tdotz(m1[0]),
+		m2.tdotx(m1[1]), m2.tdoty(m1[1]), m2.tdotz(m1[1]),
+		m2.tdotx(m1[2]), m2.tdoty(m1[2]), m2.tdotz(m1[2]));
+}
+
+static bool	fuzzyEqualSlow(const btVector3& ref, const btVector3& other)
+{
+	const btScalar epsilon = SIMD_EPSILON;
+	return ((btFabs(ref.m_floats[3]-other.m_floats[3])<=epsilon) &&
+            (btFabs(ref.m_floats[2]-other.m_floats[2])<=epsilon) && 
+            (btFabs(ref.m_floats[1]-other.m_floats[1])<=epsilon) && 
+            (btFabs(ref.m_floats[0]-other.m_floats[0])<=epsilon));
+}
+	
+
+static int operator!= ( const btMatrix3x3 &a, const btMatrix3x3 &b )
+{
+    if( a.getRow(0) != b.getRow(0) )
+	{
+		if (!fuzzyEqualSlow(a.getRow(0),b.getRow(0)))
+		{
+			return 1;
+		}
+	}
+    if( a.getRow(1) != b.getRow(1) )
+	{
+	    if( !fuzzyEqualSlow(a.getRow(1),b.getRow(1)) )
+	        return 1;
+	}
+    if( a.getRow(2) != b.getRow(2) )
+	{
+		if( !fuzzyEqualSlow(a.getRow(2),b.getRow(2)) )
+		{
+			return 1;
+		}
+	}
+    return 0;
+}
+
+int Test_3x3mulM1M2(void)
+{
+    // Init an array flanked by guard pages
+    btMatrix3x3 in1[ARRAY_SIZE];
+    btMatrix3x3 in2[ARRAY_SIZE];
+    btMatrix3x3 out[ARRAY_SIZE];
+    btMatrix3x3 out2[ARRAY_SIZE];
+    
+    // Init the data
+    size_t i, j;
+    for( i = 0; i < ARRAY_SIZE; i++ )
+    {
+        in1[i] = btMatrix3x3(rand_f4(), rand_f4(), rand_f4() );   
+        in2[i] = btMatrix3x3(rand_f4(), rand_f4(), rand_f4() );   
+        
+        out[i] = M3x3mulM1M2_ref(in1[i], in2[i]);
+        out2[i] = (in1[i] * in2[i]);
+        
+        if( out[i] != out2[i] )
+        {
+ 			vlog( "Error - M3x3mulM1M2 result error! ");
+            vlog( "failure @ %ld\n", i);
+            btVector3 m0, m1, m2;
+            m0 = out[i].getRow(0);
+            m1 = out[i].getRow(1);
+            m2 = out[i].getRow(2);
+            
+            vlog(   "\ncorrect = (%10.4f, %10.4f, %10.4f, %10.4f) "
+					"\n          (%10.4f, %10.4f, %10.4f, %10.4f) "
+                    "\n          (%10.4f, %10.4f, %10.4f, %10.4f) \n",
+                    m0.m_floats[0], m0.m_floats[1], m0.m_floats[2], m0.m_floats[3], 
+                    m1.m_floats[0], m1.m_floats[1], m1.m_floats[2], m1.m_floats[3],
+                    m2.m_floats[0], m2.m_floats[1], m2.m_floats[2], m2.m_floats[3]); 
+
+            m0 = out2[i].getRow(0);
+            m1 = out2[i].getRow(1);
+            m2 = out2[i].getRow(2);
+					
+            vlog(   "\ntested  = (%10.4f, %10.4f, %10.4f, %10.4f) "
+					"\n          (%10.4f, %10.4f, %10.4f, %10.4f) " 
+					"\n          (%10.4f, %10.4f, %10.4f, %10.4f) \n", 
+					m0.m_floats[0], m0.m_floats[1], m0.m_floats[2], m0.m_floats[3], 
+                    m1.m_floats[0], m1.m_floats[1], m1.m_floats[2], m1.m_floats[3],
+                    m2.m_floats[0], m2.m_floats[1], m2.m_floats[2], m2.m_floats[3]); 
+
+            return -1;
+        }
+    }
+    
+    uint64_t scalarTime, vectorTime;
+    uint64_t startTime, bestTime, currentTime;
+    bestTime = -1LL;
+    scalarTime = 0;
+    for (j = 0; j < LOOPCOUNT; j++) 
+    {
+        startTime = ReadTicks();
+        for( i = 0; i < ARRAY_SIZE; i++ )
+            out[i] = M3x3mulM1M2_ref(in1[i], in2[i]);
+        currentTime = ReadTicks() - startTime;
+        scalarTime += currentTime;
+        if( currentTime < bestTime )
+            bestTime = currentTime;
+    }
+    if( 0 == gReportAverageTimes )
+        scalarTime = bestTime;        
+    else
+        scalarTime /= LOOPCOUNT;
+    
+    bestTime = -1LL;
+    vectorTime = 0;
+    for (j = 0; j < LOOPCOUNT; j++) 
+    {
+        startTime = ReadTicks();
+        for( i = 0; i < ARRAY_SIZE; i++ )
+            out2[i] = (in1[i] * in2[i]);
+        currentTime = ReadTicks() - startTime;
+        vectorTime += currentTime;
+        if( currentTime < bestTime )
+            bestTime = currentTime;
+    }
+    if( 0 == gReportAverageTimes )
+        vectorTime = bestTime;        
+    else
+        vectorTime /= LOOPCOUNT;
+    
+    vlog( "Timing:\n" );
+    vlog( "\t    scalar\t    vector\n" );
+    vlog( "\t%10.2f\t%10.2f\n", TicksToCycles( scalarTime ) / ARRAY_SIZE, TicksToCycles( vectorTime ) / ARRAY_SIZE );
+    
+    return 0;
+}
+
+#endif //BT_USE_SSE
diff --git a/Test/Source/Tests/Test_3x3mulM1M2.h b/Test/Source/Tests/Test_3x3mulM1M2.h
new file mode 100644
index 000000000..a3075bd86
--- /dev/null
+++ b/Test/Source/Tests/Test_3x3mulM1M2.h
@@ -0,0 +1,22 @@
+//
+//  Test_3x3mulM1M2.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_3x3mulM1M2_h
+#define BulletTest_Test_3x3mulM1M2_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+
+int Test_3x3mulM1M2(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+    
+#endif
diff --git a/Test/Source/Tests/Test_3x3mulMV.cpp b/Test/Source/Tests/Test_3x3mulMV.cpp
new file mode 100644
index 000000000..ba07831e3
--- /dev/null
+++ b/Test/Source/Tests/Test_3x3mulMV.cpp
@@ -0,0 +1,112 @@
+//
+//  Test_3x3mulMV.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+
+#include "Test_3x3mulMV.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btMatrix3x3.h>
+
+#define LOOPCOUNT 1000
+#define ARRAY_SIZE 128
+
+static inline btSimdFloat4 rand_f4(void)
+{
+	return btAssign128(RANDF_01, RANDF_01, RANDF_01, BT_NAN );      // w channel NaN
+}
+
+static btVector3 M3x3mulMV_ref( const btMatrix3x3 &m, const btVector3 &v )
+{
+	return btVector3(m[0].dot(v), m[1].dot(v), m[2].dot(v));
+}
+
+int Test_3x3mulMV(void)
+{
+    // Init an array flanked by guard pages
+    btMatrix3x3 in1[ARRAY_SIZE];
+    btVector3   in2[ARRAY_SIZE];
+    btVector3   out[ARRAY_SIZE];
+    btVector3   out2[ARRAY_SIZE];
+    
+    // Init the data
+    size_t i, j;
+    for( i = 0; i < ARRAY_SIZE; i++ )
+    {
+        in1[i] = btMatrix3x3(rand_f4(), rand_f4(), rand_f4() );   
+        in2[i] = btVector3(rand_f4());   
+        
+        out[i] = M3x3mulMV_ref(in1[i], in2[i]);
+        out2[i] = (in1[i] * in2[i]);
+
+		if( fabsf(out[i].m_floats[0] - out2[i].m_floats[0]) + 
+			fabsf(out[i].m_floats[1] - out2[i].m_floats[1]) +
+			fabsf(out[i].m_floats[2] - out2[i].m_floats[2]) +
+			fabsf(out[i].m_floats[3] - out2[i].m_floats[3]) > FLT_EPSILON*4 )
+		{	
+			vlog( "Error - M3x3mulMV result error! ");
+            vlog( "failure @ %ld\n", i);
+			vlog( 	"\ncorrect = (%10.4f, %10.4f, %10.4f, %10.4f) "
+					"\ntested  = (%10.4f, %10.4f, %10.4f, %10.4f) \n", 
+					out[i].m_floats[0], out[i].m_floats[1], out[i].m_floats[2], out[i].m_floats[3], 
+					out2[i].m_floats[0], out2[i].m_floats[1], out2[i].m_floats[2], out2[i].m_floats[3]);
+		
+			return 1;
+		}
+    }
+    
+    uint64_t scalarTime, vectorTime;
+    uint64_t startTime, bestTime, currentTime;
+    bestTime = -1LL;
+    scalarTime = 0;
+    for (j = 0; j < LOOPCOUNT; j++) 
+    {
+        startTime = ReadTicks();
+        for( i = 0; i < ARRAY_SIZE; i++ )
+            out[i] = M3x3mulMV_ref(in1[i], in2[i]);
+        currentTime = ReadTicks() - startTime;
+        scalarTime += currentTime;
+        if( currentTime < bestTime )
+            bestTime = currentTime;
+    }
+    if( 0 == gReportAverageTimes )
+        scalarTime = bestTime;        
+    else
+        scalarTime /= LOOPCOUNT;
+    
+    bestTime = -1LL;
+    vectorTime = 0;
+    for (j = 0; j < LOOPCOUNT; j++) 
+    {
+        startTime = ReadTicks();
+        for( i = 0; i < ARRAY_SIZE; i++ )
+            out2[i] = (in1[i] * in2[i]);
+        currentTime = ReadTicks() - startTime;
+        vectorTime += currentTime;
+        if( currentTime < bestTime )
+            bestTime = currentTime;
+    }
+    if( 0 == gReportAverageTimes )
+        vectorTime = bestTime;        
+    else
+        vectorTime /= LOOPCOUNT;
+    
+    vlog( "Timing:\n" );
+    vlog( "\t    scalar\t    vector\n" );
+    vlog( "\t%10.2f\t%10.2f\n", TicksToCycles( scalarTime ) / ARRAY_SIZE, TicksToCycles( vectorTime ) / ARRAY_SIZE );
+    
+    return 0;
+}
+#endif //BT_USE_SSE
diff --git a/Test/Source/Tests/Test_3x3mulMV.h b/Test/Source/Tests/Test_3x3mulMV.h
new file mode 100644
index 000000000..877380d3d
--- /dev/null
+++ b/Test/Source/Tests/Test_3x3mulMV.h
@@ -0,0 +1,23 @@
+//
+//  Test_3x3mulMV.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_3x3mulMV_h
+#define BulletTest_Test_3x3mulMV_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+
+int Test_3x3mulMV(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+    
+#endif
+
diff --git a/Test/Source/Tests/Test_3x3mulVM.cpp b/Test/Source/Tests/Test_3x3mulVM.cpp
new file mode 100644
index 000000000..86db895ca
--- /dev/null
+++ b/Test/Source/Tests/Test_3x3mulVM.cpp
@@ -0,0 +1,112 @@
+//
+//  Test_3x3mulVM.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+
+#include "Test_3x3mulVM.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btMatrix3x3.h>
+
+#define LOOPCOUNT 1000
+#define ARRAY_SIZE 128
+
+static inline btSimdFloat4 rand_f4(void)
+{
+    return btAssign128( RANDF_01, RANDF_01, RANDF_01, BT_NAN );      // w channel NaN
+}
+
+static btVector3 M3x3mulVM_ref(  const btVector3 &v, const btMatrix3x3 &m)
+{
+	return btVector3(m.tdotx(v), m.tdoty(v), m.tdotz(v));
+}
+
+int Test_3x3mulVM(void)
+{
+    // Init an array flanked by guard pages
+    btVector3   in1[ARRAY_SIZE];
+    btMatrix3x3 in2[ARRAY_SIZE];
+    btVector3   out[ARRAY_SIZE];
+    btVector3   out2[ARRAY_SIZE];
+    
+    // Init the data
+    size_t i, j;
+    for( i = 0; i < ARRAY_SIZE; i++ )
+    {
+        in1[i] = btVector3(rand_f4());   
+        in2[i] = btMatrix3x3(rand_f4(), rand_f4(), rand_f4() );   
+        
+        out[i] = M3x3mulVM_ref(in1[i], in2[i]);
+        out2[i] = (in1[i] * in2[i]);
+
+		if( fabsf(out[i].m_floats[0] - out2[i].m_floats[0]) + 
+			fabsf(out[i].m_floats[1] - out2[i].m_floats[1]) +
+			fabsf(out[i].m_floats[2] - out2[i].m_floats[2]) +
+			fabsf(out[i].m_floats[3] - out2[i].m_floats[3]) > FLT_EPSILON*4 )
+		{	
+			vlog( "Error - M3x3mulVM result error! ");
+            vlog( "failure @ %ld\n", i);
+			vlog( 	"\ncorrect = (%10.4f, %10.4f, %10.4f, %10.4f) "
+					"\ntested  = (%10.4f, %10.4f, %10.4f, %10.4f) \n", 
+					out[i].m_floats[0], out[i].m_floats[1], out[i].m_floats[2], out[i].m_floats[3], 
+					out2[i].m_floats[0], out2[i].m_floats[1], out2[i].m_floats[2], out2[i].m_floats[3]);
+		
+			return 1;
+		}
+    }
+    
+    uint64_t scalarTime, vectorTime;
+    uint64_t startTime, bestTime, currentTime;
+    bestTime = -1LL;
+    scalarTime = 0;
+    for (j = 0; j < LOOPCOUNT; j++) 
+    {
+        startTime = ReadTicks();
+        for( i = 0; i < ARRAY_SIZE; i++ )
+            out[i] = M3x3mulVM_ref(in1[i], in2[i]);
+        currentTime = ReadTicks() - startTime;
+        scalarTime += currentTime;
+        if( currentTime < bestTime )
+            bestTime = currentTime;
+    }
+    if( 0 == gReportAverageTimes )
+        scalarTime = bestTime;        
+    else
+        scalarTime /= LOOPCOUNT;
+    
+    bestTime = -1LL;
+    vectorTime = 0;
+    for (j = 0; j < LOOPCOUNT; j++) 
+    {
+        startTime = ReadTicks();
+        for( i = 0; i < ARRAY_SIZE; i++ )
+            out2[i] = (in1[i] * in2[i]);
+        currentTime = ReadTicks() - startTime;
+        vectorTime += currentTime;
+        if( currentTime < bestTime )
+            bestTime = currentTime;
+    }
+    if( 0 == gReportAverageTimes )
+        vectorTime = bestTime;        
+    else
+        vectorTime /= LOOPCOUNT;
+    
+    vlog( "Timing:\n" );
+    vlog( "\t    scalar\t    vector\n" );
+    vlog( "\t%10.2f\t%10.2f\n", TicksToCycles( scalarTime ) / ARRAY_SIZE, TicksToCycles( vectorTime ) / ARRAY_SIZE );
+    
+    return 0;
+}
+#endif //BT_USE_SSE
diff --git a/Test/Source/Tests/Test_3x3mulVM.h b/Test/Source/Tests/Test_3x3mulVM.h
new file mode 100644
index 000000000..81c34df89
--- /dev/null
+++ b/Test/Source/Tests/Test_3x3mulVM.h
@@ -0,0 +1,22 @@
+//
+//  Test_3x3mulVM.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_3x3mulVM_h
+#define BulletTest_Test_3x3mulVM_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+
+int Test_3x3mulVM(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+    
+#endif
diff --git a/Test/Source/Tests/Test_3x3setRot.cpp b/Test/Source/Tests/Test_3x3setRot.cpp
new file mode 100644
index 000000000..49f177521
--- /dev/null
+++ b/Test/Source/Tests/Test_3x3setRot.cpp
@@ -0,0 +1,171 @@
+//
+//  Test_3x3setRot.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+#include "Test_3x3setRot.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btMatrix3x3.h>
+
+#define LOOPCOUNT 1000
+#define ARRAY_SIZE 128
+
+static inline btSimdFloat4 rand_f4(void)
+{
+    return btAssign128( RANDF_01, RANDF_01, RANDF_01, BT_NAN );      // w channel NaN
+}
+
+static inline btSimdFloat4 qtrand_f4(void)
+{
+    return btAssign128( RANDF_01, RANDF_01, RANDF_01, RANDF_01 );
+}
+
+static btMatrix3x3 M3x3setRot_ref( btMatrix3x3 &m, const btQuaternion &q )
+{
+    btScalar d = q.length2();
+    btScalar s = btScalar(2.0) / d;
+
+    btScalar xs = q.x() * s,   ys = q.y() * s,   zs = q.z() * s;
+    
+    btScalar wx = q.w() * xs,  wy = q.w() * ys,  wz = q.w() * zs;
+    btScalar xx = q.x() * xs,  xy = q.x() * ys,  xz = q.x() * zs;
+    btScalar yy = q.y() * ys,  yz = q.y() * zs,  zz = q.z() * zs;
+    m.setValue(
+        btScalar(1.0) - (yy + zz), xy - wz, xz + wy,
+        xy + wz, btScalar(1.0) - (xx + zz), yz - wx,
+        xz - wy, yz + wx, btScalar(1.0) - (xx + yy));
+
+    return m;
+}
+
+
+static int operator!= ( const btMatrix3x3 &a, const btMatrix3x3 &b )
+{
+    int i; 
+    btVector3 av3, bv3;
+
+    for(i=0; i<3; i++)
+    {
+        av3 = a.getRow(i);
+        bv3 = b.getRow(i);
+        
+        if( fabs(av3.m_floats[0] - bv3.m_floats[0]) + 
+            fabs(av3.m_floats[1] - bv3.m_floats[1]) +
+            fabs(av3.m_floats[2] - bv3.m_floats[2]) > FLT_EPSILON * 4)
+            return 1;
+    }
+    
+    return 0;
+}
+
+int Test_3x3setRot(void)
+{
+    // Init an array flanked by guard pages
+    btMatrix3x3     in1[ARRAY_SIZE];
+    btQuaternion    in2[ARRAY_SIZE];
+    btMatrix3x3     in3[ARRAY_SIZE];
+    btMatrix3x3     out[ARRAY_SIZE];
+    btMatrix3x3     out2[ARRAY_SIZE];
+    
+    // Init the data
+    size_t i, j;
+    for( i = 0; i < ARRAY_SIZE; i++ )
+    {
+        in1[i] = btMatrix3x3(rand_f4(), rand_f4(), rand_f4() );   
+        in2[i] = btQuaternion(qtrand_f4());   
+        in3[i] = in1[i];
+        
+        out[i] = M3x3setRot_ref(in1[i], in2[i]);
+        in3[i].setRotation(in2[i]);
+        out2[i] = in3[i];
+
+        if( out[i] != out2[i] )
+        {
+            vlog( "Error - M3x3setRot result error! ");
+            vlog( "failure @ %ld\n", i);
+            btVector3 m0, m1, m2;
+            m0 = out[i].getRow(0);
+            m1 = out[i].getRow(1);
+            m2 = out[i].getRow(2);
+            
+            vlog(   "\ncorrect = (%10.7f, %10.7f, %10.7f, %10.7f) "
+					"\n          (%10.7f, %10.7f, %10.7f, %10.7f) "
+                    "\n          (%10.7f, %10.7f, %10.7f, %10.7f) \n",
+                    m0.m_floats[0], m0.m_floats[1], m0.m_floats[2], m0.m_floats[3], 
+                    m1.m_floats[0], m1.m_floats[1], m1.m_floats[2], m1.m_floats[3],
+                    m2.m_floats[0], m2.m_floats[1], m2.m_floats[2], m2.m_floats[3]); 
+
+            m0 = out2[i].getRow(0);
+            m1 = out2[i].getRow(1);
+            m2 = out2[i].getRow(2);
+					
+            vlog(   "\ntested  = (%10.7f, %10.7f, %10.7f, %10.7f) "
+					"\n          (%10.7f, %10.7f, %10.7f, %10.7f) " 
+					"\n          (%10.7f, %10.7f, %10.7f, %10.7f) \n", 
+					m0.m_floats[0], m0.m_floats[1], m0.m_floats[2], m0.m_floats[3], 
+                    m1.m_floats[0], m1.m_floats[1], m1.m_floats[2], m1.m_floats[3],
+                    m2.m_floats[0], m2.m_floats[1], m2.m_floats[2], m2.m_floats[3]); 
+
+            return -1;
+        }
+    }
+    
+    uint64_t scalarTime, vectorTime;
+    uint64_t startTime, bestTime, currentTime;
+    bestTime = -1LL;
+    scalarTime = 0;
+    for (j = 0; j < LOOPCOUNT; j++) 
+    {
+        startTime = ReadTicks();
+        for( i = 0; i < ARRAY_SIZE; i++ )
+            out[i] = M3x3setRot_ref(in1[i], in2[i]);
+        currentTime = ReadTicks() - startTime;
+        scalarTime += currentTime;
+        if( currentTime < bestTime )
+            bestTime = currentTime;
+    }
+    if( 0 == gReportAverageTimes )
+        scalarTime = bestTime;        
+    else
+        scalarTime /= LOOPCOUNT;
+    
+    bestTime = -1LL;
+    vectorTime = 0;
+    for (j = 0; j < LOOPCOUNT; j++) 
+    {
+        startTime = ReadTicks();
+        for( i = 0; i < ARRAY_SIZE; i++ )
+        {
+            in3[i].setRotation(in2[i]);
+            out2[i] = in3[i];
+        }
+        currentTime = ReadTicks() - startTime;
+        vectorTime += currentTime;
+        if( currentTime < bestTime )
+            bestTime = currentTime;
+    }
+    if( 0 == gReportAverageTimes )
+        vectorTime = bestTime;        
+    else
+        vectorTime /= LOOPCOUNT;
+    
+    vlog( "Timing:\n" );
+    vlog( "\t    scalar\t    vector\n" );
+    vlog( "\t%10.2f\t%10.2f\n", TicksToCycles( scalarTime ) / ARRAY_SIZE, TicksToCycles( vectorTime ) / ARRAY_SIZE );
+    
+    return 0;
+}
+#endif //BT_USE_SSE
+
diff --git a/Test/Source/Tests/Test_3x3setRot.h b/Test/Source/Tests/Test_3x3setRot.h
new file mode 100644
index 000000000..aac0ebf9a
--- /dev/null
+++ b/Test/Source/Tests/Test_3x3setRot.h
@@ -0,0 +1,22 @@
+//
+//  Test_3x3setRot.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_3x3setRot_h
+#define BulletTest_Test_3x3setRot_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+
+int Test_3x3setRot(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+    
+#endif
diff --git a/Test/Source/Tests/Test_3x3timesTranspose.cpp b/Test/Source/Tests/Test_3x3timesTranspose.cpp
new file mode 100644
index 000000000..70e12e4ad
--- /dev/null
+++ b/Test/Source/Tests/Test_3x3timesTranspose.cpp
@@ -0,0 +1,117 @@
+//
+//  Test_3x3timesTranspose.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+
+#include "Test_3x3timesTranspose.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btMatrix3x3.h>
+
+#define LOOPCOUNT 1000
+#define ARRAY_SIZE 128
+
+static inline btSimdFloat4 rand_f4(void)
+{
+    return btAssign128( RANDF, RANDF, RANDF, BT_NAN );      // w channel NaN
+}
+
+static btMatrix3x3 timesTranspose( const btMatrix3x3 &in, const btMatrix3x3 &m )
+{
+    btVector3 m_el[3] = { in[0], in[1], in[2] };
+	return btMatrix3x3(
+                       m_el[0].dot(m[0]), m_el[0].dot(m[1]), m_el[0].dot(m[2]),
+                       m_el[1].dot(m[0]), m_el[1].dot(m[1]), m_el[1].dot(m[2]),
+                       m_el[2].dot(m[0]), m_el[2].dot(m[1]), m_el[2].dot(m[2]));
+}
+
+static int operator!= ( const btMatrix3x3 &a, const btMatrix3x3 &b )
+{
+    if( a.getRow(0) != b.getRow(0) )
+        return 1;
+    if( a.getRow(1) != b.getRow(1) )
+        return 1;
+    if( a.getRow(2) != b.getRow(2) )
+        return 1;
+    return 0;
+}
+
+int Test_3x3timesTranspose(void)
+{
+    // Init an array flanked by guard pages
+    btMatrix3x3 in1[ARRAY_SIZE];
+    btMatrix3x3 in2[ARRAY_SIZE];
+    btMatrix3x3 out[ARRAY_SIZE];
+    btMatrix3x3 out2[ARRAY_SIZE];
+    
+    // Init the data
+    size_t i, j;
+    for( i = 0; i < ARRAY_SIZE; i++ )
+    {
+        in1[i] = btMatrix3x3(rand_f4(), rand_f4(), rand_f4() );   
+        in2[i] = btMatrix3x3(rand_f4(), rand_f4(), rand_f4() );   
+        
+        out[i] = timesTranspose(in1[i], in2[i]);
+        out2[i] = in1[i].timesTranspose(in2[i]);
+        
+        if( out[i] != out2[i] )
+        {
+            printf( "failure @ %ld\n", i);
+            return -1;
+        }
+    }
+    
+    uint64_t scalarTime, vectorTime;
+    uint64_t startTime, bestTime, currentTime;
+    bestTime = -1LL;
+    scalarTime = 0;
+    for (j = 0; j < LOOPCOUNT; j++) {
+        startTime = ReadTicks();
+        for( i = 0; i < ARRAY_SIZE; i++ )
+            out[i] = timesTranspose(in1[i], in2[i]);
+        currentTime = ReadTicks() - startTime;
+        scalarTime += currentTime;
+        if( currentTime < bestTime )
+            bestTime = currentTime;
+    }
+    if( 0 == gReportAverageTimes )
+        scalarTime = bestTime;        
+    else
+        scalarTime /= LOOPCOUNT;
+    
+    bestTime = -1LL;
+    vectorTime = 0;
+    for (j = 0; j < LOOPCOUNT; j++) {
+        startTime = ReadTicks();
+        for( i = 0; i < ARRAY_SIZE; i++ )
+            out[i] = in1[i].timesTranspose(in2[i]);
+        currentTime = ReadTicks() - startTime;
+        vectorTime += currentTime;
+        if( currentTime < bestTime )
+            bestTime = currentTime;
+    }
+    if( 0 == gReportAverageTimes )
+        vectorTime = bestTime;        
+    else
+        vectorTime /= LOOPCOUNT;
+    
+    vlog( "Timing:\n" );
+    vlog( "\t    scalar\t    vector\n" );
+    vlog( "\t%10.2f\t%10.2f\n", TicksToCycles( scalarTime ) / ARRAY_SIZE, TicksToCycles( vectorTime ) / ARRAY_SIZE );
+    
+    return 0;
+}
+
+#endif //BT_USE_SSE
diff --git a/Test/Source/Tests/Test_3x3timesTranspose.h b/Test/Source/Tests/Test_3x3timesTranspose.h
new file mode 100644
index 000000000..a1c396bd2
--- /dev/null
+++ b/Test/Source/Tests/Test_3x3timesTranspose.h
@@ -0,0 +1,22 @@
+//
+//  Test_3x3timesTranspose.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_3x3timesTranspose_h
+#define BulletTest_Test_3x3timesTranspose_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+
+int Test_3x3timesTranspose(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+    
+#endif
diff --git a/Test/Source/Tests/Test_3x3transpose.cpp b/Test/Source/Tests/Test_3x3transpose.cpp
new file mode 100644
index 000000000..c85333ff2
--- /dev/null
+++ b/Test/Source/Tests/Test_3x3transpose.cpp
@@ -0,0 +1,116 @@
+//
+//  Test_3x3transpose.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+
+#include "Test_3x3transpose.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btMatrix3x3.h>
+
+#define LOOPCOUNT 1000
+#define ARRAY_SIZE 1024 
+
+static inline btSimdFloat4 rand_f4(void)
+{
+    return btAssign128( RANDF, RANDF, RANDF, BT_NAN );      // w channel NaN
+}
+
+static btMatrix3x3 Transpose( btMatrix3x3 &in )
+{
+    btVector3 row0 = in.getRow(0);
+    btVector3 row1 = in.getRow(1);
+    btVector3 row2 = in.getRow(2);
+	btVector3 col0 = btAssign128(row0.x(), row1.x(), row2.x(), 0 );
+	btVector3 col1 = btAssign128(row0.y(), row1.y(), row2.y(), 0 );
+	btVector3 col2 = btAssign128(row0.z(), row1.z(), row2.z(), 0);
+	return btMatrix3x3( col0, col1, col2);
+}
+
+static int operator!= ( const btMatrix3x3 &a, const btMatrix3x3 &b )
+{
+    if( a.getRow(0) != b.getRow(0) )
+        return 1;
+    if( a.getRow(1) != b.getRow(1) )
+        return 1;
+    if( a.getRow(2) != b.getRow(2) )
+        return 1;
+    return 0;
+}
+
+int Test_3x3transpose(void)
+{
+    // Init an array flanked by guard pages
+    btMatrix3x3 in[ARRAY_SIZE];
+    btMatrix3x3 out[ARRAY_SIZE];
+    btMatrix3x3 out2[ARRAY_SIZE];
+    
+    // Init the data
+    size_t i, j;
+    for( i = 0; i < ARRAY_SIZE; i++ )
+    {
+        in[i] = btMatrix3x3(rand_f4(), rand_f4(), rand_f4() );   
+     
+        out[i] = Transpose(in[i]);
+        out2[i] = in[i].transpose();
+        
+        if( out[i] != out2[i] )
+        {
+            printf( "failure @ %ld\n", i);
+            return -1;
+        }
+    }
+
+    uint64_t scalarTime, vectorTime;
+    uint64_t startTime, bestTime, currentTime;
+    bestTime = -1LL;
+    scalarTime = 0;
+    for (j = 0; j < LOOPCOUNT; j++) {
+        startTime = ReadTicks();
+        for( i = 0; i < ARRAY_SIZE; i++ )
+            out[i] = Transpose(in[i]);
+        currentTime = ReadTicks() - startTime;
+        scalarTime += currentTime;
+        if( currentTime < bestTime )
+            bestTime = currentTime;
+    }
+    if( 0 == gReportAverageTimes )
+        scalarTime = bestTime;        
+    else
+        scalarTime /= LOOPCOUNT;
+
+    bestTime = -1LL;
+    vectorTime = 0;
+    for (j = 0; j < LOOPCOUNT; j++) {
+        startTime = ReadTicks();
+        for( i = 0; i < ARRAY_SIZE; i++ )
+            out[i] = in[i].transpose();
+        currentTime = ReadTicks() - startTime;
+        vectorTime += currentTime;
+        if( currentTime < bestTime )
+            bestTime = currentTime;
+    }
+    if( 0 == gReportAverageTimes )
+        vectorTime = bestTime;        
+    else
+        vectorTime /= LOOPCOUNT;
+        
+    vlog( "Timing:\n" );
+    vlog( "\t    scalar\t    vector\n" );
+    vlog( "\t%10.2f\t%10.2f\n", TicksToCycles( scalarTime ) / ARRAY_SIZE, TicksToCycles( vectorTime ) / ARRAY_SIZE );
+        
+    return 0;
+}
+#endif //BT_USE_SSE
+
diff --git a/Test/Source/Tests/Test_3x3transpose.h b/Test/Source/Tests/Test_3x3transpose.h
new file mode 100644
index 000000000..3c4b834c2
--- /dev/null
+++ b/Test/Source/Tests/Test_3x3transpose.h
@@ -0,0 +1,22 @@
+//
+//  Test_3x3transpose.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_3x3transpose_h
+#define BulletTest_Test_3x3transpose_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+    
+    int Test_3x3transpose(void);
+    
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif
diff --git a/Test/Source/Tests/Test_3x3transposeTimes.cpp b/Test/Source/Tests/Test_3x3transposeTimes.cpp
new file mode 100644
index 000000000..5f14caf76
--- /dev/null
+++ b/Test/Source/Tests/Test_3x3transposeTimes.cpp
@@ -0,0 +1,168 @@
+//
+//  Test_3x3transposeTimes.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+
+#include "Test_3x3transposeTimes.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btMatrix3x3.h>
+
+#define LOOPCOUNT 1000
+#define ARRAY_SIZE 128
+
+static inline btSimdFloat4 rand_f4(void)
+{
+    return btAssign128( RANDF_01, RANDF_01, RANDF_01, BT_NAN );      // w channel NaN
+}
+
+static btMatrix3x3 TransposeTimesReference( const btMatrix3x3 &in, const btMatrix3x3 &m )
+{
+    btVector3 m_el[3] = { in[0], in[1], in[2] };
+    btSimdFloat4 r0 = btAssign128(m_el[0].x() * m[0].x() + m_el[1].x() * m[1].x() + m_el[2].x() * m[2].x(),
+                            m_el[0].x() * m[0].y() + m_el[1].x() * m[1].y() + m_el[2].x() * m[2].y(),
+                            m_el[0].x() * m[0].z() + m_el[1].x() * m[1].z() + m_el[2].x() * m[2].z(),
+                            0.0f );
+    btSimdFloat4 r1 = btAssign128(   m_el[0].y() * m[0].x() + m_el[1].y() * m[1].x() + m_el[2].y() * m[2].x(),
+                            m_el[0].y() * m[0].y() + m_el[1].y() * m[1].y() + m_el[2].y() * m[2].y(),
+                            m_el[0].y() * m[0].z() + m_el[1].y() * m[1].z() + m_el[2].y() * m[2].z(),
+                            0.0f );
+    btSimdFloat4 r2 = btAssign128(   m_el[0].z() * m[0].x() + m_el[1].z() * m[1].x() + m_el[2].z() * m[2].x(),
+                            m_el[0].z() * m[0].y() + m_el[1].z() * m[1].y() + m_el[2].z() * m[2].y(),
+                            m_el[0].z() * m[0].z() + m_el[1].z() * m[1].z() + m_el[2].z() * m[2].z(),
+                            0.0f );
+    return btMatrix3x3( r0, r1, r2 );
+}
+
+static int operator!= ( const btMatrix3x3 &a, const btMatrix3x3 &b )
+{
+    if( a.getRow(0) != b.getRow(0) )
+        return 1;
+    if( a.getRow(1) != b.getRow(1) )
+        return 1;
+    if( a.getRow(2) != b.getRow(2) )
+        return 1;
+    return 0;
+}
+
+int Test_3x3transposeTimes(void)
+{
+    // Init an array flanked by guard pages
+    btMatrix3x3 in1[ARRAY_SIZE];
+    btMatrix3x3 in2[ARRAY_SIZE];
+    btMatrix3x3 out[ARRAY_SIZE];
+    btMatrix3x3 out2[ARRAY_SIZE];
+    
+	float maxRelativeError = 0.f;
+    // Init the data
+    size_t i, j;
+    for( i = 0; i < ARRAY_SIZE; i++ )
+    {
+        in1[i] = btMatrix3x3(rand_f4(), rand_f4(), rand_f4() );   
+        in2[i] = btMatrix3x3(rand_f4(), rand_f4(), rand_f4() );   
+        
+        out[i] = TransposeTimesReference(in1[i], in2[i]);
+        out2[i] = in1[i].transposeTimes(in2[i]);
+        
+        if( out[i] != out2[i] )
+        {
+
+			float relativeError = 0.f;
+
+			for (int column=0;column<3;column++)
+			for (int row=0;row<3;row++)
+				relativeError = btMax(relativeError,btFabs(out2[i][row][column] - out[i][row][column]) / out[i][row][column]);
+
+			if (relativeError>1e-6)
+			{
+				vlog( "failure @ %ld\n", i);
+				btVector3 m0, m1, m2;
+				m0 = out[i].getRow(0);
+				m1 = out[i].getRow(1);
+				m2 = out[i].getRow(2);
+            
+				vlog(   "\ncorrect = (%10.4f, %10.4f, %10.4f, %10.4f) "
+						"\n          (%10.4f, %10.4f, %10.4f, %10.4f) "
+						"\n          (%10.4f, %10.4f, %10.4f, %10.4f) \n",
+						m0.m_floats[0], m0.m_floats[1], m0.m_floats[2], m0.m_floats[3], 
+						m1.m_floats[0], m1.m_floats[1], m1.m_floats[2], m1.m_floats[3],
+						m2.m_floats[0], m2.m_floats[1], m2.m_floats[2], m2.m_floats[3]); 
+
+				m0 = out2[i].getRow(0);
+				m1 = out2[i].getRow(1);
+				m2 = out2[i].getRow(2);
+					
+				vlog(   "\ntested  = (%10.4f, %10.4f, %10.4f, %10.4f) "
+						"\n          (%10.4f, %10.4f, %10.4f, %10.4f) " 
+						"\n          (%10.4f, %10.4f, %10.4f, %10.4f) \n", 
+						m0.m_floats[0], m0.m_floats[1], m0.m_floats[2], m0.m_floats[3], 
+						m1.m_floats[0], m1.m_floats[1], m1.m_floats[2], m1.m_floats[3],
+						m2.m_floats[0], m2.m_floats[1], m2.m_floats[2], m2.m_floats[3]); 
+
+				return -1;
+			} else
+			{
+				if (relativeError>maxRelativeError)
+					maxRelativeError = relativeError;
+			}
+        }
+    }
+    
+	if (maxRelativeError)
+	{
+		printf("Warning: maxRelativeError = %e\n",maxRelativeError);
+	}
+    uint64_t scalarTime, vectorTime;
+    uint64_t startTime, bestTime, currentTime;
+    bestTime = -1LL;
+    scalarTime = 0;
+    for (j = 0; j < LOOPCOUNT; j++) {
+        startTime = ReadTicks();
+        for( i = 0; i < ARRAY_SIZE; i++ )
+            out[i] = TransposeTimesReference(in1[i], in2[i]);
+        currentTime = ReadTicks() - startTime;
+        scalarTime += currentTime;
+        if( currentTime < bestTime )
+            bestTime = currentTime;
+    }
+    if( 0 == gReportAverageTimes )
+        scalarTime = bestTime;        
+    else
+        scalarTime /= LOOPCOUNT;
+    
+    bestTime = -1LL;
+    vectorTime = 0;
+    for (j = 0; j < LOOPCOUNT; j++) {
+        startTime = ReadTicks();
+        for( i = 0; i < ARRAY_SIZE; i++ )
+            out[i] = in1[i].transposeTimes(in2[i]);
+        currentTime = ReadTicks() - startTime;
+        vectorTime += currentTime;
+        if( currentTime < bestTime )
+            bestTime = currentTime;
+    }
+    if( 0 == gReportAverageTimes )
+        vectorTime = bestTime;        
+    else
+        vectorTime /= LOOPCOUNT;
+    
+    vlog( "Timing:\n" );
+    vlog( "\t    scalar\t    vector\n" );
+    vlog( "\t%10.2f\t%10.2f\n", TicksToCycles( scalarTime ) / ARRAY_SIZE, TicksToCycles( vectorTime ) / ARRAY_SIZE );
+    
+    return 0;
+}
+
+#endif //BT_USE_SSE
+
diff --git a/Test/Source/Tests/Test_3x3transposeTimes.h b/Test/Source/Tests/Test_3x3transposeTimes.h
new file mode 100644
index 000000000..08af2e1e0
--- /dev/null
+++ b/Test/Source/Tests/Test_3x3transposeTimes.h
@@ -0,0 +1,22 @@
+//
+//  Test_3x3transposeTimes.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_3x3transposeTimes_h
+#define BulletTest_Test_3x3transposeTimes_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+
+int Test_3x3transposeTimes(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+    
+#endif
diff --git a/Test/Source/Tests/Test_btDbvt.cpp b/Test/Source/Tests/Test_btDbvt.cpp
new file mode 100644
index 000000000..96ce52508
--- /dev/null
+++ b/Test/Source/Tests/Test_btDbvt.cpp
@@ -0,0 +1,495 @@
+//
+//  Test_btDbvt.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc., Inc.
+//
+
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+#include "Test_btDbvt.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <BulletCollision/BroadphaseCollision/btDbvt.h>
+
+// reference code for testing purposes
+SIMD_FORCE_INLINE bool Intersect_ref( btDbvtAabbMm& a,  btDbvtAabbMm& b)
+{
+    return(	(a.tMins().x()<=b.tMaxs().x())&&
+           (a.tMaxs().x()>=b.tMins().x())&&
+           (a.tMins().y()<=b.tMaxs().y())&&
+           (a.tMaxs().y()>=b.tMins().y())&&
+           (a.tMins().z()<=b.tMaxs().z())&&		
+           (a.tMaxs().z()>=b.tMins().z()));
+    
+   }
+
+
+SIMD_FORCE_INLINE btScalar	Proximity_ref(	 btDbvtAabbMm& a,
+								   btDbvtAabbMm& b)
+{
+	const btVector3	d=(a.tMins()+a.tMaxs())-(b.tMins()+b.tMaxs());
+	return(btFabs(d.x())+btFabs(d.y())+btFabs(d.z()));
+}
+
+
+
+SIMD_FORCE_INLINE int			Select_ref(	 btDbvtAabbMm& o,
+							    btDbvtAabbMm& a,
+							    btDbvtAabbMm& b)
+{
+	return(Proximity_ref(o,a)<Proximity_ref(o,b)?0:1);
+}
+
+
+SIMD_FORCE_INLINE void		Merge_ref(	 btDbvtAabbMm& a,
+							   btDbvtAabbMm& b,
+							  btDbvtAabbMm& r)
+{
+    //
+    //Changing '3' into '4' to compare with the vector code which changes all 4 floats.
+    //Erwin: don't do this because the 4th component is ignore and not computed on non-vector code (there is no NEON version and scalar is just 3 components)
+    //
+	for(int i=0;i<3;++i)
+	{
+		if(a.tMins().m_floats[i]<b.tMins().m_floats[i]) 
+            r.tMins().m_floats[i] = a.tMins().m_floats[i]; 
+        else 
+            r.tMins().m_floats[i] = b.tMins().m_floats[i];
+		
+        
+        if(a.tMaxs().m_floats[i]>b.tMaxs().m_floats[i]) 
+            r.tMaxs().m_floats[i]=a.tMaxs().m_floats[i]; 
+        else 
+            r.tMaxs().m_floats[i]=b.tMaxs().m_floats[i];
+	}
+}
+/*
+[0]	float32_t	0.0318338
+[1]	float32_t	0.0309355
+[2]	float32_t	0.93264
+[3]	float32_t	0.88788
+
+[0]	float32_t	0.59133
+[1]	float32_t	0.478779
+[2]	float32_t	0.833354
+[3]	float32_t	0.186335
+
+[0]	float32_t	0.242578
+[1]	float32_t	0.0134696
+[2]	float32_t	0.383139
+[3]	float32_t	0.414653
+
+[0]	float32_t	0.067769
+[1]	float32_t	0.993127
+[2]	float32_t	0.484308
+[3]	float32_t	0.765338
+*/
+
+#define LOOPCOUNT 1000
+#define NUM_CYCLES 10000
+#define DATA_SIZE 1024
+
+int Test_btDbvt(void)
+{
+    btDbvtAabbMm a[DATA_SIZE], b[DATA_SIZE], c[DATA_SIZE];
+    btDbvtAabbMm a_ref[DATA_SIZE], b_ref[DATA_SIZE], c_ref[DATA_SIZE];
+        
+    int i;
+        
+    bool Intersect_Test_Res[DATA_SIZE], Intersect_Ref_Res[DATA_SIZE];
+    int Select_Test_Res[DATA_SIZE], Select_Ref_Res[DATA_SIZE];
+    
+    
+    for (i = 0; i < DATA_SIZE; i++)
+    {
+        a[i].tMins().m_floats[0] = (float)rand() / (float)RAND_MAX;
+        a[i].tMins().m_floats[1] = (float)rand() / (float)RAND_MAX;
+        a[i].tMins().m_floats[2] = (float)rand() / (float)RAND_MAX;
+        a[i].tMins().m_floats[3] = (float)rand() / (float)RAND_MAX;
+        
+        a[i].tMaxs().m_floats[0] = (float)rand() / (float)RAND_MAX;
+        a[i].tMaxs().m_floats[1] = (float)rand() / (float)RAND_MAX;
+        a[i].tMaxs().m_floats[2] = (float)rand() / (float)RAND_MAX;
+        a[i].tMaxs().m_floats[3] = (float)rand() / (float)RAND_MAX;
+        
+        b[i].tMins().m_floats[0] = (float)rand() / (float)RAND_MAX;
+        b[i].tMins().m_floats[1] = (float)rand() / (float)RAND_MAX;
+        b[i].tMins().m_floats[2] = (float)rand() / (float)RAND_MAX;
+        b[i].tMins().m_floats[3] = (float)rand() / (float)RAND_MAX;
+        
+        b[i].tMaxs().m_floats[0] = (float)rand() / (float)RAND_MAX;
+        b[i].tMaxs().m_floats[1] = (float)rand() / (float)RAND_MAX;
+        b[i].tMaxs().m_floats[2] = (float)rand() / (float)RAND_MAX;
+        b[i].tMaxs().m_floats[3] = (float)rand() / (float)RAND_MAX;
+        
+        c[i].tMins().m_floats[0] = (float)rand() / (float)RAND_MAX;
+        c[i].tMins().m_floats[1] = (float)rand() / (float)RAND_MAX;
+        c[i].tMins().m_floats[2] = (float)rand() / (float)RAND_MAX;
+        c[i].tMins().m_floats[3] = (float)rand() / (float)RAND_MAX;
+        
+        c[i].tMaxs().m_floats[0] = (float)rand() / (float)RAND_MAX;
+        c[i].tMaxs().m_floats[1] = (float)rand() / (float)RAND_MAX;
+        c[i].tMaxs().m_floats[2] = (float)rand() / (float)RAND_MAX;
+        c[i].tMaxs().m_floats[3] = (float)rand() / (float)RAND_MAX;
+        
+        
+        a_ref[i].tMins().m_floats[0] = a[i].tMins().m_floats[0];
+        a_ref[i].tMins().m_floats[1] = a[i].tMins().m_floats[1];
+        a_ref[i].tMins().m_floats[2] = a[i].tMins().m_floats[2];
+        a_ref[i].tMins().m_floats[3] = a[i].tMins().m_floats[3];
+        
+        a_ref[i].tMaxs().m_floats[0] = a[i].tMaxs().m_floats[0];
+        a_ref[i].tMaxs().m_floats[1] = a[i].tMaxs().m_floats[1];
+        a_ref[i].tMaxs().m_floats[2] = a[i].tMaxs().m_floats[2];
+        a_ref[i].tMaxs().m_floats[3] = a[i].tMaxs().m_floats[3];
+        
+        b_ref[i].tMins().m_floats[0] = b[i].tMins().m_floats[0];
+        b_ref[i].tMins().m_floats[1] = b[i].tMins().m_floats[1];
+        b_ref[i].tMins().m_floats[2] = b[i].tMins().m_floats[2];
+        b_ref[i].tMins().m_floats[3] = b[i].tMins().m_floats[3];
+        
+        b_ref[i].tMaxs().m_floats[0] = b[i].tMaxs().m_floats[0];
+        b_ref[i].tMaxs().m_floats[1] = b[i].tMaxs().m_floats[1];
+        b_ref[i].tMaxs().m_floats[2] = b[i].tMaxs().m_floats[2];
+        b_ref[i].tMaxs().m_floats[3] = b[i].tMaxs().m_floats[3];
+        
+        c_ref[i].tMins().m_floats[0] = c[i].tMins().m_floats[0];
+        c_ref[i].tMins().m_floats[1] = c[i].tMins().m_floats[1];
+        c_ref[i].tMins().m_floats[2] = c[i].tMins().m_floats[2];
+        c_ref[i].tMins().m_floats[3] = c[i].tMins().m_floats[3];
+        
+        c_ref[i].tMaxs().m_floats[0] = c[i].tMaxs().m_floats[0];
+        c_ref[i].tMaxs().m_floats[1] = c[i].tMaxs().m_floats[1];
+        c_ref[i].tMaxs().m_floats[2] = c[i].tMaxs().m_floats[2];
+        c_ref[i].tMaxs().m_floats[3] = c[i].tMaxs().m_floats[3];
+        
+    }
+    
+    
+#if 1
+    for (i = 0; i < DATA_SIZE; i++)
+    {
+        
+        Intersect_Test_Res[i] = Intersect(a[i], b[i]);
+        Intersect_Ref_Res[i]  = Intersect_ref(a_ref[i], b_ref[i]);
+        
+        if(Intersect_Test_Res[i] != Intersect_Ref_Res[i])
+        {
+            printf("Diff on %d\n", i); 
+            
+            printf("a_mx_f[0] = %.3f, a_mx_f[1] = %.3f, a_mx_f[2] = %.3f, a_mx_f[3] = %.3f\n", a[i].tMaxs().m_floats[0], a[i].tMaxs().m_floats[1], a[i].tMaxs().m_floats[2], a[i].tMaxs().m_floats[3]);
+            printf("a_mi_f[0] = %.3f, a_mi_f[1] = %.3f, a_mi_f[2] = %.3f, a_mi_f[3] = %.3f\n", a[i].tMins().m_floats[0], a[i].tMins().m_floats[1], a[i].tMins().m_floats[2], a[i].tMins().m_floats[3]);
+            printf("b_mx_f[0] = %.3f, b_mx_f[1] = %.3f, b_mx_f[2] = %.3f, b_mx_f[3] = %.3f\n", b[i].tMaxs().m_floats[0], b[i].tMaxs().m_floats[1], b[i].tMaxs().m_floats[2], b[i].tMaxs().m_floats[3]);
+            printf("b_mi_f[0] = %.3f, b_mi_f[1] = %.3f, b_mi_f[2] = %.3f, b_mi_f[3] = %.3f\n", b[i].tMins().m_floats[0], b[i].tMins().m_floats[1], b[i].tMins().m_floats[2], b[i].tMins().m_floats[3]);
+            
+            printf("a_mx_f_ref[0] = %.3f, a_mx_f_ref[1] = %.3f, a_mx_f_ref[2] = %.3f, a_mx_f_ref[3] = %.3f\n", a_ref[i].tMaxs().m_floats[0], a_ref[i].tMaxs().m_floats[1], a_ref[i].tMaxs().m_floats[2], a_ref[i].tMaxs().m_floats[3]);
+            printf("a_mi_f_ref[0] = %.3f, a_mi_f_ref[1] = %.3f, a_mi_f_ref[2] = %.3f, a_mi_f_ref[3] = %.3f\n", a_ref[i].tMins().m_floats[0], a_ref[i].tMins().m_floats[1], a_ref[i].tMins().m_floats[2], a_ref[i].tMins().m_floats[3]);
+            printf("b_mx_f_ref[0] = %.3f, b_mx_f_ref[1] = %.3f, b_mx_f_ref[2] = %.3f, b_mx_f_ref[3] = %.3f\n", b_ref[i].tMaxs().m_floats[0], b_ref[i].tMaxs().m_floats[1], b_ref[i].tMaxs().m_floats[2], b_ref[i].tMaxs().m_floats[3]);
+            printf("b_mi_f_ref[0] = %.3f, b_mi_f_ref[1] = %.3f, b_mi_f_ref[2] = %.3f, b_mi_f_ref[3] = %.3f\n", b_ref[i].tMins().m_floats[0], b_ref[i].tMins().m_floats[1], b_ref[i].tMins().m_floats[2], b_ref[i].tMins().m_floats[3]);   
+        }
+    }
+#endif    
+        
+    uint64_t scalarTime;
+    uint64_t vectorTime;
+    size_t j;
+        
+    
+    ////////////////////////////////////
+    //
+    // Time and Test Intersect
+    //
+    ////////////////////////////////////
+	{
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        scalarTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+            startTime = ReadTicks();
+            
+            
+            for (i = 0; i < DATA_SIZE; i++)
+            {
+                Intersect_Ref_Res[i]  = Intersect_ref(a_ref[i], b_ref[i]);
+            }
+            
+			currentTime = ReadTicks() - startTime;
+            scalarTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            scalarTime = bestTime;        
+        else
+            scalarTime /= NUM_CYCLES;
+    }
+    
+    {
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        vectorTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+            startTime = ReadTicks();
+            
+            for (i = 0; i < DATA_SIZE; i++)
+            {
+                Intersect_Test_Res[i] = Intersect(a[i], b[i]);
+            }
+
+			currentTime = ReadTicks() - startTime;
+            vectorTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            vectorTime = bestTime;        
+        else
+            vectorTime /= NUM_CYCLES;
+    }
+    
+    vlog( "Intersect Timing:\n" );
+    vlog( "     \t    scalar\t    vector\n" );
+    vlog( "    \t%10.4f\t%10.4f\n", TicksToCycles( scalarTime ) / LOOPCOUNT, TicksToCycles( vectorTime ) / LOOPCOUNT );
+    
+    //printf("scalar = %llu, vector = %llu\n", scalarTime, vectorTime);
+    
+    for (i = 0; i < DATA_SIZE; i++)
+    {
+        if(Intersect_Test_Res[i] != Intersect_Ref_Res[i])
+        {
+            printf("Intersect fail at %d\n", i);
+			return 1;
+        }
+    }
+    
+    ////////////////////////////////////
+    //
+    // Time and Test Merge
+    //
+    ////////////////////////////////////
+	{
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        scalarTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+            startTime = ReadTicks();
+            
+            
+            for (i = 0; i < DATA_SIZE; i++)
+            {
+                Merge_ref(a_ref[i], b_ref[i], c_ref[i]);
+            }
+            
+			currentTime = ReadTicks() - startTime;
+            scalarTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            scalarTime = bestTime;        
+        else
+            scalarTime /= NUM_CYCLES;
+    }
+    
+    
+    {
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        vectorTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+            startTime = ReadTicks();
+            
+            for (i = 0; i < DATA_SIZE; i++)
+            {
+                Merge(a[i], b[i], c[i]);
+            }
+            
+			currentTime = ReadTicks() - startTime;
+            vectorTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            vectorTime = bestTime;        
+        else
+            vectorTime /= NUM_CYCLES;
+    }
+    
+    vlog( "Merge Timing:\n" );
+    vlog( "     \t    scalar\t    vector\n" );
+    vlog( "    \t%10.4f\t%10.4f\n", TicksToCycles( scalarTime ) / LOOPCOUNT, TicksToCycles( vectorTime ) / LOOPCOUNT );
+    
+    //printf("scalar = %llu, vector = %llu\n", scalarTime, vectorTime);
+    /*
+ c  [0]	float32_t	0.00455523
+    [1]	float32_t	0.559712
+    [2]	float32_t	0.0795838
+    [3]	float32_t	0.10182
+    
+c_ref
+    [0]	float32_t	0.00455523
+    [1]	float32_t	0.559712
+    [2]	float32_t	0.0795838
+    [3]	float32_t	0.552081
+    
+    
+c   [0]	float32_t	0.829904
+    [1]	float32_t	0.692891
+    [2]	float32_t	0.961654
+    [3]	float32_t	0.666956
+    
+ c_ref
+    [0]	float32_t	0.829904
+    [1]	float32_t	0.692891
+    [2]	float32_t	0.961654
+    [3]	float32_t	0.522878
+    */
+    for (i = 0; i < DATA_SIZE; i++)
+    {
+        //ignore 4th component because it is not computed in all code-paths
+        if( (fabs(c[i].tMaxs().m_floats[0] - c_ref[i].tMaxs().m_floats[0]) > 0.001) || 
+           (fabs(c[i].tMaxs().m_floats[1] - c_ref[i].tMaxs().m_floats[1]) > 0.001) || 
+           (fabs(c[i].tMaxs().m_floats[2] - c_ref[i].tMaxs().m_floats[2]) > 0.001) || 
+          // (fabs(c[i].tMaxs().m_floats[3] - c_ref[i].tMaxs().m_floats[3]) > 0.001) || 
+           (fabs(c[i].tMins().m_floats[0] - c_ref[i].tMins().m_floats[0]) > 0.001) || 
+           (fabs(c[i].tMins().m_floats[1] - c_ref[i].tMins().m_floats[1]) > 0.001) || 
+           (fabs(c[i].tMins().m_floats[2] - c_ref[i].tMins().m_floats[2]) > 0.001)  
+          //|| (fabs(c[i].tMins().m_floats[3] - c_ref[i].tMins().m_floats[3]) > 0.001) 
+           )
+           
+        
+        //if((c[i].tMaxs().m_floats[0] != c_ref[i].tMaxs().m_floats[0]) || (c[i].tMaxs().m_floats[1] != c_ref[i].tMaxs().m_floats[1]) || (c[i].tMaxs().m_floats[2] != c_ref[i].tMaxs().m_floats[2]) || (c[i].tMaxs().m_floats[3] != c_ref[i].tMaxs().m_floats[3]) || (c[i].tMins().m_floats[0] != c_ref[i].tMins().m_floats[0]) || (c[i].tMins().m_floats[1] != c_ref[i].tMins().m_floats[1]) || (c[i].tMins().m_floats[2] != c_ref[i].tMins().m_floats[2]) || (c[i].tMins().m_floats[3] != c_ref[i].tMins().m_floats[3]))
+        {
+            printf("Merge fail at %d with test = %d, ref = %d\n", i, Select_Test_Res[i], Select_Ref_Res[i]);
+            
+            printf("a_mx_f[0] = %.3f, a_mx_f[1] = %.3f, a_mx_f[2] = %.3f, a_mx_f[3] = %.3f\n", a[i].tMaxs().m_floats[0], a[i].tMaxs().m_floats[1], a[i].tMaxs().m_floats[2], a[i].tMaxs().m_floats[3]);
+            printf("a_mi_f[0] = %.3f, a_mi_f[1] = %.3f, a_mi_f[2] = %.3f, a_mi_f[3] = %.3f\n", a[i].tMins().m_floats[0], a[i].tMins().m_floats[1], a[i].tMins().m_floats[2], a[i].tMins().m_floats[3]);
+            printf("b_mx_f[0] = %.3f, b_mx_f[1] = %.3f, b_mx_f[2] = %.3f, b_mx_f[3] = %.3f\n", b[i].tMaxs().m_floats[0], b[i].tMaxs().m_floats[1], b[i].tMaxs().m_floats[2], b[i].tMaxs().m_floats[3]);
+            printf("b_mi_f[0] = %.3f, b_mi_f[1] = %.3f, b_mi_f[2] = %.3f, b_mi_f[3] = %.3f\n", b[i].tMins().m_floats[0], b[i].tMins().m_floats[1], b[i].tMins().m_floats[2], b[i].tMins().m_floats[3]);
+            printf("c_mx_f[0] = %.3f, c_mx_f[1] = %.3f, c_mx_f[2] = %.3f, c_mx_f[3] = %.3f\n", c[i].tMaxs().m_floats[0], c[i].tMaxs().m_floats[1], c[i].tMaxs().m_floats[2], c[i].tMaxs().m_floats[3]);
+            printf("c_mi_f[0] = %.3f, c_mi_f[1] = %.3f, c_mi_f[2] = %.3f, c_mi_f[3] = %.3f\n", c[i].tMins().m_floats[0], c[i].tMins().m_floats[1], c[i].tMins().m_floats[2], c[i].tMins().m_floats[3]);
+            
+            printf("a_mx_f_ref[0] = %.3f, a_mx_f_ref[1] = %.3f, a_mx_f_ref[2] = %.3f, a_mx_f_ref[3] = %.3f\n", a_ref[i].tMaxs().m_floats[0], a_ref[i].tMaxs().m_floats[1], a_ref[i].tMaxs().m_floats[2], a_ref[i].tMaxs().m_floats[3]);
+            printf("a_mi_f_ref[0] = %.3f, a_mi_f_ref[1] = %.3f, a_mi_f_ref[2] = %.3f, a_mi_f_ref[3] = %.3f\n", a_ref[i].tMins().m_floats[0], a_ref[i].tMins().m_floats[1], a_ref[i].tMins().m_floats[2], a_ref[i].tMins().m_floats[3]);
+            printf("b_mx_f_ref[0] = %.3f, b_mx_f_ref[1] = %.3f, b_mx_f_ref[2] = %.3f, b_mx_f_ref[3] = %.3f\n", b_ref[i].tMaxs().m_floats[0], b_ref[i].tMaxs().m_floats[1], b_ref[i].tMaxs().m_floats[2], b_ref[i].tMaxs().m_floats[3]);
+            printf("b_mi_f_ref[0] = %.3f, b_mi_f_ref[1] = %.3f, b_mi_f_ref[2] = %.3f, b_mi_f_ref[3] = %.3f\n", b_ref[i].tMins().m_floats[0], b_ref[i].tMins().m_floats[1], b_ref[i].tMins().m_floats[2], b_ref[i].tMins().m_floats[3]);
+            printf("c_mx_f_ref[0] = %.3f, c_mx_f_ref[1] = %.3f, c_mx_f_ref[2] = %.3f, c_mx_f_ref[3] = %.3f\n", c_ref[i].tMaxs().m_floats[0], c_ref[i].tMaxs().m_floats[1], c_ref[i].tMaxs().m_floats[2], c_ref[i].tMaxs().m_floats[3]);
+            printf("c_mi_f_ref[0] = %.3f, c_mi_f_ref[1] = %.3f, c_mi_f_ref[2] = %.3f, c_mi_f_ref[3] = %.3f\n", c_ref[i].tMins().m_floats[0], c_ref[i].tMins().m_floats[1], c_ref[i].tMins().m_floats[2], c_ref[i].tMins().m_floats[3]);
+			return 1;
+
+		}
+        
+    }
+    
+    ////////////////////////////////////
+    //
+    // Time and Test Select
+    //
+    ////////////////////////////////////
+	{
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        scalarTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+            startTime = ReadTicks();
+            
+            
+            for (i = 0; i < DATA_SIZE; i++)
+            {
+                Select_Ref_Res[i]  = Select_ref(a_ref[i], b_ref[i], c_ref[i]);
+            }
+            
+			currentTime = ReadTicks() - startTime;
+            scalarTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            scalarTime = bestTime;        
+        else
+            scalarTime /= NUM_CYCLES;
+    }
+    
+    {
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        vectorTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+            startTime = ReadTicks();
+            
+            for (i = 0; i < DATA_SIZE; i++)
+            {
+                Select_Test_Res[i] = Select(a[i], b[i], c[i]);
+            }
+            
+			currentTime = ReadTicks() - startTime;
+            vectorTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            vectorTime = bestTime;        
+        else
+            vectorTime /= NUM_CYCLES;
+    }
+    
+    vlog( "Select Timing:\n" );
+    vlog( "     \t    scalar\t    vector\n" );
+    vlog( "    \t%10.4f\t%10.4f\n", TicksToCycles( scalarTime ) / LOOPCOUNT, TicksToCycles( vectorTime ) / LOOPCOUNT );
+    
+    //printf("scalar = %llu, vector = %llu\n", scalarTime, vectorTime);
+    
+    for (i = 0; i < DATA_SIZE; i++)
+    {
+        Select_Ref_Res[i]  = Select_ref(a_ref[i], b_ref[i], c_ref[i]);
+        Select_Test_Res[i] = Select(a[i], b[i], c[i]);
+        
+        if(Select_Test_Res[i] != Select_Ref_Res[i])
+        {
+            printf("Select fail at %d with test = %d, ref = %d\n", i, Select_Test_Res[i], Select_Ref_Res[i]);
+            
+            printf("a_mx_f[0] = %.3f, a_mx_f[1] = %.3f, a_mx_f[2] = %.3f, a_mx_f[3] = %.3f\n", a[i].tMaxs().m_floats[0], a[i].tMaxs().m_floats[1], a[i].tMaxs().m_floats[2], a[i].tMaxs().m_floats[3]);
+            printf("a_mi_f[0] = %.3f, a_mi_f[1] = %.3f, a_mi_f[2] = %.3f, a_mi_f[3] = %.3f\n", a[i].tMins().m_floats[0], a[i].tMins().m_floats[1], a[i].tMins().m_floats[2], a[i].tMins().m_floats[3]);
+            printf("b_mx_f[0] = %.3f, b_mx_f[1] = %.3f, b_mx_f[2] = %.3f, b_mx_f[3] = %.3f\n", b[i].tMaxs().m_floats[0], b[i].tMaxs().m_floats[1], b[i].tMaxs().m_floats[2], b[i].tMaxs().m_floats[3]);
+            printf("b_mi_f[0] = %.3f, b_mi_f[1] = %.3f, b_mi_f[2] = %.3f, b_mi_f[3] = %.3f\n", b[i].tMins().m_floats[0], b[i].tMins().m_floats[1], b[i].tMins().m_floats[2], b[i].tMins().m_floats[3]);
+            printf("c_mx_f[0] = %.3f, c_mx_f[1] = %.3f, c_mx_f[2] = %.3f, c_mx_f[3] = %.3f\n", c[i].tMaxs().m_floats[0], c[i].tMaxs().m_floats[1], c[i].tMaxs().m_floats[2], c[i].tMaxs().m_floats[3]);
+            printf("c_mi_f[0] = %.3f, c_mi_f[1] = %.3f, c_mi_f[2] = %.3f, c_mi_f[3] = %.3f\n", c[i].tMins().m_floats[0], c[i].tMins().m_floats[1], c[i].tMins().m_floats[2], c[i].tMins().m_floats[3]);
+            
+            printf("a_mx_f_ref[0] = %.3f, a_mx_f_ref[1] = %.3f, a_mx_f_ref[2] = %.3f, a_mx_f_ref[3] = %.3f\n", a_ref[i].tMaxs().m_floats[0], a_ref[i].tMaxs().m_floats[1], a_ref[i].tMaxs().m_floats[2], a_ref[i].tMaxs().m_floats[3]);
+            printf("a_mi_f_ref[0] = %.3f, a_mi_f_ref[1] = %.3f, a_mi_f_ref[2] = %.3f, a_mi_f_ref[3] = %.3f\n", a_ref[i].tMins().m_floats[0], a_ref[i].tMins().m_floats[1], a_ref[i].tMins().m_floats[2], a_ref[i].tMins().m_floats[3]);
+            printf("b_mx_f_ref[0] = %.3f, b_mx_f_ref[1] = %.3f, b_mx_f_ref[2] = %.3f, b_mx_f_ref[3] = %.3f\n", b_ref[i].tMaxs().m_floats[0], b_ref[i].tMaxs().m_floats[1], b_ref[i].tMaxs().m_floats[2], b_ref[i].tMaxs().m_floats[3]);
+            printf("b_mi_f_ref[0] = %.3f, b_mi_f_ref[1] = %.3f, b_mi_f_ref[2] = %.3f, b_mi_f_ref[3] = %.3f\n", b_ref[i].tMins().m_floats[0], b_ref[i].tMins().m_floats[1], b_ref[i].tMins().m_floats[2], b_ref[i].tMins().m_floats[3]);
+            printf("c_mx_f_ref[0] = %.3f, c_mx_f_ref[1] = %.3f, c_mx_f_ref[2] = %.3f, c_mx_f_ref[3] = %.3f\n", c_ref[i].tMaxs().m_floats[0], c_ref[i].tMaxs().m_floats[1], c_ref[i].tMaxs().m_floats[2], c_ref[i].tMaxs().m_floats[3]);
+            printf("c_mi_f_ref[0] = %.3f, c_mi_f_ref[1] = %.3f, c_mi_f_ref[2] = %.3f, c_mi_f_ref[3] = %.3f\n", c_ref[i].tMins().m_floats[0], c_ref[i].tMins().m_floats[1], c_ref[i].tMins().m_floats[2], c_ref[i].tMins().m_floats[3]);
+			return 1;
+		}
+        
+    }
+    
+    return 0;
+}
+#endif
+
+
+
+
diff --git a/Test/Source/Tests/Test_btDbvt.h b/Test/Source/Tests/Test_btDbvt.h
new file mode 100644
index 000000000..92e309a11
--- /dev/null
+++ b/Test/Source/Tests/Test_btDbvt.h
@@ -0,0 +1,21 @@
+//
+//  Test_btDbvt.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc., Inc.
+//
+
+#ifndef BulletTest_Test_btDbvt_h
+#define BulletTest_Test_btDbvt_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+    
+    int Test_btDbvt(void);
+    
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/Test/Source/Tests/Test_dot3.cpp b/Test/Source/Tests/Test_dot3.cpp
new file mode 100644
index 000000000..ef3ff4a55
--- /dev/null
+++ b/Test/Source/Tests/Test_dot3.cpp
@@ -0,0 +1,153 @@
+//
+//  Test_v3dot.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+
+#include "Test_dot3.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btVector3.h>
+
+// reference code for testing purposes
+static btVector3 dot3_ref( const btVector3 &,  const btVector3 &,   const btVector3 &,   const btVector3 &);
+static btVector3 dot3_ref( const btVector3 &v, const btVector3 &v1, const btVector3 &v2, const btVector3 &v3)
+{
+    return btVector3( v.dot(v1), v.dot(v2), v.dot(v3));
+}
+
+/*
+SIMD_FORCE_INLINE int operator!=(const btVector3 &s, const btVector3 &v)
+{
+#ifdef __SSE__
+    __m128 test = _mm_cmpneq_ps( s.mVec128, v.mVec128 );
+    return (_mm_movemask_ps( test ) & 7) != 0;
+#elif defined __ARM_NEON_H
+    uint32x4_t test = vandq_u32( vceqq_f32( s.mVec128, v.mVec128 ), (uint32x4_t){-1,-1,-1,0});
+    uint32x2_t t = vpadd_u32( vget_low_u32(test), vget_high_u32(test));
+    t = vpadd_u32(t, t);
+    return -3 != (int32_t) vget_lane_u32(t, 0);
+#else
+    return  s.m_floats[0] != v.m_floats[0] ||
+    s.m_floats[1] != v.m_floats[1] ||
+    s.m_floats[2] != v.m_floats[2];
+#endif
+}
+*/
+ 
+
+
+#define LOOPCOUNT 1000
+#define NUM_CYCLES 10000
+
+int Test_dot3(void)
+{
+    btVector3 v, v1, v2, v3;
+    
+#define DATA_SIZE 1024
+    
+	btVector3 vec3_arr[DATA_SIZE];
+	btVector3 vec3_arr1[DATA_SIZE];
+	btVector3 vec3_arr2[DATA_SIZE];
+	btVector3 vec3_arr3[DATA_SIZE];
+    btVector3 res_arr[DATA_SIZE];
+    
+    uint64_t scalarTime;
+    uint64_t vectorTime;
+    size_t j, k;
+    btVector3 correct, test;
+    
+	for( k = 0; k < DATA_SIZE; k++ )
+	{
+        
+        vec3_arr[k]  = btVector3( btAssign128( RANDF, RANDF, RANDF, BT_NAN));
+        vec3_arr1[k] = btVector3( btAssign128( RANDF, RANDF, RANDF, BT_NAN));
+        vec3_arr2[k] = btVector3( btAssign128( RANDF, RANDF, RANDF, BT_NAN ));
+        vec3_arr3[k] = btVector3( btAssign128( RANDF, RANDF, RANDF, BT_NAN));
+
+		correct = dot3_ref(vec3_arr[k], vec3_arr1[k], vec3_arr2[k], vec3_arr3[k]);
+		test = vec3_arr[k].dot3( vec3_arr1[k], vec3_arr2[k], vec3_arr3[k]);
+        
+		if( correct != test )
+		{
+			vlog( "Error (%ld) - dot3 result error! *{%a, %a, %a, %a} != {%a, %a, %a, %a} \n", k,
+                   correct.x(), correct.y(), correct.z(), correct.w(),
+                   test.x(), test.y(), test.z(), test.w() );
+            
+			return 1;
+		}
+    }
+    
+    
+	{
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        scalarTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+            startTime = ReadTicks();
+            for( k = 0; k+4 <= LOOPCOUNT; k+=4 )
+			{
+				size_t k32 = (k & (DATA_SIZE-1)); 
+                res_arr[k32] = dot3_ref( vec3_arr[k32], vec3_arr1[k32], vec3_arr2[k32], vec3_arr3[k32]); k32++;
+				res_arr[k32] = dot3_ref( vec3_arr[k32], vec3_arr1[k32], vec3_arr2[k32], vec3_arr3[k32]); k32++;
+				res_arr[k32] = dot3_ref( vec3_arr[k32], vec3_arr1[k32], vec3_arr2[k32], vec3_arr3[k32]); k32++;
+				res_arr[k32] = dot3_ref( vec3_arr[k32], vec3_arr1[k32], vec3_arr2[k32], vec3_arr3[k32]); 
+			}
+			currentTime = ReadTicks() - startTime;
+            scalarTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            scalarTime = bestTime;        
+        else
+            scalarTime /= NUM_CYCLES;
+    }
+    
+    {
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        vectorTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+            startTime = ReadTicks();
+            for( k = 0; k+4 <= LOOPCOUNT; k+=4 )
+			{
+				size_t k32 = (k & (DATA_SIZE-1)); 
+                res_arr[k32] = vec3_arr[k32].dot3( vec3_arr1[k32], vec3_arr2[k32], vec3_arr3[k32]); k32++;
+				res_arr[k32] = vec3_arr[k32].dot3( vec3_arr1[k32], vec3_arr2[k32], vec3_arr3[k32]); k32++;
+				res_arr[k32] = vec3_arr[k32].dot3( vec3_arr1[k32], vec3_arr2[k32], vec3_arr3[k32]); k32++;
+				res_arr[k32] = vec3_arr[k32].dot3( vec3_arr1[k32], vec3_arr2[k32], vec3_arr3[k32]); 
+			}
+			currentTime = ReadTicks() - startTime;
+            vectorTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            vectorTime = bestTime;        
+        else
+            vectorTime /= NUM_CYCLES;
+    }
+    
+    vlog( "Timing:\n" );
+    vlog( "     \t    scalar\t    vector\n" );
+    vlog( "    \t%10.4f\t%10.4f\n", TicksToCycles( scalarTime ) / LOOPCOUNT, TicksToCycles( vectorTime ) / LOOPCOUNT );
+    
+    return 0;
+}
+
+#endif //BT_USE_SSE
diff --git a/Test/Source/Tests/Test_dot3.h b/Test/Source/Tests/Test_dot3.h
new file mode 100644
index 000000000..cb525f88a
--- /dev/null
+++ b/Test/Source/Tests/Test_dot3.h
@@ -0,0 +1,22 @@
+//
+//  Test_mindot.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_dot3_h
+#define BulletTest_Test_dot3_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+    
+    int Test_dot3(void);
+    
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif
diff --git a/Test/Source/Tests/Test_maxdot.cpp b/Test/Source/Tests/Test_maxdot.cpp
new file mode 100644
index 000000000..acc0caeb4
--- /dev/null
+++ b/Test/Source/Tests/Test_maxdot.cpp
@@ -0,0 +1,281 @@
+//
+//  Test_maxdot.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+
+#include "Test_maxdot.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btVector3.h>
+
+
+// reference code for testing purposes
+static long maxdot_ref(    const btSimdFloat4 *vertices, 
+                float *vec,
+                size_t count, 
+                float *dotResult );
+
+
+
+
+
+#ifdef __arm__
+    #define MAX_LOG2_SIZE   9
+#else
+    #define MAX_LOG2_SIZE   10
+#endif
+#define MAX_SIZE        (1U << MAX_LOG2_SIZE)
+#define LOOPCOUNT 10
+
+int Test_maxdot(void)
+{
+    // Init an array flanked by guard pages
+    btSimdFloat4 *data = (btSimdFloat4*) GuardCalloc( 1, MAX_SIZE * sizeof(btSimdFloat4), NULL );
+    float *fp = (float*) data;
+    long correct, test;
+    btVector3 localScaling( 0.1f, 0.2f, 0.3f);
+    size_t size;
+    
+    // Init the data
+    size_t i;
+    for( i = 0; i < MAX_SIZE; i++ )
+    {
+        fp[4*i] = (int32_t) RANDF_16;
+        fp[4*i+1] = (int32_t) RANDF_16;
+        fp[4*i+2] = (int32_t) RANDF_16;
+        fp[4*i+3] = BT_NAN;     // w channel NaN
+    }
+    
+    float correctDot, testDot;
+    fp = (float*) localScaling;
+	float maxRelativeError = 0.f;
+	
+    for( size = 1; size <= MAX_SIZE; size++ )
+    {
+        float *in = (float*)(data + MAX_SIZE - size);
+        size_t position;
+        
+        for( position = 0; position < size; position++ )
+        {
+            float *biggest = in + position * 4;
+            float old[4] = { biggest[0], biggest[1], biggest[2], biggest[3] };
+            biggest[0] += LARGE_FLOAT17;
+            biggest[1] += LARGE_FLOAT17;
+            biggest[2] += LARGE_FLOAT17;
+            biggest[3] += LARGE_FLOAT17;
+            
+            correctDot = BT_NAN;
+            testDot = BT_NAN;
+            correct = maxdot_ref( (btSimdFloat4*) in, (float*) &localScaling, size, &correctDot);
+            test = localScaling.maxDot( (btVector3*) in, size, testDot);
+            if( test < 0 || test >= size )
+            {
+                vlog( "Error @ %ld: index out of bounds! *%ld vs %ld \n", size, correct, test);
+                continue;
+            }
+            if( correct != test )
+			{
+                vlog( "Error @ %ld: index misreported! *%ld vs %ld  (*%f, %f)\n", size, correct, test, 
+                       fp[0] * in[4*correct] + fp[1] * in[4*correct+1]  + fp[2] * in[4*correct+2], 
+                       fp[0] * in[4*test] + fp[1] * in[4*test+1]  + fp[2] * in[4*test+2] );
+				return 1;
+			}
+            if( test != position )
+			{
+                vlog( "Biggest not found where it is supposed to be: *%ld vs %ld (*%f, %f)\n", position, test, 
+                       fp[0] * in[4*test] + fp[1] * in[4*test+1]  + fp[2] * in[4*test+2],
+                       fp[0] * in[4*position] + fp[1] * in[4*position+1]  + fp[2] * in[4*position+2]  );
+				return 1;
+			}
+
+            if( correctDot != testDot )
+			{
+				float relativeError = btFabs((testDot - correctDot) / correctDot);
+				if (relativeError>1e-6)
+				{
+                vlog( "Error @ %ld: dotpr misreported! *%f vs %f    (*%f, %f)\n", size, correctDot, testDot, 
+                       fp[0] * in[4*correct] + fp[1] * in[4*correct+1]  + fp[2] * in[4*correct+2], 
+                       fp[0] * in[4*test] + fp[1] * in[4*test+1]  + fp[2] * in[4*test+2]  );
+				return 1;
+				} else
+				{
+					if (maxRelativeError < relativeError)
+					{
+						maxRelativeError = relativeError;
+#ifdef VERBOSE_WARNING
+						sprintf(errStr,"Warning @ %ld: dotpr misreported! *%f vs %f    (*%f, %f)\n", size, correctDot, testDot, 
+						   fp[0] * in[4*correct] + fp[1] * in[4*correct+1]  + fp[2] * in[4*correct+2], 
+						   fp[0] * in[4*test] + fp[1] * in[4*test+1]  + fp[2] * in[4*test+2]);
+#endif //VERBOSE_WARNING
+					}
+				}
+			}
+            
+            memcpy( biggest, old, 16 );
+        }
+    }
+    
+	
+	if (maxRelativeError)
+	{
+		printf("Warning: relative error = %e\n", maxRelativeError);
+#ifdef VERBOSE_WARNING
+		vlog(errStr);
+#endif
+	}
+
+    uint64_t scalarTimes[33 + (MAX_LOG2_SIZE-5)];
+    uint64_t vectorTimes[33 + (MAX_LOG2_SIZE-5)];
+    size_t j, k;
+    float *in = (float*) data;
+    for( size = 1; size <= 32; size++ )
+    {
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        scalarTimes[size] = 0;
+        for (j = 0; j < 100; j++) {
+            startTime = ReadTicks();
+            for( k = 0; k < LOOPCOUNT; k++ )
+                correct += maxdot_ref( (btSimdFloat4*) in, (float*) &localScaling, size, &correctDot);
+            currentTime = ReadTicks() - startTime;
+            scalarTimes[size] += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            scalarTimes[size] = bestTime;        
+        else
+            scalarTimes[size] /= 100;
+    }
+    
+    uint64_t *timep = &scalarTimes[33];
+    for( size = 64; size <= MAX_SIZE; size *= 2 )
+    {
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        timep[0] =0;
+        for (j = 0; j < 100; j++) {
+            startTime = ReadTicks();
+            for( k = 0; k < LOOPCOUNT; k++ )
+                correct += maxdot_ref( (btSimdFloat4*) in, (float*) &localScaling, size, &correctDot);
+            currentTime = ReadTicks() - startTime;
+            timep[0] += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            timep[0] = bestTime;        
+        else
+            timep[0] /= 100;
+
+        timep++;
+    }
+
+    for( size = 1; size <= 32; size++ )
+    {
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        vectorTimes[size] = 0;
+        for (j = 0; j < 100; j++) {
+            startTime = ReadTicks();
+            for( k = 0; k < LOOPCOUNT; k++ )
+                test += localScaling.maxDot( (btVector3*) in, size, testDot);
+            currentTime = ReadTicks() - startTime;
+            vectorTimes[size] += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            vectorTimes[size] = bestTime;        
+        else
+            vectorTimes[size] /= 100;
+    }
+    
+    timep = &vectorTimes[33];
+    for( size = 64; size <= MAX_SIZE; size *= 2 )
+    {
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        timep[0] =0;
+        for (j = 0; j < 100; j++) {
+            startTime = ReadTicks();
+            for( k = 0; k < LOOPCOUNT; k++ )
+                test += localScaling.maxDot( (btVector3*) in, size, testDot);
+            currentTime = ReadTicks() - startTime;
+            timep[0] += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            timep[0] = bestTime;        
+        else
+            timep[0] /= 100;
+        
+        timep++;
+    }
+    
+    vlog( "Timing:\n" );
+    vlog( " size\t    scalar\t    vector\n" );
+    for( size = 1; size <= 32; size++ )
+        vlog( "%5lu\t%10.2f\t%10.2f\n", size, TicksToCycles( scalarTimes[size] ) / LOOPCOUNT, TicksToCycles( vectorTimes[size] ) / LOOPCOUNT );
+    size_t index = 33;
+    for( size = 64; size <= MAX_SIZE; size *= 2 )
+    {
+        vlog( "%5lu\t%10.2f\t%10.2f\n", size, TicksToCycles( scalarTimes[index] ) / LOOPCOUNT, TicksToCycles( vectorTimes[index] ) / LOOPCOUNT );
+        index++;
+    }
+    
+    // Useless check to make sure that the timing loops are not optimized away
+    if( test != correct )
+        vlog( "Error: Test != correct: *%ld vs. %ld\n", correct, test);
+    
+    GuardFree(data);
+    
+    return 0;
+}
+
+
+static long maxdot_ref(    const btSimdFloat4 *vertices, 
+                float *vec,
+                size_t count, 
+                float *dotResult )
+{
+    
+    const float *dp = (const float*) vertices;
+    float  maxDot = -BT_INFINITY;
+    long i = 0;
+    long ptIndex = -1;
+    
+    for( i = 0; i < count; i++ )
+    {
+        float dot = vec[0] * dp[0] + vec[1] * dp[1] + vec[2] * dp[2];   dp += 4;
+        
+        if( dot > maxDot )
+        {
+            maxDot = dot;
+            ptIndex = i;
+        }
+    }
+    
+    *dotResult = maxDot;
+    
+    return ptIndex;
+}
+
+#endif //BT_USE_SSE
diff --git a/Test/Source/Tests/Test_maxdot.h b/Test/Source/Tests/Test_maxdot.h
new file mode 100644
index 000000000..2e6c517f2
--- /dev/null
+++ b/Test/Source/Tests/Test_maxdot.h
@@ -0,0 +1,22 @@
+//
+//  Test_maxdot.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_maxdot_h
+#define BulletTest_Test_maxdot_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+
+int Test_maxdot(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+    
+#endif
diff --git a/Test/Source/Tests/Test_mindot.cpp b/Test/Source/Tests/Test_mindot.cpp
new file mode 100644
index 000000000..2f4425621
--- /dev/null
+++ b/Test/Source/Tests/Test_mindot.cpp
@@ -0,0 +1,269 @@
+//
+//  Test_mindot.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+
+#include "Test_mindot.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btVector3.h>
+
+
+// reference code for testing purposes
+static long mindot_ref(    const btSimdFloat4 *vertices, 
+                       float *vec,
+                       size_t count, 
+                       float *dotResult );
+
+#ifdef __arm__
+    #define MAX_LOG2_SIZE   9
+#else
+    #define MAX_LOG2_SIZE   9
+#endif
+#define MAX_SIZE        (1U << MAX_LOG2_SIZE)
+#define LOOPCOUNT 100
+
+int Test_mindot(void)
+{
+    // Init an array flanked by guard pages
+    btSimdFloat4 *data = (btSimdFloat4*) GuardCalloc( 1, MAX_SIZE * sizeof(btSimdFloat4), NULL );
+    float *fp = (float*) data;
+    long correct, test;
+    btVector3 localScaling( 0.1f, 0.2f, 0.3f);
+    size_t size;
+    
+    // Init the data
+    size_t i;
+    for( i = 0; i < MAX_SIZE; i++ )
+    {
+        fp[4*i] = (int32_t) RANDF_16;
+        fp[4*i+1] = (int32_t) RANDF_16;
+        fp[4*i+2] = (int32_t) RANDF_16;
+        fp[4*i+3] = BT_NAN;     // w channel NaN
+    }
+    
+    float correctDot, testDot;
+    fp = (float*) localScaling;
+	float maxRelativeError = 0.f;
+
+    for( size = 1; size <= MAX_SIZE; size++ )
+    {
+        float *in = (float*)(data + MAX_SIZE - size);
+        size_t position;
+        
+        for( position = 0; position < size; position++ )
+        {
+            float *biggest = in + position * 4;
+            float old[4] = { biggest[0], biggest[1], biggest[2], biggest[3] };
+            biggest[0] -= LARGE_FLOAT17;
+            biggest[1] -= LARGE_FLOAT17;
+            biggest[2] -= LARGE_FLOAT17;
+            biggest[3] -= LARGE_FLOAT17;
+            
+            correctDot = BT_NAN;
+            testDot = BT_NAN;
+            correct = mindot_ref( (btSimdFloat4*) in, (float*) &localScaling, size, &correctDot);
+            test = localScaling.minDot( (btVector3*) in, size, testDot);
+            if( test < 0 || test >= size )
+            {
+                vlog( "Error @ %ld: index out of bounds! *%ld vs %ld \n", size, correct, test);
+                continue;
+            }
+            if( correct != test )
+			{
+                vlog( "Error @ %ld: index misreported! *%ld vs %ld  (*%f, %f)\n", size, correct, test, 
+                       fp[0] * in[4*correct] + fp[1] * in[4*correct+1]  + fp[2] * in[4*correct+2], 
+                       fp[0] * in[4*test] + fp[1] * in[4*test+1]  + fp[2] * in[4*test+2] );
+				return 1;
+			}
+            if( test != position )
+			{
+                vlog( "Biggest not found where it is supposed to be: *%ld vs %ld (*%f, %f)\n", position, test, 
+                       fp[0] * in[4*test] + fp[1] * in[4*test+1]  + fp[2] * in[4*test+2],
+                       fp[0] * in[4*position] + fp[1] * in[4*position+1]  + fp[2] * in[4*position+2]  );
+				return 1;
+			}
+
+            if( correctDot != testDot )
+			{
+				float relativeError = btFabs((testDot - correctDot) / correctDot);
+				if (relativeError>1e6)
+				{
+					vlog( "Error @ %ld: dotpr misreported! *%f vs %f    (*%f, %f)\n", size, correctDot, testDot, 
+						   fp[0] * in[4*correct] + fp[1] * in[4*correct+1]  + fp[2] * in[4*correct+2], 
+						   fp[0] * in[4*test] + fp[1] * in[4*test+1]  + fp[2] * in[4*test+2]  );
+					return 1;
+				} else
+				{
+					if (maxRelativeError < relativeError)
+					{
+						maxRelativeError = relativeError;
+					}
+				}
+			}
+
+            
+            memcpy( biggest, old, 16 );
+        }
+    }
+    
+	if (maxRelativeError)
+	{
+		printf("Warning: relative error = %e\n", maxRelativeError);
+	}
+    uint64_t scalarTimes[33 + (MAX_LOG2_SIZE-5)];
+    uint64_t vectorTimes[33 + (MAX_LOG2_SIZE-5)];
+    size_t j, k;
+    float *in = (float*) data;
+    for( size = 1; size <= 32; size++ )
+    {
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        scalarTimes[size] = 0;
+        for (j = 0; j < 100; j++) {
+            startTime = ReadTicks();
+            for( k = 0; k < LOOPCOUNT; k++ )
+                correct += mindot_ref( (btSimdFloat4*) in, (float*) &localScaling, size, &correctDot);
+            currentTime = ReadTicks() - startTime;
+            scalarTimes[size] += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            scalarTimes[size] = bestTime;        
+        else
+            scalarTimes[size] /= 100;
+    }
+    
+    uint64_t *timep = &scalarTimes[33];
+    for( size = 64; size <= MAX_SIZE; size *= 2 )
+    {
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        timep[0] =0;
+        for (j = 0; j < 100; j++) {
+            startTime = ReadTicks();
+            for( k = 0; k < LOOPCOUNT; k++ )
+                correct += mindot_ref( (btSimdFloat4*) in, (float*) &localScaling, size, &correctDot);
+            currentTime = ReadTicks() - startTime;
+            timep[0] += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            timep[0] = bestTime;        
+        else
+            timep[0] /= 100;
+        
+        timep++;
+    }
+    
+    for( size = 1; size <= 32; size++ )
+    {
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        vectorTimes[size] = 0;
+        for (j = 0; j < 100; j++) {
+            startTime = ReadTicks();
+            for( k = 0; k < LOOPCOUNT; k++ )
+                test += localScaling.minDot( (btVector3*) in, size, testDot);
+            currentTime = ReadTicks() - startTime;
+            vectorTimes[size] += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            vectorTimes[size] = bestTime;        
+        else
+            vectorTimes[size] /= 100;
+    }
+    
+    timep = &vectorTimes[33];
+    for( size = 64; size <= MAX_SIZE; size *= 2 )
+    {
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        timep[0] =0;
+        for (j = 0; j < 100; j++) {
+            startTime = ReadTicks();
+            for( k = 0; k < LOOPCOUNT; k++ )
+                test += localScaling.minDot( (btVector3*) in, size, testDot);
+            currentTime = ReadTicks() - startTime;
+            timep[0] += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            timep[0] = bestTime;        
+        else
+            timep[0] /= 100;
+        
+        timep++;
+    }
+    
+    vlog( "Timing:\n" );
+    vlog( " size\t    scalar\t    vector\n" );
+    for( size = 1; size <= 32; size++ )
+        vlog( "%5lu\t%10.2f\t%10.2f\n", size, TicksToCycles( scalarTimes[size] ) / LOOPCOUNT, TicksToCycles( vectorTimes[size] ) / LOOPCOUNT );
+    size_t index = 33;
+    for( size = 64; size <= MAX_SIZE; size *= 2 )
+    {
+        vlog( "%5lu\t%10.2f\t%10.2f\n", size, TicksToCycles( scalarTimes[index] ) / LOOPCOUNT, TicksToCycles( vectorTimes[index] ) / LOOPCOUNT );
+        index++;
+    }
+    
+    // Useless check to make sure that the timing loops are not optimized away
+    if( test != correct )
+        vlog( "Error: Test != correct: *%ld vs. %ld\n", correct, test);
+    
+    GuardFree(data);
+    
+    return 0;
+}
+
+
+
+static long mindot_ref(    const btSimdFloat4 *vertices, 
+                       float *vec,
+                       size_t count, 
+                       float *dotResult )
+{
+    
+    const float *dp = (const float*) vertices;
+    float  minDot = BT_INFINITY;
+    long i = 0;
+    long ptIndex = -1;
+    
+    for( i = 0; i < count; i++ )
+    {
+        float dot = vec[0] * dp[0] + vec[1] * dp[1] + vec[2] * dp[2];   dp += 4;
+        
+        if( dot < minDot )
+        {
+            minDot = dot;
+            ptIndex = i;
+        }
+    }
+    
+    *dotResult = minDot;
+    
+    return ptIndex;
+}
+
+#endif //BT_USE_SSE
diff --git a/Test/Source/Tests/Test_mindot.h b/Test/Source/Tests/Test_mindot.h
new file mode 100644
index 000000000..4810dcd8d
--- /dev/null
+++ b/Test/Source/Tests/Test_mindot.h
@@ -0,0 +1,22 @@
+//
+//  Test_mindot.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_mindot_h
+#define BulletTest_Test_mindot_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+    
+    int Test_mindot(void);
+    
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif
diff --git a/Test/Source/Tests/Test_qtdot.cpp b/Test/Source/Tests/Test_qtdot.cpp
new file mode 100644
index 000000000..3fad74d37
--- /dev/null
+++ b/Test/Source/Tests/Test_qtdot.cpp
@@ -0,0 +1,162 @@
+//
+//  Test_qtdot.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+#include "Test_qtdot.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btQuaternion.h>
+
+#define BT_OP(a, b)     (a.dot(b))
+// reference code for testing purposes
+static inline btScalar qtdot_ref(btQuaternion& q1, btQuaternion& q2);
+
+static inline btScalar qtdot_ref(btQuaternion& q1, btQuaternion& q2)
+{
+    return 
+        q1.x() * q2.x() + 
+        q1.y() * q2.y() + 
+        q1.z() * q2.z() + 
+        q1.w() * q2.w();
+}
+
+#define LOOPCOUNT 1024
+#define NUM_CYCLES 1000
+
+int Test_qtdot(void)
+{
+    btQuaternion q1, q2;
+	float x, y, z, w, vNaN;
+    vNaN = BT_NAN;     // w channel NaN
+    
+    // Init the data
+    x = RANDF_01;
+    y = RANDF_01;
+    z = RANDF_01;
+    w = RANDF_01;
+    q1.setValue(x,y,z,w);
+	
+    x = RANDF_01;
+    y = RANDF_01;
+    z = RANDF_01;
+    w = RANDF_01;
+    q2.setValue(x,y,z,w);
+
+	btScalar correct_res, test_res;
+	 
+    {
+		correct_res = vNaN; 
+		test_res = vNaN;
+		correct_res = qtdot_ref(q1, q2);
+		test_res = BT_OP(q1,q2);
+	   
+		if( fabsf(correct_res - test_res) > FLT_EPSILON*4 )
+		{	
+			vlog( "Error - qtdot result error! "
+					"\ncorrect = %10.4f "
+					"\ntested  = %10.4f \n", 
+					correct_res, test_res);
+		
+			return 1;
+		}
+	}
+    
+#define DATA_SIZE LOOPCOUNT
+
+	btQuaternion qt_arr1[DATA_SIZE];
+	btQuaternion qt_arr2[DATA_SIZE];
+    btScalar     res_arr[DATA_SIZE];
+
+    uint64_t scalarTime;
+    uint64_t vectorTime;
+    size_t j, k;
+
+    for( k = 0; k < DATA_SIZE; k++ )
+    {
+        x = RANDF_01;
+        y = RANDF_01;
+        z = RANDF_01;
+        w = RANDF_01;
+        qt_arr1[k].setValue(x,y,z,w);
+
+        x = RANDF_01;
+        y = RANDF_01;
+        z = RANDF_01;
+        w = RANDF_01;
+        qt_arr2[k].setValue(x,y,z,w);
+    }
+
+	{
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        scalarTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+            startTime = ReadTicks();
+            for( k = 0; k+4 <= LOOPCOUNT; k+=4 )
+			{
+				size_t km = (k & (DATA_SIZE-1)); 
+                res_arr[km] = qtdot_ref(qt_arr1[km], qt_arr2[km]);km++;
+                res_arr[km] = qtdot_ref(qt_arr1[km], qt_arr2[km]);km++;
+                res_arr[km] = qtdot_ref(qt_arr1[km], qt_arr2[km]);km++;
+                res_arr[km] = qtdot_ref(qt_arr1[km], qt_arr2[km]);
+			}
+			currentTime = ReadTicks() - startTime;
+            scalarTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            scalarTime = bestTime;        
+        else
+            scalarTime /= NUM_CYCLES;
+    }
+    
+    {
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        vectorTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+            startTime = ReadTicks();
+            for( k = 0; k+4 <= LOOPCOUNT; k+=4 )
+			{
+				size_t km = (k & (DATA_SIZE-1)); 
+            	res_arr[km] = BT_OP(qt_arr1[km], qt_arr2[km]);km++;
+            	res_arr[km] = BT_OP(qt_arr1[km], qt_arr2[km]);km++;
+            	res_arr[km] = BT_OP(qt_arr1[km], qt_arr2[km]);km++;
+            	res_arr[km] = BT_OP(qt_arr1[km], qt_arr2[km]);km++;
+			}
+			currentTime = ReadTicks() - startTime;
+            vectorTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            vectorTime = bestTime;        
+        else
+            vectorTime /= NUM_CYCLES;
+    }
+
+    vlog( "Timing:\n" );
+    vlog( "     \t    scalar\t    vector\n" );
+    vlog( "    \t%10.4f\t%10.4f\n", TicksToCycles( scalarTime ) / LOOPCOUNT, 
+									TicksToCycles( vectorTime ) / LOOPCOUNT );
+
+    return 0;
+}
+#endif //BT_USE_SSE
diff --git a/Test/Source/Tests/Test_qtdot.h b/Test/Source/Tests/Test_qtdot.h
new file mode 100644
index 000000000..4917780c9
--- /dev/null
+++ b/Test/Source/Tests/Test_qtdot.h
@@ -0,0 +1,22 @@
+//
+//  Test_qtdot.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_qtdot_h
+#define BulletTest_Test_qtdot_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+
+int Test_qtdot(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+    
+#endif
diff --git a/Test/Source/Tests/Test_qtmul.cpp b/Test/Source/Tests/Test_qtmul.cpp
new file mode 100644
index 000000000..7b83a7bb4
--- /dev/null
+++ b/Test/Source/Tests/Test_qtmul.cpp
@@ -0,0 +1,183 @@
+//
+//  Test_qtmul.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+
+#include "Test_qtmul.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btQuaternion.h>
+
+#define BT_OP(a, b)     ((a) *= (b))
+// reference code for testing purposes
+static inline btQuaternion& qtmul_ref(btQuaternion& q1, btQuaternion& q2);
+
+static inline btQuaternion& qtmul_ref(btQuaternion& q1, btQuaternion& q2)
+{
+    float x,y,z,w;
+    x = q1.w() * q2.x() + q1.x() * q2.w() + q1.y() * q2.z() - q1.z() * q2.y(),
+    y = q1.w() * q2.y() + q1.y() * q2.w() + q1.z() * q2.x() - q1.x() * q2.z(),
+    z = q1.w() * q2.z() + q1.z() * q2.w() + q1.x() * q2.y() - q1.y() * q2.x(),
+    w = q1.w() * q2.w() - q1.x() * q2.x() - q1.y() * q2.y() - q1.z() * q2.z();
+
+    q1.setValue(x, y, z, w);
+	return q1;
+}
+
+#define LOOPCOUNT 1024
+#define NUM_CYCLES 1000
+
+int Test_qtmul(void)
+{
+    btQuaternion q1, q2, q3;
+	
+    float x, y, z, w, vNaN;
+    
+    // Init the data
+    x = RANDF_01;
+    y = RANDF_01;
+    z = RANDF_01;
+    w = RANDF_01;
+    vNaN = BT_NAN;     // w channel NaN
+    q1.setValue(x,y,z,w);
+	
+    x = RANDF_01;
+    y = RANDF_01;
+    z = RANDF_01;
+    w = RANDF_01;
+    q2.setValue(x,y,z,w);
+
+	q3 = q1;
+		
+    btQuaternion correct_res, test_res;
+	 
+    {
+		float vNaN = BT_NAN;
+		correct_res.setValue(vNaN, vNaN, vNaN, vNaN); 
+		test_res.setValue(vNaN, vNaN, vNaN, vNaN);
+		correct_res = qtmul_ref(q1, q2);
+		test_res = BT_OP(q3,q2);
+	   
+		if( fabsf(correct_res.x() - test_res.x()) + 
+			fabsf(correct_res.y() - test_res.y()) +
+			fabsf(correct_res.z() - test_res.z()) +
+			fabsf(correct_res.w() - test_res.w()) > FLT_EPSILON*10 )
+		{	
+			vlog( "Error - qtmul result error! "
+					"\ncorrect = (%10.4f, %10.4f, %10.4f, %10.4f) "
+					"\ntested  = (%10.4f, %10.4f, %10.4f, %10.4f) \n", 
+					correct_res.x(), correct_res.y(), 
+                    correct_res.z(), correct_res.w(),
+					test_res.x(), test_res.y(), 
+                    test_res.z(), test_res.w());
+		
+			return 1;
+		}
+	}
+    
+#define DATA_SIZE LOOPCOUNT
+
+	btQuaternion qt_arr1[DATA_SIZE];
+	btQuaternion qt_arr2[DATA_SIZE];
+
+    uint64_t scalarTime;
+    uint64_t vectorTime;
+    size_t j, k;
+
+	{
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        scalarTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+			for( k = 0; k < DATA_SIZE; k++ )
+			{
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+				w = RANDF_01;
+				qt_arr1[k].setValue(x,y,z,w);
+
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+				w = RANDF_01;
+				qt_arr2[k].setValue(x,y,z,w);
+			}
+
+            startTime = ReadTicks();
+            for( k = 0; k < LOOPCOUNT; k++ )
+			{
+	            qt_arr1[k] = qtmul_ref(qt_arr1[k], qt_arr2[k]);
+			}
+			currentTime = ReadTicks() - startTime;
+            scalarTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            scalarTime = bestTime;        
+        else
+            scalarTime /= NUM_CYCLES;
+    }
+    
+    {
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        vectorTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+			for( k = 0; k < DATA_SIZE; k++ )
+			{
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+				w = RANDF_01;
+				qt_arr1[k].setValue(x,y,z,w);
+
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+				w = RANDF_01;
+				qt_arr2[k].setValue(x,y,z,w);
+			}
+
+            startTime = ReadTicks();
+            for( k = 0; k < LOOPCOUNT; k++ )
+			{
+				qt_arr1[k] = BT_OP(qt_arr1[k], qt_arr2[k]);
+			}
+			currentTime = ReadTicks() - startTime;
+            vectorTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            vectorTime = bestTime;        
+        else
+            vectorTime /= NUM_CYCLES;
+    }
+
+    vlog( "Timing:\n" );
+    vlog( "     \t    scalar\t    vector\n" );
+    vlog( "    \t%10.4f\t%10.4f\n", TicksToCycles( scalarTime ) / LOOPCOUNT, 
+									TicksToCycles( vectorTime ) / LOOPCOUNT );
+
+    return 0;
+}
+
+#endif //BT_USE_SSE
diff --git a/Test/Source/Tests/Test_qtmul.h b/Test/Source/Tests/Test_qtmul.h
new file mode 100644
index 000000000..9109199b6
--- /dev/null
+++ b/Test/Source/Tests/Test_qtmul.h
@@ -0,0 +1,22 @@
+//
+//  Test_qtmul.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_qtmul_h
+#define BulletTest_Test_qtmul_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+
+int Test_qtmul(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+    
+#endif
diff --git a/Test/Source/Tests/Test_qtmulQV3.cpp b/Test/Source/Tests/Test_qtmulQV3.cpp
new file mode 100644
index 000000000..0a26b50d9
--- /dev/null
+++ b/Test/Source/Tests/Test_qtmulQV3.cpp
@@ -0,0 +1,162 @@
+//
+//  Test_qtmulQV3.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+#include "Test_qtmulQV3.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btQuaternion.h>
+
+#define BT_OP(a, b)     ((a) * (b))
+// reference code for testing purposes
+static inline btQuaternion qtmulQV3_ref(const btQuaternion& q, const btVector3& w);
+
+static inline btQuaternion qtmulQV3_ref(const btQuaternion& q, const btVector3& w)
+{
+	return btQuaternion( 
+         q.w() * w.x() + q.y() * w.z() - q.z() * w.y(),
+		 q.w() * w.y() + q.z() * w.x() - q.x() * w.z(),
+		 q.w() * w.z() + q.x() * w.y() - q.y() * w.x(),
+		-q.x() * w.x() - q.y() * w.y() - q.z() * w.z()); 
+}
+
+#define LOOPCOUNT 1024
+#define NUM_CYCLES 1000
+
+static inline btSimdFloat4 rand_f4(void)
+{
+    return btAssign128( RANDF_m1p1, RANDF_m1p1, RANDF_m1p1, BT_NAN );      // w channel NaN
+}
+
+static inline btSimdFloat4 qtrand_f4(void)
+{
+    return btAssign128( RANDF_m1p1, RANDF_m1p1, RANDF_m1p1, RANDF_m1p1 );
+}
+
+static inline btSimdFloat4 qtNAN_f4(void)
+{
+    return btAssign128( BT_NAN, BT_NAN, BT_NAN, BT_NAN );
+}
+
+int Test_qtmulQV3(void)
+{
+    btQuaternion q;
+	btVector3 v3;
+    
+    // Init the data
+    q = btQuaternion(qtrand_f4()); 
+    v3 = btVector3(rand_f4());
+
+    btQuaternion correct_res, test_res;
+    correct_res = btQuaternion(qtNAN_f4());
+    test_res = btQuaternion(qtNAN_f4());
+	 
+    {
+		correct_res = qtmulQV3_ref(q, v3);
+		test_res = BT_OP(q, v3);
+	   
+		if( fabsf(correct_res.x() - test_res.x()) + 
+			fabsf(correct_res.y() - test_res.y()) +
+			fabsf(correct_res.z() - test_res.z()) +
+			fabsf(correct_res.w() - test_res.w()) > FLT_EPSILON*8 )
+		{	
+			vlog( "Error - qtmulQV3 result error! "
+					"\ncorrect = (%10.4f, %10.4f, %10.4f, %10.4f) "
+					"\ntested  = (%10.4f, %10.4f, %10.4f, %10.4f) \n", 
+					correct_res.x(), correct_res.y(), 
+                    correct_res.z(), correct_res.w(),
+					test_res.x(), test_res.y(), 
+                    test_res.z(), test_res.w());
+		
+			return 1;
+		}
+	}
+    
+#define DATA_SIZE LOOPCOUNT
+
+	btQuaternion qt_arrR[DATA_SIZE];
+	btQuaternion qt_arr[DATA_SIZE];
+	btVector3 v3_arr[DATA_SIZE];
+
+    uint64_t scalarTime;
+    uint64_t vectorTime;
+    size_t j, k;
+
+	{
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        scalarTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+			for( k = 0; k < DATA_SIZE; k++ )
+			{
+                qt_arr[k] = btQuaternion(qtrand_f4()); 
+                v3_arr[k] = btVector3(rand_f4());
+			}
+
+            startTime = ReadTicks();
+            for( k = 0; k < LOOPCOUNT; k++ )
+			{
+	            qt_arrR[k] = qtmulQV3_ref(qt_arr[k], v3_arr[k]);
+			}
+			currentTime = ReadTicks() - startTime;
+            scalarTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            scalarTime = bestTime;        
+        else
+            scalarTime /= NUM_CYCLES;
+    }
+    
+    {
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        vectorTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+			for( k = 0; k < DATA_SIZE; k++ )
+			{
+                qt_arr[k] = btQuaternion(qtrand_f4()); 
+                v3_arr[k] = btVector3(rand_f4());
+			}
+
+            startTime = ReadTicks();
+            for( k = 0; k < LOOPCOUNT; k++ )
+			{
+				qt_arrR[k] = BT_OP(qt_arr[k], v3_arr[k]);
+			}
+			currentTime = ReadTicks() - startTime;
+            vectorTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            vectorTime = bestTime;        
+        else
+            vectorTime /= NUM_CYCLES;
+    }
+
+    vlog( "Timing:\n" );
+    vlog( "     \t    scalar\t    vector\n" );
+    vlog( "    \t%10.4f\t%10.4f\n", TicksToCycles( scalarTime ) / LOOPCOUNT, 
+									TicksToCycles( vectorTime ) / LOOPCOUNT );
+
+    return 0;
+}
+#endif //BT_USE_SSE
diff --git a/Test/Source/Tests/Test_qtmulQV3.h b/Test/Source/Tests/Test_qtmulQV3.h
new file mode 100644
index 000000000..f7bffce0b
--- /dev/null
+++ b/Test/Source/Tests/Test_qtmulQV3.h
@@ -0,0 +1,22 @@
+//
+//  Test_qtmulQV3.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_qtmulQV3_h
+#define BulletTest_Test_qtmulQV3_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+
+int Test_qtmulQV3(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+    
+#endif
diff --git a/Test/Source/Tests/Test_qtmulV3Q.cpp b/Test/Source/Tests/Test_qtmulV3Q.cpp
new file mode 100644
index 000000000..48f51245c
--- /dev/null
+++ b/Test/Source/Tests/Test_qtmulV3Q.cpp
@@ -0,0 +1,161 @@
+//
+//  Test_qtmulV3Q.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+#include "Test_qtmulV3Q.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btQuaternion.h>
+
+#define BT_OP(a, b)     ((a) * (b))
+// reference code for testing purposes
+static inline btQuaternion qtmulV3Q_ref(const btVector3& w, const btQuaternion& q);
+
+static inline btQuaternion qtmulV3Q_ref(const btVector3& w, const btQuaternion& q)
+{
+	return btQuaternion( 
+        +w.x() * q.w() + w.y() * q.z() - w.z() * q.y(),
+		+w.y() * q.w() + w.z() * q.x() - w.x() * q.z(),
+		+w.z() * q.w() + w.x() * q.y() - w.y() * q.x(),
+		-w.x() * q.x() - w.y() * q.y() - w.z() * q.z()); 
+}
+
+#define LOOPCOUNT 1024
+#define NUM_CYCLES 1000
+
+static inline btSimdFloat4 rand_f4(void)
+{
+    return btAssign128( RANDF_m1p1, RANDF_m1p1, RANDF_m1p1, BT_NAN );      // w channel NaN
+}
+
+static inline btSimdFloat4 qtrand_f4(void)
+{
+    return btAssign128( RANDF_m1p1, RANDF_m1p1, RANDF_m1p1, RANDF_m1p1 );
+}
+
+static inline btSimdFloat4 qtNAN_f4(void)
+{
+    return btAssign128( BT_NAN, BT_NAN, BT_NAN, BT_NAN );
+}
+
+int Test_qtmulV3Q(void)
+{
+    btQuaternion q;
+	btVector3 v3;
+    
+    // Init the data
+    q = btQuaternion(qtrand_f4()); 
+    v3 = btVector3(rand_f4());
+
+    btQuaternion correct_res, test_res;
+    correct_res = btQuaternion(qtNAN_f4());
+    test_res = btQuaternion(qtNAN_f4());
+	 
+    {
+		correct_res = qtmulV3Q_ref(v3, q);
+		test_res = BT_OP(v3, q);
+	   
+		if( fabsf(correct_res.x() - test_res.x()) + 
+			fabsf(correct_res.y() - test_res.y()) +
+			fabsf(correct_res.z() - test_res.z()) +
+			fabsf(correct_res.w() - test_res.w()) > FLT_EPSILON*8 )
+		{	
+			vlog( "Error - qtmulV3Q result error! "
+					"\ncorrect = (%10.4f, %10.4f, %10.4f, %10.4f) "
+					"\ntested  = (%10.4f, %10.4f, %10.4f, %10.4f) \n", 
+					correct_res.x(), correct_res.y(), 
+                    correct_res.z(), correct_res.w(),
+					test_res.x(), test_res.y(), 
+                    test_res.z(), test_res.w());
+		
+			return 1;
+		}
+	}
+    
+#define DATA_SIZE LOOPCOUNT
+
+	btQuaternion qt_arrR[DATA_SIZE];
+	btQuaternion qt_arr[DATA_SIZE];
+	btVector3 v3_arr[DATA_SIZE];
+
+    uint64_t scalarTime;
+    uint64_t vectorTime;
+    size_t j, k;
+
+	{
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        scalarTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+			for( k = 0; k < DATA_SIZE; k++ )
+			{
+                qt_arr[k] = btQuaternion(qtrand_f4()); 
+                v3_arr[k] = btVector3(rand_f4());
+			}
+
+            startTime = ReadTicks();
+            for( k = 0; k < LOOPCOUNT; k++ )
+			{
+	            qt_arrR[k] = qtmulV3Q_ref(v3_arr[k], qt_arr[k]);
+			}
+			currentTime = ReadTicks() - startTime;
+            scalarTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            scalarTime = bestTime;        
+        else
+            scalarTime /= NUM_CYCLES;
+    }
+    
+    {
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        vectorTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+			for( k = 0; k < DATA_SIZE; k++ )
+			{
+                qt_arr[k] = btQuaternion(qtrand_f4()); 
+                v3_arr[k] = btVector3(rand_f4());
+			}
+
+            startTime = ReadTicks();
+            for( k = 0; k < LOOPCOUNT; k++ )
+			{
+				qt_arrR[k] = BT_OP(v3_arr[k], qt_arr[k]);
+			}
+			currentTime = ReadTicks() - startTime;
+            vectorTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            vectorTime = bestTime;        
+        else
+            vectorTime /= NUM_CYCLES;
+    }
+
+    vlog( "Timing:\n" );
+    vlog( "     \t    scalar\t    vector\n" );
+    vlog( "    \t%10.4f\t%10.4f\n", TicksToCycles( scalarTime ) / LOOPCOUNT, 
+									TicksToCycles( vectorTime ) / LOOPCOUNT );
+
+    return 0;
+}
+#endif//#ifdef BT_USE_SSE
diff --git a/Test/Source/Tests/Test_qtmulV3Q.h b/Test/Source/Tests/Test_qtmulV3Q.h
new file mode 100644
index 000000000..f9714c9b6
--- /dev/null
+++ b/Test/Source/Tests/Test_qtmulV3Q.h
@@ -0,0 +1,22 @@
+//
+//  Test_qtmulV3Q.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_qtmulV3Q_h
+#define BulletTest_Test_qtmulV3Q_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+
+int Test_qtmulV3Q(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+    
+#endif
diff --git a/Test/Source/Tests/Test_qtnorm.cpp b/Test/Source/Tests/Test_qtnorm.cpp
new file mode 100644
index 000000000..3d008bffd
--- /dev/null
+++ b/Test/Source/Tests/Test_qtnorm.cpp
@@ -0,0 +1,176 @@
+//
+//  Test_qtnorm.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+
+#include "Test_qtnorm.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btQuaternion.h>
+
+#define BT_OP(a)     (a.normalize())
+// reference code for testing purposes
+static inline btQuaternion& qtnorm_ref(btQuaternion& q1);
+
+static inline btQuaternion& qtnorm_ref(btQuaternion& q1)
+{
+    float dot =
+        q1.x() * q1.x() + 
+        q1.y() * q1.y() + 
+        q1.z() * q1.z() + 
+        q1.w() * q1.w();
+
+	dot = 1.0f / sqrtf(dot);
+
+    q1.setValue(q1.x()*dot, q1.y()*dot, q1.z()*dot, q1.w()*dot);
+
+    return q1;
+}
+
+#define LOOPCOUNT 1024
+#define NUM_CYCLES 1000
+
+int Test_qtnorm(void)
+{
+    int i;
+    btQuaternion q1, q2;
+	float x, y, z, w, vNaN;
+    vNaN = BT_NAN;     // w channel NaN
+    
+	btQuaternion correct_res, test_res;
+	
+    for (i=0; i<LOOPCOUNT; i++)
+    {
+        // Init the data
+        x = RANDF_01;
+        y = RANDF_01;
+        z = RANDF_01;
+        w = RANDF_01;
+        q1.setValue(x,y,z,w);
+        
+        q2 = q1;
+
+		correct_res.setValue(vNaN, vNaN, vNaN, vNaN); 
+		test_res.setValue(vNaN, vNaN, vNaN, vNaN);
+		correct_res = qtnorm_ref(q1);
+		test_res = BT_OP(q2);
+	   
+		if( fabsf(correct_res.x() - test_res.x()) + 
+			fabsf(correct_res.y() - test_res.y()) +
+			fabsf(correct_res.z() - test_res.z()) +
+			fabsf(correct_res.w() - test_res.w()) > FLT_EPSILON*10 )
+        {	
+			vlog( "Error - qtnorm result error! "
+					"\ncorrect = (%10.7f, %10.7f, %10.7f, %10.7f) "
+					"\ntested  = (%10.7f, %10.7f, %10.7f, %10.7f) \n", 
+					correct_res.x(), correct_res.y(), 
+                    correct_res.z(), correct_res.w(),
+					test_res.x(), test_res.y(), 
+                    test_res.z(), test_res.w());
+		
+			return 1;
+		}
+	}
+    
+#define DATA_SIZE LOOPCOUNT
+
+	btQuaternion qt_arr0[DATA_SIZE];
+	btQuaternion qt_arr1[DATA_SIZE];
+
+    uint64_t scalarTime;
+    uint64_t vectorTime;
+    size_t j, k;
+
+	{
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        scalarTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+            for( k = 0; k < DATA_SIZE; k++ )
+            {
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+                w = RANDF_01;
+                qt_arr1[k].setValue(x,y,z,w);
+            }
+
+            startTime = ReadTicks();
+            for( k = 0; k+4 <= LOOPCOUNT; k+=4 )
+			{
+				size_t km = (k & (DATA_SIZE-1)); 
+                qt_arr0[km] = qtnorm_ref(qt_arr1[km]);km++;
+                qt_arr0[km] = qtnorm_ref(qt_arr1[km]);km++;
+                qt_arr0[km] = qtnorm_ref(qt_arr1[km]);km++;
+                qt_arr0[km] = qtnorm_ref(qt_arr1[km]);
+			}
+			currentTime = ReadTicks() - startTime;
+            scalarTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            scalarTime = bestTime;        
+        else
+            scalarTime /= NUM_CYCLES;
+    }
+    
+    {
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        vectorTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+            for( k = 0; k < DATA_SIZE; k++ )
+            {
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+                w = RANDF_01;
+                qt_arr1[k].setValue(x,y,z,w);
+            }
+        
+            startTime = ReadTicks();
+            for( k = 0; k+4 <= LOOPCOUNT; k+=4 )
+			{
+				size_t km = (k & (DATA_SIZE-1)); 
+            	qt_arr0[km] = BT_OP(qt_arr1[km]);km++;
+            	qt_arr0[km] = BT_OP(qt_arr1[km]);km++;
+            	qt_arr0[km] = BT_OP(qt_arr1[km]);km++;
+            	qt_arr0[km] = BT_OP(qt_arr1[km]);km++;
+			}
+			currentTime = ReadTicks() - startTime;
+            vectorTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            vectorTime = bestTime;        
+        else
+            vectorTime /= NUM_CYCLES;
+    }
+
+    vlog( "Timing:\n" );
+    vlog( "     \t    scalar\t    vector\n" );
+    vlog( "    \t%10.4f\t%10.4f\n", TicksToCycles( scalarTime ) / LOOPCOUNT, 
+									TicksToCycles( vectorTime ) / LOOPCOUNT );
+
+    return 0;
+}
+
+#endif //BT_USE_SSE
diff --git a/Test/Source/Tests/Test_qtnorm.h b/Test/Source/Tests/Test_qtnorm.h
new file mode 100644
index 000000000..5b0021709
--- /dev/null
+++ b/Test/Source/Tests/Test_qtnorm.h
@@ -0,0 +1,22 @@
+//
+//  Test_qtnorm.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_qtnorm_h
+#define BulletTest_Test_qtnorm_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+
+int Test_qtnorm(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+    
+#endif
diff --git a/Test/Source/Tests/Test_quat_aos_neon.cpp b/Test/Source/Tests/Test_quat_aos_neon.cpp
new file mode 100644
index 000000000..c13e41aeb
--- /dev/null
+++ b/Test/Source/Tests/Test_quat_aos_neon.cpp
@@ -0,0 +1,599 @@
+//
+//  Test_quat_aos_neon.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc., Inc.
+//
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+#include "Test_quat_aos_neon.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+
+#include <vectormath/vmInclude.h>
+
+
+//typedef Vectormath::Aos::Vector3    vmVector3;
+//typedef Vectormath::Aos::Quat       vmQuat;
+//typedef Vectormath::Aos::Matrix3    vmMatrix3;
+//typedef Vectormath::Aos::Transform3 vmTransform3;
+//typedef Vectormath::Aos::Point3     vmPoint3;
+
+
+typedef Vectormath::Aos::Vector4    vmVector4;
+
+// reference code for testing purposes
+ATTRIBUTE_ALIGNED16(class) Quat_ref
+{
+    float mX;
+    float mY;
+    float mZ;
+    float mW;
+    
+public:
+    // Default constructor; does no initialization
+    // 
+    inline Quat_ref( ) { };
+    
+    // Copy a quaternion
+    // 
+    inline Quat_ref( const Quat_ref & quat );
+    
+    // Construct a quaternion from x, y, z, and w elements
+    // 
+    inline Quat_ref( float x, float y, float z, float w );
+    
+    // Construct a quaternion from a 3-D vector and a scalar
+    // 
+    inline Quat_ref( const vmVector3 & xyz, float w );
+    
+    // Copy elements from a 4-D vector into a quaternion
+    // 
+    explicit inline Quat_ref( const vmVector4 & vec );
+    
+    // Convert a rotation matrix to a unit-length quaternion
+    // 
+    explicit inline Quat_ref( const vmMatrix3 & rotMat );
+    
+    // Set all elements of a quaternion to the same scalar value
+    // 
+    explicit inline Quat_ref( float scalar );
+    
+    // Assign one quaternion to another
+    // 
+    inline Quat_ref & operator =( const Quat_ref & quat );
+    
+    // Set the x, y, and z elements of a quaternion
+    // NOTE: 
+    // This function does not change the w element.
+    // 
+    inline Quat_ref & setXYZ( const vmVector3 & vec );
+    
+    // Get the x, y, and z elements of a quaternion
+    // 
+    inline const vmVector3 getXYZ( ) const;
+    
+    // Set the x element of a quaternion
+    // 
+    inline Quat_ref & setX( float x );
+    
+    // Set the y element of a quaternion
+    // 
+    inline Quat_ref & setY( float y );
+    
+    // Set the z element of a quaternion
+    // 
+    inline Quat_ref & setZ( float z );
+    
+    // Set the w element of a quaternion
+    // 
+    inline Quat_ref & setW( float w );
+    
+    // Get the x element of a quaternion
+    // 
+    inline float getX( ) const;
+    
+    // Get the y element of a quaternion
+    // 
+    inline float getY( ) const;
+    
+    // Get the z element of a quaternion
+    // 
+    inline float getZ( ) const;
+    
+    // Get the w element of a quaternion
+    // 
+    inline float getW( ) const;
+    
+    // Set an x, y, z, or w element of a quaternion by index
+    // 
+    inline Quat_ref & setElem( int idx, float value );
+    
+    // Get an x, y, z, or w element of a quaternion by index
+    // 
+    inline float getElem( int idx ) const;
+    
+    // Subscripting operator to set or get an element
+    // 
+    inline float & operator []( int idx );
+    
+    // Subscripting operator to get an element
+    // 
+    inline float operator []( int idx ) const;
+    
+    // Add two quaternions
+    // 
+    inline const Quat_ref operator +( const Quat_ref & quat ) const;
+    
+    // Subtract a quaternion from another quaternion
+    // 
+    inline const Quat_ref operator -( const Quat_ref & quat ) const;
+    
+    // Multiply two quaternions
+    // 
+    inline const Quat_ref operator *( const Quat_ref & quat ) const;
+    
+    // Multiply a quaternion by a scalar
+    // 
+    inline const Quat_ref operator *( float scalar ) const;
+    
+    // Divide a quaternion by a scalar
+    // 
+    inline const Quat_ref operator /( float scalar ) const;
+    
+    // Perform compound assignment and addition with a quaternion
+    // 
+    inline Quat_ref & operator +=( const Quat_ref & quat );
+    
+    // Perform compound assignment and subtraction by a quaternion
+    // 
+    inline Quat_ref & operator -=( const Quat_ref & quat );
+    
+    // Perform compound assignment and multiplication by a quaternion
+    // 
+    inline Quat_ref & operator *=( const Quat_ref & quat );
+    
+    // Perform compound assignment and multiplication by a scalar
+    // 
+    inline Quat_ref & operator *=( float scalar );
+    
+    // Perform compound assignment and division by a scalar
+    // 
+    inline Quat_ref & operator /=( float scalar );
+    
+    // Negate all elements of a quaternion
+    // 
+    inline const Quat_ref operator -( ) const;
+    
+    // Construct an identity quaternion
+    // 
+    static inline const Quat_ref identity( );
+    
+    // Construct a quaternion to rotate between two unit-length 3-D vectors
+    // NOTE: 
+    // The result is unpredictable if unitVec0 and unitVec1 point in opposite directions.
+    // 
+    static inline const Quat_ref rotation( const vmVector3 & unitVec0, const vmVector3 & unitVec1 );
+    
+    // Construct a quaternion to rotate around a unit-length 3-D vector
+    // 
+    static inline const Quat_ref rotation( float radians, const vmVector3 & unitVec );
+    
+    // Construct a quaternion to rotate around the x axis
+    // 
+    static inline const Quat_ref rotationX( float radians );
+    
+    // Construct a quaternion to rotate around the y axis
+    // 
+    static inline const Quat_ref rotationY( float radians );
+    
+    // Construct a quaternion to rotate around the z axis
+    // 
+    static inline const Quat_ref rotationZ( float radians );
+    
+};
+
+inline Quat_ref::Quat_ref( const Quat_ref & quat )
+{
+    mX = quat.mX;
+    mY = quat.mY;
+    mZ = quat.mZ;
+    mW = quat.mW;
+}
+
+inline Quat_ref::Quat_ref( float _x, float _y, float _z, float _w )
+{
+    mX = _x;
+    mY = _y;
+    mZ = _z;
+    mW = _w;
+}
+
+inline Quat_ref::Quat_ref( const vmVector3 & xyz, float _w )
+{
+    this->setXYZ( xyz );
+    this->setW( _w );
+}
+
+inline Quat_ref::Quat_ref( const vmVector4 & vec )
+{
+    mX = vec.getX();
+    mY = vec.getY();
+    mZ = vec.getZ();
+    mW = vec.getW();
+}
+
+inline Quat_ref::Quat_ref( float scalar )
+{
+    mX = scalar;
+    mY = scalar;
+    mZ = scalar;
+    mW = scalar;
+}
+
+inline const Quat_ref Quat_ref::identity( )
+{
+    return Quat_ref( 0.0f, 0.0f, 0.0f, 1.0f );
+}
+
+
+inline void loadXYZW_ref( Quat_ref & quat, const float * fptr )
+{
+    quat = Quat_ref( fptr[0], fptr[1], fptr[2], fptr[3] );
+}
+
+inline void storeXYZW_ref( const Quat_ref & quat, float * fptr )
+{
+    fptr[0] = quat.getX();
+    fptr[1] = quat.getY();
+    fptr[2] = quat.getZ();
+    fptr[3] = quat.getW();
+}
+
+inline Quat_ref & Quat_ref::operator =( const Quat_ref & quat )
+{
+    mX = quat.mX;
+    mY = quat.mY;
+    mZ = quat.mZ;
+    mW = quat.mW;
+    return *this;
+}
+
+inline Quat_ref & Quat_ref::setXYZ( const vmVector3 & vec )
+{
+    mX = vec.getX();
+    mY = vec.getY();
+    mZ = vec.getZ();
+    return *this;
+}
+
+inline const vmVector3 Quat_ref::getXYZ( ) const
+{
+    return vmVector3( mX, mY, mZ );
+}
+
+inline Quat_ref & Quat_ref::setX( float _x )
+{
+    mX = _x;
+    return *this;
+}
+
+inline float Quat_ref::getX( ) const
+{
+    return mX;
+}
+
+inline Quat_ref & Quat_ref::setY( float _y )
+{
+    mY = _y;
+    return *this;
+}
+
+inline float Quat_ref::getY( ) const
+{
+    return mY;
+}
+
+inline Quat_ref & Quat_ref::setZ( float _z )
+{
+    mZ = _z;
+    return *this;
+}
+
+inline float Quat_ref::getZ( ) const
+{
+    return mZ;
+}
+
+inline Quat_ref & Quat_ref::setW( float _w )
+{
+    mW = _w;
+    return *this;
+}
+
+inline float Quat_ref::getW( ) const
+{
+    return mW;
+}
+
+inline Quat_ref & Quat_ref::setElem( int idx, float value )
+{
+    *(&mX + idx) = value;
+    return *this;
+}
+
+inline float Quat_ref::getElem( int idx ) const
+{
+    return *(&mX + idx);
+}
+
+inline float & Quat_ref::operator []( int idx )
+{
+    return *(&mX + idx);
+}
+
+inline float Quat_ref::operator []( int idx ) const
+{
+    return *(&mX + idx);
+}
+
+inline const Quat_ref Quat_ref::operator +( const Quat_ref & quat ) const
+{
+    return Quat_ref(
+                ( mX + quat.mX ),
+                ( mY + quat.mY ),
+                ( mZ + quat.mZ ),
+                ( mW + quat.mW )
+                );
+}
+
+inline const Quat_ref Quat_ref::operator -( const Quat_ref & quat ) const
+{
+    return Quat_ref(
+                ( mX - quat.mX ),
+                ( mY - quat.mY ),
+                ( mZ - quat.mZ ),
+                ( mW - quat.mW )
+                );
+}
+
+inline const Quat_ref Quat_ref::operator *( float scalar ) const
+{
+    return Quat_ref(
+                ( mX * scalar ),
+                ( mY * scalar ),
+                ( mZ * scalar ),
+                ( mW * scalar )
+                );
+}
+
+inline Quat_ref & Quat_ref::operator +=( const Quat_ref & quat )
+{
+    *this = *this + quat;
+    return *this;
+}
+
+inline Quat_ref & Quat_ref::operator -=( const Quat_ref & quat )
+{
+    *this = *this - quat;
+    return *this;
+}
+
+inline Quat_ref & Quat_ref::operator *=( float scalar )
+{
+    *this = *this * scalar;
+    return *this;
+}
+
+inline const Quat_ref Quat_ref::operator /( float scalar ) const
+{
+    return Quat_ref(
+                ( mX / scalar ),
+                ( mY / scalar ),
+                ( mZ / scalar ),
+                ( mW / scalar )
+                );
+}
+
+inline Quat_ref & Quat_ref::operator /=( float scalar )
+{
+    *this = *this / scalar;
+    return *this;
+}
+
+inline const Quat_ref Quat_ref::operator -( ) const
+{
+    return Quat_ref(
+                -mX,
+                -mY,
+                -mZ,
+                -mW
+                );
+}
+
+inline const Quat_ref operator *( float scalar, const Quat_ref & quat )
+{
+    return quat * scalar;
+}
+
+inline float dot( const Quat_ref & quat0, const Quat_ref & quat1 )
+{
+    float result;
+    result = ( quat0.getX() * quat1.getX() );
+    result = ( result + ( quat0.getY() * quat1.getY() ) );
+    result = ( result + ( quat0.getZ() * quat1.getZ() ) );
+    result = ( result + ( quat0.getW() * quat1.getW() ) );
+    return result;
+}
+
+inline const Quat_ref lerp( float t, const Quat_ref & quat0, const Quat_ref & quat1 )
+{
+    return ( quat0 + ( ( quat1 - quat0 ) * t ) );
+}
+
+inline const Quat_ref slerp( float t, const Quat_ref & unitQuat0, const Quat_ref & unitQuat1 )
+{
+    Quat_ref start;
+    float recipSinAngle, scale0, scale1, cosAngle, angle;
+    cosAngle = dot( unitQuat0, unitQuat1 );
+    if ( cosAngle < 0.0f ) {
+        cosAngle = -cosAngle;
+        start = ( -unitQuat0 );
+    } else {
+        start = unitQuat0;
+    }
+    if ( cosAngle < _VECTORMATH_SLERP_TOL ) {
+        angle = acosf( cosAngle );
+        recipSinAngle = ( 1.0f / sinf( angle ) );
+        scale0 = ( sinf( ( ( 1.0f - t ) * angle ) ) * recipSinAngle );
+        scale1 = ( sinf( ( t * angle ) ) * recipSinAngle );
+    } else {
+        scale0 = ( 1.0f - t );
+        scale1 = t;
+    }
+    return ( ( start * scale0 ) + ( unitQuat1 * scale1 ) );
+}
+
+inline const Quat_ref squad( float t, const Quat_ref & unitQuat0, const Quat_ref & unitQuat1, const Quat_ref & unitQuat2, const Quat_ref & unitQuat3 )
+{
+    Quat_ref tmp0, tmp1;
+    tmp0 = slerp( t, unitQuat0, unitQuat3 );
+    tmp1 = slerp( t, unitQuat1, unitQuat2 );
+    return slerp( ( ( 2.0f * t ) * ( 1.0f - t ) ), tmp0, tmp1 );
+}
+
+inline float norm( const Quat_ref & quat )
+{
+    float result;
+    result = ( quat.getX() * quat.getX() );
+    result = ( result + ( quat.getY() * quat.getY() ) );
+    result = ( result + ( quat.getZ() * quat.getZ() ) );
+    result = ( result + ( quat.getW() * quat.getW() ) );
+    return result;
+}
+
+inline float length( const Quat_ref & quat )
+{
+    return ::sqrtf( norm( quat ) );
+}
+
+inline const Quat_ref normalize( const Quat_ref & quat )
+{
+    float lenSqr, lenInv;
+    lenSqr = norm( quat );
+    lenInv = ( 1.0f / sqrtf( lenSqr ) );
+    return Quat_ref(
+                ( quat.getX() * lenInv ),
+                ( quat.getY() * lenInv ),
+                ( quat.getZ() * lenInv ),
+                ( quat.getW() * lenInv )
+                );
+}
+
+inline const Quat_ref Quat_ref::rotation( const vmVector3 & unitVec0, const vmVector3 & unitVec1 )
+{
+    float cosHalfAngleX2, recipCosHalfAngleX2;
+    cosHalfAngleX2 = sqrtf( ( 2.0f * ( 1.0f + dot( unitVec0, unitVec1 ) ) ) );
+    recipCosHalfAngleX2 = ( 1.0f / cosHalfAngleX2 );
+    return Quat_ref( ( cross( unitVec0, unitVec1 ) * recipCosHalfAngleX2 ), ( cosHalfAngleX2 * 0.5f ) );
+}
+
+inline const Quat_ref Quat_ref::rotation( float radians, const vmVector3 & unitVec )
+{
+    float s, c, angle;
+    angle = ( radians * 0.5f );
+    s = sinf( angle );
+    c = cosf( angle );
+    return Quat_ref( ( unitVec * s ), c );
+}
+
+inline const Quat_ref Quat_ref::rotationX( float radians )
+{
+    float s, c, angle;
+    angle = ( radians * 0.5f );
+    s = sinf( angle );
+    c = cosf( angle );
+    return Quat_ref( s, 0.0f, 0.0f, c );
+}
+
+inline const Quat_ref Quat_ref::rotationY( float radians )
+{
+    float s, c, angle;
+    angle = ( radians * 0.5f );
+    s = sinf( angle );
+    c = cosf( angle );
+    return Quat_ref( 0.0f, s, 0.0f, c );
+}
+
+inline const Quat_ref Quat_ref::rotationZ( float radians )
+{
+    float s, c, angle;
+    angle = ( radians * 0.5f );
+    s = sinf( angle );
+    c = cosf( angle );
+    return Quat_ref( 0.0f, 0.0f, s, c );
+}
+
+inline const Quat_ref Quat_ref::operator *( const Quat_ref & quat ) const
+{
+    return Quat_ref(
+                ( ( ( ( mW * quat.mX ) + ( mX * quat.mW ) ) + ( mY * quat.mZ ) ) - ( mZ * quat.mY ) ),
+                ( ( ( ( mW * quat.mY ) + ( mY * quat.mW ) ) + ( mZ * quat.mX ) ) - ( mX * quat.mZ ) ),
+                ( ( ( ( mW * quat.mZ ) + ( mZ * quat.mW ) ) + ( mX * quat.mY ) ) - ( mY * quat.mX ) ),
+                ( ( ( ( mW * quat.mW ) - ( mX * quat.mX ) ) - ( mY * quat.mY ) ) - ( mZ * quat.mZ ) )
+                );
+}
+
+inline Quat_ref & Quat_ref::operator *=( const Quat_ref & quat )
+{
+    *this = *this * quat;
+    return *this;
+}
+
+inline const vmVector3 rotate( const Quat_ref & quat, const vmVector3 & vec )
+{
+    float tmpX, tmpY, tmpZ, tmpW;
+    tmpX = ( ( ( quat.getW() * vec.getX() ) + ( quat.getY() * vec.getZ() ) ) - ( quat.getZ() * vec.getY() ) );
+    tmpY = ( ( ( quat.getW() * vec.getY() ) + ( quat.getZ() * vec.getX() ) ) - ( quat.getX() * vec.getZ() ) );
+    tmpZ = ( ( ( quat.getW() * vec.getZ() ) + ( quat.getX() * vec.getY() ) ) - ( quat.getY() * vec.getX() ) );
+    tmpW = ( ( ( quat.getX() * vec.getX() ) + ( quat.getY() * vec.getY() ) ) + ( quat.getZ() * vec.getZ() ) );
+    return vmVector3(
+                   ( ( ( ( tmpW * quat.getX() ) + ( tmpX * quat.getW() ) ) - ( tmpY * quat.getZ() ) ) + ( tmpZ * quat.getY() ) ),
+                   ( ( ( ( tmpW * quat.getY() ) + ( tmpY * quat.getW() ) ) - ( tmpZ * quat.getX() ) ) + ( tmpX * quat.getZ() ) ),
+                   ( ( ( ( tmpW * quat.getZ() ) + ( tmpZ * quat.getW() ) ) - ( tmpX * quat.getY() ) ) + ( tmpY * quat.getX() ) )
+                   );
+}
+
+inline const Quat_ref conj( const Quat_ref & quat )
+{
+    return Quat_ref( -quat.getX(), -quat.getY(), -quat.getZ(), quat.getW() );
+}
+
+inline const Quat_ref select( const Quat_ref & quat0, const Quat_ref & quat1, bool select1 )
+{
+    return Quat_ref(
+                ( select1 )? quat1.getX() : quat0.getX(),
+                ( select1 )? quat1.getY() : quat0.getY(),
+                ( select1 )? quat1.getZ() : quat0.getZ(),
+                ( select1 )? quat1.getW() : quat0.getW()
+                );
+}
+
+
+
+#define LOOPCOUNT 1000
+#define NUM_CYCLES 10000
+#define DATA_SIZE 1024
+
+int Test_quat_aos_neon(void)
+{
+       
+    return 0;
+}
+
+#endif
+
diff --git a/Test/Source/Tests/Test_quat_aos_neon.h b/Test/Source/Tests/Test_quat_aos_neon.h
new file mode 100644
index 000000000..e751a41be
--- /dev/null
+++ b/Test/Source/Tests/Test_quat_aos_neon.h
@@ -0,0 +1,21 @@
+//
+//  Test_quat_aos_neon.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc., Inc.
+//
+
+#ifndef BulletTest_Test_quat_aos_neon_h
+#define BulletTest_Test_quat_aos_neon_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+    
+    int Test_quat_aos_neon(void);
+    
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/Test/Source/Tests/Test_v3cross.cpp b/Test/Source/Tests/Test_v3cross.cpp
new file mode 100644
index 000000000..00ff4e421
--- /dev/null
+++ b/Test/Source/Tests/Test_v3cross.cpp
@@ -0,0 +1,181 @@
+//
+//  Test_v3cross.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+
+#include "Test_v3cross.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btVector3.h>
+
+// reference code for testing purposes
+static btVector3& v3cross_ref(btVector3& v1, btVector3& v2);
+
+#define LOOPCOUNT 1024
+#define NUM_CYCLES 1000
+
+int Test_v3cross(void)
+{
+    btVector3 v1, v2, v3;
+	
+    float x,y,z,w;
+    
+    // Init the data
+    x = RANDF_01;
+    y = RANDF_01;
+    z = RANDF_01;
+    w = BT_NAN;     // w channel NaN
+    v1.setValue(x,y,z);
+	v1.setW(w);
+
+    x = RANDF_01;
+    y = RANDF_01;
+    z = RANDF_01;
+    v2.setValue(x,y,z);
+	v2.setW(w);
+
+	v3 = v1;
+		
+    btVector3 correct_res, test_res;
+	 
+    {
+		float vNaN = BT_NAN;
+		correct_res.setValue(vNaN, vNaN, vNaN); 
+		test_res.setValue(vNaN, vNaN, vNaN);
+		correct_res = v3cross_ref(v1, v2);
+		test_res = v3.cross(v2);
+	   
+		if( fabs(correct_res.m_floats[0] - test_res.m_floats[0]) +  
+			fabs(correct_res.m_floats[1] - test_res.m_floats[1]) + 
+			fabs(correct_res.m_floats[2] - test_res.m_floats[2]) > FLT_EPSILON * 4)
+		{	
+			vlog( "Error - v3cross result error! "
+					"\ncorrect = (%10.4f, %10.4f, %10.4f) "
+					"\ntested  = (%10.4f, %10.4f, %10.4f) \n", 
+					correct_res.m_floats[0], correct_res.m_floats[1], correct_res.m_floats[2], 
+					test_res.m_floats[0], test_res.m_floats[1], test_res.m_floats[2]);
+		
+			return 1;
+		}
+	}
+    
+#define DATA_SIZE LOOPCOUNT
+
+	btVector3 vec3_arr1[DATA_SIZE];
+	btVector3 vec3_arr2[DATA_SIZE];
+
+    uint64_t scalarTime;
+    uint64_t vectorTime;
+    size_t j, k;
+
+	{
+        uint64_t startTime, bestTime, currentTime;
+	    
+        bestTime = -1LL;
+        scalarTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+			for( k = 0; k < DATA_SIZE; k++ )
+			{
+			    x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+                vec3_arr1[k].setValue(x,y,z);
+				vec3_arr1[k].setW(w);
+
+			    x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+				vec3_arr2[k].setValue(x,y,z);
+				vec3_arr2[k].setW(w);
+			}
+
+            startTime = ReadTicks();
+            for( k = 0; k < LOOPCOUNT; k++ )
+			{
+	            vec3_arr1[k] = v3cross_ref(vec3_arr1[k], vec3_arr2[k]);
+			}
+			currentTime = ReadTicks() - startTime;
+            scalarTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            scalarTime = bestTime;        
+        else
+            scalarTime /= NUM_CYCLES;
+    }
+    
+    {
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        vectorTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+			for( k = 0; k < DATA_SIZE; k++ )
+			{
+			    x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+				vec3_arr1[k].setValue(x,y,z);
+				vec3_arr1[k].setW(w);
+
+			    x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+				vec3_arr2[k].setValue(x,y,z);
+				vec3_arr2[k].setW(w);
+			}
+
+            startTime = ReadTicks();
+            for( k = 0; k < LOOPCOUNT; k++ )
+			{
+				vec3_arr1[k] = vec3_arr1[k].cross(vec3_arr2[k]);
+			}
+			currentTime = ReadTicks() - startTime;
+            vectorTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            vectorTime = bestTime;        
+        else
+            vectorTime /= NUM_CYCLES;
+    }
+
+    vlog( "Timing:\n" );
+    vlog( "     \t    scalar\t    vector\n" );
+    vlog( "    \t%10.4f\t%10.4f\n", TicksToCycles( scalarTime ) / LOOPCOUNT, 
+									TicksToCycles( vectorTime ) / LOOPCOUNT );
+
+    return 0;
+}
+
+static btVector3& v3cross_ref(btVector3& v1, btVector3& v2)
+{
+	btScalar x,y,z;
+	x = v1.m_floats[1] * v2.m_floats[2] - v1.m_floats[2] * v2.m_floats[1];
+	y = v1.m_floats[2] * v2.m_floats[0] - v1.m_floats[0] * v2.m_floats[2];
+	z = v1.m_floats[0] * v2.m_floats[1] - v1.m_floats[1] * v2.m_floats[0];
+	
+	v1.m_floats[0] = x;
+	v1.m_floats[1] = y;
+	v1.m_floats[2] = z;
+	
+	return v1;
+}
+
+
+#endif //BT_USE_SSE
diff --git a/Test/Source/Tests/Test_v3cross.h b/Test/Source/Tests/Test_v3cross.h
new file mode 100644
index 000000000..79854fe91
--- /dev/null
+++ b/Test/Source/Tests/Test_v3cross.h
@@ -0,0 +1,22 @@
+//
+//  Test_v3cross.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_v3cross_h
+#define BulletTest_Test_v3cross_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+
+int Test_v3cross(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+    
+#endif
diff --git a/Test/Source/Tests/Test_v3div.cpp b/Test/Source/Tests/Test_v3div.cpp
new file mode 100644
index 000000000..bd5800289
--- /dev/null
+++ b/Test/Source/Tests/Test_v3div.cpp
@@ -0,0 +1,178 @@
+//
+//  Test_v3div.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+
+#include "Test_v3div.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btVector3.h>
+
+#define BT_OP(a, b)     ((a) / (b))
+// reference code for testing purposes
+static inline btVector3& v3div_ref(btVector3& v1, btVector3& v2);
+
+static btVector3& v3div_ref(btVector3& v0, btVector3& v1, btVector3& v2)
+{
+	v0.m_floats[0] = BT_OP(v1.m_floats[0] , v2.m_floats[0]), 
+	v0.m_floats[1] = BT_OP(v1.m_floats[1] , v2.m_floats[1]),
+	v0.m_floats[2] = BT_OP(v1.m_floats[2] , v2.m_floats[2]);
+	
+	return v0;
+}
+
+#define LOOPCOUNT 1024
+#define NUM_CYCLES 1000
+
+int Test_v3div(void)
+{
+    btVector3 v1, v2, v3;
+	
+    float x,y,z,w;
+    
+    // Init the data
+    x = RANDF_01;
+    y = RANDF_01;
+    z = RANDF_01;
+    w = BT_NAN;     // w channel NaN
+    v1.setValue(x,y,z);
+	v1.setW(w);
+
+    x = RANDF_01;
+    y = RANDF_01;
+    z = RANDF_01;
+    v2.setValue(x,y,z);
+	v2.setW(w);
+
+	v3 = v1;
+		
+    btVector3 correct_res, test_res;
+	 
+    {
+		float vNaN = BT_NAN;
+		correct_res.setValue(vNaN, vNaN, vNaN); 
+		test_res.setValue(vNaN, vNaN, vNaN);
+		correct_res = v3div_ref(correct_res, v1, v2);
+		test_res = BT_OP(v3,v2);
+	   
+		if( fabsf(correct_res.m_floats[0] - test_res.m_floats[0]) + 
+			fabsf(correct_res.m_floats[1] - test_res.m_floats[1]) +
+			fabsf(correct_res.m_floats[2] - test_res.m_floats[2]) > FLT_EPSILON*10 )
+		{	
+			vlog( "Error - v3div result error! "
+					"\ncorrect = (%10.4f, %10.4f, %10.4f) "
+					"\ntested  = (%10.4f, %10.4f, %10.4f) \n", 
+					correct_res.m_floats[0], correct_res.m_floats[1], correct_res.m_floats[2], 
+					test_res.m_floats[0], test_res.m_floats[1], test_res.m_floats[2]);
+		
+			return 1;
+		}
+	}
+    
+#define DATA_SIZE LOOPCOUNT
+
+	btVector3 vec3_arr0[DATA_SIZE];
+	btVector3 vec3_arr1[DATA_SIZE];
+	btVector3 vec3_arr2[DATA_SIZE];
+
+    uint64_t scalarTime;
+    uint64_t vectorTime;
+    size_t j, k;
+
+	{
+        uint64_t startTime, bestTime, currentTime;
+		w = BT_NAN;     // w channel NaN
+        
+        bestTime = -1LL;
+        scalarTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+			for( k = 0; k < DATA_SIZE; k++ )
+			{
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+				vec3_arr1[k].setValue(x,y,z);
+				vec3_arr1[k].setW(w);
+
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+				vec3_arr2[k].setValue(x,y,z);
+				vec3_arr2[k].setW(w);
+			}
+
+            startTime = ReadTicks();
+            for( k = 0; k < LOOPCOUNT; k++ )
+			{
+	            vec3_arr0[k] = v3div_ref(vec3_arr0[k], vec3_arr1[k], vec3_arr2[k]);
+			}
+			currentTime = ReadTicks() - startTime;
+            scalarTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            scalarTime = bestTime;        
+        else
+            scalarTime /= NUM_CYCLES;
+    }
+    
+    {
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        vectorTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+			for( k = 0; k < DATA_SIZE; k++ )
+			{
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+				vec3_arr1[k].setValue(x,y,z);
+				vec3_arr1[k].setW(w);
+
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+				vec3_arr2[k].setValue(x,y,z);
+				vec3_arr2[k].setW(w);
+			}
+
+            startTime = ReadTicks();
+            for( k = 0; k < LOOPCOUNT; k++ )
+			{
+				vec3_arr0[k] = BT_OP(vec3_arr1[k] , vec3_arr2[k]);
+			}
+			currentTime = ReadTicks() - startTime;
+            vectorTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            vectorTime = bestTime;        
+        else
+            vectorTime /= NUM_CYCLES;
+    }
+
+    vlog( "Timing:\n" );
+    vlog( "     \t    scalar\t    vector\n" );
+    vlog( "    \t%10.4f\t%10.4f\n", TicksToCycles( scalarTime ) / LOOPCOUNT, 
+									TicksToCycles( vectorTime ) / LOOPCOUNT );
+
+    return 0;
+}
+#endif //BT_USE_SSE
diff --git a/Test/Source/Tests/Test_v3div.h b/Test/Source/Tests/Test_v3div.h
new file mode 100644
index 000000000..21bfb6131
--- /dev/null
+++ b/Test/Source/Tests/Test_v3div.h
@@ -0,0 +1,22 @@
+//
+//  Test_v3div.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_v3div_h
+#define BulletTest_Test_v3div_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+
+int Test_v3div(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+    
+#endif
diff --git a/Test/Source/Tests/Test_v3dot.cpp b/Test/Source/Tests/Test_v3dot.cpp
new file mode 100644
index 000000000..caa2967d2
--- /dev/null
+++ b/Test/Source/Tests/Test_v3dot.cpp
@@ -0,0 +1,164 @@
+//
+//  Test_v3dot.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+#include "Test_v3dot.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btVector3.h>
+
+// reference code for testing purposes
+static inline 
+btScalar v3dot_ref(
+    const btVector3& v1, 
+	const btVector3& v2);
+
+#define LOOPCOUNT 1000
+#define NUM_CYCLES 10000
+
+int Test_v3dot(void)
+{
+    btVector3 v1, v2;
+   
+    float x,y,z,w;
+    
+    // Init the data
+    x = RANDF_01;
+    y = RANDF_01;
+    z = RANDF_01;
+    w = BT_NAN;     // w channel NaN
+    v1.setValue(x,y,z);
+	v1.setW(w);
+
+    x = RANDF_01;
+    y = RANDF_01;
+    z = RANDF_01;
+    v2.setValue(x,y,z);
+	v2.setW(w);
+	
+    float correctDot0, testDot0;
+
+    {
+		correctDot0 = w;
+		testDot0 = w; ;
+		correctDot0 = v3dot_ref(v1, v2);
+		testDot0 = v1.dot(v2);
+	   
+		if( fabsf(correctDot0 - testDot0) > FLT_EPSILON * 4 )
+		{
+			vlog( "Error - v3dot result error! %f != %f \n", correctDot0, testDot0);
+		
+			return 1;
+		}
+	}
+    
+#define DATA_SIZE 1024
+
+	btVector3 vec3_arr1[DATA_SIZE];
+	btVector3 vec3_arr2[DATA_SIZE];
+    btScalar res_arr[DATA_SIZE];
+    
+    uint64_t scalarTime;
+    uint64_t vectorTime;
+    size_t j, k;
+
+	for( k = 0; k < DATA_SIZE; k++ )
+	{
+        x = RANDF_01;
+        y = RANDF_01;
+        z = RANDF_01;
+		vec3_arr1[k].setValue(x,y,z);
+		vec3_arr1[k].setW(w);
+
+        x = RANDF_01;
+        y = RANDF_01;
+        z = RANDF_01;
+		vec3_arr2[k].setValue(x,y,z);
+		vec3_arr2[k].setW(w);
+	
+        res_arr[k] = w;
+    }
+    
+	{
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        scalarTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+            startTime = ReadTicks();
+            for( k = 0; k+4 <= LOOPCOUNT; k+=4 )
+			{
+				size_t k32 = (k & (DATA_SIZE-1)); 
+                res_arr[k32] = v3dot_ref( vec3_arr1[k32], vec3_arr2[k32]); k32++;
+				res_arr[k32] = v3dot_ref( vec3_arr1[k32], vec3_arr2[k32]); k32++;
+				res_arr[k32] = v3dot_ref( vec3_arr1[k32], vec3_arr2[k32]); k32++;
+				res_arr[k32] = v3dot_ref( vec3_arr1[k32], vec3_arr2[k32]); 
+			}
+			currentTime = ReadTicks() - startTime;
+            scalarTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            scalarTime = bestTime;        
+        else
+            scalarTime /= NUM_CYCLES;
+    }
+    
+    {
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        vectorTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+            startTime = ReadTicks();
+            for( k = 0; k+4 <= LOOPCOUNT; k+=4 )
+			{
+				size_t k32 = k & (DATA_SIZE -1); 
+                res_arr[k32] = vec3_arr1[k32].dot(vec3_arr2[k32]); k32++;
+				res_arr[k32] = vec3_arr1[k32].dot(vec3_arr2[k32]); k32++;
+				res_arr[k32] = vec3_arr1[k32].dot(vec3_arr2[k32]); k32++;
+				res_arr[k32] = vec3_arr1[k32].dot(vec3_arr2[k32]);
+			}
+			currentTime = ReadTicks() - startTime;
+            vectorTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            vectorTime = bestTime;        
+        else
+            vectorTime /= NUM_CYCLES;
+    }
+
+    vlog( "Timing:\n" );
+    vlog( "     \t    scalar\t    vector\n" );
+        vlog( "    \t%10.4f\t%10.4f\n", TicksToCycles( scalarTime ) / LOOPCOUNT, TicksToCycles( vectorTime ) / LOOPCOUNT );
+
+    return 0;
+}
+
+
+static btScalar v3dot_ref(const btVector3& v1, 
+						const btVector3& v2)
+{
+	return  (v1.m_floats[0] * v2.m_floats[0] + 
+			 v1.m_floats[1] * v2.m_floats[1] + 
+			 v1.m_floats[2] * v2.m_floats[2]);
+}
+
+#endif //BT_USE_SSE
diff --git a/Test/Source/Tests/Test_v3dot.h b/Test/Source/Tests/Test_v3dot.h
new file mode 100644
index 000000000..b80a3af64
--- /dev/null
+++ b/Test/Source/Tests/Test_v3dot.h
@@ -0,0 +1,22 @@
+//
+//  Test_v3dot.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_v3dot_h
+#define BulletTest_Test_v3dot_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+
+int Test_v3dot(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+    
+#endif
diff --git a/Test/Source/Tests/Test_v3interp.cpp b/Test/Source/Tests/Test_v3interp.cpp
new file mode 100644
index 000000000..b03377597
--- /dev/null
+++ b/Test/Source/Tests/Test_v3interp.cpp
@@ -0,0 +1,195 @@
+//
+//  Test_v3interp.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+#include "Test_v3interp.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btVector3.h>
+
+// reference code for testing purposes
+static inline 
+btVector3& v3interp_ref(
+    btVector3& vr, 
+    btVector3& v0, 
+    btVector3& v1, 
+    btScalar& rt);
+
+#define LOOPCOUNT 1024
+#define NUM_CYCLES 1000
+
+int Test_v3interp(void)
+{
+    btVector3 v1, v2;
+	btScalar rt;
+    
+    float x,y,z,w;
+
+	float vNaN = BT_NAN;
+    w = BT_NAN;     // w channel NaN
+    
+    btVector3 correct_res, test_res;
+
+    for (rt = 0.0f; rt <= 1.0f; rt += 0.1f) 
+    {
+		correct_res.setValue(vNaN, vNaN, vNaN); 
+		test_res.setValue(vNaN, vNaN, vNaN);
+
+        // Init the data
+        x = RANDF_01;
+        y = RANDF_01;
+        z = RANDF_01;
+        v1.setValue(x,y,z);
+        v1.setW(w);
+
+        x = RANDF_01;
+        y = RANDF_01;
+        z = RANDF_01;
+        v2.setValue(x,y,z);
+        v2.setW(w);
+
+        correct_res = v3interp_ref(correct_res, v1, v2, rt);
+		test_res.setInterpolate3(v1, v2, rt);
+	   
+		if( fabs(correct_res.m_floats[0] - test_res.m_floats[0]) +  
+			fabs(correct_res.m_floats[1] - test_res.m_floats[1]) + 
+			fabs(correct_res.m_floats[2] - test_res.m_floats[2]) > FLT_EPSILON * 4)
+		{	
+			vlog( "Error - v3interp result error! "
+					"\ncorrect = (%10.4f, %10.4f, %10.4f) "
+					"\ntested  = (%10.4f, %10.4f, %10.4f) \n"
+                    "\n rt=%10.4f", 
+					correct_res.m_floats[0], correct_res.m_floats[1], correct_res.m_floats[2], 
+					test_res.m_floats[0], test_res.m_floats[1], test_res.m_floats[2], rt);
+		
+			return 1;
+		}
+	}
+    
+#define DATA_SIZE LOOPCOUNT
+
+	btVector3 vec3_arr1[DATA_SIZE];
+	btVector3 vec3_arr2[DATA_SIZE];
+    btScalar  rt_arr[DATA_SIZE];
+
+    uint64_t scalarTime;
+    uint64_t vectorTime;
+    size_t j, k;
+
+	{
+        uint64_t startTime, bestTime, currentTime;
+		
+        bestTime = -1LL;
+        scalarTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+			for( k = 0; k < DATA_SIZE; k++ )
+			{
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+				vec3_arr1[k].setValue(x,y,z);
+				vec3_arr1[k].setW(w);
+
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+				vec3_arr2[k].setValue(x,y,z);
+				vec3_arr2[k].setW(w);
+		
+                rt_arr[k] = RANDF_01;
+            }
+
+            startTime = ReadTicks();
+            for( k = 0; k < LOOPCOUNT; k++ )
+			{
+	            v3interp_ref(vec3_arr1[k], vec3_arr1[k], vec3_arr2[k], rt_arr[k]);
+			}
+			currentTime = ReadTicks() - startTime;
+            scalarTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            scalarTime = bestTime;        
+        else
+            scalarTime /= NUM_CYCLES;
+    }
+    
+    {
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        vectorTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+			for( k = 0; k < DATA_SIZE; k++ )
+			{
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+				vec3_arr1[k].setValue(x,y,z);
+				vec3_arr1[k].setW(w);
+
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+				vec3_arr2[k].setValue(x,y,z);
+				vec3_arr2[k].setW(w);
+                
+                rt_arr[k] = RANDF_01;
+			}
+
+            startTime = ReadTicks();
+            for( k = 0; k < LOOPCOUNT; k++ )
+			{
+				vec3_arr1[k].setInterpolate3(vec3_arr1[k], vec3_arr2[k], rt_arr[k]);
+			}
+			currentTime = ReadTicks() - startTime;
+            vectorTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            vectorTime = bestTime;        
+        else
+            vectorTime /= NUM_CYCLES;
+    }
+
+    vlog( "Timing:\n" );
+    vlog( "     \t    scalar\t    vector\n" );
+    vlog( "    \t%10.4f\t%10.4f\n", TicksToCycles( scalarTime ) / LOOPCOUNT, 
+									TicksToCycles( vectorTime ) / LOOPCOUNT );
+
+    return 0;
+}
+
+static btVector3& 
+v3interp_ref(
+    btVector3& vr, 
+    btVector3& v0, 
+    btVector3& v1, 
+    btScalar& rt)
+{
+    btScalar s = btScalar(1.0) - rt;
+    vr.m_floats[0] = s * v0.m_floats[0] + rt * v1.m_floats[0];
+    vr.m_floats[1] = s * v0.m_floats[1] + rt * v1.m_floats[1];
+    vr.m_floats[2] = s * v0.m_floats[2] + rt * v1.m_floats[2];
+
+	return vr;
+}
+
+#endif //BT_USE_SSE
diff --git a/Test/Source/Tests/Test_v3interp.h b/Test/Source/Tests/Test_v3interp.h
new file mode 100644
index 000000000..502b1ad73
--- /dev/null
+++ b/Test/Source/Tests/Test_v3interp.h
@@ -0,0 +1,22 @@
+//
+//  Test_v3interp.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_v3interp_h
+#define BulletTest_Test_v3interp_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+
+int Test_v3interp(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+    
+#endif
diff --git a/Test/Source/Tests/Test_v3lerp.cpp b/Test/Source/Tests/Test_v3lerp.cpp
new file mode 100644
index 000000000..311d0bd4e
--- /dev/null
+++ b/Test/Source/Tests/Test_v3lerp.cpp
@@ -0,0 +1,198 @@
+//
+//  Test_v3lerp.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+
+#include "Test_v3lerp.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btVector3.h>
+
+// reference code for testing purposes
+static inline 
+btVector3& 
+v3lerp_ref(
+    btVector3& vr, 
+    btVector3& v0, 
+    btVector3& v1, 
+    btScalar& rt);
+
+#define LOOPCOUNT 1024
+#define NUM_CYCLES 1000
+
+int Test_v3lerp(void)
+{
+    btVector3 v1, v2;
+	btScalar rt;
+    
+    float x,y,z,w;
+
+	float vNaN =BT_NAN;
+    w =BT_NAN;     // w channel NaN
+    
+    btVector3 correct_res, test_res;
+
+    for (rt = 0.0f; rt <= 1.0f; rt += 0.1f) 
+    {
+		correct_res.setValue(vNaN, vNaN, vNaN); 
+		test_res.setValue(vNaN, vNaN, vNaN);
+
+        // Init the data
+        x = RANDF_01;
+        y = RANDF_01;
+        z = RANDF_01;
+        v1.setValue(x,y,z);
+        v1.setW(w);
+
+        x = RANDF_01;
+        y = RANDF_01;
+        z = RANDF_01;
+        v2.setValue(x,y,z);
+        v2.setW(w);
+
+        correct_res = v3lerp_ref(correct_res, v1, v2, rt);
+		test_res = v1.lerp(v2, rt);
+	   
+		if( fabs(correct_res.m_floats[0] - test_res.m_floats[0]) +  
+			fabs(correct_res.m_floats[1] - test_res.m_floats[1]) +
+			fabs(correct_res.m_floats[2] - test_res.m_floats[2]) > FLT_EPSILON * 4)
+		{	
+			vlog( "Error - v3lerp result error! "
+					"\ncorrect = (%10.4f, %10.4f, %10.4f) "
+					"\ntested  = (%10.4f, %10.4f, %10.4f) \n"
+                    "\n rt=%10.4f", 
+					correct_res.m_floats[0], correct_res.m_floats[1], correct_res.m_floats[2], 
+					test_res.m_floats[0], test_res.m_floats[1], test_res.m_floats[2], rt);
+		
+			return 1;
+		}
+	}
+    
+#define DATA_SIZE LOOPCOUNT
+
+	btVector3 vec3_arr1[DATA_SIZE];
+	btVector3 vec3_arr2[DATA_SIZE];
+    btScalar  rt_arr[DATA_SIZE];
+
+    uint64_t scalarTime;
+    uint64_t vectorTime;
+    size_t j, k;
+
+	{
+        uint64_t startTime, bestTime, currentTime;
+		w =BT_NAN;     // w channel NaN
+        
+        bestTime = -1LL;
+        scalarTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+			for( k = 0; k < DATA_SIZE; k++ )
+			{
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+				vec3_arr1[k].setValue(x,y,z);
+				vec3_arr1[k].setW(w);
+
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+				vec3_arr2[k].setValue(x,y,z);
+				vec3_arr2[k].setW(w);
+
+                rt_arr[k] = RANDF_01;
+			}
+
+            startTime = ReadTicks();
+            for( k = 0; k < LOOPCOUNT; k++ )
+			{
+	            v3lerp_ref(vec3_arr1[k], vec3_arr1[k], vec3_arr2[k], rt_arr[k]);
+			}
+			currentTime = ReadTicks() - startTime;
+            scalarTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            scalarTime = bestTime;        
+        else
+            scalarTime /= NUM_CYCLES;
+    }
+    
+    {
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        vectorTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+			for( k = 0; k < DATA_SIZE; k++ )
+			{
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+				vec3_arr1[k].setValue(x,y,z);
+				vec3_arr1[k].setW(w);
+
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+				vec3_arr2[k].setValue(x,y,z);
+				vec3_arr2[k].setW(w);
+
+                rt_arr[k] = RANDF_01;
+			}
+
+            startTime = ReadTicks();
+            for( k = 0; k < LOOPCOUNT; k++ )
+			{
+				vec3_arr1[k] = vec3_arr1[k].lerp(vec3_arr2[k], rt_arr[k]);
+			}
+			currentTime = ReadTicks() - startTime;
+            vectorTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            vectorTime = bestTime;        
+        else
+            vectorTime /= NUM_CYCLES;
+    }
+
+    vlog( "Timing:\n" );
+    vlog( "     \t    scalar\t    vector\n" );
+    vlog( "    \t%10.4f\t%10.4f\n", TicksToCycles( scalarTime ) / LOOPCOUNT, 
+									TicksToCycles( vectorTime ) / LOOPCOUNT );
+
+    return 0;
+}
+
+static 
+btVector3& 
+v3lerp_ref(
+    btVector3& vr, 
+    btVector3& v0, 
+    btVector3& v1, 
+    btScalar& rt)
+{
+    vr.m_floats[0] = v0.m_floats[0] + rt * (v1.m_floats[0] - v0.m_floats[0]);
+    vr.m_floats[1] = v0.m_floats[1] + rt * (v1.m_floats[1] - v0.m_floats[1]);
+    vr.m_floats[2] = v0.m_floats[2] + rt * (v1.m_floats[2] - v0.m_floats[2]);
+
+	return vr;
+}
+
+#endif //BT_USE_SSE
+
diff --git a/Test/Source/Tests/Test_v3lerp.h b/Test/Source/Tests/Test_v3lerp.h
new file mode 100644
index 000000000..a38a12c19
--- /dev/null
+++ b/Test/Source/Tests/Test_v3lerp.h
@@ -0,0 +1,22 @@
+//
+//  Test_v3lerp.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_v3lerp_h
+#define BulletTest_Test_v3lerp_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+
+int Test_v3lerp(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+    
+#endif
diff --git a/Test/Source/Tests/Test_v3norm.cpp b/Test/Source/Tests/Test_v3norm.cpp
new file mode 100644
index 000000000..51e0dd35c
--- /dev/null
+++ b/Test/Source/Tests/Test_v3norm.cpp
@@ -0,0 +1,170 @@
+//
+//  Test_v3norm.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+
+#include "Test_v3norm.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btVector3.h>
+
+// reference code for testing purposes
+static inline btVector3& v3norm_ref(btVector3& v);
+
+#define LOOPCOUNT 1024
+#define NUM_CYCLES 1000
+
+int Test_v3norm(void)
+{
+    btVector3 v1, v2;
+   
+    float x,y,z,w;
+    
+    // Init the data
+    x = RANDF_01;
+    y = RANDF_01;
+    z = RANDF_01;
+    w = BT_NAN;     // w channel NaN
+    v1.setValue(x,y,z);
+	v1.setW(w);
+
+    v2 = v1; 
+
+    btVector3 correct_res, test_res;
+	 
+    {
+		float vNaN = BT_NAN;
+		correct_res.setValue(vNaN, vNaN, vNaN); 
+		test_res.setValue(vNaN, vNaN, vNaN);
+		correct_res = v3norm_ref(v1);
+		test_res = v2.normalize();
+	   
+		if( fabs(correct_res.m_floats[0] - test_res.m_floats[0]) +  
+			fabs(correct_res.m_floats[1] - test_res.m_floats[1]) + 
+			fabs(correct_res.m_floats[2] - test_res.m_floats[2]) > FLT_EPSILON * 4)
+		{	
+			vlog( "Error - v3norm result error! "
+					"\ncorrect = (%10.4f, %10.4f, %10.4f) "
+					"\ntested  = (%10.4f, %10.4f, %10.4f) \n", 
+					correct_res.m_floats[0], correct_res.m_floats[1], correct_res.m_floats[2], 
+					test_res.m_floats[0], test_res.m_floats[1], test_res.m_floats[2]);
+		
+			return 1;
+		}
+	}
+    
+#define DATA_SIZE LOOPCOUNT
+
+	btVector3 vec3_arr0[DATA_SIZE];
+	btVector3 vec3_arr1[DATA_SIZE];
+
+    uint64_t scalarTime;
+    uint64_t vectorTime;
+    size_t j, k;
+
+	{
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        scalarTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+			for( k = 0; k < DATA_SIZE; k++ )
+			{
+				x = RANDF_01;
+				y = RANDF_01;
+				z = RANDF_01;
+				vec3_arr1[k].setValue(x,y,z);
+				vec3_arr1[k].setW(w);
+			}
+
+            startTime = ReadTicks();
+            for( k = 0; k+4 <= LOOPCOUNT; k+=4 )
+			{
+	             vec3_arr0[k] = v3norm_ref(vec3_arr1[k]);
+	             vec3_arr0[k+1] = v3norm_ref(vec3_arr1[k+1]);
+	             vec3_arr0[k+2] = v3norm_ref(vec3_arr1[k+2]);
+	             vec3_arr0[k+3] = v3norm_ref(vec3_arr1[k+3]);
+			}
+			currentTime = ReadTicks() - startTime;
+            scalarTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            scalarTime = bestTime;        
+        else
+            scalarTime /= NUM_CYCLES;
+    }
+    
+    {
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        vectorTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+			for( k = 0; k < DATA_SIZE; k++ )
+			{
+				x = RANDF_01;
+				y = RANDF_01;
+				z = RANDF_01;
+				vec3_arr1[k].setValue(x,y,z);
+				vec3_arr1[k].setW(w);
+			}
+
+            startTime = ReadTicks();
+            for( k = 0; k+4 <= LOOPCOUNT; k+=4 )
+			{
+				vec3_arr0[k] = vec3_arr1[k].normalize();
+				vec3_arr0[k+1] = vec3_arr1[k+1].normalize();
+				vec3_arr0[k+2] = vec3_arr1[k+2].normalize();
+				vec3_arr0[k+3] = vec3_arr1[k+3].normalize();
+			}
+			currentTime = ReadTicks() - startTime;
+            vectorTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            vectorTime = bestTime;        
+        else
+            vectorTime /= NUM_CYCLES;
+    }
+
+    vlog( "Timing:\n" );
+    vlog( "     \t    scalar\t    vector\n" );
+    vlog( "    \t%10.4f\t%10.4f\n", TicksToCycles( scalarTime ) / LOOPCOUNT, 
+									TicksToCycles( vectorTime ) / LOOPCOUNT );
+
+    return 0;
+}
+
+static btVector3& v3norm_ref(btVector3& v)
+{
+	float dot =	v.m_floats[0] * v.m_floats[0] +
+				v.m_floats[1] * v.m_floats[1] +
+				v.m_floats[2] * v.m_floats[2];
+	
+	dot = 1.0f / sqrtf(dot);
+	v.m_floats[0] *= dot; 
+	v.m_floats[1] *= dot;
+	v.m_floats[2] *= dot;
+
+	return v;
+}
+
+#endif //BT_USE_SSE
+
diff --git a/Test/Source/Tests/Test_v3norm.h b/Test/Source/Tests/Test_v3norm.h
new file mode 100644
index 000000000..5b86b4acd
--- /dev/null
+++ b/Test/Source/Tests/Test_v3norm.h
@@ -0,0 +1,22 @@
+//
+//  Test_v3norm.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_v3norm_h
+#define BulletTest_Test_v3norm_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+
+int Test_v3norm(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+    
+#endif
diff --git a/Test/Source/Tests/Test_v3rotate.cpp b/Test/Source/Tests/Test_v3rotate.cpp
new file mode 100644
index 000000000..8c3aea99b
--- /dev/null
+++ b/Test/Source/Tests/Test_v3rotate.cpp
@@ -0,0 +1,194 @@
+//
+//  Test_v3rotate.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+
+#include "Test_v3rotate.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btVector3.h>
+
+// reference code for testing purposes
+static inline 
+btVector3& v3rotate_ref(
+    btVector3& v0, 
+    btVector3& v1, 
+	const btScalar& s);
+
+#define LOOPCOUNT 2048
+#define NUM_CYCLES 1000
+
+int Test_v3rotate(void)
+{
+    btVector3 v1, v2;
+	float s;
+   
+    float x,y,z,w;
+    
+    // Init the data
+    x = RANDF_01;
+    y = RANDF_01;
+    z = RANDF_01;
+    w = BT_NAN;     // w channel NaN
+    v1.setValue(x,y,z);
+	v1.setW(w);
+
+    x = RANDF_01;
+    y = RANDF_01;
+    z = RANDF_01;
+    v2.setValue(x,y,z);
+	v2.setW(w);
+
+    s = RANDF_01 * (float) SIMD_PI;
+    
+    btVector3 correct_res, test_res;
+	 
+    {
+		float vNaN = BT_NAN;
+		correct_res.setValue(vNaN, vNaN, vNaN); 
+		test_res.setValue(vNaN, vNaN, vNaN);
+		test_res = v1.rotate(v2, s);
+        correct_res = v3rotate_ref(v1, v2, s);
+		
+		if( fabs(correct_res.m_floats[0] - test_res.m_floats[0]) + 
+			fabs(correct_res.m_floats[1] - test_res.m_floats[1]) +
+			fabs(correct_res.m_floats[2] - test_res.m_floats[2]) > FLT_EPSILON * 4)
+		{	
+			vlog( "Error - v3rotate result error! "
+					"\ncorrect = (%10.4f, %10.4f, %10.4f) "
+					"\ntested  = (%10.4f, %10.4f, %10.4f) \n", 
+					correct_res.m_floats[0], correct_res.m_floats[1], correct_res.m_floats[2], 
+					test_res.m_floats[0], test_res.m_floats[1], test_res.m_floats[2]);
+		
+			return 1;
+		}
+	}
+    
+#define DATA_SIZE LOOPCOUNT
+
+	btVector3 vec3_arr0[DATA_SIZE];
+	btVector3 vec3_arr1[DATA_SIZE];
+    btScalar  s_arr[DATA_SIZE];
+
+    uint64_t scalarTime;
+    uint64_t vectorTime;
+    size_t j, k;
+
+	{
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        scalarTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+			for( k = 0; k < DATA_SIZE; k++ )
+			{
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+				vec3_arr0[k].setValue(x,y,z);
+				vec3_arr0[k].setW(w);
+
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+				vec3_arr1[k].setValue(x,y,z);
+				vec3_arr1[k].setW(w);
+				
+				s_arr[k] = RANDF_01 * (float)SIMD_PI;
+			}
+
+            startTime = ReadTicks();
+            for( k = 0; k < LOOPCOUNT; k++ )
+			{
+                vec3_arr0[k] = v3rotate_ref(vec3_arr0[k], vec3_arr1[k], s_arr[k]);
+			}
+			currentTime = ReadTicks() - startTime;
+            scalarTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            scalarTime = bestTime;        
+        else
+            scalarTime /= NUM_CYCLES;
+    }
+    
+    {
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        vectorTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+			for( k = 0; k < DATA_SIZE; k++ )
+			{
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+				vec3_arr0[k].setValue(x,y,z);
+				vec3_arr0[k].setW(w);
+
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+				vec3_arr1[k].setValue(x,y,z);
+				vec3_arr1[k].setW(w);
+				
+				s_arr[k] = RANDF_01 * (float)SIMD_PI;
+			}
+
+            startTime = ReadTicks();
+            for( k = 0; k < LOOPCOUNT; k++ )
+			{
+				vec3_arr0[k  ] = vec3_arr0[k  ].rotate(vec3_arr1[k  ], s_arr[k]);
+			}
+			currentTime = ReadTicks() - startTime;
+            vectorTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            vectorTime = bestTime;        
+        else
+            vectorTime /= NUM_CYCLES;
+    }
+
+    vlog( "Timing:\n" );
+    vlog( "     \t    scalar\t    vector\n" );
+    vlog( "    \t%10.4f\t%10.4f\n", TicksToCycles( scalarTime ) / LOOPCOUNT, 
+									TicksToCycles( vectorTime ) / LOOPCOUNT );
+
+    return 0;
+}
+
+static inline 
+btVector3& 
+v3rotate_ref(
+    btVector3& v0, 
+    btVector3& wAxis, 
+    const btScalar& _angle)
+{
+	btVector3 o = wAxis * wAxis.dot( v0 );
+	btVector3 _x = v0 - o;
+	btVector3 _y;
+
+	_y = wAxis.cross( v0 );
+
+	v0 = o + _x * cosf( _angle ) + _y * sinf( _angle );
+
+	return v0;
+}
+
+#endif //BT_USE_SSE
diff --git a/Test/Source/Tests/Test_v3rotate.h b/Test/Source/Tests/Test_v3rotate.h
new file mode 100644
index 000000000..0c40fc4a4
--- /dev/null
+++ b/Test/Source/Tests/Test_v3rotate.h
@@ -0,0 +1,22 @@
+//
+//  Test_v3rotate.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_v3rotate_h
+#define BulletTest_Test_v3rotate_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+
+int Test_v3rotate(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+    
+#endif
diff --git a/Test/Source/Tests/Test_v3sdiv.cpp b/Test/Source/Tests/Test_v3sdiv.cpp
new file mode 100644
index 000000000..43d5735ad
--- /dev/null
+++ b/Test/Source/Tests/Test_v3sdiv.cpp
@@ -0,0 +1,181 @@
+//
+//  Test_v3sdiv.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+
+#include "Test_v3sdiv.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btVector3.h>
+
+// reference code for testing purposes
+static inline 
+btVector3& v3sdiv_ref(
+    btVector3& v, 
+	const btScalar& s);
+
+#define LOOPCOUNT 2048
+#define NUM_CYCLES 1000
+
+int Test_v3sdiv(void)
+{
+    btVector3 v1, v2;
+	btScalar s;
+   
+    float x,y,z,w;
+    
+    // Init the data
+    x = RANDF_01;
+    y = RANDF_01;
+    z = RANDF_01;
+    w = BT_NAN;     // w channel NaN
+    v1.setValue(x,y,z);
+	v1.setW(w);
+
+    v2.setValue(x,y,z);
+	v2.setW(w);
+
+    s = (float) RANDF_16;
+    
+    btVector3 correct_res, test_res;
+	 
+    {
+		float vNaN = BT_NAN;
+		correct_res.setValue(vNaN, vNaN, vNaN); 
+		test_res.setValue(vNaN, vNaN, vNaN);
+		correct_res = v3sdiv_ref(v1, s);
+		test_res = (v2 /= s);
+	   
+		if( fabs(correct_res.m_floats[0] - test_res.m_floats[0]) +  
+			fabs(correct_res.m_floats[1] - test_res.m_floats[1]) + 
+			fabs(correct_res.m_floats[2] - test_res.m_floats[2]) > FLT_EPSILON * 4)
+		{	
+			vlog( "Error - v3sdiv result error! "
+					"\ncorrect = (%10.4f, %10.4f, %10.4f) "
+					"\ntested  = (%10.4f, %10.4f, %10.4f) \n", 
+					correct_res.m_floats[0], correct_res.m_floats[1], correct_res.m_floats[2], 
+					test_res.m_floats[0], test_res.m_floats[1], test_res.m_floats[2]);
+		
+			return 1;
+		}
+	}
+    
+#define DATA_SIZE LOOPCOUNT
+
+	btVector3 vec3_arr[DATA_SIZE];
+    btScalar  s_arr[DATA_SIZE];
+
+    uint64_t scalarTime;
+    uint64_t vectorTime;
+    size_t j, k;
+
+	{
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = uint64_t(-1LL);
+        scalarTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+			for( k = 0; k < DATA_SIZE; k++ )
+			{
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+				vec3_arr[k].setValue(x,y,z);
+				vec3_arr[k].setW(w);
+				
+				s_arr[k] = RANDF_01;
+			}
+
+            startTime = ReadTicks();
+            for( k = 0; k+4 <= LOOPCOUNT; k+=4 )
+			{
+                v3sdiv_ref( vec3_arr[k], s_arr[k]);
+                v3sdiv_ref( vec3_arr[k+1], s_arr[k+1]);
+                v3sdiv_ref( vec3_arr[k+2], s_arr[k+2]);
+                v3sdiv_ref( vec3_arr[k+3], s_arr[k+3]);
+			}
+			currentTime = ReadTicks() - startTime;
+            scalarTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            scalarTime = bestTime;        
+        else
+            scalarTime /= NUM_CYCLES;
+    }
+    
+    {
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        vectorTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+			for( k = 0; k < DATA_SIZE; k++ )
+			{
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+				vec3_arr[k].setValue(x,y,z);
+				vec3_arr[k].setW(w);
+				
+				s_arr[k] = RANDF_01;
+			}
+
+            startTime = ReadTicks();
+            for( k = 0; k+4 <= LOOPCOUNT; k+=4 )
+			{
+				vec3_arr[k] /= s_arr[k];
+				vec3_arr[k+1] /= s_arr[k+1];
+				vec3_arr[k+2] /= s_arr[k+2];
+				vec3_arr[k+3] /= s_arr[k+3];
+			}
+			currentTime = ReadTicks() - startTime;
+            vectorTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            vectorTime = bestTime;        
+        else
+            vectorTime /= NUM_CYCLES;
+    }
+
+    vlog( "Timing:\n" );
+    vlog( "     \t    scalar\t    vector\n" );
+    vlog( "    \t%10.4f\t%10.4f\n", TicksToCycles( scalarTime ) / LOOPCOUNT, 
+									TicksToCycles( vectorTime ) / LOOPCOUNT );
+
+    return 0;
+}
+
+static inline 
+btVector3& 
+v3sdiv_ref(
+    btVector3& v, 
+    const btScalar& s)
+{
+	btScalar recip = btScalar(1.0) / s;
+	
+	v.m_floats[0] *= recip; 
+	v.m_floats[1] *= recip;
+	v.m_floats[2] *= recip;
+
+	return v;
+}
+
+#endif //BT_USE_SSE
diff --git a/Test/Source/Tests/Test_v3sdiv.h b/Test/Source/Tests/Test_v3sdiv.h
new file mode 100644
index 000000000..648715e09
--- /dev/null
+++ b/Test/Source/Tests/Test_v3sdiv.h
@@ -0,0 +1,22 @@
+//
+//  Test_v3sdiv.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_v3sdiv_h
+#define BulletTest_Test_v3sdiv_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+
+int Test_v3sdiv(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+    
+#endif
diff --git a/Test/Source/Tests/Test_v3skew.cpp b/Test/Source/Tests/Test_v3skew.cpp
new file mode 100644
index 000000000..a9c90fb25
--- /dev/null
+++ b/Test/Source/Tests/Test_v3skew.cpp
@@ -0,0 +1,197 @@
+//
+//  Test_v3skew.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+#include "Test_v3skew.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btVector3.h>
+
+// reference code for testing purposes
+static void 
+v3skew_ref(
+	const btVector3* v, 
+	btVector3* v1,
+	btVector3* v2,
+    btVector3* v3);
+
+#define LOOPCOUNT 2048
+#define NUM_CYCLES 10000
+
+int Test_v3skew(void)
+{
+    btVector3 v, v1, v2, v3, vt1, vt2, vt3;
+   
+    float x,y,z,w;
+    
+    // Init the data
+    x = RANDF_01;
+    y = RANDF_01;
+    z = RANDF_01;
+    w = BT_NAN;     // w channel NaN
+    v.setValue(x,y,z);
+	v.setW(w);
+
+    v1.setValue(w,w,w);
+	v1.setW(w);
+
+    vt3 = vt2 = vt1 = v3 = v2 = v1;
+	
+    {
+		v3skew_ref(&v, &v1, &v2, &v3);
+		v.getSkewSymmetricMatrix(&vt1, &vt2, &vt3);
+	   /*
+		if( v1.m_floats[0] != vt1.m_floats[0] || 
+			v1.m_floats[1] != vt1.m_floats[1] ||
+			v1.m_floats[2] != vt1.m_floats[2] )
+		*/
+        if(!(v1 == vt1))
+        {	
+			vlog( "Error - v3skew result error! "
+					"\ncorrect v1 = (%10.4f, %10.4f, %10.4f) "
+					"\ntested  v1 = (%10.4f, %10.4f, %10.4f) \n", 
+					v1.m_floats[0], v1.m_floats[1], v1.m_floats[2], 
+					vt1.m_floats[0], vt1.m_floats[1], vt1.m_floats[2]);
+		
+			return 1;
+		}
+
+		/*
+        if( v2.m_floats[0] != vt2.m_floats[0] || 
+			v2.m_floats[1] != vt2.m_floats[1] ||
+			v2.m_floats[2] != vt2.m_floats[2] )
+		*/
+        if(!(v2 == vt2))
+        {	
+			vlog( "Error - v3skew result error! "
+					"\ncorrect v2 = (%10.4f, %10.4f, %10.4f) "
+					"\ntested  v2 = (%10.4f, %10.4f, %10.4f) \n", 
+					v2.m_floats[0], v2.m_floats[1], v2.m_floats[2], 
+					vt2.m_floats[0], vt2.m_floats[1], vt2.m_floats[2]);
+		
+			return 1;
+		}
+
+		/*
+        if( v3.m_floats[0] != vt3.m_floats[0] || 
+			v3.m_floats[1] != vt3.m_floats[1] ||
+			v3.m_floats[2] != vt3.m_floats[2] )
+		*/
+        if(!(v3 == vt3))
+        {	
+			vlog( "Error - v3skew result error! "
+					"\ncorrect v3 = (%10.4f, %10.4f, %10.4f) "
+					"\ntested  v3 = (%10.4f, %10.4f, %10.4f) \n", 
+					v3.m_floats[0], v3.m_floats[1], v3.m_floats[2], 
+					vt3.m_floats[0], vt3.m_floats[1], vt3.m_floats[2]);
+		
+			return 1;
+		}
+	}
+    
+#define DATA_SIZE 256
+
+	btVector3 v3_arr0[DATA_SIZE];
+	btVector3 v3_arr1[DATA_SIZE];
+	btVector3 v3_arr2[DATA_SIZE];
+	btVector3 v3_arr3[DATA_SIZE];
+
+    uint64_t scalarTime;
+    uint64_t vectorTime;
+    size_t j, k;
+
+	for( k = 0; k < DATA_SIZE; k++ )
+	{
+        x = RANDF_01;
+        y = RANDF_01;
+        z = RANDF_01;
+		v3_arr0[k].setValue(x,y,z);
+		v3_arr0[k].setW(w);
+
+		v3_arr1[k].setValue(w,w,w);
+		v3_arr1[k].setW(w);
+
+		v3_arr3[k] = v3_arr2[k] = v3_arr1[k];
+	}
+    
+	{
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        scalarTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+            startTime = ReadTicks();
+            for( k = 0; k < LOOPCOUNT; k++ )
+			{
+				size_t k32 = (k & (DATA_SIZE-1)); 
+                v3skew_ref( &v3_arr0[k32], &v3_arr1[k32], &v3_arr2[k32], &v3_arr3[k32]); 
+			}
+			currentTime = ReadTicks() - startTime;
+            scalarTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            scalarTime = bestTime;        
+        else
+            scalarTime /= NUM_CYCLES;
+    }
+    
+    {
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        vectorTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+            startTime = ReadTicks();
+            for( k = 0; k < LOOPCOUNT; k++ )
+			{
+				size_t k32 = (k & (DATA_SIZE -1)); 
+                v3_arr0[k32].getSkewSymmetricMatrix(&v3_arr1[k32], &v3_arr2[k32], &v3_arr3[k32]); 
+			}
+			currentTime = ReadTicks() - startTime;
+            vectorTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            vectorTime = bestTime;        
+        else
+            vectorTime /= NUM_CYCLES;
+    }
+
+    vlog( "Timing:\n" );
+    vlog( "    \t    scalar\t    vector\n" );
+    vlog( "    \t%10.4f\t%10.4f\n", TicksToCycles( scalarTime ) / LOOPCOUNT, TicksToCycles( vectorTime ) / LOOPCOUNT );
+
+    return 0;
+}
+
+
+static void 
+v3skew_ref(
+	const btVector3* v, 
+	btVector3* v1, 
+	btVector3* v2, 
+	btVector3* v3)
+{
+    v1->setValue(0.		,-v->z(),v->y());
+    v2->setValue(v->z()	,0.		,-v->x());
+    v3->setValue(-v->y(),v->x()	,0.);
+}
+
+#endif //BT_USE_SSE
diff --git a/Test/Source/Tests/Test_v3skew.h b/Test/Source/Tests/Test_v3skew.h
new file mode 100644
index 000000000..255f4a225
--- /dev/null
+++ b/Test/Source/Tests/Test_v3skew.h
@@ -0,0 +1,22 @@
+//
+//  Test_v3skew.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_v3skew_h
+#define BulletTest_Test_v3skew_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+
+int Test_v3skew(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+    
+#endif
diff --git a/Test/Source/Tests/Test_v3triple.cpp b/Test/Source/Tests/Test_v3triple.cpp
new file mode 100644
index 000000000..f0ecbd48e
--- /dev/null
+++ b/Test/Source/Tests/Test_v3triple.cpp
@@ -0,0 +1,180 @@
+//
+//  Test_v3triple.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+#include "Test_v3triple.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btVector3.h>
+
+// reference code for testing purposes
+static btScalar 
+v3triple_ref(
+	const btVector3& v, 
+	const btVector3& v1,
+	const btVector3& v2);
+
+#define LOOPCOUNT 1024
+#define NUM_CYCLES 10000
+
+int Test_v3triple(void)
+{
+    btVector3 v1, v2, v3;
+   
+    float x,y,z,w;
+    
+    // Init the data
+    x = RANDF_01;
+    y = RANDF_01;
+    z = RANDF_01;
+    w = BT_NAN;     // w channel NaN
+    v1.setValue(x,y,z);
+	v1.setW(w);
+
+    x = RANDF_01;
+    y = RANDF_01;
+    z = RANDF_01;
+    v2.setValue(x,y,z);
+	v2.setW(w);
+
+    x = RANDF_01;
+    y = RANDF_01;
+    z = RANDF_01;
+    v3.setValue(x,y,z);
+	v3.setW(w);
+	
+    float correctTriple0, testTriple0;
+	 
+    {
+		correctTriple0 = w;
+		testTriple0 = w;
+		testTriple0 = v3triple_ref(v1,v2,v3);
+		correctTriple0 = v1.triple(v2, v3);
+	   
+		if( fabsf(correctTriple0 - testTriple0) > FLT_EPSILON * 4 )
+		{
+			vlog( "Error - v3triple result error! %f != %f \n", correctTriple0, testTriple0);
+		
+			return 1;
+		}
+	}
+    
+#define DATA_SIZE 1024
+
+	btVector3 v3_arr1[DATA_SIZE];
+	btVector3 v3_arr2[DATA_SIZE];
+	btVector3 v3_arr3[DATA_SIZE];
+    btScalar  res_arr[DATA_SIZE];
+
+    uint64_t scalarTime;
+    uint64_t vectorTime;
+    size_t j, k;
+
+	for( k = 0; k < DATA_SIZE; k++ )
+	{
+        x = RANDF_01;
+        y = RANDF_01;
+        z = RANDF_01;
+		v3_arr1[k].setValue(x,y,z);
+		v3_arr1[k].setW(w);
+
+        x = RANDF_01;
+        y = RANDF_01;
+        z = RANDF_01;
+		v3_arr2[k].setValue(x,y,z);
+		v3_arr2[k].setW(w);
+
+        x = RANDF_01;
+        y = RANDF_01;
+        z = RANDF_01;
+		v3_arr3[k].setValue(x,y,z);
+		v3_arr3[k].setW(w);
+	}
+    
+	{
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = uint64_t(-1LL);
+        scalarTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+            startTime = ReadTicks();
+            for( k = 0; k+4 <= LOOPCOUNT; k+=4 )
+			{
+				size_t k32 = (k & (DATA_SIZE-1)); 
+                res_arr[k32] = v3triple_ref( v3_arr1[k32], v3_arr2[k32], v3_arr3[k32]); k32++;
+				res_arr[k32] = v3triple_ref( v3_arr1[k32], v3_arr2[k32], v3_arr3[k32]); k32++;
+				res_arr[k32] = v3triple_ref( v3_arr1[k32], v3_arr2[k32], v3_arr3[k32]); k32++;
+				res_arr[k32] = v3triple_ref( v3_arr1[k32], v3_arr2[k32], v3_arr3[k32]); 
+			}
+			currentTime = ReadTicks() - startTime;
+            scalarTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            scalarTime = bestTime;        
+        else
+            scalarTime /= NUM_CYCLES;
+    }
+    
+    {
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = uint64_t(-1LL);
+        vectorTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+            startTime = ReadTicks();
+            for( k = 0; k+4 <= LOOPCOUNT; k+=4 )
+			{
+				size_t k32 = k & (DATA_SIZE -1); 
+                res_arr[k32] = v3_arr1[k32].triple(v3_arr2[k32], v3_arr3[k32]); k32++;
+                res_arr[k32] = v3_arr1[k32].triple(v3_arr2[k32], v3_arr3[k32]); k32++;
+                res_arr[k32] = v3_arr1[k32].triple(v3_arr2[k32], v3_arr3[k32]); k32++;
+                res_arr[k32] = v3_arr1[k32].triple(v3_arr2[k32], v3_arr3[k32]); 
+			}
+			currentTime = ReadTicks() - startTime;
+            vectorTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            vectorTime = bestTime;        
+        else
+            vectorTime /= NUM_CYCLES;
+    }
+
+    vlog( "Timing:\n" );
+    vlog( "     \t    scalar\t    vector\n" );
+    vlog( "    \t%10.4f\t%10.4f\n", TicksToCycles( scalarTime ) / LOOPCOUNT, TicksToCycles( vectorTime ) / LOOPCOUNT );
+
+    return 0;
+}
+
+
+static btScalar 
+v3triple_ref(
+	const btVector3& v, 
+	const btVector3& v1, 
+	const btVector3& v2)
+{
+	return 
+		v.m_floats[0] * (v1.m_floats[1] * v2.m_floats[2] - v1.m_floats[2] * v2.m_floats[1]) + 
+		v.m_floats[1] * (v1.m_floats[2] * v2.m_floats[0] - v1.m_floats[0] * v2.m_floats[2]) + 
+		v.m_floats[2] * (v1.m_floats[0] * v2.m_floats[1] - v1.m_floats[1] * v2.m_floats[0]);
+}
+
+#endif //BT_USE_SSE
diff --git a/Test/Source/Tests/Test_v3triple.h b/Test/Source/Tests/Test_v3triple.h
new file mode 100644
index 000000000..16fcf2b74
--- /dev/null
+++ b/Test/Source/Tests/Test_v3triple.h
@@ -0,0 +1,22 @@
+//
+//  Test_v3triple.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_v3triple_h
+#define BulletTest_Test_v3triple_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+
+int Test_v3triple(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+    
+#endif
diff --git a/Test/Source/Utils.cpp b/Test/Source/Utils.cpp
new file mode 100644
index 000000000..830f53a84
--- /dev/null
+++ b/Test/Source/Utils.cpp
@@ -0,0 +1,272 @@
+//
+//  File.c
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#include <stdio.h>
+#ifdef __APPLE__
+#include <mach/mach_time.h>
+#include <sys/sysctl.h>
+#include <sys/mman.h>
+#include <errno.h>
+#else
+#include "LinearMath/btAlignedAllocator.h"
+
+#endif //__APPLE__
+
+#include <stdlib.h>
+
+#include "Utils.h"
+
+#pragma mark Timing
+
+int gReportNanoseconds = 0;
+
+#ifdef _WIN32
+#include <intrin.h>
+uint64_t ReadTicks( void )
+{
+	 return __rdtsc();
+}
+double  TicksToCycles( uint64_t delta )
+{
+	return double(delta);
+}
+
+double  TicksToSeconds( uint64_t delta )
+{
+	return double(delta);
+}
+
+void *GuardCalloc( size_t count, size_t size, size_t *objectStride )
+{
+	if (objectStride)
+		*objectStride = size;
+	return (void*) btAlignedAlloc(count * size,16);
+}
+void GuardFree( void *buf )
+{
+	btAlignedFree(buf);
+}
+
+#endif
+
+
+#ifdef __APPLE__
+
+uint64_t ReadTicks( void )
+{
+    return mach_absolute_time();
+}
+
+double  TicksToCycles( uint64_t delta )
+{
+    static long double conversion = 0.0L;
+    if( 0.0L == conversion )
+    {
+        // attempt to get conversion to nanoseconds
+        mach_timebase_info_data_t info;
+        int err = mach_timebase_info( &info );
+        if( err )
+            return __builtin_nanf("");
+        conversion = (long double) info.numer / info.denom;
+        
+        // attempt to get conversion to cycles
+        if( 0 == gReportNanoseconds )
+        {
+            uint64_t frequency = 0;
+            size_t freq_size = sizeof( frequency );
+            err = sysctlbyname( "hw.cpufrequency_max", &frequency, &freq_size, NULL, 0 );
+            if( err || 0 == frequency )
+                vlog( "Failed to get max cpu frequency. Reporting times as nanoseconds.\n" );
+            else
+            {
+                conversion *= 1e-9L /* sec / ns */  * frequency /* cycles / sec */;
+                vlog( "Reporting times as cycles. (%2.2f MHz)\n", 1e-6 * frequency );
+            }
+        }
+        else
+            vlog( "Reporting times as nanoseconds.\n" );
+    }
+    
+    return (double) (delta * conversion);
+}
+
+double  TicksToSeconds( uint64_t delta )
+{
+    static long double conversion = 0.0L;
+    if( 0.0L == conversion )
+    {
+        // attempt to get conversion to nanoseconds
+        mach_timebase_info_data_t info;
+        int err = mach_timebase_info( &info );
+        if( err )
+            return __builtin_nanf("");
+        conversion = info.numer / (1e9L * info.denom);
+    }
+    
+    return (double) (delta * conversion);
+}
+
+
+
+#pragma mark -
+#pragma mark GuardCalloc
+
+#define kPageSize 4096
+
+
+typedef struct BufInfo
+{
+    void    *head;
+    size_t  count;
+    size_t  stride;
+    size_t  totalSize;
+}BufInfo;
+
+static int GuardMarkBuffer( void *buffer, int flag );
+
+void *GuardCalloc( size_t count, size_t size, size_t *objectStride )
+{
+    if( objectStride )
+        *objectStride = 0;
+    
+    // Round size up to a multiple of a page size
+    size_t stride = (size + kPageSize - 1) & -kPageSize;
+    
+    //Calculate total size of the allocation
+    size_t totalSize = count * (stride + kPageSize) + kPageSize;
+
+    // Allocate
+    char *buf = (char*)mmap( NULL, 
+                     totalSize, 
+                     PROT_READ | PROT_WRITE, 
+                     MAP_ANON | MAP_SHARED,
+                     0, 0 );
+    if( MAP_FAILED == buf )
+    {
+        vlog( "mmap failed: %d\n", errno );
+        return NULL;
+    }
+
+    // Find the first byte of user data
+    char *result = buf + kPageSize;
+
+    // Record what we did for posterity
+    BufInfo *bptr = (BufInfo*) result - 1;
+    bptr->head = buf;
+    bptr->count = count;
+    bptr->stride = stride;
+    bptr->totalSize = totalSize;
+    
+    // Place the first guard page. Masks our record above.
+    if( mprotect(buf, kPageSize, PROT_NONE) )
+    {
+        munmap( buf, totalSize);
+        vlog( "mprotect -1 failed: %d\n", errno );
+        return NULL;
+    }
+    
+    // Place the rest of the guard pages
+    size_t i;
+    char *p = result;
+    for( i = 0; i < count; i++ )
+    {
+        p += stride;
+        if( mprotect(p, kPageSize, PROT_NONE) )
+        {
+            munmap( buf, totalSize);
+            vlog( "mprotect %lu failed: %d\n", i, errno );
+            return NULL;
+        }
+        p += kPageSize;
+    }
+    
+    // record the stride from object to object
+    if( objectStride )
+        *objectStride = stride + kPageSize;
+    
+    // return pointer to first object
+    return result;
+}
+
+
+void GuardFree( void *buf )
+{
+    if( mprotect((char*)buf - kPageSize, kPageSize, PROT_READ) )
+    {
+        vlog( "Unable to read buf info. GuardFree failed! %p  (%d)\n", buf, errno );
+        return;
+    }
+    
+    BufInfo *bptr = (BufInfo*) buf - 1;
+    
+    if( munmap( bptr->head, bptr->totalSize ) )
+        vlog( "Unable to unmap data. GuardFree failed! %p (%d)\n", buf, errno );
+}
+
+int GuardMarkReadOnly( void *buf )
+{
+    return GuardMarkBuffer(buf, PROT_READ);
+}
+
+int GuardMarkReadWrite( void *buf)
+{
+    return GuardMarkBuffer(buf, PROT_READ | PROT_WRITE);
+}
+
+int GuardMarkWriteOnly( void *buf)
+{
+    return GuardMarkBuffer(buf, PROT_WRITE);
+}
+
+static int GuardMarkBuffer( void *buf, int flag )
+{
+    if( mprotect((char*)buf - kPageSize, kPageSize, PROT_READ) )
+    {
+        vlog( "Unable to read buf info. GuardMarkBuffer %d failed! %p  (%d)\n", flag, buf, errno );
+        return errno;
+    }
+    
+    BufInfo *bptr = (BufInfo*) buf - 1;
+    
+    size_t count = bptr->count;
+    size_t stride = bptr->stride;
+    
+    size_t i;
+    for( i = 0; i < count; i++ )
+    {
+        if( mprotect(buf, stride, flag) )
+        {
+            vlog( "Unable to protect segment %ld. GuardMarkBuffer %d failed! %p  (%d)\n", i, flag, buf, errno );
+            return errno;
+        }
+        bptr += stride + kPageSize;
+    }
+        
+    if( mprotect((char*)buf - kPageSize, kPageSize, PROT_NONE) )
+    {
+        vlog( "Unable to protect leading guard page. GuardMarkBuffer %d failed! %p  (%d)\n", flag, buf, errno );
+        return errno;
+    }
+    
+    return 0;
+}
+#endif
+
+uint32_t random_number32(void)
+{
+    return ((uint32_t) rand() << 16) ^ rand();
+}
+
+
+uint64_t random_number64(void)
+{
+    return ((uint64_t) rand() << 48) ^
+            ((uint64_t) rand() << 32) ^
+            ((uint64_t) rand() << 16) ^
+            rand();
+}
+
diff --git a/Test/Source/Utils.h b/Test/Source/Utils.h
new file mode 100644
index 000000000..9d30cd450
--- /dev/null
+++ b/Test/Source/Utils.h
@@ -0,0 +1,72 @@
+//
+//  Utils.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Utils_h
+#define BulletTest_Utils_h
+
+#include "btIntDefines.h"
+
+
+
+#include <stddef.h>
+#include <stdio.h>
+
+#ifdef _WIN32
+#define LARGE_FLOAT17 (1.f * powf(2,17))
+#define RANDF_16   (random_number32() * powf(2,-16))
+#define RANDF_01   ( random_number32() * powf(2,-32) )
+#define RANDF      ( random_number32() * powf(2,-8) )
+#define RANDF_m1p1 (2.0f*( random_number32() * powf(2,-32)-1.0f))
+#else
+#define LARGE_FLOAT17 (0x1.0p17f)
+#define RANDF_16   (random_number32() * 0x1.0p-16f)
+#define RANDF_01   ( random_number32() * 0x1.0p-32f )
+#define RANDF      ( random_number32() * 0x1.0p-8f )
+#define RANDF_m1p1 (2.0f*( random_number32() * 0x1.0p-32f )-1.0f)
+#endif//_WIN32
+
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+    
+    /*********************
+     *      Timing       *
+     *********************/
+    extern int gReportNanoseconds;
+
+    uint64_t ReadTicks( void );
+    double  TicksToCycles( uint64_t delta );        // Performance data should be reported in cycles most of the time.
+    double  TicksToSeconds( uint64_t delta );
+
+    
+    /*********************
+     *    Guard Heap     *
+     *********************/
+    // return buffer containing count objects of size size, with guard pages in betweeen.
+    // The stride between one object and the next is given by objectStride. 
+    // objectStride may be NULL. Objects so created are freed with GuardFree
+    void *GuardCalloc( size_t count, size_t size, size_t *objectStride );
+    void GuardFree( void * );
+    // mark the contents of a guard buffer read-only or write-only. Return 0 on success.
+    int GuardMarkReadOnly( void *);
+    int GuardMarkWriteOnly( void *);
+    int GuardMarkReadWrite( void *);
+    
+    /*********************
+     *    Printing       *
+     *********************/
+    #define vlog( ... )        printf( __VA_ARGS__  )
+    uint32_t random_number32(void);    
+    uint64_t random_number64(void);
+    
+#ifdef __cplusplus
+    }
+#endif
+
+
+#endif
diff --git a/Test/Source/btIntDefines.h b/Test/Source/btIntDefines.h
new file mode 100644
index 000000000..a5e9e62ca
--- /dev/null
+++ b/Test/Source/btIntDefines.h
@@ -0,0 +1,19 @@
+
+#ifndef BT_INT_DEFINES_H
+#define BT_INT_DEFINES_H
+
+#ifdef __GNUC__
+	#include <stdint.h>
+#elif defined(_MSC_VER)
+	typedef __int32 int32_t;
+	typedef __int64 int64_t;
+	typedef unsigned __int32 uint32_t;
+	typedef unsigned __int64 uint64_t;
+#else
+	typedef int int32_t;
+	typedef long long int int64_t;
+	typedef unsigned int uint32_t;
+	typedef unsigned long long int uint64_t;
+#endif
+
+#endif //BT_INT_DEFINES_H
diff --git a/Test/Source/main.cpp b/Test/Source/main.cpp
new file mode 100644
index 000000000..2ae832cfc
--- /dev/null
+++ b/Test/Source/main.cpp
@@ -0,0 +1,326 @@
+//
+//  main.c
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+#include <stdio.h>
+#ifdef __APPLE__
+#include <libgen.h>
+#endif //__APPLE__
+
+#include <string.h>
+#include <stdlib.h>
+#include <limits.h>
+#include "main.h"
+#include "Utils.h"
+#include "TestList.h"
+#include "LinearMath/btScalar.h"
+
+#if defined (BT_USE_NEON) || defined (BT_USE_SSE_IN_API)
+
+#ifdef _WIN32
+#define strcasecmp _stricmp
+#define basename(A) A
+#endif
+
+#define EXIT_NO_ERROR INT_MIN
+
+//int gReportNanoseconds = 0;    // in Utils.c
+
+int gReportAverageTimes = 0;
+int gExitOnError = 0;
+char *gFullPath = NULL;
+const char *gAppName = NULL;
+int gArgc;
+const char **gArgv;
+
+typedef struct TestNode
+{
+    struct TestNode *next;
+    const char      *name;
+}TestNode;
+
+TestNode *gNodeList = NULL;
+
+static int ParseArgs( int argc, const char *argv[] );
+static void PrintUsage( void );
+static int Init( void );
+static void ListTests(void );
+
+const char *gArch = 
+#ifdef __i386__ 
+    "i386";
+#elif defined __x86_64__
+    "x86_64";
+#elif defined __arm__
+    "arm";
+#elif defined _WIN64
+	"win64";
+#elif defined _WIN32
+	"win32";
+#else
+    #error unknown arch
+#endif
+
+
+
+
+
+#include <stdio.h>
+
+int main (int argc, const char * argv[])
+{
+
+   // Enable just one test programatically (instead of command-line param)
+   // TestNode *node = (TestNode*) malloc( sizeof( TestNode ) );
+   // node->name = "btDbvt";
+   // node->next = 0;
+   // gNodeList = node;
+    
+	srand(0.f);
+
+    int numPassedTests=0;
+	int numFailedTests= 0;
+
+    int err;
+    
+    // Parse arguments. Build gNodeList.
+    if( (err = ParseArgs( argc, argv ) ) )
+    {
+        if( EXIT_NO_ERROR == err )
+            return 0;
+        
+        PrintUsage();        
+        return err;
+    }
+    
+    printf("Arch: %s\n", gArch );
+    
+    if( gReportAverageTimes )
+        printf( "Reporting average times.\n" );
+    else
+        printf( "Reporting best times.\n" );
+    
+    // Set a few things up
+    if( (err = Init() ))
+    {
+        printf( "Init failed.\n" );
+        return err;
+    }
+    
+    if( NULL == gNodeList )
+    { // test everything
+        printf( "No function list found. Testing everything...\n" );
+        size_t i;
+        for( i = 0; NULL != gTestList[i].test_func; i++ )
+        {
+            printf( "\n----------------------------------------------\n" );
+            printf( "Testing %s:\n", gTestList[i].name );
+            printf( "----------------------------------------------\n" );
+            uint64_t startTime = ReadTicks();
+            int local_error = gTestList[i].test_func();
+            uint64_t currentTime = ReadTicks() - startTime;
+            if( local_error )
+            {
+				numFailedTests++;
+                printf( "*** %s test failed with error: %d\n", gTestList[i].name, local_error );
+                if( gExitOnError )
+                    return local_error;
+                if( 0 == err )
+                    err = local_error;
+            }
+            else
+			{
+				numPassedTests++;
+                printf("%s Passed.\t\t\t(%2.2gs)\n", gTestList[i].name, TicksToSeconds(currentTime));
+			}
+        }
+    }
+    else
+    { // test just the list
+        while( NULL != gNodeList )
+        {
+            TestNode *currentNode = gNodeList;
+            gNodeList = gNodeList->next;
+            
+            // Find the test with that name
+            size_t i;
+            for( i = 0; NULL != gTestList[i].test_func; i++ )
+                if( 0 == strcasecmp( currentNode->name, gTestList[i].name ) )
+                    break;
+            
+            if( NULL != gTestList[i].test_func )
+            {
+                printf( "\n----------------------------------------------\n" );
+                printf( "Testing %s:\n", gTestList[i].name );
+                printf( "----------------------------------------------\n" );
+                uint64_t startTime = ReadTicks();
+                int local_error = gTestList[i].test_func();
+                uint64_t currentTime = ReadTicks() - startTime;
+                if( local_error )
+                {
+					numFailedTests++;
+                    printf( "*** %s test failed with error: %d\n", gTestList[i].name, local_error );
+                    if( gExitOnError )
+                        return local_error;
+                    if( 0 == err )
+                        err = local_error;
+                }
+                else
+				{
+					numPassedTests++;
+                    printf("%s Passed.\t\t\t(%2.2gs)\n", gTestList[i].name, TicksToSeconds(currentTime));
+				}
+            }
+            else
+            {
+                printf( "\n***Error: Test name \"%s\" not found! Skipping.\n", currentNode->name );
+                err = -1;
+                if( gExitOnError )
+                    return -1;
+            }
+            
+            free( currentNode );
+        }
+    }
+	printf( "\n----------------------------------------------\n" );
+	printf("numPassedTests = %d, numFailedTests = %d\n",numPassedTests,numFailedTests);
+    
+    free(gFullPath);
+    return err;
+}
+
+static int Init( void )
+{
+    // init the timer
+    TicksToCycles(0);
+    
+    return 0;
+}
+
+static int ParseArgs( int argc, const char *argv[] )
+{
+    int listTests = 0;
+    TestNode *list = NULL;
+    
+    gArgc = argc;
+    gArgv = argv;
+    gFullPath = (char*)malloc( strlen(argv[0]) + 1);
+    strcpy(gFullPath, argv[0]);
+    gAppName = basename( gFullPath );
+    if( NULL == gAppName )
+        gAppName = "<unknown app name>";
+    
+    printf( "%s ", gAppName );
+    int skipremaining=0;
+    
+    size_t i;
+    for( i = 1; i < argc; i++ )
+    {
+        const char *arg = argv[i];
+        printf( "\t%s", arg );
+        if( arg[0] == '-' )
+        {
+            arg++;
+            while( arg[0] != '\0' )
+            {
+                int stop = 0;
+                switch( arg[0] )
+                {
+                    case 'a':
+                        gReportAverageTimes ^= 1;
+                        break;
+                    case 'e':
+                        gExitOnError ^= 1;
+                        break;
+                    case 'h':
+                        PrintUsage();
+                        return EXIT_NO_ERROR;
+                    case 'l':
+                        listTests ^= 1;
+                        return EXIT_NO_ERROR;
+                    case 's':
+                        gReportNanoseconds ^= 1;
+                        break;
+                    case ' ':
+                        stop = 1;
+                        break;
+                    case 'N'://ignore the -NSDocumentRevisionsDebugMode argument from XCode 4.3.2
+                        skipremaining = 1;
+                       stop = 1;
+                        break;
+                    default:
+                        printf( "\nError: Unknown flag \'%c\'\n", arg[0] );
+                        return -1;
+                }
+                if( stop )
+                    break;
+                arg++;
+            }
+        }
+        else
+        { // add function name to the list
+            TestNode *node = (TestNode*) malloc( sizeof( TestNode ) );
+            node->name = arg;
+            node->next = list;
+            list = node;
+        }
+        if (skipremaining)
+            break;
+    }
+    
+    // reverse the list of test names, and stick on gNodeList
+    while( list )
+    {
+        TestNode *node = list;
+        TestNode *next = node->next;
+        node->next = gNodeList;
+        gNodeList = node;
+        list = next;
+    }
+    
+    printf( "\n" );
+    if( listTests )
+        ListTests();
+    
+    return 0;
+}
+
+
+static void PrintUsage( void )
+{
+    printf("\nUsage:\n" );
+    printf("%s: <-aehls> <test names>", gAppName);
+    printf("Options:\n");
+    printf("\t-a\tToggle report average times vs. best times. (Default: best times)\n");
+    printf("\t-e\tToggle exit immediately on error behavior. (Default: off)\n");
+    printf("\t-h\tPrint this message.\n");
+    printf("\t-l\tToggle list available test names.  (Default: off)\n");
+    printf("\t-s\tToggle report times in cycles or nanoseconds. (Default: cycles)\n\n");
+    printf("\tOptions may be followed by one or more test names. If no test names \n" );
+    printf("\tare provided, then all tests are run.\n\n");
+}
+
+static void ListTests(void )
+{
+    size_t i;
+    
+    printf("\nTests:\n");
+    for( i = 0; NULL != gTestList[i].test_func; i++ )
+    {
+        printf( "%19s", gTestList[i].name );
+        if( NULL != gTestList[i].test_func )
+            printf( "," );
+        if( 3 == (i&3) )
+            printf( "\n" );
+    }
+}
+#else
+#include <stdio.h>
+int main(int argc, char* argv[])
+{
+	printf("error: no SIMD enabled through BT_USE_NEON or BT_USE_SSE_IN_API \n(enable in LinearMath/btScalar.h or through build system)\n");
+	return 0;
+}
+#endif
diff --git a/Test/Source/main.h b/Test/Source/main.h
new file mode 100644
index 000000000..e8e5dd284
--- /dev/null
+++ b/Test/Source/main.h
@@ -0,0 +1,25 @@
+//
+//  main.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_main_h
+#define BulletTest_main_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+
+    extern int gReportAverageTimes;     // if 0, report best times
+    extern int gExitOnError;            // if non-zero, exit as soon an an error is encountered
+    extern const char *gAppName;        // the name of this application
+    
+#ifdef __cplusplus
+}
+#endif
+
+
+
+#endif
diff --git a/Test/Source/vector.h b/Test/Source/vector.h
new file mode 100644
index 000000000..c476a9e16
--- /dev/null
+++ b/Test/Source/vector.h
@@ -0,0 +1,70 @@
+//
+//  vector.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_vector_h
+#define BulletTest_vector_h
+
+#ifdef __SSE__
+    typedef float float4            __attribute__ ((__vector_size__(16)));
+    #include <xmmintrin.h>
+#endif
+
+#ifdef __SSE2__
+    typedef double double2          __attribute__ ((__vector_size__(16)));
+    typedef char char16             __attribute__ ((__vector_size__(16)));
+    typedef unsigned char uchar16   __attribute__ ((__vector_size__(16)));
+    typedef short short8            __attribute__ ((__vector_size__(16)));
+    typedef unsigned short ushort8  __attribute__ ((__vector_size__(16)));
+    typedef int int4                __attribute__ ((__vector_size__(16)));
+   // typedef unsigned int uint4      __attribute__ ((__vector_size__(16)));
+    #ifdef __LP64__ 
+        typedef long long2              __attribute__ ((__vector_size__(16)));
+        typedef unsigned long ulong2    __attribute__ ((__vector_size__(16)));
+    #else
+        typedef long long long2         __attribute__ ((__vector_size__(16)));
+        typedef unsigned long long ulong2 __attribute__ ((__vector_size__(16)));
+    #endif
+    #include <emmintrin.h> 
+#endif
+
+#ifdef __SSE3__
+    #include <pmmintrin.h>
+#endif
+
+#ifdef __SSSE3__
+    #include <tmmintrin.h>
+#endif
+
+#ifdef __SSE4_1__
+    #include <smmintrin.h>
+#endif
+
+#ifdef __arm__
+    #include <arm/arch.h>
+    #ifdef _ARM_ARCH_7
+        #define ARM_NEON_GCC_COMPATIBILITY  1
+        #include <arm_neon.h>
+        typedef float float4            __attribute__ ((__vector_size__(16)));
+        typedef double double2          __attribute__ ((__vector_size__(16)));
+        typedef char char16             __attribute__ ((__vector_size__(16)));
+        typedef unsigned char uchar16   __attribute__ ((__vector_size__(16)));
+        typedef short short8            __attribute__ ((__vector_size__(16)));
+        typedef unsigned short ushort8  __attribute__ ((__vector_size__(16)));
+        typedef int int4                __attribute__ ((__vector_size__(16)));
+        typedef unsigned int uint4      __attribute__ ((__vector_size__(16)));
+        #ifdef __LP64__ 
+            typedef long long2              __attribute__ ((__vector_size__(16)));
+            typedef unsigned long ulong2    __attribute__ ((__vector_size__(16)));
+        #else
+            typedef long long long2         __attribute__ ((__vector_size__(16)));
+            typedef unsigned long long ulong2 __attribute__ ((__vector_size__(16)));
+        #endif
+    #endif
+#endif
+
+
+#endif
diff --git a/Test/premake4.lua b/Test/premake4.lua
new file mode 100644
index 000000000..ffd72987d
--- /dev/null
+++ b/Test/premake4.lua
@@ -0,0 +1,23 @@
+
+project "AppUnitTest"
+
+if _OPTIONS["ios"] then
+	kind "WindowedApp"
+else	
+	kind "ConsoleApp"
+end
+targetdir "bin"
+
+includedirs {"../src","Source", "Source/Tests"}
+
+links {
+	"BulletDynamics","BulletCollision", "LinearMath"
+}
+
+language "C++"
+
+files {
+	"Source/**.cpp",
+	"Source/**.h",
+}
+
diff --git a/build/Info.plist b/build/Info.plist
new file mode 100755
index 000000000..cffa5f3bd
--- /dev/null
+++ b/build/Info.plist
@@ -0,0 +1,42 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>CFBundleDevelopmentRegion</key>
+	<string>en</string>
+	<key>CFBundleDisplayName</key>
+	<string>${PRODUCT_NAME}</string>
+	<key>CFBundleExecutable</key>
+	<string>${EXECUTABLE_NAME}</string>
+	<key>CFBundleIcons</key>
+	<dict>
+		<key>CFBundlePrimaryIcon</key>
+		<dict>
+			<key>CFBundleIconFiles</key>
+			<array>
+				<string>icon.png</string>
+			</array>
+		</dict>
+	</dict>
+	<key>CFBundleIdentifier</key>
+	<string>com.yourcompany.${PRODUCT_NAME:identifier}</string>
+	<key>CFBundleInfoDictionaryVersion</key>
+	<string>6.0</string>
+	<key>CFBundleName</key>
+	<string>${PRODUCT_NAME}</string>
+	<key>CFBundlePackageType</key>
+	<string>APPL</string>
+	<key>CFBundleSignature</key>
+	<string>????</string>
+	<key>CFBundleVersion</key>
+	<string>1.5</string>
+	<key>LSRequiresIPhoneOS</key>
+	<true/>
+	<key>UIStatusBarHidden</key>
+	<true/>
+	<key>UISupportedInterfaceOrientations</key>
+	<array>
+		<string>UIInterfaceOrientationPortraitUpsideDown</string>
+	</array>
+</dict>
+</plist>
diff --git a/msvc/autoexp_dat.txt b/build/autoexp_dat.txt
similarity index 100%
rename from msvc/autoexp_dat.txt
rename to build/autoexp_dat.txt
diff --git a/msvc/bullet.rc b/build/bullet.rc
similarity index 100%
rename from msvc/bullet.rc
rename to build/bullet.rc
diff --git a/msvc/bullet_ico.ico b/build/bullet_ico.ico
similarity index 100%
rename from msvc/bullet_ico.ico
rename to build/bullet_ico.ico
diff --git a/msvc/findDirectX11.lua b/build/findDirectX11.lua
similarity index 100%
rename from msvc/findDirectX11.lua
rename to build/findDirectX11.lua
diff --git a/msvc/findOpenCL.lua b/build/findOpenCL.lua
similarity index 100%
rename from msvc/findOpenCL.lua
rename to build/findOpenCL.lua
diff --git a/build/fruitstrap_osx b/build/fruitstrap_osx
new file mode 100755
index 000000000..22475fa3a
Binary files /dev/null and b/build/fruitstrap_osx differ
diff --git a/build/icon.png b/build/icon.png
new file mode 100644
index 000000000..0e1c1f11d
Binary files /dev/null and b/build/icon.png differ
diff --git a/build/ios_build.sh b/build/ios_build.sh
new file mode 100755
index 000000000..9f7a1cc14
--- /dev/null
+++ b/build/ios_build.sh
@@ -0,0 +1,3 @@
+#!/bin/sh
+./premake4_osx --ios xcode4
+xcodebuild -project xcode4ios/AppUnitTest.xcodeproj -configuration Release -arch armv7
diff --git a/build/ios_run.sh b/build/ios_run.sh
new file mode 100755
index 000000000..1216d85c9
--- /dev/null
+++ b/build/ios_run.sh
@@ -0,0 +1,2 @@
+#!/bin/sh
+./fruitstrap_osx -d -n -b ../Test/bin/AppUnitTestx32ios.app
diff --git a/msvc/premake4.exe b/build/premake4.exe
similarity index 100%
rename from msvc/premake4.exe
rename to build/premake4.exe
diff --git a/msvc/premake4.lua b/build/premake4.lua
similarity index 68%
rename from msvc/premake4.lua
rename to build/premake4.lua
index f8a386645..6a56a9084 100644
--- a/msvc/premake4.lua
+++ b/build/premake4.lua
@@ -3,8 +3,16 @@
 
 solution "0BulletSolution"
 
-	
-	
+	 newoption {
+    trigger     = "ios",
+    description = "Enable iOS target (requires xcode4)"
+  }
+  
+	newoption {
+		trigger = "with-demos",
+	  description = "Enable demos and extras"	
+	}
+
 	newoption {
     trigger     = "with-nacl",
     description = "Enable Native Client build"
@@ -41,22 +49,53 @@ solution "0BulletSolution"
 	configuration "Debug"
 		flags { "Symbols", "StaticRuntime" , "NoMinimalRebuild", "NoEditAndContinue" ,"FloatFast"}
 		
-	platforms {"x32", "x64"}
-
-	configuration "x64"		
-		targetsuffix "_64"
-	configuration {"x64", "debug"}
-		targetsuffix "_x64_debug"
-	configuration {"x64", "release"}
-		targetsuffix "_x64"
-	configuration {"x32", "debug"}
-		targetsuffix "_debug"
+	--platforms {"x32", "x64"}
+	platforms {"x32"}
 
   configuration {"Windows"}
   	defines { "_CRT_SECURE_NO_WARNINGS","_CRT_SECURE_NO_DEPRECATE"}
   
 	configuration{}
 
+	postfix="";
+
+	if _ACTION == "xcode4" then
+		if _OPTIONS["ios"] then
+			postfix = "ios";
+			defines {"ARM_NEON_GCC_COMPATIBILITY"}
+			xcodebuildsettings
+			{
+				'INFOPLIST_FILE = "../../Test/Info.plist"',
+				'CODE_SIGN_IDENTITY = "iPhone Developer"',
+				"SDKROOT = iphoneos",
+				'ARCHS = "armv7"',
+				'TARGETED_DEVICE_FAMILY = "1,2"',
+				'VALID_ARCHS = "armv7"',
+			}	
+			else
+			xcodebuildsettings
+			{
+				'ARCHS = "$(ARCHS_STANDARD_32_BIT) $(ARCHS_STANDARD_64_BIT)"',
+				'VALID_ARCHS = "x86_64 i386"',
+			}
+		end
+	else
+	
+	end
+
+	configuration "x32"
+		targetsuffix ("x32" .. postfix)
+	configuration "x64"
+          targetsuffix ("x64" .. postfix)
+  configuration {"x64", "debug"}
+          targetsuffix ("x64Debug" .. postfix)
+  configuration {"x64", "release"}
+          targetsuffix ("x64" .. postfix)
+  configuration {"x32", "debug"}
+          targetsuffix ("Debug" .. postfix)                
+
+	configuration{}
+
 
 
 if not _OPTIONS["with-nacl"] then
@@ -86,8 +125,9 @@ end
 	
 	language "C++"
 	
-	location("./" .. _ACTION)
+	location("./" .. _ACTION .. postfix)
 
+	
 	if _OPTIONS["with-dx11"] then
 		include "../Demos/DX11ClothDemo"
 		include "../src/BulletMultiThreaded/GpuSoftBodySolvers/DX11"
@@ -121,17 +161,12 @@ end
 		include "../src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/NVidia"
 	end
 
-	if	not _OPTIONS["with-opencl-amd"] and
-			not _OPTIONS["with-opencl-nvidia"] and
-			not _OPTIONS["with-opencl-intel"] and
-			not _OPTIONS["with-opencl"] and
-			not _OPTIONS["with-dx11"] and 
-			not _OPTIONS["with-nacl"] then
-			
+	if _OPTIONS["with-demos"] then
 		include "../Demos"
   	include "../Extras"
   end
   
+  
    if _OPTIONS["with-nacl"] then
   	include "../Demos/NativeClient"
   else
@@ -140,3 +175,8 @@ end
 		include "../src/BulletDynamics"	
 		include "../src/BulletSoftBody"	
 	end
+	
+	include "../Test"
+	include "../Demos/HelloWorld"
+	include "../Demos/Benchmarks"
+	
diff --git a/build/premake4_linux b/build/premake4_linux
new file mode 100755
index 000000000..1ce066244
Binary files /dev/null and b/build/premake4_linux differ
diff --git a/build/premake4_osx b/build/premake4_osx
new file mode 100755
index 000000000..70115121d
Binary files /dev/null and b/build/premake4_osx differ
diff --git a/msvc/vs2005.bat b/build/vs2005.bat
similarity index 100%
rename from msvc/vs2005.bat
rename to build/vs2005.bat
diff --git a/msvc/vs2008.bat b/build/vs2008.bat
similarity index 100%
rename from msvc/vs2008.bat
rename to build/vs2008.bat
diff --git a/build/vs2010.bat b/build/vs2010.bat
new file mode 100644
index 000000000..c606584cb
--- /dev/null
+++ b/build/vs2010.bat
@@ -0,0 +1,4 @@
+
+premake4 --with-demos vs2010
+
+pause
\ No newline at end of file
diff --git a/msvc/vs2010_dx11.bat b/build/vs2010_dx11.bat
similarity index 100%
rename from msvc/vs2010_dx11.bat
rename to build/vs2010_dx11.bat
diff --git a/msvc/vs2010_opencl.bat b/build/vs2010_opencl.bat
similarity index 100%
rename from msvc/vs2010_opencl.bat
rename to build/vs2010_opencl.bat
diff --git a/msvc/vs_all.bat b/build/vs_all.bat
similarity index 100%
rename from msvc/vs_all.bat
rename to build/vs_all.bat
diff --git a/msvc/vs2010.bat b/msvc/vs2010.bat
deleted file mode 100644
index 839456178..000000000
--- a/msvc/vs2010.bat
+++ /dev/null
@@ -1,4 +0,0 @@
-
-premake4 vs2010
-
-pause
\ No newline at end of file
diff --git a/src/BulletCollision/BroadphaseCollision/btCollisionAlgorithm.h b/src/BulletCollision/BroadphaseCollision/btCollisionAlgorithm.h
index 36eec9717..405656236 100644
--- a/src/BulletCollision/BroadphaseCollision/btCollisionAlgorithm.h
+++ b/src/BulletCollision/BroadphaseCollision/btCollisionAlgorithm.h
@@ -23,6 +23,7 @@ struct btBroadphaseProxy;
 class btDispatcher;
 class btManifoldResult;
 class btCollisionObject;
+struct btCollisionObjectWrapper;
 struct btDispatcherInfo;
 class	btPersistentManifold;
 
@@ -69,7 +70,7 @@ public:
 
 	virtual ~btCollisionAlgorithm() {};
 
-	virtual void processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut) = 0;
+	virtual void processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut) = 0;
 
 	virtual btScalar calculateTimeOfImpact(btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut) = 0;
 
diff --git a/src/BulletCollision/BroadphaseCollision/btDbvt.h b/src/BulletCollision/BroadphaseCollision/btDbvt.h
index 409da80ae..b64936844 100644
--- a/src/BulletCollision/BroadphaseCollision/btDbvt.h
+++ b/src/BulletCollision/BroadphaseCollision/btDbvt.h
@@ -57,7 +57,7 @@ subject to the following restrictions:
 // Specific methods implementation
 
 //SSE gives errors on a MSVC 7.1
-#if defined (BT_USE_SSE) && defined (_WIN32)
+#if defined (BT_USE_SSE) //&& defined (_WIN32)
 #define DBVT_SELECT_IMPL		DBVT_IMPL_SSE
 #define DBVT_MERGE_IMPL			DBVT_IMPL_SSE
 #define DBVT_INT0_IMPL			DBVT_IMPL_SSE
@@ -160,6 +160,10 @@ struct	btDbvtAabbMm
 		btDbvtAabbMm& r);
 	DBVT_INLINE friend bool			NotEqual(	const btDbvtAabbMm& a,
 		const btDbvtAabbMm& b);
+    
+    DBVT_INLINE btVector3&	tMins()	{ return(mi); }
+	DBVT_INLINE btVector3&	tMaxs()	{ return(mx); }
+    
 private:
 	DBVT_INLINE void				AddSpan(const btVector3& d,btScalar& smi,btScalar& smx) const;
 private:
@@ -320,7 +324,7 @@ struct	btDbvt
 	DBVT_PREFIX
 		void		collideTV(	const btDbvtNode* root,
 		const btDbvtVolume& volume,
-		DBVT_IPOLICY);
+		DBVT_IPOLICY) const;
 	///rayTest is a re-entrant ray test, and can be called in parallel as long as the btAlignedAlloc is thread-safe (uses locking etc)
 	///rayTest is slower than rayTestInternal, because it builds a local stack, using memory allocations, and it recomputes signs/rayDirectionInverses each time
 	DBVT_PREFIX
@@ -519,7 +523,11 @@ DBVT_INLINE bool		Intersect(	const btDbvtAabbMm& a,
 #if	DBVT_INT0_IMPL == DBVT_IMPL_SSE
 	const __m128	rt(_mm_or_ps(	_mm_cmplt_ps(_mm_load_ps(b.mx),_mm_load_ps(a.mi)),
 		_mm_cmplt_ps(_mm_load_ps(a.mx),_mm_load_ps(b.mi))));
+#if defined (_WIN32)
 	const __int32*	pu((const __int32*)&rt);
+#else
+    const int*	pu((const int*)&rt);
+#endif
 	return((pu[0]|pu[1]|pu[2])==0);
 #else
 	return(	(a.mi.x()<=b.mx.x())&&
@@ -568,7 +576,12 @@ DBVT_INLINE int			Select(	const btDbvtAabbMm& o,
 							   const btDbvtAabbMm& b)
 {
 #if	DBVT_SELECT_IMPL == DBVT_IMPL_SSE
+    
+#if defined (_WIN32)
 	static ATTRIBUTE_ALIGNED16(const unsigned __int32)	mask[]={0x7fffffff,0x7fffffff,0x7fffffff,0x7fffffff};
+#else
+    static ATTRIBUTE_ALIGNED16(const unsigned int)	mask[]={0x7fffffff,0x7fffffff,0x7fffffff,0x00000000 /*0x7fffffff*/};
+#endif
 	///@todo: the intrinsic version is 11% slower
 #if DBVT_USE_INTRINSIC_SSE
 
@@ -908,7 +921,7 @@ inline void		btDbvt::collideTT(	const btDbvtNode* root0,
 DBVT_PREFIX
 inline void		btDbvt::collideTV(	const btDbvtNode* root,
 								  const btDbvtVolume& vol,
-								  DBVT_IPOLICY)
+								  DBVT_IPOLICY) const
 {
 	DBVT_CHECKTYPE
 		if(root)
diff --git a/src/BulletCollision/BroadphaseCollision/btDispatcher.h b/src/BulletCollision/BroadphaseCollision/btDispatcher.h
index a79cf9402..1ebb37797 100644
--- a/src/BulletCollision/BroadphaseCollision/btDispatcher.h
+++ b/src/BulletCollision/BroadphaseCollision/btDispatcher.h
@@ -22,7 +22,7 @@ struct btBroadphaseProxy;
 class btRigidBody;
 class	btCollisionObject;
 class btOverlappingPairCache;
-
+struct btCollisionObjectWrapper;
 
 class btPersistentManifold;
 class btStackAlloc;
@@ -76,17 +76,17 @@ class btDispatcher
 public:
 	virtual ~btDispatcher() ;
 
-	virtual btCollisionAlgorithm* findAlgorithm(btCollisionObject* body0,btCollisionObject* body1,btPersistentManifold* sharedManifold=0) = 0;
+	virtual btCollisionAlgorithm* findAlgorithm(const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,btPersistentManifold* sharedManifold=0) = 0;
 
-	virtual btPersistentManifold*	getNewManifold(void* body0,void* body1)=0;
+	virtual btPersistentManifold*	getNewManifold(const btCollisionObject* b0,const btCollisionObject* b1)=0;
 
 	virtual void releaseManifold(btPersistentManifold* manifold)=0;
 
 	virtual void clearManifold(btPersistentManifold* manifold)=0;
 
-	virtual bool	needsCollision(btCollisionObject* body0,btCollisionObject* body1) = 0;
+	virtual bool	needsCollision(const btCollisionObject* body0,const btCollisionObject* body1) = 0;
 
-	virtual bool	needsResponse(btCollisionObject* body0,btCollisionObject* body1)=0;
+	virtual bool	needsResponse(const btCollisionObject* body0,const btCollisionObject* body1)=0;
 
 	virtual void	dispatchAllCollisionPairs(btOverlappingPairCache* pairCache,const btDispatcherInfo& dispatchInfo,btDispatcher* dispatcher)  =0;
 
diff --git a/src/BulletCollision/BroadphaseCollision/btQuantizedBvh.h b/src/BulletCollision/BroadphaseCollision/btQuantizedBvh.h
index 579cc9a5c..78382da79 100644
--- a/src/BulletCollision/BroadphaseCollision/btQuantizedBvh.h
+++ b/src/BulletCollision/BroadphaseCollision/btQuantizedBvh.h
@@ -78,8 +78,10 @@ ATTRIBUTE_ALIGNED16	(struct) btQuantizedBvhNode
 	int	getTriangleIndex() const
 	{
 		btAssert(isLeafNode());
+		unsigned int x=0;
+		unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS);
 		// Get only the lower bits where the triangle index is stored
-		return (m_escapeIndexOrTriangleIndex&~((~0)<<(31-MAX_NUM_PARTS_IN_BITS)));
+		return (m_escapeIndexOrTriangleIndex&~(y));
 	}
 	int	getPartId() const
 	{
diff --git a/src/BulletCollision/CollisionDispatch/SphereTriangleDetector.cpp b/src/BulletCollision/CollisionDispatch/SphereTriangleDetector.cpp
index 23a5c7526..634017809 100644
--- a/src/BulletCollision/CollisionDispatch/SphereTriangleDetector.cpp
+++ b/src/BulletCollision/CollisionDispatch/SphereTriangleDetector.cpp
@@ -158,7 +158,6 @@ bool SphereTriangleDetector::collide(const btVector3& sphereCenter,btVector3 &po
 				depth = -(radius-distance);
 			} else
 			{
-				btScalar distance = 0.f;
 				resultNormal = normal;
 				point = contactPoint;
 				depth = -radius;
diff --git a/src/BulletCollision/CollisionDispatch/btActivatingCollisionAlgorithm.cpp b/src/BulletCollision/CollisionDispatch/btActivatingCollisionAlgorithm.cpp
index 7e5da6c58..57f146493 100644
--- a/src/BulletCollision/CollisionDispatch/btActivatingCollisionAlgorithm.cpp
+++ b/src/BulletCollision/CollisionDispatch/btActivatingCollisionAlgorithm.cpp
@@ -24,7 +24,7 @@ btActivatingCollisionAlgorithm::btActivatingCollisionAlgorithm (const btCollisio
 //m_colObj1(0)
 {
 }
-btActivatingCollisionAlgorithm::btActivatingCollisionAlgorithm (const btCollisionAlgorithmConstructionInfo& ci, btCollisionObject* colObj0,btCollisionObject* colObj1)
+btActivatingCollisionAlgorithm::btActivatingCollisionAlgorithm (const btCollisionAlgorithmConstructionInfo& ci, const btCollisionObjectWrapper* ,const btCollisionObjectWrapper* )
 :btCollisionAlgorithm(ci)
 //,
 //m_colObj0(0),
diff --git a/src/BulletCollision/CollisionDispatch/btActivatingCollisionAlgorithm.h b/src/BulletCollision/CollisionDispatch/btActivatingCollisionAlgorithm.h
index 25fe08894..489812b96 100644
--- a/src/BulletCollision/CollisionDispatch/btActivatingCollisionAlgorithm.h
+++ b/src/BulletCollision/CollisionDispatch/btActivatingCollisionAlgorithm.h
@@ -28,7 +28,7 @@ public:
 
 	btActivatingCollisionAlgorithm (const btCollisionAlgorithmConstructionInfo& ci);
 
-	btActivatingCollisionAlgorithm (const btCollisionAlgorithmConstructionInfo& ci, btCollisionObject* colObj0,btCollisionObject* colObj1);
+	btActivatingCollisionAlgorithm (const btCollisionAlgorithmConstructionInfo& ci, const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap);
 
 	virtual ~btActivatingCollisionAlgorithm();
 
diff --git a/src/BulletCollision/CollisionDispatch/btBox2dBox2dCollisionAlgorithm.cpp b/src/BulletCollision/CollisionDispatch/btBox2dBox2dCollisionAlgorithm.cpp
index 2182d0d7e..ee3f95467 100644
--- a/src/BulletCollision/CollisionDispatch/btBox2dBox2dCollisionAlgorithm.cpp
+++ b/src/BulletCollision/CollisionDispatch/btBox2dBox2dCollisionAlgorithm.cpp
@@ -22,17 +22,18 @@ subject to the following restrictions:
 #include "BulletCollision/CollisionDispatch/btCollisionObject.h"
 #include "BulletCollision/CollisionDispatch/btBoxBoxDetector.h"
 #include "BulletCollision/CollisionShapes/btBox2dShape.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h"
 
 #define USE_PERSISTENT_CONTACTS 1
 
-btBox2dBox2dCollisionAlgorithm::btBox2dBox2dCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* obj0,btCollisionObject* obj1)
-: btActivatingCollisionAlgorithm(ci,obj0,obj1),
+btBox2dBox2dCollisionAlgorithm::btBox2dBox2dCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* obj0Wrap,const btCollisionObjectWrapper* obj1Wrap)
+: btActivatingCollisionAlgorithm(ci,obj0Wrap,obj1Wrap),
 m_ownManifold(false),
 m_manifoldPtr(mf)
 {
-	if (!m_manifoldPtr && m_dispatcher->needsCollision(obj0,obj1))
+	if (!m_manifoldPtr && m_dispatcher->needsCollision(obj0Wrap->getCollisionObject(),obj1Wrap->getCollisionObject()))
 	{
-		m_manifoldPtr = m_dispatcher->getNewManifold(obj0,obj1);
+		m_manifoldPtr = m_dispatcher->getNewManifold(obj0Wrap->getCollisionObject(),obj1Wrap->getCollisionObject());
 		m_ownManifold = true;
 	}
 }
@@ -52,19 +53,18 @@ btBox2dBox2dCollisionAlgorithm::~btBox2dBox2dCollisionAlgorithm()
 void b2CollidePolygons(btManifoldResult* manifold,  const btBox2dShape* polyA, const btTransform& xfA, const btBox2dShape* polyB, const btTransform& xfB);
 
 //#include <stdio.h>
-void btBox2dBox2dCollisionAlgorithm::processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
+void btBox2dBox2dCollisionAlgorithm::processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
 {
 	if (!m_manifoldPtr)
 		return;
 
-	btCollisionObject*	col0 = body0;
-	btCollisionObject*	col1 = body1;
-	btBox2dShape* box0 = (btBox2dShape*)col0->getCollisionShape();
-	btBox2dShape* box1 = (btBox2dShape*)col1->getCollisionShape();
+	
+	const btBox2dShape* box0 = (const btBox2dShape*)body0Wrap->getCollisionShape();
+	const btBox2dShape* box1 = (const btBox2dShape*)body1Wrap->getCollisionShape();
 
 	resultOut->setPersistentManifold(m_manifoldPtr);
 
-	b2CollidePolygons(resultOut,box0,col0->getWorldTransform(),box1,col1->getWorldTransform());
+	b2CollidePolygons(resultOut,box0,body0Wrap->getWorldTransform(),box1,body1Wrap->getWorldTransform());
 
 	//  refreshContactPoints is only necessary when using persistent contact points. otherwise all points are newly added
 	if (m_ownManifold)
@@ -151,15 +151,8 @@ static btScalar EdgeSeparation(const btBox2dShape* poly1, const btTransform& xf1
 	int index = 0;
 	btScalar minDot = BT_LARGE_FLOAT;
 
-	for (int i = 0; i < count2; ++i)
-	{
-		btScalar dot = b2Dot(vertices2[i], normal1);
-		if (dot < minDot)
-		{
-			minDot = dot;
-			index = i;
-		}
-	}
+    if( count2 > 0 )
+        index = (int) normal1.minDot( vertices2, count2, minDot);
 
 	btVector3 v1 = b2Mul(xf1, vertices1[edge1]);
 	btVector3 v2 = b2Mul(xf2, vertices2[index]);
@@ -181,16 +174,9 @@ static btScalar FindMaxSeparation(int* edgeIndex,
 
 	// Find edge normal on poly1 that has the largest projection onto d.
 	int edge = 0;
-	btScalar maxDot = -BT_LARGE_FLOAT;
-	for (int i = 0; i < count1; ++i)
-	{
-		btScalar dot = b2Dot(normals1[i], dLocal1);
-		if (dot > maxDot)
-		{
-			maxDot = dot;
-			edge = i;
-		}
-	}
+    btScalar maxDot;
+    if( count1 > 0 )
+        edge = (int) dLocal1.maxDot( normals1, count1, maxDot);
 
 	// Get the separation for the edge normal.
 	btScalar s = EdgeSeparation(poly1, xf1, edge, poly2, xf2);
diff --git a/src/BulletCollision/CollisionDispatch/btBox2dBox2dCollisionAlgorithm.h b/src/BulletCollision/CollisionDispatch/btBox2dBox2dCollisionAlgorithm.h
index 97c5be770..6ea6e89bd 100644
--- a/src/BulletCollision/CollisionDispatch/btBox2dBox2dCollisionAlgorithm.h
+++ b/src/BulletCollision/CollisionDispatch/btBox2dBox2dCollisionAlgorithm.h
@@ -33,11 +33,11 @@ public:
 	btBox2dBox2dCollisionAlgorithm(const btCollisionAlgorithmConstructionInfo& ci)
 		: btActivatingCollisionAlgorithm(ci) {}
 
-	virtual void processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
+	virtual void processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 
 	virtual btScalar calculateTimeOfImpact(btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 
-	btBox2dBox2dCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* body0,btCollisionObject* body1);
+	btBox2dBox2dCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap);
 
 	virtual ~btBox2dBox2dCollisionAlgorithm();
 
@@ -52,11 +52,11 @@ public:
 
 	struct CreateFunc :public 	btCollisionAlgorithmCreateFunc
 	{
-		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, btCollisionObject* body0,btCollisionObject* body1)
+		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
 		{
 			int bbsize = sizeof(btBox2dBox2dCollisionAlgorithm);
 			void* ptr = ci.m_dispatcher1->allocateCollisionAlgorithm(bbsize);
-			return new(ptr) btBox2dBox2dCollisionAlgorithm(0,ci,body0,body1);
+			return new(ptr) btBox2dBox2dCollisionAlgorithm(0,ci,body0Wrap,body1Wrap);
 		}
 	};
 
diff --git a/src/BulletCollision/CollisionDispatch/btBoxBoxCollisionAlgorithm.cpp b/src/BulletCollision/CollisionDispatch/btBoxBoxCollisionAlgorithm.cpp
index 496288534..ac68968f5 100644
--- a/src/BulletCollision/CollisionDispatch/btBoxBoxCollisionAlgorithm.cpp
+++ b/src/BulletCollision/CollisionDispatch/btBoxBoxCollisionAlgorithm.cpp
@@ -18,17 +18,17 @@ subject to the following restrictions:
 #include "BulletCollision/CollisionShapes/btBoxShape.h"
 #include "BulletCollision/CollisionDispatch/btCollisionObject.h"
 #include "btBoxBoxDetector.h"
-
+#include "BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h"
 #define USE_PERSISTENT_CONTACTS 1
 
-btBoxBoxCollisionAlgorithm::btBoxBoxCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* obj0,btCollisionObject* obj1)
-: btActivatingCollisionAlgorithm(ci,obj0,obj1),
+btBoxBoxCollisionAlgorithm::btBoxBoxCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
+: btActivatingCollisionAlgorithm(ci,body0Wrap,body1Wrap),
 m_ownManifold(false),
 m_manifoldPtr(mf)
 {
-	if (!m_manifoldPtr && m_dispatcher->needsCollision(obj0,obj1))
+	if (!m_manifoldPtr && m_dispatcher->needsCollision(body0Wrap->getCollisionObject(),body1Wrap->getCollisionObject()))
 	{
-		m_manifoldPtr = m_dispatcher->getNewManifold(obj0,obj1);
+		m_manifoldPtr = m_dispatcher->getNewManifold(body0Wrap->getCollisionObject(),body1Wrap->getCollisionObject());
 		m_ownManifold = true;
 	}
 }
@@ -42,15 +42,14 @@ btBoxBoxCollisionAlgorithm::~btBoxBoxCollisionAlgorithm()
 	}
 }
 
-void btBoxBoxCollisionAlgorithm::processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
+void btBoxBoxCollisionAlgorithm::processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
 {
 	if (!m_manifoldPtr)
 		return;
 
-	btCollisionObject*	col0 = body0;
-	btCollisionObject*	col1 = body1;
-	btBoxShape* box0 = (btBoxShape*)col0->getCollisionShape();
-	btBoxShape* box1 = (btBoxShape*)col1->getCollisionShape();
+	
+	const btBoxShape* box0 = (btBoxShape*)body0Wrap->getCollisionShape();
+	const btBoxShape* box1 = (btBoxShape*)body1Wrap->getCollisionShape();
 
 
 
@@ -62,8 +61,8 @@ void btBoxBoxCollisionAlgorithm::processCollision (btCollisionObject* body0,btCo
 
 	btDiscreteCollisionDetectorInterface::ClosestPointInput input;
 	input.m_maximumDistanceSquared = BT_LARGE_FLOAT;
-	input.m_transformA = body0->getWorldTransform();
-	input.m_transformB = body1->getWorldTransform();
+	input.m_transformA = body0Wrap->getWorldTransform();
+	input.m_transformB = body1Wrap->getWorldTransform();
 
 	btBoxBoxDetector detector(box0,box1);
 	detector.getClosestPoints(input,*resultOut,dispatchInfo.m_debugDraw);
diff --git a/src/BulletCollision/CollisionDispatch/btBoxBoxCollisionAlgorithm.h b/src/BulletCollision/CollisionDispatch/btBoxBoxCollisionAlgorithm.h
index f0bbae61e..59808df5a 100644
--- a/src/BulletCollision/CollisionDispatch/btBoxBoxCollisionAlgorithm.h
+++ b/src/BulletCollision/CollisionDispatch/btBoxBoxCollisionAlgorithm.h
@@ -33,11 +33,11 @@ public:
 	btBoxBoxCollisionAlgorithm(const btCollisionAlgorithmConstructionInfo& ci)
 		: btActivatingCollisionAlgorithm(ci) {}
 
-	virtual void processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
+	virtual void processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 
 	virtual btScalar calculateTimeOfImpact(btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 
-	btBoxBoxCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* body0,btCollisionObject* body1);
+	btBoxBoxCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap);
 
 	virtual ~btBoxBoxCollisionAlgorithm();
 
@@ -52,11 +52,11 @@ public:
 
 	struct CreateFunc :public 	btCollisionAlgorithmCreateFunc
 	{
-		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, btCollisionObject* body0,btCollisionObject* body1)
+		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
 		{
 			int bbsize = sizeof(btBoxBoxCollisionAlgorithm);
 			void* ptr = ci.m_dispatcher1->allocateCollisionAlgorithm(bbsize);
-			return new(ptr) btBoxBoxCollisionAlgorithm(0,ci,body0,body1);
+			return new(ptr) btBoxBoxCollisionAlgorithm(0,ci,body0Wrap,body1Wrap);
 		}
 	};
 
diff --git a/src/BulletCollision/CollisionDispatch/btBoxBoxDetector.cpp b/src/BulletCollision/CollisionDispatch/btBoxBoxDetector.cpp
index a7c8cf140..7043bde34 100644
--- a/src/BulletCollision/CollisionDispatch/btBoxBoxDetector.cpp
+++ b/src/BulletCollision/CollisionDispatch/btBoxBoxDetector.cpp
@@ -24,7 +24,7 @@ subject to the following restrictions:
 #include <float.h>
 #include <string.h>
 
-btBoxBoxDetector::btBoxBoxDetector(btBoxShape* box1,btBoxShape* box2)
+btBoxBoxDetector::btBoxBoxDetector(const btBoxShape* box1,const btBoxShape* box2)
 : m_box1(box1),
 m_box2(box2)
 {
diff --git a/src/BulletCollision/CollisionDispatch/btBoxBoxDetector.h b/src/BulletCollision/CollisionDispatch/btBoxBoxDetector.h
index 3c941f7de..392437770 100644
--- a/src/BulletCollision/CollisionDispatch/btBoxBoxDetector.h
+++ b/src/BulletCollision/CollisionDispatch/btBoxBoxDetector.h
@@ -28,12 +28,12 @@ class btBoxShape;
 /// re-distributed under the Zlib license with permission from Russell L. Smith
 struct btBoxBoxDetector : public btDiscreteCollisionDetectorInterface
 {
-	btBoxShape* m_box1;
-	btBoxShape* m_box2;
+	const btBoxShape* m_box1;
+	const btBoxShape* m_box2;
 
 public:
 
-	btBoxBoxDetector(btBoxShape* box1,btBoxShape* box2);
+	btBoxBoxDetector(const btBoxShape* box1,const btBoxShape* box2);
 
 	virtual ~btBoxBoxDetector() {};
 
diff --git a/src/BulletCollision/CollisionDispatch/btCollisionCreateFunc.h b/src/BulletCollision/CollisionDispatch/btCollisionCreateFunc.h
index 1d7e74401..62ee66c4e 100644
--- a/src/BulletCollision/CollisionDispatch/btCollisionCreateFunc.h
+++ b/src/BulletCollision/CollisionDispatch/btCollisionCreateFunc.h
@@ -19,7 +19,7 @@ subject to the following restrictions:
 #include "LinearMath/btAlignedObjectArray.h"
 class btCollisionAlgorithm;
 class btCollisionObject;
-
+struct btCollisionObjectWrapper;
 struct btCollisionAlgorithmConstructionInfo;
 
 ///Used by the btCollisionDispatcher to register and create instances for btCollisionAlgorithm
@@ -33,11 +33,11 @@ struct btCollisionAlgorithmCreateFunc
 	}
 	virtual ~btCollisionAlgorithmCreateFunc(){};
 
-	virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& , btCollisionObject* body0,btCollisionObject* body1)
+	virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& , const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
 	{
 		
-		(void)body0;
-		(void)body1;
+		(void)body0Wrap;
+		(void)body1Wrap;
 		return 0;
 	}
 };
diff --git a/src/BulletCollision/CollisionDispatch/btCollisionDispatcher.cpp b/src/BulletCollision/CollisionDispatch/btCollisionDispatcher.cpp
index 29674f3be..e5418c0ab 100644
--- a/src/BulletCollision/CollisionDispatch/btCollisionDispatcher.cpp
+++ b/src/BulletCollision/CollisionDispatch/btCollisionDispatcher.cpp
@@ -25,6 +25,7 @@ subject to the following restrictions:
 #include "BulletCollision/BroadphaseCollision/btOverlappingPairCache.h"
 #include "LinearMath/btPoolAllocator.h"
 #include "BulletCollision/CollisionDispatch/btCollisionConfiguration.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h"
 
 int gNumManifold = 0;
 
@@ -67,15 +68,13 @@ btCollisionDispatcher::~btCollisionDispatcher()
 {
 }
 
-btPersistentManifold*	btCollisionDispatcher::getNewManifold(void* b0,void* b1) 
+btPersistentManifold*	btCollisionDispatcher::getNewManifold(const btCollisionObject* body0,const btCollisionObject* body1) 
 { 
 	gNumManifold++;
 	
 	//btAssert(gNumManifold < 65535);
 	
 
-	btCollisionObject* body0 = (btCollisionObject*)b0;
-	btCollisionObject* body1 = (btCollisionObject*)b1;
 
 	//optional relative contact breaking threshold, turned on by default (use setDispatcherFlags to switch off feature for improved performance)
 	
@@ -143,14 +142,14 @@ void btCollisionDispatcher::releaseManifold(btPersistentManifold* manifold)
 
 	
 
-btCollisionAlgorithm* btCollisionDispatcher::findAlgorithm(btCollisionObject* body0,btCollisionObject* body1,btPersistentManifold* sharedManifold)
+btCollisionAlgorithm* btCollisionDispatcher::findAlgorithm(const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,btPersistentManifold* sharedManifold)
 {
 	
 	btCollisionAlgorithmConstructionInfo ci;
 
 	ci.m_dispatcher1 = this;
 	ci.m_manifold = sharedManifold;
-	btCollisionAlgorithm* algo = m_doubleDispatch[body0->getCollisionShape()->getShapeType()][body1->getCollisionShape()->getShapeType()]->CreateCollisionAlgorithm(ci,body0,body1);
+	btCollisionAlgorithm* algo = m_doubleDispatch[body0Wrap->getCollisionShape()->getShapeType()][body1Wrap->getCollisionShape()->getShapeType()]->CreateCollisionAlgorithm(ci,body0Wrap,body1Wrap);
 
 	return algo;
 }
@@ -158,7 +157,7 @@ btCollisionAlgorithm* btCollisionDispatcher::findAlgorithm(btCollisionObject* bo
 
 
 
-bool	btCollisionDispatcher::needsResponse(btCollisionObject* body0,btCollisionObject* body1)
+bool	btCollisionDispatcher::needsResponse(const btCollisionObject* body0,const btCollisionObject* body1)
 {
 	//here you can do filtering
 	bool hasResponse = 
@@ -169,7 +168,7 @@ bool	btCollisionDispatcher::needsResponse(btCollisionObject* body0,btCollisionOb
 	return hasResponse;
 }
 
-bool	btCollisionDispatcher::needsCollision(btCollisionObject* body0,btCollisionObject* body1)
+bool	btCollisionDispatcher::needsCollision(const btCollisionObject* body0,const btCollisionObject* body1)
 {
 	btAssert(body0);
 	btAssert(body1);
@@ -259,20 +258,25 @@ void btCollisionDispatcher::defaultNearCallback(btBroadphasePair& collisionPair,
 
 		if (dispatcher.needsCollision(colObj0,colObj1))
 		{
+			btCollisionObjectWrapper obj0Wrap(0,colObj0->getCollisionShape(),colObj0,colObj0->getWorldTransform());
+			btCollisionObjectWrapper obj1Wrap(0,colObj1->getCollisionShape(),colObj1,colObj1->getWorldTransform());
+
+
 			//dispatcher will keep algorithms persistent in the collision pair
 			if (!collisionPair.m_algorithm)
 			{
-				collisionPair.m_algorithm = dispatcher.findAlgorithm(colObj0,colObj1);
+				collisionPair.m_algorithm = dispatcher.findAlgorithm(&obj0Wrap,&obj1Wrap);
 			}
 
 			if (collisionPair.m_algorithm)
 			{
-				btManifoldResult contactPointResult(colObj0,colObj1);
+				btManifoldResult contactPointResult(&obj0Wrap,&obj1Wrap);
 				
 				if (dispatchInfo.m_dispatchFunc == 		btDispatcherInfo::DISPATCH_DISCRETE)
 				{
 					//discrete collision detection query
-					collisionPair.m_algorithm->processCollision(colObj0,colObj1,dispatchInfo,&contactPointResult);
+					
+					collisionPair.m_algorithm->processCollision(&obj0Wrap,&obj1Wrap,dispatchInfo,&contactPointResult);
 				} else
 				{
 					//continuous collision detection query, time of impact (toi)
diff --git a/src/BulletCollision/CollisionDispatch/btCollisionDispatcher.h b/src/BulletCollision/CollisionDispatch/btCollisionDispatcher.h
index 5accad9a9..92696ee54 100644
--- a/src/BulletCollision/CollisionDispatch/btCollisionDispatcher.h
+++ b/src/BulletCollision/CollisionDispatch/btCollisionDispatcher.h
@@ -108,19 +108,18 @@ public:
 
 	virtual ~btCollisionDispatcher();
 
-	virtual btPersistentManifold*	getNewManifold(void* b0,void* b1);
+	virtual btPersistentManifold*	getNewManifold(const btCollisionObject* b0,const btCollisionObject* b1);
 	
 	virtual void releaseManifold(btPersistentManifold* manifold);
 
 
 	virtual void clearManifold(btPersistentManifold* manifold);
 
-			
-	btCollisionAlgorithm* findAlgorithm(btCollisionObject* body0,btCollisionObject* body1,btPersistentManifold* sharedManifold = 0);
+	btCollisionAlgorithm* findAlgorithm(const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,btPersistentManifold* sharedManifold = 0);
 		
-	virtual bool	needsCollision(btCollisionObject* body0,btCollisionObject* body1);
+	virtual bool	needsCollision(const btCollisionObject* body0,const btCollisionObject* body1);
 	
-	virtual bool	needsResponse(btCollisionObject* body0,btCollisionObject* body1);
+	virtual bool	needsResponse(const btCollisionObject* body0,const btCollisionObject* body1);
 	
 	virtual void	dispatchAllCollisionPairs(btOverlappingPairCache* pairCache,const btDispatcherInfo& dispatchInfo,btDispatcher* dispatcher) ;
 
diff --git a/src/BulletCollision/CollisionDispatch/btCollisionObject.cpp b/src/BulletCollision/CollisionDispatch/btCollisionObject.cpp
index 580ea3458..88d2ba55b 100644
--- a/src/BulletCollision/CollisionDispatch/btCollisionObject.cpp
+++ b/src/BulletCollision/CollisionDispatch/btCollisionObject.cpp
@@ -46,18 +46,18 @@ btCollisionObject::~btCollisionObject()
 {
 }
 
-void btCollisionObject::setActivationState(int newState) 
+void btCollisionObject::setActivationState(int newState) const
 { 
 	if ( (m_activationState1 != DISABLE_DEACTIVATION) && (m_activationState1 != DISABLE_SIMULATION))
 		m_activationState1 = newState;
 }
 
-void btCollisionObject::forceActivationState(int newState)
+void btCollisionObject::forceActivationState(int newState) const
 {
 	m_activationState1 = newState;
 }
 
-void btCollisionObject::activate(bool forceActivation)
+void btCollisionObject::activate(bool forceActivation) const
 {
 	if (forceActivation || !(m_collisionFlags & (CF_STATIC_OBJECT|CF_KINEMATIC_OBJECT)))
 	{
@@ -85,7 +85,6 @@ const char* btCollisionObject::serialize(void* dataBuffer, btSerializer* seriali
 	dataOut->m_islandTag1 = m_islandTag1;
 	dataOut->m_companionId = m_companionId;
 	dataOut->m_activationState1 = m_activationState1;
-	dataOut->m_activationState1 = m_activationState1;
 	dataOut->m_deactivationTime = m_deactivationTime;
 	dataOut->m_friction = m_friction;
 	dataOut->m_restitution = m_restitution;
@@ -100,7 +99,6 @@ const char* btCollisionObject::serialize(void* dataBuffer, btSerializer* seriali
 	dataOut->m_hitFraction = m_hitFraction;
 	dataOut->m_ccdSweptSphereRadius = m_ccdSweptSphereRadius;
 	dataOut->m_ccdMotionThreshold = m_ccdMotionThreshold;
-	dataOut->m_ccdMotionThreshold = m_ccdMotionThreshold;
 	dataOut->m_checkCollideWith = m_checkCollideWith;
 
 	return btCollisionObjectDataName;
diff --git a/src/BulletCollision/CollisionDispatch/btCollisionObject.h b/src/BulletCollision/CollisionDispatch/btCollisionObject.h
index 3a11c967a..f1b8b0739 100644
--- a/src/BulletCollision/CollisionDispatch/btCollisionObject.h
+++ b/src/BulletCollision/CollisionDispatch/btCollisionObject.h
@@ -80,8 +80,8 @@ protected:
 	int				m_islandTag1;
 	int				m_companionId;
 
-	int				m_activationState1;
-	btScalar			m_deactivationTime;
+	mutable int				m_activationState1;
+	mutable btScalar			m_deactivationTime;
 
 	btScalar		m_friction;
 	btScalar		m_restitution;
@@ -105,7 +105,7 @@ protected:
 	/// If some object should have elaborate collision filtering by sub-classes
 	int			m_checkCollideWith;
 
-	virtual bool	checkCollideWithOverride(btCollisionObject* /* co */)
+	virtual bool	checkCollideWithOverride(const btCollisionObject* /* co */) const
 	{
 		return true;
 	}
@@ -207,22 +207,9 @@ public:
 		return m_collisionShape;
 	}
 
-	SIMD_FORCE_INLINE const btCollisionShape*	getRootCollisionShape() const
-	{
-		return m_rootCollisionShape;
-	}
+	
 
-	SIMD_FORCE_INLINE btCollisionShape*	getRootCollisionShape()
-	{
-		return m_rootCollisionShape;
-	}
-
-	///Avoid using this internal API call
-	///internalSetTemporaryCollisionShape is used to temporary replace the actual collision shape by a child collision shape.
-	void	internalSetTemporaryCollisionShape(btCollisionShape* collisionShape)
-	{
-		m_collisionShape = collisionShape;
-	}
+	
 
 	///Avoid using this internal API call, the extension pointer is used by some Bullet extensions. 
 	///If you need to store your own user pointer, use 'setUserPointer/getUserPointer' instead.
@@ -239,7 +226,7 @@ public:
 
 	SIMD_FORCE_INLINE	int	getActivationState() const { return m_activationState1;}
 	
-	void setActivationState(int newState);
+	void setActivationState(int newState) const;
 
 	void	setDeactivationTime(btScalar time)
 	{
@@ -250,9 +237,9 @@ public:
 		return m_deactivationTime;
 	}
 
-	void forceActivationState(int newState);
+	void forceActivationState(int newState) const;
 
-	void	activate(bool forceActivation = false);
+	void	activate(bool forceActivation = false) const;
 
 	SIMD_FORCE_INLINE bool isActive() const
 	{
@@ -433,7 +420,7 @@ public:
 	}
 
 
-	inline bool checkCollideWith(btCollisionObject* co)
+	inline bool checkCollideWith(const btCollisionObject* co) const
 	{
 		if (m_checkCollideWith)
 			return checkCollideWithOverride(co);
diff --git a/src/BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h b/src/BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h
new file mode 100644
index 000000000..ac2e85cd0
--- /dev/null
+++ b/src/BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h
@@ -0,0 +1,40 @@
+#ifndef BT_COLLISION_OBJECT_WRAPPER_H
+#define BT_COLLISION_OBJECT_WRAPPER_H
+
+///btCollisionObjectWrapperis an internal data structure. 
+///Most users can ignore this and use btCollisionObject and btCollisionShape instead
+class btCollisionShape;
+class btCollisionObject;
+class btTransform;
+#include "LinearMath/btScalar.h" // for SIMD_FORCE_INLINE definition
+
+#define BT_DECLARE_STACK_ONLY_OBJECT \
+	private: \
+		void* operator new(size_t size); \
+		void operator delete(void*);
+
+struct btCollisionObjectWrapper;
+struct btCollisionObjectWrapper
+{
+BT_DECLARE_STACK_ONLY_OBJECT
+
+private:
+	btCollisionObjectWrapper(const btCollisionObjectWrapper&); // not implemented. Not allowed.
+	btCollisionObjectWrapper* operator=(const btCollisionObjectWrapper&);
+
+public:
+	const btCollisionObjectWrapper* m_parent;
+	const btCollisionShape* m_shape;
+	const btCollisionObject* m_collisionObject;
+	const btTransform& m_worldTransform;
+
+	btCollisionObjectWrapper(const btCollisionObjectWrapper* parent, const btCollisionShape* shape, const btCollisionObject* collisionObject, const btTransform& worldTransform)
+	: m_parent(parent), m_shape(shape), m_collisionObject(collisionObject), m_worldTransform(worldTransform)
+	{}
+
+	SIMD_FORCE_INLINE const btTransform& getWorldTransform() const { return m_worldTransform; }
+	SIMD_FORCE_INLINE const btCollisionObject* getCollisionObject() const { return m_collisionObject; }
+	SIMD_FORCE_INLINE const btCollisionShape* getCollisionShape() const { return m_shape; }
+};
+
+#endif //BT_COLLISION_OBJECT_WRAPPER_H
diff --git a/src/BulletCollision/CollisionDispatch/btCollisionWorld.cpp b/src/BulletCollision/CollisionDispatch/btCollisionWorld.cpp
index 66b93b88e..a9050976b 100644
--- a/src/BulletCollision/CollisionDispatch/btCollisionWorld.cpp
+++ b/src/BulletCollision/CollisionDispatch/btCollisionWorld.cpp
@@ -34,6 +34,7 @@ subject to the following restrictions:
 #include "LinearMath/btStackAlloc.h"
 #include "LinearMath/btSerializer.h"
 #include "BulletCollision/CollisionShapes/btConvexPolyhedron.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h"
 
 //#define DISABLE_DBVT_COMPOUNDSHAPE_RAYCAST_ACCELERATION
 
@@ -260,16 +261,25 @@ void	btCollisionWorld::removeCollisionObject(btCollisionObject* collisionObject)
 }
 
 
-
 void	btCollisionWorld::rayTestSingle(const btTransform& rayFromTrans,const btTransform& rayToTrans,
 										btCollisionObject* collisionObject,
 										const btCollisionShape* collisionShape,
 										const btTransform& colObjWorldTransform,
 										RayResultCallback& resultCallback)
+{
+	btCollisionObjectWrapper colObWrap(0,collisionShape,collisionObject,colObjWorldTransform);
+	btCollisionWorld::rayTestSingleInternal(rayFromTrans,rayToTrans,&colObWrap,resultCallback);
+}
+
+void	btCollisionWorld::rayTestSingleInternal(const btTransform& rayFromTrans,const btTransform& rayToTrans,
+										const btCollisionObjectWrapper* collisionObjectWrap,
+										RayResultCallback& resultCallback)
 {
 	btSphereShape pointShape(btScalar(0.0));
 	pointShape.setMargin(0.f);
 	const btConvexShape* castShape = &pointShape;
+	const btCollisionShape* collisionShape = collisionObjectWrap->getCollisionShape();
+	const btTransform& colObjWorldTransform = collisionObjectWrap->getWorldTransform();
 
 	if (collisionShape->isConvex())
 	{
@@ -302,7 +312,7 @@ void	btCollisionWorld::rayTestSingle(const btTransform& rayFromTrans,const btTra
 					castResult.m_normal.normalize();
 					btCollisionWorld::LocalRayResult localRayResult
 						(
-						collisionObject,
+						collisionObjectWrap->getCollisionObject(),
 						0,
 						castResult.m_normal,
 						castResult.m_fraction
@@ -330,13 +340,13 @@ void	btCollisionWorld::rayTestSingle(const btTransform& rayFromTrans,const btTra
 				struct BridgeTriangleRaycastCallback : public btTriangleRaycastCallback
 				{
 					btCollisionWorld::RayResultCallback* m_resultCallback;
-					btCollisionObject*	m_collisionObject;
+					const btCollisionObject*	m_collisionObject;
 					btTriangleMeshShape*	m_triangleMesh;
 
 					btTransform m_colObjWorldTransform;
 
 					BridgeTriangleRaycastCallback( const btVector3& from,const btVector3& to,
-						btCollisionWorld::RayResultCallback* resultCallback, btCollisionObject* collisionObject,btTriangleMeshShape*	triangleMesh,const btTransform& colObjWorldTransform):
+						btCollisionWorld::RayResultCallback* resultCallback, const btCollisionObject* collisionObject,btTriangleMeshShape*	triangleMesh,const btTransform& colObjWorldTransform):
 					//@BP Mod
 					btTriangleRaycastCallback(from,to, resultCallback->m_flags),
 						m_resultCallback(resultCallback),
@@ -367,7 +377,7 @@ void	btCollisionWorld::rayTestSingle(const btTransform& rayFromTrans,const btTra
 
 				};
 
-				BridgeTriangleRaycastCallback rcb(rayFromLocal,rayToLocal,&resultCallback,collisionObject,triangleMesh,colObjWorldTransform);
+				BridgeTriangleRaycastCallback rcb(rayFromLocal,rayToLocal,&resultCallback,collisionObjectWrap->getCollisionObject(),triangleMesh,colObjWorldTransform);
 				rcb.m_hitFraction = resultCallback.m_closestHitFraction;
 				triangleMesh->performRaycast(&rcb,rayFromLocal,rayToLocal);
 			} else
@@ -385,13 +395,13 @@ void	btCollisionWorld::rayTestSingle(const btTransform& rayFromTrans,const btTra
 				struct BridgeTriangleRaycastCallback : public btTriangleRaycastCallback
 				{
 					btCollisionWorld::RayResultCallback* m_resultCallback;
-					btCollisionObject*	m_collisionObject;
+					const btCollisionObject*	m_collisionObject;
 					btConcaveShape*	m_triangleMesh;
 
 					btTransform m_colObjWorldTransform;
 
 					BridgeTriangleRaycastCallback( const btVector3& from,const btVector3& to,
-						btCollisionWorld::RayResultCallback* resultCallback, btCollisionObject* collisionObject,btConcaveShape*	triangleMesh, const btTransform& colObjWorldTransform):
+						btCollisionWorld::RayResultCallback* resultCallback, const btCollisionObject* collisionObject,btConcaveShape*	triangleMesh, const btTransform& colObjWorldTransform):
 					//@BP Mod
 					btTriangleRaycastCallback(from,to, resultCallback->m_flags),
 						m_resultCallback(resultCallback),
@@ -423,7 +433,7 @@ void	btCollisionWorld::rayTestSingle(const btTransform& rayFromTrans,const btTra
 				};
 
 
-				BridgeTriangleRaycastCallback	rcb(rayFromLocal,rayToLocal,&resultCallback,collisionObject,concaveShape, colObjWorldTransform);
+				BridgeTriangleRaycastCallback	rcb(rayFromLocal,rayToLocal,&resultCallback,collisionObjectWrap->getCollisionObject(),concaveShape, colObjWorldTransform);
 				rcb.m_hitFraction = resultCallback.m_closestHitFraction;
 
 				btVector3 rayAabbMinLocal = rayFromLocal;
@@ -468,14 +478,14 @@ void	btCollisionWorld::rayTestSingle(const btTransform& rayFromTrans,const btTra
 				
 				struct RayTester : btDbvt::ICollide
 				{
-					btCollisionObject* m_collisionObject;
+					const btCollisionObject* m_collisionObject;
 					const btCompoundShape* m_compoundShape;
 					const btTransform& m_colObjWorldTransform;
 					const btTransform& m_rayFromTrans;
 					const btTransform& m_rayToTrans;
 					RayResultCallback& m_resultCallback;
 					
-					RayTester(btCollisionObject* collisionObject,
+					RayTester(const btCollisionObject* collisionObject,
 							const btCompoundShape* compoundShape,
 							const btTransform& colObjWorldTransform,
 							const btTransform& rayFromTrans,
@@ -497,22 +507,19 @@ void	btCollisionWorld::rayTestSingle(const btTransform& rayFromTrans,const btTra
 						const btTransform& childTrans = m_compoundShape->getChildTransform(i);
 						btTransform childWorldTrans = m_colObjWorldTransform * childTrans;
 						
+						btCollisionObjectWrapper tmpOb(0,childCollisionShape,m_collisionObject,childWorldTrans);
 						// replace collision shape so that callback can determine the triangle
-						btCollisionShape* saveCollisionShape = m_collisionObject->getCollisionShape();
-						m_collisionObject->internalSetTemporaryCollisionShape((btCollisionShape*)childCollisionShape);
+
+						
 
 						LocalInfoAdder2 my_cb(i, &m_resultCallback);
 
-						rayTestSingle(
+						rayTestSingleInternal(
 							m_rayFromTrans,
 							m_rayToTrans,
-							m_collisionObject,
-							childCollisionShape,
-							childWorldTrans,
+							&tmpOb,
 							my_cb);
 						
-						// restore
-						m_collisionObject->internalSetTemporaryCollisionShape(saveCollisionShape);
 					}
 					
 					void Process(const btDbvtNode* leaf)
@@ -526,7 +533,7 @@ void	btCollisionWorld::rayTestSingle(const btTransform& rayFromTrans,const btTra
 
 
 				RayTester rayCB(
-					collisionObject,
+					collisionObjectWrap->getCollisionObject(),
 					compoundShape,
 					colObjWorldTransform,
 					rayFromTrans,
@@ -558,6 +565,17 @@ void	btCollisionWorld::objectQuerySingle(const btConvexShape* castShape,const bt
 											const btTransform& colObjWorldTransform,
 											ConvexResultCallback& resultCallback, btScalar allowedPenetration)
 {
+	btCollisionObjectWrapper tmpOb(0,collisionShape,collisionObject,colObjWorldTransform);
+	btCollisionWorld::objectQuerySingleInternal(castShape,convexFromTrans,convexToTrans,&tmpOb,resultCallback,allowedPenetration);
+}
+
+void	btCollisionWorld::objectQuerySingleInternal(const btConvexShape* castShape,const btTransform& convexFromTrans,const btTransform& convexToTrans,
+											const btCollisionObjectWrapper* colObjWrap,
+											ConvexResultCallback& resultCallback, btScalar allowedPenetration)
+{
+	const btCollisionShape* collisionShape = colObjWrap->getCollisionShape();
+	const btTransform& colObjWorldTransform = colObjWrap->getWorldTransform();
+
 	if (collisionShape->isConvex())
 	{
 		//BT_PROFILE("convexSweepConvex");
@@ -587,7 +605,7 @@ void	btCollisionWorld::objectQuerySingle(const btConvexShape* castShape,const bt
 					castResult.m_normal.normalize();
 					btCollisionWorld::LocalConvexResult localConvexResult
 						(
-						collisionObject,
+						colObjWrap->getCollisionObject(),
 						0,
 						castResult.m_normal,
 						castResult.m_hitPoint,
@@ -617,11 +635,11 @@ void	btCollisionWorld::objectQuerySingle(const btConvexShape* castShape,const bt
 				struct BridgeTriangleConvexcastCallback : public btTriangleConvexcastCallback
 				{
 					btCollisionWorld::ConvexResultCallback* m_resultCallback;
-					btCollisionObject*	m_collisionObject;
+					const btCollisionObject*	m_collisionObject;
 					btTriangleMeshShape*	m_triangleMesh;
 
 					BridgeTriangleConvexcastCallback(const btConvexShape* castShape, const btTransform& from,const btTransform& to,
-						btCollisionWorld::ConvexResultCallback* resultCallback, btCollisionObject* collisionObject,btTriangleMeshShape*	triangleMesh, const btTransform& triangleToWorld):
+						btCollisionWorld::ConvexResultCallback* resultCallback, const btCollisionObject* collisionObject,btTriangleMeshShape*	triangleMesh, const btTransform& triangleToWorld):
 					btTriangleConvexcastCallback(castShape, from,to, triangleToWorld, triangleMesh->getMargin()),
 						m_resultCallback(resultCallback),
 						m_collisionObject(collisionObject),
@@ -655,7 +673,7 @@ void	btCollisionWorld::objectQuerySingle(const btConvexShape* castShape,const bt
 
 				};
 
-				BridgeTriangleConvexcastCallback tccb(castShape, convexFromTrans,convexToTrans,&resultCallback,collisionObject,triangleMesh, colObjWorldTransform);
+				BridgeTriangleConvexcastCallback tccb(castShape, convexFromTrans,convexToTrans,&resultCallback,colObjWrap->getCollisionObject(),triangleMesh, colObjWorldTransform);
 				tccb.m_hitFraction = resultCallback.m_closestHitFraction;
 				tccb.m_allowedPenetration = allowedPenetration;
 				btVector3 boxMinLocal, boxMaxLocal;
@@ -682,7 +700,7 @@ void	btCollisionWorld::objectQuerySingle(const btConvexShape* castShape,const bt
 								castResult.m_normal.normalize();
 								btCollisionWorld::LocalConvexResult localConvexResult
 									(
-									collisionObject,
+									colObjWrap->getCollisionObject(),
 									0,
 									castResult.m_normal,
 									castResult.m_hitPoint,
@@ -709,11 +727,11 @@ void	btCollisionWorld::objectQuerySingle(const btConvexShape* castShape,const bt
 					struct BridgeTriangleConvexcastCallback : public btTriangleConvexcastCallback
 					{
 						btCollisionWorld::ConvexResultCallback* m_resultCallback;
-						btCollisionObject*	m_collisionObject;
+						const btCollisionObject*	m_collisionObject;
 						btConcaveShape*	m_triangleMesh;
 
 						BridgeTriangleConvexcastCallback(const btConvexShape* castShape, const btTransform& from,const btTransform& to,
-							btCollisionWorld::ConvexResultCallback* resultCallback, btCollisionObject* collisionObject,btConcaveShape*	triangleMesh, const btTransform& triangleToWorld):
+							btCollisionWorld::ConvexResultCallback* resultCallback, const btCollisionObject* collisionObject,btConcaveShape*	triangleMesh, const btTransform& triangleToWorld):
 						btTriangleConvexcastCallback(castShape, from,to, triangleToWorld, triangleMesh->getMargin()),
 							m_resultCallback(resultCallback),
 							m_collisionObject(collisionObject),
@@ -746,7 +764,7 @@ void	btCollisionWorld::objectQuerySingle(const btConvexShape* castShape,const bt
 
 					};
 
-					BridgeTriangleConvexcastCallback tccb(castShape, convexFromTrans,convexToTrans,&resultCallback,collisionObject,concaveShape, colObjWorldTransform);
+					BridgeTriangleConvexcastCallback tccb(castShape, convexFromTrans,convexToTrans,&resultCallback,colObjWrap->getCollisionObject(),concaveShape, colObjWorldTransform);
 					tccb.m_hitFraction = resultCallback.m_closestHitFraction;
 					tccb.m_allowedPenetration = allowedPenetration;
 					btVector3 boxMinLocal, boxMaxLocal;
@@ -773,9 +791,7 @@ void	btCollisionWorld::objectQuerySingle(const btConvexShape* castShape,const bt
 					btTransform childTrans = compoundShape->getChildTransform(i);
 					const btCollisionShape* childCollisionShape = compoundShape->getChildShape(i);
 					btTransform childWorldTrans = colObjWorldTransform * childTrans;
-					// replace collision shape so that callback can determine the triangle
-					btCollisionShape* saveCollisionShape = collisionObject->getCollisionShape();
-					collisionObject->internalSetTemporaryCollisionShape((btCollisionShape*)childCollisionShape);
+					
                     struct	LocalInfoAdder : public ConvexResultCallback {
                             ConvexResultCallback* m_userCallback;
 							int m_i;
@@ -805,14 +821,11 @@ void	btCollisionWorld::objectQuerySingle(const btConvexShape* castShape,const bt
 
                     LocalInfoAdder my_cb(i, &resultCallback);
 					
+					btCollisionObjectWrapper tmpObj(colObjWrap,childCollisionShape,colObjWrap->getCollisionObject(),childWorldTrans);
 
-					objectQuerySingle(castShape, convexFromTrans,convexToTrans,
-						collisionObject,
-						childCollisionShape,
-						childWorldTrans,
-						my_cb, allowedPenetration);
-					// restore
-					collisionObject->internalSetTemporaryCollisionShape(saveCollisionShape);
+					objectQuerySingleInternal(castShape, convexFromTrans,convexToTrans,
+						&tmpObj,my_cb, allowedPenetration);
+					
 				}
 			}
 		}
@@ -993,13 +1006,13 @@ void	btCollisionWorld::convexSweepTest(const btConvexShape* castShape, const btT
 	/* Compute AABB that encompasses angular movement */
 	{
 		btVector3 linVel, angVel;
-		btTransformUtil::calculateVelocity (convexFromTrans, convexToTrans, 1.0, linVel, angVel);
+		btTransformUtil::calculateVelocity (convexFromTrans, convexToTrans, 1.0f, linVel, angVel);
 		btVector3 zeroLinVel;
 		zeroLinVel.setValue(0,0,0);
 		btTransform R;
 		R.setIdentity ();
 		R.setRotation (convexFromTrans.getRotation());
-		castShape->calculateTemporalAabb (R, zeroLinVel, angVel, 1.0, castShapeAabbMin, castShapeAabbMax);
+		castShape->calculateTemporalAabb (R, zeroLinVel, angVel, 1.0f, castShapeAabbMin, castShapeAabbMax);
 	}
 
 #ifndef USE_BRUTEFORCE_RAYBROADPHASE
@@ -1044,26 +1057,26 @@ struct btBridgedManifoldResult : public btManifoldResult
 
 	btCollisionWorld::ContactResultCallback&	m_resultCallback;
 
-	btBridgedManifoldResult( btCollisionObject* obj0,btCollisionObject* obj1,btCollisionWorld::ContactResultCallback& resultCallback )
-		:btManifoldResult(obj0,obj1),
+	btBridgedManifoldResult( const btCollisionObjectWrapper* obj0Wrap,const btCollisionObjectWrapper* obj1Wrap,btCollisionWorld::ContactResultCallback& resultCallback )
+		:btManifoldResult(obj0Wrap,obj1Wrap),
 		m_resultCallback(resultCallback)
 	{
 	}
 
 	virtual void addContactPoint(const btVector3& normalOnBInWorld,const btVector3& pointInWorld,btScalar depth)
 	{
-		bool isSwapped = m_manifoldPtr->getBody0() != m_body0;
+		bool isSwapped = m_manifoldPtr->getBody0() != m_body0Wrap->getCollisionObject();
 		btVector3 pointA = pointInWorld + normalOnBInWorld * depth;
 		btVector3 localA;
 		btVector3 localB;
 		if (isSwapped)
 		{
-			localA = m_rootTransB.invXform(pointA );
-			localB = m_rootTransA.invXform(pointInWorld);
+			localA = m_body1Wrap->getCollisionObject()->getWorldTransform().invXform(pointA );
+			localB = m_body0Wrap->getCollisionObject()->getWorldTransform().invXform(pointInWorld);
 		} else
 		{
-			localA = m_rootTransA.invXform(pointA );
-			localB = m_rootTransB.invXform(pointInWorld);
+			localA = m_body0Wrap->getCollisionObject()->getWorldTransform().invXform(pointA );
+			localB = m_body1Wrap->getCollisionObject()->getWorldTransform().invXform(pointInWorld);
 		}
 		
 		btManifoldPoint newPt(localA,localB,normalOnBInWorld,depth);
@@ -1086,9 +1099,9 @@ struct btBridgedManifoldResult : public btManifoldResult
 		}
 
 		//experimental feature info, for per-triangle material etc.
-		btCollisionObject* obj0 = isSwapped? m_body1 : m_body0;
-		btCollisionObject* obj1 = isSwapped? m_body0 : m_body1;
-		m_resultCallback.addSingleResult(newPt,obj0,newPt.m_partId0,newPt.m_index0,obj1,newPt.m_partId1,newPt.m_index1);
+		const btCollisionObjectWrapper* obj0Wrap = isSwapped? m_body1Wrap : m_body0Wrap;
+		const btCollisionObjectWrapper* obj1Wrap = isSwapped? m_body0Wrap : m_body1Wrap;
+		m_resultCallback.addSingleResult(newPt,obj0Wrap,newPt.m_partId0,newPt.m_index0,obj1Wrap,newPt.m_partId1,newPt.m_index1);
 
 	}
 	
@@ -1120,12 +1133,16 @@ struct btSingleContactCallback : public btBroadphaseAabbCallback
 		//only perform raycast if filterMask matches
 		if(m_resultCallback.needsCollision(collisionObject->getBroadphaseHandle())) 
 		{
-			btCollisionAlgorithm* algorithm = m_world->getDispatcher()->findAlgorithm(m_collisionObject,collisionObject);
+			btCollisionObjectWrapper ob0(0,m_collisionObject->getCollisionShape(),m_collisionObject,m_collisionObject->getWorldTransform());
+			btCollisionObjectWrapper ob1(0,collisionObject->getCollisionShape(),collisionObject,collisionObject->getWorldTransform());
+
+			btCollisionAlgorithm* algorithm = m_world->getDispatcher()->findAlgorithm(&ob0,&ob1);
 			if (algorithm)
 			{
-				btBridgedManifoldResult contactPointResult(m_collisionObject,collisionObject, m_resultCallback);
+				btBridgedManifoldResult contactPointResult(&ob0,&ob1, m_resultCallback);
 				//discrete collision detection query
-				algorithm->processCollision(m_collisionObject,collisionObject, m_world->getDispatchInfo(),&contactPointResult);
+				
+				algorithm->processCollision(&ob0,&ob1, m_world->getDispatchInfo(),&contactPointResult);
 
 				algorithm->~btCollisionAlgorithm();
 				m_world->getDispatcher()->freeCollisionAlgorithm(algorithm);
@@ -1152,12 +1169,15 @@ void	btCollisionWorld::contactTest( btCollisionObject* colObj, ContactResultCall
 ///it reports one or more contact points (including the one with deepest penetration)
 void	btCollisionWorld::contactPairTest(btCollisionObject* colObjA, btCollisionObject* colObjB, ContactResultCallback& resultCallback)
 {
-	btCollisionAlgorithm* algorithm = getDispatcher()->findAlgorithm(colObjA,colObjB);
+	btCollisionObjectWrapper obA(0,colObjA->getCollisionShape(),colObjA,colObjA->getWorldTransform());
+	btCollisionObjectWrapper obB(0,colObjB->getCollisionShape(),colObjB,colObjB->getWorldTransform());
+
+	btCollisionAlgorithm* algorithm = getDispatcher()->findAlgorithm(&obA,&obB);
 	if (algorithm)
 	{
-		btBridgedManifoldResult contactPointResult(colObjA,colObjB, resultCallback);
+		btBridgedManifoldResult contactPointResult(&obA,&obB, resultCallback);
 		//discrete collision detection query
-		algorithm->processCollision(colObjA,colObjB, getDispatchInfo(),&contactPointResult);
+		algorithm->processCollision(&obA,&obB, getDispatchInfo(),&contactPointResult);
 
 		algorithm->~btCollisionAlgorithm();
 		getDispatcher()->freeCollisionAlgorithm(algorithm);
diff --git a/src/BulletCollision/CollisionDispatch/btCollisionWorld.h b/src/BulletCollision/CollisionDispatch/btCollisionWorld.h
index 0a92d2d6e..26220e160 100644
--- a/src/BulletCollision/CollisionDispatch/btCollisionWorld.h
+++ b/src/BulletCollision/CollisionDispatch/btCollisionWorld.h
@@ -173,7 +173,7 @@ public:
 
 	struct	LocalRayResult
 	{
-		LocalRayResult(btCollisionObject*	collisionObject, 
+		LocalRayResult(const btCollisionObject*	collisionObject, 
 			LocalShapeInfo*	localShapeInfo,
 			const btVector3&		hitNormalLocal,
 			btScalar hitFraction)
@@ -184,7 +184,7 @@ public:
 		{
 		}
 
-		btCollisionObject*		m_collisionObject;
+		const btCollisionObject*		m_collisionObject;
 		LocalShapeInfo*			m_localShapeInfo;
 		btVector3				m_hitNormalLocal;
 		btScalar				m_hitFraction;
@@ -195,7 +195,7 @@ public:
 	struct	RayResultCallback
 	{
 		btScalar	m_closestHitFraction;
-		btCollisionObject*		m_collisionObject;
+		const btCollisionObject*		m_collisionObject;
 		short int	m_collisionFilterGroup;
 		short int	m_collisionFilterMask;
       //@BP Mod - Custom flags, currently used to enable backface culling on tri-meshes, see btRaycastCallback
@@ -272,7 +272,7 @@ public:
 		{
 		}
 
-		btAlignedObjectArray<btCollisionObject*>		m_collisionObjects;
+		btAlignedObjectArray<const btCollisionObject*>		m_collisionObjects;
 
 		btVector3	m_rayFromWorld;//used to calculate hitPointWorld from hitFraction
 		btVector3	m_rayToWorld;
@@ -306,7 +306,7 @@ public:
 
 	struct LocalConvexResult
 	{
-		LocalConvexResult(btCollisionObject*	hitCollisionObject, 
+		LocalConvexResult(const btCollisionObject*	hitCollisionObject, 
 			LocalShapeInfo*	localShapeInfo,
 			const btVector3&		hitNormalLocal,
 			const btVector3&		hitPointLocal,
@@ -320,7 +320,7 @@ public:
 		{
 		}
 
-		btCollisionObject*		m_hitCollisionObject;
+		const btCollisionObject*		m_hitCollisionObject;
 		LocalShapeInfo*			m_localShapeInfo;
 		btVector3				m_hitNormalLocal;
 		btVector3				m_hitPointLocal;
@@ -376,7 +376,7 @@ public:
 
 		btVector3	m_hitNormalWorld;
 		btVector3	m_hitPointWorld;
-		btCollisionObject*	m_hitCollisionObject;
+		const btCollisionObject*	m_hitCollisionObject;
 		
 		virtual	btScalar	addSingleResult(LocalConvexResult& convexResult,bool normalInWorldSpace)
 		{
@@ -421,7 +421,7 @@ public:
 			return collides;
 		}
 
-		virtual	btScalar	addSingleResult(btManifoldPoint& cp,	const btCollisionObject* colObj0,int partId0,int index0,const btCollisionObject* colObj1,int partId1,int index1) = 0;
+		virtual	btScalar	addSingleResult(btManifoldPoint& cp,	const btCollisionObjectWrapper* colObj0Wrap,int partId0,int index0,const btCollisionObjectWrapper* colObj1Wrap,int partId1,int index1) = 0;
 	};
 
 
@@ -457,6 +457,10 @@ public:
 					  const btTransform& colObjWorldTransform,
 					  RayResultCallback& resultCallback);
 
+	static void	rayTestSingleInternal(const btTransform& rayFromTrans,const btTransform& rayToTrans,
+					  const btCollisionObjectWrapper* collisionObjectWrap,
+					  RayResultCallback& resultCallback);
+
 	/// objectQuerySingle performs a collision detection query and calls the resultCallback. It is used internally by rayTest.
 	static void	objectQuerySingle(const btConvexShape* castShape, const btTransform& rayFromTrans,const btTransform& rayToTrans,
 					  btCollisionObject* collisionObject,
@@ -464,6 +468,10 @@ public:
 					  const btTransform& colObjWorldTransform,
 					  ConvexResultCallback& resultCallback, btScalar	allowedPenetration);
 
+	static void	objectQuerySingleInternal(const btConvexShape* castShape,const btTransform& convexFromTrans,const btTransform& convexToTrans,
+											const btCollisionObjectWrapper* colObjWrap,
+											ConvexResultCallback& resultCallback, btScalar allowedPenetration);
+
 	virtual void	addCollisionObject(btCollisionObject* collisionObject,short int collisionFilterGroup=btBroadphaseProxy::DefaultFilter,short int collisionFilterMask=btBroadphaseProxy::AllFilter);
 
 	btCollisionObjectArray& getCollisionObjectArray()
diff --git a/src/BulletCollision/CollisionDispatch/btCompoundCollisionAlgorithm.cpp b/src/BulletCollision/CollisionDispatch/btCompoundCollisionAlgorithm.cpp
index 54889a637..06992416d 100644
--- a/src/BulletCollision/CollisionDispatch/btCompoundCollisionAlgorithm.cpp
+++ b/src/BulletCollision/CollisionDispatch/btCompoundCollisionAlgorithm.cpp
@@ -20,30 +20,32 @@ subject to the following restrictions:
 #include "LinearMath/btIDebugDraw.h"
 #include "LinearMath/btAabbUtil2.h"
 #include "btManifoldResult.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h"
 
-btCompoundCollisionAlgorithm::btCompoundCollisionAlgorithm( const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* body0,btCollisionObject* body1,bool isSwapped)
-:btActivatingCollisionAlgorithm(ci,body0,body1),
+btCompoundCollisionAlgorithm::btCompoundCollisionAlgorithm( const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,bool isSwapped)
+:btActivatingCollisionAlgorithm(ci,body0Wrap,body1Wrap),
 m_isSwapped(isSwapped),
 m_sharedManifold(ci.m_manifold)
 {
 	m_ownsManifold = false;
 
-	btCollisionObject* colObj = m_isSwapped? body1 : body0;
-	btAssert (colObj->getCollisionShape()->isCompound());
+	const btCollisionObjectWrapper* colObjWrap = m_isSwapped? body1Wrap : body0Wrap;
+	btAssert (colObjWrap->getCollisionShape()->isCompound());
 	
-	btCompoundShape* compoundShape = static_cast<btCompoundShape*>(colObj->getCollisionShape());
+	const btCompoundShape* compoundShape = static_cast<const btCompoundShape*>(colObjWrap->getCollisionShape());
 	m_compoundShapeRevision = compoundShape->getUpdateRevision();
 	
-	preallocateChildAlgorithms(body0,body1);
+	
+	preallocateChildAlgorithms(body0Wrap,body1Wrap);
 }
 
-void	btCompoundCollisionAlgorithm::preallocateChildAlgorithms(btCollisionObject* body0,btCollisionObject* body1)
+void	btCompoundCollisionAlgorithm::preallocateChildAlgorithms(const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
 {
-	btCollisionObject* colObj = m_isSwapped? body1 : body0;
-	btCollisionObject* otherObj = m_isSwapped? body0 : body1;
-	btAssert (colObj->getCollisionShape()->isCompound());
+	const btCollisionObjectWrapper* colObjWrap = m_isSwapped? body1Wrap : body0Wrap;
+	const btCollisionObjectWrapper* otherObjWrap = m_isSwapped? body0Wrap : body1Wrap;
+	btAssert (colObjWrap->getCollisionShape()->isCompound());
 	
-	btCompoundShape* compoundShape = static_cast<btCompoundShape*>(colObj->getCollisionShape());
+	const btCompoundShape* compoundShape = static_cast<const btCompoundShape*>(colObjWrap->getCollisionShape());
 
 	int numChildren = compoundShape->getNumChildShapes();
 	int i;
@@ -56,11 +58,11 @@ void	btCompoundCollisionAlgorithm::preallocateChildAlgorithms(btCollisionObject*
 			m_childCollisionAlgorithms[i] = 0;
 		} else
 		{
-			btCollisionShape* tmpShape = colObj->getCollisionShape();
-			btCollisionShape* childShape = compoundShape->getChildShape(i);
-			colObj->internalSetTemporaryCollisionShape( childShape );
-			m_childCollisionAlgorithms[i] = m_dispatcher->findAlgorithm(colObj,otherObj,m_sharedManifold);
-			colObj->internalSetTemporaryCollisionShape( tmpShape );
+			
+			const btCollisionShape* childShape = compoundShape->getChildShape(i);
+
+			btCollisionObjectWrapper childWrap(colObjWrap,childShape,colObjWrap->getCollisionObject(),colObjWrap->getWorldTransform());//wrong child trans, but unused (hopefully)
+			m_childCollisionAlgorithms[i] = m_dispatcher->findAlgorithm(&childWrap,otherObjWrap,m_sharedManifold);
 		}
 	}
 }
@@ -92,19 +94,16 @@ struct	btCompoundLeafCallback : btDbvt::ICollide
 
 public:
 
-	btCollisionObject* m_compoundColObj;
-	btCollisionObject* m_otherObj;
+	const btCollisionObjectWrapper* m_compoundColObjWrap;
+	const btCollisionObjectWrapper* m_otherObjWrap;
 	btDispatcher* m_dispatcher;
 	const btDispatcherInfo& m_dispatchInfo;
 	btManifoldResult*	m_resultOut;
 	btCollisionAlgorithm**	m_childCollisionAlgorithms;
 	btPersistentManifold*	m_sharedManifold;
-
-
-
-
-	btCompoundLeafCallback (btCollisionObject* compoundObj,btCollisionObject* otherObj,btDispatcher* dispatcher,const btDispatcherInfo& dispatchInfo,btManifoldResult*	resultOut,btCollisionAlgorithm**	childCollisionAlgorithms,btPersistentManifold*	sharedManifold)
-		:m_compoundColObj(compoundObj),m_otherObj(otherObj),m_dispatcher(dispatcher),m_dispatchInfo(dispatchInfo),m_resultOut(resultOut),
+	
+	btCompoundLeafCallback (const btCollisionObjectWrapper* compoundObjWrap,const btCollisionObjectWrapper* otherObjWrap,btDispatcher* dispatcher,const btDispatcherInfo& dispatchInfo,btManifoldResult*	resultOut,btCollisionAlgorithm**	childCollisionAlgorithms,btPersistentManifold*	sharedManifold)
+		:m_compoundColObjWrap(compoundObjWrap),m_otherObjWrap(otherObjWrap),m_dispatcher(dispatcher),m_dispatchInfo(dispatchInfo),m_resultOut(resultOut),
 		m_childCollisionAlgorithms(childCollisionAlgorithms),
 		m_sharedManifold(sharedManifold)
 	{
@@ -112,70 +111,81 @@ public:
 	}
 
 
-	void	ProcessChildShape(btCollisionShape* childShape,int index)
+	void	ProcessChildShape(const btCollisionShape* childShape,int index)
 	{
 		btAssert(index>=0);
-		btCompoundShape* compoundShape = static_cast<btCompoundShape*>(m_compoundColObj->getCollisionShape());
+		const btCompoundShape* compoundShape = static_cast<const btCompoundShape*>(m_compoundColObjWrap->getCollisionShape());
 		btAssert(index<compoundShape->getNumChildShapes());
 
 
 		//backup
-		btTransform	orgTrans = m_compoundColObj->getWorldTransform();
-		btTransform	orgInterpolationTrans = m_compoundColObj->getInterpolationWorldTransform();
+		btTransform	orgTrans = m_compoundColObjWrap->getWorldTransform();
+		btTransform	orgInterpolationTrans = m_compoundColObjWrap->getWorldTransform();
 		const btTransform& childTrans = compoundShape->getChildTransform(index);
 		btTransform	newChildWorldTrans = orgTrans*childTrans ;
 
 		//perform an AABB check first
 		btVector3 aabbMin0,aabbMax0,aabbMin1,aabbMax1;
 		childShape->getAabb(newChildWorldTrans,aabbMin0,aabbMax0);
-		m_otherObj->getCollisionShape()->getAabb(m_otherObj->getWorldTransform(),aabbMin1,aabbMax1);
+		m_otherObjWrap->getCollisionShape()->getAabb(m_otherObjWrap->getWorldTransform(),aabbMin1,aabbMax1);
 
 		if (TestAabbAgainstAabb2(aabbMin0,aabbMax0,aabbMin1,aabbMax1))
 		{
 
-			m_compoundColObj->setWorldTransform( newChildWorldTrans);
-			m_compoundColObj->setInterpolationWorldTransform(newChildWorldTrans);
+			btCollisionObjectWrapper compoundWrap(this->m_compoundColObjWrap,childShape,m_compoundColObjWrap->getCollisionObject(),newChildWorldTrans);
+
 
 			//the contactpoint is still projected back using the original inverted worldtrans
-			btCollisionShape* tmpShape = m_compoundColObj->getCollisionShape();
-			m_compoundColObj->internalSetTemporaryCollisionShape( childShape );
-
 			if (!m_childCollisionAlgorithms[index])
-				m_childCollisionAlgorithms[index] = m_dispatcher->findAlgorithm(m_compoundColObj,m_otherObj,m_sharedManifold);
+				m_childCollisionAlgorithms[index] = m_dispatcher->findAlgorithm(&compoundWrap,m_otherObjWrap,m_sharedManifold);
+
+			
+			const btCollisionObjectWrapper* tmpWrap = 0;
 
 			///detect swapping case
-			if (m_resultOut->getBody0Internal() == m_compoundColObj)
+			if (m_resultOut->getBody0Internal() == m_compoundColObjWrap->getCollisionObject())
 			{
+				tmpWrap = m_resultOut->getBody0Wrap();
+				m_resultOut->setBody0Wrap(&compoundWrap);
 				m_resultOut->setShapeIdentifiersA(-1,index);
 			} else
 			{
+				tmpWrap = m_resultOut->getBody1Wrap();
+				m_resultOut->setBody1Wrap(&compoundWrap);
 				m_resultOut->setShapeIdentifiersB(-1,index);
 			}
 
-			m_childCollisionAlgorithms[index]->processCollision(m_compoundColObj,m_otherObj,m_dispatchInfo,m_resultOut);
+
+			m_childCollisionAlgorithms[index]->processCollision(&compoundWrap,m_otherObjWrap,m_dispatchInfo,m_resultOut);
+
 			if (m_dispatchInfo.m_debugDraw && (m_dispatchInfo.m_debugDraw->getDebugMode() & btIDebugDraw::DBG_DrawAabb))
 			{
 				btVector3 worldAabbMin,worldAabbMax;
 				m_dispatchInfo.m_debugDraw->drawAabb(aabbMin0,aabbMax0,btVector3(1,1,1));
 				m_dispatchInfo.m_debugDraw->drawAabb(aabbMin1,aabbMax1,btVector3(1,1,1));
 			}
+
+			if (m_resultOut->getBody0Internal() == m_compoundColObjWrap->getCollisionObject())
+			{
+				m_resultOut->setBody0Wrap(tmpWrap);
+			} else
+			{
+				m_resultOut->setBody1Wrap(tmpWrap);
+			}
 			
-			//revert back transform
-			m_compoundColObj->internalSetTemporaryCollisionShape( tmpShape);
-			m_compoundColObj->setWorldTransform(  orgTrans );
-			m_compoundColObj->setInterpolationWorldTransform(orgInterpolationTrans);
 		}
 	}
 	void		Process(const btDbvtNode* leaf)
 	{
 		int index = leaf->dataAsInt;
 
-		btCompoundShape* compoundShape = static_cast<btCompoundShape*>(m_compoundColObj->getCollisionShape());
-		btCollisionShape* childShape = compoundShape->getChildShape(index);
+		const btCompoundShape* compoundShape = static_cast<const btCompoundShape*>(m_compoundColObjWrap->getCollisionShape());
+		const btCollisionShape* childShape = compoundShape->getChildShape(index);
+
 		if (m_dispatchInfo.m_debugDraw && (m_dispatchInfo.m_debugDraw->getDebugMode() & btIDebugDraw::DBG_DrawAabb))
 		{
 			btVector3 worldAabbMin,worldAabbMax;
-			btTransform	orgTrans = m_compoundColObj->getWorldTransform();
+			btTransform	orgTrans = m_compoundColObjWrap->getWorldTransform();
 			btTransformAabb(leaf->volume.Mins(),leaf->volume.Maxs(),0.,orgTrans,worldAabbMin,worldAabbMax);
 			m_dispatchInfo.m_debugDraw->drawAabb(worldAabbMin,worldAabbMax,btVector3(1,0,0));
 		}
@@ -189,15 +199,13 @@ public:
 
 
 
-void btCompoundCollisionAlgorithm::processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
+void btCompoundCollisionAlgorithm::processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
 {
-	btCollisionObject* colObj = m_isSwapped? body1 : body0;
-	btCollisionObject* otherObj = m_isSwapped? body0 : body1;
+	const btCollisionObjectWrapper* colObjWrap = m_isSwapped? body1Wrap : body0Wrap;
+	const btCollisionObjectWrapper* otherObjWrap = m_isSwapped? body0Wrap : body1Wrap;
 
-	
-
-	btAssert (colObj->getCollisionShape()->isCompound());
-	btCompoundShape* compoundShape = static_cast<btCompoundShape*>(colObj->getCollisionShape());
+	btAssert (colObjWrap->getCollisionShape()->isCompound());
+	const btCompoundShape* compoundShape = static_cast<const btCompoundShape*>(colObjWrap->getCollisionShape());
 
 	///btCompoundShape might have changed:
 	////make sure the internal child collision algorithm caches are still valid
@@ -206,13 +214,13 @@ void btCompoundCollisionAlgorithm::processCollision (btCollisionObject* body0,bt
 		///clear and update all
 		removeChildAlgorithms();
 		
-		preallocateChildAlgorithms(body0,body1);
+		preallocateChildAlgorithms(body0Wrap,body1Wrap);
 	}
 
 
-	btDbvt* tree = compoundShape->getDynamicAabbTree();
+	const btDbvt* tree = compoundShape->getDynamicAabbTree();
 	//use a dynamic aabb tree to cull potential child-overlaps
-	btCompoundLeafCallback  callback(colObj,otherObj,m_dispatcher,dispatchInfo,resultOut,&m_childCollisionAlgorithms[0],m_sharedManifold);
+	btCompoundLeafCallback  callback(colObjWrap,otherObjWrap,m_dispatcher,dispatchInfo,resultOut,&m_childCollisionAlgorithms[0],m_sharedManifold);
 
 	///we need to refresh all contact manifolds
 	///note that we should actually recursively traverse all children, btCompoundShape can nested more then 1 level deep
@@ -244,8 +252,8 @@ void btCompoundCollisionAlgorithm::processCollision (btCollisionObject* body0,bt
 
 		btVector3 localAabbMin,localAabbMax;
 		btTransform otherInCompoundSpace;
-		otherInCompoundSpace = colObj->getWorldTransform().inverse() * otherObj->getWorldTransform();
-		otherObj->getCollisionShape()->getAabb(otherInCompoundSpace,localAabbMin,localAabbMax);
+		otherInCompoundSpace = colObjWrap->getWorldTransform().inverse() * otherObjWrap->getWorldTransform();
+		otherObjWrap->getCollisionShape()->getAabb(otherInCompoundSpace,localAabbMin,localAabbMax);
 
 		const ATTRIBUTE_ALIGNED16(btDbvtVolume)	bounds=btDbvtVolume::FromMM(localAabbMin,localAabbMax);
 		//process all children, that overlap with  the given AABB bounds
@@ -267,7 +275,7 @@ void btCompoundCollisionAlgorithm::processCollision (btCollisionObject* body0,bt
 		int numChildren = m_childCollisionAlgorithms.size();
 		int i;
 		btManifoldArray	manifoldArray;
-        btCollisionShape* childShape = 0;
+        const btCollisionShape* childShape = 0;
         btTransform	orgTrans;
         btTransform	orgInterpolationTrans;
         btTransform	newChildWorldTrans;
@@ -279,14 +287,14 @@ void btCompoundCollisionAlgorithm::processCollision (btCollisionObject* body0,bt
 			{
 				childShape = compoundShape->getChildShape(i);
 			//if not longer overlapping, remove the algorithm
-                orgTrans = colObj->getWorldTransform();
-                orgInterpolationTrans = colObj->getInterpolationWorldTransform();
+                orgTrans = colObjWrap->getWorldTransform();
+                orgInterpolationTrans = colObjWrap->getWorldTransform();
 				const btTransform& childTrans = compoundShape->getChildTransform(i);
                 newChildWorldTrans = orgTrans*childTrans ;
 
 				//perform an AABB check first
 				childShape->getAabb(newChildWorldTrans,aabbMin0,aabbMax0);
-				otherObj->getCollisionShape()->getAabb(otherObj->getWorldTransform(),aabbMin1,aabbMax1);
+				otherObjWrap->getCollisionShape()->getAabb(otherObjWrap->getWorldTransform(),aabbMin1,aabbMax1);
 
 				if (!TestAabbAgainstAabb2(aabbMin0,aabbMax0,aabbMin1,aabbMax1))
 				{
@@ -301,7 +309,8 @@ void btCompoundCollisionAlgorithm::processCollision (btCollisionObject* body0,bt
 
 btScalar	btCompoundCollisionAlgorithm::calculateTimeOfImpact(btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
 {
-
+	btAssert(0);
+	//needs to be fixed, using btCollisionObjectWrapper and NOT modifying internal data structures
 	btCollisionObject* colObj = m_isSwapped? body1 : body0;
 	btCollisionObject* otherObj = m_isSwapped? body0 : body1;
 
@@ -324,8 +333,7 @@ btScalar	btCompoundCollisionAlgorithm::calculateTimeOfImpact(btCollisionObject*
     btScalar frac;
 	for (i=0;i<numChildren;i++)
 	{
-		//temporarily exchange parent btCollisionShape with childShape, and recurse
-		btCollisionShape* childShape = compoundShape->getChildShape(i);
+		//btCollisionShape* childShape = compoundShape->getChildShape(i);
 
 		//backup
         orgTrans = colObj->getWorldTransform();
@@ -334,15 +342,15 @@ btScalar	btCompoundCollisionAlgorithm::calculateTimeOfImpact(btCollisionObject*
 		//btTransform	newChildWorldTrans = orgTrans*childTrans ;
 		colObj->setWorldTransform( orgTrans*childTrans );
 
-		btCollisionShape* tmpShape = colObj->getCollisionShape();
-		colObj->internalSetTemporaryCollisionShape( childShape );
+		//btCollisionShape* tmpShape = colObj->getCollisionShape();
+		//colObj->internalSetTemporaryCollisionShape( childShape );
         frac = m_childCollisionAlgorithms[i]->calculateTimeOfImpact(colObj,otherObj,dispatchInfo,resultOut);
 		if (frac<hitFraction)
 		{
 			hitFraction = frac;
 		}
 		//revert back
-		colObj->internalSetTemporaryCollisionShape( tmpShape);
+		//colObj->internalSetTemporaryCollisionShape( tmpShape);
 		colObj->setWorldTransform( orgTrans);
 	}
 	return hitFraction;
diff --git a/src/BulletCollision/CollisionDispatch/btCompoundCollisionAlgorithm.h b/src/BulletCollision/CollisionDispatch/btCompoundCollisionAlgorithm.h
index 404574989..b16fc5246 100644
--- a/src/BulletCollision/CollisionDispatch/btCompoundCollisionAlgorithm.h
+++ b/src/BulletCollision/CollisionDispatch/btCompoundCollisionAlgorithm.h
@@ -41,15 +41,15 @@ class btCompoundCollisionAlgorithm  : public btActivatingCollisionAlgorithm
 	
 	void	removeChildAlgorithms();
 	
-	void	preallocateChildAlgorithms(btCollisionObject* body0,btCollisionObject* body1);
+	void	preallocateChildAlgorithms(const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap);
 
 public:
 
-	btCompoundCollisionAlgorithm( const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* body0,btCollisionObject* body1,bool isSwapped);
+	btCompoundCollisionAlgorithm( const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,bool isSwapped);
 
 	virtual ~btCompoundCollisionAlgorithm();
 
-	virtual void processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
+	virtual void processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 
 	btScalar	calculateTimeOfImpact(btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 
@@ -65,19 +65,19 @@ public:
 
 	struct CreateFunc :public 	btCollisionAlgorithmCreateFunc
 	{
-		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, btCollisionObject* body0,btCollisionObject* body1)
+		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
 		{
 			void* mem = ci.m_dispatcher1->allocateCollisionAlgorithm(sizeof(btCompoundCollisionAlgorithm));
-			return new(mem) btCompoundCollisionAlgorithm(ci,body0,body1,false);
+			return new(mem) btCompoundCollisionAlgorithm(ci,body0Wrap,body1Wrap,false);
 		}
 	};
 
 	struct SwappedCreateFunc :public 	btCollisionAlgorithmCreateFunc
 	{
-		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, btCollisionObject* body0,btCollisionObject* body1)
+		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
 		{
 			void* mem = ci.m_dispatcher1->allocateCollisionAlgorithm(sizeof(btCompoundCollisionAlgorithm));
-			return new(mem) btCompoundCollisionAlgorithm(ci,body0,body1,true);
+			return new(mem) btCompoundCollisionAlgorithm(ci,body0Wrap,body1Wrap,true);
 		}
 	};
 
diff --git a/src/BulletCollision/CollisionDispatch/btConvex2dConvex2dAlgorithm.cpp b/src/BulletCollision/CollisionDispatch/btConvex2dConvex2dAlgorithm.cpp
index db7f884ac..3e1afede1 100644
--- a/src/BulletCollision/CollisionDispatch/btConvex2dConvex2dAlgorithm.cpp
+++ b/src/BulletCollision/CollisionDispatch/btConvex2dConvex2dAlgorithm.cpp
@@ -43,7 +43,7 @@ subject to the following restrictions:
 
 #include "BulletCollision/NarrowPhaseCollision/btGjkEpa2.h"
 #include "BulletCollision/NarrowPhaseCollision/btGjkEpaPenetrationDepthSolver.h"
-
+#include "BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h"
 
 btConvex2dConvex2dAlgorithm::CreateFunc::CreateFunc(btSimplexSolverInterface*			simplexSolver, btConvexPenetrationDepthSolver* pdSolver)
 {
@@ -57,8 +57,8 @@ btConvex2dConvex2dAlgorithm::CreateFunc::~CreateFunc()
 { 
 }
 
-btConvex2dConvex2dAlgorithm::btConvex2dConvex2dAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* body0,btCollisionObject* body1,btSimplexSolverInterface* simplexSolver, btConvexPenetrationDepthSolver* pdSolver,int numPerturbationIterations, int minimumPointsPerturbationThreshold)
-: btActivatingCollisionAlgorithm(ci,body0,body1),
+btConvex2dConvex2dAlgorithm::btConvex2dConvex2dAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,btSimplexSolverInterface* simplexSolver, btConvexPenetrationDepthSolver* pdSolver,int numPerturbationIterations, int minimumPointsPerturbationThreshold)
+: btActivatingCollisionAlgorithm(ci,body0Wrap,body1Wrap),
 m_simplexSolver(simplexSolver),
 m_pdSolver(pdSolver),
 m_ownManifold (false),
@@ -67,8 +67,8 @@ m_lowLevelOfDetail(false),
  m_numPerturbationIterations(numPerturbationIterations),
 m_minimumPointsPerturbationThreshold(minimumPointsPerturbationThreshold)
 {
-	(void)body0;
-	(void)body1;
+	(void)body0Wrap;
+	(void)body1Wrap;
 }
 
 
@@ -96,13 +96,13 @@ extern btScalar gContactBreakingThreshold;
 //
 // Convex-Convex collision algorithm
 //
-void btConvex2dConvex2dAlgorithm ::processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
+void btConvex2dConvex2dAlgorithm ::processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
 {
 
 	if (!m_manifoldPtr)
 	{
 		//swapped?
-		m_manifoldPtr = m_dispatcher->getNewManifold(body0,body1);
+		m_manifoldPtr = m_dispatcher->getNewManifold(body0Wrap->getCollisionObject(),body1Wrap->getCollisionObject());
 		m_ownManifold = true;
 	}
 	resultOut->setPersistentManifold(m_manifoldPtr);
@@ -111,8 +111,8 @@ void btConvex2dConvex2dAlgorithm ::processCollision (btCollisionObject* body0,bt
 	//resultOut->getPersistentManifold()->clearManifold();
 
 
-	btConvexShape* min0 = static_cast<btConvexShape*>(body0->getCollisionShape());
-	btConvexShape* min1 = static_cast<btConvexShape*>(body1->getCollisionShape());
+	const btConvexShape* min0 = static_cast<const btConvexShape*>(body0Wrap->getCollisionShape());
+	const btConvexShape* min1 = static_cast<const btConvexShape*>(body1Wrap->getCollisionShape());
 
 	btVector3  normalOnB;
 	btVector3  pointOnBWorld;
@@ -133,8 +133,8 @@ void btConvex2dConvex2dAlgorithm ::processCollision (btCollisionObject* body0,bt
 		}
 
 		input.m_stackAlloc = dispatchInfo.m_stackAllocator;
-		input.m_transformA = body0->getWorldTransform();
-		input.m_transformB = body1->getWorldTransform();
+		input.m_transformA = body0Wrap->getWorldTransform();
+		input.m_transformB = body1Wrap->getWorldTransform();
 
 		gjkPairDetector.getClosestPoints(input,*resultOut,dispatchInfo.m_debugDraw);
 
diff --git a/src/BulletCollision/CollisionDispatch/btConvex2dConvex2dAlgorithm.h b/src/BulletCollision/CollisionDispatch/btConvex2dConvex2dAlgorithm.h
index 53d13b871..18d9385a1 100644
--- a/src/BulletCollision/CollisionDispatch/btConvex2dConvex2dAlgorithm.h
+++ b/src/BulletCollision/CollisionDispatch/btConvex2dConvex2dAlgorithm.h
@@ -45,12 +45,12 @@ class btConvex2dConvex2dAlgorithm : public btActivatingCollisionAlgorithm
 
 public:
 
-	btConvex2dConvex2dAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* body0,btCollisionObject* body1, btSimplexSolverInterface* simplexSolver, btConvexPenetrationDepthSolver* pdSolver, int numPerturbationIterations, int minimumPointsPerturbationThreshold);
+	btConvex2dConvex2dAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap, btSimplexSolverInterface* simplexSolver, btConvexPenetrationDepthSolver* pdSolver, int numPerturbationIterations, int minimumPointsPerturbationThreshold);
 
 
 	virtual ~btConvex2dConvex2dAlgorithm();
 
-	virtual void processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
+	virtual void processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 
 	virtual btScalar calculateTimeOfImpact(btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 
@@ -82,10 +82,10 @@ public:
 		
 		virtual ~CreateFunc();
 
-		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, btCollisionObject* body0,btCollisionObject* body1)
+		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
 		{
 			void* mem = ci.m_dispatcher1->allocateCollisionAlgorithm(sizeof(btConvex2dConvex2dAlgorithm));
-			return new(mem) btConvex2dConvex2dAlgorithm(ci.m_manifold,ci,body0,body1,m_simplexSolver,m_pdSolver,m_numPerturbationIterations,m_minimumPointsPerturbationThreshold);
+			return new(mem) btConvex2dConvex2dAlgorithm(ci.m_manifold,ci,body0Wrap,body1Wrap,m_simplexSolver,m_pdSolver,m_numPerturbationIterations,m_minimumPointsPerturbationThreshold);
 		}
 	};
 
diff --git a/src/BulletCollision/CollisionDispatch/btConvexConcaveCollisionAlgorithm.cpp b/src/BulletCollision/CollisionDispatch/btConvexConcaveCollisionAlgorithm.cpp
index d2b2c2214..6905e9737 100644
--- a/src/BulletCollision/CollisionDispatch/btConvexConcaveCollisionAlgorithm.cpp
+++ b/src/BulletCollision/CollisionDispatch/btConvexConcaveCollisionAlgorithm.cpp
@@ -25,11 +25,12 @@ subject to the following restrictions:
 #include "BulletCollision/CollisionShapes/btSphereShape.h"
 #include "LinearMath/btIDebugDraw.h"
 #include "BulletCollision/NarrowPhaseCollision/btSubSimplexConvexCast.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h"
 
-btConvexConcaveCollisionAlgorithm::btConvexConcaveCollisionAlgorithm( const btCollisionAlgorithmConstructionInfo& ci, btCollisionObject* body0,btCollisionObject* body1,bool isSwapped)
-: btActivatingCollisionAlgorithm(ci,body0,body1),
+btConvexConcaveCollisionAlgorithm::btConvexConcaveCollisionAlgorithm( const btCollisionAlgorithmConstructionInfo& ci, const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,bool isSwapped)
+: btActivatingCollisionAlgorithm(ci,body0Wrap,body1Wrap),
 m_isSwapped(isSwapped),
-m_btConvexTriangleCallback(ci.m_dispatcher1,body0,body1,isSwapped)
+m_btConvexTriangleCallback(ci.m_dispatcher1,body0Wrap,body1Wrap,isSwapped)
 {
 }
 
@@ -46,17 +47,17 @@ void	btConvexConcaveCollisionAlgorithm::getAllContactManifolds(btManifoldArray&
 }
 
 
-btConvexTriangleCallback::btConvexTriangleCallback(btDispatcher*  dispatcher,btCollisionObject* body0,btCollisionObject* body1,bool isSwapped):
+btConvexTriangleCallback::btConvexTriangleCallback(btDispatcher*  dispatcher,const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,bool isSwapped):
 	  m_dispatcher(dispatcher),
 	m_dispatchInfoPtr(0)
 {
-	m_convexBody = isSwapped? body1:body0;
-	m_triBody = isSwapped? body0:body1;
+	m_convexBodyWrap = isSwapped? body1Wrap:body0Wrap;
+	m_triBodyWrap = isSwapped? body0Wrap:body1Wrap;
 	
 	  //
 	  // create the manifold from the dispatcher 'manifold pool'
 	  //
-	  m_manifoldPtr = m_dispatcher->getNewManifold(m_convexBody,m_triBody);
+	  m_manifoldPtr = m_dispatcher->getNewManifold(m_convexBodyWrap->getCollisionObject(),m_triBodyWrap->getCollisionObject());
 
   	  clearCache();
 }
@@ -88,7 +89,7 @@ void btConvexTriangleCallback::processTriangle(btVector3* triangle,int partId, i
 	btCollisionAlgorithmConstructionInfo ci;
 	ci.m_dispatcher1 = m_dispatcher;
 
-	btCollisionObject* ob = static_cast<btCollisionObject*>(m_triBody);
+	//const btCollisionObject* ob = static_cast<btCollisionObject*>(m_triBodyWrap->getCollisionObject());
 
 
 #if 0	
@@ -103,46 +104,63 @@ void btConvexTriangleCallback::processTriangle(btVector3* triangle,int partId, i
 	}
 #endif
 	
-	if (m_convexBody->getCollisionShape()->isConvex())
+	if (m_convexBodyWrap->getCollisionShape()->isConvex())
 	{
 		btTriangleShape tm(triangle[0],triangle[1],triangle[2]);	
 		tm.setMargin(m_collisionMarginTriangle);
 		
-		btCollisionShape* tmpShape = ob->getCollisionShape();
-		ob->internalSetTemporaryCollisionShape( &tm );
+		
+		btCollisionObjectWrapper triObWrap(m_triBodyWrap,&tm,m_triBodyWrap->getCollisionObject(),m_triBodyWrap->getWorldTransform());//correct transform?
+		btCollisionAlgorithm* colAlgo = ci.m_dispatcher1->findAlgorithm(m_convexBodyWrap,&triObWrap,m_manifoldPtr);
 
-		btCollisionAlgorithm* colAlgo = ci.m_dispatcher1->findAlgorithm(m_convexBody,m_triBody,m_manifoldPtr);
+		const btCollisionObjectWrapper* tmpWrap = 0;
 
-		if (m_resultOut->getBody0Internal() == m_triBody)
+		if (m_resultOut->getBody0Internal() == m_triBodyWrap->getCollisionObject())
 		{
+			tmpWrap = m_resultOut->getBody0Wrap();
+			m_resultOut->setBody0Wrap(&triObWrap);
 			m_resultOut->setShapeIdentifiersA(partId,triangleIndex);
 		}
 		else
 		{
+			tmpWrap = m_resultOut->getBody1Wrap();
+			m_resultOut->setBody1Wrap(&triObWrap);
 			m_resultOut->setShapeIdentifiersB(partId,triangleIndex);
 		}
 	
-		colAlgo->processCollision(m_convexBody,m_triBody,*m_dispatchInfoPtr,m_resultOut);
+		colAlgo->processCollision(m_convexBodyWrap,&triObWrap,*m_dispatchInfoPtr,m_resultOut);
+
+		if (m_resultOut->getBody0Internal() == m_triBodyWrap->getCollisionObject())
+		{
+			m_resultOut->setBody0Wrap(tmpWrap);
+		} else
+		{
+			m_resultOut->setBody1Wrap(tmpWrap);
+		}
+		
+
+
 		colAlgo->~btCollisionAlgorithm();
 		ci.m_dispatcher1->freeCollisionAlgorithm(colAlgo);
-		ob->internalSetTemporaryCollisionShape( tmpShape);
 	}
 
-
 }
 
 
 
-void	btConvexTriangleCallback::setTimeStepAndCounters(btScalar collisionMarginTriangle,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
+void	btConvexTriangleCallback::setTimeStepAndCounters(btScalar collisionMarginTriangle,const btDispatcherInfo& dispatchInfo,const btCollisionObjectWrapper* convexBodyWrap, const btCollisionObjectWrapper* triBodyWrap, btManifoldResult* resultOut)
 {
+	m_convexBodyWrap = convexBodyWrap;
+	m_triBodyWrap = triBodyWrap;
+
 	m_dispatchInfoPtr = &dispatchInfo;
 	m_collisionMarginTriangle = collisionMarginTriangle;
 	m_resultOut = resultOut;
 
 	//recalc aabbs
 	btTransform convexInTriangleSpace;
-	convexInTriangleSpace = m_triBody->getWorldTransform().inverse() * m_convexBody->getWorldTransform();
-	btCollisionShape* convexShape = static_cast<btCollisionShape*>(m_convexBody->getCollisionShape());
+	convexInTriangleSpace = m_triBodyWrap->getWorldTransform().inverse() * m_convexBodyWrap->getWorldTransform();
+	const btCollisionShape* convexShape = static_cast<const btCollisionShape*>(m_convexBodyWrap->getCollisionShape());
 	//CollisionShape* triangleShape = static_cast<btCollisionShape*>(triBody->m_collisionShape);
 	convexShape->getAabb(convexInTriangleSpace,m_aabbMin,m_aabbMax);
 	btScalar extraMargin = collisionMarginTriangle;
@@ -159,35 +177,34 @@ void btConvexConcaveCollisionAlgorithm::clearCache()
 
 }
 
-void btConvexConcaveCollisionAlgorithm::processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
+void btConvexConcaveCollisionAlgorithm::processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
 {
 	
 	
-	btCollisionObject* convexBody = m_isSwapped ? body1 : body0;
-	btCollisionObject* triBody = m_isSwapped ? body0 : body1;
+	const btCollisionObjectWrapper* convexBodyWrap = m_isSwapped ? body1Wrap : body0Wrap;
+	const btCollisionObjectWrapper* triBodyWrap = m_isSwapped ? body0Wrap : body1Wrap;
 
-	if (triBody->getCollisionShape()->isConcave())
+	if (triBodyWrap->getCollisionShape()->isConcave())
 	{
 
 
-		btCollisionObject*	triOb = triBody;
-		btConcaveShape* concaveShape = static_cast<btConcaveShape*>( triOb->getCollisionShape());
 		
-		if (convexBody->getCollisionShape()->isConvex())
+		const btConcaveShape* concaveShape = static_cast<const btConcaveShape*>( triBodyWrap->getCollisionShape());
+		
+		if (convexBodyWrap->getCollisionShape()->isConvex())
 		{
 			btScalar collisionMarginTriangle = concaveShape->getMargin();
 					
 			resultOut->setPersistentManifold(m_btConvexTriangleCallback.m_manifoldPtr);
-			m_btConvexTriangleCallback.setTimeStepAndCounters(collisionMarginTriangle,dispatchInfo,resultOut);
+			m_btConvexTriangleCallback.setTimeStepAndCounters(collisionMarginTriangle,dispatchInfo,convexBodyWrap,triBodyWrap,resultOut);
 
-			//Disable persistency. previously, some older algorithm calculated all contacts in one go, so you can clear it here.
-			//m_dispatcher->clearManifold(m_btConvexTriangleCallback.m_manifoldPtr);
-
-			m_btConvexTriangleCallback.m_manifoldPtr->setBodies(convexBody,triBody);
+			m_btConvexTriangleCallback.m_manifoldPtr->setBodies(convexBodyWrap->getCollisionObject(),triBodyWrap->getCollisionObject());
 
 			concaveShape->processAllTriangles( &m_btConvexTriangleCallback,m_btConvexTriangleCallback.getAabbMin(),m_btConvexTriangleCallback.getAabbMax());
 			
 			resultOut->refreshContactPoints();
+
+			m_btConvexTriangleCallback.clearWrapperData();
 	
 		}
 	
diff --git a/src/BulletCollision/CollisionDispatch/btConvexConcaveCollisionAlgorithm.h b/src/BulletCollision/CollisionDispatch/btConvexConcaveCollisionAlgorithm.h
index f718d1dec..e90d06eb1 100644
--- a/src/BulletCollision/CollisionDispatch/btConvexConcaveCollisionAlgorithm.h
+++ b/src/BulletCollision/CollisionDispatch/btConvexConcaveCollisionAlgorithm.h
@@ -28,8 +28,8 @@ class btDispatcher;
 ///For each triangle in the concave mesh that overlaps with the AABB of a convex (m_convexProxy), processTriangle is called.
 class btConvexTriangleCallback : public btTriangleCallback
 {
-	btCollisionObject* m_convexBody;
-	btCollisionObject* m_triBody;
+	const btCollisionObjectWrapper* m_convexBodyWrap;
+	const btCollisionObjectWrapper* m_triBodyWrap;
 
 	btVector3	m_aabbMin;
 	btVector3	m_aabbMax ;
@@ -45,10 +45,15 @@ int	m_triangleCount;
 	
 	btPersistentManifold*	m_manifoldPtr;
 
-	btConvexTriangleCallback(btDispatcher* dispatcher,btCollisionObject* body0,btCollisionObject* body1,bool isSwapped);
+	btConvexTriangleCallback(btDispatcher* dispatcher,const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,bool isSwapped);
 
-	void	setTimeStepAndCounters(btScalar collisionMarginTriangle,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
+	void	setTimeStepAndCounters(btScalar collisionMarginTriangle,const btDispatcherInfo& dispatchInfo,const btCollisionObjectWrapper* convexBodyWrap, const btCollisionObjectWrapper* triBodyWrap, btManifoldResult* resultOut);
 
+	void	clearWrapperData()
+	{
+		m_convexBodyWrap = 0;
+		m_triBodyWrap = 0;
+	}
 	virtual ~btConvexTriangleCallback();
 
 	virtual void processTriangle(btVector3* triangle, int partId, int triangleIndex);
@@ -81,11 +86,11 @@ class btConvexConcaveCollisionAlgorithm  : public btActivatingCollisionAlgorithm
 
 public:
 
-	btConvexConcaveCollisionAlgorithm( const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* body0,btCollisionObject* body1,bool isSwapped);
+	btConvexConcaveCollisionAlgorithm( const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,bool isSwapped);
 
 	virtual ~btConvexConcaveCollisionAlgorithm();
 
-	virtual void processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
+	virtual void processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 
 	btScalar	calculateTimeOfImpact(btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 
@@ -95,19 +100,19 @@ public:
 
 	struct CreateFunc :public 	btCollisionAlgorithmCreateFunc
 	{
-		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, btCollisionObject* body0,btCollisionObject* body1)
+		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
 		{
 			void* mem = ci.m_dispatcher1->allocateCollisionAlgorithm(sizeof(btConvexConcaveCollisionAlgorithm));
-			return new(mem) btConvexConcaveCollisionAlgorithm(ci,body0,body1,false);
+			return new(mem) btConvexConcaveCollisionAlgorithm(ci,body0Wrap,body1Wrap,false);
 		}
 	};
 
 	struct SwappedCreateFunc :public 	btCollisionAlgorithmCreateFunc
 	{
-		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, btCollisionObject* body0,btCollisionObject* body1)
+		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
 		{
 			void* mem = ci.m_dispatcher1->allocateCollisionAlgorithm(sizeof(btConvexConcaveCollisionAlgorithm));
-			return new(mem) btConvexConcaveCollisionAlgorithm(ci,body0,body1,true);
+			return new(mem) btConvexConcaveCollisionAlgorithm(ci,body0Wrap,body1Wrap,true);
 		}
 	};
 
diff --git a/src/BulletCollision/CollisionDispatch/btConvexConvexAlgorithm.cpp b/src/BulletCollision/CollisionDispatch/btConvexConvexAlgorithm.cpp
index dd1f3e249..72a39c530 100644
--- a/src/BulletCollision/CollisionDispatch/btConvexConvexAlgorithm.cpp
+++ b/src/BulletCollision/CollisionDispatch/btConvexConvexAlgorithm.cpp
@@ -52,7 +52,7 @@ subject to the following restrictions:
 #include "BulletCollision/NarrowPhaseCollision/btGjkEpa2.h"
 #include "BulletCollision/NarrowPhaseCollision/btGjkEpaPenetrationDepthSolver.h"
 #include "BulletCollision/NarrowPhaseCollision/btPolyhedralContactClipping.h"
-
+#include "BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h"
 
 ///////////
 
@@ -191,8 +191,8 @@ btConvexConvexAlgorithm::CreateFunc::~CreateFunc()
 { 
 }
 
-btConvexConvexAlgorithm::btConvexConvexAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* body0,btCollisionObject* body1,btSimplexSolverInterface* simplexSolver, btConvexPenetrationDepthSolver* pdSolver,int numPerturbationIterations, int minimumPointsPerturbationThreshold)
-: btActivatingCollisionAlgorithm(ci,body0,body1),
+btConvexConvexAlgorithm::btConvexConvexAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,btSimplexSolverInterface* simplexSolver, btConvexPenetrationDepthSolver* pdSolver,int numPerturbationIterations, int minimumPointsPerturbationThreshold)
+: btActivatingCollisionAlgorithm(ci,body0Wrap,body1Wrap),
 m_simplexSolver(simplexSolver),
 m_pdSolver(pdSolver),
 m_ownManifold (false),
@@ -205,8 +205,8 @@ m_sepDistance((static_cast<btConvexShape*>(body0->getCollisionShape()))->getAngu
 m_numPerturbationIterations(numPerturbationIterations),
 m_minimumPointsPerturbationThreshold(minimumPointsPerturbationThreshold)
 {
-	(void)body0;
-	(void)body1;
+	(void)body0Wrap;
+	(void)body1Wrap;
 }
 
 
@@ -289,13 +289,13 @@ extern btScalar gContactBreakingThreshold;
 //
 // Convex-Convex collision algorithm
 //
-void btConvexConvexAlgorithm ::processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
+void btConvexConvexAlgorithm ::processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
 {
 
 	if (!m_manifoldPtr)
 	{
 		//swapped?
-		m_manifoldPtr = m_dispatcher->getNewManifold(body0,body1);
+		m_manifoldPtr = m_dispatcher->getNewManifold(body0Wrap->getCollisionObject(),body1Wrap->getCollisionObject());
 		m_ownManifold = true;
 	}
 	resultOut->setPersistentManifold(m_manifoldPtr);
@@ -304,8 +304,8 @@ void btConvexConvexAlgorithm ::processCollision (btCollisionObject* body0,btColl
 	//resultOut->getPersistentManifold()->clearManifold();
 	
 
-	btConvexShape* min0 = static_cast<btConvexShape*>(body0->getCollisionShape());
-	btConvexShape* min1 = static_cast<btConvexShape*>(body1->getCollisionShape());
+	const btConvexShape* min0 = static_cast<const btConvexShape*>(body0Wrap->getCollisionShape());
+	const btConvexShape* min1 = static_cast<const btConvexShape*>(body1Wrap->getCollisionShape());
 
 	btVector3  normalOnB;
 		btVector3  pointOnBWorld;
@@ -321,7 +321,7 @@ void btConvexConvexAlgorithm ::processCollision (btCollisionObject* body0,btColl
 
 		btScalar dist = capsuleCapsuleDistance(normalOnB,	pointOnBWorld,capsuleA->getHalfHeight(),capsuleA->getRadius(),
 			capsuleB->getHalfHeight(),capsuleB->getRadius(),capsuleA->getUpAxis(),capsuleB->getUpAxis(),
-			body0->getWorldTransform(),body1->getWorldTransform(),threshold);
+			body0Wrap->getWorldTransform(),body1Wrap->getWorldTransform(),threshold);
 
 		if (dist<threshold)
 		{
@@ -374,8 +374,8 @@ void btConvexConvexAlgorithm ::processCollision (btCollisionObject* body0,btColl
 	}
 
 	input.m_stackAlloc = dispatchInfo.m_stackAllocator;
-	input.m_transformA = body0->getWorldTransform();
-	input.m_transformB = body1->getWorldTransform();
+	input.m_transformA = body0Wrap->getWorldTransform();
+	input.m_transformB = body1Wrap->getWorldTransform();
 
 
 
@@ -429,8 +429,8 @@ void btConvexConvexAlgorithm ::processCollision (btCollisionObject* body0,btColl
 			{
 				foundSepAxis = btPolyhedralContactClipping::findSeparatingAxis(
 					*polyhedronA->getConvexPolyhedron(), *polyhedronB->getConvexPolyhedron(),
-					body0->getWorldTransform(), 
-					body1->getWorldTransform(),
+					body0Wrap->getWorldTransform(), 
+					body1Wrap->getWorldTransform(),
 					sepNormalWorldSpace);
 			} else
 			{
@@ -460,8 +460,8 @@ void btConvexConvexAlgorithm ::processCollision (btCollisionObject* body0,btColl
 //				printf("sepNormalWorldSpace=%f,%f,%f\n",sepNormalWorldSpace.getX(),sepNormalWorldSpace.getY(),sepNormalWorldSpace.getZ());
 
 				btPolyhedralContactClipping::clipHullAgainstHull(sepNormalWorldSpace, *polyhedronA->getConvexPolyhedron(), *polyhedronB->getConvexPolyhedron(),
-					body0->getWorldTransform(), 
-					body1->getWorldTransform(), minDist-threshold, threshold, *resultOut);
+					body0Wrap->getWorldTransform(), 
+					body1Wrap->getWorldTransform(), minDist-threshold, threshold, *resultOut);
  				
 			}
 			if (m_ownManifold)
@@ -478,9 +478,9 @@ void btConvexConvexAlgorithm ::processCollision (btCollisionObject* body0,btColl
 
 				btVertexArray vertices;
 				btTriangleShape* tri = (btTriangleShape*)polyhedronB;
-				vertices.push_back(	body1->getWorldTransform()*tri->m_vertices1[0]);
-				vertices.push_back(	body1->getWorldTransform()*tri->m_vertices1[1]);
-				vertices.push_back(	body1->getWorldTransform()*tri->m_vertices1[2]);
+				vertices.push_back(	body1Wrap->getWorldTransform()*tri->m_vertices1[0]);
+				vertices.push_back(	body1Wrap->getWorldTransform()*tri->m_vertices1[1]);
+				vertices.push_back(	body1Wrap->getWorldTransform()*tri->m_vertices1[2]);
 				
 				//tri->initializePolyhedralFeatures();
 
@@ -496,8 +496,8 @@ void btConvexConvexAlgorithm ::processCollision (btCollisionObject* body0,btColl
 					polyhedronB->initializePolyhedralFeatures();
 					 foundSepAxis = btPolyhedralContactClipping::findSeparatingAxis(
 					*polyhedronA->getConvexPolyhedron(), *polyhedronB->getConvexPolyhedron(),
-					body0->getWorldTransform(), 
-					body1->getWorldTransform(),
+					body0Wrap->getWorldTransform(), 
+					body1Wrap->getWorldTransform(),
 					sepNormalWorldSpace);
 				//	 printf("sepNormalWorldSpace=%f,%f,%f\n",sepNormalWorldSpace.getX(),sepNormalWorldSpace.getY(),sepNormalWorldSpace.getZ());
 
@@ -525,7 +525,7 @@ void btConvexConvexAlgorithm ::processCollision (btCollisionObject* body0,btColl
 			if (foundSepAxis)
 			{
 				btPolyhedralContactClipping::clipFaceAgainstHull(sepNormalWorldSpace, *polyhedronA->getConvexPolyhedron(), 
-					body0->getWorldTransform(), vertices, minDist-threshold, maxDist, *resultOut);
+					body0Wrap->getWorldTransform(), vertices, minDist-threshold, maxDist, *resultOut);
 			}
 				
 				
@@ -599,15 +599,15 @@ void btConvexConvexAlgorithm ::processCollision (btCollisionObject* body0,btColl
 				
 				if (perturbeA)
 				{
-					input.m_transformA.setBasis(  btMatrix3x3(rotq.inverse()*perturbeRot*rotq)*body0->getWorldTransform().getBasis());
-					input.m_transformB = body1->getWorldTransform();
+					input.m_transformA.setBasis(  btMatrix3x3(rotq.inverse()*perturbeRot*rotq)*body0Wrap->getWorldTransform().getBasis());
+					input.m_transformB = body1Wrap->getWorldTransform();
 	#ifdef DEBUG_CONTACTS
 					dispatchInfo.m_debugDraw->drawTransform(input.m_transformA,10.0);
 	#endif //DEBUG_CONTACTS
 				} else
 				{
-					input.m_transformA = body0->getWorldTransform();
-					input.m_transformB.setBasis( btMatrix3x3(rotq.inverse()*perturbeRot*rotq)*body1->getWorldTransform().getBasis());
+					input.m_transformA = body0Wrap->getWorldTransform();
+					input.m_transformB.setBasis( btMatrix3x3(rotq.inverse()*perturbeRot*rotq)*body1Wrap->getWorldTransform().getBasis());
 	#ifdef DEBUG_CONTACTS
 					dispatchInfo.m_debugDraw->drawTransform(input.m_transformB,10.0);
 	#endif
diff --git a/src/BulletCollision/CollisionDispatch/btConvexConvexAlgorithm.h b/src/BulletCollision/CollisionDispatch/btConvexConvexAlgorithm.h
index 4380b80eb..51db0c654 100644
--- a/src/BulletCollision/CollisionDispatch/btConvexConvexAlgorithm.h
+++ b/src/BulletCollision/CollisionDispatch/btConvexConvexAlgorithm.h
@@ -59,12 +59,11 @@ class btConvexConvexAlgorithm : public btActivatingCollisionAlgorithm
 
 public:
 
-	btConvexConvexAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* body0,btCollisionObject* body1, btSimplexSolverInterface* simplexSolver, btConvexPenetrationDepthSolver* pdSolver, int numPerturbationIterations, int minimumPointsPerturbationThreshold);
-
+	btConvexConvexAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap, btSimplexSolverInterface* simplexSolver, btConvexPenetrationDepthSolver* pdSolver, int numPerturbationIterations, int minimumPointsPerturbationThreshold);
 
 	virtual ~btConvexConvexAlgorithm();
 
-	virtual void processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
+	virtual void processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 
 	virtual btScalar calculateTimeOfImpact(btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 
@@ -96,10 +95,10 @@ public:
 		
 		virtual ~CreateFunc();
 
-		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, btCollisionObject* body0,btCollisionObject* body1)
+		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
 		{
 			void* mem = ci.m_dispatcher1->allocateCollisionAlgorithm(sizeof(btConvexConvexAlgorithm));
-			return new(mem) btConvexConvexAlgorithm(ci.m_manifold,ci,body0,body1,m_simplexSolver,m_pdSolver,m_numPerturbationIterations,m_minimumPointsPerturbationThreshold);
+			return new(mem) btConvexConvexAlgorithm(ci.m_manifold,ci,body0Wrap,body1Wrap,m_simplexSolver,m_pdSolver,m_numPerturbationIterations,m_minimumPointsPerturbationThreshold);
 		}
 	};
 
diff --git a/src/BulletCollision/CollisionDispatch/btConvexPlaneCollisionAlgorithm.cpp b/src/BulletCollision/CollisionDispatch/btConvexPlaneCollisionAlgorithm.cpp
index b2e9bfaf5..cce2d95bc 100644
--- a/src/BulletCollision/CollisionDispatch/btConvexPlaneCollisionAlgorithm.cpp
+++ b/src/BulletCollision/CollisionDispatch/btConvexPlaneCollisionAlgorithm.cpp
@@ -19,10 +19,11 @@ subject to the following restrictions:
 #include "BulletCollision/CollisionDispatch/btCollisionObject.h"
 #include "BulletCollision/CollisionShapes/btConvexShape.h"
 #include "BulletCollision/CollisionShapes/btStaticPlaneShape.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h"
 
 //#include <stdio.h>
 
-btConvexPlaneCollisionAlgorithm::btConvexPlaneCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* col0,btCollisionObject* col1, bool isSwapped, int numPerturbationIterations,int minimumPointsPerturbationThreshold)
+btConvexPlaneCollisionAlgorithm::btConvexPlaneCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* col0Wrap,const btCollisionObjectWrapper* col1Wrap, bool isSwapped, int numPerturbationIterations,int minimumPointsPerturbationThreshold)
 : btCollisionAlgorithm(ci),
 m_ownManifold(false),
 m_manifoldPtr(mf),
@@ -30,12 +31,12 @@ m_isSwapped(isSwapped),
 m_numPerturbationIterations(numPerturbationIterations),
 m_minimumPointsPerturbationThreshold(minimumPointsPerturbationThreshold)
 {
-	btCollisionObject* convexObj = m_isSwapped? col1 : col0;
-	btCollisionObject* planeObj = m_isSwapped? col0 : col1;
+	const btCollisionObjectWrapper* convexObjWrap = m_isSwapped? col1Wrap : col0Wrap;
+	const btCollisionObjectWrapper* planeObjWrap = m_isSwapped? col0Wrap : col1Wrap;
 
-	if (!m_manifoldPtr && m_dispatcher->needsCollision(convexObj,planeObj))
+	if (!m_manifoldPtr && m_dispatcher->needsCollision(convexObjWrap->getCollisionObject(),planeObjWrap->getCollisionObject()))
 	{
-		m_manifoldPtr = m_dispatcher->getNewManifold(convexObj,planeObj);
+		m_manifoldPtr = m_dispatcher->getNewManifold(convexObjWrap->getCollisionObject(),planeObjWrap->getCollisionObject());
 		m_ownManifold = true;
 	}
 }
@@ -50,25 +51,25 @@ btConvexPlaneCollisionAlgorithm::~btConvexPlaneCollisionAlgorithm()
 	}
 }
 
-void btConvexPlaneCollisionAlgorithm::collideSingleContact (const btQuaternion& perturbeRot, btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
+void btConvexPlaneCollisionAlgorithm::collideSingleContact (const btQuaternion& perturbeRot, const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
 {
-    btCollisionObject* convexObj = m_isSwapped? body1 : body0;
-	btCollisionObject* planeObj = m_isSwapped? body0: body1;
+    const btCollisionObjectWrapper* convexObjWrap = m_isSwapped? body1Wrap : body0Wrap;
+	const btCollisionObjectWrapper* planeObjWrap = m_isSwapped? body0Wrap: body1Wrap;
 
-	btConvexShape* convexShape = (btConvexShape*) convexObj->getCollisionShape();
-	btStaticPlaneShape* planeShape = (btStaticPlaneShape*) planeObj->getCollisionShape();
+	btConvexShape* convexShape = (btConvexShape*) convexObjWrap->getCollisionShape();
+	btStaticPlaneShape* planeShape = (btStaticPlaneShape*) planeObjWrap->getCollisionShape();
 
     bool hasCollision = false;
 	const btVector3& planeNormal = planeShape->getPlaneNormal();
 	const btScalar& planeConstant = planeShape->getPlaneConstant();
 	
-	btTransform convexWorldTransform = convexObj->getWorldTransform();
+	btTransform convexWorldTransform = convexObjWrap->getWorldTransform();
 	btTransform convexInPlaneTrans;
-	convexInPlaneTrans= planeObj->getWorldTransform().inverse() * convexWorldTransform;
+	convexInPlaneTrans= planeObjWrap->getWorldTransform().inverse() * convexWorldTransform;
 	//now perturbe the convex-world transform
 	convexWorldTransform.getBasis()*=btMatrix3x3(perturbeRot);
 	btTransform planeInConvex;
-	planeInConvex= convexWorldTransform.inverse() * planeObj->getWorldTransform();
+	planeInConvex= convexWorldTransform.inverse() * planeObjWrap->getWorldTransform();
 	
 	btVector3 vtx = convexShape->localGetSupportingVertex(planeInConvex.getBasis()*-planeNormal);
 
@@ -76,53 +77,53 @@ void btConvexPlaneCollisionAlgorithm::collideSingleContact (const btQuaternion&
 	btScalar distance = (planeNormal.dot(vtxInPlane) - planeConstant);
 
 	btVector3 vtxInPlaneProjected = vtxInPlane - distance*planeNormal;
-	btVector3 vtxInPlaneWorld = planeObj->getWorldTransform() * vtxInPlaneProjected;
+	btVector3 vtxInPlaneWorld = planeObjWrap->getWorldTransform() * vtxInPlaneProjected;
 
 	hasCollision = distance < m_manifoldPtr->getContactBreakingThreshold();
 	resultOut->setPersistentManifold(m_manifoldPtr);
 	if (hasCollision)
 	{
 		/// report a contact. internally this will be kept persistent, and contact reduction is done
-		btVector3 normalOnSurfaceB = planeObj->getWorldTransform().getBasis() * planeNormal;
+		btVector3 normalOnSurfaceB = planeObjWrap->getWorldTransform().getBasis() * planeNormal;
 		btVector3 pOnB = vtxInPlaneWorld;
 		resultOut->addContactPoint(normalOnSurfaceB,pOnB,distance);
 	}
 }
 
 
-void btConvexPlaneCollisionAlgorithm::processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
+void btConvexPlaneCollisionAlgorithm::processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
 {
 	(void)dispatchInfo;
 	if (!m_manifoldPtr)
 		return;
 
-	btCollisionObject* convexObj = m_isSwapped? body1 : body0;
-	btCollisionObject* planeObj = m_isSwapped? body0: body1;
+	const btCollisionObjectWrapper* convexObjWrap = m_isSwapped? body1Wrap : body0Wrap;
+	const btCollisionObjectWrapper* planeObjWrap = m_isSwapped? body0Wrap: body1Wrap;
 
-	btConvexShape* convexShape = (btConvexShape*) convexObj->getCollisionShape();
-	btStaticPlaneShape* planeShape = (btStaticPlaneShape*) planeObj->getCollisionShape();
+	btConvexShape* convexShape = (btConvexShape*) convexObjWrap->getCollisionShape();
+	btStaticPlaneShape* planeShape = (btStaticPlaneShape*) planeObjWrap->getCollisionShape();
 
 	bool hasCollision = false;
 	const btVector3& planeNormal = planeShape->getPlaneNormal();
 	const btScalar& planeConstant = planeShape->getPlaneConstant();
 	btTransform planeInConvex;
-	planeInConvex= convexObj->getWorldTransform().inverse() * planeObj->getWorldTransform();
+	planeInConvex= convexObjWrap->getWorldTransform().inverse() * planeObjWrap->getWorldTransform();
 	btTransform convexInPlaneTrans;
-	convexInPlaneTrans= planeObj->getWorldTransform().inverse() * convexObj->getWorldTransform();
+	convexInPlaneTrans= planeObjWrap->getWorldTransform().inverse() * convexObjWrap->getWorldTransform();
 
 	btVector3 vtx = convexShape->localGetSupportingVertex(planeInConvex.getBasis()*-planeNormal);
 	btVector3 vtxInPlane = convexInPlaneTrans(vtx);
 	btScalar distance = (planeNormal.dot(vtxInPlane) - planeConstant);
 
 	btVector3 vtxInPlaneProjected = vtxInPlane - distance*planeNormal;
-	btVector3 vtxInPlaneWorld = planeObj->getWorldTransform() * vtxInPlaneProjected;
+	btVector3 vtxInPlaneWorld = planeObjWrap->getWorldTransform() * vtxInPlaneProjected;
 
 	hasCollision = distance < m_manifoldPtr->getContactBreakingThreshold();
 	resultOut->setPersistentManifold(m_manifoldPtr);
 	if (hasCollision)
 	{
 		/// report a contact. internally this will be kept persistent, and contact reduction is done
-		btVector3 normalOnSurfaceB = planeObj->getWorldTransform().getBasis() * planeNormal;
+		btVector3 normalOnSurfaceB = planeObjWrap->getWorldTransform().getBasis() * planeNormal;
 		btVector3 pOnB = vtxInPlaneWorld;
 		resultOut->addContactPoint(normalOnSurfaceB,pOnB,distance);
 	}
@@ -148,7 +149,7 @@ void btConvexPlaneCollisionAlgorithm::processCollision (btCollisionObject* body0
 		{
 			btScalar iterationAngle = i*(SIMD_2_PI/btScalar(m_numPerturbationIterations));
 			btQuaternion rotq(planeNormal,iterationAngle);
-			collideSingleContact(rotq.inverse()*perturbeRot*rotq,body0,body1,dispatchInfo,resultOut);
+			collideSingleContact(rotq.inverse()*perturbeRot*rotq,body0Wrap,body1Wrap,dispatchInfo,resultOut);
 		}
 	}
 
diff --git a/src/BulletCollision/CollisionDispatch/btConvexPlaneCollisionAlgorithm.h b/src/BulletCollision/CollisionDispatch/btConvexPlaneCollisionAlgorithm.h
index b9494f5ad..d28c430c4 100644
--- a/src/BulletCollision/CollisionDispatch/btConvexPlaneCollisionAlgorithm.h
+++ b/src/BulletCollision/CollisionDispatch/btConvexPlaneCollisionAlgorithm.h
@@ -36,13 +36,13 @@ class btConvexPlaneCollisionAlgorithm : public btCollisionAlgorithm
 
 public:
 
-	btConvexPlaneCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* col0,btCollisionObject* col1, bool isSwapped, int numPerturbationIterations,int minimumPointsPerturbationThreshold);
+	btConvexPlaneCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap, bool isSwapped, int numPerturbationIterations,int minimumPointsPerturbationThreshold);
 
 	virtual ~btConvexPlaneCollisionAlgorithm();
 
-	virtual void processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
+	virtual void processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 
-	void collideSingleContact (const btQuaternion& perturbeRot, btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
+	void collideSingleContact (const btQuaternion& perturbeRot, const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 
 	virtual btScalar calculateTimeOfImpact(btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 
@@ -65,15 +65,15 @@ public:
 		{
 		}
 		
-		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, btCollisionObject* body0,btCollisionObject* body1)
+		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
 		{
 			void* mem = ci.m_dispatcher1->allocateCollisionAlgorithm(sizeof(btConvexPlaneCollisionAlgorithm));
 			if (!m_swapped)
 			{
-				return new(mem) btConvexPlaneCollisionAlgorithm(0,ci,body0,body1,false,m_numPerturbationIterations,m_minimumPointsPerturbationThreshold);
+				return new(mem) btConvexPlaneCollisionAlgorithm(0,ci,body0Wrap,body1Wrap,false,m_numPerturbationIterations,m_minimumPointsPerturbationThreshold);
 			} else
 			{
-				return new(mem) btConvexPlaneCollisionAlgorithm(0,ci,body0,body1,true,m_numPerturbationIterations,m_minimumPointsPerturbationThreshold);
+				return new(mem) btConvexPlaneCollisionAlgorithm(0,ci,body0Wrap,body1Wrap,true,m_numPerturbationIterations,m_minimumPointsPerturbationThreshold);
 			}
 		}
 	};
diff --git a/src/BulletCollision/CollisionDispatch/btEmptyCollisionAlgorithm.cpp b/src/BulletCollision/CollisionDispatch/btEmptyCollisionAlgorithm.cpp
index 936054387..5fa1c8be5 100644
--- a/src/BulletCollision/CollisionDispatch/btEmptyCollisionAlgorithm.cpp
+++ b/src/BulletCollision/CollisionDispatch/btEmptyCollisionAlgorithm.cpp
@@ -22,7 +22,7 @@ btEmptyAlgorithm::btEmptyAlgorithm(const btCollisionAlgorithmConstructionInfo& c
 {
 }
 
-void btEmptyAlgorithm::processCollision (btCollisionObject* ,btCollisionObject* ,const btDispatcherInfo& ,btManifoldResult* )
+void btEmptyAlgorithm::processCollision (const btCollisionObjectWrapper* ,const btCollisionObjectWrapper* ,const btDispatcherInfo& ,btManifoldResult* )
 {
 }
 
diff --git a/src/BulletCollision/CollisionDispatch/btEmptyCollisionAlgorithm.h b/src/BulletCollision/CollisionDispatch/btEmptyCollisionAlgorithm.h
index f03c9dc38..6d426ba28 100644
--- a/src/BulletCollision/CollisionDispatch/btEmptyCollisionAlgorithm.h
+++ b/src/BulletCollision/CollisionDispatch/btEmptyCollisionAlgorithm.h
@@ -30,7 +30,7 @@ public:
 	
 	btEmptyAlgorithm(const btCollisionAlgorithmConstructionInfo& ci);
 
-	virtual void processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
+	virtual void processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 
 	virtual btScalar calculateTimeOfImpact(btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 
diff --git a/src/BulletCollision/CollisionDispatch/btInternalEdgeUtility.cpp b/src/BulletCollision/CollisionDispatch/btInternalEdgeUtility.cpp
index 4353cdac0..a8d526f53 100644
--- a/src/BulletCollision/CollisionDispatch/btInternalEdgeUtility.cpp
+++ b/src/BulletCollision/CollisionDispatch/btInternalEdgeUtility.cpp
@@ -6,7 +6,7 @@
 #include "BulletCollision/CollisionDispatch/btCollisionObject.h"
 #include "BulletCollision/NarrowPhaseCollision/btManifoldPoint.h"
 #include "LinearMath/btIDebugDraw.h"
-
+#include "BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h"
 
 //#define DEBUG_INTERNAL_EDGE
 
@@ -450,18 +450,18 @@ bool	btClampNormal(const btVector3& edge,const btVector3& tri_normal_org,const b
 
 
 /// Changes a btManifoldPoint collision normal to the normal from the mesh.
-void btAdjustInternalEdgeContacts(btManifoldPoint& cp, const btCollisionObject* colObj0,const btCollisionObject* colObj1, int partId0, int index0, int normalAdjustFlags)
+void btAdjustInternalEdgeContacts(btManifoldPoint& cp, const btCollisionObjectWrapper* colObj0Wrap,const btCollisionObjectWrapper* colObj1Wrap, int partId0, int index0, int normalAdjustFlags)
 {
 	//btAssert(colObj0->getCollisionShape()->getShapeType() == TRIANGLE_SHAPE_PROXYTYPE);
-	if (colObj0->getCollisionShape()->getShapeType() != TRIANGLE_SHAPE_PROXYTYPE)
+	if (colObj0Wrap->getCollisionShape()->getShapeType() != TRIANGLE_SHAPE_PROXYTYPE)
 		return;
 
 	btBvhTriangleMeshShape* trimesh = 0;
 	
-	if( colObj0->getRootCollisionShape()->getShapeType() == SCALED_TRIANGLE_MESH_SHAPE_PROXYTYPE )
-	   trimesh = ((btScaledBvhTriangleMeshShape*)colObj0->getRootCollisionShape())->getChildShape();
+	if( colObj0Wrap->getCollisionObject()->getCollisionShape()->getShapeType() == SCALED_TRIANGLE_MESH_SHAPE_PROXYTYPE )
+	   trimesh = ((btScaledBvhTriangleMeshShape*)colObj0Wrap->getCollisionObject()->getCollisionShape())->getChildShape();
    else	   
-	   trimesh = (btBvhTriangleMeshShape*)colObj0->getRootCollisionShape();
+	   trimesh = (btBvhTriangleMeshShape*)colObj0Wrap->getCollisionObject()->getCollisionShape();
 	   
    	btTriangleInfoMap* triangleInfoMapPtr = (btTriangleInfoMap*) trimesh->getTriangleInfoMap();
 	if (!triangleInfoMapPtr)
@@ -476,7 +476,7 @@ void btAdjustInternalEdgeContacts(btManifoldPoint& cp, const btCollisionObject*
 
 	btScalar frontFacing = (normalAdjustFlags & BT_TRIANGLE_CONVEX_BACKFACE_MODE)==0? 1.f : -1.f;
 	
-	const btTriangleShape* tri_shape = static_cast<const btTriangleShape*>(colObj0->getCollisionShape());
+	const btTriangleShape* tri_shape = static_cast<const btTriangleShape*>(colObj0Wrap->getCollisionShape());
 	btVector3 v0,v1,v2;
 	tri_shape->getVertex(0,v0);
 	tri_shape->getVertex(1,v1);
@@ -505,7 +505,7 @@ void btAdjustInternalEdgeContacts(btManifoldPoint& cp, const btCollisionObject*
 	int numConcaveEdgeHits = 0;
 	int numConvexEdgeHits = 0;
 
-	btVector3 localContactNormalOnB = colObj0->getWorldTransform().getBasis().transpose() * cp.m_normalWorldOnB;
+	btVector3 localContactNormalOnB = colObj0Wrap->getWorldTransform().getBasis().transpose() * cp.m_normalWorldOnB;
 	localContactNormalOnB.normalize();//is this necessary?
 	
 	// Get closest edge
@@ -613,12 +613,12 @@ void btAdjustInternalEdgeContacts(btManifoldPoint& cp, const btCollisionObject*
 					{
 						if (((normalAdjustFlags & BT_TRIANGLE_CONVEX_DOUBLE_SIDED)!=0) || (clampedLocalNormal.dot(frontFacing*tri_normal)>0))
 						{
-							btVector3 newNormal = colObj0->getWorldTransform().getBasis() * clampedLocalNormal;
+							btVector3 newNormal = colObj0Wrap->getWorldTransform().getBasis() * clampedLocalNormal;
 							//					cp.m_distance1 = cp.m_distance1 * newNormal.dot(cp.m_normalWorldOnB);
 							cp.m_normalWorldOnB = newNormal;
 							// Reproject collision point along normal. (what about cp.m_distance1?)
 							cp.m_positionWorldOnB = cp.m_positionWorldOnA - cp.m_normalWorldOnB * cp.m_distance1;
-							cp.m_localPointB = colObj0->getWorldTransform().invXform(cp.m_positionWorldOnB);
+							cp.m_localPointB = colObj0Wrap->getWorldTransform().invXform(cp.m_positionWorldOnB);
 							
 						}
 					}
@@ -694,19 +694,19 @@ void btAdjustInternalEdgeContacts(btManifoldPoint& cp, const btCollisionObject*
 				else
 				{
 					numConvexEdgeHits++;
-					btVector3 localContactNormalOnB = colObj0->getWorldTransform().getBasis().transpose() * cp.m_normalWorldOnB;
+					btVector3 localContactNormalOnB = colObj0Wrap->getWorldTransform().getBasis().transpose() * cp.m_normalWorldOnB;
 					btVector3 clampedLocalNormal;
 					bool isClamped = btClampNormal(edge,swapFactor*tri_normal,localContactNormalOnB, info->m_edgeV1V2Angle,clampedLocalNormal);
 					if (isClamped)
 					{
 						if (((normalAdjustFlags & BT_TRIANGLE_CONVEX_DOUBLE_SIDED)!=0) || (clampedLocalNormal.dot(frontFacing*tri_normal)>0))
 						{
-							btVector3 newNormal = colObj0->getWorldTransform().getBasis() * clampedLocalNormal;
+							btVector3 newNormal = colObj0Wrap->getWorldTransform().getBasis() * clampedLocalNormal;
 							//					cp.m_distance1 = cp.m_distance1 * newNormal.dot(cp.m_normalWorldOnB);
 							cp.m_normalWorldOnB = newNormal;
 							// Reproject collision point along normal.
 							cp.m_positionWorldOnB = cp.m_positionWorldOnA - cp.m_normalWorldOnB * cp.m_distance1;
-							cp.m_localPointB = colObj0->getWorldTransform().invXform(cp.m_positionWorldOnB);
+							cp.m_localPointB = colObj0Wrap->getWorldTransform().invXform(cp.m_positionWorldOnB);
 						}
 					}
 				}
@@ -779,19 +779,19 @@ void btAdjustInternalEdgeContacts(btManifoldPoint& cp, const btCollisionObject*
 					//				printf("hitting convex edge\n");
 
 
-					btVector3 localContactNormalOnB = colObj0->getWorldTransform().getBasis().transpose() * cp.m_normalWorldOnB;
+					btVector3 localContactNormalOnB = colObj0Wrap->getWorldTransform().getBasis().transpose() * cp.m_normalWorldOnB;
 					btVector3 clampedLocalNormal;
 					bool isClamped = btClampNormal(edge,swapFactor*tri_normal,localContactNormalOnB,info->m_edgeV2V0Angle,clampedLocalNormal);
 					if (isClamped)
 					{
 						if (((normalAdjustFlags & BT_TRIANGLE_CONVEX_DOUBLE_SIDED)!=0) || (clampedLocalNormal.dot(frontFacing*tri_normal)>0))
 						{
-							btVector3 newNormal = colObj0->getWorldTransform().getBasis() * clampedLocalNormal;
+							btVector3 newNormal = colObj0Wrap->getWorldTransform().getBasis() * clampedLocalNormal;
 							//					cp.m_distance1 = cp.m_distance1 * newNormal.dot(cp.m_normalWorldOnB);
 							cp.m_normalWorldOnB = newNormal;
 							// Reproject collision point along normal.
 							cp.m_positionWorldOnB = cp.m_positionWorldOnA - cp.m_normalWorldOnB * cp.m_distance1;
-							cp.m_localPointB = colObj0->getWorldTransform().invXform(cp.m_positionWorldOnB);
+							cp.m_localPointB = colObj0Wrap->getWorldTransform().invXform(cp.m_positionWorldOnB);
 						}
 					}
 				} 
@@ -820,7 +820,7 @@ void btAdjustInternalEdgeContacts(btManifoldPoint& cp, const btCollisionObject*
 				{
 					tri_normal *= -1;
 				}
-				cp.m_normalWorldOnB = colObj0->getWorldTransform().getBasis()*tri_normal;
+				cp.m_normalWorldOnB = colObj0Wrap->getWorldTransform().getBasis()*tri_normal;
 			} else
 			{
 				btVector3 newNormal = tri_normal *frontFacing;
@@ -831,12 +831,12 @@ void btAdjustInternalEdgeContacts(btManifoldPoint& cp, const btCollisionObject*
 					return;
 				}
 				//modify the normal to be the triangle normal (or backfacing normal)
-				cp.m_normalWorldOnB = colObj0->getWorldTransform().getBasis() *newNormal;
+				cp.m_normalWorldOnB = colObj0Wrap->getWorldTransform().getBasis() *newNormal;
 			}
 						
 			// Reproject collision point along normal.
 			cp.m_positionWorldOnB = cp.m_positionWorldOnA - cp.m_normalWorldOnB * cp.m_distance1;
-			cp.m_localPointB = colObj0->getWorldTransform().invXform(cp.m_positionWorldOnB);
+			cp.m_localPointB = colObj0Wrap->getWorldTransform().invXform(cp.m_positionWorldOnB);
 		}
 	}
 }
diff --git a/src/BulletCollision/CollisionDispatch/btInternalEdgeUtility.h b/src/BulletCollision/CollisionDispatch/btInternalEdgeUtility.h
index 9efb0122b..7d9aafeee 100644
--- a/src/BulletCollision/CollisionDispatch/btInternalEdgeUtility.h
+++ b/src/BulletCollision/CollisionDispatch/btInternalEdgeUtility.h
@@ -12,6 +12,7 @@
 
 class btBvhTriangleMeshShape;
 class btCollisionObject;
+struct btCollisionObjectWrapper;
 class btManifoldPoint;
 class btIDebugDraw;
 
@@ -31,7 +32,7 @@ void	btGenerateInternalEdgeInfo (btBvhTriangleMeshShape*trimeshShape, btTriangle
 
 ///Call the btFixMeshNormal to adjust the collision normal, using the triangle info map (generated using btGenerateInternalEdgeInfo)
 ///If this info map is missing, or the triangle is not store in this map, nothing will be done
-void	btAdjustInternalEdgeContacts(btManifoldPoint& cp, const btCollisionObject* trimeshColObj0,const btCollisionObject* otherColObj1, int partId0, int index0, int normalAdjustFlags = 0);
+void	btAdjustInternalEdgeContacts(btManifoldPoint& cp, const btCollisionObjectWrapper* trimeshColObj0Wrap,const btCollisionObjectWrapper* otherColObj1Wrap, int partId0, int index0, int normalAdjustFlags = 0);
 
 ///Enable the BT_INTERNAL_EDGE_DEBUG_DRAW define and call btSetDebugDrawer, to get visual info to see if the internal edge utility works properly.
 ///If the utility doesn't work properly, you might have to adjust the threshold values in btTriangleInfoMap
diff --git a/src/BulletCollision/CollisionDispatch/btManifoldResult.cpp b/src/BulletCollision/CollisionDispatch/btManifoldResult.cpp
index bf24246ea..6745e23da 100644
--- a/src/BulletCollision/CollisionDispatch/btManifoldResult.cpp
+++ b/src/BulletCollision/CollisionDispatch/btManifoldResult.cpp
@@ -17,7 +17,7 @@ subject to the following restrictions:
 #include "btManifoldResult.h"
 #include "BulletCollision/NarrowPhaseCollision/btPersistentManifold.h"
 #include "BulletCollision/CollisionDispatch/btCollisionObject.h"
-
+#include "BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h"
 
 ///This is to allow MaterialCombiner/Custom Friction/Restitution values
 ContactAddedCallback		gContactAddedCallback=0;
@@ -43,10 +43,10 @@ inline btScalar	calculateCombinedRestitution(const btCollisionObject* body0,cons
 
 
 
-btManifoldResult::btManifoldResult(btCollisionObject* body0,btCollisionObject* body1)
+btManifoldResult::btManifoldResult(const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
 		:m_manifoldPtr(0),
-		m_body0(body0),
-		m_body1(body1)
+		m_body0Wrap(body0Wrap),
+		m_body1Wrap(body1Wrap)
 #ifdef DEBUG_PART_INDEX
 		,m_partId0(-1),
 	m_partId1(-1),
@@ -54,8 +54,6 @@ btManifoldResult::btManifoldResult(btCollisionObject* body0,btCollisionObject* b
 	m_index1(-1)
 #endif //DEBUG_PART_INDEX
 {
-	m_rootTransA = body0->getWorldTransform();
-	m_rootTransB = body1->getWorldTransform();
 }
 
 
@@ -68,7 +66,7 @@ void btManifoldResult::addContactPoint(const btVector3& normalOnBInWorld,const b
 //	if (depth > m_manifoldPtr->getContactProcessingThreshold())
 		return;
 
-	bool isSwapped = m_manifoldPtr->getBody0() != m_body0;
+	bool isSwapped = m_manifoldPtr->getBody0() != m_body0Wrap->getCollisionObject();
 
 	btVector3 pointA = pointInWorld + normalOnBInWorld * depth;
 
@@ -77,12 +75,12 @@ void btManifoldResult::addContactPoint(const btVector3& normalOnBInWorld,const b
 	
 	if (isSwapped)
 	{
-		localA = m_rootTransB.invXform(pointA );
-		localB = m_rootTransA.invXform(pointInWorld);
+		localA = m_body1Wrap->getCollisionObject()->getWorldTransform().invXform(pointA );
+		localB = m_body0Wrap->getCollisionObject()->getWorldTransform().invXform(pointInWorld);
 	} else
 	{
-		localA = m_rootTransA.invXform(pointA );
-		localB = m_rootTransB.invXform(pointInWorld);
+		localA = m_body0Wrap->getCollisionObject()->getWorldTransform().invXform(pointA );
+		localB = m_body1Wrap->getCollisionObject()->getWorldTransform().invXform(pointInWorld);
 	}
 
 	btManifoldPoint newPt(localA,localB,normalOnBInWorld,depth);
@@ -91,8 +89,8 @@ void btManifoldResult::addContactPoint(const btVector3& normalOnBInWorld,const b
 	
 	int insertIndex = m_manifoldPtr->getCacheEntry(newPt);
 
-	newPt.m_combinedFriction = calculateCombinedFriction(m_body0,m_body1);
-	newPt.m_combinedRestitution = calculateCombinedRestitution(m_body0,m_body1);
+	newPt.m_combinedFriction = calculateCombinedFriction(m_body0Wrap->getCollisionObject(),m_body1Wrap->getCollisionObject());
+	newPt.m_combinedRestitution = calculateCombinedRestitution(m_body0Wrap->getCollisionObject(),m_body1Wrap->getCollisionObject());
 
    //BP mod, store contact triangles.
 	if (isSwapped)
@@ -122,13 +120,13 @@ void btManifoldResult::addContactPoint(const btVector3& normalOnBInWorld,const b
 	//User can override friction and/or restitution
 	if (gContactAddedCallback &&
 		//and if either of the two bodies requires custom material
-		 ((m_body0->getCollisionFlags() & btCollisionObject::CF_CUSTOM_MATERIAL_CALLBACK) ||
-		   (m_body1->getCollisionFlags() & btCollisionObject::CF_CUSTOM_MATERIAL_CALLBACK)))
+		 ((m_body0Wrap->getCollisionObject()->getCollisionFlags() & btCollisionObject::CF_CUSTOM_MATERIAL_CALLBACK) ||
+		   (m_body1Wrap->getCollisionObject()->getCollisionFlags() & btCollisionObject::CF_CUSTOM_MATERIAL_CALLBACK)))
 	{
 		//experimental feature info, for per-triangle material etc.
-		btCollisionObject* obj0 = isSwapped? m_body1 : m_body0;
-		btCollisionObject* obj1 = isSwapped? m_body0 : m_body1;
-		(*gContactAddedCallback)(m_manifoldPtr->getContactPoint(insertIndex),obj0,newPt.m_partId0,newPt.m_index0,obj1,newPt.m_partId1,newPt.m_index1);
+		const btCollisionObjectWrapper* obj0Wrap = isSwapped? m_body1Wrap : m_body0Wrap;
+		const btCollisionObjectWrapper* obj1Wrap = isSwapped? m_body0Wrap : m_body1Wrap;
+		(*gContactAddedCallback)(m_manifoldPtr->getContactPoint(insertIndex),obj0Wrap,newPt.m_partId0,newPt.m_index0,obj1Wrap,newPt.m_partId1,newPt.m_index1);
 	}
 
 }
diff --git a/src/BulletCollision/CollisionDispatch/btManifoldResult.h b/src/BulletCollision/CollisionDispatch/btManifoldResult.h
index 18199b497..ec45524e9 100644
--- a/src/BulletCollision/CollisionDispatch/btManifoldResult.h
+++ b/src/BulletCollision/CollisionDispatch/btManifoldResult.h
@@ -18,14 +18,18 @@ subject to the following restrictions:
 #define BT_MANIFOLD_RESULT_H
 
 class btCollisionObject;
+struct btCollisionObjectWrapper;
+
 #include "BulletCollision/NarrowPhaseCollision/btPersistentManifold.h"
 class btManifoldPoint;
 
 #include "BulletCollision/NarrowPhaseCollision/btDiscreteCollisionDetectorInterface.h"
 
 #include "LinearMath/btTransform.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObject.h"
 
-typedef bool (*ContactAddedCallback)(btManifoldPoint& cp,	const btCollisionObject* colObj0,int partId0,int index0,const btCollisionObject* colObj1,int partId1,int index1);
+typedef bool (*ContactAddedCallback)(btManifoldPoint& cp,	const btCollisionObjectWrapper* colObj0Wrap,int partId0,int index0,const btCollisionObjectWrapper* colObj1Wrap,int partId1,int index1);
 extern ContactAddedCallback		gContactAddedCallback;
 
 //#define DEBUG_PART_INDEX 1
@@ -38,12 +42,8 @@ protected:
 
 	btPersistentManifold* m_manifoldPtr;
 
-	//we need this for compounds
-	btTransform	m_rootTransA;
-	btTransform	m_rootTransB;
-
-	btCollisionObject* m_body0;
-	btCollisionObject* m_body1;
+	const btCollisionObjectWrapper* m_body0Wrap;
+	const btCollisionObjectWrapper* m_body1Wrap;
 	int	m_partId0;
 	int m_partId1;
 	int m_index0;
@@ -63,7 +63,7 @@ public:
 	{
 	}
 
-	btManifoldResult(btCollisionObject* body0,btCollisionObject* body1);
+	btManifoldResult(const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap);
 
 	virtual ~btManifoldResult() {};
 
@@ -102,25 +102,44 @@ public:
 		if (!m_manifoldPtr->getNumContacts())
 			return;
 
-		bool isSwapped = m_manifoldPtr->getBody0() != m_body0;
+		bool isSwapped = m_manifoldPtr->getBody0() != m_body0Wrap->getCollisionObject();
 
 		if (isSwapped)
 		{
-			m_manifoldPtr->refreshContactPoints(m_rootTransB,m_rootTransA);
+			m_manifoldPtr->refreshContactPoints(m_body1Wrap->getCollisionObject()->getWorldTransform(),m_body0Wrap->getCollisionObject()->getWorldTransform());
 		} else
 		{
-			m_manifoldPtr->refreshContactPoints(m_rootTransA,m_rootTransB);
+			m_manifoldPtr->refreshContactPoints(m_body0Wrap->getCollisionObject()->getWorldTransform(),m_body1Wrap->getCollisionObject()->getWorldTransform());
 		}
 	}
 
+	const btCollisionObjectWrapper* getBody0Wrap() const
+	{
+		return m_body0Wrap;
+	}
+	const btCollisionObjectWrapper* getBody1Wrap() const
+	{
+		return m_body1Wrap;
+	}
+
+	void setBody0Wrap(const btCollisionObjectWrapper* obj0Wrap)
+	{
+		m_body0Wrap = obj0Wrap;
+	}
+
+	void setBody1Wrap(const btCollisionObjectWrapper* obj1Wrap)
+	{
+		m_body1Wrap = obj1Wrap;
+	}
+
 	const btCollisionObject* getBody0Internal() const
 	{
-		return m_body0;
+		return m_body0Wrap->getCollisionObject();
 	}
 
 	const btCollisionObject* getBody1Internal() const
 	{
-		return m_body1;
+		return m_body1Wrap->getCollisionObject();
 	}
 	
 };
diff --git a/src/BulletCollision/CollisionDispatch/btSimulationIslandManager.cpp b/src/BulletCollision/CollisionDispatch/btSimulationIslandManager.cpp
index 871c64415..134478225 100644
--- a/src/BulletCollision/CollisionDispatch/btSimulationIslandManager.cpp
+++ b/src/BulletCollision/CollisionDispatch/btSimulationIslandManager.cpp
@@ -319,8 +319,8 @@ void btSimulationIslandManager::buildIslands(btDispatcher* dispatcher,btCollisio
 	{
 		 btPersistentManifold* manifold = dispatcher->getManifoldByIndexInternal(i);
 		 
-		 btCollisionObject* colObj0 = static_cast<btCollisionObject*>(manifold->getBody0());
-		 btCollisionObject* colObj1 = static_cast<btCollisionObject*>(manifold->getBody1());
+		 const btCollisionObject* colObj0 = static_cast<const btCollisionObject*>(manifold->getBody0());
+		 const btCollisionObject* colObj1 = static_cast<const btCollisionObject*>(manifold->getBody1());
 		
 		 ///@todo: check sleeping conditions!
 		 if (((colObj0) && colObj0->getActivationState() != ISLAND_SLEEPING) ||
diff --git a/src/BulletCollision/CollisionDispatch/btSphereBoxCollisionAlgorithm.cpp b/src/BulletCollision/CollisionDispatch/btSphereBoxCollisionAlgorithm.cpp
index 8df876928..46b5a9811 100644
--- a/src/BulletCollision/CollisionDispatch/btSphereBoxCollisionAlgorithm.cpp
+++ b/src/BulletCollision/CollisionDispatch/btSphereBoxCollisionAlgorithm.cpp
@@ -18,20 +18,21 @@ subject to the following restrictions:
 #include "BulletCollision/CollisionShapes/btSphereShape.h"
 #include "BulletCollision/CollisionShapes/btBoxShape.h"
 #include "BulletCollision/CollisionDispatch/btCollisionObject.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h"
 //#include <stdio.h>
 
-btSphereBoxCollisionAlgorithm::btSphereBoxCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* col0,btCollisionObject* col1, bool isSwapped)
-: btActivatingCollisionAlgorithm(ci,col0,col1),
+btSphereBoxCollisionAlgorithm::btSphereBoxCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* col0Wrap,const btCollisionObjectWrapper* col1Wrap, bool isSwapped)
+: btActivatingCollisionAlgorithm(ci,col0Wrap,col1Wrap),
 m_ownManifold(false),
 m_manifoldPtr(mf),
 m_isSwapped(isSwapped)
 {
-	btCollisionObject* sphereObj = m_isSwapped? col1 : col0;
-	btCollisionObject* boxObj = m_isSwapped? col0 : col1;
+	const btCollisionObjectWrapper* sphereObjWrap = m_isSwapped? col1Wrap : col0Wrap;
+	const btCollisionObjectWrapper* boxObjWrap = m_isSwapped? col0Wrap : col1Wrap;
 	
-	if (!m_manifoldPtr && m_dispatcher->needsCollision(sphereObj,boxObj))
+	if (!m_manifoldPtr && m_dispatcher->needsCollision(sphereObjWrap->getCollisionObject(),boxObjWrap->getCollisionObject()))
 	{
-		m_manifoldPtr = m_dispatcher->getNewManifold(sphereObj,boxObj);
+		m_manifoldPtr = m_dispatcher->getNewManifold(sphereObjWrap->getCollisionObject(),boxObjWrap->getCollisionObject());
 		m_ownManifold = true;
 	}
 }
@@ -48,25 +49,25 @@ btSphereBoxCollisionAlgorithm::~btSphereBoxCollisionAlgorithm()
 
 
 
-void btSphereBoxCollisionAlgorithm::processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
+void btSphereBoxCollisionAlgorithm::processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
 {
 	(void)dispatchInfo;
 	(void)resultOut;
 	if (!m_manifoldPtr)
 		return;
 
-	btCollisionObject* sphereObj = m_isSwapped? body1 : body0;
-	btCollisionObject* boxObj = m_isSwapped? body0 : body1;
+	const btCollisionObjectWrapper* sphereObjWrap = m_isSwapped? body1Wrap : body0Wrap;
+	const btCollisionObjectWrapper* boxObjWrap = m_isSwapped? body0Wrap : body1Wrap;
 
 
-	btSphereShape* sphere0 = (btSphereShape*)sphereObj->getCollisionShape();
+	const btSphereShape* sphere0 = (const btSphereShape*)sphereObjWrap->getCollisionShape();
 
 	btVector3 normalOnSurfaceB;
 	btVector3 pOnBox,pOnSphere;
-	btVector3 sphereCenter = sphereObj->getWorldTransform().getOrigin();
+	btVector3 sphereCenter = sphereObjWrap->getWorldTransform().getOrigin();
 	btScalar radius = sphere0->getRadius();
 	
-	btScalar dist = getSphereDistance(boxObj,pOnBox,pOnSphere,sphereCenter,radius);
+	btScalar dist = getSphereDistance(boxObjWrap,pOnBox,pOnSphere,sphereCenter,radius);
 
 	resultOut->setPersistentManifold(m_manifoldPtr);
 
@@ -102,19 +103,19 @@ btScalar btSphereBoxCollisionAlgorithm::calculateTimeOfImpact(btCollisionObject*
 }
 
 
-btScalar btSphereBoxCollisionAlgorithm::getSphereDistance(btCollisionObject* boxObj, btVector3& pointOnBox, btVector3& v3PointOnSphere, const btVector3& sphereCenter, btScalar fRadius ) 
+btScalar btSphereBoxCollisionAlgorithm::getSphereDistance(const btCollisionObjectWrapper* boxObjWrap, btVector3& pointOnBox, btVector3& v3PointOnSphere, const btVector3& sphereCenter, btScalar fRadius ) 
 {
 
 	btScalar margins;
 	btVector3 bounds[2];
-	btBoxShape* boxShape= (btBoxShape*)boxObj->getCollisionShape();
+	const btBoxShape* boxShape= (const btBoxShape*)boxObjWrap->getCollisionShape();
 	
 	bounds[0] = -boxShape->getHalfExtentsWithoutMargin();
 	bounds[1] = boxShape->getHalfExtentsWithoutMargin();
 
 	margins = boxShape->getMargin();//also add sphereShape margin?
 
-	const btTransform&	m44T = boxObj->getWorldTransform();
+	const btTransform&	m44T = boxObjWrap->getWorldTransform();
 
 	btVector3	boundsVec[2];
 	btScalar	fPenetration;
@@ -194,7 +195,7 @@ btScalar btSphereBoxCollisionAlgorithm::getSphereDistance(btCollisionObject* box
 	//////////////////////////////////////////////////
 	// Deep penetration case
 
-	fPenetration = getSpherePenetration( boxObj,pointOnBox, v3PointOnSphere, sphereCenter, fRadius,bounds[0],bounds[1] );
+	fPenetration = getSpherePenetration( boxObjWrap,pointOnBox, v3PointOnSphere, sphereCenter, fRadius,bounds[0],bounds[1] );
 
 	bounds[0] = boundsVec[0];
 	bounds[1] = boundsVec[1];
@@ -205,7 +206,7 @@ btScalar btSphereBoxCollisionAlgorithm::getSphereDistance(btCollisionObject* box
 		return btScalar(1.0);
 }
 
-btScalar btSphereBoxCollisionAlgorithm::getSpherePenetration( btCollisionObject* boxObj,btVector3& pointOnBox, btVector3& v3PointOnSphere, const btVector3& sphereCenter, btScalar fRadius, const btVector3& aabbMin, const btVector3& aabbMax) 
+btScalar btSphereBoxCollisionAlgorithm::getSpherePenetration( const btCollisionObjectWrapper* boxObjWrap,btVector3& pointOnBox, btVector3& v3PointOnSphere, const btVector3& sphereCenter, btScalar fRadius, const btVector3& aabbMin, const btVector3& aabbMax) 
 {
 
 	btVector3 bounds[2];
@@ -227,7 +228,7 @@ btScalar btSphereBoxCollisionAlgorithm::getSpherePenetration( btCollisionObject*
 	n[4].setValue(  btScalar(0.0),  btScalar(1.0),  btScalar(0.0) );
 	n[5].setValue(  btScalar(0.0),  btScalar(0.0),  btScalar(1.0) );
 
-	const btTransform&	m44T = boxObj->getWorldTransform();
+	const btTransform&	m44T = boxObjWrap->getWorldTransform();
 
 	// convert  point in local space
 	prel = m44T.invXform( sphereCenter);
diff --git a/src/BulletCollision/CollisionDispatch/btSphereBoxCollisionAlgorithm.h b/src/BulletCollision/CollisionDispatch/btSphereBoxCollisionAlgorithm.h
index 60286ae0a..0b67a8b35 100644
--- a/src/BulletCollision/CollisionDispatch/btSphereBoxCollisionAlgorithm.h
+++ b/src/BulletCollision/CollisionDispatch/btSphereBoxCollisionAlgorithm.h
@@ -34,11 +34,11 @@ class btSphereBoxCollisionAlgorithm : public btActivatingCollisionAlgorithm
 	
 public:
 
-	btSphereBoxCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* col0,btCollisionObject* col1, bool isSwapped);
+	btSphereBoxCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap, bool isSwapped);
 
 	virtual ~btSphereBoxCollisionAlgorithm();
 
-	virtual void processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
+	virtual void processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 
 	virtual btScalar calculateTimeOfImpact(btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 
@@ -50,21 +50,21 @@ public:
 		}
 	}
 
-	btScalar getSphereDistance( btCollisionObject* boxObj,btVector3& v3PointOnBox, btVector3& v3PointOnSphere, const btVector3& v3SphereCenter, btScalar fRadius );
+	btScalar getSphereDistance( const btCollisionObjectWrapper* boxObjWrap,btVector3& v3PointOnBox, btVector3& v3PointOnSphere, const btVector3& v3SphereCenter, btScalar fRadius );
 
-	btScalar getSpherePenetration( btCollisionObject* boxObj, btVector3& v3PointOnBox, btVector3& v3PointOnSphere, const btVector3& v3SphereCenter, btScalar fRadius, const btVector3& aabbMin, const btVector3& aabbMax);
+	btScalar getSpherePenetration( const btCollisionObjectWrapper* boxObjWrap, btVector3& v3PointOnBox, btVector3& v3PointOnSphere, const btVector3& v3SphereCenter, btScalar fRadius, const btVector3& aabbMin, const btVector3& aabbMax);
 	
 	struct CreateFunc :public 	btCollisionAlgorithmCreateFunc
 	{
-		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, btCollisionObject* body0,btCollisionObject* body1)
+		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
 		{
 			void* mem = ci.m_dispatcher1->allocateCollisionAlgorithm(sizeof(btSphereBoxCollisionAlgorithm));
 			if (!m_swapped)
 			{
-				return new(mem) btSphereBoxCollisionAlgorithm(0,ci,body0,body1,false);
+				return new(mem) btSphereBoxCollisionAlgorithm(0,ci,body0Wrap,body1Wrap,false);
 			} else
 			{
-				return new(mem) btSphereBoxCollisionAlgorithm(0,ci,body0,body1,true);
+				return new(mem) btSphereBoxCollisionAlgorithm(0,ci,body0Wrap,body1Wrap,true);
 			}
 		}
 	};
diff --git a/src/BulletCollision/CollisionDispatch/btSphereSphereCollisionAlgorithm.cpp b/src/BulletCollision/CollisionDispatch/btSphereSphereCollisionAlgorithm.cpp
index 5c4e78fe5..36ba21f5b 100644
--- a/src/BulletCollision/CollisionDispatch/btSphereSphereCollisionAlgorithm.cpp
+++ b/src/BulletCollision/CollisionDispatch/btSphereSphereCollisionAlgorithm.cpp
@@ -17,15 +17,16 @@ subject to the following restrictions:
 #include "BulletCollision/CollisionDispatch/btCollisionDispatcher.h"
 #include "BulletCollision/CollisionShapes/btSphereShape.h"
 #include "BulletCollision/CollisionDispatch/btCollisionObject.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h"
 
-btSphereSphereCollisionAlgorithm::btSphereSphereCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* col0,btCollisionObject* col1)
-: btActivatingCollisionAlgorithm(ci,col0,col1),
+btSphereSphereCollisionAlgorithm::btSphereSphereCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* col0Wrap,const btCollisionObjectWrapper* col1Wrap)
+: btActivatingCollisionAlgorithm(ci,col0Wrap,col1Wrap),
 m_ownManifold(false),
 m_manifoldPtr(mf)
 {
 	if (!m_manifoldPtr)
 	{
-		m_manifoldPtr = m_dispatcher->getNewManifold(col0,col1);
+		m_manifoldPtr = m_dispatcher->getNewManifold(col0Wrap->getCollisionObject(),col1Wrap->getCollisionObject());
 		m_ownManifold = true;
 	}
 }
@@ -39,7 +40,7 @@ btSphereSphereCollisionAlgorithm::~btSphereSphereCollisionAlgorithm()
 	}
 }
 
-void btSphereSphereCollisionAlgorithm::processCollision (btCollisionObject* col0,btCollisionObject* col1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
+void btSphereSphereCollisionAlgorithm::processCollision (const btCollisionObjectWrapper* col0Wrap,const btCollisionObjectWrapper* col1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
 {
 	(void)dispatchInfo;
 
@@ -48,10 +49,10 @@ void btSphereSphereCollisionAlgorithm::processCollision (btCollisionObject* col0
 
 	resultOut->setPersistentManifold(m_manifoldPtr);
 
-	btSphereShape* sphere0 = (btSphereShape*)col0->getCollisionShape();
-	btSphereShape* sphere1 = (btSphereShape*)col1->getCollisionShape();
+	btSphereShape* sphere0 = (btSphereShape*)col0Wrap->getCollisionShape();
+	btSphereShape* sphere1 = (btSphereShape*)col1Wrap->getCollisionShape();
 
-	btVector3 diff = col0->getWorldTransform().getOrigin()-  col1->getWorldTransform().getOrigin();
+	btVector3 diff = col0Wrap->getWorldTransform().getOrigin()-  col1Wrap->getWorldTransform().getOrigin();
 	btScalar len = diff.length();
 	btScalar radius0 = sphere0->getRadius();
 	btScalar radius1 = sphere1->getRadius();
@@ -80,7 +81,7 @@ void btSphereSphereCollisionAlgorithm::processCollision (btCollisionObject* col0
 	///point on A (worldspace)
 	///btVector3 pos0 = col0->getWorldTransform().getOrigin() - radius0 * normalOnSurfaceB;
 	///point on B (worldspace)
-	btVector3 pos1 = col1->getWorldTransform().getOrigin() + radius1* normalOnSurfaceB;
+	btVector3 pos1 = col1Wrap->getWorldTransform().getOrigin() + radius1* normalOnSurfaceB;
 
 	/// report a contact. internally this will be kept persistent, and contact reduction is done
 	
diff --git a/src/BulletCollision/CollisionDispatch/btSphereSphereCollisionAlgorithm.h b/src/BulletCollision/CollisionDispatch/btSphereSphereCollisionAlgorithm.h
index e55acf277..3517a568a 100644
--- a/src/BulletCollision/CollisionDispatch/btSphereSphereCollisionAlgorithm.h
+++ b/src/BulletCollision/CollisionDispatch/btSphereSphereCollisionAlgorithm.h
@@ -32,12 +32,12 @@ class btSphereSphereCollisionAlgorithm : public btActivatingCollisionAlgorithm
 	btPersistentManifold*	m_manifoldPtr;
 	
 public:
-	btSphereSphereCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* body0,btCollisionObject* body1);
+	btSphereSphereCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* col0Wrap,const btCollisionObjectWrapper* col1Wrap);
 
 	btSphereSphereCollisionAlgorithm(const btCollisionAlgorithmConstructionInfo& ci)
 		: btActivatingCollisionAlgorithm(ci) {}
 
-	virtual void processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
+	virtual void processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 
 	virtual btScalar calculateTimeOfImpact(btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 
@@ -53,10 +53,10 @@ public:
 
 	struct CreateFunc :public 	btCollisionAlgorithmCreateFunc
 	{
-		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, btCollisionObject* body0,btCollisionObject* body1)
+		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, const btCollisionObjectWrapper* col0Wrap,const btCollisionObjectWrapper* col1Wrap)
 		{
 			void* mem = ci.m_dispatcher1->allocateCollisionAlgorithm(sizeof(btSphereSphereCollisionAlgorithm));
-			return new(mem) btSphereSphereCollisionAlgorithm(0,ci,body0,body1);
+			return new(mem) btSphereSphereCollisionAlgorithm(0,ci,col0Wrap,col1Wrap);
 		}
 	};
 
diff --git a/src/BulletCollision/CollisionDispatch/btSphereTriangleCollisionAlgorithm.cpp b/src/BulletCollision/CollisionDispatch/btSphereTriangleCollisionAlgorithm.cpp
index c327c3ff7..280a4d355 100644
--- a/src/BulletCollision/CollisionDispatch/btSphereTriangleCollisionAlgorithm.cpp
+++ b/src/BulletCollision/CollisionDispatch/btSphereTriangleCollisionAlgorithm.cpp
@@ -19,17 +19,17 @@ subject to the following restrictions:
 #include "BulletCollision/CollisionShapes/btSphereShape.h"
 #include "BulletCollision/CollisionDispatch/btCollisionObject.h"
 #include "SphereTriangleDetector.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h"
 
-
-btSphereTriangleCollisionAlgorithm::btSphereTriangleCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* col0,btCollisionObject* col1,bool swapped)
-: btActivatingCollisionAlgorithm(ci,col0,col1),
+btSphereTriangleCollisionAlgorithm::btSphereTriangleCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,bool swapped)
+: btActivatingCollisionAlgorithm(ci,body0Wrap,body1Wrap),
 m_ownManifold(false),
 m_manifoldPtr(mf),
 m_swapped(swapped)
 {
 	if (!m_manifoldPtr)
 	{
-		m_manifoldPtr = m_dispatcher->getNewManifold(col0,col1);
+		m_manifoldPtr = m_dispatcher->getNewManifold(body0Wrap->getCollisionObject(),body1Wrap->getCollisionObject());
 		m_ownManifold = true;
 	}
 }
@@ -43,16 +43,16 @@ btSphereTriangleCollisionAlgorithm::~btSphereTriangleCollisionAlgorithm()
 	}
 }
 
-void btSphereTriangleCollisionAlgorithm::processCollision (btCollisionObject* col0,btCollisionObject* col1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
+void btSphereTriangleCollisionAlgorithm::processCollision (const btCollisionObjectWrapper* col0Wrap,const btCollisionObjectWrapper* col1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
 {
 	if (!m_manifoldPtr)
 		return;
 
-	btCollisionObject* sphereObj = m_swapped? col1 : col0;
-	btCollisionObject* triObj = m_swapped? col0 : col1;
+	const btCollisionObjectWrapper* sphereObjWrap = m_swapped? col1Wrap : col0Wrap;
+	const btCollisionObjectWrapper* triObjWrap = m_swapped? col0Wrap : col1Wrap;
 
-	btSphereShape* sphere = (btSphereShape*)sphereObj->getCollisionShape();
-	btTriangleShape* triangle = (btTriangleShape*)triObj->getCollisionShape();
+	btSphereShape* sphere = (btSphereShape*)sphereObjWrap->getCollisionShape();
+	btTriangleShape* triangle = (btTriangleShape*)triObjWrap->getCollisionShape();
 	
 	/// report a contact. internally this will be kept persistent, and contact reduction is done
 	resultOut->setPersistentManifold(m_manifoldPtr);
@@ -60,8 +60,8 @@ void btSphereTriangleCollisionAlgorithm::processCollision (btCollisionObject* co
 	
 	btDiscreteCollisionDetectorInterface::ClosestPointInput input;
 	input.m_maximumDistanceSquared = btScalar(BT_LARGE_FLOAT);///@todo: tighter bounds
-	input.m_transformA = sphereObj->getWorldTransform();
-	input.m_transformB = triObj->getWorldTransform();
+	input.m_transformA = sphereObjWrap->getWorldTransform();
+	input.m_transformB = triObjWrap->getWorldTransform();
 
 	bool swapResults = m_swapped;
 
diff --git a/src/BulletCollision/CollisionDispatch/btSphereTriangleCollisionAlgorithm.h b/src/BulletCollision/CollisionDispatch/btSphereTriangleCollisionAlgorithm.h
index 7c6c4d8f8..6b6e39a72 100644
--- a/src/BulletCollision/CollisionDispatch/btSphereTriangleCollisionAlgorithm.h
+++ b/src/BulletCollision/CollisionDispatch/btSphereTriangleCollisionAlgorithm.h
@@ -32,12 +32,12 @@ class btSphereTriangleCollisionAlgorithm : public btActivatingCollisionAlgorithm
 	bool	m_swapped;
 	
 public:
-	btSphereTriangleCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* body0,btCollisionObject* body1,bool swapped);
+	btSphereTriangleCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,bool swapped);
 
 	btSphereTriangleCollisionAlgorithm(const btCollisionAlgorithmConstructionInfo& ci)
 		: btActivatingCollisionAlgorithm(ci) {}
 
-	virtual void processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
+	virtual void processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 
 	virtual btScalar calculateTimeOfImpact(btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 
@@ -54,12 +54,12 @@ public:
 	struct CreateFunc :public 	btCollisionAlgorithmCreateFunc
 	{
 		
-		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, btCollisionObject* body0,btCollisionObject* body1)
+		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
 		{
 			
 			void* mem = ci.m_dispatcher1->allocateCollisionAlgorithm(sizeof(btSphereTriangleCollisionAlgorithm));
 
-			return new(mem) btSphereTriangleCollisionAlgorithm(ci.m_manifold,ci,body0,body1,m_swapped);
+			return new(mem) btSphereTriangleCollisionAlgorithm(ci.m_manifold,ci,body0Wrap,body1Wrap,m_swapped);
 		}
 	};
 
diff --git a/src/BulletCollision/CollisionShapes/btBox2dShape.h b/src/BulletCollision/CollisionShapes/btBox2dShape.h
index f4a9ca03e..ce333783e 100644
--- a/src/BulletCollision/CollisionShapes/btBox2dShape.h
+++ b/src/BulletCollision/CollisionShapes/btBox2dShape.h
@@ -23,7 +23,7 @@ subject to the following restrictions:
 #include "LinearMath/btMinMax.h"
 
 ///The btBox2dShape is a box primitive around the origin, its sides axis aligned with length specified by half extents, in local shape coordinates. When used as part of a btCollisionObject or btRigidBody it will be an oriented box in world space.
-class btBox2dShape: public btPolyhedralConvexShape
+ATTRIBUTE_ALIGNED16(class) btBox2dShape: public btPolyhedralConvexShape
 {
 
 	//btVector3	m_boxHalfExtents1; //use m_implicitShapeDimensions instead
@@ -34,6 +34,8 @@ class btBox2dShape: public btPolyhedralConvexShape
 
 public:
 
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+
 	btVector3 getHalfExtentsWithMargin() const
 	{
 		btVector3 halfExtents = getHalfExtentsWithoutMargin();
diff --git a/src/BulletCollision/CollisionShapes/btBoxShape.h b/src/BulletCollision/CollisionShapes/btBoxShape.h
index 0c5857dae..715e3f2ab 100644
--- a/src/BulletCollision/CollisionShapes/btBoxShape.h
+++ b/src/BulletCollision/CollisionShapes/btBoxShape.h
@@ -23,7 +23,7 @@ subject to the following restrictions:
 #include "LinearMath/btMinMax.h"
 
 ///The btBoxShape is a box primitive around the origin, its sides axis aligned with length specified by half extents, in local shape coordinates. When used as part of a btCollisionObject or btRigidBody it will be an oriented box in world space.
-class btBoxShape: public btPolyhedralConvexShape
+ATTRIBUTE_ALIGNED16(class) btBoxShape: public btPolyhedralConvexShape
 {
 
 	//btVector3	m_boxHalfExtents1; //use m_implicitShapeDimensions instead
@@ -31,6 +31,8 @@ class btBoxShape: public btPolyhedralConvexShape
 
 public:
 
+BT_DECLARE_ALIGNED_ALLOCATOR();
+
 	btVector3 getHalfExtentsWithMargin() const
 	{
 		btVector3 halfExtents = getHalfExtentsWithoutMargin();
diff --git a/src/BulletCollision/CollisionShapes/btCapsuleShape.h b/src/BulletCollision/CollisionShapes/btCapsuleShape.h
index ab763abf8..7197ab6a0 100644
--- a/src/BulletCollision/CollisionShapes/btCapsuleShape.h
+++ b/src/BulletCollision/CollisionShapes/btCapsuleShape.h
@@ -23,7 +23,7 @@ subject to the following restrictions:
 ///The btCapsuleShape represents a capsule around the Y axis, there is also the btCapsuleShapeX aligned around the X axis and btCapsuleShapeZ around the Z axis.
 ///The total height is height+2*radius, so the height is just the height between the center of each 'sphere' of the capsule caps.
 ///The btCapsuleShape is a convex hull of two spheres. The btMultiSphereShape is a more general collision shape that takes the convex hull of multiple sphere, so it can also represent a capsule when just using two spheres.
-class btCapsuleShape : public btConvexInternalShape
+ATTRIBUTE_ALIGNED16(class) btCapsuleShape : public btConvexInternalShape
 {
 protected:
 	int	m_upAxis;
@@ -33,6 +33,9 @@ protected:
 	btCapsuleShape() : btConvexInternalShape() {m_shapeType = CAPSULE_SHAPE_PROXYTYPE;};
 
 public:
+	
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+	
 	btCapsuleShape(btScalar radius,btScalar height);
 
 	///CollisionShape Interface
@@ -62,8 +65,8 @@ public:
 			halfExtents += btVector3(getMargin(),getMargin(),getMargin());
 			btMatrix3x3 abs_b = t.getBasis().absolute();  
 			btVector3 center = t.getOrigin();
-			btVector3 extent = btVector3(abs_b[0].dot(halfExtents),abs_b[1].dot(halfExtents),abs_b[2].dot(halfExtents));		  
-			
+            btVector3 extent = halfExtents.dot3(abs_b[0], abs_b[1], abs_b[2]);
+        
 			aabbMin = center - extent;
 			aabbMax = center + extent;
 	}
diff --git a/src/BulletCollision/CollisionShapes/btCollisionShape.h b/src/BulletCollision/CollisionShapes/btCollisionShape.h
index 865c10677..c5d4128e3 100644
--- a/src/BulletCollision/CollisionShapes/btCollisionShape.h
+++ b/src/BulletCollision/CollisionShapes/btCollisionShape.h
@@ -24,7 +24,7 @@ class btSerializer;
 
 
 ///The btCollisionShape class provides an interface for collision shapes that can be shared among btCollisionObjects.
-class btCollisionShape
+ATTRIBUTE_ALIGNED16(class) btCollisionShape
 {
 protected:
 	int m_shapeType;
@@ -32,6 +32,8 @@ protected:
 
 public:
 
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+
 	btCollisionShape() : m_shapeType (INVALID_SHAPE_PROXYTYPE), m_userPointer(0)
 	{
 	}
diff --git a/src/BulletCollision/CollisionShapes/btCompoundShape.cpp b/src/BulletCollision/CollisionShapes/btCompoundShape.cpp
index 4eb860c57..12f422f19 100644
--- a/src/BulletCollision/CollisionShapes/btCompoundShape.cpp
+++ b/src/BulletCollision/CollisionShapes/btCompoundShape.cpp
@@ -182,9 +182,7 @@ void btCompoundShape::getAabb(const btTransform& trans,btVector3& aabbMin,btVect
 
 	btVector3 center = trans(localCenter);
 
-	btVector3 extent = btVector3(abs_b[0].dot(localHalfExtents),
-		abs_b[1].dot(localHalfExtents),
-		abs_b[2].dot(localHalfExtents));
+    btVector3 extent = localHalfExtents.dot3(abs_b[0], abs_b[1], abs_b[2]);
 	aabbMin = center-extent;
 	aabbMax = center+extent;
 	
diff --git a/src/BulletCollision/CollisionShapes/btConcaveShape.h b/src/BulletCollision/CollisionShapes/btConcaveShape.h
index 2a03241c9..2917cc5b6 100644
--- a/src/BulletCollision/CollisionShapes/btConcaveShape.h
+++ b/src/BulletCollision/CollisionShapes/btConcaveShape.h
@@ -33,12 +33,14 @@ typedef enum PHY_ScalarType {
 
 ///The btConcaveShape class provides an interface for non-moving (static) concave shapes.
 ///It has been implemented by the btStaticPlaneShape, btBvhTriangleMeshShape and btHeightfieldTerrainShape.
-class btConcaveShape : public btCollisionShape
+ATTRIBUTE_ALIGNED16(class) btConcaveShape : public btCollisionShape
 {
 protected:
 	btScalar m_collisionMargin;
 
 public:
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+	
 	btConcaveShape();
 
 	virtual ~btConcaveShape();
diff --git a/src/BulletCollision/CollisionShapes/btConeShape.h b/src/BulletCollision/CollisionShapes/btConeShape.h
index b69b5c5b0..8bf78201d 100644
--- a/src/BulletCollision/CollisionShapes/btConeShape.h
+++ b/src/BulletCollision/CollisionShapes/btConeShape.h
@@ -20,7 +20,7 @@ subject to the following restrictions:
 #include "BulletCollision/BroadphaseCollision/btBroadphaseProxy.h" // for the types
 
 ///The btConeShape implements a cone shape primitive, centered around the origin and aligned with the Y axis. The btConeShapeX is aligned around the X axis and btConeShapeZ around the Z axis.
-class btConeShape : public btConvexInternalShape
+ATTRIBUTE_ALIGNED16(class) btConeShape : public btConvexInternalShape
 
 {
 
@@ -32,6 +32,8 @@ class btConeShape : public btConvexInternalShape
 
 
 public:
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+	
 	btConeShape (btScalar radius,btScalar height);
 	
 	virtual btVector3	localGetSupportingVertex(const btVector3& vec) const;
diff --git a/src/BulletCollision/CollisionShapes/btConvex2dShape.h b/src/BulletCollision/CollisionShapes/btConvex2dShape.h
index caf54329d..bbd1caf42 100644
--- a/src/BulletCollision/CollisionShapes/btConvex2dShape.h
+++ b/src/BulletCollision/CollisionShapes/btConvex2dShape.h
@@ -21,12 +21,14 @@ subject to the following restrictions:
 
 ///The btConvex2dShape allows to use arbitrary convex shapes as 2d convex shapes, with the Z component assumed to be 0.
 ///For 2d boxes, the btBox2dShape is recommended.
-class btConvex2dShape : public btConvexShape
+ATTRIBUTE_ALIGNED16(class) btConvex2dShape : public btConvexShape
 {
 	btConvexShape*	m_childConvexShape;
 
 	public:
 	
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+	
 	btConvex2dShape(	btConvexShape* convexChildShape);
 	
 	virtual ~btConvex2dShape();
diff --git a/src/BulletCollision/CollisionShapes/btConvexHullShape.cpp b/src/BulletCollision/CollisionShapes/btConvexHullShape.cpp
index 226245979..21351986a 100644
--- a/src/BulletCollision/CollisionShapes/btConvexHullShape.cpp
+++ b/src/BulletCollision/CollisionShapes/btConvexHullShape.cpp
@@ -55,20 +55,17 @@ void btConvexHullShape::addPoint(const btVector3& point)
 btVector3	btConvexHullShape::localGetSupportingVertexWithoutMargin(const btVector3& vec)const
 {
 	btVector3 supVec(btScalar(0.),btScalar(0.),btScalar(0.));
-	btScalar newDot,maxDot = btScalar(-BT_LARGE_FLOAT);
+	btScalar maxDot = btScalar(-BT_LARGE_FLOAT);
 
-	for (int i=0;i<m_unscaledPoints.size();i++)
-	{
-		btVector3 vtx = m_unscaledPoints[i] * m_localScaling;
+    // Here we take advantage of dot(a, b*c) = dot(a*b, c).  Note: This is true mathematically, but not numerically. 
+    if( 0 < m_unscaledPoints.size() )
+    {
+        btVector3 scaled = vec * m_localScaling;
+        int index = (int) scaled.maxDot( &m_unscaledPoints[0], m_unscaledPoints.size(), maxDot); // FIXME: may violate encapsulation of m_unscaledPoints
+        return m_unscaledPoints[index] * m_localScaling;
+    }
 
-		newDot = vec.dot(vtx);
-		if (newDot > maxDot)
-		{
-			maxDot = newDot;
-			supVec = vtx;
-		}
-	}
-	return supVec;
+    return supVec;
 }
 
 void	btConvexHullShape::batchedUnitVectorGetSupportingVertexWithoutMargin(const btVector3* vectors,btVector3* supportVerticesOut,int numVectors) const
@@ -81,23 +78,19 @@ void	btConvexHullShape::batchedUnitVectorGetSupportingVertexWithoutMargin(const
 			supportVerticesOut[i][3] = btScalar(-BT_LARGE_FLOAT);
 		}
 	}
-	for (int i=0;i<m_unscaledPoints.size();i++)
-	{
-		btVector3 vtx = getScaledPoint(i);
 
-		for (int j=0;j<numVectors;j++)
-		{
-			const btVector3& vec = vectors[j];
-			
-			newDot = vec.dot(vtx);
-			if (newDot > supportVerticesOut[j][3])
-			{
-				//WARNING: don't swap next lines, the w component would get overwritten!
-				supportVerticesOut[j] = vtx;
-				supportVerticesOut[j][3] = newDot;
-			}
-		}
-	}
+    for (int j=0;j<numVectors;j++)
+    {
+        btVector3 vec = vectors[j] * m_localScaling;        // dot(a*b,c) = dot(a,b*c)
+        if( 0 <  m_unscaledPoints.size() )
+        {
+            int i = (int) vec.maxDot( &m_unscaledPoints[0], m_unscaledPoints.size(), newDot);
+            supportVerticesOut[j] = getScaledPoint(i);
+            supportVerticesOut[j][3] = newDot;        
+        }
+        else
+            supportVerticesOut[j][3] = -BT_LARGE_FLOAT;
+    }
 
 
 
diff --git a/src/BulletCollision/CollisionShapes/btConvexInternalShape.h b/src/BulletCollision/CollisionShapes/btConvexInternalShape.h
index 85cd9ef90..37e04f5fc 100644
--- a/src/BulletCollision/CollisionShapes/btConvexInternalShape.h
+++ b/src/BulletCollision/CollisionShapes/btConvexInternalShape.h
@@ -26,7 +26,7 @@ subject to the following restrictions:
 ///Note that when creating small shapes (derived from btConvexInternalShape), 
 ///you need to make sure to set a smaller collision margin, using the 'setMargin' API
 ///There is a automatic mechanism 'setSafeMargin' used by btBoxShape and btCylinderShape
-class btConvexInternalShape : public btConvexShape
+ATTRIBUTE_ALIGNED16(class) btConvexInternalShape : public btConvexShape
 {
 
 	protected:
@@ -44,7 +44,7 @@ class btConvexInternalShape : public btConvexShape
 
 public:
 
-	
+	BT_DECLARE_ALIGNED_ALLOCATOR();
 
 	virtual ~btConvexInternalShape()
 	{
diff --git a/src/BulletCollision/CollisionShapes/btConvexPointCloudShape.cpp b/src/BulletCollision/CollisionShapes/btConvexPointCloudShape.cpp
index c1b155aef..d45911519 100644
--- a/src/BulletCollision/CollisionShapes/btConvexPointCloudShape.cpp
+++ b/src/BulletCollision/CollisionShapes/btConvexPointCloudShape.cpp
@@ -28,7 +28,7 @@ void btConvexPointCloudShape::setLocalScaling(const btVector3& scaling)
 btVector3	btConvexPointCloudShape::localGetSupportingVertexWithoutMargin(const btVector3& vec0)const
 {
 	btVector3 supVec(btScalar(0.),btScalar(0.),btScalar(0.));
-	btScalar newDot,maxDot = btScalar(-BT_LARGE_FLOAT);
+	btScalar maxDot = btScalar(-BT_LARGE_FLOAT);
 
 	btVector3 vec = vec0;
 	btScalar lenSqr = vec.length2();
@@ -40,51 +40,33 @@ btVector3	btConvexPointCloudShape::localGetSupportingVertexWithoutMargin(const b
 		btScalar rlen = btScalar(1.) / btSqrt(lenSqr );
 		vec *= rlen;
 	}
+    
+    if( m_numPoints > 0 )
+    {
+        // Here we take advantage of dot(a*b, c) = dot( a, b*c) to do less work. Note this transformation is true mathematically, not numerically.
+        btVector3 scaled = vec * m_localScaling;
+        int index = (int) vec.maxDot( &m_unscaledPoints[0], m_numPoints, maxDot);   //FIXME: may violate encapsulation of m_unscaledPoints
+        return getScaledPoint(index);
+    }
 
-
-	for (int i=0;i<m_numPoints;i++)
-	{
-		btVector3 vtx = getScaledPoint(i);
-
-		newDot = vec.dot(vtx);
-		if (newDot > maxDot)
-		{
-			maxDot = newDot;
-			supVec = vtx;
-		}
-	}
 	return supVec;
 }
 
 void	btConvexPointCloudShape::batchedUnitVectorGetSupportingVertexWithoutMargin(const btVector3* vectors,btVector3* supportVerticesOut,int numVectors) const
 {
-	btScalar newDot;
-	//use 'w' component of supportVerticesOut?
-	{
-		for (int i=0;i<numVectors;i++)
-		{
-			supportVerticesOut[i][3] = btScalar(-BT_LARGE_FLOAT);
-		}
-	}
-	for (int i=0;i<m_numPoints;i++)
-	{
-		btVector3 vtx = getScaledPoint(i);
-
-		for (int j=0;j<numVectors;j++)
-		{
-			const btVector3& vec = vectors[j];
-			
-			newDot = vec.dot(vtx);
-			if (newDot > supportVerticesOut[j][3])
-			{
-				//WARNING: don't swap next lines, the w component would get overwritten!
-				supportVerticesOut[j] = vtx;
-				supportVerticesOut[j][3] = newDot;
-			}
-		}
-	}
-
-
+    for( int j = 0; j < numVectors; j++ )
+    {
+        const btVector3& vec = vectors[j] * m_localScaling;  // dot( a*c, b) = dot(a, b*c)
+        btScalar maxDot;
+        int index = (int) vec.maxDot( &m_unscaledPoints[0], m_numPoints, maxDot);
+        supportVerticesOut[j][3] = btScalar(-BT_LARGE_FLOAT);
+        if( 0 <= index )
+        {
+            //WARNING: don't swap next lines, the w component would get overwritten!
+            supportVerticesOut[j] = getScaledPoint(index);
+            supportVerticesOut[j][3] = maxDot;
+        }
+    }
 
 }
 	
diff --git a/src/BulletCollision/CollisionShapes/btConvexPolyhedron.cpp b/src/BulletCollision/CollisionShapes/btConvexPolyhedron.cpp
index c8fe9492d..112ef7cbe 100644
--- a/src/BulletCollision/CollisionShapes/btConvexPolyhedron.cpp
+++ b/src/BulletCollision/CollisionShapes/btConvexPolyhedron.cpp
@@ -1,296 +1,296 @@
-/*
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2011 Advanced Micro Devices, Inc.  http://bulletphysics.org
-
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-*/
-
-
-///This file was written by Erwin Coumans
-///Separating axis rest based on work from Pierre Terdiman, see
-///And contact clipping based on work from Simon Hobbs
-
-#include "btConvexPolyhedron.h"
-#include "LinearMath/btHashMap.h"
-
-btConvexPolyhedron::btConvexPolyhedron()
-{
-
-}
-btConvexPolyhedron::~btConvexPolyhedron()
-{
-
-}
-
-
-inline bool IsAlmostZero(const btVector3& v)
-{
-	if(fabsf(v.x())>1e-6 || fabsf(v.y())>1e-6 || fabsf(v.z())>1e-6)	return false;
-	return true;
-}
-
-struct btInternalVertexPair
-{
-	btInternalVertexPair(short int v0,short int v1)
-		:m_v0(v0),
-		m_v1(v1)
-	{
-		if (m_v1>m_v0)
-			btSwap(m_v0,m_v1);
-	}
-	short int m_v0;
-	short int m_v1;
-	int getHash() const
-	{
-		return m_v0+(m_v1<<16);
-	}
-	bool equals(const btInternalVertexPair& other) const
-	{
-		return m_v0==other.m_v0 && m_v1==other.m_v1;
-	}
-};
-
-struct btInternalEdge
-{
-	btInternalEdge()
-		:m_face0(-1),
-		m_face1(-1)
-	{
-	}
-	short int m_face0;
-	short int m_face1;
-};
-
-//
-
-#ifdef TEST_INTERNAL_OBJECTS
-bool btConvexPolyhedron::testContainment() const
-{
-	for(int p=0;p<8;p++)
-	{
-		btVector3 LocalPt;
-		if(p==0)		LocalPt = m_localCenter + btVector3(m_extents[0], m_extents[1], m_extents[2]);
-		else if(p==1)	LocalPt = m_localCenter + btVector3(m_extents[0], m_extents[1], -m_extents[2]);
-		else if(p==2)	LocalPt = m_localCenter + btVector3(m_extents[0], -m_extents[1], m_extents[2]);
-		else if(p==3)	LocalPt = m_localCenter + btVector3(m_extents[0], -m_extents[1], -m_extents[2]);
-		else if(p==4)	LocalPt = m_localCenter + btVector3(-m_extents[0], m_extents[1], m_extents[2]);
-		else if(p==5)	LocalPt = m_localCenter + btVector3(-m_extents[0], m_extents[1], -m_extents[2]);
-		else if(p==6)	LocalPt = m_localCenter + btVector3(-m_extents[0], -m_extents[1], m_extents[2]);
-		else if(p==7)	LocalPt = m_localCenter + btVector3(-m_extents[0], -m_extents[1], -m_extents[2]);
-
-		for(int i=0;i<m_faces.size();i++)
-		{
-			const btVector3 Normal(m_faces[i].m_plane[0], m_faces[i].m_plane[1], m_faces[i].m_plane[2]);
-			const btScalar d = LocalPt.dot(Normal) + m_faces[i].m_plane[3];
-			if(d>0.0f)
-				return false;
-		}
-	}
-	return true;
-}
-#endif
-
-void	btConvexPolyhedron::initialize()
-{
-
-	btHashMap<btInternalVertexPair,btInternalEdge> edges;
-
-	btScalar TotalArea = 0.0f;
-	
-	m_localCenter.setValue(0, 0, 0);
-	for(int i=0;i<m_faces.size();i++)
-	{
-		int numVertices = m_faces[i].m_indices.size();
-		int NbTris = numVertices;
-		for(int j=0;j<NbTris;j++)
-		{
-			int k = (j+1)%numVertices;
-			btInternalVertexPair vp(m_faces[i].m_indices[j],m_faces[i].m_indices[k]);
-			btInternalEdge* edptr = edges.find(vp);
-			btVector3 edge = m_vertices[vp.m_v1]-m_vertices[vp.m_v0];
-			edge.normalize();
-
-			bool found = false;
-
-			for (int p=0;p<m_uniqueEdges.size();p++)
-			{
-				
-				if (IsAlmostZero(m_uniqueEdges[p]-edge) || 
-					IsAlmostZero(m_uniqueEdges[p]+edge))
-				{
-					found = true;
-					break;
-				}
-			}
-
-			if (!found)
-			{
-				m_uniqueEdges.push_back(edge);
-			}
-
-			if (edptr)
-			{
-				btAssert(edptr->m_face0>=0);
-				btAssert(edptr->m_face1<0);
-				edptr->m_face1 = i;
-			} else
-			{
-				btInternalEdge ed;
-				ed.m_face0 = i;
-				edges.insert(vp,ed);
-			}
-		}
-	}
-
-#ifdef USE_CONNECTED_FACES
-	for(int i=0;i<m_faces.size();i++)
-	{
-		int numVertices = m_faces[i].m_indices.size();
-		m_faces[i].m_connectedFaces.resize(numVertices);
-
-		for(int j=0;j<numVertices;j++)
-		{
-			int k = (j+1)%numVertices;
-			btInternalVertexPair vp(m_faces[i].m_indices[j],m_faces[i].m_indices[k]);
-			btInternalEdge* edptr = edges.find(vp);
-			btAssert(edptr);
-			btAssert(edptr->m_face0>=0);
-			btAssert(edptr->m_face1>=0);
-
-			int connectedFace = (edptr->m_face0==i)?edptr->m_face1:edptr->m_face0;
-			m_faces[i].m_connectedFaces[j] = connectedFace;
-		}
-	}
-#endif//USE_CONNECTED_FACES
-
-	for(int i=0;i<m_faces.size();i++)
-	{
-		int numVertices = m_faces[i].m_indices.size();
-		int NbTris = numVertices-2;
-		
-		const btVector3& p0 = m_vertices[m_faces[i].m_indices[0]];
-		for(int j=1;j<=NbTris;j++)
-		{
-			int k = (j+1)%numVertices;
-			const btVector3& p1 = m_vertices[m_faces[i].m_indices[j]];
-			const btVector3& p2 = m_vertices[m_faces[i].m_indices[k]];
-			btScalar Area = ((p0 - p1).cross(p0 - p2)).length() * 0.5f;
-			btVector3 Center = (p0+p1+p2)/3.0f;
-			m_localCenter += Area * Center;
-			TotalArea += Area;
-		}
-	}
-	m_localCenter /= TotalArea;
-
-
-
-
-#ifdef TEST_INTERNAL_OBJECTS
-	if(1)
-	{
-		m_radius = FLT_MAX;
-		for(int i=0;i<m_faces.size();i++)
-		{
-			const btVector3 Normal(m_faces[i].m_plane[0], m_faces[i].m_plane[1], m_faces[i].m_plane[2]);
-			const btScalar dist = btFabs(m_localCenter.dot(Normal) + m_faces[i].m_plane[3]);
-			if(dist<m_radius)
-				m_radius = dist;
-		}
-
-	
-		btScalar MinX = FLT_MAX;
-		btScalar MinY = FLT_MAX;
-		btScalar MinZ = FLT_MAX;
-		btScalar MaxX = -FLT_MAX;
-		btScalar MaxY = -FLT_MAX;
-		btScalar MaxZ = -FLT_MAX;
-		for(int i=0; i<m_vertices.size(); i++)
-		{
-			const btVector3& pt = m_vertices[i];
-			if(pt.x()<MinX)	MinX = pt.x();
-			if(pt.x()>MaxX)	MaxX = pt.x();
-			if(pt.y()<MinY)	MinY = pt.y();
-			if(pt.y()>MaxY)	MaxY = pt.y();
-			if(pt.z()<MinZ)	MinZ = pt.z();
-			if(pt.z()>MaxZ)	MaxZ = pt.z();
-		}
-		mC.setValue(MaxX+MinX, MaxY+MinY, MaxZ+MinZ);
-		mE.setValue(MaxX-MinX, MaxY-MinY, MaxZ-MinZ);
-
-
-
-//		const btScalar r = m_radius / sqrtf(2.0f);
-		const btScalar r = m_radius / sqrtf(3.0f);
-		const int LargestExtent = mE.maxAxis();
-		const btScalar Step = (mE[LargestExtent]*0.5f - r)/1024.0f;
-		m_extents[0] = m_extents[1] = m_extents[2] = r;
-		m_extents[LargestExtent] = mE[LargestExtent]*0.5f;
-		bool FoundBox = false;
-		for(int j=0;j<1024;j++)
-		{
-			if(testContainment())
-			{
-				FoundBox = true;
-				break;
-			}
-
-			m_extents[LargestExtent] -= Step;
-		}
-		if(!FoundBox)
-		{
-			m_extents[0] = m_extents[1] = m_extents[2] = r;
-		}
-		else
-		{
-			// Refine the box
-			const btScalar Step = (m_radius - r)/1024.0f;
-			const int e0 = (1<<LargestExtent) & 3;
-			const int e1 = (1<<e0) & 3;
-
-			for(int j=0;j<1024;j++)
-			{
-				const btScalar Saved0 = m_extents[e0];
-				const btScalar Saved1 = m_extents[e1];
-				m_extents[e0] += Step;
-				m_extents[e1] += Step;
-
-				if(!testContainment())
-				{
-					m_extents[e0] = Saved0;
-					m_extents[e1] = Saved1;
-					break;
-				}
-			}
-		}
-	}
-#endif
-}
-
-
-void btConvexPolyhedron::project(const btTransform& trans, const btVector3& dir, btScalar& min, btScalar& max) const
-{
-	min = FLT_MAX;
-	max = -FLT_MAX;
-	int numVerts = m_vertices.size();
-	for(int i=0;i<numVerts;i++)
-	{
-		btVector3 pt = trans * m_vertices[i];
-		btScalar dp = pt.dot(dir);
-		if(dp < min)	min = dp;
-		if(dp > max)	max = dp;
-	}
-	if(min>max)
-	{
-		btScalar tmp = min;
-		min = max;
-		max = tmp;
-	}
-}
\ No newline at end of file
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2011 Advanced Micro Devices, Inc.  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+///This file was written by Erwin Coumans
+///Separating axis rest based on work from Pierre Terdiman, see
+///And contact clipping based on work from Simon Hobbs
+
+#include "btConvexPolyhedron.h"
+#include "LinearMath/btHashMap.h"
+
+btConvexPolyhedron::btConvexPolyhedron()
+{
+
+}
+btConvexPolyhedron::~btConvexPolyhedron()
+{
+
+}
+
+
+inline bool IsAlmostZero(const btVector3& v)
+{
+	if(fabsf(v.x())>1e-6 || fabsf(v.y())>1e-6 || fabsf(v.z())>1e-6)	return false;
+	return true;
+}
+
+struct btInternalVertexPair
+{
+	btInternalVertexPair(short int v0,short int v1)
+		:m_v0(v0),
+		m_v1(v1)
+	{
+		if (m_v1>m_v0)
+			btSwap(m_v0,m_v1);
+	}
+	short int m_v0;
+	short int m_v1;
+	int getHash() const
+	{
+		return m_v0+(m_v1<<16);
+	}
+	bool equals(const btInternalVertexPair& other) const
+	{
+		return m_v0==other.m_v0 && m_v1==other.m_v1;
+	}
+};
+
+struct btInternalEdge
+{
+	btInternalEdge()
+		:m_face0(-1),
+		m_face1(-1)
+	{
+	}
+	short int m_face0;
+	short int m_face1;
+};
+
+//
+
+#ifdef TEST_INTERNAL_OBJECTS
+bool btConvexPolyhedron::testContainment() const
+{
+	for(int p=0;p<8;p++)
+	{
+		btVector3 LocalPt;
+		if(p==0)		LocalPt = m_localCenter + btVector3(m_extents[0], m_extents[1], m_extents[2]);
+		else if(p==1)	LocalPt = m_localCenter + btVector3(m_extents[0], m_extents[1], -m_extents[2]);
+		else if(p==2)	LocalPt = m_localCenter + btVector3(m_extents[0], -m_extents[1], m_extents[2]);
+		else if(p==3)	LocalPt = m_localCenter + btVector3(m_extents[0], -m_extents[1], -m_extents[2]);
+		else if(p==4)	LocalPt = m_localCenter + btVector3(-m_extents[0], m_extents[1], m_extents[2]);
+		else if(p==5)	LocalPt = m_localCenter + btVector3(-m_extents[0], m_extents[1], -m_extents[2]);
+		else if(p==6)	LocalPt = m_localCenter + btVector3(-m_extents[0], -m_extents[1], m_extents[2]);
+		else if(p==7)	LocalPt = m_localCenter + btVector3(-m_extents[0], -m_extents[1], -m_extents[2]);
+
+		for(int i=0;i<m_faces.size();i++)
+		{
+			const btVector3 Normal(m_faces[i].m_plane[0], m_faces[i].m_plane[1], m_faces[i].m_plane[2]);
+			const btScalar d = LocalPt.dot(Normal) + m_faces[i].m_plane[3];
+			if(d>0.0f)
+				return false;
+		}
+	}
+	return true;
+}
+#endif
+
+void	btConvexPolyhedron::initialize()
+{
+
+	btHashMap<btInternalVertexPair,btInternalEdge> edges;
+
+	btScalar TotalArea = 0.0f;
+	
+	m_localCenter.setValue(0, 0, 0);
+	for(int i=0;i<m_faces.size();i++)
+	{
+		int numVertices = m_faces[i].m_indices.size();
+		int NbTris = numVertices;
+		for(int j=0;j<NbTris;j++)
+		{
+			int k = (j+1)%numVertices;
+			btInternalVertexPair vp(m_faces[i].m_indices[j],m_faces[i].m_indices[k]);
+			btInternalEdge* edptr = edges.find(vp);
+			btVector3 edge = m_vertices[vp.m_v1]-m_vertices[vp.m_v0];
+			edge.normalize();
+
+			bool found = false;
+
+			for (int p=0;p<m_uniqueEdges.size();p++)
+			{
+				
+				if (IsAlmostZero(m_uniqueEdges[p]-edge) || 
+					IsAlmostZero(m_uniqueEdges[p]+edge))
+				{
+					found = true;
+					break;
+				}
+			}
+
+			if (!found)
+			{
+				m_uniqueEdges.push_back(edge);
+			}
+
+			if (edptr)
+			{
+				btAssert(edptr->m_face0>=0);
+				btAssert(edptr->m_face1<0);
+				edptr->m_face1 = i;
+			} else
+			{
+				btInternalEdge ed;
+				ed.m_face0 = i;
+				edges.insert(vp,ed);
+			}
+		}
+	}
+
+#ifdef USE_CONNECTED_FACES
+	for(int i=0;i<m_faces.size();i++)
+	{
+		int numVertices = m_faces[i].m_indices.size();
+		m_faces[i].m_connectedFaces.resize(numVertices);
+
+		for(int j=0;j<numVertices;j++)
+		{
+			int k = (j+1)%numVertices;
+			btInternalVertexPair vp(m_faces[i].m_indices[j],m_faces[i].m_indices[k]);
+			btInternalEdge* edptr = edges.find(vp);
+			btAssert(edptr);
+			btAssert(edptr->m_face0>=0);
+			btAssert(edptr->m_face1>=0);
+
+			int connectedFace = (edptr->m_face0==i)?edptr->m_face1:edptr->m_face0;
+			m_faces[i].m_connectedFaces[j] = connectedFace;
+		}
+	}
+#endif//USE_CONNECTED_FACES
+
+	for(int i=0;i<m_faces.size();i++)
+	{
+		int numVertices = m_faces[i].m_indices.size();
+		int NbTris = numVertices-2;
+		
+		const btVector3& p0 = m_vertices[m_faces[i].m_indices[0]];
+		for(int j=1;j<=NbTris;j++)
+		{
+			int k = (j+1)%numVertices;
+			const btVector3& p1 = m_vertices[m_faces[i].m_indices[j]];
+			const btVector3& p2 = m_vertices[m_faces[i].m_indices[k]];
+			btScalar Area = ((p0 - p1).cross(p0 - p2)).length() * 0.5f;
+			btVector3 Center = (p0+p1+p2)/3.0f;
+			m_localCenter += Area * Center;
+			TotalArea += Area;
+		}
+	}
+	m_localCenter /= TotalArea;
+
+
+
+
+#ifdef TEST_INTERNAL_OBJECTS
+	if(1)
+	{
+		m_radius = FLT_MAX;
+		for(int i=0;i<m_faces.size();i++)
+		{
+			const btVector3 Normal(m_faces[i].m_plane[0], m_faces[i].m_plane[1], m_faces[i].m_plane[2]);
+			const btScalar dist = btFabs(m_localCenter.dot(Normal) + m_faces[i].m_plane[3]);
+			if(dist<m_radius)
+				m_radius = dist;
+		}
+
+	
+		btScalar MinX = FLT_MAX;
+		btScalar MinY = FLT_MAX;
+		btScalar MinZ = FLT_MAX;
+		btScalar MaxX = -FLT_MAX;
+		btScalar MaxY = -FLT_MAX;
+		btScalar MaxZ = -FLT_MAX;
+		for(int i=0; i<m_vertices.size(); i++)
+		{
+			const btVector3& pt = m_vertices[i];
+			if(pt.x()<MinX)	MinX = pt.x();
+			if(pt.x()>MaxX)	MaxX = pt.x();
+			if(pt.y()<MinY)	MinY = pt.y();
+			if(pt.y()>MaxY)	MaxY = pt.y();
+			if(pt.z()<MinZ)	MinZ = pt.z();
+			if(pt.z()>MaxZ)	MaxZ = pt.z();
+		}
+		mC.setValue(MaxX+MinX, MaxY+MinY, MaxZ+MinZ);
+		mE.setValue(MaxX-MinX, MaxY-MinY, MaxZ-MinZ);
+
+
+
+//		const btScalar r = m_radius / sqrtf(2.0f);
+		const btScalar r = m_radius / sqrtf(3.0f);
+		const int LargestExtent = mE.maxAxis();
+		const btScalar Step = (mE[LargestExtent]*0.5f - r)/1024.0f;
+		m_extents[0] = m_extents[1] = m_extents[2] = r;
+		m_extents[LargestExtent] = mE[LargestExtent]*0.5f;
+		bool FoundBox = false;
+		for(int j=0;j<1024;j++)
+		{
+			if(testContainment())
+			{
+				FoundBox = true;
+				break;
+			}
+
+			m_extents[LargestExtent] -= Step;
+		}
+		if(!FoundBox)
+		{
+			m_extents[0] = m_extents[1] = m_extents[2] = r;
+		}
+		else
+		{
+			// Refine the box
+			const btScalar Step = (m_radius - r)/1024.0f;
+			const int e0 = (1<<LargestExtent) & 3;
+			const int e1 = (1<<e0) & 3;
+
+			for(int j=0;j<1024;j++)
+			{
+				const btScalar Saved0 = m_extents[e0];
+				const btScalar Saved1 = m_extents[e1];
+				m_extents[e0] += Step;
+				m_extents[e1] += Step;
+
+				if(!testContainment())
+				{
+					m_extents[e0] = Saved0;
+					m_extents[e1] = Saved1;
+					break;
+				}
+			}
+		}
+	}
+#endif
+}
+
+
+void btConvexPolyhedron::project(const btTransform& trans, const btVector3& dir, btScalar& min, btScalar& max) const
+{
+	min = FLT_MAX;
+	max = -FLT_MAX;
+	int numVerts = m_vertices.size();
+	for(int i=0;i<numVerts;i++)
+	{
+		btVector3 pt = trans * m_vertices[i];
+		btScalar dp = pt.dot(dir);
+		if(dp < min)	min = dp;
+		if(dp > max)	max = dp;
+	}
+	if(min>max)
+	{
+		btScalar tmp = min;
+		min = max;
+		max = tmp;
+	}
+}
diff --git a/src/BulletCollision/CollisionShapes/btConvexPolyhedron.h b/src/BulletCollision/CollisionShapes/btConvexPolyhedron.h
index 4386cddf9..f10c83758 100644
--- a/src/BulletCollision/CollisionShapes/btConvexPolyhedron.h
+++ b/src/BulletCollision/CollisionShapes/btConvexPolyhedron.h
@@ -1,62 +1,65 @@
-/*
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2011 Advanced Micro Devices, Inc.  http://bulletphysics.org
-
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-*/
-
-
-///This file was written by Erwin Coumans
-
-
-#ifndef _BT_POLYHEDRAL_FEATURES_H
-#define _BT_POLYHEDRAL_FEATURES_H
-
-#include "LinearMath/btTransform.h"
-#include "LinearMath/btAlignedObjectArray.h"
-
-#define TEST_INTERNAL_OBJECTS 1
-
-
-struct btFace
-{
-	btAlignedObjectArray<int>	m_indices;
-//	btAlignedObjectArray<int>	m_connectedFaces;
-	btScalar	m_plane[4];
-};
-
-
-class btConvexPolyhedron
-{
-	public:
-	btConvexPolyhedron();
-	virtual	~btConvexPolyhedron();
-
-	btAlignedObjectArray<btVector3>	m_vertices;
-	btAlignedObjectArray<btFace>	m_faces;
-	btAlignedObjectArray<btVector3> m_uniqueEdges;
-
-	btVector3		m_localCenter;
-	btVector3		m_extents;
-	btScalar		m_radius;
-	btVector3		mC;
-	btVector3		mE;
-
-	void	initialize();
-	bool testContainment() const;
-
-	void project(const btTransform& trans, const btVector3& dir, btScalar& min, btScalar& max) const;
-};
-
-	
-#endif //_BT_POLYHEDRAL_FEATURES_H
-
-
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2011 Advanced Micro Devices, Inc.  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+///This file was written by Erwin Coumans
+
+
+#ifndef _BT_POLYHEDRAL_FEATURES_H
+#define _BT_POLYHEDRAL_FEATURES_H
+
+#include "LinearMath/btTransform.h"
+#include "LinearMath/btAlignedObjectArray.h"
+
+#define TEST_INTERNAL_OBJECTS 1
+
+
+struct btFace
+{
+	btAlignedObjectArray<int>	m_indices;
+//	btAlignedObjectArray<int>	m_connectedFaces;
+	btScalar	m_plane[4];
+};
+
+
+ATTRIBUTE_ALIGNED16(class) btConvexPolyhedron
+{
+	public:
+		
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+		
+	btConvexPolyhedron();
+	virtual	~btConvexPolyhedron();
+
+	btAlignedObjectArray<btVector3>	m_vertices;
+	btAlignedObjectArray<btFace>	m_faces;
+	btAlignedObjectArray<btVector3> m_uniqueEdges;
+
+	btVector3		m_localCenter;
+	btVector3		m_extents;
+	btScalar		m_radius;
+	btVector3		mC;
+	btVector3		mE;
+
+	void	initialize();
+	bool testContainment() const;
+
+	void project(const btTransform& trans, const btVector3& dir, btScalar& min, btScalar& max) const;
+};
+
+	
+#endif //_BT_POLYHEDRAL_FEATURES_H
+
+
diff --git a/src/BulletCollision/CollisionShapes/btConvexShape.cpp b/src/BulletCollision/CollisionShapes/btConvexShape.cpp
index 8c67d8ebe..3ffa42228 100644
--- a/src/BulletCollision/CollisionShapes/btConvexShape.cpp
+++ b/src/BulletCollision/CollisionShapes/btConvexShape.cpp
@@ -109,19 +109,8 @@ static btVector3 convexHullSupport (const btVector3& localDirOrg, const btVector
 	return supVec;
 #else
 
-	btScalar newDot,maxDot = btScalar(-BT_LARGE_FLOAT);
-	int ptIndex = -1;
-
-	for (int i=0;i<numPoints;i++)
-	{
-
-		newDot = vec.dot(points[i]);
-		if (newDot > maxDot)
-		{
-			maxDot = newDot;
-			ptIndex = i;
-		}
-	}
+    btScalar maxDot;
+    long ptIndex = vec.maxDot( points, numPoints, maxDot);
 	btAssert(ptIndex >= 0);
 	btVector3 supVec = points[ptIndex] * localScaling;
 	return supVec;
@@ -141,16 +130,26 @@ btVector3 btConvexShape::localGetSupportVertexWithoutMarginNonVirtual (const btV
 		btBoxShape* convexShape = (btBoxShape*)this;
 		const btVector3& halfExtents = convexShape->getImplicitShapeDimensions();
 
+#if defined( __APPLE__ ) && (defined( BT_USE_SSE )||defined( BT_USE_NEON ))
+    #if defined( BT_USE_SSE )
+            return btVector3( _mm_xor_ps( _mm_and_ps( localDir.mVec128, (__m128){-0.0f, -0.0f, -0.0f, -0.0f }), halfExtents.mVec128 ));
+    #elif defined( BT_USE_NEON )
+            return btVector3( (float32x4_t) (((uint32x4_t) localDir.mVec128 & (uint32x4_t){ 0x80000000, 0x80000000, 0x80000000, 0x80000000}) ^ (uint32x4_t) halfExtents.mVec128 ));
+    #else
+        #error unknown vector arch
+    #endif
+#else
 		return btVector3(btFsels(localDir.x(), halfExtents.x(), -halfExtents.x()),
 			btFsels(localDir.y(), halfExtents.y(), -halfExtents.y()),
 			btFsels(localDir.z(), halfExtents.z(), -halfExtents.z()));
+#endif
 	}
 	case TRIANGLE_SHAPE_PROXYTYPE:
 	{
 		btTriangleShape* triangleShape = (btTriangleShape*)this;
 		btVector3 dir(localDir.getX(),localDir.getY(),localDir.getZ());
 		btVector3* vertices = &triangleShape->m_vertices1[0];
-		btVector3 dots(dir.dot(vertices[0]), dir.dot(vertices[1]), dir.dot(vertices[2]));
+        btVector3 dots = dir.dot3(vertices[0], vertices[1], vertices[2]);
 		btVector3 sup = vertices[dots.maxAxis()];
 		return btVector3(sup.getX(),sup.getY(),sup.getZ());
 	}
@@ -383,8 +382,8 @@ void btConvexShape::getAabbNonVirtual (const btTransform& t, btVector3& aabbMin,
 		halfExtents += btVector3(margin,margin,margin);
 		btMatrix3x3 abs_b = t.getBasis().absolute();  
 		btVector3 center = t.getOrigin();
-		btVector3 extent = btVector3(abs_b[0].dot(halfExtents),abs_b[1].dot(halfExtents),abs_b[2].dot(halfExtents));
-		
+        btVector3 extent = halfExtents.dot3(abs_b[0], abs_b[1], abs_b[2]);    
+        
 		aabbMin = center - extent;
 		aabbMax = center + extent;
 		break;
@@ -417,7 +416,7 @@ void btConvexShape::getAabbNonVirtual (const btTransform& t, btVector3& aabbMin,
 		halfExtents += btVector3(capsuleShape->getMarginNonVirtual(),capsuleShape->getMarginNonVirtual(),capsuleShape->getMarginNonVirtual());
 		btMatrix3x3 abs_b = t.getBasis().absolute();  
 		btVector3 center = t.getOrigin();
-		btVector3 extent = btVector3(abs_b[0].dot(halfExtents),abs_b[1].dot(halfExtents),abs_b[2].dot(halfExtents));		  	
+        btVector3 extent = halfExtents.dot3(abs_b[0], abs_b[1], abs_b[2]);    
 		aabbMin = center - extent;
 		aabbMax = center + extent;
 	}
diff --git a/src/BulletCollision/CollisionShapes/btConvexTriangleMeshShape.h b/src/BulletCollision/CollisionShapes/btConvexTriangleMeshShape.h
index af5d00388..f338865ca 100644
--- a/src/BulletCollision/CollisionShapes/btConvexTriangleMeshShape.h
+++ b/src/BulletCollision/CollisionShapes/btConvexTriangleMeshShape.h
@@ -22,12 +22,14 @@ subject to the following restrictions:
 
 /// The btConvexTriangleMeshShape is a convex hull of a triangle mesh, but the performance is not as good as btConvexHullShape.
 /// A small benefit of this class is that it uses the btStridingMeshInterface, so you can avoid the duplication of the triangle mesh data. Nevertheless, most users should use the much better performing btConvexHullShape instead.
-class btConvexTriangleMeshShape : public btPolyhedralConvexAabbCachingShape
+ATTRIBUTE_ALIGNED16(class) btConvexTriangleMeshShape : public btPolyhedralConvexAabbCachingShape
 {
 
 	class btStridingMeshInterface*	m_stridingMesh;
 
 public:
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+	
 	btConvexTriangleMeshShape(btStridingMeshInterface* meshInterface, bool calcAabb = true);
 
 	class btStridingMeshInterface*	getMeshInterface()
diff --git a/src/BulletCollision/CollisionShapes/btCylinderShape.h b/src/BulletCollision/CollisionShapes/btCylinderShape.h
index 125bfc78a..01467d42f 100644
--- a/src/BulletCollision/CollisionShapes/btCylinderShape.h
+++ b/src/BulletCollision/CollisionShapes/btCylinderShape.h
@@ -21,7 +21,7 @@ subject to the following restrictions:
 #include "LinearMath/btVector3.h"
 
 /// The btCylinderShape class implements a cylinder shape primitive, centered around the origin. Its central axis aligned with the Y axis. btCylinderShapeX is aligned with the X axis and btCylinderShapeZ around the Z axis.
-class btCylinderShape : public btConvexInternalShape
+ATTRIBUTE_ALIGNED16(class) btCylinderShape : public btConvexInternalShape
 
 {
 
@@ -31,6 +31,8 @@ protected:
 
 public:
 
+BT_DECLARE_ALIGNED_ALLOCATOR();
+
 	btVector3 getHalfExtentsWithMargin() const
 	{
 		btVector3 halfExtents = getHalfExtentsWithoutMargin();
@@ -128,6 +130,8 @@ public:
 class btCylinderShapeX : public btCylinderShape
 {
 public:
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+	
 	btCylinderShapeX (const btVector3& halfExtents);
 
 	virtual btVector3	localGetSupportingVertexWithoutMargin(const btVector3& vec)const;
@@ -149,6 +153,8 @@ public:
 class btCylinderShapeZ : public btCylinderShape
 {
 public:
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+	
 	btCylinderShapeZ (const btVector3& halfExtents);
 
 	virtual btVector3	localGetSupportingVertexWithoutMargin(const btVector3& vec)const;
diff --git a/src/BulletCollision/CollisionShapes/btEmptyShape.h b/src/BulletCollision/CollisionShapes/btEmptyShape.h
index 87b7b66d1..069a79402 100644
--- a/src/BulletCollision/CollisionShapes/btEmptyShape.h
+++ b/src/BulletCollision/CollisionShapes/btEmptyShape.h
@@ -28,9 +28,11 @@ subject to the following restrictions:
 
 /// The btEmptyShape is a collision shape without actual collision detection shape, so most users should ignore this class.
 /// It can be replaced by another shape during runtime, but the inertia tensor should be recomputed.
-class btEmptyShape	: public btConcaveShape
+ATTRIBUTE_ALIGNED16(class) btEmptyShape	: public btConcaveShape
 {
 public:
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+	
 	btEmptyShape();
 
 	virtual ~btEmptyShape();
diff --git a/src/BulletCollision/CollisionShapes/btHeightfieldTerrainShape.cpp b/src/BulletCollision/CollisionShapes/btHeightfieldTerrainShape.cpp
index 95631c301..5cf79a974 100644
--- a/src/BulletCollision/CollisionShapes/btHeightfieldTerrainShape.cpp
+++ b/src/BulletCollision/CollisionShapes/btHeightfieldTerrainShape.cpp
@@ -38,7 +38,7 @@ btHeightfieldTerrainShape::btHeightfieldTerrainShape(int heightStickWidth, int h
 	// legacy constructor: support only float or unsigned char,
 	// 	and min height is zero
 	PHY_ScalarType hdt = (useFloatData) ? PHY_FLOAT : PHY_UCHAR;
-	btScalar minHeight = 0.0;
+	btScalar minHeight = 0.0f;
 
 	// previously, height = uchar * maxHeight / 65535.
 	// So to preserve legacy behavior, heightScale = maxHeight / 65535
@@ -135,9 +135,7 @@ void btHeightfieldTerrainShape::getAabb(const btTransform& t,btVector3& aabbMin,
 
 	btMatrix3x3 abs_b = t.getBasis().absolute();  
 	btVector3 center = t.getOrigin();
-	btVector3 extent = btVector3(abs_b[0].dot(halfExtents),
-		   abs_b[1].dot(halfExtents),
-		  abs_b[2].dot(halfExtents));
+    btVector3 extent = halfExtents.dot3(abs_b[0], abs_b[1], abs_b[2]);
 	extent += btVector3(getMargin(),getMargin(),getMargin());
 
 	aabbMin = center - extent;
diff --git a/src/BulletCollision/CollisionShapes/btHeightfieldTerrainShape.h b/src/BulletCollision/CollisionShapes/btHeightfieldTerrainShape.h
index 78e231e08..32c803308 100644
--- a/src/BulletCollision/CollisionShapes/btHeightfieldTerrainShape.h
+++ b/src/BulletCollision/CollisionShapes/btHeightfieldTerrainShape.h
@@ -68,7 +68,7 @@ subject to the following restrictions:
 
   For usage and testing see the TerrainDemo.
  */
-class btHeightfieldTerrainShape : public btConcaveShape
+ATTRIBUTE_ALIGNED16(class) btHeightfieldTerrainShape : public btConcaveShape
 {
 protected:
 	btVector3	m_localAabbMin;
@@ -116,6 +116,9 @@ protected:
 	                PHY_ScalarType heightDataType, bool flipQuadEdges);
 
 public:
+	
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+	
 	/// preferred constructor
 	/**
 	  This constructor supports a range of heightfield
diff --git a/src/BulletCollision/CollisionShapes/btMinkowskiSumShape.h b/src/BulletCollision/CollisionShapes/btMinkowskiSumShape.h
index 6c844e8c0..a3f9a4723 100644
--- a/src/BulletCollision/CollisionShapes/btMinkowskiSumShape.h
+++ b/src/BulletCollision/CollisionShapes/btMinkowskiSumShape.h
@@ -20,7 +20,7 @@ subject to the following restrictions:
 #include "BulletCollision/BroadphaseCollision/btBroadphaseProxy.h" // for the types
 
 /// The btMinkowskiSumShape is only for advanced users. This shape represents implicit based minkowski sum of two convex implicit shapes.
-class btMinkowskiSumShape : public btConvexInternalShape
+ATTRIBUTE_ALIGNED16(class) btMinkowskiSumShape : public btConvexInternalShape
 {
 
 	btTransform	m_transA;
@@ -30,6 +30,8 @@ class btMinkowskiSumShape : public btConvexInternalShape
 
 public:
 
+BT_DECLARE_ALIGNED_ALLOCATOR();
+
 	btMinkowskiSumShape(const btConvexShape* shapeA,const btConvexShape* shapeB);
 
 	virtual btVector3	localGetSupportingVertexWithoutMargin(const btVector3& vec)const;
diff --git a/src/BulletCollision/CollisionShapes/btMultiSphereShape.cpp b/src/BulletCollision/CollisionShapes/btMultiSphereShape.cpp
index c996bfcda..5bae24250 100644
--- a/src/BulletCollision/CollisionShapes/btMultiSphereShape.cpp
+++ b/src/BulletCollision/CollisionShapes/btMultiSphereShape.cpp
@@ -39,10 +39,11 @@ btMultiSphereShape::btMultiSphereShape (const btVector3* positions,const btScala
 
 }
 
- 
+#ifndef MIN
+    #define MIN( _a, _b)    ((_a) < (_b) ? (_a) : (_b))
+#endif
  btVector3	btMultiSphereShape::localGetSupportingVertexWithoutMargin(const btVector3& vec0)const
 {
-	int i;
 	btVector3 supVec(0,0,0);
 
 	btScalar maxDot(btScalar(-BT_LARGE_FLOAT));
@@ -66,18 +67,23 @@ btMultiSphereShape::btMultiSphereShape (const btVector3* positions,const btScala
 	const btScalar* rad = &m_radiArray[0];
 	int numSpheres = m_localPositionArray.size();
 
-	for (i=0;i<numSpheres;i++)
-	{
-		vtx = (*pos) +vec*m_localScaling*(*rad) - vec * getMargin();
-		pos++;
-		rad++;
-		newDot = vec.dot(vtx);
-		if (newDot > maxDot)
+    for( int k = 0; k < numSpheres; k+= 128 )
+    {
+        btVector3 temp[128];
+        int inner_count = MIN( numSpheres - k, 128 );
+        for( long i = 0; i < inner_count; i++ )
+        {
+            temp[i] = (*pos) +vec*m_localScaling*(*rad) - vec * getMargin();
+            pos++;
+            rad++;
+        }
+        long i = vec.maxDot( temp, inner_count, newDot);
+        if( newDot > maxDot )
 		{
 			maxDot = newDot;
-			supVec = vtx;
+			supVec = temp[i];
 		}
-	}
+    }
 
 	return supVec;
 
@@ -98,18 +104,25 @@ btMultiSphereShape::btMultiSphereShape (const btVector3* positions,const btScala
 		const btVector3* pos = &m_localPositionArray[0];
 		const btScalar* rad = &m_radiArray[0];
 		int numSpheres = m_localPositionArray.size();
-		for (int i=0;i<numSpheres;i++)
-		{
-			vtx = (*pos) +vec*m_localScaling*(*rad) - vec * getMargin();
-			pos++;
-			rad++;
-			newDot = vec.dot(vtx);
-			if (newDot > maxDot)
-			{
-				maxDot = newDot;
-				supportVerticesOut[j] = vtx;
-			}
-		}
+
+        for( int k = 0; k < numSpheres; k+= 128 )
+        {
+            btVector3 temp[128];
+            int inner_count = MIN( numSpheres - k, 128 );
+            for( long i = 0; i < inner_count; i++ )
+            {
+                temp[i] = (*pos) +vec*m_localScaling*(*rad) - vec * getMargin();
+                pos++;
+                rad++;
+            }
+            long i = vec.maxDot( temp, inner_count, newDot);
+            if( newDot > maxDot )
+            {
+                maxDot = newDot;
+                supportVerticesOut[j] = temp[i];
+            }
+        }
+        
 	}
 }
 
diff --git a/src/BulletCollision/CollisionShapes/btMultiSphereShape.h b/src/BulletCollision/CollisionShapes/btMultiSphereShape.h
index 06c5d16d9..5d3b40268 100644
--- a/src/BulletCollision/CollisionShapes/btMultiSphereShape.h
+++ b/src/BulletCollision/CollisionShapes/btMultiSphereShape.h
@@ -25,13 +25,15 @@ subject to the following restrictions:
 
 ///The btMultiSphereShape represents the convex hull of a collection of spheres. You can create special capsules or other smooth volumes.
 ///It is possible to animate the spheres for deformation, but call 'recalcLocalAabb' after changing any sphere position/radius
-class btMultiSphereShape : public btConvexInternalAabbCachingShape
+ATTRIBUTE_ALIGNED16(class) btMultiSphereShape : public btConvexInternalAabbCachingShape
 {
 	
 	btAlignedObjectArray<btVector3> m_localPositionArray;
 	btAlignedObjectArray<btScalar>  m_radiArray;
 	
 public:
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+	
 	btMultiSphereShape (const btVector3* positions,const btScalar* radi,int numSpheres);
 
 	///CollisionShape Interface
diff --git a/src/BulletCollision/CollisionShapes/btPolyhedralConvexShape.cpp b/src/BulletCollision/CollisionShapes/btPolyhedralConvexShape.cpp
index 82def79cf..063e1d044 100644
--- a/src/BulletCollision/CollisionShapes/btPolyhedralConvexShape.cpp
+++ b/src/BulletCollision/CollisionShapes/btPolyhedralConvexShape.cpp
@@ -45,7 +45,7 @@ bool	btPolyhedralConvexShape::initializePolyhedralFeatures()
 	void* mem = btAlignedAlloc(sizeof(btConvexPolyhedron),16);
 	m_polyhedron = new (mem) btConvexPolyhedron;
 
-		btAlignedObjectArray<btVector3> orgVertices;
+	btAlignedObjectArray<btVector3> orgVertices;
 
 	for (int i=0;i<getNumVertices();i++)
 	{
@@ -107,9 +107,6 @@ bool	btPolyhedralConvexShape::initializePolyhedralFeatures()
 		int numEdges = 0;
 		//compute face normals
 
-		btScalar maxCross2 = 0.f;
-		int chosenEdge = -1;
-
 		do
 		{
 			
@@ -299,6 +296,9 @@ bool	btPolyhedralConvexShape::initializePolyhedralFeatures()
 	return true;
 }
 
+#ifndef MIN
+    #define MIN(_a, _b)     ((_a) < (_b) ? (_a) : (_b))
+#endif
 
 btVector3	btPolyhedralConvexShape::localGetSupportingVertexWithoutMargin(const btVector3& vec0)const
 {
@@ -323,17 +323,19 @@ btVector3	btPolyhedralConvexShape::localGetSupportingVertexWithoutMargin(const b
 	btVector3 vtx;
 	btScalar newDot;
 
-	for (i=0;i<getNumVertices();i++)
-	{
-		getVertex(i,vtx);
-		newDot = vec.dot(vtx);
+    for( int k = 0; k < getNumVertices(); k += 128 )
+    {
+        btVector3 temp[128];
+        int inner_count = MIN(getNumVertices() - k, 128);
+        for( i = 0; i < inner_count; i++ )
+            getVertex(i,temp[i]); 
+        i = (int) vec.maxDot( temp, inner_count, newDot);
 		if (newDot > maxDot)
 		{
 			maxDot = newDot;
-			supVec = vtx;
-		}
-	}
-
+			supVec = temp[i];
+		}        
+    }
 	
 #endif //__SPU__
 	return supVec;
@@ -356,21 +358,23 @@ void	btPolyhedralConvexShape::batchedUnitVectorGetSupportingVertexWithoutMargin(
 
 	for (int j=0;j<numVectors;j++)
 	{
-	
-		const btVector3& vec = vectors[j];
-
-		for (i=0;i<getNumVertices();i++)
-		{
-			getVertex(i,vtx);
-			newDot = vec.dot(vtx);
-			if (newDot > supportVerticesOut[j][3])
-			{
-				//WARNING: don't swap next lines, the w component would get overwritten!
-				supportVerticesOut[j] = vtx;
+        const btVector3& vec = vectors[j];
+        
+        for( int k = 0; k < getNumVertices(); k += 128 )
+        {
+            btVector3 temp[128];
+            int inner_count = MIN(getNumVertices() - k, 128);
+            for( i = 0; i < inner_count; i++ )
+                getVertex(i,temp[i]); 
+            i = (int) vec.maxDot( temp, inner_count, newDot);
+            if (newDot > supportVerticesOut[j][3])
+            {
+				supportVerticesOut[j] = temp[i];
 				supportVerticesOut[j][3] = newDot;
-			}
-		}
-	}
+            }        
+        }
+    }
+
 #endif //__SPU__
 }
 
diff --git a/src/BulletCollision/CollisionShapes/btPolyhedralConvexShape.h b/src/BulletCollision/CollisionShapes/btPolyhedralConvexShape.h
index ee2e1e282..cc689a6b1 100644
--- a/src/BulletCollision/CollisionShapes/btPolyhedralConvexShape.h
+++ b/src/BulletCollision/CollisionShapes/btPolyhedralConvexShape.h
@@ -22,7 +22,7 @@ class btConvexPolyhedron;
 
 
 ///The btPolyhedralConvexShape is an internal interface class for polyhedral convex shapes.
-class btPolyhedralConvexShape : public btConvexInternalShape
+ATTRIBUTE_ALIGNED16(class) btPolyhedralConvexShape : public btConvexInternalShape
 {
 	
 
@@ -31,6 +31,9 @@ protected:
 	btConvexPolyhedron* m_polyhedron;
 
 public:
+	
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+	
 
 	btPolyhedralConvexShape();
 
diff --git a/src/BulletCollision/CollisionShapes/btScaledBvhTriangleMeshShape.cpp b/src/BulletCollision/CollisionShapes/btScaledBvhTriangleMeshShape.cpp
index 25d58d61b..6a337c786 100644
--- a/src/BulletCollision/CollisionShapes/btScaledBvhTriangleMeshShape.cpp
+++ b/src/BulletCollision/CollisionShapes/btScaledBvhTriangleMeshShape.cpp
@@ -98,9 +98,7 @@ void	btScaledBvhTriangleMeshShape::getAabb(const btTransform& trans,btVector3& a
 
 	btVector3 center = trans(localCenter);
 
-	btVector3 extent = btVector3(abs_b[0].dot(localHalfExtents),
-		   abs_b[1].dot(localHalfExtents),
-		  abs_b[2].dot(localHalfExtents));
+    btVector3 extent = localHalfExtents.dot3(abs_b[0], abs_b[1], abs_b[2]);
 	aabbMin = center - extent;
 	aabbMax = center + extent;
 
diff --git a/src/BulletCollision/CollisionShapes/btScaledBvhTriangleMeshShape.h b/src/BulletCollision/CollisionShapes/btScaledBvhTriangleMeshShape.h
index ff86ef319..39049eaf0 100644
--- a/src/BulletCollision/CollisionShapes/btScaledBvhTriangleMeshShape.h
+++ b/src/BulletCollision/CollisionShapes/btScaledBvhTriangleMeshShape.h
@@ -31,6 +31,8 @@ ATTRIBUTE_ALIGNED16(class) btScaledBvhTriangleMeshShape : public btConcaveShape
 
 public:
 
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+
 
 	btScaledBvhTriangleMeshShape(btBvhTriangleMeshShape* childShape,const btVector3& localScaling);
 
diff --git a/src/BulletCollision/CollisionShapes/btShapeHull.h b/src/BulletCollision/CollisionShapes/btShapeHull.h
index 642a28874..e959f198b 100644
--- a/src/BulletCollision/CollisionShapes/btShapeHull.h
+++ b/src/BulletCollision/CollisionShapes/btShapeHull.h
@@ -25,7 +25,7 @@ subject to the following restrictions:
 ///The btShapeHull class takes a btConvexShape, builds a simplified convex hull using btConvexHull and provides triangle indices and vertices.
 ///It can be useful for to simplify a complex convex object and for visualization of a non-polyhedral convex object.
 ///It approximates the convex hull using the supporting vertex of 42 directions.
-class btShapeHull
+ATTRIBUTE_ALIGNED16(class) btShapeHull
 {
 protected:
 
@@ -37,6 +37,8 @@ protected:
 	static btVector3* getUnitSpherePoints();
 
 public:
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+	
 	btShapeHull (const btConvexShape* shape);
 	~btShapeHull ();
 
diff --git a/src/BulletCollision/CollisionShapes/btStaticPlaneShape.h b/src/BulletCollision/CollisionShapes/btStaticPlaneShape.h
index b13825e61..e6e328839 100644
--- a/src/BulletCollision/CollisionShapes/btStaticPlaneShape.h
+++ b/src/BulletCollision/CollisionShapes/btStaticPlaneShape.h
@@ -31,6 +31,8 @@ protected:
 	btVector3	m_localScaling;
 
 public:
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+
 	btStaticPlaneShape(const btVector3& planeNormal,btScalar planeConstant);
 
 	virtual ~btStaticPlaneShape();
diff --git a/src/BulletCollision/CollisionShapes/btStridingMeshInterface.h b/src/BulletCollision/CollisionShapes/btStridingMeshInterface.h
index f2b27ade8..9fbe13976 100644
--- a/src/BulletCollision/CollisionShapes/btStridingMeshInterface.h
+++ b/src/BulletCollision/CollisionShapes/btStridingMeshInterface.h
@@ -27,13 +27,15 @@ subject to the following restrictions:
 ///	The btStridingMeshInterface is the interface class for high performance generic access to triangle meshes, used in combination with btBvhTriangleMeshShape and some other collision shapes.
 /// Using index striding of 3*sizeof(integer) it can use triangle arrays, using index striding of 1*sizeof(integer) it can handle triangle strips.
 /// It allows for sharing graphics and collision meshes. Also it provides locking/unlocking of graphics meshes that are in gpu memory.
-class  btStridingMeshInterface
+ATTRIBUTE_ALIGNED16(class ) btStridingMeshInterface
 {
 	protected:
 	
 		btVector3 m_scaling;
 
 	public:
+		BT_DECLARE_ALIGNED_ALLOCATOR();
+		
 		btStridingMeshInterface() :m_scaling(btScalar(1.),btScalar(1.),btScalar(1.))
 		{
 
diff --git a/src/BulletCollision/CollisionShapes/btTetrahedronShape.h b/src/BulletCollision/CollisionShapes/btTetrahedronShape.h
index 6b7128efc..b69209835 100644
--- a/src/BulletCollision/CollisionShapes/btTetrahedronShape.h
+++ b/src/BulletCollision/CollisionShapes/btTetrahedronShape.h
@@ -22,7 +22,7 @@ subject to the following restrictions:
 
 
 ///The btBU_Simplex1to4 implements tetrahedron, triangle, line, vertex collision shapes. In most cases it is better to use btConvexHullShape instead.
-class btBU_Simplex1to4 : public btPolyhedralConvexAabbCachingShape
+ATTRIBUTE_ALIGNED16(class) btBU_Simplex1to4 : public btPolyhedralConvexAabbCachingShape
 {
 protected:
 
@@ -30,6 +30,8 @@ protected:
 	btVector3	m_vertices[4];
 
 public:
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+	
 	btBU_Simplex1to4();
 
 	btBU_Simplex1to4(const btVector3& pt0);
diff --git a/src/BulletCollision/CollisionShapes/btTriangleMeshShape.cpp b/src/BulletCollision/CollisionShapes/btTriangleMeshShape.cpp
index 683684da7..0e1795140 100644
--- a/src/BulletCollision/CollisionShapes/btTriangleMeshShape.cpp
+++ b/src/BulletCollision/CollisionShapes/btTriangleMeshShape.cpp
@@ -55,13 +55,9 @@ void btTriangleMeshShape::getAabb(const btTransform& trans,btVector3& aabbMin,bt
 
 	btVector3 center = trans(localCenter);
 
-	btVector3 extent = btVector3(abs_b[0].dot(localHalfExtents),
-		   abs_b[1].dot(localHalfExtents),
-		  abs_b[2].dot(localHalfExtents));
+    btVector3 extent = localHalfExtents.dot3(abs_b[0], abs_b[1], abs_b[2]);
 	aabbMin = center - extent;
 	aabbMax = center + extent;
-
-
 }
 
 void	btTriangleMeshShape::recalcLocalAabb()
diff --git a/src/BulletCollision/CollisionShapes/btTriangleMeshShape.h b/src/BulletCollision/CollisionShapes/btTriangleMeshShape.h
index c8caf8fe6..453e58005 100644
--- a/src/BulletCollision/CollisionShapes/btTriangleMeshShape.h
+++ b/src/BulletCollision/CollisionShapes/btTriangleMeshShape.h
@@ -21,7 +21,7 @@ subject to the following restrictions:
 
 
 ///The btTriangleMeshShape is an internal concave triangle mesh interface. Don't use this class directly, use btBvhTriangleMeshShape instead.
-class btTriangleMeshShape : public btConcaveShape
+ATTRIBUTE_ALIGNED16(class) btTriangleMeshShape : public btConcaveShape
 {
 protected:
 	btVector3	m_localAabbMin;
@@ -33,6 +33,7 @@ protected:
 	btTriangleMeshShape(btStridingMeshInterface* meshInterface);
 
 public:
+	BT_DECLARE_ALIGNED_ALLOCATOR();
 
 	virtual ~btTriangleMeshShape();
 
diff --git a/src/BulletCollision/CollisionShapes/btTriangleShape.h b/src/BulletCollision/CollisionShapes/btTriangleShape.h
index 71b055738..a8a80f82f 100644
--- a/src/BulletCollision/CollisionShapes/btTriangleShape.h
+++ b/src/BulletCollision/CollisionShapes/btTriangleShape.h
@@ -25,6 +25,8 @@ ATTRIBUTE_ALIGNED16(class) btTriangleShape : public btPolyhedralConvexShape
 
 public:
 
+BT_DECLARE_ALIGNED_ALLOCATOR();
+
 	btVector3	m_vertices1[3];
 
 	virtual int getNumVertices() const
@@ -66,7 +68,7 @@ public:
 
 	btVector3 localGetSupportingVertexWithoutMargin(const btVector3& dir)const 
 	{
-		btVector3 dots(dir.dot(m_vertices1[0]), dir.dot(m_vertices1[1]), dir.dot(m_vertices1[2]));
+        btVector3 dots = dir.dot3(m_vertices1[0], m_vertices1[1], m_vertices1[2]);
 	  	return m_vertices1[dots.maxAxis()];
 
 	}
@@ -76,7 +78,7 @@ public:
 		for (int i=0;i<numVectors;i++)
 		{
 			const btVector3& dir = vectors[i];
-			btVector3 dots(dir.dot(m_vertices1[0]), dir.dot(m_vertices1[1]), dir.dot(m_vertices1[2]));
+            btVector3 dots = dir.dot3(m_vertices1[0], m_vertices1[1], m_vertices1[2]);
   			supportVerticesOut[i] = m_vertices1[dots.maxAxis()];
 		}
 
diff --git a/src/BulletCollision/CollisionShapes/btUniformScalingShape.h b/src/BulletCollision/CollisionShapes/btUniformScalingShape.h
index cbf7e6fd3..a10f58d24 100644
--- a/src/BulletCollision/CollisionShapes/btUniformScalingShape.h
+++ b/src/BulletCollision/CollisionShapes/btUniformScalingShape.h
@@ -21,7 +21,7 @@ subject to the following restrictions:
 
 ///The btUniformScalingShape allows to re-use uniform scaled instances of btConvexShape in a memory efficient way.
 ///Istead of using btUniformScalingShape, it is better to use the non-uniform setLocalScaling method on convex shapes that implement it.
-class btUniformScalingShape : public btConvexShape
+ATTRIBUTE_ALIGNED16(class) btUniformScalingShape : public btConvexShape
 {
 	btConvexShape*	m_childConvexShape;
 
@@ -29,6 +29,8 @@ class btUniformScalingShape : public btConvexShape
 	
 	public:
 	
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+	
 	btUniformScalingShape(	btConvexShape* convexChildShape, btScalar uniformScalingFactor);
 	
 	virtual ~btUniformScalingShape();
diff --git a/src/BulletCollision/Gimpact/btBoxCollision.h b/src/BulletCollision/Gimpact/btBoxCollision.h
index d5676aaa8..0a0357e5a 100644
--- a/src/BulletCollision/Gimpact/btBoxCollision.h
+++ b/src/BulletCollision/Gimpact/btBoxCollision.h
@@ -218,9 +218,7 @@ public:
 
 	SIMD_FORCE_INLINE btVector3 transform(const btVector3 & point) const
 	{
-		return btVector3(m_R1to0[0].dot(point) + m_T1to0.x(),
-			m_R1to0[1].dot(point) + m_T1to0.y(),
-			m_R1to0[2].dot(point) + m_T1to0.z());
+        return point.dot3( m_R1to0[0], m_R1to0[1], m_R1to0[2] ) + m_T1to0;
 	}
 };
 
@@ -364,9 +362,9 @@ public:
 		// Compute new center
 		center = trans(center);
 
-		btVector3 textends(extends.dot(trans.getBasis().getRow(0).absolute()),
- 				 extends.dot(trans.getBasis().getRow(1).absolute()),
-				 extends.dot(trans.getBasis().getRow(2).absolute()));
+        btVector3 textends = extends.dot3(trans.getBasis().getRow(0).absolute(), 
+                                          trans.getBasis().getRow(1).absolute(), 
+                                          trans.getBasis().getRow(2).absolute());
 
 		m_min = center - textends;
 		m_max = center + textends;
@@ -381,10 +379,10 @@ public:
 		// Compute new center
 		center = trans.transform(center);
 
-		btVector3 textends(extends.dot(trans.m_R1to0.getRow(0).absolute()),
- 				 extends.dot(trans.m_R1to0.getRow(1).absolute()),
-				 extends.dot(trans.m_R1to0.getRow(2).absolute()));
-
+        btVector3 textends = extends.dot3(trans.m_R1to0.getRow(0).absolute(), 
+                                          trans.m_R1to0.getRow(1).absolute(), 
+                                          trans.m_R1to0.getRow(2).absolute());
+        
 		m_min = center - textends;
 		m_max = center + textends;
 	}
diff --git a/src/BulletCollision/Gimpact/btGImpactCollisionAlgorithm.cpp b/src/BulletCollision/Gimpact/btGImpactCollisionAlgorithm.cpp
index 2f2c09ffc..5997e443e 100644
--- a/src/BulletCollision/Gimpact/btGImpactCollisionAlgorithm.cpp
+++ b/src/BulletCollision/Gimpact/btGImpactCollisionAlgorithm.cpp
@@ -51,7 +51,7 @@ public:
 	}
 
 
-	void get_plane_equation_transformed(const btTransform & trans,btVector4 &equation)
+	void get_plane_equation_transformed(const btTransform & trans,btVector4 &equation) const 
 	{
 		equation[0] = trans.getBasis().getRow(0).dot(m_planeNormal);
 		equation[1] = trans.getBasis().getRow(1).dot(m_planeNormal);
@@ -89,7 +89,7 @@ Declared here due of insuficent space on Pool allocators
 class GIM_ShapeRetriever
 {
 public:
-	btGImpactShapeInterface * m_gim_shape;
+	const btGImpactShapeInterface * m_gim_shape;
 	btTriangleShapeEx m_trishape;
 	btTetrahedronShapeEx m_tetrashape;
 
@@ -98,7 +98,7 @@ public:
 	{
 	public:
 		GIM_ShapeRetriever * m_parent;
-		virtual btCollisionShape * getChildShape(int index)
+		virtual const btCollisionShape * getChildShape(int index)
 		{
 			return m_parent->m_gim_shape->getChildShape(index);
 		}
@@ -133,7 +133,7 @@ public:
 	TetraShapeRetriever  m_tetra_retriever;
 	ChildShapeRetriever * m_current_retriever;
 
-	GIM_ShapeRetriever(btGImpactShapeInterface * gim_shape)
+	GIM_ShapeRetriever(const btGImpactShapeInterface * gim_shape)
 	{
 		m_gim_shape = gim_shape;
 		//select retriever
@@ -153,7 +153,7 @@ public:
 		m_current_retriever->m_parent = this;
 	}
 
-	btCollisionShape * getChildShape(int index)
+	const btCollisionShape * getChildShape(int index)
 	{
 		return m_current_retriever->getChildShape(index);
 	}
@@ -193,8 +193,8 @@ float btGImpactCollisionAlgorithm::getAverageTriangleCollisionTime()
 
 
 
-btGImpactCollisionAlgorithm::btGImpactCollisionAlgorithm( const btCollisionAlgorithmConstructionInfo& ci, btCollisionObject* body0,btCollisionObject* body1)
-: btActivatingCollisionAlgorithm(ci,body0,body1)
+btGImpactCollisionAlgorithm::btGImpactCollisionAlgorithm( const btCollisionAlgorithmConstructionInfo& ci, const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
+: btActivatingCollisionAlgorithm(ci,body0Wrap,body1Wrap)
 {
 	m_manifoldPtr = NULL;
 	m_convex_algorithm = NULL;
@@ -209,71 +209,60 @@ btGImpactCollisionAlgorithm::~btGImpactCollisionAlgorithm()
 
 
 
-void btGImpactCollisionAlgorithm::addContactPoint(btCollisionObject * body0,
-				btCollisionObject * body1,
+void btGImpactCollisionAlgorithm::addContactPoint(const btCollisionObjectWrapper * body0Wrap,
+				const btCollisionObjectWrapper * body1Wrap,
 				const btVector3 & point,
 				const btVector3 & normal,
 				btScalar distance)
 {
 	m_resultOut->setShapeIdentifiersA(m_part0,m_triface0);
 	m_resultOut->setShapeIdentifiersB(m_part1,m_triface1);
-	checkManifold(body0,body1);
+	checkManifold(body0Wrap,body1Wrap);
 	m_resultOut->addContactPoint(normal,point,distance);
 }
 
 
 void btGImpactCollisionAlgorithm::shape_vs_shape_collision(
-					  btCollisionObject * body0,
-					  btCollisionObject * body1,
-					  btCollisionShape * shape0,
-					  btCollisionShape * shape1)
+					  const btCollisionObjectWrapper * body0Wrap,
+					  const btCollisionObjectWrapper* body1Wrap,
+					  const btCollisionShape * shape0,
+					  const btCollisionShape * shape1)
 {
 
-	btCollisionShape* tmpShape0 = body0->getCollisionShape();
-	btCollisionShape* tmpShape1 = body1->getCollisionShape();
-	
-	body0->internalSetTemporaryCollisionShape(shape0);
-	body1->internalSetTemporaryCollisionShape(shape1);
 
 	{
-		btCollisionAlgorithm* algor = newAlgorithm(body0,body1);
+		btCollisionObjectWrapper ob0(body0Wrap,shape0,body0Wrap->getCollisionObject(), body0Wrap->getWorldTransform());
+		btCollisionObjectWrapper ob1(body1Wrap,shape1,body1Wrap->getCollisionObject(),body1Wrap->getWorldTransform());
+
+		btCollisionAlgorithm* algor = newAlgorithm(&ob0,&ob1);
 		// post :	checkManifold is called
 
 		m_resultOut->setShapeIdentifiersA(m_part0,m_triface0);
 		m_resultOut->setShapeIdentifiersB(m_part1,m_triface1);
 
-		algor->processCollision(body0,body1,*m_dispatchInfo,m_resultOut);
+		algor->processCollision(&ob0,&ob1,*m_dispatchInfo,m_resultOut);
 
 		algor->~btCollisionAlgorithm();
 		m_dispatcher->freeCollisionAlgorithm(algor);
 	}
 
-	body0->internalSetTemporaryCollisionShape(tmpShape0);
-	body1->internalSetTemporaryCollisionShape(tmpShape1);
 }
 
 void btGImpactCollisionAlgorithm::convex_vs_convex_collision(
-					  btCollisionObject * body0,
-					  btCollisionObject * body1,
-					  btCollisionShape * shape0,
-					  btCollisionShape * shape1)
+					  const btCollisionObjectWrapper* body0Wrap,
+					  const btCollisionObjectWrapper* body1Wrap,
+					  const btCollisionShape* shape0,
+					  const btCollisionShape* shape1)
 {
 
-	btCollisionShape* tmpShape0 = body0->getCollisionShape();
-	btCollisionShape* tmpShape1 = body1->getCollisionShape();
-	
-	body0->internalSetTemporaryCollisionShape(shape0);
-	body1->internalSetTemporaryCollisionShape(shape1);
-
-
 	m_resultOut->setShapeIdentifiersA(m_part0,m_triface0);
 	m_resultOut->setShapeIdentifiersB(m_part1,m_triface1);
 
-	checkConvexAlgorithm(body0,body1);
-	m_convex_algorithm->processCollision(body0,body1,*m_dispatchInfo,m_resultOut);
+	btCollisionObjectWrapper ob0(body0Wrap,shape0,body0Wrap->getCollisionObject(),body0Wrap->getWorldTransform());
+	btCollisionObjectWrapper ob1(body1Wrap,shape1,body1Wrap->getCollisionObject(),body1Wrap->getWorldTransform());
+	checkConvexAlgorithm(&ob0,&ob1);
+	m_convex_algorithm->processCollision(&ob0,&ob1,*m_dispatchInfo,m_resultOut);
 
-	body0->internalSetTemporaryCollisionShape(tmpShape0);
-	body1->internalSetTemporaryCollisionShape(tmpShape1);
 
 }
 
@@ -283,8 +272,8 @@ void btGImpactCollisionAlgorithm::convex_vs_convex_collision(
 void btGImpactCollisionAlgorithm::gimpact_vs_gimpact_find_pairs(
 					  const btTransform & trans0,
 					  const btTransform & trans1,
-					  btGImpactShapeInterface * shape0,
-					  btGImpactShapeInterface * shape1,btPairSet & pairset)
+					  const btGImpactShapeInterface * shape0,
+					  const btGImpactShapeInterface * shape1,btPairSet & pairset)
 {
 	if(shape0->hasBoxSet() && shape1->hasBoxSet())
 	{
@@ -320,8 +309,8 @@ void btGImpactCollisionAlgorithm::gimpact_vs_gimpact_find_pairs(
 void btGImpactCollisionAlgorithm::gimpact_vs_shape_find_pairs(
 					  const btTransform & trans0,
 					  const btTransform & trans1,
-					  btGImpactShapeInterface * shape0,
-					  btCollisionShape * shape1,
+					  const btGImpactShapeInterface * shape0,
+					  const btCollisionShape * shape1,
 					  btAlignedObjectArray<int> & collided_primitives)
 {
 
@@ -359,10 +348,10 @@ void btGImpactCollisionAlgorithm::gimpact_vs_shape_find_pairs(
 }
 
 
-void btGImpactCollisionAlgorithm::collide_gjk_triangles(btCollisionObject * body0,
-				  btCollisionObject * body1,
-				  btGImpactMeshShapePart * shape0,
-				  btGImpactMeshShapePart * shape1,
+void btGImpactCollisionAlgorithm::collide_gjk_triangles(const btCollisionObjectWrapper * body0Wrap,
+				  const btCollisionObjectWrapper * body1Wrap,
+				  const btGImpactMeshShapePart * shape0,
+				  const btGImpactMeshShapePart * shape1,
 				  const int * pairs, int pair_count)
 {
 	btTriangleShapeEx tri0;
@@ -389,7 +378,7 @@ void btGImpactCollisionAlgorithm::collide_gjk_triangles(btCollisionObject * body
 		//collide two convex shapes
 		if(tri0.overlap_test_conservative(tri1))
 		{
-			convex_vs_convex_collision(body0,body1,&tri0,&tri1);
+			convex_vs_convex_collision(body0Wrap,body1Wrap,&tri0,&tri1);
 		}
 
 	}
@@ -398,14 +387,14 @@ void btGImpactCollisionAlgorithm::collide_gjk_triangles(btCollisionObject * body
 	shape1->unlockChildShapes();
 }
 
-void btGImpactCollisionAlgorithm::collide_sat_triangles(btCollisionObject * body0,
-					  btCollisionObject * body1,
-					  btGImpactMeshShapePart * shape0,
-					  btGImpactMeshShapePart * shape1,
+void btGImpactCollisionAlgorithm::collide_sat_triangles(const btCollisionObjectWrapper* body0Wrap,
+					  const btCollisionObjectWrapper* body1Wrap,
+					  const btGImpactMeshShapePart * shape0,
+					  const btGImpactMeshShapePart * shape1,
 					  const int * pairs, int pair_count)
 {
-	btTransform orgtrans0 = body0->getWorldTransform();
-	btTransform orgtrans1 = body1->getWorldTransform();
+	btTransform orgtrans0 = body0Wrap->getWorldTransform();
+	btTransform orgtrans1 = body1Wrap->getWorldTransform();
 
 	btPrimitiveTriangle ptri0;
 	btPrimitiveTriangle ptri1;
@@ -451,7 +440,7 @@ void btGImpactCollisionAlgorithm::collide_sat_triangles(btCollisionObject * body
 				while(j--)
 				{
 
-					addContactPoint(body0, body1,
+					addContactPoint(body0Wrap, body1Wrap,
 								contact_data.m_points[j],
 								contact_data.m_separating_normal,
 								-contact_data.m_penetration_depth);
@@ -472,20 +461,20 @@ void btGImpactCollisionAlgorithm::collide_sat_triangles(btCollisionObject * body
 
 
 void btGImpactCollisionAlgorithm::gimpact_vs_gimpact(
-						btCollisionObject * body0,
-					   	btCollisionObject * body1,
-					  	btGImpactShapeInterface * shape0,
-					  	btGImpactShapeInterface * shape1)
+						const btCollisionObjectWrapper* body0Wrap,
+					   	const btCollisionObjectWrapper * body1Wrap,
+					  	const btGImpactShapeInterface * shape0,
+					  	const btGImpactShapeInterface * shape1)
 {
 
 	if(shape0->getGImpactShapeType()==CONST_GIMPACT_TRIMESH_SHAPE)
 	{
-		btGImpactMeshShape * meshshape0 = static_cast<btGImpactMeshShape *>(shape0);
+		const btGImpactMeshShape * meshshape0 = static_cast<const btGImpactMeshShape *>(shape0);
 		m_part0 = meshshape0->getMeshPartCount();
 
 		while(m_part0--)
 		{
-			gimpact_vs_gimpact(body0,body1,meshshape0->getMeshPart(m_part0),shape1);
+			gimpact_vs_gimpact(body0Wrap,body1Wrap,meshshape0->getMeshPart(m_part0),shape1);
 		}
 
 		return;
@@ -493,13 +482,13 @@ void btGImpactCollisionAlgorithm::gimpact_vs_gimpact(
 
 	if(shape1->getGImpactShapeType()==CONST_GIMPACT_TRIMESH_SHAPE)
 	{
-		btGImpactMeshShape * meshshape1 = static_cast<btGImpactMeshShape *>(shape1);
+		const btGImpactMeshShape * meshshape1 = static_cast<const btGImpactMeshShape *>(shape1);
 		m_part1 = meshshape1->getMeshPartCount();
 
 		while(m_part1--)
 		{
 
-			gimpact_vs_gimpact(body0,body1,shape0,meshshape1->getMeshPart(m_part1));
+			gimpact_vs_gimpact(body0Wrap,body1Wrap,shape0,meshshape1->getMeshPart(m_part1));
 
 		}
 
@@ -507,8 +496,8 @@ void btGImpactCollisionAlgorithm::gimpact_vs_gimpact(
 	}
 
 
-	btTransform orgtrans0 = body0->getWorldTransform();
-	btTransform orgtrans1 = body1->getWorldTransform();
+	btTransform orgtrans0 = body0Wrap->getWorldTransform();
+	btTransform orgtrans1 = body1Wrap->getWorldTransform();
 
 	btPairSet pairset;
 
@@ -519,13 +508,13 @@ void btGImpactCollisionAlgorithm::gimpact_vs_gimpact(
 	if(shape0->getGImpactShapeType() == CONST_GIMPACT_TRIMESH_SHAPE_PART &&
 		shape1->getGImpactShapeType() == CONST_GIMPACT_TRIMESH_SHAPE_PART)
 	{
-		btGImpactMeshShapePart * shapepart0 = static_cast<btGImpactMeshShapePart * >(shape0);
-		btGImpactMeshShapePart * shapepart1 = static_cast<btGImpactMeshShapePart * >(shape1);
+		const btGImpactMeshShapePart * shapepart0 = static_cast<const btGImpactMeshShapePart * >(shape0);
+		const btGImpactMeshShapePart * shapepart1 = static_cast<const btGImpactMeshShapePart * >(shape1);
 		//specialized function
 		#ifdef BULLET_TRIANGLE_COLLISION
-		collide_gjk_triangles(body0,body1,shapepart0,shapepart1,&pairset[0].m_index1,pairset.size());
+		collide_gjk_triangles(body0Wrap,body1Wrap,shapepart0,shapepart1,&pairset[0].m_index1,pairset.size());
 		#else
-		collide_sat_triangles(body0,body1,shapepart0,shapepart1,&pairset[0].m_index1,pairset.size());
+		collide_sat_triangles(body0Wrap,body1Wrap,shapepart0,shapepart1,&pairset[0].m_index1,pairset.size());
 		#endif
 
 		return;
@@ -548,55 +537,49 @@ void btGImpactCollisionAlgorithm::gimpact_vs_gimpact(
 		GIM_PAIR * pair = &pairset[i];
 		m_triface0 = pair->m_index1;
 		m_triface1 = pair->m_index2;
-		btCollisionShape * colshape0 = retriever0.getChildShape(m_triface0);
-		btCollisionShape * colshape1 = retriever1.getChildShape(m_triface1);
+		const btCollisionShape * colshape0 = retriever0.getChildShape(m_triface0);
+		const btCollisionShape * colshape1 = retriever1.getChildShape(m_triface1);
+
+		btTransform tr0 = body0Wrap->getWorldTransform();
+		btTransform tr1 = body1Wrap->getWorldTransform();
 
 		if(child_has_transform0)
 		{
-			body0->setWorldTransform(orgtrans0*shape0->getChildTransform(m_triface0));
+			tr0 = orgtrans0*shape0->getChildTransform(m_triface0);
 		}
 
 		if(child_has_transform1)
 		{
-			body1->setWorldTransform(orgtrans1*shape1->getChildTransform(m_triface1));
+			tr1 = orgtrans1*shape1->getChildTransform(m_triface1);
 		}
 
+		btCollisionObjectWrapper ob0(body0Wrap,colshape0,body0Wrap->getCollisionObject(),tr0);
+		btCollisionObjectWrapper ob1(body1Wrap,colshape1,body1Wrap->getCollisionObject(),tr1);
+
 		//collide two convex shapes
-		convex_vs_convex_collision(body0,body1,colshape0,colshape1);
-
-
-		if(child_has_transform0)
-		{
-			body0->setWorldTransform(orgtrans0);
-		}
-
-		if(child_has_transform1)
-		{
-			body1->setWorldTransform(orgtrans1);
-		}
-
+		convex_vs_convex_collision(&ob0,&ob1,colshape0,colshape1);
 	}
 
 	shape0->unlockChildShapes();
 	shape1->unlockChildShapes();
 }
 
-void btGImpactCollisionAlgorithm::gimpact_vs_shape(btCollisionObject * body0,
-				  btCollisionObject * body1,
-				  btGImpactShapeInterface * shape0,
-				  btCollisionShape * shape1,bool swapped)
+void btGImpactCollisionAlgorithm::gimpact_vs_shape(const btCollisionObjectWrapper* body0Wrap,
+				  const btCollisionObjectWrapper * body1Wrap,
+				  const btGImpactShapeInterface * shape0,
+				  const btCollisionShape * shape1,bool swapped)
 {
 	if(shape0->getGImpactShapeType()==CONST_GIMPACT_TRIMESH_SHAPE)
 	{
-		btGImpactMeshShape * meshshape0 = static_cast<btGImpactMeshShape *>(shape0);
+		const btGImpactMeshShape * meshshape0 = static_cast<const btGImpactMeshShape *>(shape0);
 		int& part = swapped ? m_part1 : m_part0;
 		part = meshshape0->getMeshPartCount();
 
 		while(part--)
 		{
 
-			gimpact_vs_shape(body0,
-				  body1,
+			gimpact_vs_shape(body0Wrap,
+				  body1Wrap,
 				  meshshape0->getMeshPart(part),
 				  shape1,swapped);
 
@@ -609,9 +592,9 @@ void btGImpactCollisionAlgorithm::gimpact_vs_shape(btCollisionObject * body0,
 	if(shape0->getGImpactShapeType() == CONST_GIMPACT_TRIMESH_SHAPE_PART &&
 		shape1->getShapeType() == STATIC_PLANE_PROXYTYPE)
 	{
-		btGImpactMeshShapePart * shapepart = static_cast<btGImpactMeshShapePart *>(shape0);
-		btStaticPlaneShape * planeshape = static_cast<btStaticPlaneShape * >(shape1);
-		gimpacttrimeshpart_vs_plane_collision(body0,body1,shapepart,planeshape,swapped);
+		const btGImpactMeshShapePart * shapepart = static_cast<const btGImpactMeshShapePart *>(shape0);
+		const btStaticPlaneShape * planeshape = static_cast<const btStaticPlaneShape * >(shape1);
+		gimpacttrimeshpart_vs_plane_collision(body0Wrap,body1Wrap,shapepart,planeshape,swapped);
 		return;
 	}
 
@@ -621,21 +604,21 @@ void btGImpactCollisionAlgorithm::gimpact_vs_shape(btCollisionObject * body0,
 
 	if(shape1->isCompound())
 	{
-		btCompoundShape * compoundshape = static_cast<btCompoundShape *>(shape1);
-		gimpact_vs_compoundshape(body0,body1,shape0,compoundshape,swapped);
+		const btCompoundShape * compoundshape = static_cast<const btCompoundShape *>(shape1);
+		gimpact_vs_compoundshape(body0Wrap,body1Wrap,shape0,compoundshape,swapped);
 		return;
 	}
 	else if(shape1->isConcave())
 	{
-		btConcaveShape * concaveshape = static_cast<btConcaveShape *>(shape1);
-		gimpact_vs_concave(body0,body1,shape0,concaveshape,swapped);
+		const btConcaveShape * concaveshape = static_cast<const btConcaveShape *>(shape1);
+		gimpact_vs_concave(body0Wrap,body1Wrap,shape0,concaveshape,swapped);
 		return;
 	}
 
 
-	btTransform orgtrans0 = body0->getWorldTransform();
+	btTransform orgtrans0 = body0Wrap->getWorldTransform();
 
-	btTransform orgtrans1 = body1->getWorldTransform();
+	btTransform orgtrans1 = body1Wrap->getWorldTransform();
 
 	btAlignedObjectArray<int> collided_results;
 
@@ -662,27 +645,25 @@ void btGImpactCollisionAlgorithm::gimpact_vs_shape(btCollisionObject * body0,
         else
             m_triface0 = child_index;
 
-		btCollisionShape * colshape0 = retriever0.getChildShape(child_index);
+		const btCollisionShape * colshape0 = retriever0.getChildShape(child_index);
+
+		btTransform tr0 = body0Wrap->getWorldTransform();
 
 		if(child_has_transform0)
 		{
-			body0->setWorldTransform(orgtrans0*shape0->getChildTransform(child_index));
+			tr0 = orgtrans0*shape0->getChildTransform(child_index);
 		}
 
+		btCollisionObjectWrapper ob0(body0Wrap,colshape0,body0Wrap->getCollisionObject(),body0Wrap->getWorldTransform());
+
 		//collide two shapes
 		if(swapped)
 		{
-			shape_vs_shape_collision(body1,body0,shape1,colshape0);
+			shape_vs_shape_collision(body1Wrap,&ob0,shape1,colshape0);
 		}
 		else
 		{
-			shape_vs_shape_collision(body0,body1,colshape0,shape1);
-		}
-
-		//restore transforms
-		if(child_has_transform0)
-		{
-			body0->setWorldTransform(orgtrans0);
+			shape_vs_shape_collision(&ob0,body1Wrap,colshape0,shape1);
 		}
 
 	}
@@ -691,44 +672,39 @@ void btGImpactCollisionAlgorithm::gimpact_vs_shape(btCollisionObject * body0,
 
 }
 
-void btGImpactCollisionAlgorithm::gimpact_vs_compoundshape(btCollisionObject * body0,
-				  btCollisionObject * body1,
-				  btGImpactShapeInterface * shape0,
-				  btCompoundShape * shape1,bool swapped)
+void btGImpactCollisionAlgorithm::gimpact_vs_compoundshape(const btCollisionObjectWrapper* body0Wrap,
+				  const btCollisionObjectWrapper* body1Wrap,
+				  const btGImpactShapeInterface * shape0,
+				  const btCompoundShape * shape1,bool swapped)
 {
-	btTransform orgtrans1 = body1->getWorldTransform();
+	btTransform orgtrans1 = body1Wrap->getWorldTransform();
 
 	int i = shape1->getNumChildShapes();
 	while(i--)
 	{
 
-		btCollisionShape * colshape1 = shape1->getChildShape(i);
+		const btCollisionShape * colshape1 = shape1->getChildShape(i);
 		btTransform childtrans1 = orgtrans1*shape1->getChildTransform(i);
 
-		body1->setWorldTransform(childtrans1);
-
+		btCollisionObjectWrapper ob1(body1Wrap,colshape1,body1Wrap->getCollisionObject(),childtrans1);
 		//collide child shape
-		gimpact_vs_shape(body0, body1,
+		gimpact_vs_shape(body0Wrap, &ob1,
 					  shape0,colshape1,swapped);
-
-
-		//restore transforms
-		body1->setWorldTransform(orgtrans1);
 	}
 }
 
 void btGImpactCollisionAlgorithm::gimpacttrimeshpart_vs_plane_collision(
-					  btCollisionObject * body0,
-					  btCollisionObject * body1,
-					  btGImpactMeshShapePart * shape0,
-					  btStaticPlaneShape * shape1,bool swapped)
+					  const btCollisionObjectWrapper * body0Wrap,
+					  const btCollisionObjectWrapper * body1Wrap,
+					  const btGImpactMeshShapePart * shape0,
+					  const btStaticPlaneShape * shape1,bool swapped)
 {
 
 
-	btTransform orgtrans0 = body0->getWorldTransform();
-	btTransform orgtrans1 = body1->getWorldTransform();
+	btTransform orgtrans0 = body0Wrap->getWorldTransform();
+	btTransform orgtrans1 = body1Wrap->getWorldTransform();
 
-	btPlaneShape * planeshape = static_cast<btPlaneShape *>(shape1);
+	const btPlaneShape * planeshape = static_cast<const btPlaneShape *>(shape1);
 	btVector4 plane;
 	planeshape->get_plane_equation_transformed(orgtrans1,plane);
 
@@ -757,14 +733,14 @@ void btGImpactCollisionAlgorithm::gimpacttrimeshpart_vs_plane_collision(
 		{
 			if(swapped)
 			{
-				addContactPoint(body1, body0,
+				addContactPoint(body1Wrap, body0Wrap,
 					vertex,
 					-plane,
 					distance);
 			}
 			else
 			{
-				addContactPoint(body0, body1,
+				addContactPoint(body0Wrap, body1Wrap,
 					vertex,
 					plane,
 					distance);
@@ -782,9 +758,9 @@ class btGImpactTriangleCallback: public btTriangleCallback
 {
 public:
 	btGImpactCollisionAlgorithm * algorithm;
-	btCollisionObject * body0;
-	btCollisionObject * body1;
-	btGImpactShapeInterface * gimpactshape0;
+	const btCollisionObjectWrapper * body0Wrap;
+	const btCollisionObjectWrapper * body1Wrap;
+	const btGImpactShapeInterface * gimpactshape0;
 	bool swapped;
 	btScalar margin;
 
@@ -803,7 +779,7 @@ public:
             algorithm->setFace1(triangleIndex);
         }
 		algorithm->gimpact_vs_shape(
-							body0,body1,gimpactshape0,&tri1,swapped);
+							body0Wrap,body1Wrap,gimpactshape0,&tri1,swapped);
 	}
 };
 
@@ -811,16 +787,16 @@ public:
 
 
 void btGImpactCollisionAlgorithm::gimpact_vs_concave(
-				  btCollisionObject * body0,
-				  btCollisionObject * body1,
-				  btGImpactShapeInterface * shape0,
-				  btConcaveShape * shape1,bool swapped)
+				  const btCollisionObjectWrapper* body0Wrap,
+				  const btCollisionObjectWrapper * body1Wrap,
+				  const btGImpactShapeInterface * shape0,
+				  const btConcaveShape * shape1,bool swapped)
 {
 	//create the callback
 	btGImpactTriangleCallback tricallback;
 	tricallback.algorithm = this;
-	tricallback.body0 = body0;
-	tricallback.body1 = body1;
+	tricallback.body0Wrap = body0Wrap;
+	tricallback.body1Wrap = body1Wrap;
 	tricallback.gimpactshape0 = shape0;
 	tricallback.swapped = swapped;
 	tricallback.margin = shape1->getMargin();
@@ -828,7 +804,7 @@ void btGImpactCollisionAlgorithm::gimpact_vs_concave(
 	//getting the trimesh AABB
 	btTransform gimpactInConcaveSpace;
 
-	gimpactInConcaveSpace = body1->getWorldTransform().inverse() * body0->getWorldTransform();
+	gimpactInConcaveSpace = body1Wrap->getWorldTransform().inverse() * body0Wrap->getWorldTransform();
 
 	btVector3 minAABB,maxAABB;
 	shape0->getAabb(gimpactInConcaveSpace,minAABB,maxAABB);
@@ -839,36 +815,36 @@ void btGImpactCollisionAlgorithm::gimpact_vs_concave(
 
 
 
-void btGImpactCollisionAlgorithm::processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
+void btGImpactCollisionAlgorithm::processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
 {
     clearCache();
 
     m_resultOut = resultOut;
 	m_dispatchInfo = &dispatchInfo;
-    btGImpactShapeInterface * gimpactshape0;
-    btGImpactShapeInterface * gimpactshape1;
+    const btGImpactShapeInterface * gimpactshape0;
+    const btGImpactShapeInterface * gimpactshape1;
 
-	if (body0->getCollisionShape()->getShapeType()==GIMPACT_SHAPE_PROXYTYPE)
+	if (body0Wrap->getCollisionShape()->getShapeType()==GIMPACT_SHAPE_PROXYTYPE)
 	{
-		gimpactshape0 = static_cast<btGImpactShapeInterface *>(body0->getCollisionShape());
+		gimpactshape0 = static_cast<const btGImpactShapeInterface *>(body0Wrap->getCollisionShape());
 
-		if( body1->getCollisionShape()->getShapeType()==GIMPACT_SHAPE_PROXYTYPE )
+		if( body1Wrap->getCollisionShape()->getShapeType()==GIMPACT_SHAPE_PROXYTYPE )
 		{
-			gimpactshape1 = static_cast<btGImpactShapeInterface *>(body1->getCollisionShape());
+			gimpactshape1 = static_cast<const btGImpactShapeInterface *>(body1Wrap->getCollisionShape());
 
-			gimpact_vs_gimpact(body0,body1,gimpactshape0,gimpactshape1);
+			gimpact_vs_gimpact(body0Wrap,body1Wrap,gimpactshape0,gimpactshape1);
 		}
 		else
 		{
-			gimpact_vs_shape(body0,body1,gimpactshape0,body1->getCollisionShape(),false);
+			gimpact_vs_shape(body0Wrap,body1Wrap,gimpactshape0,body1Wrap->getCollisionShape(),false);
 		}
 
 	}
-	else if (body1->getCollisionShape()->getShapeType()==GIMPACT_SHAPE_PROXYTYPE )
+	else if (body1Wrap->getCollisionShape()->getShapeType()==GIMPACT_SHAPE_PROXYTYPE )
 	{
-		gimpactshape1 = static_cast<btGImpactShapeInterface *>(body1->getCollisionShape());
+		gimpactshape1 = static_cast<const btGImpactShapeInterface *>(body1Wrap->getCollisionShape());
 
-		gimpact_vs_shape(body1,body0,gimpactshape1,body0->getCollisionShape(),true);
+		gimpact_vs_shape(body1Wrap,body0Wrap,gimpactshape1,body0Wrap->getCollisionShape(),true);
 	}
 }
 
diff --git a/src/BulletCollision/Gimpact/btGImpactCollisionAlgorithm.h b/src/BulletCollision/Gimpact/btGImpactCollisionAlgorithm.h
index 6b6e07c98..c01b1eee8 100644
--- a/src/BulletCollision/Gimpact/btGImpactCollisionAlgorithm.h
+++ b/src/BulletCollision/Gimpact/btGImpactCollisionAlgorithm.h
@@ -40,7 +40,7 @@ class btDispatcher;
 #include "BulletCollision/CollisionShapes/btCompoundShape.h"
 #include "BulletCollision/CollisionDispatch/btConvexConvexAlgorithm.h"
 #include "LinearMath/btIDebugDraw.h"
-
+#include "BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h"
 
 
 //! Collision Algorithm for GImpact Shapes
@@ -65,7 +65,7 @@ protected:
 
 
 	//! Creates a new contact point
-	SIMD_FORCE_INLINE btPersistentManifold* newContactManifold(btCollisionObject* body0,btCollisionObject* body1)
+	SIMD_FORCE_INLINE btPersistentManifold* newContactManifold(const btCollisionObject* body0,const btCollisionObject* body1)
 	{
 		m_manifoldPtr = m_dispatcher->getNewManifold(body0,body1);
 		return m_manifoldPtr;
@@ -106,38 +106,38 @@ protected:
 
 
 	// Call before process collision
-	SIMD_FORCE_INLINE void checkManifold(btCollisionObject* body0,btCollisionObject* body1)
+	SIMD_FORCE_INLINE void checkManifold(const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
 	{
 		if(getLastManifold() == 0)
 		{
-			newContactManifold(body0,body1);
+			newContactManifold(body0Wrap->getCollisionObject(),body1Wrap->getCollisionObject());
 		}
 
 		m_resultOut->setPersistentManifold(getLastManifold());
 	}
 
 	// Call before process collision
-	SIMD_FORCE_INLINE btCollisionAlgorithm * newAlgorithm(btCollisionObject* body0,btCollisionObject* body1)
+	SIMD_FORCE_INLINE btCollisionAlgorithm * newAlgorithm(const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
 	{
-		checkManifold(body0,body1);
+		checkManifold(body0Wrap,body1Wrap);
 
 		btCollisionAlgorithm * convex_algorithm = m_dispatcher->findAlgorithm(
-				body0,body1,getLastManifold());
+				body0Wrap,body1Wrap,getLastManifold());
 		return convex_algorithm ;
 	}
 
 	// Call before process collision
-	SIMD_FORCE_INLINE void checkConvexAlgorithm(btCollisionObject* body0,btCollisionObject* body1)
+	SIMD_FORCE_INLINE void checkConvexAlgorithm(const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
 	{
 		if(m_convex_algorithm) return;
-		m_convex_algorithm = newAlgorithm(body0,body1);
+		m_convex_algorithm = newAlgorithm(body0Wrap,body1Wrap);
 	}
 
 
 
 
-	void addContactPoint(btCollisionObject * body0,
-					btCollisionObject * body1,
+	void addContactPoint(const btCollisionObjectWrapper * body0Wrap,
+					const btCollisionObjectWrapper * body1Wrap,
 					const btVector3 & point,
 					const btVector3 & normal,
 					btScalar distance);
@@ -145,62 +145,62 @@ protected:
 //! Collision routines
 //!@{
 
-	void collide_gjk_triangles(btCollisionObject * body0,
-				  btCollisionObject * body1,
-				  btGImpactMeshShapePart * shape0,
-				  btGImpactMeshShapePart * shape1,
+	void collide_gjk_triangles(const btCollisionObjectWrapper* body0Wrap,
+				  const btCollisionObjectWrapper* body1Wrap,
+				  const btGImpactMeshShapePart * shape0,
+				  const btGImpactMeshShapePart * shape1,
 				  const int * pairs, int pair_count);
 
-	void collide_sat_triangles(btCollisionObject * body0,
-					  btCollisionObject * body1,
-					  btGImpactMeshShapePart * shape0,
-					  btGImpactMeshShapePart * shape1,
+	void collide_sat_triangles(const btCollisionObjectWrapper* body0Wrap,
+					  const btCollisionObjectWrapper* body1Wrap,
+					  const btGImpactMeshShapePart * shape0,
+					  const btGImpactMeshShapePart * shape1,
 					  const int * pairs, int pair_count);
 
 
 
 
 	void shape_vs_shape_collision(
-					  btCollisionObject * body0,
-					  btCollisionObject * body1,
-					  btCollisionShape * shape0,
-					  btCollisionShape * shape1);
+					  const btCollisionObjectWrapper* body0,
+					  const btCollisionObjectWrapper* body1,
+					  const btCollisionShape * shape0,
+					  const btCollisionShape * shape1);
 
-	void convex_vs_convex_collision(btCollisionObject * body0,
-					  btCollisionObject * body1,
-					  btCollisionShape * shape0,
-					  btCollisionShape * shape1);
+	void convex_vs_convex_collision(const btCollisionObjectWrapper* body0Wrap,
+					  const btCollisionObjectWrapper* body1Wrap,
+					  const btCollisionShape* shape0,
+					  const btCollisionShape* shape1);
 
 
 
 	void gimpact_vs_gimpact_find_pairs(
 					  const btTransform & trans0,
 					  const btTransform & trans1,
-					  btGImpactShapeInterface * shape0,
-					  btGImpactShapeInterface * shape1,btPairSet & pairset);
+					  const btGImpactShapeInterface * shape0,
+					  const btGImpactShapeInterface * shape1,btPairSet & pairset);
 
 	void gimpact_vs_shape_find_pairs(
 					  const btTransform & trans0,
 					  const btTransform & trans1,
-					  btGImpactShapeInterface * shape0,
-					  btCollisionShape * shape1,
+					  const btGImpactShapeInterface * shape0,
+					  const btCollisionShape * shape1,
 					  btAlignedObjectArray<int> & collided_primitives);
 
 
 	void gimpacttrimeshpart_vs_plane_collision(
-					  btCollisionObject * body0,
-					  btCollisionObject * body1,
-					  btGImpactMeshShapePart * shape0,
-					  btStaticPlaneShape * shape1,bool swapped);
+					  const btCollisionObjectWrapper * body0Wrap,
+					  const btCollisionObjectWrapper * body1Wrap,
+					  const btGImpactMeshShapePart * shape0,
+					  const btStaticPlaneShape * shape1,bool swapped);
 
 
 public:
 
-	btGImpactCollisionAlgorithm( const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* body0,btCollisionObject* body1);
+	btGImpactCollisionAlgorithm( const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap);
 
 	virtual ~btGImpactCollisionAlgorithm();
 
-	virtual void processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
+	virtual void processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 
 	btScalar	calculateTimeOfImpact(btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 
@@ -213,10 +213,10 @@ public:
 
 	struct CreateFunc :public 	btCollisionAlgorithmCreateFunc
 	{
-		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, btCollisionObject* body0,btCollisionObject* body1)
+		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
 		{
 			void* mem = ci.m_dispatcher1->allocateCollisionAlgorithm(sizeof(btGImpactCollisionAlgorithm));
-			return new(mem) btGImpactCollisionAlgorithm(ci,body0,body1);
+			return new(mem) btGImpactCollisionAlgorithm(ci,body0Wrap,body1Wrap);
 		}
 	};
 
@@ -236,26 +236,26 @@ public:
 	*/
 
 
-	void gimpact_vs_gimpact(btCollisionObject * body0,
-					  btCollisionObject * body1,
-					  btGImpactShapeInterface * shape0,
-					  btGImpactShapeInterface * shape1);
+	void gimpact_vs_gimpact(const btCollisionObjectWrapper* body0Wrap,
+					  const btCollisionObjectWrapper * body1Wrap,
+					  const btGImpactShapeInterface * shape0,
+					  const btGImpactShapeInterface * shape1);
 
-	void gimpact_vs_shape(btCollisionObject * body0,
-					  btCollisionObject * body1,
-					  btGImpactShapeInterface * shape0,
-					  btCollisionShape * shape1,bool swapped);
+	void gimpact_vs_shape(const btCollisionObjectWrapper* body0Wrap,
+					  const btCollisionObjectWrapper* body1Wrap,
+					  const btGImpactShapeInterface * shape0,
+					  const btCollisionShape * shape1,bool swapped);
 
-	void gimpact_vs_compoundshape(btCollisionObject * body0,
-					  btCollisionObject * body1,
-					  btGImpactShapeInterface * shape0,
-					  btCompoundShape * shape1,bool swapped);
+	void gimpact_vs_compoundshape(const btCollisionObjectWrapper * body0Wrap,
+					  const btCollisionObjectWrapper * body1Wrap,
+					  const btGImpactShapeInterface * shape0,
+					  const btCompoundShape * shape1,bool swapped);
 
 	void gimpact_vs_concave(
-					  btCollisionObject * body0,
-					  btCollisionObject * body1,
-					  btGImpactShapeInterface * shape0,
-					  btConcaveShape * shape1,bool swapped);
+					  const btCollisionObjectWrapper * body0Wrap,
+					  const btCollisionObjectWrapper * body1Wrap,
+					  const btGImpactShapeInterface * shape0,
+					  const btConcaveShape * shape1,bool swapped);
 
 
 
diff --git a/src/BulletCollision/Gimpact/btGImpactQuantizedBvh.cpp b/src/BulletCollision/Gimpact/btGImpactQuantizedBvh.cpp
index cd4dfdb60..4528758c3 100644
--- a/src/BulletCollision/Gimpact/btGImpactQuantizedBvh.cpp
+++ b/src/BulletCollision/Gimpact/btGImpactQuantizedBvh.cpp
@@ -384,7 +384,7 @@ bool btGImpactQuantizedBvh::rayQuery(
 
 
 SIMD_FORCE_INLINE bool _quantized_node_collision(
-	btGImpactQuantizedBvh * boxset0, btGImpactQuantizedBvh * boxset1,
+	const btGImpactQuantizedBvh * boxset0, const btGImpactQuantizedBvh * boxset1,
 	const BT_BOX_BOX_TRANSFORM_CACHE & trans_cache_1to0,
 	int node0 ,int node1, bool complete_primitive_tests)
 {
@@ -402,7 +402,7 @@ SIMD_FORCE_INLINE bool _quantized_node_collision(
 
 //stackless recursive collision routine
 static void _find_quantized_collision_pairs_recursive(
-	btGImpactQuantizedBvh * boxset0, btGImpactQuantizedBvh * boxset1,
+	const btGImpactQuantizedBvh * boxset0, const btGImpactQuantizedBvh * boxset1,
 	btPairSet * collision_pairs,
 	const BT_BOX_BOX_TRANSFORM_CACHE & trans_cache_1to0,
 	int node0, int node1, bool complete_primitive_tests)
@@ -501,8 +501,8 @@ static void _find_quantized_collision_pairs_recursive(
 }
 
 
-void btGImpactQuantizedBvh::find_collision(btGImpactQuantizedBvh * boxset0, const btTransform & trans0,
-		btGImpactQuantizedBvh * boxset1, const btTransform & trans1,
+void btGImpactQuantizedBvh::find_collision(const btGImpactQuantizedBvh * boxset0, const btTransform & trans0,
+		const btGImpactQuantizedBvh * boxset1, const btTransform & trans1,
 		btPairSet & collision_pairs)
 {
 
diff --git a/src/BulletCollision/Gimpact/btGImpactQuantizedBvh.h b/src/BulletCollision/Gimpact/btGImpactQuantizedBvh.h
index 9c9907747..e6e52fff4 100644
--- a/src/BulletCollision/Gimpact/btGImpactQuantizedBvh.h
+++ b/src/BulletCollision/Gimpact/btGImpactQuantizedBvh.h
@@ -363,8 +363,8 @@ public:
 	static float getAverageTreeCollisionTime();
 #endif //TRI_COLLISION_PROFILING
 
-	static void find_collision(btGImpactQuantizedBvh * boxset1, const btTransform & trans1,
-		btGImpactQuantizedBvh * boxset2, const btTransform & trans2,
+	static void find_collision(const btGImpactQuantizedBvh * boxset1, const btTransform & trans1,
+		const btGImpactQuantizedBvh * boxset2, const btTransform & trans2,
 		btPairSet & collision_pairs);
 };
 
diff --git a/src/BulletCollision/Gimpact/btGImpactShape.h b/src/BulletCollision/Gimpact/btGImpactShape.h
index 90015bb9a..2a4f9386a 100644
--- a/src/BulletCollision/Gimpact/btGImpactShape.h
+++ b/src/BulletCollision/Gimpact/btGImpactShape.h
@@ -192,7 +192,7 @@ public:
 	virtual eGIMPACT_SHAPE_TYPE getGImpactShapeType() const = 0 ;
 
 	//! gets boxset
-	SIMD_FORCE_INLINE btGImpactBoxSet * getBoxSet()
+	SIMD_FORCE_INLINE const btGImpactBoxSet * getBoxSet() const
 	{
 		return &m_box_set;
 	}
diff --git a/src/BulletCollision/Gimpact/gim_array.h b/src/BulletCollision/Gimpact/gim_array.h
index cfd5da8f4..27e6f32fc 100644
--- a/src/BulletCollision/Gimpact/gim_array.h
+++ b/src/BulletCollision/Gimpact/gim_array.h
@@ -285,18 +285,16 @@ public:
 	    m_data[index] = obj;
 	}
 
-	inline void resize(GUINT size, bool call_constructor = true)
+	inline void resize(GUINT size, bool call_constructor = true, const T& fillData=T())
 	{
-
 	    if(size>m_size)
 	    {
             reserve(size);
             if(call_constructor)
             {
-            	T obj;
                 while(m_size<size)
                 {
-                    m_data[m_size] = obj;
+                    m_data[m_size] = fillData;
                     m_size++;
                 }
             }
diff --git a/src/BulletCollision/Gimpact/gim_box_collision.h b/src/BulletCollision/Gimpact/gim_box_collision.h
index b360dd470..9c572638a 100644
--- a/src/BulletCollision/Gimpact/gim_box_collision.h
+++ b/src/BulletCollision/Gimpact/gim_box_collision.h
@@ -186,9 +186,7 @@ public:
 
 	SIMD_FORCE_INLINE btVector3 transform(const btVector3 & point)
 	{
-		return btVector3(m_R1to0[0].dot(point) + m_T1to0.x(),
-			m_R1to0[1].dot(point) + m_T1to0.y(),
-			m_R1to0[2].dot(point) + m_T1to0.z());
+        return point.dot3(m_R1to0[0], m_R1to0[1], m_R1to0[2]) + m_T1to0;
 	}
 };
 
@@ -332,10 +330,10 @@ public:
 		// Compute new center
 		center = trans(center);
 
-		btVector3 textends(extends.dot(trans.getBasis().getRow(0).absolute()),
- 				 extends.dot(trans.getBasis().getRow(1).absolute()),
-				 extends.dot(trans.getBasis().getRow(2).absolute()));
-
+        btVector3 textends = extends.dot3(trans.getBasis().getRow(0).absolute(), 
+                                          trans.getBasis().getRow(1).absolute(), 
+                                          trans.getBasis().getRow(2).absolute());
+        
 		m_min = center - textends;
 		m_max = center + textends;
 	}
diff --git a/src/BulletCollision/NarrowPhaseCollision/btContinuousConvexCollision.cpp b/src/BulletCollision/NarrowPhaseCollision/btContinuousConvexCollision.cpp
index 91fcea57a..940282f57 100644
--- a/src/BulletCollision/NarrowPhaseCollision/btContinuousConvexCollision.cpp
+++ b/src/BulletCollision/NarrowPhaseCollision/btContinuousConvexCollision.cpp
@@ -62,7 +62,6 @@ void btContinuousConvexCollision::computeClosestPoints( const btTransform& trans
 		const btConvexShape* convexShape = m_convexA;
 		const btStaticPlaneShape* planeShape = m_planeShape;
 		
-		bool hasCollision = false;
 		const btVector3& planeNormal = planeShape->getPlaneNormal();
 		const btScalar& planeConstant = planeShape->getPlaneConstant();
 		
diff --git a/src/BulletCollision/NarrowPhaseCollision/btGjkPairDetector.h b/src/BulletCollision/NarrowPhaseCollision/btGjkPairDetector.h
index 2277a19d9..f0043b8b9 100644
--- a/src/BulletCollision/NarrowPhaseCollision/btGjkPairDetector.h
+++ b/src/BulletCollision/NarrowPhaseCollision/btGjkPairDetector.h
@@ -63,12 +63,12 @@ public:
 	void	getClosestPointsNonVirtual(const ClosestPointInput& input,Result& output,class btIDebugDraw* debugDraw);
 	
 
-	void setMinkowskiA(btConvexShape* minkA)
+	void setMinkowskiA(const btConvexShape* minkA)
 	{
 		m_minkowskiA = minkA;
 	}
 
-	void setMinkowskiB(btConvexShape* minkB)
+	void setMinkowskiB(const btConvexShape* minkB)
 	{
 		m_minkowskiB = minkB;
 	}
diff --git a/src/BulletCollision/NarrowPhaseCollision/btPersistentManifold.cpp b/src/BulletCollision/NarrowPhaseCollision/btPersistentManifold.cpp
index 954b83952..ec8735614 100644
--- a/src/BulletCollision/NarrowPhaseCollision/btPersistentManifold.cpp
+++ b/src/BulletCollision/NarrowPhaseCollision/btPersistentManifold.cpp
@@ -287,7 +287,7 @@ void btPersistentManifold::refreshContactPoints(const btTransform& trA,const btT
 			{
 				//contact point processed callback
 				if (gContactProcessedCallback)
-					(*gContactProcessedCallback)(manifoldPoint,m_body0,m_body1);
+					(*gContactProcessedCallback)(manifoldPoint,(void*)m_body0,(void*)m_body1);
 			}
 		}
 	}
diff --git a/src/BulletCollision/NarrowPhaseCollision/btPersistentManifold.h b/src/BulletCollision/NarrowPhaseCollision/btPersistentManifold.h
index d877f0994..a5df9e770 100644
--- a/src/BulletCollision/NarrowPhaseCollision/btPersistentManifold.h
+++ b/src/BulletCollision/NarrowPhaseCollision/btPersistentManifold.h
@@ -20,6 +20,7 @@ subject to the following restrictions:
 #include "LinearMath/btVector3.h"
 #include "LinearMath/btTransform.h"
 #include "btManifoldPoint.h"
+class btCollisionObject;
 #include "LinearMath/btAlignedAllocator.h"
 
 struct btCollisionResult;
@@ -57,9 +58,8 @@ ATTRIBUTE_ALIGNED128( class) btPersistentManifold : public btTypedObject
 	btManifoldPoint m_pointCache[MANIFOLD_CACHE_SIZE];
 
 	/// this two body pointers can point to the physics rigidbody class.
-	/// void* will allow any rigidbody class
-	void* m_body0;
-	void* m_body1;
+	const btCollisionObject* m_body0;
+	const btCollisionObject* m_body1;
 
 	int	m_cachedPoints;
 
@@ -83,7 +83,7 @@ public:
 
 	btPersistentManifold();
 
-	btPersistentManifold(void* body0,void* body1,int , btScalar contactBreakingThreshold,btScalar contactProcessingThreshold)
+	btPersistentManifold(const btCollisionObject* body0,const btCollisionObject* body1,int , btScalar contactBreakingThreshold,btScalar contactProcessingThreshold)
 		: btTypedObject(BT_PERSISTENT_MANIFOLD_TYPE),
 	m_body0(body0),m_body1(body1),m_cachedPoints(0),
 		m_contactBreakingThreshold(contactBreakingThreshold),
@@ -91,13 +91,10 @@ public:
 	{
 	}
 
-	SIMD_FORCE_INLINE void* getBody0() { return m_body0;}
-	SIMD_FORCE_INLINE void* getBody1() { return m_body1;}
+	SIMD_FORCE_INLINE const btCollisionObject* getBody0() const { return m_body0;}
+	SIMD_FORCE_INLINE const btCollisionObject* getBody1() const { return m_body1;}
 
-	SIMD_FORCE_INLINE const void* getBody0() const { return m_body0;}
-	SIMD_FORCE_INLINE const void* getBody1() const { return m_body1;}
-
-	void	setBodies(void* body0,void* body1)
+	void	setBodies(const btCollisionObject* body0,const btCollisionObject* body1)
 	{
 		m_body0 = body0;
 		m_body1 = body1;
diff --git a/src/BulletCollision/NarrowPhaseCollision/btPolyhedralContactClipping.cpp b/src/BulletCollision/NarrowPhaseCollision/btPolyhedralContactClipping.cpp
index 61573ddb9..f37359976 100644
--- a/src/BulletCollision/NarrowPhaseCollision/btPolyhedralContactClipping.cpp
+++ b/src/BulletCollision/NarrowPhaseCollision/btPolyhedralContactClipping.cpp
@@ -1,440 +1,438 @@
-/*
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2011 Advanced Micro Devices, Inc.  http://bulletphysics.org
-
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-*/
-
-
-///This file was written by Erwin Coumans
-///Separating axis rest based on work from Pierre Terdiman, see
-///And contact clipping based on work from Simon Hobbs
-
-
-#include "btPolyhedralContactClipping.h"
-#include "BulletCollision/CollisionShapes/btConvexPolyhedron.h"
-
-#include <float.h> //for FLT_MAX
-
-int gExpectedNbTests=0;
-int gActualNbTests = 0;
-bool gUseInternalObject = true;
-
-// Clips a face to the back of a plane
-void btPolyhedralContactClipping::clipFace(const btVertexArray& pVtxIn, btVertexArray& ppVtxOut, const btVector3& planeNormalWS,btScalar planeEqWS)
-{
-	
-	int ve;
-	btScalar ds, de;
-	int numVerts = pVtxIn.size();
-	if (numVerts < 2)
-		return;
-
-	btVector3 firstVertex=pVtxIn[pVtxIn.size()-1];
-	btVector3 endVertex = pVtxIn[0];
-	
-	ds = planeNormalWS.dot(firstVertex)+planeEqWS;
-
-	for (ve = 0; ve < numVerts; ve++)
-	{
-		endVertex=pVtxIn[ve];
-
-		de = planeNormalWS.dot(endVertex)+planeEqWS;
-
-		if (ds<0)
-		{
-			if (de<0)
-			{
-				// Start < 0, end < 0, so output endVertex
-				ppVtxOut.push_back(endVertex);
-			}
-			else
-			{
-				// Start < 0, end >= 0, so output intersection
-				ppVtxOut.push_back( 	firstVertex.lerp(endVertex,btScalar(ds * 1.f/(ds - de))));
-			}
-		}
-		else
-		{
-			if (de<0)
-			{
-				// Start >= 0, end < 0 so output intersection and end
-				ppVtxOut.push_back(firstVertex.lerp(endVertex,btScalar(ds * 1.f/(ds - de))));
-				ppVtxOut.push_back(endVertex);
-			}
-		}
-		firstVertex = endVertex;
-		ds = de;
-	}
-}
-
-
-static bool TestSepAxis(const btConvexPolyhedron& hullA, const btConvexPolyhedron& hullB, const btTransform& transA,const btTransform& transB, const btVector3& sep_axis, btScalar& depth)
-{
-	btScalar Min0,Max0;
-	btScalar Min1,Max1;
-	hullA.project(transA,sep_axis, Min0, Max0);
-	hullB.project(transB, sep_axis, Min1, Max1);
-
-	if(Max0<Min1 || Max1<Min0)
-		return false;
-
-	btScalar d0 = Max0 - Min1;
-	assert(d0>=0.0f);
-	btScalar d1 = Max1 - Min0;
-	assert(d1>=0.0f);
-	depth = d0<d1 ? d0:d1;
-	return true;
-}
-
-
-
-static int gActualSATPairTests=0;
-
-inline bool IsAlmostZero(const btVector3& v)
-{
-	if(fabsf(v.x())>1e-6 || fabsf(v.y())>1e-6 || fabsf(v.z())>1e-6)	return false;
-	return true;
-}
-
-#ifdef TEST_INTERNAL_OBJECTS
-
-inline void BoxSupport(const btScalar extents[3], const btScalar sv[3], btScalar p[3])
-{
-	// This version is ~11.000 cycles (4%) faster overall in one of the tests.
-//	IR(p[0]) = IR(extents[0])|(IR(sv[0])&SIGN_BITMASK);
-//	IR(p[1]) = IR(extents[1])|(IR(sv[1])&SIGN_BITMASK);
-//	IR(p[2]) = IR(extents[2])|(IR(sv[2])&SIGN_BITMASK);
-	p[0] = sv[0] < 0.0f ? -extents[0] : extents[0];
-	p[1] = sv[1] < 0.0f ? -extents[1] : extents[1];
-	p[2] = sv[2] < 0.0f ? -extents[2] : extents[2];
-}
-
-void InverseTransformPoint3x3(btVector3& out, const btVector3& in, const btTransform& tr)
-{
-	const btMatrix3x3& rot = tr.getBasis();
-	const btVector3& r0 = rot[0];
-	const btVector3& r1 = rot[1];
-	const btVector3& r2 = rot[2];
-
-	const btScalar x = r0.x()*in.x() + r1.x()*in.y() + r2.x()*in.z();
-	const btScalar y = r0.y()*in.x() + r1.y()*in.y() + r2.y()*in.z();
-	const btScalar z = r0.z()*in.x() + r1.z()*in.y() + r2.z()*in.z();
-
-	out.setValue(x, y, z);
-}
-
- bool TestInternalObjects( const btTransform& trans0, const btTransform& trans1, const btVector3& delta_c, const btVector3& axis, const btConvexPolyhedron& convex0, const btConvexPolyhedron& convex1, btScalar dmin)
-{
-	const btScalar dp = delta_c.dot(axis);
-
-	btVector3 localAxis0;
-	InverseTransformPoint3x3(localAxis0, axis,trans0);
-	btVector3 localAxis1;
-	InverseTransformPoint3x3(localAxis1, axis,trans1);
-
-	btScalar p0[3];
-	BoxSupport(convex0.m_extents, localAxis0, p0);
-	btScalar p1[3];
-	BoxSupport(convex1.m_extents, localAxis1, p1);
-
-	const btScalar Radius0 = p0[0]*localAxis0.x() + p0[1]*localAxis0.y() + p0[2]*localAxis0.z();
-	const btScalar Radius1 = p1[0]*localAxis1.x() + p1[1]*localAxis1.y() + p1[2]*localAxis1.z();
-
-	const btScalar MinRadius = Radius0>convex0.m_radius ? Radius0 : convex0.m_radius;
-	const btScalar MaxRadius = Radius1>convex1.m_radius ? Radius1 : convex1.m_radius;
-
-	const btScalar MinMaxRadius = MaxRadius + MinRadius;
-	const btScalar d0 = MinMaxRadius + dp;
-	const btScalar d1 = MinMaxRadius - dp;
-
-	const btScalar depth = d0<d1 ? d0:d1;
-	if(depth>dmin)
-		return false;
-	return true;
-}
-#endif //TEST_INTERNAL_OBJECTS
-
-
-bool btPolyhedralContactClipping::findSeparatingAxis(	const btConvexPolyhedron& hullA, const btConvexPolyhedron& hullB, const btTransform& transA,const btTransform& transB, btVector3& sep)
-{
-	gActualSATPairTests++;
-
-//#ifdef TEST_INTERNAL_OBJECTS
-	const btVector3 c0 = transA * hullA.m_localCenter;
-	const btVector3 c1 = transB * hullB.m_localCenter;
-	const btVector3 DeltaC2 = c0 - c1;
-//#endif
-
-	btScalar dmin = FLT_MAX;
-	int curPlaneTests=0;
-
-	int numFacesA = hullA.m_faces.size();
-	// Test normals from hullA
-	for(int i=0;i<numFacesA;i++)
-	{
-		const btVector3 Normal(hullA.m_faces[i].m_plane[0], hullA.m_faces[i].m_plane[1], hullA.m_faces[i].m_plane[2]);
-		const btVector3 faceANormalWS = transA.getBasis() * Normal;
-		if (DeltaC2.dot(faceANormalWS)<0)
-			continue;
-
-		curPlaneTests++;
-#ifdef TEST_INTERNAL_OBJECTS
-		gExpectedNbTests++;
-		if(gUseInternalObject && !TestInternalObjects(transA,transB, DeltaC2, faceANormalWS, hullA, hullB, dmin))
-			continue;
-		gActualNbTests++;
-#endif
-
-		btScalar d;
-		if(!TestSepAxis( hullA, hullB, transA,transB, faceANormalWS, d))
-			return false;
-
-		if(d<dmin)
-		{
-			dmin = d;
-			sep = faceANormalWS;
-		}
-	}
-
-	int numFacesB = hullB.m_faces.size();
-	// Test normals from hullB
-	for(int i=0;i<numFacesB;i++)
-	{
-		const btVector3 Normal(hullB.m_faces[i].m_plane[0], hullB.m_faces[i].m_plane[1], hullB.m_faces[i].m_plane[2]);
-		const btVector3 WorldNormal = transB.getBasis() * Normal;
-		if (DeltaC2.dot(WorldNormal)<0)
-			continue;
-
-		curPlaneTests++;
-#ifdef TEST_INTERNAL_OBJECTS
-		gExpectedNbTests++;
-		if(gUseInternalObject && !TestInternalObjects(transA,transB,DeltaC2, WorldNormal, hullA, hullB, dmin))
-			continue;
-		gActualNbTests++;
-#endif
-
-		btScalar d;
-		if(!TestSepAxis(hullA, hullB,transA,transB, WorldNormal,d))
-			return false;
-
-		if(d<dmin)
-		{
-			dmin = d;
-			sep = WorldNormal;
-		}
-	}
-
-	btVector3 edgeAstart,edgeAend,edgeBstart,edgeBend;
-
-	int curEdgeEdge = 0;
-	// Test edges
-	for(int e0=0;e0<hullA.m_uniqueEdges.size();e0++)
-	{
-		const btVector3 edge0 = hullA.m_uniqueEdges[e0];
-		const btVector3 WorldEdge0 = transA.getBasis() * edge0;
-		for(int e1=0;e1<hullB.m_uniqueEdges.size();e1++)
-		{
-			const btVector3 edge1 = hullB.m_uniqueEdges[e1];
-			const btVector3 WorldEdge1 = transB.getBasis() * edge1;
-
-			btVector3 Cross = WorldEdge0.cross(WorldEdge1);
-			curEdgeEdge++;
-			if(!IsAlmostZero(Cross))
-			{
-				Cross = Cross.normalize();
-				if (DeltaC2.dot(Cross)<0)
-					continue;
-
-
-#ifdef TEST_INTERNAL_OBJECTS
-				gExpectedNbTests++;
-				if(gUseInternalObject && !TestInternalObjects(transA,transB,DeltaC2, Cross, hullA, hullB, dmin))
-					continue;
-				gActualNbTests++;
-#endif
-
-				btScalar dist;
-				if(!TestSepAxis( hullA, hullB, transA,transB, Cross, dist))
-					return false;
-
-				if(dist<dmin)
-				{
-					dmin = dist;
-					sep = Cross;
-				}
-			}
-		}
-
-	}
-
-	const btVector3 deltaC = transB.getOrigin() - transA.getOrigin();
-	if((deltaC.dot(sep))>0.0f)
-		sep = -sep;
-
-	return true;
-}
-
-void	btPolyhedralContactClipping::clipFaceAgainstHull(const btVector3& separatingNormal, const btConvexPolyhedron& hullA,  const btTransform& transA, btVertexArray& worldVertsB1, const btScalar minDist, btScalar maxDist,btDiscreteCollisionDetectorInterface::Result& resultOut)
-{
-	btVertexArray worldVertsB2;
-	btVertexArray* pVtxIn = &worldVertsB1;
-	btVertexArray* pVtxOut = &worldVertsB2;
-	pVtxOut->reserve(pVtxIn->size());
-
-	int closestFaceA=-1;
-	{
-		btScalar dmin = FLT_MAX;
-		for(int face=0;face<hullA.m_faces.size();face++)
-		{
-			const btVector3 Normal(hullA.m_faces[face].m_plane[0], hullA.m_faces[face].m_plane[1], hullA.m_faces[face].m_plane[2]);
-			const btVector3 faceANormalWS = transA.getBasis() * Normal;
-		
-			btScalar d = faceANormalWS.dot(separatingNormal);
-			if (d < dmin)
-			{
-				dmin = d;
-				closestFaceA = face;
-			}
-		}
-	}
-	if (closestFaceA<0)
-		return;
-
-	const btFace& polyA = hullA.m_faces[closestFaceA];
-
-		// clip polygon to back of planes of all faces of hull A that are adjacent to witness face
-	int numContacts = pVtxIn->size();
-	int numVerticesA = polyA.m_indices.size();
-	for(int e0=0;e0<numVerticesA;e0++)
-	{
-		const btVector3& a = hullA.m_vertices[polyA.m_indices[e0]];
-		const btVector3& b = hullA.m_vertices[polyA.m_indices[(e0+1)%numVerticesA]];
-		const btVector3 edge0 = a - b;
-		const btVector3 WorldEdge0 = transA.getBasis() * edge0;
-		btVector3 worldPlaneAnormal1 = transA.getBasis()* btVector3(polyA.m_plane[0],polyA.m_plane[1],polyA.m_plane[2]);
-
-		btVector3 planeNormalWS1 = -WorldEdge0.cross(worldPlaneAnormal1);//.cross(WorldEdge0);
-		btVector3 worldA1 = transA*a;
-		btScalar planeEqWS1 = -worldA1.dot(planeNormalWS1);
-		
-//int otherFace=0;
-#ifdef BLA1
-		int otherFace = polyA.m_connectedFaces[e0];
-		btVector3 localPlaneNormal (hullA.m_faces[otherFace].m_plane[0],hullA.m_faces[otherFace].m_plane[1],hullA.m_faces[otherFace].m_plane[2]);
-		btScalar localPlaneEq = hullA.m_faces[otherFace].m_plane[3];
-
-		btVector3 planeNormalWS = transA.getBasis()*localPlaneNormal;
-		btScalar planeEqWS=localPlaneEq-planeNormalWS.dot(transA.getOrigin());
-#else 
-		btVector3 planeNormalWS = planeNormalWS1;
-		btScalar planeEqWS=planeEqWS1;
-		
-#endif
-		//clip face
-
-		clipFace(*pVtxIn, *pVtxOut,planeNormalWS,planeEqWS);
-		btSwap(pVtxIn,pVtxOut);
-		pVtxOut->resize(0);
-	}
-
-
-
-//#define ONLY_REPORT_DEEPEST_POINT
-
-	btVector3 point;
-	
-
-	// only keep points that are behind the witness face
-	{
-		btVector3 localPlaneNormal (polyA.m_plane[0],polyA.m_plane[1],polyA.m_plane[2]);
-		btScalar localPlaneEq = polyA.m_plane[3];
-		btVector3 planeNormalWS = transA.getBasis()*localPlaneNormal;
-		btScalar planeEqWS=localPlaneEq-planeNormalWS.dot(transA.getOrigin());
-		for (int i=0;i<pVtxIn->size();i++)
-		{
-			
-			btScalar depth = planeNormalWS.dot(pVtxIn->at(i))+planeEqWS;
-			if (depth <=minDist)
-			{
-//				printf("clamped: depth=%f to minDist=%f\n",depth,minDist);
-				depth = minDist;
-			}
-
-			if (depth <=maxDist)
-			{
-				btVector3 point = pVtxIn->at(i);
-#ifdef ONLY_REPORT_DEEPEST_POINT
-				curMaxDist = depth;
-#else
-#if 0
-				if (depth<-3)
-				{
-					printf("error in btPolyhedralContactClipping depth = %f\n", depth);
-					printf("likely wrong separatingNormal passed in\n");
-				} 
-#endif				
-				resultOut.addContactPoint(separatingNormal,point,depth);
-#endif
-			}
-		}
-	}
-#ifdef ONLY_REPORT_DEEPEST_POINT
-	if (curMaxDist<maxDist)
-	{
-		resultOut.addContactPoint(separatingNormal,point,curMaxDist);
-	}
-#endif //ONLY_REPORT_DEEPEST_POINT
-
-}
-
-
-void	btPolyhedralContactClipping::clipHullAgainstHull(const btVector3& separatingNormal1, const btConvexPolyhedron& hullA, const btConvexPolyhedron& hullB, const btTransform& transA,const btTransform& transB, const btScalar minDist, btScalar maxDist,btDiscreteCollisionDetectorInterface::Result& resultOut)
-{
-
-	btVector3 separatingNormal = separatingNormal1.normalized();
-	const btVector3 c0 = transA * hullA.m_localCenter;
-	const btVector3 c1 = transB * hullB.m_localCenter;
-	const btVector3 DeltaC2 = c0 - c1;
-
-
-	btScalar curMaxDist=maxDist;
-	int closestFaceB=-1;
-	btScalar dmax = -FLT_MAX;
-	{
-		for(int face=0;face<hullB.m_faces.size();face++)
-		{
-			const btVector3 Normal(hullB.m_faces[face].m_plane[0], hullB.m_faces[face].m_plane[1], hullB.m_faces[face].m_plane[2]);
-			const btVector3 WorldNormal = transB.getBasis() * Normal;
-			btScalar d = WorldNormal.dot(separatingNormal);
-			if (d > dmax)
-			{
-				dmax = d;
-				closestFaceB = face;
-			}
-		}
-	}
-				btVertexArray worldVertsB1;
-				{
-					const btFace& polyB = hullB.m_faces[closestFaceB];
-					const int numVertices = polyB.m_indices.size();
-					for(int e0=0;e0<numVertices;e0++)
-					{
-						const btVector3& b = hullB.m_vertices[polyB.m_indices[e0]];
-						worldVertsB1.push_back(transB*b);
-					}
-				}
-
-	
-	if (closestFaceB>=0)
-		clipFaceAgainstHull(separatingNormal, hullA, transA,worldVertsB1, minDist, maxDist,resultOut);
-
-}
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2011 Advanced Micro Devices, Inc.  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+///This file was written by Erwin Coumans
+///Separating axis rest based on work from Pierre Terdiman, see
+///And contact clipping based on work from Simon Hobbs
+
+
+#include "btPolyhedralContactClipping.h"
+#include "BulletCollision/CollisionShapes/btConvexPolyhedron.h"
+
+#include <float.h> //for FLT_MAX
+
+int gExpectedNbTests=0;
+int gActualNbTests = 0;
+bool gUseInternalObject = true;
+
+// Clips a face to the back of a plane
+void btPolyhedralContactClipping::clipFace(const btVertexArray& pVtxIn, btVertexArray& ppVtxOut, const btVector3& planeNormalWS,btScalar planeEqWS)
+{
+	
+	int ve;
+	btScalar ds, de;
+	int numVerts = pVtxIn.size();
+	if (numVerts < 2)
+		return;
+
+	btVector3 firstVertex=pVtxIn[pVtxIn.size()-1];
+	btVector3 endVertex = pVtxIn[0];
+	
+	ds = planeNormalWS.dot(firstVertex)+planeEqWS;
+
+	for (ve = 0; ve < numVerts; ve++)
+	{
+		endVertex=pVtxIn[ve];
+
+		de = planeNormalWS.dot(endVertex)+planeEqWS;
+
+		if (ds<0)
+		{
+			if (de<0)
+			{
+				// Start < 0, end < 0, so output endVertex
+				ppVtxOut.push_back(endVertex);
+			}
+			else
+			{
+				// Start < 0, end >= 0, so output intersection
+				ppVtxOut.push_back( 	firstVertex.lerp(endVertex,btScalar(ds * 1.f/(ds - de))));
+			}
+		}
+		else
+		{
+			if (de<0)
+			{
+				// Start >= 0, end < 0 so output intersection and end
+				ppVtxOut.push_back(firstVertex.lerp(endVertex,btScalar(ds * 1.f/(ds - de))));
+				ppVtxOut.push_back(endVertex);
+			}
+		}
+		firstVertex = endVertex;
+		ds = de;
+	}
+}
+
+
+static bool TestSepAxis(const btConvexPolyhedron& hullA, const btConvexPolyhedron& hullB, const btTransform& transA,const btTransform& transB, const btVector3& sep_axis, btScalar& depth)
+{
+	btScalar Min0,Max0;
+	btScalar Min1,Max1;
+	hullA.project(transA,sep_axis, Min0, Max0);
+	hullB.project(transB, sep_axis, Min1, Max1);
+
+	if(Max0<Min1 || Max1<Min0)
+		return false;
+
+	btScalar d0 = Max0 - Min1;
+	assert(d0>=0.0f);
+	btScalar d1 = Max1 - Min0;
+	assert(d1>=0.0f);
+	depth = d0<d1 ? d0:d1;
+	return true;
+}
+
+
+
+static int gActualSATPairTests=0;
+
+inline bool IsAlmostZero(const btVector3& v)
+{
+	if(fabsf(v.x())>1e-6 || fabsf(v.y())>1e-6 || fabsf(v.z())>1e-6)	return false;
+	return true;
+}
+
+#ifdef TEST_INTERNAL_OBJECTS
+
+inline void BoxSupport(const btScalar extents[3], const btScalar sv[3], btScalar p[3])
+{
+	// This version is ~11.000 cycles (4%) faster overall in one of the tests.
+//	IR(p[0]) = IR(extents[0])|(IR(sv[0])&SIGN_BITMASK);
+//	IR(p[1]) = IR(extents[1])|(IR(sv[1])&SIGN_BITMASK);
+//	IR(p[2]) = IR(extents[2])|(IR(sv[2])&SIGN_BITMASK);
+	p[0] = sv[0] < 0.0f ? -extents[0] : extents[0];
+	p[1] = sv[1] < 0.0f ? -extents[1] : extents[1];
+	p[2] = sv[2] < 0.0f ? -extents[2] : extents[2];
+}
+
+void InverseTransformPoint3x3(btVector3& out, const btVector3& in, const btTransform& tr)
+{
+	const btMatrix3x3& rot = tr.getBasis();
+	const btVector3& r0 = rot[0];
+	const btVector3& r1 = rot[1];
+	const btVector3& r2 = rot[2];
+
+	const btScalar x = r0.x()*in.x() + r1.x()*in.y() + r2.x()*in.z();
+	const btScalar y = r0.y()*in.x() + r1.y()*in.y() + r2.y()*in.z();
+	const btScalar z = r0.z()*in.x() + r1.z()*in.y() + r2.z()*in.z();
+
+	out.setValue(x, y, z);
+}
+
+ bool TestInternalObjects( const btTransform& trans0, const btTransform& trans1, const btVector3& delta_c, const btVector3& axis, const btConvexPolyhedron& convex0, const btConvexPolyhedron& convex1, btScalar dmin)
+{
+	const btScalar dp = delta_c.dot(axis);
+
+	btVector3 localAxis0;
+	InverseTransformPoint3x3(localAxis0, axis,trans0);
+	btVector3 localAxis1;
+	InverseTransformPoint3x3(localAxis1, axis,trans1);
+
+	btScalar p0[3];
+	BoxSupport(convex0.m_extents, localAxis0, p0);
+	btScalar p1[3];
+	BoxSupport(convex1.m_extents, localAxis1, p1);
+
+	const btScalar Radius0 = p0[0]*localAxis0.x() + p0[1]*localAxis0.y() + p0[2]*localAxis0.z();
+	const btScalar Radius1 = p1[0]*localAxis1.x() + p1[1]*localAxis1.y() + p1[2]*localAxis1.z();
+
+	const btScalar MinRadius = Radius0>convex0.m_radius ? Radius0 : convex0.m_radius;
+	const btScalar MaxRadius = Radius1>convex1.m_radius ? Radius1 : convex1.m_radius;
+
+	const btScalar MinMaxRadius = MaxRadius + MinRadius;
+	const btScalar d0 = MinMaxRadius + dp;
+	const btScalar d1 = MinMaxRadius - dp;
+
+	const btScalar depth = d0<d1 ? d0:d1;
+	if(depth>dmin)
+		return false;
+	return true;
+}
+#endif //TEST_INTERNAL_OBJECTS
+
+
+bool btPolyhedralContactClipping::findSeparatingAxis(	const btConvexPolyhedron& hullA, const btConvexPolyhedron& hullB, const btTransform& transA,const btTransform& transB, btVector3& sep)
+{
+	gActualSATPairTests++;
+
+//#ifdef TEST_INTERNAL_OBJECTS
+	const btVector3 c0 = transA * hullA.m_localCenter;
+	const btVector3 c1 = transB * hullB.m_localCenter;
+	const btVector3 DeltaC2 = c0 - c1;
+//#endif
+
+	btScalar dmin = FLT_MAX;
+	int curPlaneTests=0;
+
+	int numFacesA = hullA.m_faces.size();
+	// Test normals from hullA
+	for(int i=0;i<numFacesA;i++)
+	{
+		const btVector3 Normal(hullA.m_faces[i].m_plane[0], hullA.m_faces[i].m_plane[1], hullA.m_faces[i].m_plane[2]);
+		const btVector3 faceANormalWS = transA.getBasis() * Normal;
+		if (DeltaC2.dot(faceANormalWS)<0)
+			continue;
+
+		curPlaneTests++;
+#ifdef TEST_INTERNAL_OBJECTS
+		gExpectedNbTests++;
+		if(gUseInternalObject && !TestInternalObjects(transA,transB, DeltaC2, faceANormalWS, hullA, hullB, dmin))
+			continue;
+		gActualNbTests++;
+#endif
+
+		btScalar d;
+		if(!TestSepAxis( hullA, hullB, transA,transB, faceANormalWS, d))
+			return false;
+
+		if(d<dmin)
+		{
+			dmin = d;
+			sep = faceANormalWS;
+		}
+	}
+
+	int numFacesB = hullB.m_faces.size();
+	// Test normals from hullB
+	for(int i=0;i<numFacesB;i++)
+	{
+		const btVector3 Normal(hullB.m_faces[i].m_plane[0], hullB.m_faces[i].m_plane[1], hullB.m_faces[i].m_plane[2]);
+		const btVector3 WorldNormal = transB.getBasis() * Normal;
+		if (DeltaC2.dot(WorldNormal)<0)
+			continue;
+
+		curPlaneTests++;
+#ifdef TEST_INTERNAL_OBJECTS
+		gExpectedNbTests++;
+		if(gUseInternalObject && !TestInternalObjects(transA,transB,DeltaC2, WorldNormal, hullA, hullB, dmin))
+			continue;
+		gActualNbTests++;
+#endif
+
+		btScalar d;
+		if(!TestSepAxis(hullA, hullB,transA,transB, WorldNormal,d))
+			return false;
+
+		if(d<dmin)
+		{
+			dmin = d;
+			sep = WorldNormal;
+		}
+	}
+
+	btVector3 edgeAstart,edgeAend,edgeBstart,edgeBend;
+
+	int curEdgeEdge = 0;
+	// Test edges
+	for(int e0=0;e0<hullA.m_uniqueEdges.size();e0++)
+	{
+		const btVector3 edge0 = hullA.m_uniqueEdges[e0];
+		const btVector3 WorldEdge0 = transA.getBasis() * edge0;
+		for(int e1=0;e1<hullB.m_uniqueEdges.size();e1++)
+		{
+			const btVector3 edge1 = hullB.m_uniqueEdges[e1];
+			const btVector3 WorldEdge1 = transB.getBasis() * edge1;
+
+			btVector3 Cross = WorldEdge0.cross(WorldEdge1);
+			curEdgeEdge++;
+			if(!IsAlmostZero(Cross))
+			{
+				Cross = Cross.normalize();
+				if (DeltaC2.dot(Cross)<0)
+					continue;
+
+
+#ifdef TEST_INTERNAL_OBJECTS
+				gExpectedNbTests++;
+				if(gUseInternalObject && !TestInternalObjects(transA,transB,DeltaC2, Cross, hullA, hullB, dmin))
+					continue;
+				gActualNbTests++;
+#endif
+
+				btScalar dist;
+				if(!TestSepAxis( hullA, hullB, transA,transB, Cross, dist))
+					return false;
+
+				if(dist<dmin)
+				{
+					dmin = dist;
+					sep = Cross;
+				}
+			}
+		}
+
+	}
+
+	const btVector3 deltaC = transB.getOrigin() - transA.getOrigin();
+	if((deltaC.dot(sep))>0.0f)
+		sep = -sep;
+
+	return true;
+}
+
+void	btPolyhedralContactClipping::clipFaceAgainstHull(const btVector3& separatingNormal, const btConvexPolyhedron& hullA,  const btTransform& transA, btVertexArray& worldVertsB1, const btScalar minDist, btScalar maxDist,btDiscreteCollisionDetectorInterface::Result& resultOut)
+{
+	btVertexArray worldVertsB2;
+	btVertexArray* pVtxIn = &worldVertsB1;
+	btVertexArray* pVtxOut = &worldVertsB2;
+	pVtxOut->reserve(pVtxIn->size());
+
+	int closestFaceA=-1;
+	{
+		btScalar dmin = FLT_MAX;
+		for(int face=0;face<hullA.m_faces.size();face++)
+		{
+			const btVector3 Normal(hullA.m_faces[face].m_plane[0], hullA.m_faces[face].m_plane[1], hullA.m_faces[face].m_plane[2]);
+			const btVector3 faceANormalWS = transA.getBasis() * Normal;
+		
+			btScalar d = faceANormalWS.dot(separatingNormal);
+			if (d < dmin)
+			{
+				dmin = d;
+				closestFaceA = face;
+			}
+		}
+	}
+	if (closestFaceA<0)
+		return;
+
+	const btFace& polyA = hullA.m_faces[closestFaceA];
+
+		// clip polygon to back of planes of all faces of hull A that are adjacent to witness face
+	int numVerticesA = polyA.m_indices.size();
+	for(int e0=0;e0<numVerticesA;e0++)
+	{
+		const btVector3& a = hullA.m_vertices[polyA.m_indices[e0]];
+		const btVector3& b = hullA.m_vertices[polyA.m_indices[(e0+1)%numVerticesA]];
+		const btVector3 edge0 = a - b;
+		const btVector3 WorldEdge0 = transA.getBasis() * edge0;
+		btVector3 worldPlaneAnormal1 = transA.getBasis()* btVector3(polyA.m_plane[0],polyA.m_plane[1],polyA.m_plane[2]);
+
+		btVector3 planeNormalWS1 = -WorldEdge0.cross(worldPlaneAnormal1);//.cross(WorldEdge0);
+		btVector3 worldA1 = transA*a;
+		btScalar planeEqWS1 = -worldA1.dot(planeNormalWS1);
+		
+//int otherFace=0;
+#ifdef BLA1
+		int otherFace = polyA.m_connectedFaces[e0];
+		btVector3 localPlaneNormal (hullA.m_faces[otherFace].m_plane[0],hullA.m_faces[otherFace].m_plane[1],hullA.m_faces[otherFace].m_plane[2]);
+		btScalar localPlaneEq = hullA.m_faces[otherFace].m_plane[3];
+
+		btVector3 planeNormalWS = transA.getBasis()*localPlaneNormal;
+		btScalar planeEqWS=localPlaneEq-planeNormalWS.dot(transA.getOrigin());
+#else 
+		btVector3 planeNormalWS = planeNormalWS1;
+		btScalar planeEqWS=planeEqWS1;
+		
+#endif
+		//clip face
+
+		clipFace(*pVtxIn, *pVtxOut,planeNormalWS,planeEqWS);
+		btSwap(pVtxIn,pVtxOut);
+		pVtxOut->resize(0);
+	}
+
+
+
+//#define ONLY_REPORT_DEEPEST_POINT
+
+	btVector3 point;
+	
+
+	// only keep points that are behind the witness face
+	{
+		btVector3 localPlaneNormal (polyA.m_plane[0],polyA.m_plane[1],polyA.m_plane[2]);
+		btScalar localPlaneEq = polyA.m_plane[3];
+		btVector3 planeNormalWS = transA.getBasis()*localPlaneNormal;
+		btScalar planeEqWS=localPlaneEq-planeNormalWS.dot(transA.getOrigin());
+		for (int i=0;i<pVtxIn->size();i++)
+		{
+			
+			btScalar depth = planeNormalWS.dot(pVtxIn->at(i))+planeEqWS;
+			if (depth <=minDist)
+			{
+//				printf("clamped: depth=%f to minDist=%f\n",depth,minDist);
+				depth = minDist;
+			}
+
+			if (depth <=maxDist)
+			{
+				btVector3 point = pVtxIn->at(i);
+#ifdef ONLY_REPORT_DEEPEST_POINT
+				curMaxDist = depth;
+#else
+#if 0
+				if (depth<-3)
+				{
+					printf("error in btPolyhedralContactClipping depth = %f\n", depth);
+					printf("likely wrong separatingNormal passed in\n");
+				} 
+#endif				
+				resultOut.addContactPoint(separatingNormal,point,depth);
+#endif
+			}
+		}
+	}
+#ifdef ONLY_REPORT_DEEPEST_POINT
+	if (curMaxDist<maxDist)
+	{
+		resultOut.addContactPoint(separatingNormal,point,curMaxDist);
+	}
+#endif //ONLY_REPORT_DEEPEST_POINT
+
+}
+
+
+void	btPolyhedralContactClipping::clipHullAgainstHull(const btVector3& separatingNormal1, const btConvexPolyhedron& hullA, const btConvexPolyhedron& hullB, const btTransform& transA,const btTransform& transB, const btScalar minDist, btScalar maxDist,btDiscreteCollisionDetectorInterface::Result& resultOut)
+{
+
+	btVector3 separatingNormal = separatingNormal1.normalized();
+	const btVector3 c0 = transA * hullA.m_localCenter;
+	const btVector3 c1 = transB * hullB.m_localCenter;
+	const btVector3 DeltaC2 = c0 - c1;
+
+
+	int closestFaceB=-1;
+	btScalar dmax = -FLT_MAX;
+	{
+		for(int face=0;face<hullB.m_faces.size();face++)
+		{
+			const btVector3 Normal(hullB.m_faces[face].m_plane[0], hullB.m_faces[face].m_plane[1], hullB.m_faces[face].m_plane[2]);
+			const btVector3 WorldNormal = transB.getBasis() * Normal;
+			btScalar d = WorldNormal.dot(separatingNormal);
+			if (d > dmax)
+			{
+				dmax = d;
+				closestFaceB = face;
+			}
+		}
+	}
+				btVertexArray worldVertsB1;
+				{
+					const btFace& polyB = hullB.m_faces[closestFaceB];
+					const int numVertices = polyB.m_indices.size();
+					for(int e0=0;e0<numVertices;e0++)
+					{
+						const btVector3& b = hullB.m_vertices[polyB.m_indices[e0]];
+						worldVertsB1.push_back(transB*b);
+					}
+				}
+
+	
+	if (closestFaceB>=0)
+		clipFaceAgainstHull(separatingNormal, hullA, transA,worldVertsB1, minDist, maxDist,resultOut);
+
+}
diff --git a/src/BulletCollision/NarrowPhaseCollision/btPolyhedralContactClipping.h b/src/BulletCollision/NarrowPhaseCollision/btPolyhedralContactClipping.h
index 7ab9c1e03..99103df20 100644
--- a/src/BulletCollision/NarrowPhaseCollision/btPolyhedralContactClipping.h
+++ b/src/BulletCollision/NarrowPhaseCollision/btPolyhedralContactClipping.h
@@ -1,46 +1,46 @@
-/*
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2011 Advanced Micro Devices, Inc.  http://bulletphysics.org
-
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-*/
-
-
-///This file was written by Erwin Coumans
-
-
-#ifndef BT_POLYHEDRAL_CONTACT_CLIPPING_H
-#define BT_POLYHEDRAL_CONTACT_CLIPPING_H
-
-
-#include "LinearMath/btAlignedObjectArray.h"
-#include "LinearMath/btTransform.h"
-#include "btDiscreteCollisionDetectorInterface.h"
-
-class btConvexPolyhedron;
-
-typedef btAlignedObjectArray<btVector3> btVertexArray;
-
-// Clips a face to the back of a plane
-struct btPolyhedralContactClipping
-{
-	static void clipHullAgainstHull(const btVector3& separatingNormal, const btConvexPolyhedron& hullA, const btConvexPolyhedron& hullB, const btTransform& transA,const btTransform& transB, const btScalar minDist, btScalar maxDist, btDiscreteCollisionDetectorInterface::Result& resultOut);
-	static void	clipFaceAgainstHull(const btVector3& separatingNormal, const btConvexPolyhedron& hullA,  const btTransform& transA, btVertexArray& worldVertsB1, const btScalar minDist, btScalar maxDist,btDiscreteCollisionDetectorInterface::Result& resultOut);
-
-	static bool findSeparatingAxis(	const btConvexPolyhedron& hullA, const btConvexPolyhedron& hullB, const btTransform& transA,const btTransform& transB, btVector3& sep);
-
-	///the clipFace method is used internally
-	static void clipFace(const btVertexArray& pVtxIn, btVertexArray& ppVtxOut, const btVector3& planeNormalWS,btScalar planeEqWS);
-
-};
-
-#endif // BT_POLYHEDRAL_CONTACT_CLIPPING_H
-
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2011 Advanced Micro Devices, Inc.  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+///This file was written by Erwin Coumans
+
+
+#ifndef BT_POLYHEDRAL_CONTACT_CLIPPING_H
+#define BT_POLYHEDRAL_CONTACT_CLIPPING_H
+
+
+#include "LinearMath/btAlignedObjectArray.h"
+#include "LinearMath/btTransform.h"
+#include "btDiscreteCollisionDetectorInterface.h"
+
+class btConvexPolyhedron;
+
+typedef btAlignedObjectArray<btVector3> btVertexArray;
+
+// Clips a face to the back of a plane
+struct btPolyhedralContactClipping
+{
+	static void clipHullAgainstHull(const btVector3& separatingNormal, const btConvexPolyhedron& hullA, const btConvexPolyhedron& hullB, const btTransform& transA,const btTransform& transB, const btScalar minDist, btScalar maxDist, btDiscreteCollisionDetectorInterface::Result& resultOut);
+	static void	clipFaceAgainstHull(const btVector3& separatingNormal, const btConvexPolyhedron& hullA,  const btTransform& transA, btVertexArray& worldVertsB1, const btScalar minDist, btScalar maxDist,btDiscreteCollisionDetectorInterface::Result& resultOut);
+
+	static bool findSeparatingAxis(	const btConvexPolyhedron& hullA, const btConvexPolyhedron& hullB, const btTransform& transA,const btTransform& transB, btVector3& sep);
+
+	///the clipFace method is used internally
+	static void clipFace(const btVertexArray& pVtxIn, btVertexArray& ppVtxOut, const btVector3& planeNormalWS,btScalar planeEqWS);
+
+};
+
+#endif // BT_POLYHEDRAL_CONTACT_CLIPPING_H
+
diff --git a/src/BulletCollision/NarrowPhaseCollision/btVoronoiSimplexSolver.h b/src/BulletCollision/NarrowPhaseCollision/btVoronoiSimplexSolver.h
index f1c7613ef..2f389e27e 100644
--- a/src/BulletCollision/NarrowPhaseCollision/btVoronoiSimplexSolver.h
+++ b/src/BulletCollision/NarrowPhaseCollision/btVoronoiSimplexSolver.h
@@ -92,13 +92,15 @@ struct	btSubSimplexClosestResult
 /// btVoronoiSimplexSolver is an implementation of the closest point distance algorithm from a 1-4 points simplex to the origin.
 /// Can be used with GJK, as an alternative to Johnson distance algorithm.
 #ifdef NO_VIRTUAL_INTERFACE
-class btVoronoiSimplexSolver
+ATTRIBUTE_ALIGNED16(class) btVoronoiSimplexSolver
 #else
-class btVoronoiSimplexSolver : public btSimplexSolverInterface
+ATTRIBUTE_ALIGNED16(class) btVoronoiSimplexSolver : public btSimplexSolverInterface
 #endif
 {
 public:
 
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+
 	int	m_numVertices;
 
 	btVector3	m_simplexVectorW[VORONOI_SIMPLEX_MAX_VERTS];
diff --git a/src/BulletDynamics/ConstraintSolver/btConeTwistConstraint.h b/src/BulletDynamics/ConstraintSolver/btConeTwistConstraint.h
index 868e62f06..861dea2f0 100644
--- a/src/BulletDynamics/ConstraintSolver/btConeTwistConstraint.h
+++ b/src/BulletDynamics/ConstraintSolver/btConeTwistConstraint.h
@@ -50,7 +50,7 @@ enum btConeTwistFlags
 };
 
 ///btConeTwistConstraint can be used to simulate ragdoll joints (upper arm, leg etc)
-class btConeTwistConstraint : public btTypedConstraint
+ATTRIBUTE_ALIGNED16(class) btConeTwistConstraint : public btTypedConstraint
 {
 #ifdef IN_PARALLELL_SOLVER
 public:
@@ -126,6 +126,8 @@ protected:
 
 public:
 
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+
 	btConeTwistConstraint(btRigidBody& rbA,btRigidBody& rbB,const btTransform& rbAFrame, const btTransform& rbBFrame);
 	
 	btConeTwistConstraint(btRigidBody& rbA,const btTransform& rbAFrame);
diff --git a/src/BulletDynamics/ConstraintSolver/btGeneric6DofConstraint.h b/src/BulletDynamics/ConstraintSolver/btGeneric6DofConstraint.h
index b44108110..0409f9537 100644
--- a/src/BulletDynamics/ConstraintSolver/btGeneric6DofConstraint.h
+++ b/src/BulletDynamics/ConstraintSolver/btGeneric6DofConstraint.h
@@ -268,7 +268,7 @@ This brings support for limit parameters and motors. </li>
 </ul>
 
 */
-class btGeneric6DofConstraint : public btTypedConstraint
+ATTRIBUTE_ALIGNED16(class) btGeneric6DofConstraint : public btTypedConstraint
 {
 protected:
 
@@ -346,6 +346,8 @@ protected:
 
 public:
 
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+	
 	///for backwards compatibility during the transition to 'getInfo/getInfo2'
 	bool		m_useSolveConstraintObsolete;
 
diff --git a/src/BulletDynamics/ConstraintSolver/btGeneric6DofSpringConstraint.h b/src/BulletDynamics/ConstraintSolver/btGeneric6DofSpringConstraint.h
index 31e0cd531..187267f5e 100644
--- a/src/BulletDynamics/ConstraintSolver/btGeneric6DofSpringConstraint.h
+++ b/src/BulletDynamics/ConstraintSolver/btGeneric6DofSpringConstraint.h
@@ -32,7 +32,7 @@ subject to the following restrictions:
 /// 4 : rotation Y (2nd Euler rotational around new position of Y axis, range [-PI/2+epsilon, PI/2-epsilon] )
 /// 5 : rotation Z (1st Euler rotational around Z axis, range [-PI+epsilon, PI-epsilon] )
 
-class btGeneric6DofSpringConstraint : public btGeneric6DofConstraint
+ATTRIBUTE_ALIGNED16(class) btGeneric6DofSpringConstraint : public btGeneric6DofConstraint
 {
 protected:
 	bool		m_springEnabled[6];
@@ -42,6 +42,9 @@ protected:
 	void init();
 	void internalUpdateSprings(btConstraintInfo2* info);
 public: 
+	
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+	
     btGeneric6DofSpringConstraint(btRigidBody& rbA, btRigidBody& rbB, const btTransform& frameInA, const btTransform& frameInB ,bool useLinearReferenceFrameA);
     btGeneric6DofSpringConstraint(btRigidBody& rbB, const btTransform& frameInB, bool useLinearReferenceFrameB);
 	void enableSpring(int index, bool onOff);
diff --git a/src/BulletDynamics/ConstraintSolver/btHinge2Constraint.h b/src/BulletDynamics/ConstraintSolver/btHinge2Constraint.h
index a76452ddb..9a0049869 100644
--- a/src/BulletDynamics/ConstraintSolver/btHinge2Constraint.h
+++ b/src/BulletDynamics/ConstraintSolver/btHinge2Constraint.h
@@ -29,13 +29,15 @@ subject to the following restrictions:
 // 2 rotational degrees of freedom, similar to Euler rotations around Z (axis 1) and X (axis 2)
 // 1 translational (along axis Z) with suspension spring
 
-class btHinge2Constraint : public btGeneric6DofSpringConstraint
+ATTRIBUTE_ALIGNED16(class) btHinge2Constraint : public btGeneric6DofSpringConstraint
 {
 protected:
 	btVector3	m_anchor;
 	btVector3	m_axis1;
 	btVector3	m_axis2;
 public:
+		BT_DECLARE_ALIGNED_ALLOCATOR();
+		
 	// constructor
 	// anchor, axis1 and axis2 are in world coordinate system
 	// axis1 must be orthogonal to axis2
diff --git a/src/BulletDynamics/ConstraintSolver/btHingeConstraint.h b/src/BulletDynamics/ConstraintSolver/btHingeConstraint.h
index cb2973e1d..a7f2cca55 100644
--- a/src/BulletDynamics/ConstraintSolver/btHingeConstraint.h
+++ b/src/BulletDynamics/ConstraintSolver/btHingeConstraint.h
@@ -100,6 +100,8 @@ public:
 	
 public:
 
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+	
 	btHingeConstraint(btRigidBody& rbA,btRigidBody& rbB, const btVector3& pivotInA,const btVector3& pivotInB, const btVector3& axisInA,const btVector3& axisInB, bool useReferenceFrameA = false);
 
 	btHingeConstraint(btRigidBody& rbA,const btVector3& pivotInA,const btVector3& axisInA, bool useReferenceFrameA = false);
diff --git a/src/BulletDynamics/ConstraintSolver/btPoint2PointConstraint.h b/src/BulletDynamics/ConstraintSolver/btPoint2PointConstraint.h
index b3bda03ee..1e13416df 100644
--- a/src/BulletDynamics/ConstraintSolver/btPoint2PointConstraint.h
+++ b/src/BulletDynamics/ConstraintSolver/btPoint2PointConstraint.h
@@ -67,6 +67,8 @@ public:
 	
 public:
 
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+
 	///for backwards compatibility during the transition to 'getInfo/getInfo2'
 	bool		m_useSolveConstraintObsolete;
 
diff --git a/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.cpp b/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.cpp
index ab0742240..17cf92d5d 100644
--- a/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.cpp
+++ b/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.cpp
@@ -800,7 +800,7 @@ btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlySetup(btCol
 			int totalNumRows = 0;
 			int i;
 			
-			m_tmpConstraintSizesPool.resize(numConstraints);
+			m_tmpConstraintSizesPool.resizeNoInitialize(numConstraints);
 			//calculate the total number of contraint rows
 			for (i=0;i<numConstraints;i++)
 			{
@@ -815,7 +815,7 @@ btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlySetup(btCol
 				}
 				totalNumRows += info1.m_numConstraintRows;
 			}
-			m_tmpSolverNonContactConstraintPool.resize(totalNumRows);
+			m_tmpSolverNonContactConstraintPool.resizeNoInitialize(totalNumRows);
 
 			
 			///setup the btSolverConstraints
@@ -966,9 +966,9 @@ btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlySetup(btCol
 	int numFrictionPool = m_tmpSolverContactFrictionConstraintPool.size();
 
 	///@todo: use stack allocator for such temporarily memory, same for solver bodies/constraints
-	m_orderNonContactConstraintPool.resize(numNonContactPool);
-	m_orderTmpConstraintPool.resize(numConstraintPool);
-	m_orderFrictionConstraintPool.resize(numFrictionPool);
+	m_orderNonContactConstraintPool.resizeNoInitialize(numNonContactPool);
+	m_orderTmpConstraintPool.resizeNoInitialize(numConstraintPool);
+	m_orderFrictionConstraintPool.resizeNoInitialize(numFrictionPool);
 	{
 		int i;
 		for (i=0;i<numNonContactPool;i++)
@@ -1228,9 +1228,9 @@ btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlyFinish(btCo
 	}
 
 
-	m_tmpSolverContactConstraintPool.resize(0);
-	m_tmpSolverNonContactConstraintPool.resize(0);
-	m_tmpSolverContactFrictionConstraintPool.resize(0);
+	m_tmpSolverContactConstraintPool.resizeNoInitialize(0);
+	m_tmpSolverNonContactConstraintPool.resizeNoInitialize(0);
+	m_tmpSolverContactFrictionConstraintPool.resizeNoInitialize(0);
 
 	return 0.f;
 }
diff --git a/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.h b/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.h
index bb377db8d..94dbe27ff 100644
--- a/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.h
+++ b/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.h
@@ -25,7 +25,7 @@ class btIDebugDraw;
 #include "BulletCollision/NarrowPhaseCollision/btManifoldPoint.h"
 
 ///The btSequentialImpulseConstraintSolver is a fast SIMD implementation of the Projected Gauss Seidel (iterative LCP) method.
-class btSequentialImpulseConstraintSolver : public btConstraintSolver
+ATTRIBUTE_ALIGNED16(class) btSequentialImpulseConstraintSolver : public btConstraintSolver
 {
 protected:
 
@@ -95,6 +95,7 @@ protected:
 
 public:
 
+	BT_DECLARE_ALIGNED_ALLOCATOR();
 	
 	btSequentialImpulseConstraintSolver();
 	virtual ~btSequentialImpulseConstraintSolver();
diff --git a/src/BulletDynamics/ConstraintSolver/btSliderConstraint.h b/src/BulletDynamics/ConstraintSolver/btSliderConstraint.h
index 2edc8d2b2..ca8e715bc 100755
--- a/src/BulletDynamics/ConstraintSolver/btSliderConstraint.h
+++ b/src/BulletDynamics/ConstraintSolver/btSliderConstraint.h
@@ -60,7 +60,7 @@ enum btSliderFlags
 };
 
 
-class btSliderConstraint : public btTypedConstraint
+ATTRIBUTE_ALIGNED16(class) btSliderConstraint : public btTypedConstraint
 {
 protected:
 	///for backwards compatibility during the transition to 'getInfo/getInfo2'
@@ -155,6 +155,8 @@ protected:
 	//------------------------    
 	void initParams();
 public:
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+	
 	// constructors
     btSliderConstraint(btRigidBody& rbA, btRigidBody& rbB, const btTransform& frameInA, const btTransform& frameInB ,bool useLinearReferenceFrameA);
     btSliderConstraint(btRigidBody& rbB, const btTransform& frameInB, bool useLinearReferenceFrameA);
diff --git a/src/BulletDynamics/ConstraintSolver/btTypedConstraint.h b/src/BulletDynamics/ConstraintSolver/btTypedConstraint.h
index a16e869a9..1445b2416 100644
--- a/src/BulletDynamics/ConstraintSolver/btTypedConstraint.h
+++ b/src/BulletDynamics/ConstraintSolver/btTypedConstraint.h
@@ -52,7 +52,7 @@ enum btConstraintParams
 
 
 ///TypedConstraint is the baseclass for Bullet constraints and vehicles
-class btTypedConstraint : public btTypedObject
+ATTRIBUTE_ALIGNED16(class) btTypedConstraint : public btTypedObject
 {
 	int	m_userConstraintType;
 
@@ -87,6 +87,8 @@ protected:
 
 public:
 
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+
 	virtual ~btTypedConstraint() {};
 	btTypedConstraint(btTypedConstraintType type, btRigidBody& rbA);
 	btTypedConstraint(btTypedConstraintType type, btRigidBody& rbA,btRigidBody& rbB);
diff --git a/src/BulletDynamics/ConstraintSolver/btUniversalConstraint.h b/src/BulletDynamics/ConstraintSolver/btUniversalConstraint.h
index a86939164..9e7084104 100644
--- a/src/BulletDynamics/ConstraintSolver/btUniversalConstraint.h
+++ b/src/BulletDynamics/ConstraintSolver/btUniversalConstraint.h
@@ -31,13 +31,16 @@ subject to the following restrictions:
 /// "Given axis 1 on body 1, and axis 2 on body 2 that is perpendicular to axis 1, it keeps them perpendicular. 
 /// In other words, rotation of the two bodies about the direction perpendicular to the two axes will be equal."
 
-class btUniversalConstraint : public btGeneric6DofConstraint
+ATTRIBUTE_ALIGNED16(class) btUniversalConstraint : public btGeneric6DofConstraint
 {
 protected:
 	btVector3	m_anchor;
 	btVector3	m_axis1;
 	btVector3	m_axis2;
 public:
+	
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+	
 	// constructor
 	// anchor, axis1 and axis2 are in world coordinate system
 	// axis1 must be orthogonal to axis2
diff --git a/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorld.cpp b/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorld.cpp
index 954ef241a..83cd9a275 100644
--- a/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorld.cpp
+++ b/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorld.cpp
@@ -947,7 +947,8 @@ void	btDiscreteDynamicsWorld::integrateTransforms(btScalar timeStep)
 						//response  between two dynamic objects without friction, assuming 0 penetration depth
 						btScalar appliedImpulse = 0.f;
 						btScalar depth = 0.f;
-						appliedImpulse = resolveSingleCollision(body,sweepResults.m_hitCollisionObject,sweepResults.m_hitPointWorld,sweepResults.m_hitNormalWorld,getSolverInfo(), depth);
+
+						appliedImpulse = resolveSingleCollision(body,(btCollisionObject*)sweepResults.m_hitCollisionObject,sweepResults.m_hitPointWorld,sweepResults.m_hitNormalWorld,getSolverInfo(), depth);
 						
 
 #endif
diff --git a/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorld.h b/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorld.h
index 23a38dd2a..15fb0d95f 100644
--- a/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorld.h
+++ b/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorld.h
@@ -34,7 +34,7 @@ struct InplaceSolverIslandCallback;
 
 ///btDiscreteDynamicsWorld provides discrete rigid body simulation
 ///those classes replace the obsolete CcdPhysicsEnvironment/CcdPhysicsController
-class btDiscreteDynamicsWorld : public btDynamicsWorld
+ATTRIBUTE_ALIGNED16(class) btDiscreteDynamicsWorld : public btDynamicsWorld
 {
 protected:
 	
@@ -87,6 +87,8 @@ protected:
 public:
 
 
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+
 	///this btDiscreteDynamicsWorld constructor gets created objects from the user, and will not delete those
 	btDiscreteDynamicsWorld(btDispatcher* dispatcher,btBroadphaseInterface* pairCache,btConstraintSolver* constraintSolver,btCollisionConfiguration* collisionConfiguration);
 
diff --git a/src/BulletDynamics/Dynamics/btRigidBody.cpp b/src/BulletDynamics/Dynamics/btRigidBody.cpp
index 911b50723..33efd4717 100644
--- a/src/BulletDynamics/Dynamics/btRigidBody.cpp
+++ b/src/BulletDynamics/Dynamics/btRigidBody.cpp
@@ -300,15 +300,15 @@ void btRigidBody::setCenterOfMassTransform(const btTransform& xform)
 }
 
 
-bool btRigidBody::checkCollideWithOverride(btCollisionObject* co)
+bool btRigidBody::checkCollideWithOverride(const  btCollisionObject* co) const
 {
-	btRigidBody* otherRb = btRigidBody::upcast(co);
+	const btRigidBody* otherRb = btRigidBody::upcast(co);
 	if (!otherRb)
 		return true;
 
 	for (int i = 0; i < m_constraintRefs.size(); ++i)
 	{
-		btTypedConstraint* c = m_constraintRefs[i];
+		const btTypedConstraint* c = m_constraintRefs[i];
 		if (c->isEnabled())
 			if (&c->getRigidBodyA() == otherRb || &c->getRigidBodyB() == otherRb)
 				return false;
diff --git a/src/BulletDynamics/Dynamics/btRigidBody.h b/src/BulletDynamics/Dynamics/btRigidBody.h
index 7c121e6df..e8927578d 100644
--- a/src/BulletDynamics/Dynamics/btRigidBody.h
+++ b/src/BulletDynamics/Dynamics/btRigidBody.h
@@ -494,7 +494,7 @@ public:
 		return (getBroadphaseProxy() != 0);
 	}
 
-	virtual bool checkCollideWithOverride(btCollisionObject* co);
+	virtual bool checkCollideWithOverride(const  btCollisionObject* co) const;
 
 	void addConstraintRef(btTypedConstraint* c);
 	void removeConstraintRef(btTypedConstraint* c);
diff --git a/src/BulletDynamics/Vehicle/btRaycastVehicle.cpp b/src/BulletDynamics/Vehicle/btRaycastVehicle.cpp
index 5b467883d..77b475b96 100644
--- a/src/BulletDynamics/Vehicle/btRaycastVehicle.cpp
+++ b/src/BulletDynamics/Vehicle/btRaycastVehicle.cpp
@@ -756,14 +756,14 @@ void* btDefaultVehicleRaycaster::castRay(const btVector3& from,const btVector3&
 	if (rayCallback.hasHit())
 	{
 		
-		btRigidBody* body = btRigidBody::upcast(rayCallback.m_collisionObject);
+		const btRigidBody* body = btRigidBody::upcast(rayCallback.m_collisionObject);
         if (body && body->hasContactResponse())
 		{
 			result.m_hitPointInWorld = rayCallback.m_hitPointWorld;
 			result.m_hitNormalInWorld = rayCallback.m_hitNormalWorld;
 			result.m_hitNormalInWorld.normalize();
 			result.m_distFraction = rayCallback.m_closestHitFraction;
-			return body;
+			return (void*)body;
 		}
 	}
 	return 0;
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11.cpp b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11.cpp
index b66722bdc..d6d67399a 100644
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11.cpp
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11.cpp
@@ -48,7 +48,7 @@ static char* ComputeBoundsHLSLString =
 #include "HLSL/ComputeBounds.hlsl"
 static char* SolveCollisionsAndUpdateVelocitiesHLSLString =
 #include "HLSL/SolveCollisionsAndUpdateVelocities.hlsl"
-
+#include "BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h"
 
 btSoftBodyLinkDataDX11::btSoftBodyLinkDataDX11( ID3D11Device *d3dDevice, ID3D11DeviceContext *d3dDeviceContext ) : 
 		m_dx11Links( d3dDevice, d3dDeviceContext, &m_links, false ),
@@ -2162,14 +2162,14 @@ void btDX11SoftBodySolver::processCollision( btSoftBody*, btSoftBody* )
 }
 
 // Add the collision object to the set to deal with for a particular soft body
-void btDX11SoftBodySolver::processCollision( btSoftBody *softBody, btCollisionObject* collisionObject )
+void btDX11SoftBodySolver::processCollision( btSoftBody *softBody, const btCollisionObjectWrapper* collisionObject )
 {
 	int softBodyIndex = findSoftBodyIndex( softBody );
 
 	if( softBodyIndex >= 0 )
 	{
-		btCollisionShape *collisionShape = collisionObject->getCollisionShape();
-		float friction = collisionObject->getFriction();
+		const btCollisionShape *collisionShape = collisionObject->getCollisionShape();
+		float friction = collisionObject->getCollisionObject()->getFriction();
 		int shapeType = collisionShape->getShapeType();
 		if( shapeType == CAPSULE_SHAPE_PROXYTYPE )
 		{
@@ -2179,12 +2179,12 @@ void btDX11SoftBodySolver::processCollision( btSoftBody *softBody, btCollisionOb
 			newCollisionShapeDescription.collisionShapeType = shapeType;
 			// TODO: May need to transpose this matrix either here or in HLSL
 			newCollisionShapeDescription.shapeTransform = toTransform3(collisionObject->getWorldTransform());
-			btCapsuleShape *capsule = static_cast<btCapsuleShape*>( collisionShape );
+			const btCapsuleShape *capsule = static_cast<const btCapsuleShape*>( collisionShape );
 			newCollisionShapeDescription.radius = capsule->getRadius();
 			newCollisionShapeDescription.halfHeight = capsule->getHalfHeight();
 			newCollisionShapeDescription.margin = capsule->getMargin();
 			newCollisionShapeDescription.friction = friction;
-			btRigidBody* body = static_cast< btRigidBody* >( collisionObject );
+			const btRigidBody* body = static_cast< const btRigidBody* >( collisionObject->getCollisionObject() );
 			newCollisionShapeDescription.linearVelocity = toVector3(body->getLinearVelocity());
 			newCollisionShapeDescription.angularVelocity = toVector3(body->getAngularVelocity());
 			m_collisionObjectDetails.push_back( newCollisionShapeDescription );
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11.h b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11.h
index 62e629bed..f2a6c73f9 100644
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11.h
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11.h
@@ -614,7 +614,7 @@ public:
 	virtual void predictMotion( float solverdt );
 
 	
-	virtual void processCollision( btSoftBody *, btCollisionObject* );
+	virtual void processCollision( btSoftBody *, const btCollisionObjectWrapper* );
 
 	virtual void processCollision( btSoftBody*, btSoftBody* );
 
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCL.cpp b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCL.cpp
index 0a5ca008b..33b24da03 100644
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCL.cpp
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCL.cpp
@@ -25,7 +25,7 @@ subject to the following restrictions:
 #include "BulletCollision/CollisionShapes/btSphereShape.h"
 #include "LinearMath/btQuickprof.h"
 #include <limits.h>
-
+#include "BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h"
 
 #define BT_SUPPRESS_OPENCL_ASSERTS
 
@@ -770,7 +770,7 @@ void btOpenCLSoftBodySolver::optimize( btAlignedObjectArray< btSoftBody * > &sof
 				desc.setInverseMass(vertexInverseMass);
 				getVertexData().setVertexAt( desc, firstVertex + vertex );
 
-				m_anchorIndex.push_back(-1.0);
+				m_anchorIndex.push_back(-1);
 			}
 
 			// Copy triangles similarly
@@ -1707,14 +1707,14 @@ void btOpenCLSoftBodySolver::processCollision( btSoftBody*, btSoftBody* )
 }
 
 // Add the collision object to the set to deal with for a particular soft body
-void btOpenCLSoftBodySolver::processCollision( btSoftBody *softBody, btCollisionObject* collisionObject )
+void btOpenCLSoftBodySolver::processCollision( btSoftBody *softBody, const btCollisionObjectWrapper* collisionObject )
 {
  	int softBodyIndex = findSoftBodyIndex( softBody );
 
 	if( softBodyIndex >= 0 )
 	{
-		btCollisionShape *collisionShape = collisionObject->getCollisionShape();
-		float friction = collisionObject->getFriction();
+		const btCollisionShape *collisionShape = collisionObject->getCollisionShape();
+		float friction = collisionObject->getCollisionObject()->getFriction();
 		int shapeType = collisionShape->getShapeType();
 		if( shapeType == CAPSULE_SHAPE_PROXYTYPE )
 		{
@@ -1724,13 +1724,13 @@ void btOpenCLSoftBodySolver::processCollision( btSoftBody *softBody, btCollision
 			newCollisionShapeDescription.collisionShapeType = shapeType;
 			// TODO: May need to transpose this matrix either here or in HLSL
 			newCollisionShapeDescription.shapeTransform = toTransform3(collisionObject->getWorldTransform());
-			btCapsuleShape *capsule = static_cast<btCapsuleShape*>( collisionShape );
+			const btCapsuleShape *capsule = static_cast<const btCapsuleShape*>( collisionShape );
 			newCollisionShapeDescription.radius = capsule->getRadius();
 			newCollisionShapeDescription.halfHeight = capsule->getHalfHeight();
 			newCollisionShapeDescription.margin = capsule->getMargin();
 			newCollisionShapeDescription.upAxis = capsule->getUpAxis();
 			newCollisionShapeDescription.friction = friction;
-			btRigidBody* body = static_cast< btRigidBody* >( collisionObject );
+			const btRigidBody* body = static_cast< const btRigidBody* >( collisionObject->getCollisionObject() );
 			newCollisionShapeDescription.linearVelocity = toVector3(body->getLinearVelocity());
 			newCollisionShapeDescription.angularVelocity = toVector3(body->getAngularVelocity());
 			m_collisionObjectDetails.push_back( newCollisionShapeDescription );
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCL.h b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCL.h
index 4c9c8e90e..dfcfe8cec 100644
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCL.h
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCL.h
@@ -481,7 +481,7 @@ public:
 
 	virtual void predictMotion( float solverdt );
 
-	virtual void processCollision( btSoftBody *, btCollisionObject* );
+	virtual void processCollision( btSoftBody *, const btCollisionObjectWrapper* );
 
 	virtual void processCollision( btSoftBody*, btSoftBody* );
 
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCLSIMDAware.cpp b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCLSIMDAware.cpp
index 2216768a9..32adca146 100644
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCLSIMDAware.cpp
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCLSIMDAware.cpp
@@ -251,7 +251,7 @@ void btOpenCLSoftBodySolverSIMDAware::optimize( btAlignedObjectArray< btSoftBody
 				desc.setInverseMass(vertexInverseMass);
 				getVertexData().setVertexAt( desc, firstVertex + vertex );
 
-				m_anchorIndex.push_back(-1.0);
+				m_anchorIndex.push_back(-1);
 			}
 			for( int vertex = numVertices; vertex < maxVertices; ++vertex )
 			{
diff --git a/src/BulletMultiThreaded/SpuContactManifoldCollisionAlgorithm.cpp b/src/BulletMultiThreaded/SpuContactManifoldCollisionAlgorithm.cpp
index 286b63191..b27988cf3 100644
--- a/src/BulletMultiThreaded/SpuContactManifoldCollisionAlgorithm.cpp
+++ b/src/BulletMultiThreaded/SpuContactManifoldCollisionAlgorithm.cpp
@@ -22,7 +22,7 @@ subject to the following restrictions:
 
 
 
-void SpuContactManifoldCollisionAlgorithm::processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
+void SpuContactManifoldCollisionAlgorithm::processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
 {
 	btAssert(0);
 }
diff --git a/src/BulletMultiThreaded/SpuContactManifoldCollisionAlgorithm.h b/src/BulletMultiThreaded/SpuContactManifoldCollisionAlgorithm.h
index d28d4db31..083e15f4f 100644
--- a/src/BulletMultiThreaded/SpuContactManifoldCollisionAlgorithm.h
+++ b/src/BulletMultiThreaded/SpuContactManifoldCollisionAlgorithm.h
@@ -45,7 +45,7 @@ ATTRIBUTE_ALIGNED16(class) SpuContactManifoldCollisionAlgorithm : public btColli
 	
 public:
 	
-	virtual void processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
+	virtual void processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 
 	virtual btScalar calculateTimeOfImpact(btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 
diff --git a/src/BulletMultiThreaded/SpuGatheringCollisionDispatcher.cpp b/src/BulletMultiThreaded/SpuGatheringCollisionDispatcher.cpp
index 1a76be082..b9e88a07f 100644
--- a/src/BulletMultiThreaded/SpuGatheringCollisionDispatcher.cpp
+++ b/src/BulletMultiThreaded/SpuGatheringCollisionDispatcher.cpp
@@ -24,7 +24,7 @@ subject to the following restrictions:
 #include "BulletCollision/CollisionShapes/btCollisionShape.h"
 #include "LinearMath/btQuickprof.h"
 #include "BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuCollisionShapes.h"
-
+#include "BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h"
 
 
 
@@ -166,7 +166,10 @@ public:
 						collisionPair.m_internalTmpValue =  2;
 					} else
 					{
-						collisionPair.m_algorithm = m_dispatcher->findAlgorithm(colObj0,colObj1);
+						btCollisionObjectWrapper ob0(0,colObj0->getCollisionShape(),colObj0,colObj0->getWorldTransform());
+						btCollisionObjectWrapper ob1(0,colObj1->getCollisionShape(),colObj1,colObj1->getWorldTransform());
+
+						collisionPair.m_algorithm = m_dispatcher->findAlgorithm(&ob0,&ob1);
 						collisionPair.m_internalTmpValue = 3;
 					}
 				} 
@@ -241,12 +244,16 @@ void	SpuGatheringCollisionDispatcher::dispatchAllCollisionPairs(btOverlappingPai
 	
 							if (dispatcher->needsCollision(colObj0,colObj1))
 							{
-								btManifoldResult contactPointResult(colObj0,colObj1);
+							//discrete collision detection query
+								btCollisionObjectWrapper ob0(0,colObj0->getCollisionShape(),colObj0,colObj0->getWorldTransform());
+								btCollisionObjectWrapper ob1(0,colObj1->getCollisionShape(),colObj1,colObj1->getWorldTransform());
+
+								btManifoldResult contactPointResult(&ob0,&ob1);
 								
 								if (dispatchInfo.m_dispatchFunc == 		btDispatcherInfo::DISPATCH_DISCRETE)
 								{
-									//discrete collision detection query
-									collisionPair.m_algorithm->processCollision(colObj0,colObj1,dispatchInfo,&contactPointResult);
+									
+									collisionPair.m_algorithm->processCollision(&ob0,&ob1,dispatchInfo,&contactPointResult);
 								} else
 								{
 									//continuous collision detection query, time of impact (toi)
diff --git a/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuCollisionShapes.cpp b/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuCollisionShapes.cpp
index dfcd84266..8d755b223 100644
--- a/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuCollisionShapes.cpp
+++ b/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuCollisionShapes.cpp
@@ -44,7 +44,7 @@ void computeAabb (btVector3& aabbMin, btVector3& aabbMax, btConvexInternalShape*
 		const btTransform& t = xform;
 		btMatrix3x3 abs_b = t.getBasis().absolute();  
 		btVector3 center = t.getOrigin();
-		btVector3 extent = btVector3(abs_b[0].dot(halfExtents),abs_b[1].dot(halfExtents),abs_b[2].dot(halfExtents));
+        btVector3 extent = halfExtents.dot3( abs_b[0], abs_b[1], abs_b[2] );
 		
 		aabbMin = center - extent;
 		aabbMax = center + extent;
@@ -67,7 +67,7 @@ void computeAabb (btVector3& aabbMin, btVector3& aabbMax, btConvexInternalShape*
 		const btTransform& t = xform;
 		btMatrix3x3 abs_b = t.getBasis().absolute();  
 		btVector3 center = t.getOrigin();
-		btVector3 extent = btVector3(abs_b[0].dot(halfExtents),abs_b[1].dot(halfExtents),abs_b[2].dot(halfExtents));
+        btVector3 extent = halfExtents.dot3( abs_b[0], abs_b[1], abs_b[2] );
 		
 		aabbMin = center - extent;
 		aabbMax = center + extent;
diff --git a/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.cpp b/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.cpp
index 42f5f45c0..e30e359f0 100644
--- a/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.cpp
+++ b/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.cpp
@@ -1364,8 +1364,8 @@ void	processCollisionTask(void* userPtr, void* lsMemPtr)
 											)
 										{
 											handleCollisionPair(collisionPairInput, lsMem, spuContacts,
-												(ppu_address_t)lsMem.getColObj0()->getRootCollisionShape(), &lsMem.gCollisionShapes[0].collisionShape,
-												(ppu_address_t)lsMem.getColObj1()->getRootCollisionShape(), &lsMem.gCollisionShapes[1].collisionShape);
+												(ppu_address_t)lsMem.getColObj0()->getCollisionShape(), &lsMem.gCollisionShapes[0].collisionShape,
+												(ppu_address_t)lsMem.getColObj1()->getCollisionShape(), &lsMem.gCollisionShapes[1].collisionShape);
 										} else
 										{
 												//spu_printf("boxbox dist = %f\n",distance);
diff --git a/src/BulletMultiThreaded/TrbDynBody.h b/src/BulletMultiThreaded/TrbDynBody.h
index 4d14212c3..a7f4bf1b3 100644
--- a/src/BulletMultiThreaded/TrbDynBody.h
+++ b/src/BulletMultiThreaded/TrbDynBody.h
@@ -1,79 +1,79 @@
-/*
-   Copyright (C) 2009 Sony Computer Entertainment Inc.
-   All rights reserved.
-
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-
-*/
-
-#ifndef BT_RB_DYN_BODY_H__
-#define BT_RB_DYN_BODY_H__
-
-#include "vectormath/vmInclude.h"
-using namespace Vectormath::Aos;
-
-#include "TrbStateVec.h"
-
-class CollObject;
-
-class TrbDynBody
-{
-public:
-	TrbDynBody()
-	{
-		fMass   = 0.0f;
-		fCollObject = NULL;
-		fElasticity = 0.2f;
-		fFriction = 0.8f;
-	}
-
-	// Get methods
-	float          getMass() const {return fMass;};
-	float          getElasticity() const {return fElasticity;}
-	float          getFriction() const {return fFriction;}
-	CollObject*    getCollObject() const {return fCollObject;}
-	const Matrix3 &getBodyInertia() const {return fIBody;}
-	const Matrix3 &getBodyInertiaInv() const {return fIBodyInv;}
-	float          getMassInv() const {return fMassInv;}
-
-	// Set methods
-	void           setMass(float mass) {fMass=mass;fMassInv=mass>0.0f?1.0f/mass:0.0f;}
-	void           setBodyInertia(const Matrix3 bodyInertia) {fIBody = bodyInertia;fIBodyInv = inverse(bodyInertia);}
-	void           setElasticity(float elasticity) {fElasticity = elasticity;}
-	void           setFriction(float friction) {fFriction = friction;}
-	void           setCollObject(CollObject *collObj) {fCollObject = collObj;}
-	
-	void           setBodyInertiaInv(const Matrix3 bodyInertiaInv) 
-	{
-		fIBody = inverse(bodyInertiaInv);
-		fIBodyInv = bodyInertiaInv;
-	}
-	void           setMassInv(float invMass) {
-		fMass= invMass>0.0f ? 1.0f/invMass :0.0f;
-		fMassInv=invMass;
-	}
-
-
-private:
-	// Rigid Body constants
-	float          fMass;        // Rigid Body mass
-	float          fMassInv;     // Inverse of mass
-	Matrix3        fIBody;       // Inertia matrix in body's coords
-	Matrix3        fIBodyInv;    // Inertia matrix inverse in body's coords
-	float          fElasticity;  // Coefficient of restitution
-	float          fFriction;    // Coefficient of friction
-
-public:
-	CollObject*    fCollObject;  // Collision object corresponding the RB
-} __attribute__ ((aligned(16)));
-
-#endif //BT_RB_DYN_BODY_H__
-
+/*
+   Copyright (C) 2009 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+
+*/
+
+#ifndef BT_RB_DYN_BODY_H__
+#define BT_RB_DYN_BODY_H__
+
+#include "vectormath/vmInclude.h"
+using namespace Vectormath::Aos;
+
+#include "TrbStateVec.h"
+
+class CollObject;
+
+class TrbDynBody
+{
+public:
+	TrbDynBody()
+	{
+		fMass   = 0.0f;
+		fCollObject = NULL;
+		fElasticity = 0.2f;
+		fFriction = 0.8f;
+	}
+
+	// Get methods
+	float          getMass() const {return fMass;};
+	float          getElasticity() const {return fElasticity;}
+	float          getFriction() const {return fFriction;}
+	CollObject*    getCollObject() const {return fCollObject;}
+	const Matrix3 &getBodyInertia() const {return fIBody;}
+	const Matrix3 &getBodyInertiaInv() const {return fIBodyInv;}
+	float          getMassInv() const {return fMassInv;}
+
+	// Set methods
+	void           setMass(float mass) {fMass=mass;fMassInv=mass>0.0f?1.0f/mass:0.0f;}
+	void           setBodyInertia(const Matrix3 bodyInertia) {fIBody = bodyInertia;fIBodyInv = inverse(bodyInertia);}
+	void           setElasticity(float elasticity) {fElasticity = elasticity;}
+	void           setFriction(float friction) {fFriction = friction;}
+	void           setCollObject(CollObject *collObj) {fCollObject = collObj;}
+	
+	void           setBodyInertiaInv(const Matrix3 bodyInertiaInv) 
+	{
+		fIBody = inverse(bodyInertiaInv);
+		fIBodyInv = bodyInertiaInv;
+	}
+	void           setMassInv(float invMass) {
+		fMass= invMass>0.0f ? 1.0f/invMass :0.0f;
+		fMassInv=invMass;
+	}
+
+
+private:
+	// Rigid Body constants
+	float          fMass;        // Rigid Body mass
+	float          fMassInv;     // Inverse of mass
+	Matrix3        fIBody;       // Inertia matrix in body's coords
+	Matrix3        fIBodyInv;    // Inertia matrix inverse in body's coords
+	float          fElasticity;  // Coefficient of restitution
+	float          fFriction;    // Coefficient of friction
+
+public:
+	CollObject*    fCollObject;  // Collision object corresponding the RB
+} __attribute__ ((aligned(16)));
+
+#endif //BT_RB_DYN_BODY_H__
+
diff --git a/src/BulletMultiThreaded/TrbStateVec.h b/src/BulletMultiThreaded/TrbStateVec.h
index 46d1904aa..b6d895e12 100644
--- a/src/BulletMultiThreaded/TrbStateVec.h
+++ b/src/BulletMultiThreaded/TrbStateVec.h
@@ -1,339 +1,339 @@
-/*
-   Copyright (C) 2009 Sony Computer Entertainment Inc.
-   All rights reserved.
-
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-
-*/
-
-#ifndef BT_TRBSTATEVEC_H__
-#define BT_TRBSTATEVEC_H__
-
-#include <stdlib.h>
-#ifdef PFX_USE_FREE_VECTORMATH
-#include "vecmath/vmInclude.h"
-#else
-#include "vectormath/vmInclude.h"
-#endif //PFX_USE_FREE_VECTORMATH
-
-
-#include "PlatformDefinitions.h"
-
-
-static inline vmVector3 read_Vector3(const float* p)
-{
-	vmVector3 v;
-	loadXYZ(v, p);
-	return v;
-}
-
-static inline vmQuat read_Quat(const float* p)
-{
-	vmQuat vq;
-	loadXYZW(vq, p);
-	return vq;
-}
-
-static inline void store_Vector3(const vmVector3 &src, float* p)
-{
-	vmVector3 v = src;
-	storeXYZ(v, p);
-}
-
-static inline void store_Quat(const vmQuat &src, float* p)
-{
-	vmQuat vq = src;
-	storeXYZW(vq, p);
-}
-
-// Motion Type
-enum {
-	PfxMotionTypeFixed = 0,
-	PfxMotionTypeActive,
-	PfxMotionTypeKeyframe,
-	PfxMotionTypeOneWay,
-	PfxMotionTypeTrigger,
-	PfxMotionTypeCount
-};
-
-#define PFX_MOTION_MASK_DYNAMIC 0x0a // Active,OneWay
-#define PFX_MOTION_MASK_STATIC  0x95 // Fixed,Keyframe,Trigger,Sleeping
-#define PFX_MOTION_MASK_SLEEP   0x0e // Can sleep
-#define PFX_MOTION_MASK_TYPE    0x7f
-
-//
-// Rigid Body state
-//
-
-#ifdef __CELLOS_LV2__
-ATTRIBUTE_ALIGNED128(class) TrbState
-#else
-ATTRIBUTE_ALIGNED16(class) TrbState
-#endif
-
-{
-public:
-	TrbState()
-	{
-		setMotionType(PfxMotionTypeActive);
-		contactFilterSelf=contactFilterTarget=0xffffffff;
-		deleted = 0;
-		mSleeping = 0;
-		useSleep = 1;
-		trbBodyIdx=0;
-		mSleepCount=0;
-		useCcd = 0;
-		useContactCallback = 0;
-		useSleepCallback = 0;
-		linearDamping = 1.0f;
-		angularDamping = 0.99f;
-	}
-
-	TrbState(const uint8_t m, const vmVector3& x, const vmQuat& q, const vmVector3& v, const vmVector3& omega );
-	
-	uint16_t	mSleepCount;
-	uint8_t		mMotionType;
-	uint8_t		deleted            : 1;
-	uint8_t		mSleeping           : 1;
-	uint8_t		useSleep           : 1;
-	uint8_t		useCcd		       : 1;
-	uint8_t		useContactCallback : 1;
-	uint8_t		useSleepCallback   : 1;
-
-	uint16_t	trbBodyIdx;
-	uint32_t	contactFilterSelf;
-	uint32_t	contactFilterTarget;
-
-	float		center[3];		// AABB center(World)
-	float		half[3];		// AABB half(World)
-
-	float		linearDamping;
-	float		angularDamping;
-	
-	float		deltaLinearVelocity[3];
-	float		deltaAngularVelocity[3];
-
-	float     fX[3];				// position
-	float     fQ[4];				// orientation
-	float     fV[3];				// velocity
-	float     fOmega[3];			// angular velocity
-
-	inline void setZero();      // Zeroes out the elements
-	inline void setIdentity();  // Sets the rotation to identity and zeroes out the other elements
-
-	bool		isDeleted() const {return deleted==1;}
-
-	uint16_t	getRigidBodyId() const {return trbBodyIdx;}
-	void		setRigidBodyId(uint16_t i) {trbBodyIdx = i;}
-
-
-	uint32_t	getContactFilterSelf() const {return contactFilterSelf;}
-	void		setContactFilterSelf(uint32_t filter) {contactFilterSelf = filter;}
-
-	uint32_t	getContactFilterTarget() const {return contactFilterTarget;}
-	void		setContactFilterTarget(uint32_t filter) {contactFilterTarget = filter;}
-
-	float getLinearDamping() const {return linearDamping;}
-	float getAngularDamping() const {return angularDamping;}
-
-	void setLinearDamping(float damping) {linearDamping=damping;}
-	void setAngularDamping(float damping) {angularDamping=damping;}
-
-
-	uint8_t		getMotionType() const {return mMotionType;}
-	void		setMotionType(uint8_t t) {mMotionType = t;mSleeping=0;mSleepCount=0;}
-
-	uint8_t		getMotionMask() const {return (1<<mMotionType)|(mSleeping<<7);}
-
-	bool		isAsleep() const {return mSleeping==1;}
-	bool		isAwake() const {return mSleeping==0;}
-
-	void		wakeup() {mSleeping=0;mSleepCount=0;}
-	void		sleep() {if(useSleep) {mSleeping=1;mSleepCount=0;}}
-
-	uint8_t		getUseSleep() const {return useSleep;}
-	void		setUseSleep(uint8_t b) {useSleep=b;}
-
-	uint8_t		getUseCcd() const {return useCcd;}
-	void		setUseCcd(uint8_t b) {useCcd=b;}
-
-	uint8_t		getUseContactCallback() const {return useContactCallback;}
-	void		setUseContactCallback(uint8_t b) {useContactCallback=b;}
-
-	uint8_t		getUseSleepCallback() const {return useSleepCallback;}
-	void		setUseSleepCallback(uint8_t b) {useSleepCallback=b;}
-
-	void	 	incrementSleepCount() {mSleepCount++;}
-	void		resetSleepCount() {mSleepCount=0;}
-	uint16_t	getSleepCount() const {return mSleepCount;}
-
-	vmVector3 getPosition() const {return read_Vector3(fX);}
-	vmQuat    getOrientation() const {return read_Quat(fQ);}
-	vmVector3 getLinearVelocity() const {return read_Vector3(fV);}
-	vmVector3 getAngularVelocity() const {return read_Vector3(fOmega);}
-	vmVector3 getDeltaLinearVelocity() const {return read_Vector3(deltaLinearVelocity);}
-	vmVector3 getDeltaAngularVelocity() const {return read_Vector3(deltaAngularVelocity);}
-
-	void setPosition(const vmVector3 &pos) {store_Vector3(pos, fX);}
-	void setLinearVelocity(const vmVector3 &vel) {store_Vector3(vel, fV);}
-	void setAngularVelocity(const vmVector3 &vel) {store_Vector3(vel, fOmega);}
-	void setDeltaLinearVelocity(const vmVector3 &vel) {store_Vector3(vel, deltaLinearVelocity);}
-	void setDeltaAngularVelocity(const vmVector3 &vel) {store_Vector3(vel, deltaAngularVelocity);}
-	void setOrientation(const vmQuat &rot) {store_Quat(rot, fQ);}
-
-	inline void setAuxils(const vmVector3 &centerLocal,const vmVector3 &halfLocal);
-	inline void	setAuxilsCcd(const vmVector3 &centerLocal,const vmVector3 &halfLocal,float timeStep);
-	inline	void reset();
-};
-
-inline
-TrbState::TrbState(const uint8_t m, const vmVector3& x, const vmQuat& q, const vmVector3& v, const vmVector3& omega)
-{
-	setMotionType(m);
-	fX[0] = x[0];
-	fX[1] = x[1];
-	fX[2] = x[2];
-	fQ[0] = q[0];
-	fQ[1] = q[1];
-	fQ[2] = q[2];
-	fQ[3] = q[3];
-	fV[0] = v[0];
-	fV[1] = v[1];
-	fV[2] = v[2];
-	fOmega[0] = omega[0];
-	fOmega[1] = omega[1];
-	fOmega[2] = omega[2];
-	contactFilterSelf=contactFilterTarget=0xffff;
-	trbBodyIdx=0;
-	mSleeping = 0;
-	deleted = 0;
-	useSleep = 1;
-	useCcd = 0;
-	useContactCallback = 0;
-	useSleepCallback = 0;
-	mSleepCount=0;
-	linearDamping = 1.0f;
-	angularDamping = 0.99f;
-}
-
-inline void
-TrbState::setIdentity()
-{
-	fX[0] = 0.0f;
-	fX[1] = 0.0f;
-	fX[2] = 0.0f;
-	fQ[0] = 0.0f;
-	fQ[1] = 0.0f;
-	fQ[2] = 0.0f;
-	fQ[3] = 1.0f;
-	fV[0] = 0.0f;
-	fV[1] = 0.0f;
-	fV[2] = 0.0f;
-	fOmega[0] = 0.0f;
-	fOmega[1] = 0.0f;
-	fOmega[2] = 0.0f;
-}
-
-inline void
-TrbState::setZero()
-{
-	fX[0] = 0.0f;
-	fX[1] = 0.0f;
-	fX[2] = 0.0f;
-	fQ[0] = 0.0f;
-	fQ[1] = 0.0f;
-	fQ[2] = 0.0f;
-	fQ[3] = 0.0f;
-	fV[0] = 0.0f;
-	fV[1] = 0.0f;
-	fV[2] = 0.0f;
-	fOmega[0] = 0.0f;
-	fOmega[1] = 0.0f;
-	fOmega[2] = 0.0f;
-}
-
-inline void
-TrbState::setAuxils(const vmVector3 &centerLocal,const vmVector3 &halfLocal)
-{
-	vmVector3 centerW = getPosition() + rotate(getOrientation(),centerLocal);
-	vmVector3 halfW = absPerElem(vmMatrix3(getOrientation())) * halfLocal;
-	center[0] = centerW[0];
-	center[1] = centerW[1];
-	center[2] = centerW[2];
-	half[0] = halfW[0];
-	half[1] = halfW[1];
-	half[2] = halfW[2];
-}
-
-inline void
-TrbState::setAuxilsCcd(const vmVector3 &centerLocal,const vmVector3 &halfLocal,float timeStep)
-{
-	vmVector3 centerW = getPosition() + rotate(getOrientation(),centerLocal);
-	vmVector3 halfW = absPerElem(vmMatrix3(getOrientation())) * halfLocal;
-
-	vmVector3 diffvec = getLinearVelocity()*timeStep;
-
-	vmVector3 newCenter = centerW + diffvec;
-	vmVector3 aabbMin = minPerElem(newCenter - halfW,centerW - halfW);
-	vmVector3 aabbMax = maxPerElem(newCenter + halfW,centerW + halfW);
-	
-	centerW = 0.5f * (aabbMin + aabbMax);
-	halfW =0.5f * (aabbMax - aabbMin);
-
-	center[0] = centerW[0];
-	center[1] = centerW[1];
-	center[2] = centerW[2];
-
-	half[0] = halfW[0];
-	half[1] = halfW[1];
-	half[2] = halfW[2];
-}
-
-inline
-void TrbState::reset()
-{
-#if 0
-	mSleepCount = 0;
-	mMotionType = PfxMotionTypeActive;
-	mDeleted = 0;
-	mSleeping = 0;
-	mUseSleep = 1;
-	mUseCcd = 0;
-	mUseContactCallback = 0;
-	mUseSleepCallback = 0;
-	mRigidBodyId = 0;
-	mContactFilterSelf = 0xffffffff;
-	mContactFilterTarget = 0xffffffff;
-	mLinearDamping = 1.0f;
-	mAngularDamping = 0.99f;
-	mPosition = vmVector3(0.0f);
-	mOrientation = vmQuat::identity();
-	mLinearVelocity = vmVector3(0.0f);
-	mAngularVelocity = vmVector3(0.0f);
-#endif
-
-	setMotionType(PfxMotionTypeActive);
-	contactFilterSelf=contactFilterTarget=0xffffffff;
-	deleted = 0;
-	mSleeping = 0;
-	useSleep = 1;
-	trbBodyIdx=0;
-	mSleepCount=0;
-	useCcd = 0;
-	useContactCallback = 0;
-	useSleepCallback = 0;
-	linearDamping = 1.0f;
-	angularDamping = 0.99f;
-}
-
-#endif //BT_TRBSTATEVEC_H__
-
-
+/*
+   Copyright (C) 2009 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+
+*/
+
+#ifndef BT_TRBSTATEVEC_H__
+#define BT_TRBSTATEVEC_H__
+
+#include <stdlib.h>
+#ifdef PFX_USE_FREE_VECTORMATH
+#include "vecmath/vmInclude.h"
+#else
+#include "vectormath/vmInclude.h"
+#endif //PFX_USE_FREE_VECTORMATH
+
+
+#include "PlatformDefinitions.h"
+
+
+static inline vmVector3 read_Vector3(const float* p)
+{
+	vmVector3 v;
+	loadXYZ(v, p);
+	return v;
+}
+
+static inline vmQuat read_Quat(const float* p)
+{
+	vmQuat vq;
+	loadXYZW(vq, p);
+	return vq;
+}
+
+static inline void store_Vector3(const vmVector3 &src, float* p)
+{
+	vmVector3 v = src;
+	storeXYZ(v, p);
+}
+
+static inline void store_Quat(const vmQuat &src, float* p)
+{
+	vmQuat vq = src;
+	storeXYZW(vq, p);
+}
+
+// Motion Type
+enum {
+	PfxMotionTypeFixed = 0,
+	PfxMotionTypeActive,
+	PfxMotionTypeKeyframe,
+	PfxMotionTypeOneWay,
+	PfxMotionTypeTrigger,
+	PfxMotionTypeCount
+};
+
+#define PFX_MOTION_MASK_DYNAMIC 0x0a // Active,OneWay
+#define PFX_MOTION_MASK_STATIC  0x95 // Fixed,Keyframe,Trigger,Sleeping
+#define PFX_MOTION_MASK_SLEEP   0x0e // Can sleep
+#define PFX_MOTION_MASK_TYPE    0x7f
+
+//
+// Rigid Body state
+//
+
+#ifdef __CELLOS_LV2__
+ATTRIBUTE_ALIGNED128(class) TrbState
+#else
+ATTRIBUTE_ALIGNED16(class) TrbState
+#endif
+
+{
+public:
+	TrbState()
+	{
+		setMotionType(PfxMotionTypeActive);
+		contactFilterSelf=contactFilterTarget=0xffffffff;
+		deleted = 0;
+		mSleeping = 0;
+		useSleep = 1;
+		trbBodyIdx=0;
+		mSleepCount=0;
+		useCcd = 0;
+		useContactCallback = 0;
+		useSleepCallback = 0;
+		linearDamping = 1.0f;
+		angularDamping = 0.99f;
+	}
+
+	TrbState(const uint8_t m, const vmVector3& x, const vmQuat& q, const vmVector3& v, const vmVector3& omega );
+	
+	uint16_t	mSleepCount;
+	uint8_t		mMotionType;
+	uint8_t		deleted            : 1;
+	uint8_t		mSleeping           : 1;
+	uint8_t		useSleep           : 1;
+	uint8_t		useCcd		       : 1;
+	uint8_t		useContactCallback : 1;
+	uint8_t		useSleepCallback   : 1;
+
+	uint16_t	trbBodyIdx;
+	uint32_t	contactFilterSelf;
+	uint32_t	contactFilterTarget;
+
+	float		center[3];		// AABB center(World)
+	float		half[3];		// AABB half(World)
+
+	float		linearDamping;
+	float		angularDamping;
+	
+	float		deltaLinearVelocity[3];
+	float		deltaAngularVelocity[3];
+
+	float     fX[3];				// position
+	float     fQ[4];				// orientation
+	float     fV[3];				// velocity
+	float     fOmega[3];			// angular velocity
+
+	inline void setZero();      // Zeroes out the elements
+	inline void setIdentity();  // Sets the rotation to identity and zeroes out the other elements
+
+	bool		isDeleted() const {return deleted==1;}
+
+	uint16_t	getRigidBodyId() const {return trbBodyIdx;}
+	void		setRigidBodyId(uint16_t i) {trbBodyIdx = i;}
+
+
+	uint32_t	getContactFilterSelf() const {return contactFilterSelf;}
+	void		setContactFilterSelf(uint32_t filter) {contactFilterSelf = filter;}
+
+	uint32_t	getContactFilterTarget() const {return contactFilterTarget;}
+	void		setContactFilterTarget(uint32_t filter) {contactFilterTarget = filter;}
+
+	float getLinearDamping() const {return linearDamping;}
+	float getAngularDamping() const {return angularDamping;}
+
+	void setLinearDamping(float damping) {linearDamping=damping;}
+	void setAngularDamping(float damping) {angularDamping=damping;}
+
+
+	uint8_t		getMotionType() const {return mMotionType;}
+	void		setMotionType(uint8_t t) {mMotionType = t;mSleeping=0;mSleepCount=0;}
+
+	uint8_t		getMotionMask() const {return (1<<mMotionType)|(mSleeping<<7);}
+
+	bool		isAsleep() const {return mSleeping==1;}
+	bool		isAwake() const {return mSleeping==0;}
+
+	void		wakeup() {mSleeping=0;mSleepCount=0;}
+	void		sleep() {if(useSleep) {mSleeping=1;mSleepCount=0;}}
+
+	uint8_t		getUseSleep() const {return useSleep;}
+	void		setUseSleep(uint8_t b) {useSleep=b;}
+
+	uint8_t		getUseCcd() const {return useCcd;}
+	void		setUseCcd(uint8_t b) {useCcd=b;}
+
+	uint8_t		getUseContactCallback() const {return useContactCallback;}
+	void		setUseContactCallback(uint8_t b) {useContactCallback=b;}
+
+	uint8_t		getUseSleepCallback() const {return useSleepCallback;}
+	void		setUseSleepCallback(uint8_t b) {useSleepCallback=b;}
+
+	void	 	incrementSleepCount() {mSleepCount++;}
+	void		resetSleepCount() {mSleepCount=0;}
+	uint16_t	getSleepCount() const {return mSleepCount;}
+
+	vmVector3 getPosition() const {return read_Vector3(fX);}
+	vmQuat    getOrientation() const {return read_Quat(fQ);}
+	vmVector3 getLinearVelocity() const {return read_Vector3(fV);}
+	vmVector3 getAngularVelocity() const {return read_Vector3(fOmega);}
+	vmVector3 getDeltaLinearVelocity() const {return read_Vector3(deltaLinearVelocity);}
+	vmVector3 getDeltaAngularVelocity() const {return read_Vector3(deltaAngularVelocity);}
+
+	void setPosition(const vmVector3 &pos) {store_Vector3(pos, fX);}
+	void setLinearVelocity(const vmVector3 &vel) {store_Vector3(vel, fV);}
+	void setAngularVelocity(const vmVector3 &vel) {store_Vector3(vel, fOmega);}
+	void setDeltaLinearVelocity(const vmVector3 &vel) {store_Vector3(vel, deltaLinearVelocity);}
+	void setDeltaAngularVelocity(const vmVector3 &vel) {store_Vector3(vel, deltaAngularVelocity);}
+	void setOrientation(const vmQuat &rot) {store_Quat(rot, fQ);}
+
+	inline void setAuxils(const vmVector3 &centerLocal,const vmVector3 &halfLocal);
+	inline void	setAuxilsCcd(const vmVector3 &centerLocal,const vmVector3 &halfLocal,float timeStep);
+	inline	void reset();
+};
+
+inline
+TrbState::TrbState(const uint8_t m, const vmVector3& x, const vmQuat& q, const vmVector3& v, const vmVector3& omega)
+{
+	setMotionType(m);
+	fX[0] = x[0];
+	fX[1] = x[1];
+	fX[2] = x[2];
+	fQ[0] = q[0];
+	fQ[1] = q[1];
+	fQ[2] = q[2];
+	fQ[3] = q[3];
+	fV[0] = v[0];
+	fV[1] = v[1];
+	fV[2] = v[2];
+	fOmega[0] = omega[0];
+	fOmega[1] = omega[1];
+	fOmega[2] = omega[2];
+	contactFilterSelf=contactFilterTarget=0xffff;
+	trbBodyIdx=0;
+	mSleeping = 0;
+	deleted = 0;
+	useSleep = 1;
+	useCcd = 0;
+	useContactCallback = 0;
+	useSleepCallback = 0;
+	mSleepCount=0;
+	linearDamping = 1.0f;
+	angularDamping = 0.99f;
+}
+
+inline void
+TrbState::setIdentity()
+{
+	fX[0] = 0.0f;
+	fX[1] = 0.0f;
+	fX[2] = 0.0f;
+	fQ[0] = 0.0f;
+	fQ[1] = 0.0f;
+	fQ[2] = 0.0f;
+	fQ[3] = 1.0f;
+	fV[0] = 0.0f;
+	fV[1] = 0.0f;
+	fV[2] = 0.0f;
+	fOmega[0] = 0.0f;
+	fOmega[1] = 0.0f;
+	fOmega[2] = 0.0f;
+}
+
+inline void
+TrbState::setZero()
+{
+	fX[0] = 0.0f;
+	fX[1] = 0.0f;
+	fX[2] = 0.0f;
+	fQ[0] = 0.0f;
+	fQ[1] = 0.0f;
+	fQ[2] = 0.0f;
+	fQ[3] = 0.0f;
+	fV[0] = 0.0f;
+	fV[1] = 0.0f;
+	fV[2] = 0.0f;
+	fOmega[0] = 0.0f;
+	fOmega[1] = 0.0f;
+	fOmega[2] = 0.0f;
+}
+
+inline void
+TrbState::setAuxils(const vmVector3 &centerLocal,const vmVector3 &halfLocal)
+{
+	vmVector3 centerW = getPosition() + rotate(getOrientation(),centerLocal);
+	vmVector3 halfW = absPerElem(vmMatrix3(getOrientation())) * halfLocal;
+	center[0] = centerW[0];
+	center[1] = centerW[1];
+	center[2] = centerW[2];
+	half[0] = halfW[0];
+	half[1] = halfW[1];
+	half[2] = halfW[2];
+}
+
+inline void
+TrbState::setAuxilsCcd(const vmVector3 &centerLocal,const vmVector3 &halfLocal,float timeStep)
+{
+	vmVector3 centerW = getPosition() + rotate(getOrientation(),centerLocal);
+	vmVector3 halfW = absPerElem(vmMatrix3(getOrientation())) * halfLocal;
+
+	vmVector3 diffvec = getLinearVelocity()*timeStep;
+
+	vmVector3 newCenter = centerW + diffvec;
+	vmVector3 aabbMin = minPerElem(newCenter - halfW,centerW - halfW);
+	vmVector3 aabbMax = maxPerElem(newCenter + halfW,centerW + halfW);
+	
+	centerW = 0.5f * (aabbMin + aabbMax);
+	halfW =0.5f * (aabbMax - aabbMin);
+
+	center[0] = centerW[0];
+	center[1] = centerW[1];
+	center[2] = centerW[2];
+
+	half[0] = halfW[0];
+	half[1] = halfW[1];
+	half[2] = halfW[2];
+}
+
+inline
+void TrbState::reset()
+{
+#if 0
+	mSleepCount = 0;
+	mMotionType = PfxMotionTypeActive;
+	mDeleted = 0;
+	mSleeping = 0;
+	mUseSleep = 1;
+	mUseCcd = 0;
+	mUseContactCallback = 0;
+	mUseSleepCallback = 0;
+	mRigidBodyId = 0;
+	mContactFilterSelf = 0xffffffff;
+	mContactFilterTarget = 0xffffffff;
+	mLinearDamping = 1.0f;
+	mAngularDamping = 0.99f;
+	mPosition = vmVector3(0.0f);
+	mOrientation = vmQuat::identity();
+	mLinearVelocity = vmVector3(0.0f);
+	mAngularVelocity = vmVector3(0.0f);
+#endif
+
+	setMotionType(PfxMotionTypeActive);
+	contactFilterSelf=contactFilterTarget=0xffffffff;
+	deleted = 0;
+	mSleeping = 0;
+	useSleep = 1;
+	trbBodyIdx=0;
+	mSleepCount=0;
+	useCcd = 0;
+	useContactCallback = 0;
+	useSleepCallback = 0;
+	linearDamping = 1.0f;
+	angularDamping = 0.99f;
+}
+
+#endif //BT_TRBSTATEVEC_H__
+
+
diff --git a/src/BulletSoftBody/btDefaultSoftBodySolver.cpp b/src/BulletSoftBody/btDefaultSoftBodySolver.cpp
index d1435b65c..e90d24e6e 100644
--- a/src/BulletSoftBody/btDefaultSoftBodySolver.cpp
+++ b/src/BulletSoftBody/btDefaultSoftBodySolver.cpp
@@ -130,9 +130,9 @@ void btDefaultSoftBodySolver::processCollision( btSoftBody* softBody, btSoftBody
 }
 
 // For the default solver just leave the soft body to do its collision processing
-void btDefaultSoftBodySolver::processCollision( btSoftBody *softBody, btCollisionObject* collisionObject )
+void btDefaultSoftBodySolver::processCollision( btSoftBody *softBody, const btCollisionObjectWrapper* collisionObjectWrap )
 {
-	softBody->defaultCollisionHandler( collisionObject );
+	softBody->defaultCollisionHandler( collisionObjectWrap );
 } // btDefaultSoftBodySolver::processCollision
 
 
diff --git a/src/BulletSoftBody/btDefaultSoftBodySolver.h b/src/BulletSoftBody/btDefaultSoftBodySolver.h
index 7d9092ce5..1c17ffcbb 100644
--- a/src/BulletSoftBody/btDefaultSoftBodySolver.h
+++ b/src/BulletSoftBody/btDefaultSoftBodySolver.h
@@ -19,7 +19,7 @@ subject to the following restrictions:
 
 #include "BulletSoftBody/btSoftBodySolvers.h"
 #include "btSoftBodySolverVertexBuffer.h"
-
+struct btCollisionObjectWrapper;
 
 class btDefaultSoftBodySolver : public btSoftBodySolver
 {
@@ -54,7 +54,7 @@ public:
 
 	virtual void copySoftBodyToVertexBuffer( const btSoftBody *const softBody, btVertexBufferDescriptor *vertexBuffer );
 
-	virtual void processCollision( btSoftBody *, btCollisionObject* );
+	virtual void processCollision( btSoftBody *, const btCollisionObjectWrapper* );
 
 	virtual void processCollision( btSoftBody*, btSoftBody* );
 
diff --git a/src/BulletSoftBody/btSoftBody.cpp b/src/BulletSoftBody/btSoftBody.cpp
index d1b5eb43d..b05efe598 100644
--- a/src/BulletSoftBody/btSoftBody.cpp
+++ b/src/BulletSoftBody/btSoftBody.cpp
@@ -105,7 +105,7 @@ void	btSoftBody::initDefaults()
 	/* Collision shape	*/ 
 	///for now, create a collision shape internally
 	m_collisionShape = new btSoftBodyCollisionShape(this);
-	m_collisionShape->setMargin(0.25);
+	m_collisionShape->setMargin(0.25f);
 	
 	m_initialWorldTransform.setIdentity();
 
@@ -1388,12 +1388,12 @@ void			btSoftBody::refine(ImplicitFn* ifn,btScalar accurary,bool cut)
 							m=mc*f;
 						}
 						else
-						{ a.m_im/=0.5;m=1/a.m_im; }
+						{ a.m_im/=0.5f;m=1/a.m_im; }
 					}
 					else
 					{
 						if(b.m_im>0)
-						{ b.m_im/=0.5;m=1/b.m_im; }
+						{ b.m_im/=0.5f;m=1/b.m_im; }
 						else
 							m=0;
 					}
@@ -1473,7 +1473,7 @@ void			btSoftBody::refine(ImplicitFn* ifn,btScalar accurary,bool cut)
 			{
 				const btVector3	v=m_nodes[i].m_v;
 				btScalar		m=getMass(i);
-				if(m>0) { m*=0.5;m_nodes[i].m_im/=0.5; }
+				if(m>0) { m*=0.5f;m_nodes[i].m_im/=0.5f; }
 				appendNode(x,m);
 				cnodes[i]=m_nodes.size()-1;
 				m_nodes[cnodes[i]].m_v=v;
@@ -2171,15 +2171,18 @@ btVector3		btSoftBody::evaluateCom() const
 }
 
 //
-bool				btSoftBody::checkContact(	btCollisionObject* colObj,
+bool				btSoftBody::checkContact(	const btCollisionObjectWrapper* colObjWrap,
 											 const btVector3& x,
 											 btScalar margin,
 											 btSoftBody::sCti& cti) const
 {
 	btVector3 nrm;
-	btCollisionShape *shp = colObj->getCollisionShape();
-	btRigidBody *tmpRigid = btRigidBody::upcast(colObj);
-	const btTransform &wtr = tmpRigid ? tmpRigid->getWorldTransform() : colObj->getWorldTransform();
+	const btCollisionShape *shp = colObjWrap->getCollisionShape();
+	const btRigidBody *tmpRigid = btRigidBody::upcast(colObjWrap->getCollisionObject());
+	//const btTransform &wtr = tmpRigid ? tmpRigid->getWorldTransform() : colObjWrap->getWorldTransform();
+	const btTransform &wtr = colObjWrap->getWorldTransform();
+	//todo: check which transform is needed here
+
 	btScalar dst = 
 		m_worldInfo->m_sparsesdf.Evaluate(	
 			wtr.invXform(x),
@@ -2188,7 +2191,7 @@ bool				btSoftBody::checkContact(	btCollisionObject* colObj,
 			margin);
 	if(dst<0)
 	{
-		cti.m_colObj = colObj;
+		cti.m_colObj = colObjWrap->getCollisionObject();
 		cti.m_normal = wtr.getBasis()*nrm;
 		cti.m_offset = -btDot( cti.m_normal, x - cti.m_normal * dst );
 		return(true);
@@ -2910,7 +2913,7 @@ void btSoftBody::PSolve_RContacts(btSoftBody* psb, btScalar kst, btScalar ti)
 	{
 		const RContact&		c = psb->m_rcontacts[i];
 		const sCti&			cti = c.m_cti;	
-		btRigidBody* tmpRigid = btRigidBody::upcast(cti.m_colObj);
+		btRigidBody* tmpRigid = (btRigidBody*)btRigidBody::upcast(cti.m_colObj);
 
 		const btVector3		va = tmpRigid ? tmpRigid->getVelocityInLocalPoint(c.m_c1)*dt : btVector3(0,0,0);
 		const btVector3		vb = c.m_node->m_x-c.m_node->m_q;	
@@ -3031,7 +3034,7 @@ btSoftBody::vsolver_t	btSoftBody::getSolver(eVSolver::_ solver)
 }
 
 //
-void			btSoftBody::defaultCollisionHandler(btCollisionObject* pco)
+void			btSoftBody::defaultCollisionHandler(const btCollisionObjectWrapper* pcoWrap)
 {
 
 	switch(m_cfg.collisions&fCollision::RVSmask)
@@ -3039,22 +3042,22 @@ void			btSoftBody::defaultCollisionHandler(btCollisionObject* pco)
 	case	fCollision::SDF_RS:
 		{
 			btSoftColliders::CollideSDF_RS	docollide;		
-			btRigidBody*		prb1=btRigidBody::upcast(pco);
-			btTransform	wtr=pco->getWorldTransform();
+			btRigidBody*		prb1=(btRigidBody*) btRigidBody::upcast(pcoWrap->getCollisionObject());
+			btTransform	wtr=pcoWrap->getWorldTransform();
 
-			const btTransform	ctr=pco->getWorldTransform();
+			const btTransform	ctr=pcoWrap->getWorldTransform();
 			const btScalar		timemargin=(wtr.getOrigin()-ctr.getOrigin()).length();
 			const btScalar		basemargin=getCollisionShape()->getMargin();
 			btVector3			mins;
 			btVector3			maxs;
 			ATTRIBUTE_ALIGNED16(btDbvtVolume)		volume;
-			pco->getCollisionShape()->getAabb(	pco->getWorldTransform(),
+			pcoWrap->getCollisionShape()->getAabb(	pcoWrap->getWorldTransform(),
 				mins,
 				maxs);
 			volume=btDbvtVolume::FromMM(mins,maxs);
 			volume.Expand(btVector3(basemargin,basemargin,basemargin));		
 			docollide.psb		=	this;
-			docollide.m_colObj1 = pco;
+			docollide.m_colObj1Wrap = pcoWrap;
 			docollide.m_rigidBody = prb1;
 
 			docollide.dynmargin	=	basemargin+timemargin;
@@ -3065,7 +3068,7 @@ void			btSoftBody::defaultCollisionHandler(btCollisionObject* pco)
 	case	fCollision::CL_RS:
 		{
 			btSoftColliders::CollideCL_RS	collider;
-			collider.Process(this,pco);
+			collider.Process(this,pcoWrap);
 		}
 		break;
 	}
diff --git a/src/BulletSoftBody/btSoftBody.h b/src/BulletSoftBody/btSoftBody.h
index ba589486f..981c83240 100644
--- a/src/BulletSoftBody/btSoftBody.h
+++ b/src/BulletSoftBody/btSoftBody.h
@@ -69,7 +69,7 @@ struct	btSoftBodyWorldInfo
 class	btSoftBody : public btCollisionObject
 {
 public:
-	btAlignedObjectArray<class btCollisionObject*> m_collisionDisabledObjects;
+	btAlignedObjectArray<const class btCollisionObject*> m_collisionDisabledObjects;
 
 	// The solver object that handles this soft body
 	btSoftBodySolver *m_softBodySolver;
@@ -182,7 +182,7 @@ public:
 	/* sCti is Softbody contact info	*/ 
 	struct	sCti
 	{
-		btCollisionObject*	m_colObj;		/* Rigid body			*/ 
+		const btCollisionObject*	m_colObj;		/* Rigid body			*/ 
 		btVector3		m_normal;	/* Outward normal		*/ 
 		btScalar		m_offset;	/* Offset from origin	*/ 
 	};	
@@ -374,13 +374,13 @@ public:
 	{
 		Cluster*			m_soft;
 		btRigidBody*		m_rigid;
-		btCollisionObject*	m_collisionObject;
+		const btCollisionObject*	m_collisionObject;
 
 		Body() : m_soft(0),m_rigid(0),m_collisionObject(0)				{}
 		Body(Cluster* p) : m_soft(p),m_rigid(0),m_collisionObject(0)	{}
-		Body(btCollisionObject* colObj) : m_soft(0),m_collisionObject(colObj)
+		Body(const btCollisionObject* colObj) : m_soft(0),m_collisionObject(colObj)
 		{
-			m_rigid = btRigidBody::upcast(m_collisionObject);
+			m_rigid = (btRigidBody*)btRigidBody::upcast(m_collisionObject);
 		}
 
 		void						activate() const
@@ -867,7 +867,7 @@ public:
 	/* integrateMotion														*/ 
 	void				integrateMotion();
 	/* defaultCollisionHandlers												*/ 
-	void				defaultCollisionHandler(btCollisionObject* pco);
+	void				defaultCollisionHandler(const btCollisionObjectWrapper* pcoWrap);
 	void				defaultCollisionHandler(btSoftBody* psb);
 
 
@@ -949,7 +949,7 @@ public:
 		btScalar& mint,eFeature::_& feature,int& index,bool bcountonly) const;
 	void				initializeFaceTree();
 	btVector3			evaluateCom() const;
-	bool				checkContact(btCollisionObject* colObj,const btVector3& x,btScalar margin,btSoftBody::sCti& cti) const;
+	bool				checkContact(const btCollisionObjectWrapper* colObjWrap,const btVector3& x,btScalar margin,btSoftBody::sCti& cti) const;
 	void				updateNormals();
 	void				updateBounds();
 	void				updatePose();
diff --git a/src/BulletSoftBody/btSoftBodyConcaveCollisionAlgorithm.cpp b/src/BulletSoftBody/btSoftBodyConcaveCollisionAlgorithm.cpp
index d99be3b81..6e94d0a81 100644
--- a/src/BulletSoftBody/btSoftBodyConcaveCollisionAlgorithm.cpp
+++ b/src/BulletSoftBody/btSoftBodyConcaveCollisionAlgorithm.cpp
@@ -25,7 +25,7 @@ subject to the following restrictions:
 #include "BulletCollision/CollisionShapes/btSphereShape.h"
 #include "BulletCollision/CollisionShapes/btTetrahedronShape.h"
 #include "BulletCollision/CollisionShapes/btConvexHullShape.h"
-
+#include "BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h"
 
 
 #include "LinearMath/btIDebugDraw.h"
@@ -34,10 +34,10 @@ subject to the following restrictions:
 
 #define BT_SOFTBODY_TRIANGLE_EXTRUSION btScalar(0.06)//make this configurable
 
-btSoftBodyConcaveCollisionAlgorithm::btSoftBodyConcaveCollisionAlgorithm( const btCollisionAlgorithmConstructionInfo& ci, btCollisionObject* body0,btCollisionObject* body1,bool isSwapped)
+btSoftBodyConcaveCollisionAlgorithm::btSoftBodyConcaveCollisionAlgorithm( const btCollisionAlgorithmConstructionInfo& ci, const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,bool isSwapped)
 : btCollisionAlgorithm(ci),
 m_isSwapped(isSwapped),
-m_btSoftBodyTriangleCallback(ci.m_dispatcher1,body0,body1,isSwapped)
+m_btSoftBodyTriangleCallback(ci.m_dispatcher1,body0Wrap,body1Wrap,isSwapped)
 {
 }
 
@@ -49,12 +49,12 @@ btSoftBodyConcaveCollisionAlgorithm::~btSoftBodyConcaveCollisionAlgorithm()
 
 
 
-btSoftBodyTriangleCallback::btSoftBodyTriangleCallback(btDispatcher*  dispatcher,btCollisionObject* body0,btCollisionObject* body1,bool isSwapped):
+btSoftBodyTriangleCallback::btSoftBodyTriangleCallback(btDispatcher*  dispatcher,const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,bool isSwapped):
 m_dispatcher(dispatcher),
 m_dispatchInfoPtr(0)
 {
-	m_softBody = (btSoftBody*) (isSwapped? body1:body0);
-	m_triBody = isSwapped? body0:body1;
+	m_softBody = (isSwapped? (btSoftBody*)body1Wrap->getCollisionObject():(btSoftBody*)body0Wrap->getCollisionObject());
+	m_triBody = isSwapped? body0Wrap->getCollisionObject():body1Wrap->getCollisionObject();
 
 	//
 	// create the manifold from the dispatcher 'manifold pool'
@@ -90,7 +90,7 @@ void btSoftBodyTriangleCallback::processTriangle(btVector3* triangle,int partId,
 {
 	//just for debugging purposes
 	//printf("triangle %d",m_triangleCount++);
-	btCollisionObject* ob = static_cast<btCollisionObject*>(m_triBody);
+	
 	btCollisionAlgorithmConstructionInfo ci;
 	ci.m_dispatcher1 = m_dispatcher;
 
@@ -98,7 +98,7 @@ void btSoftBodyTriangleCallback::processTriangle(btVector3* triangle,int partId,
 	if (m_dispatchInfoPtr && m_dispatchInfoPtr->m_debugDraw && (m_dispatchInfoPtr->m_debugDraw->getDebugMode() &btIDebugDraw::DBG_DrawWireframe))
 	{
 		btVector3 color(1,1,0);
-		btTransform& tr = ob->getWorldTransform();
+		const btTransform& tr = m_triBody->getWorldTransform();
 		m_dispatchInfoPtr->m_debugDraw->drawLine(tr(triangle[0]),tr(triangle[1]),color);
 		m_dispatchInfoPtr->m_debugDraw->drawLine(tr(triangle[1]),tr(triangle[2]),color);
 		m_dispatchInfoPtr->m_debugDraw->drawLine(tr(triangle[2]),tr(triangle[0]),color);
@@ -115,18 +115,18 @@ void btSoftBodyTriangleCallback::processTriangle(btVector3* triangle,int partId,
 		btAssert(tm);
 
 		//copy over user pointers to temporary shape
-		tm->setUserPointer(ob->getRootCollisionShape()->getUserPointer());
+		tm->setUserPointer(m_triBody->getCollisionShape()->getUserPointer());
 
-		btCollisionShape* tmpShape = ob->getCollisionShape();
-		ob->internalSetTemporaryCollisionShape( tm );
+		btCollisionObjectWrapper softBody(0,m_softBody->getCollisionShape(),m_softBody,m_softBody->getWorldTransform());
+		//btCollisionObjectWrapper triBody(0,tm, ob, btTransform::getIdentity());//ob->getWorldTransform());//??
+		btCollisionObjectWrapper triBody(0,tm, m_triBody, m_triBody->getWorldTransform());
 
+		btCollisionAlgorithm* colAlgo = ci.m_dispatcher1->findAlgorithm(&softBody,&triBody,0);//m_manifoldPtr);
 
-		btCollisionAlgorithm* colAlgo = ci.m_dispatcher1->findAlgorithm(m_softBody,m_triBody,0);//m_manifoldPtr);
-
-		colAlgo->processCollision(m_softBody,m_triBody,*m_dispatchInfoPtr,m_resultOut);
+		colAlgo->processCollision(&softBody,&triBody,*m_dispatchInfoPtr,m_resultOut);
 		colAlgo->~btCollisionAlgorithm();
 		ci.m_dispatcher1->freeCollisionAlgorithm(colAlgo);
-		ob->internalSetTemporaryCollisionShape( tmpShape);
+		
 		return;
 	}
 
@@ -158,24 +158,18 @@ void btSoftBodyTriangleCallback::processTriangle(btVector3* triangle,int partId,
 		//	tm.setMargin(m_collisionMarginTriangle);
 
 		//copy over user pointers to temporary shape
-		tm->setUserPointer(ob->getRootCollisionShape()->getUserPointer());
+		tm->setUserPointer(m_triBody->getCollisionShape()->getUserPointer());
 
-		btCollisionShape* tmpShape = ob->getCollisionShape();
-		ob->internalSetTemporaryCollisionShape( tm );
+		
+		btCollisionObjectWrapper softBody(0,m_softBody->getCollisionShape(),m_softBody,m_softBody->getWorldTransform());
+		btCollisionObjectWrapper triBody(0,tm, m_triBody, m_triBody->getWorldTransform());//btTransform::getIdentity());//??
 
+		btCollisionAlgorithm* colAlgo = ci.m_dispatcher1->findAlgorithm(&softBody,&triBody,0);//m_manifoldPtr);
 
-		btCollisionAlgorithm* colAlgo = ci.m_dispatcher1->findAlgorithm(m_softBody,m_triBody,0);//m_manifoldPtr);
-		///this should use the btDispatcher, so the actual registered algorithm is used
-		//		btConvexConvexAlgorithm cvxcvxalgo(m_manifoldPtr,ci,m_convexBody,m_triBody);
-
-		//m_resultOut->setShapeIdentifiersB(partId,triangleIndex);
-		//		cvxcvxalgo.processCollision(m_convexBody,m_triBody,*m_dispatchInfoPtr,m_resultOut);
-		colAlgo->processCollision(m_softBody,m_triBody,*m_dispatchInfoPtr,m_resultOut);
+		colAlgo->processCollision(&softBody,&triBody,*m_dispatchInfoPtr,m_resultOut);
 		colAlgo->~btCollisionAlgorithm();
 		ci.m_dispatcher1->freeCollisionAlgorithm(colAlgo);
 
-
-		ob->internalSetTemporaryCollisionShape( tmpShape );
 		triIndex.m_childShape = tm;
 		m_shapeCache.insert(triKey,triIndex);
 
@@ -187,7 +181,7 @@ void btSoftBodyTriangleCallback::processTriangle(btVector3* triangle,int partId,
 
 
 
-void	btSoftBodyTriangleCallback::setTimeStepAndCounters(btScalar collisionMarginTriangle,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
+void	btSoftBodyTriangleCallback::setTimeStepAndCounters(btScalar collisionMarginTriangle,const btCollisionObjectWrapper* triBodyWrap, const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
 {
 	m_dispatchInfoPtr = &dispatchInfo;
 	m_collisionMarginTriangle = collisionMarginTriangle+btScalar(BT_SOFTBODY_TRIANGLE_EXTRUSION);
@@ -204,7 +198,7 @@ void	btSoftBodyTriangleCallback::setTimeStepAndCounters(btScalar collisionMargin
 	softTransform.setOrigin(softBodyCenter);
 
 	btTransform convexInTriangleSpace;
-	convexInTriangleSpace = m_triBody->getWorldTransform().inverse() * softTransform;
+	convexInTriangleSpace = triBodyWrap->getWorldTransform().inverse() * softTransform;
 	btTransformAabb(halfExtents,m_collisionMarginTriangle,convexInTriangleSpace,m_aabbMin,m_aabbMax);
 }
 
@@ -214,33 +208,28 @@ void btSoftBodyConcaveCollisionAlgorithm::clearCache()
 
 }
 
-void btSoftBodyConcaveCollisionAlgorithm::processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
+void btSoftBodyConcaveCollisionAlgorithm::processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
 {
 
 
 	//btCollisionObject* convexBody = m_isSwapped ? body1 : body0;
-	btCollisionObject* triBody = m_isSwapped ? body0 : body1;
+	const btCollisionObjectWrapper* triBody = m_isSwapped ? body0Wrap : body1Wrap;
 
 	if (triBody->getCollisionShape()->isConcave())
 	{
 
 
-		btCollisionObject*	triOb = triBody;
-		btConcaveShape* concaveShape = static_cast<btConcaveShape*>( triOb->getCollisionShape());
+		const btCollisionObject*	triOb = triBody->getCollisionObject();
+		const btConcaveShape* concaveShape = static_cast<const btConcaveShape*>( triOb->getCollisionShape());
 
 		//	if (convexBody->getCollisionShape()->isConvex())
 		{
 			btScalar collisionMarginTriangle = concaveShape->getMargin();
 
 			//			resultOut->setPersistentManifold(m_btSoftBodyTriangleCallback.m_manifoldPtr);
-			m_btSoftBodyTriangleCallback.setTimeStepAndCounters(collisionMarginTriangle,dispatchInfo,resultOut);
-
-			//Disable persistency. previously, some older algorithm calculated all contacts in one go, so you can clear it here.
-			//m_dispatcher->clearManifold(m_btSoftBodyTriangleCallback.m_manifoldPtr);
-
-			//			m_btSoftBodyTriangleCallback.m_manifoldPtr->setBodies(convexBody,triBody);
-
+			m_btSoftBodyTriangleCallback.setTimeStepAndCounters(collisionMarginTriangle,triBody,dispatchInfo,resultOut);
 
+		
 			concaveShape->processAllTriangles( &m_btSoftBodyTriangleCallback,m_btSoftBodyTriangleCallback.getAabbMin(),m_btSoftBodyTriangleCallback.getAabbMax());
 
 			//	resultOut->refreshContactPoints();
diff --git a/src/BulletSoftBody/btSoftBodyConcaveCollisionAlgorithm.h b/src/BulletSoftBody/btSoftBodyConcaveCollisionAlgorithm.h
index 11ec5b37e..11c7b88f9 100644
--- a/src/BulletSoftBody/btSoftBodyConcaveCollisionAlgorithm.h
+++ b/src/BulletSoftBody/btSoftBodyConcaveCollisionAlgorithm.h
@@ -45,7 +45,9 @@ struct btTriIndex
 	int	getTriangleIndex() const
 	{
 		// Get only the lower bits where the triangle index is stored
-		return (m_PartIdTriangleIndex&~((~0)<<(31-MAX_NUM_PARTS_IN_BITS)));
+		unsigned int x = 0;
+		unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS);
+		return (m_PartIdTriangleIndex&~(y));
 	}
 	int	getPartId() const
 	{
@@ -63,7 +65,7 @@ struct btTriIndex
 class btSoftBodyTriangleCallback : public btTriangleCallback
 {
 	btSoftBody* m_softBody;
-	btCollisionObject* m_triBody;
+	const btCollisionObject* m_triBody;
 
 	btVector3	m_aabbMin;
 	btVector3	m_aabbMax ;
@@ -81,9 +83,9 @@ public:
 
 	//	btPersistentManifold*	m_manifoldPtr;
 
-	btSoftBodyTriangleCallback(btDispatcher* dispatcher,btCollisionObject* body0,btCollisionObject* body1,bool isSwapped);
+	btSoftBodyTriangleCallback(btDispatcher* dispatcher,const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,bool isSwapped);
 
-	void	setTimeStepAndCounters(btScalar collisionMarginTriangle,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
+	void	setTimeStepAndCounters(btScalar collisionMarginTriangle,const btCollisionObjectWrapper* triObjWrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 
 	virtual ~btSoftBodyTriangleCallback();
 
@@ -115,11 +117,11 @@ class btSoftBodyConcaveCollisionAlgorithm  : public btCollisionAlgorithm
 
 public:
 
-	btSoftBodyConcaveCollisionAlgorithm( const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* body0,btCollisionObject* body1,bool isSwapped);
+	btSoftBodyConcaveCollisionAlgorithm( const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,bool isSwapped);
 
 	virtual ~btSoftBodyConcaveCollisionAlgorithm();
 
-	virtual void processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
+	virtual void processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 
 	btScalar	calculateTimeOfImpact(btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 
@@ -132,19 +134,19 @@ public:
 
 	struct CreateFunc :public 	btCollisionAlgorithmCreateFunc
 	{
-		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, btCollisionObject* body0,btCollisionObject* body1)
+		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
 		{
 			void* mem = ci.m_dispatcher1->allocateCollisionAlgorithm(sizeof(btSoftBodyConcaveCollisionAlgorithm));
-			return new(mem) btSoftBodyConcaveCollisionAlgorithm(ci,body0,body1,false);
+			return new(mem) btSoftBodyConcaveCollisionAlgorithm(ci,body0Wrap,body1Wrap,false);
 		}
 	};
 
 	struct SwappedCreateFunc :public 	btCollisionAlgorithmCreateFunc
 	{
-		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, btCollisionObject* body0,btCollisionObject* body1)
+		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
 		{
 			void* mem = ci.m_dispatcher1->allocateCollisionAlgorithm(sizeof(btSoftBodyConcaveCollisionAlgorithm));
-			return new(mem) btSoftBodyConcaveCollisionAlgorithm(ci,body0,body1,true);
+			return new(mem) btSoftBodyConcaveCollisionAlgorithm(ci,body0Wrap,body1Wrap,true);
 		}
 	};
 
diff --git a/src/BulletSoftBody/btSoftBodyData.h b/src/BulletSoftBody/btSoftBodyData.h
index a2fde77a6..40dc65c3d 100644
--- a/src/BulletSoftBody/btSoftBodyData.h
+++ b/src/BulletSoftBody/btSoftBodyData.h
@@ -1,217 +1,217 @@
-/*
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
-
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose,
-including commercial applications, and to alter it and redistribute it freely,
-subject to the following restrictions:
-
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-*/
-
-#ifndef BT_SOFTBODY_FLOAT_DATA
-#define BT_SOFTBODY_FLOAT_DATA
-
-#include "BulletCollision/CollisionDispatch/btCollisionObject.h"
-
-
-
-struct	SoftBodyMaterialData
-{
-	float	m_linearStiffness;
-	float	m_angularStiffness;
-	float	m_volumeStiffness;
-	int		m_flags;
-};
-
-struct	SoftBodyNodeData
-{
-	SoftBodyMaterialData		*m_material;
-	btVector3FloatData			m_position;
-	btVector3FloatData			m_previousPosition;
-	btVector3FloatData			m_velocity;
-	btVector3FloatData			m_accumulatedForce;
-	btVector3FloatData			m_normal;
-	float						m_inverseMass;
-	float						m_area;
-	int							m_attach;
-	int							m_pad;
-};
-
-struct	SoftBodyLinkData
-{
-	SoftBodyMaterialData	*m_material;
-	int						m_nodeIndices[2];			// Node pointers
-	float					m_restLength;			// Rest length		
-	int						m_bbending;		// Bending link
-};
-
-struct	SoftBodyFaceData
-{
-	btVector3FloatData		m_normal;		// Normal
-	SoftBodyMaterialData	*m_material;
-	int						m_nodeIndices[3];			// Node pointers
-	float					m_restArea;			// Rest area
-};	
-
-struct	SoftBodyTetraData
-{
-	btVector3FloatData		m_c0[4];		// gradients
-	SoftBodyMaterialData	*m_material;
-	int						m_nodeIndices[4];			// Node pointers		
-	float					m_restVolume;			// Rest volume
-	float					m_c1;			// (4*kVST)/(im0+im1+im2+im3)
-	float					m_c2;			// m_c1/sum(|g0..3|^2)
-	int						m_pad;
-};
-
-struct	SoftRigidAnchorData
-{
-	btMatrix3x3FloatData	m_c0;			// Impulse matrix
-	btVector3FloatData		m_c1;			// Relative anchor
-	btVector3FloatData		m_localFrame;		// Anchor position in body space
-	btRigidBodyData			*m_rigidBody;
-	int						m_nodeIndex;			// Node pointer
-	float					m_c2;			// ima*dt
-};
-
-
-
-struct	SoftBodyConfigData
-{
-	int					m_aeroModel;		// Aerodynamic model (default: V_Point)
-	float				m_baumgarte;			// Velocities correction factor (Baumgarte)
-	float				m_damping;			// Damping coefficient [0,1]
-	float				m_drag;			// Drag coefficient [0,+inf]
-	float				m_lift;			// Lift coefficient [0,+inf]
-	float				m_pressure;			// Pressure coefficient [-inf,+inf]
-	float				m_volume;			// Volume conversation coefficient [0,+inf]
-	float				m_dynamicFriction;			// Dynamic friction coefficient [0,1]
-	float				m_poseMatch;			// Pose matching coefficient [0,1]		
-	float				m_rigidContactHardness;			// Rigid contacts hardness [0,1]
-	float				m_kineticContactHardness;			// Kinetic contacts hardness [0,1]
-	float				m_softContactHardness;			// Soft contacts hardness [0,1]
-	float				m_anchorHardness;			// Anchors hardness [0,1]
-	float				m_softRigidClusterHardness;		// Soft vs rigid hardness [0,1] (cluster only)
-	float				m_softKineticClusterHardness;		// Soft vs kinetic hardness [0,1] (cluster only)
-	float				m_softSoftClusterHardness;		// Soft vs soft hardness [0,1] (cluster only)
-	float				m_softRigidClusterImpulseSplit;	// Soft vs rigid impulse split [0,1] (cluster only)
-	float				m_softKineticClusterImpulseSplit;	// Soft vs rigid impulse split [0,1] (cluster only)
-	float				m_softSoftClusterImpulseSplit;	// Soft vs rigid impulse split [0,1] (cluster only)
-	float				m_maxVolume;		// Maximum volume ratio for pose
-	float				m_timeScale;		// Time scale
-	int					m_velocityIterations;	// Velocities solver iterations
-	int					m_positionIterations;	// Positions solver iterations
-	int					m_driftIterations;	// Drift solver iterations
-	int					m_clusterIterations;	// Cluster solver iterations
-	int					m_collisionFlags;	// Collisions flags
-};
-
-struct	SoftBodyPoseData
-{
-	btMatrix3x3FloatData	m_rot;			// Rotation
-	btMatrix3x3FloatData	m_scale;			// Scale
-	btMatrix3x3FloatData	m_aqq;			// Base scaling
-	btVector3FloatData		m_com;			// COM
-
-	btVector3FloatData		*m_positions;			// Reference positions
-	float					*m_weights;	// Weights
-	int						m_numPositions;
-	int						m_numWeigts;
-
-	int						m_bvolume;		// Is valid
-	int						m_bframe;		// Is frame
-	float					m_restVolume;		// Rest volume
-	int						m_pad;
-};
-
-struct	SoftBodyClusterData
-{
-		btTransformFloatData		m_framexform;
-		btMatrix3x3FloatData		m_locii;
-		btMatrix3x3FloatData		m_invwi;
-		btVector3FloatData			m_com;
-		btVector3FloatData			m_vimpulses[2];
-		btVector3FloatData			m_dimpulses[2];
-		btVector3FloatData			m_lv;
-		btVector3FloatData			m_av;
-		
-		btVector3FloatData			*m_framerefs;
-		int							*m_nodeIndices;
-		float						*m_masses;
-
-		int							m_numFrameRefs;
-		int							m_numNodes;
-		int							m_numMasses;
-
-		float						m_idmass;
-		float						m_imass;
-		int							m_nvimpulses;
-		int							m_ndimpulses;
-		float						m_ndamping;
-		float						m_ldamping;
-		float						m_adamping;
-		float						m_matching;
-		float						m_maxSelfCollisionImpulse;
-		float						m_selfCollisionImpulseFactor;
-		int							m_containsAnchor;
-		int							m_collide;
-		int							m_clusterIndex;
-};
-
-
-enum	btSoftJointBodyType
-{
-	BT_JOINT_SOFT_BODY_CLUSTER=1,
-	BT_JOINT_RIGID_BODY,
-	BT_JOINT_COLLISION_OBJECT
-};
-
-struct	btSoftBodyJointData
-{
-	void						*m_bodyA;
-	void						*m_bodyB;
-	btVector3FloatData			m_refs[2];
-	float						m_cfm;
-	float						m_erp;
-	float						m_split;
-	int							m_delete;
-	btVector3FloatData			m_relPosition[2];//linear
-	int							m_bodyAtype;
-	int							m_bodyBtype;
-	int							m_jointType;
-	int							m_pad;
-};
-
-///do not change those serialization structures, it requires an updated sBulletDNAstr/sBulletDNAstr64
-struct	btSoftBodyFloatData
-{
-	btCollisionObjectFloatData	m_collisionObjectData;
-
-	SoftBodyPoseData		*m_pose;
-	SoftBodyMaterialData	**m_materials;
-	SoftBodyNodeData		*m_nodes;
-	SoftBodyLinkData		*m_links;
-	SoftBodyFaceData		*m_faces;
-	SoftBodyTetraData		*m_tetrahedra;
-	SoftRigidAnchorData		*m_anchors;
-	SoftBodyClusterData		*m_clusters;
-	btSoftBodyJointData		*m_joints;
-
-	int						m_numMaterials;
-	int						m_numNodes;
-	int						m_numLinks;
-	int						m_numFaces;
-	int						m_numTetrahedra;
-	int						m_numAnchors;
-	int						m_numClusters;
-	int						m_numJoints;
-	SoftBodyConfigData		m_config;
-};
-
-#endif //BT_SOFTBODY_FLOAT_DATA
-
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef BT_SOFTBODY_FLOAT_DATA
+#define BT_SOFTBODY_FLOAT_DATA
+
+#include "BulletCollision/CollisionDispatch/btCollisionObject.h"
+
+
+
+struct	SoftBodyMaterialData
+{
+	float	m_linearStiffness;
+	float	m_angularStiffness;
+	float	m_volumeStiffness;
+	int		m_flags;
+};
+
+struct	SoftBodyNodeData
+{
+	SoftBodyMaterialData		*m_material;
+	btVector3FloatData			m_position;
+	btVector3FloatData			m_previousPosition;
+	btVector3FloatData			m_velocity;
+	btVector3FloatData			m_accumulatedForce;
+	btVector3FloatData			m_normal;
+	float						m_inverseMass;
+	float						m_area;
+	int							m_attach;
+	int							m_pad;
+};
+
+struct	SoftBodyLinkData
+{
+	SoftBodyMaterialData	*m_material;
+	int						m_nodeIndices[2];			// Node pointers
+	float					m_restLength;			// Rest length		
+	int						m_bbending;		// Bending link
+};
+
+struct	SoftBodyFaceData
+{
+	btVector3FloatData		m_normal;		// Normal
+	SoftBodyMaterialData	*m_material;
+	int						m_nodeIndices[3];			// Node pointers
+	float					m_restArea;			// Rest area
+};	
+
+struct	SoftBodyTetraData
+{
+	btVector3FloatData		m_c0[4];		// gradients
+	SoftBodyMaterialData	*m_material;
+	int						m_nodeIndices[4];			// Node pointers		
+	float					m_restVolume;			// Rest volume
+	float					m_c1;			// (4*kVST)/(im0+im1+im2+im3)
+	float					m_c2;			// m_c1/sum(|g0..3|^2)
+	int						m_pad;
+};
+
+struct	SoftRigidAnchorData
+{
+	btMatrix3x3FloatData	m_c0;			// Impulse matrix
+	btVector3FloatData		m_c1;			// Relative anchor
+	btVector3FloatData		m_localFrame;		// Anchor position in body space
+	btRigidBodyData			*m_rigidBody;
+	int						m_nodeIndex;			// Node pointer
+	float					m_c2;			// ima*dt
+};
+
+
+
+struct	SoftBodyConfigData
+{
+	int					m_aeroModel;		// Aerodynamic model (default: V_Point)
+	float				m_baumgarte;			// Velocities correction factor (Baumgarte)
+	float				m_damping;			// Damping coefficient [0,1]
+	float				m_drag;			// Drag coefficient [0,+inf]
+	float				m_lift;			// Lift coefficient [0,+inf]
+	float				m_pressure;			// Pressure coefficient [-inf,+inf]
+	float				m_volume;			// Volume conversation coefficient [0,+inf]
+	float				m_dynamicFriction;			// Dynamic friction coefficient [0,1]
+	float				m_poseMatch;			// Pose matching coefficient [0,1]		
+	float				m_rigidContactHardness;			// Rigid contacts hardness [0,1]
+	float				m_kineticContactHardness;			// Kinetic contacts hardness [0,1]
+	float				m_softContactHardness;			// Soft contacts hardness [0,1]
+	float				m_anchorHardness;			// Anchors hardness [0,1]
+	float				m_softRigidClusterHardness;		// Soft vs rigid hardness [0,1] (cluster only)
+	float				m_softKineticClusterHardness;		// Soft vs kinetic hardness [0,1] (cluster only)
+	float				m_softSoftClusterHardness;		// Soft vs soft hardness [0,1] (cluster only)
+	float				m_softRigidClusterImpulseSplit;	// Soft vs rigid impulse split [0,1] (cluster only)
+	float				m_softKineticClusterImpulseSplit;	// Soft vs rigid impulse split [0,1] (cluster only)
+	float				m_softSoftClusterImpulseSplit;	// Soft vs rigid impulse split [0,1] (cluster only)
+	float				m_maxVolume;		// Maximum volume ratio for pose
+	float				m_timeScale;		// Time scale
+	int					m_velocityIterations;	// Velocities solver iterations
+	int					m_positionIterations;	// Positions solver iterations
+	int					m_driftIterations;	// Drift solver iterations
+	int					m_clusterIterations;	// Cluster solver iterations
+	int					m_collisionFlags;	// Collisions flags
+};
+
+struct	SoftBodyPoseData
+{
+	btMatrix3x3FloatData	m_rot;			// Rotation
+	btMatrix3x3FloatData	m_scale;			// Scale
+	btMatrix3x3FloatData	m_aqq;			// Base scaling
+	btVector3FloatData		m_com;			// COM
+
+	btVector3FloatData		*m_positions;			// Reference positions
+	float					*m_weights;	// Weights
+	int						m_numPositions;
+	int						m_numWeigts;
+
+	int						m_bvolume;		// Is valid
+	int						m_bframe;		// Is frame
+	float					m_restVolume;		// Rest volume
+	int						m_pad;
+};
+
+struct	SoftBodyClusterData
+{
+		btTransformFloatData		m_framexform;
+		btMatrix3x3FloatData		m_locii;
+		btMatrix3x3FloatData		m_invwi;
+		btVector3FloatData			m_com;
+		btVector3FloatData			m_vimpulses[2];
+		btVector3FloatData			m_dimpulses[2];
+		btVector3FloatData			m_lv;
+		btVector3FloatData			m_av;
+		
+		btVector3FloatData			*m_framerefs;
+		int							*m_nodeIndices;
+		float						*m_masses;
+
+		int							m_numFrameRefs;
+		int							m_numNodes;
+		int							m_numMasses;
+
+		float						m_idmass;
+		float						m_imass;
+		int							m_nvimpulses;
+		int							m_ndimpulses;
+		float						m_ndamping;
+		float						m_ldamping;
+		float						m_adamping;
+		float						m_matching;
+		float						m_maxSelfCollisionImpulse;
+		float						m_selfCollisionImpulseFactor;
+		int							m_containsAnchor;
+		int							m_collide;
+		int							m_clusterIndex;
+};
+
+
+enum	btSoftJointBodyType
+{
+	BT_JOINT_SOFT_BODY_CLUSTER=1,
+	BT_JOINT_RIGID_BODY,
+	BT_JOINT_COLLISION_OBJECT
+};
+
+struct	btSoftBodyJointData
+{
+	void						*m_bodyA;
+	void						*m_bodyB;
+	btVector3FloatData			m_refs[2];
+	float						m_cfm;
+	float						m_erp;
+	float						m_split;
+	int							m_delete;
+	btVector3FloatData			m_relPosition[2];//linear
+	int							m_bodyAtype;
+	int							m_bodyBtype;
+	int							m_jointType;
+	int							m_pad;
+};
+
+///do not change those serialization structures, it requires an updated sBulletDNAstr/sBulletDNAstr64
+struct	btSoftBodyFloatData
+{
+	btCollisionObjectFloatData	m_collisionObjectData;
+
+	SoftBodyPoseData		*m_pose;
+	SoftBodyMaterialData	**m_materials;
+	SoftBodyNodeData		*m_nodes;
+	SoftBodyLinkData		*m_links;
+	SoftBodyFaceData		*m_faces;
+	SoftBodyTetraData		*m_tetrahedra;
+	SoftRigidAnchorData		*m_anchors;
+	SoftBodyClusterData		*m_clusters;
+	btSoftBodyJointData		*m_joints;
+
+	int						m_numMaterials;
+	int						m_numNodes;
+	int						m_numLinks;
+	int						m_numFaces;
+	int						m_numTetrahedra;
+	int						m_numAnchors;
+	int						m_numClusters;
+	int						m_numJoints;
+	SoftBodyConfigData		m_config;
+};
+
+#endif //BT_SOFTBODY_FLOAT_DATA
+
diff --git a/src/BulletSoftBody/btSoftBodyInternals.h b/src/BulletSoftBody/btSoftBodyInternals.h
index 5ef8db193..efb2948a6 100644
--- a/src/BulletSoftBody/btSoftBodyInternals.h
+++ b/src/BulletSoftBody/btSoftBodyInternals.h
@@ -666,7 +666,7 @@ struct btSoftColliders
 			threshold	=(btScalar)0;
 		}
 		bool				SolveContact(	const btGjkEpaSolver2::sResults& res,
-			btSoftBody::Body ba,btSoftBody::Body bb,
+			btSoftBody::Body ba,const btSoftBody::Body bb,
 			btSoftBody::CJoint& joint)
 		{
 			if(res.distance<m_margin)
@@ -717,30 +717,30 @@ struct btSoftColliders
 	struct	CollideCL_RS : ClusterBase
 	{
 		btSoftBody*		psb;
-		
-		btCollisionObject*	m_colObj;
+		const btCollisionObjectWrapper*	m_colObjWrap;
+
 		void		Process(const btDbvtNode* leaf)
 		{
 			btSoftBody::Cluster*		cluster=(btSoftBody::Cluster*)leaf->data;
 			btSoftClusterCollisionShape	cshape(cluster);
 			
-			const btConvexShape*		rshape=(const btConvexShape*)m_colObj->getCollisionShape();
+			const btConvexShape*		rshape=(const btConvexShape*)m_colObjWrap->getCollisionShape();
 
 			///don't collide an anchored cluster with a static/kinematic object
-			if(m_colObj->isStaticOrKinematicObject() && cluster->m_containsAnchor)
+			if(m_colObjWrap->getCollisionObject()->isStaticOrKinematicObject() && cluster->m_containsAnchor)
 				return;
 
 			btGjkEpaSolver2::sResults	res;		
 			if(btGjkEpaSolver2::SignedDistance(	&cshape,btTransform::getIdentity(),
-				rshape,m_colObj->getWorldTransform(),
+				rshape,m_colObjWrap->getWorldTransform(),
 				btVector3(1,0,0),res))
 			{
 				btSoftBody::CJoint	joint;
-				if(SolveContact(res,cluster,m_colObj,joint))//prb,joint))
+				if(SolveContact(res,cluster,m_colObjWrap->getCollisionObject(),joint))//prb,joint))
 				{
 					btSoftBody::CJoint*	pj=new(btAlignedAlloc(sizeof(btSoftBody::CJoint),16)) btSoftBody::CJoint();
 					*pj=joint;psb->m_joints.push_back(pj);
-					if(m_colObj->isStaticOrKinematicObject())
+					if(m_colObjWrap->getCollisionObject()->isStaticOrKinematicObject())
 					{
 						pj->m_erp	*=	psb->m_cfg.kSKHR_CL;
 						pj->m_split	*=	psb->m_cfg.kSK_SPLT_CL;
@@ -753,19 +753,19 @@ struct btSoftColliders
 				}
 			}
 		}
-		void		Process(btSoftBody* ps,btCollisionObject* colOb)
+		void		Process(btSoftBody* ps,const btCollisionObjectWrapper* colObWrap)
 		{
 			psb			=	ps;
-			m_colObj			=	colOb;
+			m_colObjWrap			=	colObWrap;
 			idt			=	ps->m_sst.isdt;
-			m_margin		=	m_colObj->getCollisionShape()->getMargin()+psb->getCollisionShape()->getMargin();
+			m_margin		=	m_colObjWrap->getCollisionShape()->getMargin()+psb->getCollisionShape()->getMargin();
 			///Bullet rigid body uses multiply instead of minimum to determine combined friction. Some customization would be useful.
-			friction	=	btMin(psb->m_cfg.kDF,m_colObj->getFriction());
+			friction	=	btMin(psb->m_cfg.kDF,m_colObjWrap->getCollisionObject()->getFriction());
 			btVector3			mins;
 			btVector3			maxs;
 
 			ATTRIBUTE_ALIGNED16(btDbvtVolume)		volume;
-			colOb->getCollisionShape()->getAabb(colOb->getWorldTransform(),mins,maxs);
+			colObWrap->getCollisionShape()->getAabb(colObWrap->getWorldTransform(),mins,maxs);
 			volume=btDbvtVolume::FromMM(mins,maxs);
 			volume.Expand(btVector3(1,1,1)*m_margin);
 			ps->m_cdbvt.collideTV(ps->m_cdbvt.m_root,volume,*this);
@@ -840,15 +840,16 @@ struct btSoftColliders
 		{
 			const btScalar			m=n.m_im>0?dynmargin:stamargin;
 			btSoftBody::RContact	c;
+
 			if(	(!n.m_battach)&&
-				psb->checkContact(m_colObj1,n.m_x,m,c.m_cti))
+				psb->checkContact(m_colObj1Wrap,n.m_x,m,c.m_cti))
 			{
 				const btScalar	ima=n.m_im;
 				const btScalar	imb= m_rigidBody? m_rigidBody->getInvMass() : 0.f;
 				const btScalar	ms=ima+imb;
 				if(ms>0)
 				{
-					const btTransform&	wtr=m_rigidBody?m_rigidBody->getWorldTransform() : m_colObj1->getWorldTransform();
+					const btTransform&	wtr=m_rigidBody?m_rigidBody->getWorldTransform() : m_colObj1Wrap->getCollisionObject()->getWorldTransform();
 					static const btMatrix3x3	iwiStatic(0,0,0,0,0,0,0,0,0);
 					const btMatrix3x3&	iwi=m_rigidBody?m_rigidBody->getInvInertiaTensorWorld() : iwiStatic;
 					const btVector3		ra=n.m_x-wtr.getOrigin();
@@ -857,13 +858,13 @@ struct btSoftColliders
 					const btVector3		vr=vb-va;
 					const btScalar		dn=btDot(vr,c.m_cti.m_normal);
 					const btVector3		fv=vr-c.m_cti.m_normal*dn;
-					const btScalar		fc=psb->m_cfg.kDF*m_colObj1->getFriction();
+					const btScalar		fc=psb->m_cfg.kDF*m_colObj1Wrap->getCollisionObject()->getFriction();
 					c.m_node	=	&n;
 					c.m_c0		=	ImpulseMatrix(psb->m_sst.sdt,ima,imb,iwi,ra);
 					c.m_c1		=	ra;
 					c.m_c2		=	ima*psb->m_sst.sdt;
 					c.m_c3		=	fv.length2()<(btFabs(dn)*fc)?0:1-fc;
-					c.m_c4		=	m_colObj1->isStaticOrKinematicObject()?psb->m_cfg.kKHR:psb->m_cfg.kCHR;
+					c.m_c4		=	m_colObj1Wrap->getCollisionObject()->isStaticOrKinematicObject()?psb->m_cfg.kKHR:psb->m_cfg.kCHR;
 					psb->m_rcontacts.push_back(c);
 					if (m_rigidBody)
 						m_rigidBody->activate();
@@ -871,7 +872,7 @@ struct btSoftColliders
 			}
 		}
 		btSoftBody*		psb;
-		btCollisionObject*	m_colObj1;
+		const btCollisionObjectWrapper*	m_colObj1Wrap;
 		btRigidBody*	m_rigidBody;
 		btScalar		dynmargin;
 		btScalar		stamargin;
diff --git a/src/BulletSoftBody/btSoftBodySolverVertexBuffer.h b/src/BulletSoftBody/btSoftBodySolverVertexBuffer.h
index 4c75bf216..c4733d640 100644
--- a/src/BulletSoftBody/btSoftBodySolverVertexBuffer.h
+++ b/src/BulletSoftBody/btSoftBodySolverVertexBuffer.h
@@ -1,165 +1,165 @@
-/*
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
-
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-*/
-
-#ifndef BT_SOFT_BODY_SOLVER_VERTEX_BUFFER_H
-#define BT_SOFT_BODY_SOLVER_VERTEX_BUFFER_H
-
-
-class btVertexBufferDescriptor
-{
-public:
-	enum BufferTypes
-	{
-		CPU_BUFFER,
-		DX11_BUFFER,
-		OPENGL_BUFFER
-	};
-
-protected:	
-
-	bool m_hasVertexPositions;
-	bool m_hasNormals;
-
-	int m_vertexOffset;
-	int m_vertexStride;
-
-	int m_normalOffset;
-	int m_normalStride;
-
-public:
-	btVertexBufferDescriptor()
-	{
-		m_hasVertexPositions = false;
-		m_hasNormals = false;
-		m_vertexOffset = 0;
-		m_vertexStride = 0;
-		m_normalOffset = 0;
-		m_normalStride = 0;
-	}
-
-	virtual ~btVertexBufferDescriptor()
-	{
-
-	}
-
-	virtual bool hasVertexPositions() const
-	{
-		return m_hasVertexPositions;
-	}
-
-	virtual bool hasNormals() const
-	{
-		return m_hasNormals;
-	}
-
-	/**
-	 * Return the type of the vertex buffer descriptor.
-	 */
-	virtual BufferTypes getBufferType() const = 0;
-
-	/**
-	 * Return the vertex offset in floats from the base pointer.
-	 */
-	virtual int getVertexOffset() const
-	{
-		return m_vertexOffset;
-	}
-
-	/**
-	 * Return the vertex stride in number of floats between vertices.
-	 */
-	virtual int getVertexStride() const
-	{
-		return m_vertexStride;
-	}
-
-	/**
-	 * Return the vertex offset in floats from the base pointer.
-	 */
-	virtual int getNormalOffset() const
-	{
-		return m_normalOffset;
-	}
-
-	/**
-	 * Return the vertex stride in number of floats between vertices.
-	 */
-	virtual int getNormalStride() const
-	{
-		return m_normalStride;
-	}
-};
-
-
-class btCPUVertexBufferDescriptor : public btVertexBufferDescriptor
-{
-protected:
-	float *m_basePointer;
-
-public:
-	/**
-	 * vertexBasePointer is pointer to beginning of the buffer.
-	 * vertexOffset is the offset in floats to the first vertex.
-	 * vertexStride is the stride in floats between vertices.
-	 */
-	btCPUVertexBufferDescriptor( float *basePointer, int vertexOffset, int vertexStride )
-	{
-		m_basePointer = basePointer;
-		m_vertexOffset = vertexOffset;
-		m_vertexStride = vertexStride;
-		m_hasVertexPositions = true;
-	}
-
-	/**
-	 * vertexBasePointer is pointer to beginning of the buffer.
-	 * vertexOffset is the offset in floats to the first vertex.
-	 * vertexStride is the stride in floats between vertices.
-	 */
-	btCPUVertexBufferDescriptor( float *basePointer, int vertexOffset, int vertexStride, int normalOffset, int normalStride )
-	{
-		m_basePointer = basePointer;
-
-		m_vertexOffset = vertexOffset;
-		m_vertexStride = vertexStride;
-		m_hasVertexPositions = true;
-
-		m_normalOffset = normalOffset;
-		m_normalStride = normalStride;
-		m_hasNormals = true;
-	}
-
-	virtual ~btCPUVertexBufferDescriptor()
-	{
-
-	}
-
-	/**
-	 * Return the type of the vertex buffer descriptor.
-	 */
-	virtual BufferTypes getBufferType() const
-	{
-		return CPU_BUFFER;
-	}
-
-	/**
-	 * Return the base pointer in memory to the first vertex.
-	 */
-	virtual float *getBasePointer() const
-	{
-		return m_basePointer;
-	}
-};
-
-#endif // #ifndef BT_SOFT_BODY_SOLVER_VERTEX_BUFFER_H
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef BT_SOFT_BODY_SOLVER_VERTEX_BUFFER_H
+#define BT_SOFT_BODY_SOLVER_VERTEX_BUFFER_H
+
+
+class btVertexBufferDescriptor
+{
+public:
+	enum BufferTypes
+	{
+		CPU_BUFFER,
+		DX11_BUFFER,
+		OPENGL_BUFFER
+	};
+
+protected:	
+
+	bool m_hasVertexPositions;
+	bool m_hasNormals;
+
+	int m_vertexOffset;
+	int m_vertexStride;
+
+	int m_normalOffset;
+	int m_normalStride;
+
+public:
+	btVertexBufferDescriptor()
+	{
+		m_hasVertexPositions = false;
+		m_hasNormals = false;
+		m_vertexOffset = 0;
+		m_vertexStride = 0;
+		m_normalOffset = 0;
+		m_normalStride = 0;
+	}
+
+	virtual ~btVertexBufferDescriptor()
+	{
+
+	}
+
+	virtual bool hasVertexPositions() const
+	{
+		return m_hasVertexPositions;
+	}
+
+	virtual bool hasNormals() const
+	{
+		return m_hasNormals;
+	}
+
+	/**
+	 * Return the type of the vertex buffer descriptor.
+	 */
+	virtual BufferTypes getBufferType() const = 0;
+
+	/**
+	 * Return the vertex offset in floats from the base pointer.
+	 */
+	virtual int getVertexOffset() const
+	{
+		return m_vertexOffset;
+	}
+
+	/**
+	 * Return the vertex stride in number of floats between vertices.
+	 */
+	virtual int getVertexStride() const
+	{
+		return m_vertexStride;
+	}
+
+	/**
+	 * Return the vertex offset in floats from the base pointer.
+	 */
+	virtual int getNormalOffset() const
+	{
+		return m_normalOffset;
+	}
+
+	/**
+	 * Return the vertex stride in number of floats between vertices.
+	 */
+	virtual int getNormalStride() const
+	{
+		return m_normalStride;
+	}
+};
+
+
+class btCPUVertexBufferDescriptor : public btVertexBufferDescriptor
+{
+protected:
+	float *m_basePointer;
+
+public:
+	/**
+	 * vertexBasePointer is pointer to beginning of the buffer.
+	 * vertexOffset is the offset in floats to the first vertex.
+	 * vertexStride is the stride in floats between vertices.
+	 */
+	btCPUVertexBufferDescriptor( float *basePointer, int vertexOffset, int vertexStride )
+	{
+		m_basePointer = basePointer;
+		m_vertexOffset = vertexOffset;
+		m_vertexStride = vertexStride;
+		m_hasVertexPositions = true;
+	}
+
+	/**
+	 * vertexBasePointer is pointer to beginning of the buffer.
+	 * vertexOffset is the offset in floats to the first vertex.
+	 * vertexStride is the stride in floats between vertices.
+	 */
+	btCPUVertexBufferDescriptor( float *basePointer, int vertexOffset, int vertexStride, int normalOffset, int normalStride )
+	{
+		m_basePointer = basePointer;
+
+		m_vertexOffset = vertexOffset;
+		m_vertexStride = vertexStride;
+		m_hasVertexPositions = true;
+
+		m_normalOffset = normalOffset;
+		m_normalStride = normalStride;
+		m_hasNormals = true;
+	}
+
+	virtual ~btCPUVertexBufferDescriptor()
+	{
+
+	}
+
+	/**
+	 * Return the type of the vertex buffer descriptor.
+	 */
+	virtual BufferTypes getBufferType() const
+	{
+		return CPU_BUFFER;
+	}
+
+	/**
+	 * Return the base pointer in memory to the first vertex.
+	 */
+	virtual float *getBasePointer() const
+	{
+		return m_basePointer;
+	}
+};
+
+#endif // #ifndef BT_SOFT_BODY_SOLVER_VERTEX_BUFFER_H
diff --git a/src/BulletSoftBody/btSoftBodySolvers.h b/src/BulletSoftBody/btSoftBodySolvers.h
index 824b7e985..6947bc27d 100644
--- a/src/BulletSoftBody/btSoftBodySolvers.h
+++ b/src/BulletSoftBody/btSoftBodySolvers.h
@@ -1,154 +1,154 @@
-/*
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
-
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-*/
-
-#ifndef BT_SOFT_BODY_SOLVERS_H
-#define BT_SOFT_BODY_SOLVERS_H
-
-#include "BulletCollision/CollisionShapes/btTriangleIndexVertexArray.h"
-
-
-class btSoftBodyTriangleData;
-class btSoftBodyLinkData;
-class btSoftBodyVertexData;
-class btVertexBufferDescriptor;
-class btCollisionObject;
-class btSoftBody;
-
-
-class btSoftBodySolver
-{
-public:
-	enum SolverTypes
-	{
-		DEFAULT_SOLVER,
-		CPU_SOLVER,
-		CL_SOLVER,
-		CL_SIMD_SOLVER,
-		DX_SOLVER,
-		DX_SIMD_SOLVER
-	};
-
-
-protected:
-	int m_numberOfPositionIterations;
-	int m_numberOfVelocityIterations;
-	// Simulation timescale
-	float m_timeScale;
-	
-public:
-	btSoftBodySolver() :
-		m_numberOfPositionIterations( 10 ),
-		m_timeScale( 1 )
-	{
-		m_numberOfVelocityIterations = 0;
-		m_numberOfPositionIterations = 5;
-	}
-
-	virtual ~btSoftBodySolver()
-	{
-	}
-	
-	/**
-	 * Return the type of the solver.
-	 */
-	virtual SolverTypes getSolverType() const = 0;
-
-
-	/** Ensure that this solver is initialized. */
-	virtual bool checkInitialized() = 0;
-
-	/** Optimize soft bodies in this solver. */
-	virtual void optimize( btAlignedObjectArray< btSoftBody * > &softBodies , bool forceUpdate=false) = 0;
-
-	/** Copy necessary data back to the original soft body source objects. */
-	virtual void copyBackToSoftBodies(bool bMove = true) = 0;
-
-	/** Predict motion of soft bodies into next timestep */
-	virtual void predictMotion( float solverdt ) = 0;
-
-	/** Solve constraints for a set of soft bodies */
-	virtual void solveConstraints( float solverdt ) = 0;
-
-	/** Perform necessary per-step updates of soft bodies such as recomputing normals and bounding boxes */
-	virtual void updateSoftBodies() = 0;
-
-	/** Process a collision between one of the world's soft bodies and another collision object */
-	virtual void processCollision( btSoftBody *, btCollisionObject* ) = 0;
-
-	/** Process a collision between two soft bodies */
-	virtual void processCollision( btSoftBody*, btSoftBody* ) = 0;
-
-	/** Set the number of velocity constraint solver iterations this solver uses. */
-	virtual void setNumberOfPositionIterations( int iterations )
-	{
-		m_numberOfPositionIterations = iterations;
-	}
-
-	/** Get the number of velocity constraint solver iterations this solver uses. */
-	virtual int getNumberOfPositionIterations()
-	{
-		return m_numberOfPositionIterations;
-	}
-
-	/** Set the number of velocity constraint solver iterations this solver uses. */
-	virtual void setNumberOfVelocityIterations( int iterations )
-	{
-		m_numberOfVelocityIterations = iterations;
-	}
-
-	/** Get the number of velocity constraint solver iterations this solver uses. */
-	virtual int getNumberOfVelocityIterations()
-	{
-		return m_numberOfVelocityIterations;
-	}
-
-	/** Return the timescale that the simulation is using */
-	float getTimeScale()
-	{
-		return m_timeScale;
-	}
-
-#if 0
-	/**
-	 * Add a collision object to be used by the indicated softbody.
-	 */
-	virtual void addCollisionObjectForSoftBody( int clothIdentifier, btCollisionObject *collisionObject ) = 0;
-#endif
-};
-
-/** 
- * Class to manage movement of data from a solver to a given target.
- * This version is abstract. Subclasses will have custom pairings for different combinations.
- */
-class btSoftBodySolverOutput
-{
-protected:
-
-public:
-	btSoftBodySolverOutput()
-	{
-	}
-
-	virtual ~btSoftBodySolverOutput()
-	{
-	}
-
-
-	/** Output current computed vertex data to the vertex buffers for all cloths in the solver. */
-	virtual void copySoftBodyToVertexBuffer( const btSoftBody * const softBody, btVertexBufferDescriptor *vertexBuffer ) = 0;
-};
-
-
-#endif // #ifndef BT_SOFT_BODY_SOLVERS_H
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef BT_SOFT_BODY_SOLVERS_H
+#define BT_SOFT_BODY_SOLVERS_H
+
+#include "BulletCollision/CollisionShapes/btTriangleIndexVertexArray.h"
+
+
+class btSoftBodyTriangleData;
+class btSoftBodyLinkData;
+class btSoftBodyVertexData;
+class btVertexBufferDescriptor;
+class btCollisionObject;
+class btSoftBody;
+
+
+class btSoftBodySolver
+{
+public:
+	enum SolverTypes
+	{
+		DEFAULT_SOLVER,
+		CPU_SOLVER,
+		CL_SOLVER,
+		CL_SIMD_SOLVER,
+		DX_SOLVER,
+		DX_SIMD_SOLVER
+	};
+
+
+protected:
+	int m_numberOfPositionIterations;
+	int m_numberOfVelocityIterations;
+	// Simulation timescale
+	float m_timeScale;
+	
+public:
+	btSoftBodySolver() :
+		m_numberOfPositionIterations( 10 ),
+		m_timeScale( 1 )
+	{
+		m_numberOfVelocityIterations = 0;
+		m_numberOfPositionIterations = 5;
+	}
+
+	virtual ~btSoftBodySolver()
+	{
+	}
+	
+	/**
+	 * Return the type of the solver.
+	 */
+	virtual SolverTypes getSolverType() const = 0;
+
+
+	/** Ensure that this solver is initialized. */
+	virtual bool checkInitialized() = 0;
+
+	/** Optimize soft bodies in this solver. */
+	virtual void optimize( btAlignedObjectArray< btSoftBody * > &softBodies , bool forceUpdate=false) = 0;
+
+	/** Copy necessary data back to the original soft body source objects. */
+	virtual void copyBackToSoftBodies(bool bMove = true) = 0;
+
+	/** Predict motion of soft bodies into next timestep */
+	virtual void predictMotion( float solverdt ) = 0;
+
+	/** Solve constraints for a set of soft bodies */
+	virtual void solveConstraints( float solverdt ) = 0;
+
+	/** Perform necessary per-step updates of soft bodies such as recomputing normals and bounding boxes */
+	virtual void updateSoftBodies() = 0;
+
+	/** Process a collision between one of the world's soft bodies and another collision object */
+	virtual void processCollision( btSoftBody *, const struct btCollisionObjectWrapper* ) = 0;
+
+	/** Process a collision between two soft bodies */
+	virtual void processCollision( btSoftBody*, btSoftBody* ) = 0;
+
+	/** Set the number of velocity constraint solver iterations this solver uses. */
+	virtual void setNumberOfPositionIterations( int iterations )
+	{
+		m_numberOfPositionIterations = iterations;
+	}
+
+	/** Get the number of velocity constraint solver iterations this solver uses. */
+	virtual int getNumberOfPositionIterations()
+	{
+		return m_numberOfPositionIterations;
+	}
+
+	/** Set the number of velocity constraint solver iterations this solver uses. */
+	virtual void setNumberOfVelocityIterations( int iterations )
+	{
+		m_numberOfVelocityIterations = iterations;
+	}
+
+	/** Get the number of velocity constraint solver iterations this solver uses. */
+	virtual int getNumberOfVelocityIterations()
+	{
+		return m_numberOfVelocityIterations;
+	}
+
+	/** Return the timescale that the simulation is using */
+	float getTimeScale()
+	{
+		return m_timeScale;
+	}
+
+#if 0
+	/**
+	 * Add a collision object to be used by the indicated softbody.
+	 */
+	virtual void addCollisionObjectForSoftBody( int clothIdentifier, btCollisionObject *collisionObject ) = 0;
+#endif
+};
+
+/** 
+ * Class to manage movement of data from a solver to a given target.
+ * This version is abstract. Subclasses will have custom pairings for different combinations.
+ */
+class btSoftBodySolverOutput
+{
+protected:
+
+public:
+	btSoftBodySolverOutput()
+	{
+	}
+
+	virtual ~btSoftBodySolverOutput()
+	{
+	}
+
+
+	/** Output current computed vertex data to the vertex buffers for all cloths in the solver. */
+	virtual void copySoftBodyToVertexBuffer( const btSoftBody * const softBody, btVertexBufferDescriptor *vertexBuffer ) = 0;
+};
+
+
+#endif // #ifndef BT_SOFT_BODY_SOLVERS_H
diff --git a/src/BulletSoftBody/btSoftRigidCollisionAlgorithm.cpp b/src/BulletSoftBody/btSoftRigidCollisionAlgorithm.cpp
index bc374c805..e3696cd8a 100644
--- a/src/BulletSoftBody/btSoftRigidCollisionAlgorithm.cpp
+++ b/src/BulletSoftBody/btSoftRigidCollisionAlgorithm.cpp
@@ -20,13 +20,14 @@ subject to the following restrictions:
 #include "BulletCollision/CollisionDispatch/btCollisionObject.h"
 #include "btSoftBody.h"
 #include "BulletSoftBody/btSoftBodySolvers.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h"
 
 ///TODO: include all the shapes that the softbody can collide with
 ///alternatively, implement special case collision algorithms (just like for rigid collision shapes)
 
 //#include <stdio.h>
 
-btSoftRigidCollisionAlgorithm::btSoftRigidCollisionAlgorithm(btPersistentManifold* /*mf*/,const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* /*col0*/,btCollisionObject* /*col1*/, bool isSwapped)
+btSoftRigidCollisionAlgorithm::btSoftRigidCollisionAlgorithm(btPersistentManifold* /*mf*/,const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* ,const btCollisionObjectWrapper* , bool isSwapped)
 : btCollisionAlgorithm(ci),
 //m_ownManifold(false),
 //m_manifoldPtr(mf),
@@ -52,18 +53,19 @@ btSoftRigidCollisionAlgorithm::~btSoftRigidCollisionAlgorithm()
 
 #include <stdio.h>
 
-void btSoftRigidCollisionAlgorithm::processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
+void btSoftRigidCollisionAlgorithm::processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
 {
 	(void)dispatchInfo;
 	(void)resultOut;
 	//printf("btSoftRigidCollisionAlgorithm\n");
-
-	btSoftBody* softBody =  m_isSwapped? (btSoftBody*)body1 : (btSoftBody*)body0;
-	btCollisionObject* rigidCollisionObject = m_isSwapped? body0 : body1;
+	const btCollisionObjectWrapper* softWrap = m_isSwapped?body1Wrap:body0Wrap;
+	const btCollisionObjectWrapper* rigidWrap = m_isSwapped?body0Wrap:body1Wrap;
+	btSoftBody* softBody =  m_isSwapped? (btSoftBody*)body1Wrap->getCollisionObject() : (btSoftBody*)body0Wrap->getCollisionObject();
+	const btCollisionObjectWrapper* rigidCollisionObjectWrap = m_isSwapped? body0Wrap : body1Wrap;
 	
-	if (softBody->m_collisionDisabledObjects.findLinearSearch(rigidCollisionObject)==softBody->m_collisionDisabledObjects.size())
+	if (softBody->m_collisionDisabledObjects.findLinearSearch(rigidCollisionObjectWrap->getCollisionObject())==softBody->m_collisionDisabledObjects.size())
 	{
-		softBody->getSoftBodySolver()->processCollision(softBody, rigidCollisionObject);
+		softBody->getSoftBodySolver()->processCollision(softBody, rigidCollisionObjectWrap);
 	}
 
 
diff --git a/src/BulletSoftBody/btSoftRigidCollisionAlgorithm.h b/src/BulletSoftBody/btSoftRigidCollisionAlgorithm.h
index 7658e3c22..a9b513e36 100644
--- a/src/BulletSoftBody/btSoftRigidCollisionAlgorithm.h
+++ b/src/BulletSoftBody/btSoftRigidCollisionAlgorithm.h
@@ -39,11 +39,11 @@ class btSoftRigidCollisionAlgorithm : public btCollisionAlgorithm
 
 public:
 
-	btSoftRigidCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* col0,btCollisionObject* col1, bool isSwapped);
+	btSoftRigidCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* col0,const btCollisionObjectWrapper* col1Wrap, bool isSwapped);
 
 	virtual ~btSoftRigidCollisionAlgorithm();
 
-	virtual void processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
+	virtual void processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 
 	virtual btScalar calculateTimeOfImpact(btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 
@@ -55,15 +55,15 @@ public:
 
 	struct CreateFunc :public 	btCollisionAlgorithmCreateFunc
 	{
-		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, btCollisionObject* body0,btCollisionObject* body1)
+		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
 		{
 			void* mem = ci.m_dispatcher1->allocateCollisionAlgorithm(sizeof(btSoftRigidCollisionAlgorithm));
 			if (!m_swapped)
 			{
-				return new(mem) btSoftRigidCollisionAlgorithm(0,ci,body0,body1,false);
+				return new(mem) btSoftRigidCollisionAlgorithm(0,ci,body0Wrap,body1Wrap,false);
 			} else
 			{
-				return new(mem) btSoftRigidCollisionAlgorithm(0,ci,body0,body1,true);
+				return new(mem) btSoftRigidCollisionAlgorithm(0,ci,body0Wrap,body1Wrap,true);
 			}
 		}
 	};
diff --git a/src/BulletSoftBody/btSoftSoftCollisionAlgorithm.cpp b/src/BulletSoftBody/btSoftSoftCollisionAlgorithm.cpp
index 1b8cfa723..72043e69e 100644
--- a/src/BulletSoftBody/btSoftSoftCollisionAlgorithm.cpp
+++ b/src/BulletSoftBody/btSoftSoftCollisionAlgorithm.cpp
@@ -19,10 +19,11 @@ subject to the following restrictions:
 #include "BulletCollision/CollisionDispatch/btCollisionObject.h"
 #include "BulletSoftBody/btSoftBodySolvers.h"
 #include "btSoftBody.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h"
 
 #define USE_PERSISTENT_CONTACTS 1
 
-btSoftSoftCollisionAlgorithm::btSoftSoftCollisionAlgorithm(btPersistentManifold* /*mf*/,const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* /*obj0*/,btCollisionObject* /*obj1*/)
+btSoftSoftCollisionAlgorithm::btSoftSoftCollisionAlgorithm(btPersistentManifold* /*mf*/,const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* /*obj0*/,const btCollisionObjectWrapper* /*obj1*/)
 : btCollisionAlgorithm(ci)
 //m_ownManifold(false),
 //m_manifoldPtr(mf)
@@ -33,10 +34,10 @@ btSoftSoftCollisionAlgorithm::~btSoftSoftCollisionAlgorithm()
 {
 }
 
-void btSoftSoftCollisionAlgorithm::processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& /*dispatchInfo*/,btManifoldResult* /*resultOut*/)
+void btSoftSoftCollisionAlgorithm::processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& /*dispatchInfo*/,btManifoldResult* /*resultOut*/)
 {
-	btSoftBody* soft0 =	(btSoftBody*)body0;
-	btSoftBody* soft1 =	(btSoftBody*)body1;
+	btSoftBody* soft0 =	(btSoftBody*)body0Wrap->getCollisionObject();
+	btSoftBody* soft1 =	(btSoftBody*)body1Wrap->getCollisionObject();
 	soft0->getSoftBodySolver()->processCollision(soft0, soft1);
 }
 
diff --git a/src/BulletSoftBody/btSoftSoftCollisionAlgorithm.h b/src/BulletSoftBody/btSoftSoftCollisionAlgorithm.h
index 92d683c1d..43b1439cc 100644
--- a/src/BulletSoftBody/btSoftSoftCollisionAlgorithm.h
+++ b/src/BulletSoftBody/btSoftSoftCollisionAlgorithm.h
@@ -38,7 +38,7 @@ public:
 	btSoftSoftCollisionAlgorithm(const btCollisionAlgorithmConstructionInfo& ci)
 		: btCollisionAlgorithm(ci) {}
 
-	virtual void processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
+	virtual void processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 
 	virtual btScalar calculateTimeOfImpact(btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
 
@@ -48,17 +48,17 @@ public:
 			manifoldArray.push_back(m_manifoldPtr);
 	}
 
-	btSoftSoftCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* body0,btCollisionObject* body1);
+	btSoftSoftCollisionAlgorithm(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap);
 
 	virtual ~btSoftSoftCollisionAlgorithm();
 
 	struct CreateFunc :public 	btCollisionAlgorithmCreateFunc
 	{
-		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, btCollisionObject* body0,btCollisionObject* body1)
+		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
 		{
 			int bbsize = sizeof(btSoftSoftCollisionAlgorithm);
 			void* ptr = ci.m_dispatcher1->allocateCollisionAlgorithm(bbsize);
-			return new(ptr) btSoftSoftCollisionAlgorithm(0,ci,body0,body1);
+			return new(ptr) btSoftSoftCollisionAlgorithm(0,ci,body0Wrap,body1Wrap);
 		}
 	};
 
diff --git a/src/BulletSoftBody/btSparseSDF.h b/src/BulletSoftBody/btSparseSDF.h
index 90a26cdf7..180e3c218 100644
--- a/src/BulletSoftBody/btSparseSDF.h
+++ b/src/BulletSoftBody/btSparseSDF.h
@@ -58,7 +58,7 @@ struct	btSparseSdf
 		int					c[3];
 		int					puid;
 		unsigned			hash;
-		btCollisionShape*	pclient;
+		const btCollisionShape*	pclient;
 		Cell*				next;
 	};
 	//
@@ -152,7 +152,7 @@ struct	btSparseSdf
 	}
 	//
 	btScalar				Evaluate(	const btVector3& x,
-		btCollisionShape* shape,
+		const btCollisionShape* shape,
 		btVector3& normal,
 		btScalar margin)
 	{
@@ -248,14 +248,14 @@ struct	btSparseSdf
 	}
 	//
 	static inline btScalar	DistanceToShape(const btVector3& x,
-		btCollisionShape* shape)
+		const btCollisionShape* shape)
 	{
 		btTransform	unit;
 		unit.setIdentity();
 		if(shape->isConvex())
 		{
 			btGjkEpaSolver2::sResults	res;
-			btConvexShape*				csh=static_cast<btConvexShape*>(shape);
+			const btConvexShape*				csh=static_cast<const btConvexShape*>(shape);
 			return(btGjkEpaSolver2::SignedDistance(x,0,csh,unit,res));
 		}
 		return(0);
@@ -282,7 +282,7 @@ struct	btSparseSdf
 
 
 	//
-	static inline unsigned int	Hash(int x,int y,int z,btCollisionShape* shape)
+	static inline unsigned int	Hash(int x,int y,int z,const btCollisionShape* shape)
 	{
 		struct btS
 		{ 
@@ -292,7 +292,7 @@ struct	btSparseSdf
 
 		btS myset;
 
-		myset.x=x;myset.y=y;myset.z=z;myset.p=shape;
+		myset.x=x;myset.y=y;myset.z=z;myset.p=(void*)shape;
 		const void* ptr = &myset;
 
 		unsigned int result = HsiehHash<sizeof(btS)/4> (ptr);
diff --git a/src/LinearMath/CMakeLists.txt b/src/LinearMath/CMakeLists.txt
index 4cbefc8c0..7a5fc445e 100644
--- a/src/LinearMath/CMakeLists.txt
+++ b/src/LinearMath/CMakeLists.txt
@@ -10,6 +10,7 @@ SET(LinearMath_SRCS
 	btGeometryUtil.cpp
 	btQuickprof.cpp
 	btSerializer.cpp
+	btVector3.cpp
 )
 
 SET(LinearMath_HDRS
diff --git a/src/LinearMath/btAabbUtil2.h b/src/LinearMath/btAabbUtil2.h
index 42b721dea..d2997b4e6 100644
--- a/src/LinearMath/btAabbUtil2.h
+++ b/src/LinearMath/btAabbUtil2.h
@@ -184,9 +184,7 @@ SIMD_FORCE_INLINE	void btTransformAabb(const btVector3& halfExtents, btScalar ma
 	btVector3 halfExtentsWithMargin = halfExtents+btVector3(margin,margin,margin);
 	btMatrix3x3 abs_b = t.getBasis().absolute();  
 	btVector3 center = t.getOrigin();
-	btVector3 extent = btVector3(abs_b[0].dot(halfExtentsWithMargin),
-		   abs_b[1].dot(halfExtentsWithMargin),
-		  abs_b[2].dot(halfExtentsWithMargin));
+    btVector3 extent = halfExtentsWithMargin.dot3( abs_b[0], abs_b[1], abs_b[2] );
 	aabbMinOut = center - extent;
 	aabbMaxOut = center + extent;
 }
@@ -203,9 +201,7 @@ SIMD_FORCE_INLINE	void btTransformAabb(const btVector3& localAabbMin,const btVec
 		btVector3 localCenter = btScalar(0.5)*(localAabbMax+localAabbMin);
 		btMatrix3x3 abs_b = trans.getBasis().absolute();  
 		btVector3 center = trans(localCenter);
-		btVector3 extent = btVector3(abs_b[0].dot(localHalfExtents),
-			   abs_b[1].dot(localHalfExtents),
-			  abs_b[2].dot(localHalfExtents));
+        btVector3 extent = localHalfExtents.dot3( abs_b[0], abs_b[1], abs_b[2] );
 		aabbMinOut = center-extent;
 		aabbMaxOut = center+extent;
 }
diff --git a/src/LinearMath/btAlignedAllocator.cpp b/src/LinearMath/btAlignedAllocator.cpp
index c4c0ceb2e..a65296c6a 100644
--- a/src/LinearMath/btAlignedAllocator.cpp
+++ b/src/LinearMath/btAlignedAllocator.cpp
@@ -119,7 +119,7 @@ void*   btAlignedAllocInternal  (size_t size, int alignment,int line,char* filen
  
  real = (char *)sAllocFunc(size + 2*sizeof(void *) + (alignment-1));
  if (real) {
-   ret = (void*) btAlignPointer((real + 2*sizeof(void *), alignment);
+   ret = (void*) btAlignPointer(real + 2*sizeof(void *), alignment);
    *((void **)(ret)-1) = (void *)(real);
        *((int*)(ret)-2) = size;
 
diff --git a/src/LinearMath/btAlignedObjectArray.h b/src/LinearMath/btAlignedObjectArray.h
index 36090e13c..24e59ab65 100644
--- a/src/LinearMath/btAlignedObjectArray.h
+++ b/src/LinearMath/btAlignedObjectArray.h
@@ -197,8 +197,26 @@ protected:
 			m_data[m_size].~T();
 		}
 
+
 		///resize changes the number of elements in the array. If the new size is larger, the new elements will be constructed using the optional second argument.
 		///when the new number of elements is smaller, the destructor will be called, but memory will not be freed, to reduce performance overhead of run-time memory (de)allocations.
+		SIMD_FORCE_INLINE	void	resizeNoInitialize(int newsize)
+		{
+			int curSize = size();
+
+			if (newsize < curSize)
+			{
+			} else
+			{
+				if (newsize > size())
+				{
+					reserve(newsize);
+				}
+				//leave this uninitialized
+			}
+			m_size = newsize;
+		}
+	
 		SIMD_FORCE_INLINE	void	resize(int newsize, const T& fillData=T())
 		{
 			int curSize = size();
@@ -226,7 +244,6 @@ protected:
 
 			m_size = newsize;
 		}
-	
 		SIMD_FORCE_INLINE	T&  expandNonInitializing( )
 		{	
 			int sz = size();
diff --git a/src/LinearMath/btConvexHull.cpp b/src/LinearMath/btConvexHull.cpp
index 532d76d88..e57f8d88a 100644
--- a/src/LinearMath/btConvexHull.cpp
+++ b/src/LinearMath/btConvexHull.cpp
@@ -22,13 +22,6 @@ subject to the following restrictions:
 
 
 
-template <class T>
-void Swap(T &a,T &b)
-{
-	T tmp = a;
-	a=b;
-	b=tmp;
-}
 
 
 //----------------------------------
@@ -518,7 +511,7 @@ int4 HullLibrary::FindSimplex(btVector3 *verts,int verts_count,btAlignedObjectAr
 	if(p3==p0||p3==p1||p3==p2) 
 		return int4(-1,-1,-1,-1);
 	btAssert(!(p0==p1||p0==p2||p0==p3||p1==p2||p1==p3||p2==p3));
-	if(btDot(verts[p3]-verts[p0],btCross(verts[p1]-verts[p0],verts[p2]-verts[p0])) <0) {Swap(p2,p3);}
+	if(btDot(verts[p3]-verts[p0],btCross(verts[p1]-verts[p0],verts[p2]-verts[p0])) <0) {btSwap(p2,p3);}
 	return int4(p0,p1,p2,p3);
 }
 
diff --git a/src/LinearMath/btConvexHullComputer.cpp b/src/LinearMath/btConvexHullComputer.cpp
index b47cb81f2..c03c901c0 100644
--- a/src/LinearMath/btConvexHullComputer.cpp
+++ b/src/LinearMath/btConvexHullComputer.cpp
@@ -1,2751 +1,2751 @@
-/*
-Copyright (c) 2011 Ole Kniemeyer, MAXON, www.maxon.net
-
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-*/
-
-#include <string.h>
-
-#include "btConvexHullComputer.h"
-#include "btAlignedObjectArray.h"
-#include "btMinMax.h"
-#include "btVector3.h"
-
-#ifdef __GNUC__
-	#include <stdint.h>
-#elif defined(_MSC_VER)
-	typedef __int32 int32_t;
-	typedef __int64 int64_t;
-	typedef unsigned __int32 uint32_t;
-	typedef unsigned __int64 uint64_t;
-#else
-	typedef int int32_t;
-	typedef long long int int64_t;
-	typedef unsigned int uint32_t;
-	typedef unsigned long long int uint64_t;
-#endif
-
-
-//The definition of USE_X86_64_ASM is moved into the build system. You can enable it manually by commenting out the following lines
-//#if (defined(__GNUC__) && defined(__x86_64__) && !defined(__ICL))  // || (defined(__ICL) && defined(_M_X64))   bug in Intel compiler, disable inline assembly
-//	#define USE_X86_64_ASM
-//#endif
-
-
-//#define DEBUG_CONVEX_HULL
-//#define SHOW_ITERATIONS
-
-#if defined(DEBUG_CONVEX_HULL) || defined(SHOW_ITERATIONS)
-	#include <stdio.h>
-#endif
-
-// Convex hull implementation based on Preparata and Hong
-// Ole Kniemeyer, MAXON Computer GmbH
-class btConvexHullInternal
-{
-	public:
-		
-		class Point64
-		{
-			public:
-				int64_t x;
-				int64_t y;
-				int64_t z;
-				
-				Point64(int64_t x, int64_t y, int64_t z): x(x), y(y), z(z)
-				{
-				}
-
-				bool isZero()
-				{
-					return (x == 0) && (y == 0) && (z == 0);
-				}
-
-				int64_t dot(const Point64& b) const
-				{
-					return x * b.x + y * b.y + z * b.z;
-				}
-		};
-		
-		class Point32
-		{
-			public:
-				int32_t x;
-				int32_t y;
-				int32_t z;
-				int index;
-				
-				Point32()
-				{
-				}
-				
-				Point32(int32_t x, int32_t y, int32_t z): x(x), y(y), z(z), index(-1)
-				{
-				}
-				
-				bool operator==(const Point32& b) const
-				{
-					return (x == b.x) && (y == b.y) && (z == b.z);
-				}
-
-				bool operator!=(const Point32& b) const
-				{
-					return (x != b.x) || (y != b.y) || (z != b.z);
-				}
-
-				bool isZero()
-				{
-					return (x == 0) && (y == 0) && (z == 0);
-				}
-
-				Point64 cross(const Point32& b) const
-				{
-					return Point64(y * b.z - z * b.y, z * b.x - x * b.z, x * b.y - y * b.x);
-				}
-
-				Point64 cross(const Point64& b) const
-				{
-					return Point64(y * b.z - z * b.y, z * b.x - x * b.z, x * b.y - y * b.x);
-				}
-
-				int64_t dot(const Point32& b) const
-				{
-					return x * b.x + y * b.y + z * b.z;
-				}
-
-				int64_t dot(const Point64& b) const
-				{
-					return x * b.x + y * b.y + z * b.z;
-				}
-
-				Point32 operator+(const Point32& b) const
-				{
-					return Point32(x + b.x, y + b.y, z + b.z);
-				}
-
-				Point32 operator-(const Point32& b) const
-				{
-					return Point32(x - b.x, y - b.y, z - b.z);
-				}
-		};
-
-		class Int128
-		{
-			public:
-				uint64_t low;
-				uint64_t high;
-
-				Int128()
-				{
-				}
-
-				Int128(uint64_t low, uint64_t high): low(low), high(high)
-				{
-				}
-
-				Int128(uint64_t low): low(low), high(0)
-				{
-				}
-
-				Int128(int64_t value): low(value), high((value >= 0) ? 0 : (uint64_t) -1LL)
-				{
-				}
-
-				static Int128 mul(int64_t a, int64_t b);
-
-				static Int128 mul(uint64_t a, uint64_t b);
-
-				Int128 operator-() const
-				{
-					return Int128((uint64_t) -(int64_t)low, ~high + (low == 0));
-				}
-
-				Int128 operator+(const Int128& b) const
-				{
-#ifdef USE_X86_64_ASM
-					Int128 result;
-					__asm__ ("addq %[bl], %[rl]\n\t"
-									 "adcq %[bh], %[rh]\n\t"
-									 : [rl] "=r" (result.low), [rh] "=r" (result.high)
-									 : "0"(low), "1"(high), [bl] "g"(b.low), [bh] "g"(b.high)
-									 : "cc" );
-					return result;
-#else
-					uint64_t lo = low + b.low;
-					return Int128(lo, high + b.high + (lo < low));
-#endif
-				}
-
-				Int128 operator-(const Int128& b) const
-				{
-#ifdef USE_X86_64_ASM
-					Int128 result;
-					__asm__ ("subq %[bl], %[rl]\n\t"
-									 "sbbq %[bh], %[rh]\n\t"
-									 : [rl] "=r" (result.low), [rh] "=r" (result.high)
-									 : "0"(low), "1"(high), [bl] "g"(b.low), [bh] "g"(b.high)
-									 : "cc" );
-					return result;
-#else
-					return *this + -b;
-#endif
-				}
-
-				Int128& operator+=(const Int128& b)
-				{
-#ifdef USE_X86_64_ASM
-					__asm__ ("addq %[bl], %[rl]\n\t"
-									 "adcq %[bh], %[rh]\n\t"
-									 : [rl] "=r" (low), [rh] "=r" (high)
-									 : "0"(low), "1"(high), [bl] "g"(b.low), [bh] "g"(b.high)
-									 : "cc" );
-#else
-					uint64_t lo = low + b.low;
-					if (lo < low)
-					{
-						++high;
-					}
-					low = lo;
-					high += b.high;
-#endif
-					return *this;
-				}
-
-				Int128& operator++()
-				{
-					if (++low == 0)
-					{
-						++high;
-					}
-					return *this;
-				}
-
-				Int128 operator*(int64_t b) const;
-
-				btScalar toScalar() const
-				{
-					return ((int64_t) high >= 0) ? btScalar(high) * (btScalar(0x100000000LL) * btScalar(0x100000000LL)) + btScalar(low)
-						: -(-*this).toScalar();
-				}
-
-				int getSign() const
-				{
-					return ((int64_t) high < 0) ? -1 : (high || low) ? 1 : 0;
-				}
-
-				bool operator<(const Int128& b) const
-				{
-					return (high < b.high) || ((high == b.high) && (low < b.low));
-				}
-
-				int ucmp(const Int128&b) const
-				{
-					if (high < b.high)
-					{
-						return -1;
-					}
-					if (high > b.high)
-					{
-						return 1;
-					}
-					if (low < b.low)
-					{
-						return -1;
-					}
-					if (low > b.low)
-					{
-						return 1;
-					}
-					return 0;
-				}
-		};
-
-
-		class Rational64
-		{
-			private:
-				uint64_t m_numerator;
-				uint64_t m_denominator;
-				int sign;
-				
-			public:
-				Rational64(int64_t numerator, int64_t denominator)
-				{
-					if (numerator > 0)
-					{
-						sign = 1;
-						m_numerator = (uint64_t) numerator;
-					}
-					else if (numerator < 0)
-					{
-						sign = -1;
-						m_numerator = (uint64_t) -numerator;
-					}
-					else
-					{
-						sign = 0;
-						m_numerator = 0;
-					}
-					if (denominator > 0)
-					{
-						m_denominator = (uint64_t) denominator;
-					}
-					else if (denominator < 0)
-					{
-						sign = -sign;
-						m_denominator = (uint64_t) -denominator;
-					}
-					else
-					{
-						m_denominator = 0;
-					}
-				}
-				
-				bool isNegativeInfinity() const
-				{
-					return (sign < 0) && (m_denominator == 0);
-				}
-				
-				bool isNaN() const
-				{
-					return (sign == 0) && (m_denominator == 0);
-				}
-				
-				int compare(const Rational64& b) const;
-				
-				btScalar toScalar() const
-				{
-					return sign * ((m_denominator == 0) ? SIMD_INFINITY : (btScalar) m_numerator / m_denominator);
-				}
-		};
-
-
-		class Rational128
-		{
-			private:
-				Int128 numerator;
-				Int128 denominator;
-				int sign;
-				bool isInt64;
-
-			public:
-				Rational128(int64_t value)
-				{
-					if (value > 0)
-					{
-						sign = 1;
-						this->numerator = value;
-					}
-					else if (value < 0)
-					{
-						sign = -1;
-						this->numerator = -value;
-					}
-					else
-					{
-						sign = 0;
-						this->numerator = (uint64_t) 0;
-					}
-					this->denominator = (uint64_t) 1;
-					isInt64 = true;
-				}
-
-				Rational128(const Int128& numerator, const Int128& denominator)
-				{
-					sign = numerator.getSign();
-					if (sign >= 0)
-					{
-						this->numerator = numerator;
-					}
-					else
-					{
-						this->numerator = -numerator;
-					}
-					int dsign = denominator.getSign();
-					if (dsign >= 0)
-					{
-						this->denominator = denominator;
-					}
-					else
-					{
-						sign = -sign;
-						this->denominator = -denominator;
-					}
-					isInt64 = false;
-				}
-
-				int compare(const Rational128& b) const;
-
-				int compare(int64_t b) const;
-
-				btScalar toScalar() const
-				{
-					return sign * ((denominator.getSign() == 0) ? SIMD_INFINITY : numerator.toScalar() / denominator.toScalar());
-				}
-		};
-
-		class PointR128
-		{
-			public:
-				Int128 x;
-				Int128 y;
-				Int128 z;
-				Int128 denominator;
-
-				PointR128()
-				{
-				}
-
-				PointR128(Int128 x, Int128 y, Int128 z, Int128 denominator): x(x), y(y), z(z), denominator(denominator)
-				{
-				}
-
-				btScalar xvalue() const
-				{
-					return x.toScalar() / denominator.toScalar();
-				}
-
-				btScalar yvalue() const
-				{
-					return y.toScalar() / denominator.toScalar();
-				}
-
-				btScalar zvalue() const
-				{
-					return z.toScalar() / denominator.toScalar();
-				}
-		};
-
-
-		class Edge;
-		class Face;
-
-		class Vertex
-		{
-			public:
-				Vertex* next;
-				Vertex* prev;
-				Edge* edges;
-				Face* firstNearbyFace;
-				Face* lastNearbyFace;
-				PointR128 point128;
-				Point32 point;
-				int copy;
-				
-				Vertex(): next(NULL), prev(NULL), edges(NULL), firstNearbyFace(NULL), lastNearbyFace(NULL), copy(-1)
-				{
-				}
-
-#ifdef DEBUG_CONVEX_HULL
-				void print()
-				{
-					printf("V%d (%d, %d, %d)", point.index, point.x, point.y, point.z);
-				}
-
-				void printGraph();
-#endif
-
-				Point32 operator-(const Vertex& b) const
-				{
-					return point - b.point;
-				}
-
-				Rational128 dot(const Point64& b) const
-				{
-					return (point.index >= 0) ? Rational128(point.dot(b))
-						: Rational128(point128.x * b.x + point128.y * b.y + point128.z * b.z, point128.denominator);
-				}
-
-				btScalar xvalue() const
-				{
-					return (point.index >= 0) ? btScalar(point.x) : point128.xvalue();
-				}
-
-				btScalar yvalue() const
-				{
-					return (point.index >= 0) ? btScalar(point.y) : point128.yvalue();
-				}
-
-				btScalar zvalue() const
-				{
-					return (point.index >= 0) ? btScalar(point.z) : point128.zvalue();
-				}
-
-				void receiveNearbyFaces(Vertex* src)
-				{
-					if (lastNearbyFace)
-					{
-						lastNearbyFace->nextWithSameNearbyVertex = src->firstNearbyFace;
-					}
-					else
-					{
-						firstNearbyFace = src->firstNearbyFace;
-					}
-					if (src->lastNearbyFace)
-					{
-						lastNearbyFace = src->lastNearbyFace;
-					}
-					for (Face* f = src->firstNearbyFace; f; f = f->nextWithSameNearbyVertex)
-					{
-						btAssert(f->nearbyVertex == src);
-						f->nearbyVertex = this;
-					}
-					src->firstNearbyFace = NULL;
-					src->lastNearbyFace = NULL;
-				}
-		};
-
-
-		class Edge
-		{
-			public:
-				Edge* next;
-				Edge* prev;
-				Edge* reverse;
-				Vertex* target;
-				Face* face;
-				int copy;
-
-				~Edge()
-				{
-					next = NULL;
-					prev = NULL;
-					reverse = NULL;
-					target = NULL;
-					face = NULL;
-				}
-
-				void link(Edge* n)
-				{
-					btAssert(reverse->target == n->reverse->target);
-					next = n;
-					n->prev = this;
-				}
-
-#ifdef DEBUG_CONVEX_HULL
-				void print()
-				{
-					printf("E%p : %d -> %d,  n=%p p=%p   (0 %d\t%d\t%d) -> (%d %d %d)", this, reverse->target->point.index, target->point.index, next, prev,
-								 reverse->target->point.x, reverse->target->point.y, reverse->target->point.z, target->point.x, target->point.y, target->point.z);
-				}
-#endif
-		};
-
-		class Face
-		{
-			public:
-				Face* next;
-				Vertex* nearbyVertex;
-				Face* nextWithSameNearbyVertex;
-				Point32 origin;
-				Point32 dir0;
-				Point32 dir1;
-
-				Face(): next(NULL), nearbyVertex(NULL), nextWithSameNearbyVertex(NULL)
-				{
-				}
-
-				void init(Vertex* a, Vertex* b, Vertex* c)
-				{
-					nearbyVertex = a;
-					origin = a->point;
-					dir0 = *b - *a;
-					dir1 = *c - *a;
-					if (a->lastNearbyFace)
-					{
-						a->lastNearbyFace->nextWithSameNearbyVertex = this;
-					}
-					else
-					{
-						a->firstNearbyFace = this;
-					}
-					a->lastNearbyFace = this;
-				}
-
-				Point64 getNormal()
-				{
-					return dir0.cross(dir1);
-				}
-		};
-
-		template<typename UWord, typename UHWord> class DMul
-		{
-			private:
-				static uint32_t high(uint64_t value)
-				{
-					return (uint32_t) (value >> 32);
-				}
-				
-				static uint32_t low(uint64_t value)
-				{
-					return (uint32_t) value;
-				}
-				
-				static uint64_t mul(uint32_t a, uint32_t b)
-				{
-					return (uint64_t) a * (uint64_t) b;
-				}
-				
-				static void shlHalf(uint64_t& value)
-				{
-					value <<= 32;
-				}
-				
-				static uint64_t high(Int128 value)
-				{
-					return value.high;
-				}
-				
-				static uint64_t low(Int128 value)
-				{
-					return value.low;
-				}
-				
-				static Int128 mul(uint64_t a, uint64_t b)
-				{
-					return Int128::mul(a, b);
-				}
-				
-				static void shlHalf(Int128& value)
-				{
-					value.high = value.low;
-					value.low = 0;
-				}
-				
-			public:
-				
-				static void mul(UWord a, UWord b, UWord& resLow, UWord& resHigh)
-				{
-					UWord p00 = mul(low(a), low(b));
-					UWord p01 = mul(low(a), high(b));
-					UWord p10 = mul(high(a), low(b));
-					UWord p11 = mul(high(a), high(b));
-					UWord p0110 = UWord(low(p01)) + UWord(low(p10));
-					p11 += high(p01);
-					p11 += high(p10);
-					p11 += high(p0110);
-					shlHalf(p0110);
-					p00 += p0110;
-					if (p00 < p0110)
-					{
-						++p11;
-					}
-					resLow = p00;
-					resHigh = p11;
-				}
-		};
-	
-	private:
-
-		class IntermediateHull
-		{
-			public:
-				Vertex* minXy;
-				Vertex* maxXy;
-				Vertex* minYx;
-				Vertex* maxYx;
-				
-				IntermediateHull(): minXy(NULL), maxXy(NULL), minYx(NULL), maxYx(NULL)
-				{
-				}
-				
-				void print();
-		};
-	
-		enum Orientation {NONE, CLOCKWISE, COUNTER_CLOCKWISE};
-
-		template <typename T> class PoolArray
-		{
-			private:
-				T* array;
-				int size;
-
-			public:
-				PoolArray<T>* next;
-
-				PoolArray(int size): size(size), next(NULL)
-				{
-					array = (T*) btAlignedAlloc(sizeof(T) * size, 16);
-				}
-
-				~PoolArray()
-				{
-					btAlignedFree(array);
-				}
-
-				T* init()
-				{
-					T* o = array;
-					for (int i = 0; i < size; i++, o++)
-					{
-						o->next = (i+1 < size) ? o + 1 : NULL;
-					}
-					return array;
-				}
-		};
-
-		template <typename T> class Pool
-		{
-			private:
-				PoolArray<T>* arrays;
-				PoolArray<T>* nextArray;
-				T* freeObjects;
-				int arraySize;
-
-			public:
-				Pool(): arrays(NULL), nextArray(NULL), freeObjects(NULL), arraySize(256)
-				{
-				}
-
-				~Pool()
-				{
-					while (arrays)
-					{
-						PoolArray<T>* p = arrays;
-						arrays = p->next;
-						p->~PoolArray<T>();
-						btAlignedFree(p);
-					}
-				}
-
-				void reset()
-				{
-					nextArray = arrays;
-					freeObjects = NULL;
-				}
-
-				void setArraySize(int arraySize)
-				{
-					this->arraySize = arraySize;
-				}
-
-				T* newObject()
-				{
-					T* o = freeObjects;
-					if (!o)
-					{
-						PoolArray<T>* p = nextArray;
-						if (p)
-						{
-							nextArray = p->next;
-						}
-						else
-						{
-							p = new(btAlignedAlloc(sizeof(PoolArray<T>), 16)) PoolArray<T>(arraySize);
-							p->next = arrays;
-							arrays = p;
-						}
-						o = p->init();
-					}
-					freeObjects = o->next;
-					return new(o) T();
-				};
-
-				void freeObject(T* object)
-				{
-					object->~T();
-					object->next = freeObjects;
-					freeObjects = object;
-				}
-		};
-
-		btVector3 scaling;
-		btVector3 center;
-		Pool<Vertex> vertexPool;
-		Pool<Edge> edgePool;
-		Pool<Face> facePool;
-		btAlignedObjectArray<Vertex*> originalVertices;
-		int mergeStamp;
-		int minAxis;
-		int medAxis;
-		int maxAxis;
-		int usedEdgePairs;
-		int maxUsedEdgePairs;
-
-		static Orientation getOrientation(const Edge* prev, const Edge* next, const Point32& s, const Point32& t);
-		Edge* findMaxAngle(bool ccw, const Vertex* start, const Point32& s, const Point64& rxs, const Point64& sxrxs, Rational64& minCot);
-		void findEdgeForCoplanarFaces(Vertex* c0, Vertex* c1, Edge*& e0, Edge*& e1, Vertex* stop0, Vertex* stop1);
-
-		Edge* newEdgePair(Vertex* from, Vertex* to);
-
-		void removeEdgePair(Edge* edge)
-		{
-			Edge* n = edge->next;
-			Edge* r = edge->reverse;
-
-			btAssert(edge->target && r->target);
-
-			if (n != edge)
-			{
-				n->prev = edge->prev;
-				edge->prev->next = n;
-				r->target->edges = n;
-			}
-			else
-			{
-				r->target->edges = NULL;
-			}
-			
-			n = r->next;
-			
-			if (n != r)
-			{
-				n->prev = r->prev;
-				r->prev->next = n;
-				edge->target->edges = n;
-			}
-			else
-			{
-				edge->target->edges = NULL;
-			}
-
-			edgePool.freeObject(edge);
-			edgePool.freeObject(r);
-			usedEdgePairs--;
-		}
-		
-		void computeInternal(int start, int end, IntermediateHull& result);
-		
-		bool mergeProjection(IntermediateHull& h0, IntermediateHull& h1, Vertex*& c0, Vertex*& c1);
-		
-		void merge(IntermediateHull& h0, IntermediateHull& h1);
-
-		btVector3 toBtVector(const Point32& v);
-
-		btVector3 getBtNormal(Face* face);
-
-		bool shiftFace(Face* face, btScalar amount, btAlignedObjectArray<Vertex*> stack);
-
-	public:
-		Vertex* vertexList;
-
-		void compute(const void* coords, bool doubleCoords, int stride, int count);
-
-		btVector3 getCoordinates(const Vertex* v);
-
-		btScalar shrink(btScalar amount, btScalar clampAmount);
-};
-
-
-btConvexHullInternal::Int128 btConvexHullInternal::Int128::operator*(int64_t b) const
-{
-	bool negative = (int64_t) high < 0;
-	Int128 a = negative ? -*this : *this;
-	if (b < 0)
-	{
-		negative = !negative;
-		b = -b;
-	}
-	Int128 result = mul(a.low, (uint64_t) b);
-	result.high += a.high * (uint64_t) b;
-	return negative ? -result : result;
-}
-
-btConvexHullInternal::Int128 btConvexHullInternal::Int128::mul(int64_t a, int64_t b)
-{
-	Int128 result;
-	
-#ifdef USE_X86_64_ASM
-	__asm__ ("imulq %[b]"
-					 : "=a" (result.low), "=d" (result.high)
-					 : "0"(a), [b] "r"(b)
-					 : "cc" );
-	return result;
-	
-#else
-	bool negative = a < 0;
-	if (negative)
-	{
-		a = -a;
-	}
-	if (b < 0)
-	{
-		negative = !negative;
-		b = -b;
-	}
-	DMul<uint64_t, uint32_t>::mul((uint64_t) a, (uint64_t) b, result.low, result.high);
-	return negative ? -result : result;
-#endif
-}
-
-btConvexHullInternal::Int128 btConvexHullInternal::Int128::mul(uint64_t a, uint64_t b)
-{
-	Int128 result;
-
-#ifdef USE_X86_64_ASM
-	__asm__ ("mulq %[b]"
-					 : "=a" (result.low), "=d" (result.high)
-					 : "0"(a), [b] "r"(b)
-					 : "cc" );
-
-#else
-	DMul<uint64_t, uint32_t>::mul(a, b, result.low, result.high);
-#endif
-
-	return result;
-}
-
-int btConvexHullInternal::Rational64::compare(const Rational64& b) const
-{
-	if (sign != b.sign)
-	{
-		return sign - b.sign;
-	}
-	else if (sign == 0)
-	{
-		return 0;
-	}
-
-	//	return (numerator * b.denominator > b.numerator * denominator) ? sign : (numerator * b.denominator < b.numerator * denominator) ? -sign : 0;
-
-#ifdef USE_X86_64_ASM
-
-	int result;
-	int64_t tmp;
-	int64_t dummy;
-	__asm__ ("mulq %[bn]\n\t"
-					 "movq %%rax, %[tmp]\n\t"
-					 "movq %%rdx, %%rbx\n\t"
-					 "movq %[tn], %%rax\n\t"
-					 "mulq %[bd]\n\t"
-					 "subq %[tmp], %%rax\n\t"
-					 "sbbq %%rbx, %%rdx\n\t" // rdx:rax contains 128-bit-difference "numerator*b.denominator - b.numerator*denominator"
-					 "setnsb %%bh\n\t" // bh=1 if difference is non-negative, bh=0 otherwise
-					 "orq %%rdx, %%rax\n\t"
-					 "setnzb %%bl\n\t" // bl=1 if difference if non-zero, bl=0 if it is zero
-					 "decb %%bh\n\t" // now bx=0x0000 if difference is zero, 0xff01 if it is negative, 0x0001 if it is positive (i.e., same sign as difference)
-					 "shll $16, %%ebx\n\t" // ebx has same sign as difference
-					 : "=&b"(result), [tmp] "=&r"(tmp), "=a"(dummy)
-					 : "a"(denominator), [bn] "g"(b.numerator), [tn] "g"(numerator), [bd] "g"(b.denominator)
-					 : "%rdx", "cc" );
-	return result ? result ^ sign // if sign is +1, only bit 0 of result is inverted, which does not change the sign of result (and cannot result in zero)
-																// if sign is -1, all bits of result are inverted, which changes the sign of result (and again cannot result in zero)
-								: 0;
-
-#else
-
-	return sign * Int128::mul(m_numerator, b.m_denominator).ucmp(Int128::mul(m_denominator, b.m_numerator));
-
-#endif
-}
-
-int btConvexHullInternal::Rational128::compare(const Rational128& b) const
-{
-	if (sign != b.sign)
-	{
-		return sign - b.sign;
-	}
-	else if (sign == 0)
-	{
-		return 0;
-	}
-	if (isInt64)
-	{
-		return -b.compare(sign * (int64_t) numerator.low);
-	}
-
-	Int128 nbdLow, nbdHigh, dbnLow, dbnHigh;
-	DMul<Int128, uint64_t>::mul(numerator, b.denominator, nbdLow, nbdHigh);
-	DMul<Int128, uint64_t>::mul(denominator, b.numerator, dbnLow, dbnHigh);
-
-	int cmp = nbdHigh.ucmp(dbnHigh);
-	if (cmp)
-	{
-		return cmp * sign;
-	}
-	return nbdLow.ucmp(dbnLow) * sign;
-}
-
-int btConvexHullInternal::Rational128::compare(int64_t b) const
-{
-	if (isInt64)
-	{
-		int64_t a = sign * (int64_t) numerator.low;
-		return (a > b) ? 1 : (a < b) ? -1 : 0;
-	}
-	if (b > 0)
-	{
-		if (sign <= 0)
-		{
-			return -1;
-		}
-	}
-	else if (b < 0)
-	{
-		if (sign >= 0)
-		{
-			return 1;
-		}
-		b = -b;
-	}
-	else
-	{
-		return sign;
-	}
-
-	return numerator.ucmp(denominator * b) * sign;
-}
-
-
-btConvexHullInternal::Edge* btConvexHullInternal::newEdgePair(Vertex* from, Vertex* to)
-{
-	btAssert(from && to);
-	Edge* e = edgePool.newObject();
-	Edge* r = edgePool.newObject();
-	e->reverse = r;
-	r->reverse = e;
-	e->copy = mergeStamp;
-	r->copy = mergeStamp;
-	e->target = to;
-	r->target = from;
-	e->face = NULL;
-	r->face = NULL;
-	usedEdgePairs++;
-	if (usedEdgePairs > maxUsedEdgePairs)
-	{
-		maxUsedEdgePairs = usedEdgePairs;
-	}
-	return e;
-}
-
-bool btConvexHullInternal::mergeProjection(IntermediateHull& h0, IntermediateHull& h1, Vertex*& c0, Vertex*& c1)
-{
-	Vertex* v0 = h0.maxYx;
-	Vertex* v1 = h1.minYx;
-	if ((v0->point.x == v1->point.x) && (v0->point.y == v1->point.y))
-	{
-		btAssert(v0->point.z < v1->point.z);
-		Vertex* v1p = v1->prev;
-		if (v1p == v1)
-		{
-			c0 = v0;
-			if (v1->edges)
-			{
-				btAssert(v1->edges->next == v1->edges);
-				v1 = v1->edges->target;
-				btAssert(v1->edges->next == v1->edges);
-			}
-			c1 = v1;
-			return false;
-		}
-		Vertex* v1n = v1->next;
-		v1p->next = v1n;
-		v1n->prev = v1p;
-		if (v1 == h1.minXy)
-		{
-			if ((v1n->point.x < v1p->point.x) || ((v1n->point.x == v1p->point.x) && (v1n->point.y < v1p->point.y)))
-			{
-				h1.minXy = v1n;
-			}
-			else
-			{
-				h1.minXy = v1p;
-			}
-		}
-		if (v1 == h1.maxXy)
-		{
-			if ((v1n->point.x > v1p->point.x) || ((v1n->point.x == v1p->point.x) && (v1n->point.y > v1p->point.y)))
-			{
-				h1.maxXy = v1n;
-			}
-			else
-			{
-				h1.maxXy = v1p;
-			}
-		}
-	}
-	
-	v0 = h0.maxXy;
-	v1 = h1.maxXy;
-	Vertex* v00 = NULL;
-	Vertex* v10 = NULL;
-	int32_t sign = 1;
-
-	for (int side = 0; side <= 1; side++)
-	{		
-		int32_t dx = (v1->point.x - v0->point.x) * sign;
-		if (dx > 0)
-		{
-			while (true)
-			{
-				int32_t dy = v1->point.y - v0->point.y;
-
-				Vertex* w0 = side ? v0->next : v0->prev;
-				if (w0 != v0)
-				{
-					int32_t dx0 = (w0->point.x - v0->point.x) * sign;
-					int32_t dy0 = w0->point.y - v0->point.y;
-					if ((dy0 <= 0) && ((dx0 == 0) || ((dx0 < 0) && (dy0 * dx <= dy * dx0))))
-					{
-						v0 = w0;
-						dx = (v1->point.x - v0->point.x) * sign;
-						continue;
-					}
-				}
-
-				Vertex* w1 = side ? v1->next : v1->prev;
-				if (w1 != v1)
-				{
-					int32_t dx1 = (w1->point.x - v1->point.x) * sign;
-					int32_t dy1 = w1->point.y - v1->point.y;
-					int32_t dxn = (w1->point.x - v0->point.x) * sign;
-					if ((dxn > 0) && (dy1 < 0) && ((dx1 == 0) || ((dx1 < 0) && (dy1 * dx < dy * dx1))))
-					{
-						v1 = w1;
-						dx = dxn;
-						continue;
-					}
-				}
-
-				break;
-			}
-		}
-		else if (dx < 0)
-		{
-			while (true)
-			{
-				int32_t dy = v1->point.y - v0->point.y;
-				
-				Vertex* w1 = side ? v1->prev : v1->next;
-				if (w1 != v1)
-				{
-					int32_t dx1 = (w1->point.x - v1->point.x) * sign;
-					int32_t dy1 = w1->point.y - v1->point.y;
-					if ((dy1 >= 0) && ((dx1 == 0) || ((dx1 < 0) && (dy1 * dx <= dy * dx1))))
-					{
-						v1 = w1;
-						dx = (v1->point.x - v0->point.x) * sign;
-						continue;
-					}
-				}
-				
-				Vertex* w0 = side ? v0->prev : v0->next;
-				if (w0 != v0)
-				{
-					int32_t dx0 = (w0->point.x - v0->point.x) * sign;
-					int32_t dy0 = w0->point.y - v0->point.y;
-					int32_t dxn = (v1->point.x - w0->point.x) * sign;
-					if ((dxn < 0) && (dy0 > 0) && ((dx0 == 0) || ((dx0 < 0) && (dy0 * dx < dy * dx0))))
-					{
-						v0 = w0;
-						dx = dxn;
-						continue;
-					}
-				}
-				
-				break;
-			}
-		}
-		else
-		{
-			int32_t x = v0->point.x;
-			int32_t y0 = v0->point.y;
-			Vertex* w0 = v0;
-			Vertex* t;
-			while (((t = side ? w0->next : w0->prev) != v0) && (t->point.x == x) && (t->point.y <= y0))
-			{
-				w0 = t;
-				y0 = t->point.y;
-			}
-			v0 = w0;
-
-			int32_t y1 = v1->point.y;
-			Vertex* w1 = v1;
-			while (((t = side ? w1->prev : w1->next) != v1) && (t->point.x == x) && (t->point.y >= y1))
-			{
-				w1 = t;
-				y1 = t->point.y;
-			}
-			v1 = w1;
-		}
-		
-		if (side == 0)
-		{
-			v00 = v0;
-			v10 = v1;
-
-			v0 = h0.minXy;
-			v1 = h1.minXy;
-			sign = -1;
-		}
-	}
-
-	v0->prev = v1;
-	v1->next = v0;
-
-	v00->next = v10;
-	v10->prev = v00;
-
-	if (h1.minXy->point.x < h0.minXy->point.x)
-	{
-		h0.minXy = h1.minXy;
-	}
-	if (h1.maxXy->point.x >= h0.maxXy->point.x)
-	{
-		h0.maxXy = h1.maxXy;
-	}
-	
-	h0.maxYx = h1.maxYx;
-
-	c0 = v00;
-	c1 = v10;
-
-	return true;
-}
-
-void btConvexHullInternal::computeInternal(int start, int end, IntermediateHull& result)
-{
-	int n = end - start;
-	switch (n)
-	{
-		case 0:
-			result.minXy = NULL;
-			result.maxXy = NULL;
-			result.minYx = NULL;
-			result.maxYx = NULL;
-			return;
-		case 2:
-		{
-			Vertex* v = originalVertices[start];
-			Vertex* w = v + 1;
-			if (v->point != w->point)
-			{
-				int32_t dx = v->point.x - w->point.x;
-				int32_t dy = v->point.y - w->point.y;
-
-				if ((dx == 0) && (dy == 0))
-				{
-					if (v->point.z > w->point.z)
-					{
-						Vertex* t = w;
-						w = v;
-						v = t;
-					}
-					btAssert(v->point.z < w->point.z);
-					v->next = v;
-					v->prev = v;
-					result.minXy = v;
-					result.maxXy = v;
-					result.minYx = v;
-					result.maxYx = v;
-				}
-				else
-				{
-					v->next = w;
-					v->prev = w;
-					w->next = v;
-					w->prev = v;
-
-					if ((dx < 0) || ((dx == 0) && (dy < 0)))
-					{
-						result.minXy = v;
-						result.maxXy = w;
-					}
-					else
-					{
-						result.minXy = w;
-						result.maxXy = v;
-					}
-
-					if ((dy < 0) || ((dy == 0) && (dx < 0)))
-					{
-						result.minYx = v;
-						result.maxYx = w;
-					}
-					else
-					{
-						result.minYx = w;
-						result.maxYx = v;
-					}
-				}
-
-				Edge* e = newEdgePair(v, w);
-				e->link(e);
-				v->edges = e;
-
-				e = e->reverse;
-				e->link(e);
-				w->edges = e;
-
-				return;
-			}
-		}
-		// lint -fallthrough
-		case 1:
-		{
-			Vertex* v = originalVertices[start];
-			v->edges = NULL;
-			v->next = v;
-			v->prev = v;
-
-			result.minXy = v;
-			result.maxXy = v;
-			result.minYx = v;
-			result.maxYx = v;
-
-			return;
-		}
-	}
-
-	int split0 = start + n / 2;
-	Point32 p = originalVertices[split0-1]->point;
-	int split1 = split0;
-	while ((split1 < end) && (originalVertices[split1]->point == p))
-	{
-		split1++;
-	}
-	computeInternal(start, split0, result);
-	IntermediateHull hull1;
-	computeInternal(split1, end, hull1);
-#ifdef DEBUG_CONVEX_HULL
-	printf("\n\nMerge\n");
-	result.print();
-	hull1.print();
-#endif
-	merge(result, hull1);
-#ifdef DEBUG_CONVEX_HULL
-	printf("\n  Result\n");
-	result.print();
-#endif
-}
-
-#ifdef DEBUG_CONVEX_HULL
-void btConvexHullInternal::IntermediateHull::print()
-{
-	printf("    Hull\n");
-	for (Vertex* v = minXy; v; )
-	{
-		printf("      ");
-		v->print();
-		if (v == maxXy)
-		{
-			printf(" maxXy");
-		}
-		if (v == minYx)
-		{
-			printf(" minYx");
-		}
-		if (v == maxYx)
-		{
-			printf(" maxYx");
-		}
-		if (v->next->prev != v)
-		{
-			printf(" Inconsistency");
-		}
-		printf("\n");
-		v = v->next;
-		if (v == minXy)
-		{
-			break;
-		}
-	}
-	if (minXy)
-	{		
-		minXy->copy = (minXy->copy == -1) ? -2 : -1;
-		minXy->printGraph();
-	}
-}
-
-void btConvexHullInternal::Vertex::printGraph()
-{
-	print();
-	printf("\nEdges\n");
-	Edge* e = edges;
-	if (e)
-	{
-		do
-		{
-			e->print();
-			printf("\n");
-			e = e->next;
-		} while (e != edges);
-		do
-		{
-			Vertex* v = e->target;
-			if (v->copy != copy)
-			{
-				v->copy = copy;
-				v->printGraph();
-			}
-			e = e->next;
-		} while (e != edges);
-	}
-}
-#endif
-
-btConvexHullInternal::Orientation btConvexHullInternal::getOrientation(const Edge* prev, const Edge* next, const Point32& s, const Point32& t)
-{
-	btAssert(prev->reverse->target == next->reverse->target);
-	if (prev->next == next)
-	{
-		if (prev->prev == next)
-		{
-			Point64 n = t.cross(s);
-			Point64 m = (*prev->target - *next->reverse->target).cross(*next->target - *next->reverse->target);
-			btAssert(!m.isZero());
-			int64_t dot = n.dot(m);
-			btAssert(dot != 0);
-			return (dot > 0) ? COUNTER_CLOCKWISE : CLOCKWISE;
-		}
-		return COUNTER_CLOCKWISE;
-	}
-	else if (prev->prev == next)
-	{
-		return CLOCKWISE;
-	}
-	else
-	{
-		return NONE;
-	}
-}
-
-btConvexHullInternal::Edge* btConvexHullInternal::findMaxAngle(bool ccw, const Vertex* start, const Point32& s, const Point64& rxs, const Point64& sxrxs, Rational64& minCot)
-{
-	Edge* minEdge = NULL;
-
-#ifdef DEBUG_CONVEX_HULL
-	printf("find max edge for %d\n", start->point.index);
-#endif
-	Edge* e = start->edges;
-	if (e)
-	{
-		do
-		{
-			if (e->copy > mergeStamp)
-			{
-				Point32 t = *e->target - *start;
-				Rational64 cot(t.dot(sxrxs), t.dot(rxs));
-#ifdef DEBUG_CONVEX_HULL
-				printf("      Angle is %f (%d) for ", (float) btAtan(cot.toScalar()), (int) cot.isNaN());
-				e->print();
-#endif
-				if (cot.isNaN())
-				{
-					btAssert(ccw ? (t.dot(s) < 0) : (t.dot(s) > 0));
-				}
-				else
-				{
-					int cmp;
-					if (minEdge == NULL)
-					{
-						minCot = cot;
-						minEdge = e;
-					}
-					else if ((cmp = cot.compare(minCot)) < 0)
-					{
-						minCot = cot;
-						minEdge = e;
-					}
-					else if ((cmp == 0) && (ccw == (getOrientation(minEdge, e, s, t) == COUNTER_CLOCKWISE)))
-					{
-						minEdge = e;
-					}
-				}
-#ifdef DEBUG_CONVEX_HULL
-				printf("\n");
-#endif
-			}
-			e = e->next;
-		} while (e != start->edges);
-	}
-	return minEdge;
-}
-
-void btConvexHullInternal::findEdgeForCoplanarFaces(Vertex* c0, Vertex* c1, Edge*& e0, Edge*& e1, Vertex* stop0, Vertex* stop1)
-{
-	Edge* start0 = e0;
-	Edge* start1 = e1;
-	Point32 et0 = start0 ? start0->target->point : c0->point;
-	Point32 et1 = start1 ? start1->target->point : c1->point;
-	Point32 s = c1->point - c0->point;
-	Point64 normal = ((start0 ? start0 : start1)->target->point - c0->point).cross(s);
-	int64_t dist = c0->point.dot(normal);
-	btAssert(!start1 || (start1->target->point.dot(normal) == dist));
-	Point64 perp = s.cross(normal);
-	btAssert(!perp.isZero());
-	
-#ifdef DEBUG_CONVEX_HULL
-	printf("   Advancing %d %d  (%p %p, %d %d)\n", c0->point.index, c1->point.index, start0, start1, start0 ? start0->target->point.index : -1, start1 ? start1->target->point.index : -1);
-#endif
-
-	int64_t maxDot0 = et0.dot(perp);
-	if (e0)
-	{
-		while (e0->target != stop0)
-		{
-			Edge* e = e0->reverse->prev;
-			if (e->target->point.dot(normal) < dist)
-			{
-				break;
-			}
-			btAssert(e->target->point.dot(normal) == dist);
-			if (e->copy == mergeStamp)
-			{
-				break;
-			}
-			int64_t dot = e->target->point.dot(perp);
-			if (dot <= maxDot0)
-			{
-				break;
-			}
-			maxDot0 = dot;
-			e0 = e;
-			et0 = e->target->point;
-		}
-	}
-	
-	int64_t maxDot1 = et1.dot(perp);
-	if (e1)
-	{
-		while (e1->target != stop1)
-		{
-			Edge* e = e1->reverse->next;
-			if (e->target->point.dot(normal) < dist)
-			{
-				break;
-			}
-			btAssert(e->target->point.dot(normal) == dist);
-			if (e->copy == mergeStamp)
-			{
-				break;
-			}
-			int64_t dot = e->target->point.dot(perp);
-			if (dot <= maxDot1)
-			{
-				break;
-			}
-			maxDot1 = dot;
-			e1 = e;
-			et1 = e->target->point;
-		}
-	}
-
-#ifdef DEBUG_CONVEX_HULL
-	printf("   Starting at %d %d\n", et0.index, et1.index);
-#endif
-
-	int64_t dx = maxDot1 - maxDot0;
-	if (dx > 0)
-	{
-		while (true)
-		{
-			int64_t dy = (et1 - et0).dot(s);
-			
-			if (e0 && (e0->target != stop0))
-			{
-				Edge* f0 = e0->next->reverse;
-				if (f0->copy > mergeStamp)
-				{
-					int64_t dx0 = (f0->target->point - et0).dot(perp);
-					int64_t dy0 = (f0->target->point - et0).dot(s);
-					if ((dx0 == 0) ? (dy0 < 0) : ((dx0 < 0) && (Rational64(dy0, dx0).compare(Rational64(dy, dx)) >= 0)))
-					{
-						et0 = f0->target->point;
-						dx = (et1 - et0).dot(perp);
-						e0 = (e0 == start0) ? NULL : f0;
-						continue;
-					}
-				}
-			}
-			
-			if (e1 && (e1->target != stop1))
-			{
-				Edge* f1 = e1->reverse->next;
-				if (f1->copy > mergeStamp)
-				{
-					Point32 d1 = f1->target->point - et1;
-					if (d1.dot(normal) == 0)
-					{
-						int64_t dx1 = d1.dot(perp);
-						int64_t dy1 = d1.dot(s);
-						int64_t dxn = (f1->target->point - et0).dot(perp);
-						if ((dxn > 0) && ((dx1 == 0) ? (dy1 < 0) : ((dx1 < 0) && (Rational64(dy1, dx1).compare(Rational64(dy, dx)) > 0))))
-						{
-							e1 = f1;
-							et1 = e1->target->point;
-							dx = dxn;
-							continue;
-						}
-					}
-					else
-					{
-						btAssert((e1 == start1) && (d1.dot(normal) < 0));
-					}
-				}
-			}
-
-			break;
-		}
-	}
-	else if (dx < 0)
-	{
-		while (true)
-		{
-			int64_t dy = (et1 - et0).dot(s);
-			
-			if (e1 && (e1->target != stop1))
-			{
-				Edge* f1 = e1->prev->reverse;
-				if (f1->copy > mergeStamp)
-				{
-					int64_t dx1 = (f1->target->point - et1).dot(perp);
-					int64_t dy1 = (f1->target->point - et1).dot(s);
-					if ((dx1 == 0) ? (dy1 > 0) : ((dx1 < 0) && (Rational64(dy1, dx1).compare(Rational64(dy, dx)) <= 0)))
-					{
-						et1 = f1->target->point;
-						dx = (et1 - et0).dot(perp);
-						e1 = (e1 == start1) ? NULL : f1;
-						continue;
-					}
-				}
-			}
-			
-			if (e0 && (e0->target != stop0))
-			{
-				Edge* f0 = e0->reverse->prev;
-				if (f0->copy > mergeStamp)
-				{
-					Point32 d0 = f0->target->point - et0;
-					if (d0.dot(normal) == 0)
-					{
-						int64_t dx0 = d0.dot(perp);
-						int64_t dy0 = d0.dot(s);
-						int64_t dxn = (et1 - f0->target->point).dot(perp);
-						if ((dxn < 0) && ((dx0 == 0) ? (dy0 > 0) : ((dx0 < 0) && (Rational64(dy0, dx0).compare(Rational64(dy, dx)) < 0))))
-						{
-							e0 = f0;
-							et0 = e0->target->point;
-							dx = dxn;
-							continue;
-						}
-					}
-					else
-					{
-						btAssert((e0 == start0) && (d0.dot(normal) < 0));
-					}
-				}
-			}
-
-			break;
-		}
-	}
-#ifdef DEBUG_CONVEX_HULL
-	printf("   Advanced edges to %d %d\n", et0.index, et1.index);
-#endif
-}
-
-
-void btConvexHullInternal::merge(IntermediateHull& h0, IntermediateHull& h1)
-{
-	if (!h1.maxXy)
-	{
-		return;
-	}
-	if (!h0.maxXy)
-	{
-		h0 = h1;
-		return;
-	}
-	
-	mergeStamp--;
-
-	Vertex* c0 = NULL;
-	Edge* toPrev0 = NULL;
-	Edge* firstNew0 = NULL;
-	Edge* pendingHead0 = NULL;
-	Edge* pendingTail0 = NULL;
-	Vertex* c1 = NULL;
-	Edge* toPrev1 = NULL;
-	Edge* firstNew1 = NULL;
-	Edge* pendingHead1 = NULL;
-	Edge* pendingTail1 = NULL;
-	Point32 prevPoint;
-
-	if (mergeProjection(h0, h1, c0, c1))
-	{
-		Point32 s = *c1 - *c0;
-		Point64 normal = Point32(0, 0, -1).cross(s);
-		Point64 t = s.cross(normal);
-		btAssert(!t.isZero());
-
-		Edge* e = c0->edges;
-		Edge* start0 = NULL;
-		if (e)
-		{
-			do
-			{
-				int64_t dot = (*e->target - *c0).dot(normal);
-				btAssert(dot <= 0);
-				if ((dot == 0) && ((*e->target - *c0).dot(t) > 0))
-				{
-					if (!start0 || (getOrientation(start0, e, s, Point32(0, 0, -1)) == CLOCKWISE))
-					{
-						start0 = e;
-					}
-				}
-				e = e->next;
-			} while (e != c0->edges);
-		}
-		
-		e = c1->edges;
-		Edge* start1 = NULL;
-		if (e)
-		{
-			do
-			{
-				int64_t dot = (*e->target - *c1).dot(normal);
-				btAssert(dot <= 0);
-				if ((dot == 0) && ((*e->target - *c1).dot(t) > 0))
-				{
-					if (!start1 || (getOrientation(start1, e, s, Point32(0, 0, -1)) == COUNTER_CLOCKWISE))
-					{
-						start1 = e;
-					}
-				}
-				e = e->next;
-			} while (e != c1->edges);
-		}
-
-		if (start0 || start1)
-		{
-			findEdgeForCoplanarFaces(c0, c1, start0, start1, NULL, NULL);
-			if (start0)
-			{
-				c0 = start0->target;
-			}
-			if (start1)
-			{
-				c1 = start1->target;
-			}
-		}
-
-		prevPoint = c1->point;
-		prevPoint.z++;
-	}
-	else
-	{
-		prevPoint = c1->point;
-		prevPoint.x++;
-	}
-
-	Vertex* first0 = c0;
-	Vertex* first1 = c1;
-	bool firstRun = true;
-
-	while (true)
-	{
-		Point32 s = *c1 - *c0;
-		Point32 r = prevPoint - c0->point;
-		Point64 rxs = r.cross(s);
-		Point64 sxrxs = s.cross(rxs);
-		
-#ifdef DEBUG_CONVEX_HULL
-		printf("\n  Checking %d %d\n", c0->point.index, c1->point.index);
-#endif
-		Rational64 minCot0(0, 0);
-		Edge* min0 = findMaxAngle(false, c0, s, rxs, sxrxs, minCot0);
-		Rational64 minCot1(0, 0);
-		Edge* min1 = findMaxAngle(true, c1, s, rxs, sxrxs, minCot1);
-		if (!min0 && !min1)
-		{
-			Edge* e = newEdgePair(c0, c1);
-			e->link(e);
-			c0->edges = e;
-
-			e = e->reverse;
-			e->link(e);
-			c1->edges = e;
-			return;
-		}
-		else
-		{
-			int cmp = !min0 ? 1 : !min1 ? -1 : minCot0.compare(minCot1);
-#ifdef DEBUG_CONVEX_HULL
-			printf("    -> Result %d\n", cmp);
-#endif
-			if (firstRun || ((cmp >= 0) ? !minCot1.isNegativeInfinity() : !minCot0.isNegativeInfinity()))
-			{
-				Edge* e = newEdgePair(c0, c1);
-				if (pendingTail0)
-				{
-					pendingTail0->prev = e;
-				}
-				else
-				{
-					pendingHead0 = e;
-				}
-				e->next = pendingTail0;
-				pendingTail0 = e;
-
-				e = e->reverse;
-				if (pendingTail1)
-				{
-					pendingTail1->next = e;
-				}
-				else
-				{
-					pendingHead1 = e;
-				}
-				e->prev = pendingTail1;
-				pendingTail1 = e;
-			}
-			
-			Edge* e0 = min0;
-			Edge* e1 = min1;
-
-#ifdef DEBUG_CONVEX_HULL
-			printf("   Found min edges to %d %d\n", e0 ? e0->target->point.index : -1, e1 ? e1->target->point.index : -1);
-#endif
-
-			if (cmp == 0)
-			{
-				findEdgeForCoplanarFaces(c0, c1, e0, e1, NULL, NULL);
-			}
-
-			if ((cmp >= 0) && e1)
-			{
-				if (toPrev1)
-				{
-					for (Edge* e = toPrev1->next, *n = NULL; e != min1; e = n)
-					{
-						n = e->next;
-						removeEdgePair(e);
-					}
-				}
-
-				if (pendingTail1)
-				{
-					if (toPrev1)
-					{
-						toPrev1->link(pendingHead1);
-					}
-					else
-					{
-						min1->prev->link(pendingHead1);
-						firstNew1 = pendingHead1;
-					}
-					pendingTail1->link(min1);
-					pendingHead1 = NULL;
-					pendingTail1 = NULL;
-				}
-				else if (!toPrev1)
-				{
-					firstNew1 = min1;
-				}
-
-				prevPoint = c1->point;
-				c1 = e1->target;
-				toPrev1 = e1->reverse;
-			}
-
-			if ((cmp <= 0) && e0)
-			{
-				if (toPrev0)
-				{
-					for (Edge* e = toPrev0->prev, *n = NULL; e != min0; e = n)
-					{
-						n = e->prev;
-						removeEdgePair(e);
-					}
-				}
-
-				if (pendingTail0)
-				{
-					if (toPrev0)
-					{
-						pendingHead0->link(toPrev0);
-					}
-					else
-					{
-						pendingHead0->link(min0->next);
-						firstNew0 = pendingHead0;
-					}
-					min0->link(pendingTail0);
-					pendingHead0 = NULL;
-					pendingTail0 = NULL;
-				}
-				else if (!toPrev0)
-				{
-					firstNew0 = min0;
-				}
-
-				prevPoint = c0->point;
-				c0 = e0->target;
-				toPrev0 = e0->reverse;
-			}
-		}
-
-		if ((c0 == first0) && (c1 == first1))
-		{
-			if (toPrev0 == NULL)
-			{
-				pendingHead0->link(pendingTail0);
-				c0->edges = pendingTail0;
-			}
-			else
-			{
-				for (Edge* e = toPrev0->prev, *n = NULL; e != firstNew0; e = n)
-				{
-					n = e->prev;
-					removeEdgePair(e);
-				}
-				if (pendingTail0)
-				{
-					pendingHead0->link(toPrev0);
-					firstNew0->link(pendingTail0);
-				}
-			}
-
-			if (toPrev1 == NULL)
-			{
-				pendingTail1->link(pendingHead1);
-				c1->edges = pendingTail1;
-			}
-			else
-			{
-				for (Edge* e = toPrev1->next, *n = NULL; e != firstNew1; e = n)
-				{
-					n = e->next;
-					removeEdgePair(e);
-				}
-				if (pendingTail1)
-				{
-					toPrev1->link(pendingHead1);
-					pendingTail1->link(firstNew1);
-				}
-			}
-			
-			return;
-		}
-
-		firstRun = false;
-	}
-}
-
-
-static bool pointCmp(const btConvexHullInternal::Point32& p, const btConvexHullInternal::Point32& q)
-{
-	return (p.y < q.y) || ((p.y == q.y) && ((p.x < q.x) || ((p.x == q.x) && (p.z < q.z))));
-}
-
-void btConvexHullInternal::compute(const void* coords, bool doubleCoords, int stride, int count)
-{
-	btVector3 min(btScalar(1e30), btScalar(1e30), btScalar(1e30)), max(btScalar(-1e30), btScalar(-1e30), btScalar(-1e30));
-	const char* ptr = (const char*) coords;
-	if (doubleCoords)
-	{
-		for (int i = 0; i < count; i++)
-		{
-			const double* v = (const double*) ptr;
-			btVector3 p((btScalar) v[0], (btScalar) v[1], (btScalar) v[2]);
-			ptr += stride;
-			min.setMin(p);
-			max.setMax(p);
-		}
-	}
-	else
-	{
-		for (int i = 0; i < count; i++)
-		{
-			const float* v = (const float*) ptr;
-			btVector3 p(v[0], v[1], v[2]);
-			ptr += stride;
-			min.setMin(p);
-			max.setMax(p);
-		}
-	}
-
-	btVector3 s = max - min;
-	maxAxis = s.maxAxis();
-	minAxis = s.minAxis();
-	if (minAxis == maxAxis)
-	{
-		minAxis = (maxAxis + 1) % 3;
-	}
-	medAxis = 3 - maxAxis - minAxis;
-
-	s /= btScalar(10216);
-	if (((medAxis + 1) % 3) != maxAxis)
-	{
-		s *= -1;
-	}
-	scaling = s;
-
-	if (s[0] != 0)
-	{
-		s[0] = btScalar(1) / s[0];
-	}
-	if (s[1] != 0)
-	{
-		s[1] = btScalar(1) / s[1];
-	}
-	if (s[2] != 0)
-	{
-		s[2] = btScalar(1) / s[2];
-	}
-
-	center = (min + max) * btScalar(0.5);
-
-	btAlignedObjectArray<Point32> points;
-	points.resize(count);
-	ptr = (const char*) coords;
-	if (doubleCoords)
-	{
-		for (int i = 0; i < count; i++)
-		{
-			const double* v = (const double*) ptr;
-			btVector3 p((btScalar) v[0], (btScalar) v[1], (btScalar) v[2]);
-			ptr += stride;
-			p = (p - center) * s;
-			points[i].x = (int32_t) p[medAxis];
-			points[i].y = (int32_t) p[maxAxis];
-			points[i].z = (int32_t) p[minAxis];
-			points[i].index = i;
-		}
-	}
-	else
-	{
-		for (int i = 0; i < count; i++)
-		{
-			const float* v = (const float*) ptr;
-			btVector3 p(v[0], v[1], v[2]);
-			ptr += stride;
-			p = (p - center) * s;
-			points[i].x = (int32_t) p[medAxis];
-			points[i].y = (int32_t) p[maxAxis];
-			points[i].z = (int32_t) p[minAxis];
-			points[i].index = i;
-		}
-	}
-	points.quickSort(pointCmp);
-
-	vertexPool.reset();
-	vertexPool.setArraySize(count);
-	originalVertices.resize(count);
-	for (int i = 0; i < count; i++)
-	{
-		Vertex* v = vertexPool.newObject();
-		v->edges = NULL;
-		v->point = points[i];
-		v->copy = -1;
-		originalVertices[i] = v;
-	}
-
-	points.clear();
-
-	edgePool.reset();
-	edgePool.setArraySize(6 * count);
-
-	usedEdgePairs = 0;
-	maxUsedEdgePairs = 0;
-
-	mergeStamp = -3;
-
-	IntermediateHull hull;
-	computeInternal(0, count, hull);
-	vertexList = hull.minXy;
-#ifdef DEBUG_CONVEX_HULL
-	printf("max. edges %d (3v = %d)", maxUsedEdgePairs, 3 * count);
-#endif
-}
-
-btVector3 btConvexHullInternal::toBtVector(const Point32& v)
-{
-	btVector3 p;
-	p[medAxis] = btScalar(v.x);
-	p[maxAxis] = btScalar(v.y);
-	p[minAxis] = btScalar(v.z);
-	return p * scaling;
-}
-
-btVector3 btConvexHullInternal::getBtNormal(Face* face)
-{
-	return toBtVector(face->dir0).cross(toBtVector(face->dir1)).normalized();
-}
-
-btVector3 btConvexHullInternal::getCoordinates(const Vertex* v)
-{
-	btVector3 p;
-	p[medAxis] = v->xvalue();
-	p[maxAxis] = v->yvalue();
-	p[minAxis] = v->zvalue();
-	return p * scaling + center;
-}
-
-btScalar btConvexHullInternal::shrink(btScalar amount, btScalar clampAmount)
-{
-	if (!vertexList)
-	{
-		return 0;
-	}
-	int stamp = --mergeStamp;
-	btAlignedObjectArray<Vertex*> stack;
-	vertexList->copy = stamp;
-	stack.push_back(vertexList);
-	btAlignedObjectArray<Face*> faces;
-
-	Point32 ref = vertexList->point;
-	Int128 hullCenterX(0, 0);
-	Int128 hullCenterY(0, 0);
-	Int128 hullCenterZ(0, 0);
-	Int128 volume(0, 0);
-
-	while (stack.size() > 0)
-	{
-		Vertex* v = stack[stack.size() - 1];
-		stack.pop_back();
-		Edge* e = v->edges;
-		if (e)
-		{
-			do
-			{
-				if (e->target->copy != stamp)
-				{
-					e->target->copy = stamp;
-					stack.push_back(e->target);
-				}
-				if (e->copy != stamp)
-				{
-					Face* face = facePool.newObject();
-					face->init(e->target, e->reverse->prev->target, v);
-					faces.push_back(face);
-					Edge* f = e;
-
-					Vertex* a = NULL;
-					Vertex* b = NULL;
-					do
-					{
-						if (a && b)
-						{
-							int64_t vol = (v->point - ref).dot((a->point - ref).cross(b->point - ref));
-							btAssert(vol >= 0);
-							Point32 c = v->point + a->point + b->point + ref;
-							hullCenterX += vol * c.x;
-							hullCenterY += vol * c.y;
-							hullCenterZ += vol * c.z;
-							volume += vol;
-						}
-
-						btAssert(f->copy != stamp);
-						f->copy = stamp;
-						f->face = face;
-
-						a = b;
-						b = f->target;
-
-						f = f->reverse->prev;
-					} while (f != e);
-				}
-				e = e->next;
-			} while (e != v->edges);
-		}
-	}
-
-	if (volume.getSign() <= 0)
-	{
-		return 0;
-	}
-
-	btVector3 hullCenter;
-	hullCenter[medAxis] = hullCenterX.toScalar();
-	hullCenter[maxAxis] = hullCenterY.toScalar();
-	hullCenter[minAxis] = hullCenterZ.toScalar();
-	hullCenter /= 4 * volume.toScalar();
-	hullCenter *= scaling;
-
-	int faceCount = faces.size();
-
-	if (clampAmount > 0)
-	{
-		btScalar minDist = SIMD_INFINITY;
-		for (int i = 0; i < faceCount; i++)
-		{
-			btVector3 normal = getBtNormal(faces[i]);
-			btScalar dist = normal.dot(toBtVector(faces[i]->origin) - hullCenter);
-			if (dist < minDist)
-			{
-				minDist = dist;
-			}
-		}
-		
-		if (minDist <= 0)
-		{
-			return 0;
-		}
-
-		amount = btMin(amount, minDist * clampAmount);
-	}
-
-	unsigned int seed = 243703;
-	for (int i = 0; i < faceCount; i++, seed = 1664525 * seed + 1013904223)
-	{
-		btSwap(faces[i], faces[seed % faceCount]);
-	}
-
-	for (int i = 0; i < faceCount; i++)
-	{
-		if (!shiftFace(faces[i], amount, stack))
-		{
-			return -amount;
-		}
-	}
-
-	return amount;
-}
-
-bool btConvexHullInternal::shiftFace(Face* face, btScalar amount, btAlignedObjectArray<Vertex*> stack)
-{
-	btVector3 origShift = getBtNormal(face) * -amount;
-	if (scaling[0] != 0)
-	{
-		origShift[0] /= scaling[0];
-	}
-	if (scaling[1] != 0)
-	{
-		origShift[1] /= scaling[1];
-	}
-	if (scaling[2] != 0)
-	{
-		origShift[2] /= scaling[2];
-	}
-	Point32 shift((int32_t) origShift[medAxis], (int32_t) origShift[maxAxis], (int32_t) origShift[minAxis]);
-	if (shift.isZero())
-	{
-		return true;
-	}
-	Point64 normal = face->getNormal();
-#ifdef DEBUG_CONVEX_HULL
-	printf("\nShrinking face (%d %d %d) (%d %d %d) (%d %d %d) by (%d %d %d)\n",
-				 face->origin.x, face->origin.y, face->origin.z, face->dir0.x, face->dir0.y, face->dir0.z, face->dir1.x, face->dir1.y, face->dir1.z, shift.x, shift.y, shift.z);
-#endif
-	int64_t origDot = face->origin.dot(normal);
-	Point32 shiftedOrigin = face->origin + shift;
-	int64_t shiftedDot = shiftedOrigin.dot(normal);
-	btAssert(shiftedDot <= origDot);
-	if (shiftedDot >= origDot)
-	{
-		return false;
-	}
-
-	Edge* intersection = NULL;
-
-	Edge* startEdge = face->nearbyVertex->edges;
-#ifdef DEBUG_CONVEX_HULL
-	printf("Start edge is ");
-	startEdge->print();
-	printf(", normal is (%lld %lld %lld), shifted dot is %lld\n", normal.x, normal.y, normal.z, shiftedDot);
-#endif
-	Rational128 optDot = face->nearbyVertex->dot(normal);
-	int cmp = optDot.compare(shiftedDot);
-#ifdef SHOW_ITERATIONS
-	int n = 0;
-#endif
-	if (cmp >= 0)
-	{
-		Edge* e = startEdge;
-		do
-		{
-#ifdef SHOW_ITERATIONS
-			n++;
-#endif
-			Rational128 dot = e->target->dot(normal);
-			btAssert(dot.compare(origDot) <= 0);
-#ifdef DEBUG_CONVEX_HULL
-			printf("Moving downwards, edge is ");
-			e->print();
-			printf(", dot is %f (%f %lld)\n", (float) dot.toScalar(), (float) optDot.toScalar(), shiftedDot);
-#endif
-			if (dot.compare(optDot) < 0)
-			{
-				int c = dot.compare(shiftedDot);
-				optDot = dot;
-				e = e->reverse;
-				startEdge = e;
-				if (c < 0)
-				{
-					intersection = e;
-					break;
-				}
-				cmp = c;
-			}
-			e = e->prev;
-		} while (e != startEdge);
-
-		if (!intersection)
-		{
-			return false;
-		}
-	}
-	else
-	{
-		Edge* e = startEdge;
-		do
-		{
-#ifdef SHOW_ITERATIONS
-			n++;
-#endif
-			Rational128 dot = e->target->dot(normal);
-			btAssert(dot.compare(origDot) <= 0);
-#ifdef DEBUG_CONVEX_HULL
-			printf("Moving upwards, edge is ");
-			e->print();
-			printf(", dot is %f (%f %lld)\n", (float) dot.toScalar(), (float) optDot.toScalar(), shiftedDot);
-#endif
-			if (dot.compare(optDot) > 0)
-			{
-				cmp = dot.compare(shiftedDot);
-				if (cmp >= 0)
-				{
-					intersection = e;
-					break;
-				}
-				optDot = dot;
-				e = e->reverse;
-				startEdge = e;
-			}
-			e = e->prev;
-		} while (e != startEdge);
-		
-		if (!intersection)
-		{
-			return true;
-		}
-	}
-
-#ifdef SHOW_ITERATIONS
-	printf("Needed %d iterations to find initial intersection\n", n);
-#endif
-
-	if (cmp == 0)
-	{
-		Edge* e = intersection->reverse->next;
-#ifdef SHOW_ITERATIONS
-		n = 0;
-#endif
-		while (e->target->dot(normal).compare(shiftedDot) <= 0)
-		{
-#ifdef SHOW_ITERATIONS
-			n++;
-#endif
-			e = e->next;
-			if (e == intersection->reverse)
-			{
-				return true;
-			}
-#ifdef DEBUG_CONVEX_HULL
-			printf("Checking for outwards edge, current edge is ");
-			e->print();
-			printf("\n");
-#endif
-		}
-#ifdef SHOW_ITERATIONS
-		printf("Needed %d iterations to check for complete containment\n", n);
-#endif
-	}
-	
-	Edge* firstIntersection = NULL;
-	Edge* faceEdge = NULL;
-	Edge* firstFaceEdge = NULL;
-
-#ifdef SHOW_ITERATIONS
-	int m = 0;
-#endif
-	while (true)
-	{
-#ifdef SHOW_ITERATIONS
-		m++;
-#endif
-#ifdef DEBUG_CONVEX_HULL
-		printf("Intersecting edge is ");
-		intersection->print();
-		printf("\n");
-#endif
-		if (cmp == 0)
-		{
-			Edge* e = intersection->reverse->next;
-			startEdge = e;
-#ifdef SHOW_ITERATIONS
-			n = 0;
-#endif
-			while (true)
-			{
-#ifdef SHOW_ITERATIONS
-				n++;
-#endif
-				if (e->target->dot(normal).compare(shiftedDot) >= 0)
-				{
-					break;
-				}
-				intersection = e->reverse;
-				e = e->next;
-				if (e == startEdge)
-				{
-					return true;
-				}
-			}
-#ifdef SHOW_ITERATIONS
-			printf("Needed %d iterations to advance intersection\n", n);
-#endif
-		}
-
-#ifdef DEBUG_CONVEX_HULL
-		printf("Advanced intersecting edge to ");
-		intersection->print();
-		printf(", cmp = %d\n", cmp);
-#endif
-
-		if (!firstIntersection)
-		{
-			firstIntersection = intersection;
-		}
-		else if (intersection == firstIntersection)
-		{
-			break;
-		}
-
-		int prevCmp = cmp;
-		Edge* prevIntersection = intersection;
-		Edge* prevFaceEdge = faceEdge;
-
-		Edge* e = intersection->reverse;
-#ifdef SHOW_ITERATIONS
-		n = 0;
-#endif
-		while (true)
-		{
-#ifdef SHOW_ITERATIONS
-			n++;
-#endif
-			e = e->reverse->prev;
-			btAssert(e != intersection->reverse);
-			cmp = e->target->dot(normal).compare(shiftedDot);
-#ifdef DEBUG_CONVEX_HULL
-			printf("Testing edge ");
-			e->print();
-			printf(" -> cmp = %d\n", cmp);
-#endif
-			if (cmp >= 0)
-			{
-				intersection = e;
-				break;
-			}
-		}
-#ifdef SHOW_ITERATIONS
-		printf("Needed %d iterations to find other intersection of face\n", n);
-#endif
-
-		if (cmp > 0)
-		{
-			Vertex* removed = intersection->target;
-			e = intersection->reverse;
-			if (e->prev == e)
-			{
-				removed->edges = NULL;
-			}
-			else
-			{
-				removed->edges = e->prev;
-				e->prev->link(e->next);
-				e->link(e);
-			}
-#ifdef DEBUG_CONVEX_HULL
-			printf("1: Removed part contains (%d %d %d)\n", removed->point.x, removed->point.y, removed->point.z);
-#endif
-			
-			Point64 n0 = intersection->face->getNormal();
-			Point64 n1 = intersection->reverse->face->getNormal();
-			int64_t m00 = face->dir0.dot(n0);
-			int64_t m01 = face->dir1.dot(n0);
-			int64_t m10 = face->dir0.dot(n1);
-			int64_t m11 = face->dir1.dot(n1);
-			int64_t r0 = (intersection->face->origin - shiftedOrigin).dot(n0);
-			int64_t r1 = (intersection->reverse->face->origin - shiftedOrigin).dot(n1);
-			Int128 det = Int128::mul(m00, m11) - Int128::mul(m01, m10);
-			btAssert(det.getSign() != 0);
-			Vertex* v = vertexPool.newObject();
-			v->point.index = -1;
-			v->copy = -1;
-			v->point128 = PointR128(Int128::mul(face->dir0.x * r0, m11) - Int128::mul(face->dir0.x * r1, m01)
-															+ Int128::mul(face->dir1.x * r1, m00) - Int128::mul(face->dir1.x * r0, m10) + det * shiftedOrigin.x,
-															Int128::mul(face->dir0.y * r0, m11) - Int128::mul(face->dir0.y * r1, m01)
-															+ Int128::mul(face->dir1.y * r1, m00) - Int128::mul(face->dir1.y * r0, m10) + det * shiftedOrigin.y,
-															Int128::mul(face->dir0.z * r0, m11) - Int128::mul(face->dir0.z * r1, m01)
-															+ Int128::mul(face->dir1.z * r1, m00) - Int128::mul(face->dir1.z * r0, m10) + det * shiftedOrigin.z,
-															det);
-			v->point.x = (int32_t) v->point128.xvalue();
-			v->point.y = (int32_t) v->point128.yvalue();
-			v->point.z = (int32_t) v->point128.zvalue();
-			intersection->target = v;
-			v->edges = e;
-
-			stack.push_back(v);
-			stack.push_back(removed);
-			stack.push_back(NULL);
-		}
-
-		if (cmp || prevCmp || (prevIntersection->reverse->next->target != intersection->target))
-		{
-			faceEdge = newEdgePair(prevIntersection->target, intersection->target);
-			if (prevCmp == 0)
-			{
-				faceEdge->link(prevIntersection->reverse->next);
-			}
-			if ((prevCmp == 0) || prevFaceEdge)
-			{
-				prevIntersection->reverse->link(faceEdge);
-			}
-			if (cmp == 0)
-			{
-				intersection->reverse->prev->link(faceEdge->reverse);
-			}
-			faceEdge->reverse->link(intersection->reverse);
-		}
-		else
-		{
-			faceEdge = prevIntersection->reverse->next;
-		}
-
-		if (prevFaceEdge)
-		{
-			if (prevCmp > 0)
-			{
-				faceEdge->link(prevFaceEdge->reverse);
-			}
-			else if (faceEdge != prevFaceEdge->reverse)
-			{
-				stack.push_back(prevFaceEdge->target);
-				while (faceEdge->next != prevFaceEdge->reverse)
-				{
-					Vertex* removed = faceEdge->next->target;
-					removeEdgePair(faceEdge->next);
-					stack.push_back(removed);
-#ifdef DEBUG_CONVEX_HULL
-					printf("2: Removed part contains (%d %d %d)\n", removed->point.x, removed->point.y, removed->point.z);
-#endif
-				}
-				stack.push_back(NULL);
-			}
-		}
-		faceEdge->face = face;
-		faceEdge->reverse->face = intersection->face;
-
-		if (!firstFaceEdge)
-		{
-			firstFaceEdge = faceEdge;
-		}
-	}
-#ifdef SHOW_ITERATIONS
-	printf("Needed %d iterations to process all intersections\n", m);
-#endif
-
-	if (cmp > 0)
-	{
-		firstFaceEdge->reverse->target = faceEdge->target;
-		firstIntersection->reverse->link(firstFaceEdge);
-		firstFaceEdge->link(faceEdge->reverse);
-	}
-	else if (firstFaceEdge != faceEdge->reverse)
-	{
-		stack.push_back(faceEdge->target);
-		while (firstFaceEdge->next != faceEdge->reverse)
-		{
-			Vertex* removed = firstFaceEdge->next->target;
-			removeEdgePair(firstFaceEdge->next);
-			stack.push_back(removed);
-#ifdef DEBUG_CONVEX_HULL
-			printf("3: Removed part contains (%d %d %d)\n", removed->point.x, removed->point.y, removed->point.z);
-#endif
-		}
-		stack.push_back(NULL);
-	}
-
-	btAssert(stack.size() > 0);
-	vertexList = stack[0];
-
-#ifdef DEBUG_CONVEX_HULL
-	printf("Removing part\n");
-#endif
-#ifdef SHOW_ITERATIONS
-	n = 0;
-#endif
-	int pos = 0;
-	while (pos < stack.size())
-	{
-		int end = stack.size();
-		while (pos < end)
-		{
-			Vertex* kept = stack[pos++];
-#ifdef DEBUG_CONVEX_HULL
-			kept->print();
-#endif
-			bool deeper = false;
-			Vertex* removed;
-			while ((removed = stack[pos++]) != NULL)
-			{
-#ifdef SHOW_ITERATIONS
-				n++;
-#endif
-				kept->receiveNearbyFaces(removed);
-				while (removed->edges)
-				{
-					if (!deeper)
-					{
-						deeper = true;
-						stack.push_back(kept);
-					}
-					stack.push_back(removed->edges->target);
-					removeEdgePair(removed->edges);
-				}
-			}
-			if (deeper)
-			{
-				stack.push_back(NULL);
-			}
-		}
-	}
-#ifdef SHOW_ITERATIONS
-	printf("Needed %d iterations to remove part\n", n);
-#endif
-
-	stack.resize(0);
-	face->origin = shiftedOrigin;
-
-	return true;
-}
-
-
-static int getVertexCopy(btConvexHullInternal::Vertex* vertex, btAlignedObjectArray<btConvexHullInternal::Vertex*>& vertices)
-{
-	int index = vertex->copy;
-	if (index < 0)
-	{
-		index = vertices.size();
-		vertex->copy = index;
-		vertices.push_back(vertex);
-#ifdef DEBUG_CONVEX_HULL
-		printf("Vertex %d gets index *%d\n", vertex->point.index, index);
-#endif
-	}
-	return index;
-}
-
-btScalar btConvexHullComputer::compute(const void* coords, bool doubleCoords, int stride, int count, btScalar shrink, btScalar shrinkClamp)
-{
-	if (count <= 0)
-	{
-		vertices.clear();
-		edges.clear();
-		faces.clear();
-		return 0;
-	}
-
-	btConvexHullInternal hull;
-	hull.compute(coords, doubleCoords, stride, count);
-
-	btScalar shift = 0;
-	if ((shrink > 0) && ((shift = hull.shrink(shrink, shrinkClamp)) < 0))
-	{
-		vertices.clear();
-		edges.clear();
-		faces.clear();
-		return shift;
-	}
-
-	vertices.resize(0);
-	edges.resize(0);
-	faces.resize(0);
-
-	btAlignedObjectArray<btConvexHullInternal::Vertex*> oldVertices;
-	getVertexCopy(hull.vertexList, oldVertices);
-	int copied = 0;
-	while (copied < oldVertices.size())
-	{
-		btConvexHullInternal::Vertex* v = oldVertices[copied];
-		vertices.push_back(hull.getCoordinates(v));
-		btConvexHullInternal::Edge* firstEdge = v->edges;
-		if (firstEdge)
-		{
-			int firstCopy = -1;
-			int prevCopy = -1;
-			btConvexHullInternal::Edge* e = firstEdge;
-			do
-			{
-				if (e->copy < 0)
-				{
-					int s = edges.size();
-					edges.push_back(Edge());
-					edges.push_back(Edge());
-					Edge* c = &edges[s];
-					Edge* r = &edges[s + 1];
-					e->copy = s;
-					e->reverse->copy = s + 1;
-					c->reverse = 1;
-					r->reverse = -1;
-					c->targetVertex = getVertexCopy(e->target, oldVertices);
-					r->targetVertex = copied;
-#ifdef DEBUG_CONVEX_HULL
-					printf("      CREATE: Vertex *%d has edge to *%d\n", copied, c->getTargetVertex());
-#endif
-				}
-				if (prevCopy >= 0)
-				{
-					edges[e->copy].next = prevCopy - e->copy;
-				}
-				else
-				{
-					firstCopy = e->copy;
-				}
-				prevCopy = e->copy;
-				e = e->next;
-			} while (e != firstEdge);
-			edges[firstCopy].next = prevCopy - firstCopy;
-		}
-		copied++;
-	}
-
-	for (int i = 0; i < copied; i++)
-	{
-		btConvexHullInternal::Vertex* v = oldVertices[i];
-		btConvexHullInternal::Edge* firstEdge = v->edges;
-		if (firstEdge)
-		{
-			btConvexHullInternal::Edge* e = firstEdge;
-			do
-			{
-				if (e->copy >= 0)
-				{
-#ifdef DEBUG_CONVEX_HULL
-					printf("Vertex *%d has edge to *%d\n", i, edges[e->copy].getTargetVertex());
-#endif
-					faces.push_back(e->copy);
-					btConvexHullInternal::Edge* f = e;
-					do
-					{
-#ifdef DEBUG_CONVEX_HULL
-						printf("   Face *%d\n", edges[f->copy].getTargetVertex());
-#endif
-						f->copy = -1;
-						f = f->reverse->prev;
-					} while (f != e);
-				}
-				e = e->next;
-			} while (e != firstEdge);
-		}
-	}
-
-	return shift;
-}
-
-
-
-
-
+/*
+Copyright (c) 2011 Ole Kniemeyer, MAXON, www.maxon.net
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include <string.h>
+
+#include "btConvexHullComputer.h"
+#include "btAlignedObjectArray.h"
+#include "btMinMax.h"
+#include "btVector3.h"
+
+#ifdef __GNUC__
+	#include <stdint.h>
+#elif defined(_MSC_VER)
+	typedef __int32 int32_t;
+	typedef __int64 int64_t;
+	typedef unsigned __int32 uint32_t;
+	typedef unsigned __int64 uint64_t;
+#else
+	typedef int int32_t;
+	typedef long long int int64_t;
+	typedef unsigned int uint32_t;
+	typedef unsigned long long int uint64_t;
+#endif
+
+
+//The definition of USE_X86_64_ASM is moved into the build system. You can enable it manually by commenting out the following lines
+//#if (defined(__GNUC__) && defined(__x86_64__) && !defined(__ICL))  // || (defined(__ICL) && defined(_M_X64))   bug in Intel compiler, disable inline assembly
+//	#define USE_X86_64_ASM
+//#endif
+
+
+//#define DEBUG_CONVEX_HULL
+//#define SHOW_ITERATIONS
+
+#if defined(DEBUG_CONVEX_HULL) || defined(SHOW_ITERATIONS)
+	#include <stdio.h>
+#endif
+
+// Convex hull implementation based on Preparata and Hong
+// Ole Kniemeyer, MAXON Computer GmbH
+class btConvexHullInternal
+{
+	public:
+		
+		class Point64
+		{
+			public:
+				int64_t x;
+				int64_t y;
+				int64_t z;
+				
+				Point64(int64_t x, int64_t y, int64_t z): x(x), y(y), z(z)
+				{
+				}
+
+				bool isZero()
+				{
+					return (x == 0) && (y == 0) && (z == 0);
+				}
+
+				int64_t dot(const Point64& b) const
+				{
+					return x * b.x + y * b.y + z * b.z;
+				}
+		};
+		
+		class Point32
+		{
+			public:
+				int32_t x;
+				int32_t y;
+				int32_t z;
+				int index;
+				
+				Point32()
+				{
+				}
+				
+				Point32(int32_t x, int32_t y, int32_t z): x(x), y(y), z(z), index(-1)
+				{
+				}
+				
+				bool operator==(const Point32& b) const
+				{
+					return (x == b.x) && (y == b.y) && (z == b.z);
+				}
+
+				bool operator!=(const Point32& b) const
+				{
+					return (x != b.x) || (y != b.y) || (z != b.z);
+				}
+
+				bool isZero()
+				{
+					return (x == 0) && (y == 0) && (z == 0);
+				}
+
+				Point64 cross(const Point32& b) const
+				{
+					return Point64(y * b.z - z * b.y, z * b.x - x * b.z, x * b.y - y * b.x);
+				}
+
+				Point64 cross(const Point64& b) const
+				{
+					return Point64(y * b.z - z * b.y, z * b.x - x * b.z, x * b.y - y * b.x);
+				}
+
+				int64_t dot(const Point32& b) const
+				{
+					return x * b.x + y * b.y + z * b.z;
+				}
+
+				int64_t dot(const Point64& b) const
+				{
+					return x * b.x + y * b.y + z * b.z;
+				}
+
+				Point32 operator+(const Point32& b) const
+				{
+					return Point32(x + b.x, y + b.y, z + b.z);
+				}
+
+				Point32 operator-(const Point32& b) const
+				{
+					return Point32(x - b.x, y - b.y, z - b.z);
+				}
+		};
+
+		class Int128
+		{
+			public:
+				uint64_t low;
+				uint64_t high;
+
+				Int128()
+				{
+				}
+
+				Int128(uint64_t low, uint64_t high): low(low), high(high)
+				{
+				}
+
+				Int128(uint64_t low): low(low), high(0)
+				{
+				}
+
+				Int128(int64_t value): low(value), high((value >= 0) ? 0 : (uint64_t) -1LL)
+				{
+				}
+
+				static Int128 mul(int64_t a, int64_t b);
+
+				static Int128 mul(uint64_t a, uint64_t b);
+
+				Int128 operator-() const
+				{
+					return Int128((uint64_t) -(int64_t)low, ~high + (low == 0));
+				}
+
+				Int128 operator+(const Int128& b) const
+				{
+#ifdef USE_X86_64_ASM
+					Int128 result;
+					__asm__ ("addq %[bl], %[rl]\n\t"
+									 "adcq %[bh], %[rh]\n\t"
+									 : [rl] "=r" (result.low), [rh] "=r" (result.high)
+									 : "0"(low), "1"(high), [bl] "g"(b.low), [bh] "g"(b.high)
+									 : "cc" );
+					return result;
+#else
+					uint64_t lo = low + b.low;
+					return Int128(lo, high + b.high + (lo < low));
+#endif
+				}
+
+				Int128 operator-(const Int128& b) const
+				{
+#ifdef USE_X86_64_ASM
+					Int128 result;
+					__asm__ ("subq %[bl], %[rl]\n\t"
+									 "sbbq %[bh], %[rh]\n\t"
+									 : [rl] "=r" (result.low), [rh] "=r" (result.high)
+									 : "0"(low), "1"(high), [bl] "g"(b.low), [bh] "g"(b.high)
+									 : "cc" );
+					return result;
+#else
+					return *this + -b;
+#endif
+				}
+
+				Int128& operator+=(const Int128& b)
+				{
+#ifdef USE_X86_64_ASM
+					__asm__ ("addq %[bl], %[rl]\n\t"
+									 "adcq %[bh], %[rh]\n\t"
+									 : [rl] "=r" (low), [rh] "=r" (high)
+									 : "0"(low), "1"(high), [bl] "g"(b.low), [bh] "g"(b.high)
+									 : "cc" );
+#else
+					uint64_t lo = low + b.low;
+					if (lo < low)
+					{
+						++high;
+					}
+					low = lo;
+					high += b.high;
+#endif
+					return *this;
+				}
+
+				Int128& operator++()
+				{
+					if (++low == 0)
+					{
+						++high;
+					}
+					return *this;
+				}
+
+				Int128 operator*(int64_t b) const;
+
+				btScalar toScalar() const
+				{
+					return ((int64_t) high >= 0) ? btScalar(high) * (btScalar(0x100000000LL) * btScalar(0x100000000LL)) + btScalar(low)
+						: -(-*this).toScalar();
+				}
+
+				int getSign() const
+				{
+					return ((int64_t) high < 0) ? -1 : (high || low) ? 1 : 0;
+				}
+
+				bool operator<(const Int128& b) const
+				{
+					return (high < b.high) || ((high == b.high) && (low < b.low));
+				}
+
+				int ucmp(const Int128&b) const
+				{
+					if (high < b.high)
+					{
+						return -1;
+					}
+					if (high > b.high)
+					{
+						return 1;
+					}
+					if (low < b.low)
+					{
+						return -1;
+					}
+					if (low > b.low)
+					{
+						return 1;
+					}
+					return 0;
+				}
+		};
+
+
+		class Rational64
+		{
+			private:
+				uint64_t m_numerator;
+				uint64_t m_denominator;
+				int sign;
+				
+			public:
+				Rational64(int64_t numerator, int64_t denominator)
+				{
+					if (numerator > 0)
+					{
+						sign = 1;
+						m_numerator = (uint64_t) numerator;
+					}
+					else if (numerator < 0)
+					{
+						sign = -1;
+						m_numerator = (uint64_t) -numerator;
+					}
+					else
+					{
+						sign = 0;
+						m_numerator = 0;
+					}
+					if (denominator > 0)
+					{
+						m_denominator = (uint64_t) denominator;
+					}
+					else if (denominator < 0)
+					{
+						sign = -sign;
+						m_denominator = (uint64_t) -denominator;
+					}
+					else
+					{
+						m_denominator = 0;
+					}
+				}
+				
+				bool isNegativeInfinity() const
+				{
+					return (sign < 0) && (m_denominator == 0);
+				}
+				
+				bool isNaN() const
+				{
+					return (sign == 0) && (m_denominator == 0);
+				}
+				
+				int compare(const Rational64& b) const;
+				
+				btScalar toScalar() const
+				{
+					return sign * ((m_denominator == 0) ? SIMD_INFINITY : (btScalar) m_numerator / m_denominator);
+				}
+		};
+
+
+		class Rational128
+		{
+			private:
+				Int128 numerator;
+				Int128 denominator;
+				int sign;
+				bool isInt64;
+
+			public:
+				Rational128(int64_t value)
+				{
+					if (value > 0)
+					{
+						sign = 1;
+						this->numerator = value;
+					}
+					else if (value < 0)
+					{
+						sign = -1;
+						this->numerator = -value;
+					}
+					else
+					{
+						sign = 0;
+						this->numerator = (uint64_t) 0;
+					}
+					this->denominator = (uint64_t) 1;
+					isInt64 = true;
+				}
+
+				Rational128(const Int128& numerator, const Int128& denominator)
+				{
+					sign = numerator.getSign();
+					if (sign >= 0)
+					{
+						this->numerator = numerator;
+					}
+					else
+					{
+						this->numerator = -numerator;
+					}
+					int dsign = denominator.getSign();
+					if (dsign >= 0)
+					{
+						this->denominator = denominator;
+					}
+					else
+					{
+						sign = -sign;
+						this->denominator = -denominator;
+					}
+					isInt64 = false;
+				}
+
+				int compare(const Rational128& b) const;
+
+				int compare(int64_t b) const;
+
+				btScalar toScalar() const
+				{
+					return sign * ((denominator.getSign() == 0) ? SIMD_INFINITY : numerator.toScalar() / denominator.toScalar());
+				}
+		};
+
+		class PointR128
+		{
+			public:
+				Int128 x;
+				Int128 y;
+				Int128 z;
+				Int128 denominator;
+
+				PointR128()
+				{
+				}
+
+				PointR128(Int128 x, Int128 y, Int128 z, Int128 denominator): x(x), y(y), z(z), denominator(denominator)
+				{
+				}
+
+				btScalar xvalue() const
+				{
+					return x.toScalar() / denominator.toScalar();
+				}
+
+				btScalar yvalue() const
+				{
+					return y.toScalar() / denominator.toScalar();
+				}
+
+				btScalar zvalue() const
+				{
+					return z.toScalar() / denominator.toScalar();
+				}
+		};
+
+
+		class Edge;
+		class Face;
+
+		class Vertex
+		{
+			public:
+				Vertex* next;
+				Vertex* prev;
+				Edge* edges;
+				Face* firstNearbyFace;
+				Face* lastNearbyFace;
+				PointR128 point128;
+				Point32 point;
+				int copy;
+				
+				Vertex(): next(NULL), prev(NULL), edges(NULL), firstNearbyFace(NULL), lastNearbyFace(NULL), copy(-1)
+				{
+				}
+
+#ifdef DEBUG_CONVEX_HULL
+				void print()
+				{
+					printf("V%d (%d, %d, %d)", point.index, point.x, point.y, point.z);
+				}
+
+				void printGraph();
+#endif
+
+				Point32 operator-(const Vertex& b) const
+				{
+					return point - b.point;
+				}
+
+				Rational128 dot(const Point64& b) const
+				{
+					return (point.index >= 0) ? Rational128(point.dot(b))
+						: Rational128(point128.x * b.x + point128.y * b.y + point128.z * b.z, point128.denominator);
+				}
+
+				btScalar xvalue() const
+				{
+					return (point.index >= 0) ? btScalar(point.x) : point128.xvalue();
+				}
+
+				btScalar yvalue() const
+				{
+					return (point.index >= 0) ? btScalar(point.y) : point128.yvalue();
+				}
+
+				btScalar zvalue() const
+				{
+					return (point.index >= 0) ? btScalar(point.z) : point128.zvalue();
+				}
+
+				void receiveNearbyFaces(Vertex* src)
+				{
+					if (lastNearbyFace)
+					{
+						lastNearbyFace->nextWithSameNearbyVertex = src->firstNearbyFace;
+					}
+					else
+					{
+						firstNearbyFace = src->firstNearbyFace;
+					}
+					if (src->lastNearbyFace)
+					{
+						lastNearbyFace = src->lastNearbyFace;
+					}
+					for (Face* f = src->firstNearbyFace; f; f = f->nextWithSameNearbyVertex)
+					{
+						btAssert(f->nearbyVertex == src);
+						f->nearbyVertex = this;
+					}
+					src->firstNearbyFace = NULL;
+					src->lastNearbyFace = NULL;
+				}
+		};
+
+
+		class Edge
+		{
+			public:
+				Edge* next;
+				Edge* prev;
+				Edge* reverse;
+				Vertex* target;
+				Face* face;
+				int copy;
+
+				~Edge()
+				{
+					next = NULL;
+					prev = NULL;
+					reverse = NULL;
+					target = NULL;
+					face = NULL;
+				}
+
+				void link(Edge* n)
+				{
+					btAssert(reverse->target == n->reverse->target);
+					next = n;
+					n->prev = this;
+				}
+
+#ifdef DEBUG_CONVEX_HULL
+				void print()
+				{
+					printf("E%p : %d -> %d,  n=%p p=%p   (0 %d\t%d\t%d) -> (%d %d %d)", this, reverse->target->point.index, target->point.index, next, prev,
+								 reverse->target->point.x, reverse->target->point.y, reverse->target->point.z, target->point.x, target->point.y, target->point.z);
+				}
+#endif
+		};
+
+		class Face
+		{
+			public:
+				Face* next;
+				Vertex* nearbyVertex;
+				Face* nextWithSameNearbyVertex;
+				Point32 origin;
+				Point32 dir0;
+				Point32 dir1;
+
+				Face(): next(NULL), nearbyVertex(NULL), nextWithSameNearbyVertex(NULL)
+				{
+				}
+
+				void init(Vertex* a, Vertex* b, Vertex* c)
+				{
+					nearbyVertex = a;
+					origin = a->point;
+					dir0 = *b - *a;
+					dir1 = *c - *a;
+					if (a->lastNearbyFace)
+					{
+						a->lastNearbyFace->nextWithSameNearbyVertex = this;
+					}
+					else
+					{
+						a->firstNearbyFace = this;
+					}
+					a->lastNearbyFace = this;
+				}
+
+				Point64 getNormal()
+				{
+					return dir0.cross(dir1);
+				}
+		};
+
+		template<typename UWord, typename UHWord> class DMul
+		{
+			private:
+				static uint32_t high(uint64_t value)
+				{
+					return (uint32_t) (value >> 32);
+				}
+				
+				static uint32_t low(uint64_t value)
+				{
+					return (uint32_t) value;
+				}
+				
+				static uint64_t mul(uint32_t a, uint32_t b)
+				{
+					return (uint64_t) a * (uint64_t) b;
+				}
+				
+				static void shlHalf(uint64_t& value)
+				{
+					value <<= 32;
+				}
+				
+				static uint64_t high(Int128 value)
+				{
+					return value.high;
+				}
+				
+				static uint64_t low(Int128 value)
+				{
+					return value.low;
+				}
+				
+				static Int128 mul(uint64_t a, uint64_t b)
+				{
+					return Int128::mul(a, b);
+				}
+				
+				static void shlHalf(Int128& value)
+				{
+					value.high = value.low;
+					value.low = 0;
+				}
+				
+			public:
+				
+				static void mul(UWord a, UWord b, UWord& resLow, UWord& resHigh)
+				{
+					UWord p00 = mul(low(a), low(b));
+					UWord p01 = mul(low(a), high(b));
+					UWord p10 = mul(high(a), low(b));
+					UWord p11 = mul(high(a), high(b));
+					UWord p0110 = UWord(low(p01)) + UWord(low(p10));
+					p11 += high(p01);
+					p11 += high(p10);
+					p11 += high(p0110);
+					shlHalf(p0110);
+					p00 += p0110;
+					if (p00 < p0110)
+					{
+						++p11;
+					}
+					resLow = p00;
+					resHigh = p11;
+				}
+		};
+	
+	private:
+
+		class IntermediateHull
+		{
+			public:
+				Vertex* minXy;
+				Vertex* maxXy;
+				Vertex* minYx;
+				Vertex* maxYx;
+				
+				IntermediateHull(): minXy(NULL), maxXy(NULL), minYx(NULL), maxYx(NULL)
+				{
+				}
+				
+				void print();
+		};
+	
+		enum Orientation {NONE, CLOCKWISE, COUNTER_CLOCKWISE};
+
+		template <typename T> class PoolArray
+		{
+			private:
+				T* array;
+				int size;
+
+			public:
+				PoolArray<T>* next;
+
+				PoolArray(int size): size(size), next(NULL)
+				{
+					array = (T*) btAlignedAlloc(sizeof(T) * size, 16);
+				}
+
+				~PoolArray()
+				{
+					btAlignedFree(array);
+				}
+
+				T* init()
+				{
+					T* o = array;
+					for (int i = 0; i < size; i++, o++)
+					{
+						o->next = (i+1 < size) ? o + 1 : NULL;
+					}
+					return array;
+				}
+		};
+
+		template <typename T> class Pool
+		{
+			private:
+				PoolArray<T>* arrays;
+				PoolArray<T>* nextArray;
+				T* freeObjects;
+				int arraySize;
+
+			public:
+				Pool(): arrays(NULL), nextArray(NULL), freeObjects(NULL), arraySize(256)
+				{
+				}
+
+				~Pool()
+				{
+					while (arrays)
+					{
+						PoolArray<T>* p = arrays;
+						arrays = p->next;
+						p->~PoolArray<T>();
+						btAlignedFree(p);
+					}
+				}
+
+				void reset()
+				{
+					nextArray = arrays;
+					freeObjects = NULL;
+				}
+
+				void setArraySize(int arraySize)
+				{
+					this->arraySize = arraySize;
+				}
+
+				T* newObject()
+				{
+					T* o = freeObjects;
+					if (!o)
+					{
+						PoolArray<T>* p = nextArray;
+						if (p)
+						{
+							nextArray = p->next;
+						}
+						else
+						{
+							p = new(btAlignedAlloc(sizeof(PoolArray<T>), 16)) PoolArray<T>(arraySize);
+							p->next = arrays;
+							arrays = p;
+						}
+						o = p->init();
+					}
+					freeObjects = o->next;
+					return new(o) T();
+				};
+
+				void freeObject(T* object)
+				{
+					object->~T();
+					object->next = freeObjects;
+					freeObjects = object;
+				}
+		};
+
+		btVector3 scaling;
+		btVector3 center;
+		Pool<Vertex> vertexPool;
+		Pool<Edge> edgePool;
+		Pool<Face> facePool;
+		btAlignedObjectArray<Vertex*> originalVertices;
+		int mergeStamp;
+		int minAxis;
+		int medAxis;
+		int maxAxis;
+		int usedEdgePairs;
+		int maxUsedEdgePairs;
+
+		static Orientation getOrientation(const Edge* prev, const Edge* next, const Point32& s, const Point32& t);
+		Edge* findMaxAngle(bool ccw, const Vertex* start, const Point32& s, const Point64& rxs, const Point64& sxrxs, Rational64& minCot);
+		void findEdgeForCoplanarFaces(Vertex* c0, Vertex* c1, Edge*& e0, Edge*& e1, Vertex* stop0, Vertex* stop1);
+
+		Edge* newEdgePair(Vertex* from, Vertex* to);
+
+		void removeEdgePair(Edge* edge)
+		{
+			Edge* n = edge->next;
+			Edge* r = edge->reverse;
+
+			btAssert(edge->target && r->target);
+
+			if (n != edge)
+			{
+				n->prev = edge->prev;
+				edge->prev->next = n;
+				r->target->edges = n;
+			}
+			else
+			{
+				r->target->edges = NULL;
+			}
+			
+			n = r->next;
+			
+			if (n != r)
+			{
+				n->prev = r->prev;
+				r->prev->next = n;
+				edge->target->edges = n;
+			}
+			else
+			{
+				edge->target->edges = NULL;
+			}
+
+			edgePool.freeObject(edge);
+			edgePool.freeObject(r);
+			usedEdgePairs--;
+		}
+		
+		void computeInternal(int start, int end, IntermediateHull& result);
+		
+		bool mergeProjection(IntermediateHull& h0, IntermediateHull& h1, Vertex*& c0, Vertex*& c1);
+		
+		void merge(IntermediateHull& h0, IntermediateHull& h1);
+
+		btVector3 toBtVector(const Point32& v);
+
+		btVector3 getBtNormal(Face* face);
+
+		bool shiftFace(Face* face, btScalar amount, btAlignedObjectArray<Vertex*> stack);
+
+	public:
+		Vertex* vertexList;
+
+		void compute(const void* coords, bool doubleCoords, int stride, int count);
+
+		btVector3 getCoordinates(const Vertex* v);
+
+		btScalar shrink(btScalar amount, btScalar clampAmount);
+};
+
+
+btConvexHullInternal::Int128 btConvexHullInternal::Int128::operator*(int64_t b) const
+{
+	bool negative = (int64_t) high < 0;
+	Int128 a = negative ? -*this : *this;
+	if (b < 0)
+	{
+		negative = !negative;
+		b = -b;
+	}
+	Int128 result = mul(a.low, (uint64_t) b);
+	result.high += a.high * (uint64_t) b;
+	return negative ? -result : result;
+}
+
+btConvexHullInternal::Int128 btConvexHullInternal::Int128::mul(int64_t a, int64_t b)
+{
+	Int128 result;
+	
+#ifdef USE_X86_64_ASM
+	__asm__ ("imulq %[b]"
+					 : "=a" (result.low), "=d" (result.high)
+					 : "0"(a), [b] "r"(b)
+					 : "cc" );
+	return result;
+	
+#else
+	bool negative = a < 0;
+	if (negative)
+	{
+		a = -a;
+	}
+	if (b < 0)
+	{
+		negative = !negative;
+		b = -b;
+	}
+	DMul<uint64_t, uint32_t>::mul((uint64_t) a, (uint64_t) b, result.low, result.high);
+	return negative ? -result : result;
+#endif
+}
+
+btConvexHullInternal::Int128 btConvexHullInternal::Int128::mul(uint64_t a, uint64_t b)
+{
+	Int128 result;
+
+#ifdef USE_X86_64_ASM
+	__asm__ ("mulq %[b]"
+					 : "=a" (result.low), "=d" (result.high)
+					 : "0"(a), [b] "r"(b)
+					 : "cc" );
+
+#else
+	DMul<uint64_t, uint32_t>::mul(a, b, result.low, result.high);
+#endif
+
+	return result;
+}
+
+int btConvexHullInternal::Rational64::compare(const Rational64& b) const
+{
+	if (sign != b.sign)
+	{
+		return sign - b.sign;
+	}
+	else if (sign == 0)
+	{
+		return 0;
+	}
+
+	//	return (numerator * b.denominator > b.numerator * denominator) ? sign : (numerator * b.denominator < b.numerator * denominator) ? -sign : 0;
+
+#ifdef USE_X86_64_ASM
+
+	int result;
+	int64_t tmp;
+	int64_t dummy;
+	__asm__ ("mulq %[bn]\n\t"
+					 "movq %%rax, %[tmp]\n\t"
+					 "movq %%rdx, %%rbx\n\t"
+					 "movq %[tn], %%rax\n\t"
+					 "mulq %[bd]\n\t"
+					 "subq %[tmp], %%rax\n\t"
+					 "sbbq %%rbx, %%rdx\n\t" // rdx:rax contains 128-bit-difference "numerator*b.denominator - b.numerator*denominator"
+					 "setnsb %%bh\n\t" // bh=1 if difference is non-negative, bh=0 otherwise
+					 "orq %%rdx, %%rax\n\t"
+					 "setnzb %%bl\n\t" // bl=1 if difference if non-zero, bl=0 if it is zero
+					 "decb %%bh\n\t" // now bx=0x0000 if difference is zero, 0xff01 if it is negative, 0x0001 if it is positive (i.e., same sign as difference)
+					 "shll $16, %%ebx\n\t" // ebx has same sign as difference
+					 : "=&b"(result), [tmp] "=&r"(tmp), "=a"(dummy)
+					 : "a"(denominator), [bn] "g"(b.numerator), [tn] "g"(numerator), [bd] "g"(b.denominator)
+					 : "%rdx", "cc" );
+	return result ? result ^ sign // if sign is +1, only bit 0 of result is inverted, which does not change the sign of result (and cannot result in zero)
+																// if sign is -1, all bits of result are inverted, which changes the sign of result (and again cannot result in zero)
+								: 0;
+
+#else
+
+	return sign * Int128::mul(m_numerator, b.m_denominator).ucmp(Int128::mul(m_denominator, b.m_numerator));
+
+#endif
+}
+
+int btConvexHullInternal::Rational128::compare(const Rational128& b) const
+{
+	if (sign != b.sign)
+	{
+		return sign - b.sign;
+	}
+	else if (sign == 0)
+	{
+		return 0;
+	}
+	if (isInt64)
+	{
+		return -b.compare(sign * (int64_t) numerator.low);
+	}
+
+	Int128 nbdLow, nbdHigh, dbnLow, dbnHigh;
+	DMul<Int128, uint64_t>::mul(numerator, b.denominator, nbdLow, nbdHigh);
+	DMul<Int128, uint64_t>::mul(denominator, b.numerator, dbnLow, dbnHigh);
+
+	int cmp = nbdHigh.ucmp(dbnHigh);
+	if (cmp)
+	{
+		return cmp * sign;
+	}
+	return nbdLow.ucmp(dbnLow) * sign;
+}
+
+int btConvexHullInternal::Rational128::compare(int64_t b) const
+{
+	if (isInt64)
+	{
+		int64_t a = sign * (int64_t) numerator.low;
+		return (a > b) ? 1 : (a < b) ? -1 : 0;
+	}
+	if (b > 0)
+	{
+		if (sign <= 0)
+		{
+			return -1;
+		}
+	}
+	else if (b < 0)
+	{
+		if (sign >= 0)
+		{
+			return 1;
+		}
+		b = -b;
+	}
+	else
+	{
+		return sign;
+	}
+
+	return numerator.ucmp(denominator * b) * sign;
+}
+
+
+btConvexHullInternal::Edge* btConvexHullInternal::newEdgePair(Vertex* from, Vertex* to)
+{
+	btAssert(from && to);
+	Edge* e = edgePool.newObject();
+	Edge* r = edgePool.newObject();
+	e->reverse = r;
+	r->reverse = e;
+	e->copy = mergeStamp;
+	r->copy = mergeStamp;
+	e->target = to;
+	r->target = from;
+	e->face = NULL;
+	r->face = NULL;
+	usedEdgePairs++;
+	if (usedEdgePairs > maxUsedEdgePairs)
+	{
+		maxUsedEdgePairs = usedEdgePairs;
+	}
+	return e;
+}
+
+bool btConvexHullInternal::mergeProjection(IntermediateHull& h0, IntermediateHull& h1, Vertex*& c0, Vertex*& c1)
+{
+	Vertex* v0 = h0.maxYx;
+	Vertex* v1 = h1.minYx;
+	if ((v0->point.x == v1->point.x) && (v0->point.y == v1->point.y))
+	{
+		btAssert(v0->point.z < v1->point.z);
+		Vertex* v1p = v1->prev;
+		if (v1p == v1)
+		{
+			c0 = v0;
+			if (v1->edges)
+			{
+				btAssert(v1->edges->next == v1->edges);
+				v1 = v1->edges->target;
+				btAssert(v1->edges->next == v1->edges);
+			}
+			c1 = v1;
+			return false;
+		}
+		Vertex* v1n = v1->next;
+		v1p->next = v1n;
+		v1n->prev = v1p;
+		if (v1 == h1.minXy)
+		{
+			if ((v1n->point.x < v1p->point.x) || ((v1n->point.x == v1p->point.x) && (v1n->point.y < v1p->point.y)))
+			{
+				h1.minXy = v1n;
+			}
+			else
+			{
+				h1.minXy = v1p;
+			}
+		}
+		if (v1 == h1.maxXy)
+		{
+			if ((v1n->point.x > v1p->point.x) || ((v1n->point.x == v1p->point.x) && (v1n->point.y > v1p->point.y)))
+			{
+				h1.maxXy = v1n;
+			}
+			else
+			{
+				h1.maxXy = v1p;
+			}
+		}
+	}
+	
+	v0 = h0.maxXy;
+	v1 = h1.maxXy;
+	Vertex* v00 = NULL;
+	Vertex* v10 = NULL;
+	int32_t sign = 1;
+
+	for (int side = 0; side <= 1; side++)
+	{		
+		int32_t dx = (v1->point.x - v0->point.x) * sign;
+		if (dx > 0)
+		{
+			while (true)
+			{
+				int32_t dy = v1->point.y - v0->point.y;
+
+				Vertex* w0 = side ? v0->next : v0->prev;
+				if (w0 != v0)
+				{
+					int32_t dx0 = (w0->point.x - v0->point.x) * sign;
+					int32_t dy0 = w0->point.y - v0->point.y;
+					if ((dy0 <= 0) && ((dx0 == 0) || ((dx0 < 0) && (dy0 * dx <= dy * dx0))))
+					{
+						v0 = w0;
+						dx = (v1->point.x - v0->point.x) * sign;
+						continue;
+					}
+				}
+
+				Vertex* w1 = side ? v1->next : v1->prev;
+				if (w1 != v1)
+				{
+					int32_t dx1 = (w1->point.x - v1->point.x) * sign;
+					int32_t dy1 = w1->point.y - v1->point.y;
+					int32_t dxn = (w1->point.x - v0->point.x) * sign;
+					if ((dxn > 0) && (dy1 < 0) && ((dx1 == 0) || ((dx1 < 0) && (dy1 * dx < dy * dx1))))
+					{
+						v1 = w1;
+						dx = dxn;
+						continue;
+					}
+				}
+
+				break;
+			}
+		}
+		else if (dx < 0)
+		{
+			while (true)
+			{
+				int32_t dy = v1->point.y - v0->point.y;
+				
+				Vertex* w1 = side ? v1->prev : v1->next;
+				if (w1 != v1)
+				{
+					int32_t dx1 = (w1->point.x - v1->point.x) * sign;
+					int32_t dy1 = w1->point.y - v1->point.y;
+					if ((dy1 >= 0) && ((dx1 == 0) || ((dx1 < 0) && (dy1 * dx <= dy * dx1))))
+					{
+						v1 = w1;
+						dx = (v1->point.x - v0->point.x) * sign;
+						continue;
+					}
+				}
+				
+				Vertex* w0 = side ? v0->prev : v0->next;
+				if (w0 != v0)
+				{
+					int32_t dx0 = (w0->point.x - v0->point.x) * sign;
+					int32_t dy0 = w0->point.y - v0->point.y;
+					int32_t dxn = (v1->point.x - w0->point.x) * sign;
+					if ((dxn < 0) && (dy0 > 0) && ((dx0 == 0) || ((dx0 < 0) && (dy0 * dx < dy * dx0))))
+					{
+						v0 = w0;
+						dx = dxn;
+						continue;
+					}
+				}
+				
+				break;
+			}
+		}
+		else
+		{
+			int32_t x = v0->point.x;
+			int32_t y0 = v0->point.y;
+			Vertex* w0 = v0;
+			Vertex* t;
+			while (((t = side ? w0->next : w0->prev) != v0) && (t->point.x == x) && (t->point.y <= y0))
+			{
+				w0 = t;
+				y0 = t->point.y;
+			}
+			v0 = w0;
+
+			int32_t y1 = v1->point.y;
+			Vertex* w1 = v1;
+			while (((t = side ? w1->prev : w1->next) != v1) && (t->point.x == x) && (t->point.y >= y1))
+			{
+				w1 = t;
+				y1 = t->point.y;
+			}
+			v1 = w1;
+		}
+		
+		if (side == 0)
+		{
+			v00 = v0;
+			v10 = v1;
+
+			v0 = h0.minXy;
+			v1 = h1.minXy;
+			sign = -1;
+		}
+	}
+
+	v0->prev = v1;
+	v1->next = v0;
+
+	v00->next = v10;
+	v10->prev = v00;
+
+	if (h1.minXy->point.x < h0.minXy->point.x)
+	{
+		h0.minXy = h1.minXy;
+	}
+	if (h1.maxXy->point.x >= h0.maxXy->point.x)
+	{
+		h0.maxXy = h1.maxXy;
+	}
+	
+	h0.maxYx = h1.maxYx;
+
+	c0 = v00;
+	c1 = v10;
+
+	return true;
+}
+
+void btConvexHullInternal::computeInternal(int start, int end, IntermediateHull& result)
+{
+	int n = end - start;
+	switch (n)
+	{
+		case 0:
+			result.minXy = NULL;
+			result.maxXy = NULL;
+			result.minYx = NULL;
+			result.maxYx = NULL;
+			return;
+		case 2:
+		{
+			Vertex* v = originalVertices[start];
+			Vertex* w = v + 1;
+			if (v->point != w->point)
+			{
+				int32_t dx = v->point.x - w->point.x;
+				int32_t dy = v->point.y - w->point.y;
+
+				if ((dx == 0) && (dy == 0))
+				{
+					if (v->point.z > w->point.z)
+					{
+						Vertex* t = w;
+						w = v;
+						v = t;
+					}
+					btAssert(v->point.z < w->point.z);
+					v->next = v;
+					v->prev = v;
+					result.minXy = v;
+					result.maxXy = v;
+					result.minYx = v;
+					result.maxYx = v;
+				}
+				else
+				{
+					v->next = w;
+					v->prev = w;
+					w->next = v;
+					w->prev = v;
+
+					if ((dx < 0) || ((dx == 0) && (dy < 0)))
+					{
+						result.minXy = v;
+						result.maxXy = w;
+					}
+					else
+					{
+						result.minXy = w;
+						result.maxXy = v;
+					}
+
+					if ((dy < 0) || ((dy == 0) && (dx < 0)))
+					{
+						result.minYx = v;
+						result.maxYx = w;
+					}
+					else
+					{
+						result.minYx = w;
+						result.maxYx = v;
+					}
+				}
+
+				Edge* e = newEdgePair(v, w);
+				e->link(e);
+				v->edges = e;
+
+				e = e->reverse;
+				e->link(e);
+				w->edges = e;
+
+				return;
+			}
+		}
+		// lint -fallthrough
+		case 1:
+		{
+			Vertex* v = originalVertices[start];
+			v->edges = NULL;
+			v->next = v;
+			v->prev = v;
+
+			result.minXy = v;
+			result.maxXy = v;
+			result.minYx = v;
+			result.maxYx = v;
+
+			return;
+		}
+	}
+
+	int split0 = start + n / 2;
+	Point32 p = originalVertices[split0-1]->point;
+	int split1 = split0;
+	while ((split1 < end) && (originalVertices[split1]->point == p))
+	{
+		split1++;
+	}
+	computeInternal(start, split0, result);
+	IntermediateHull hull1;
+	computeInternal(split1, end, hull1);
+#ifdef DEBUG_CONVEX_HULL
+	printf("\n\nMerge\n");
+	result.print();
+	hull1.print();
+#endif
+	merge(result, hull1);
+#ifdef DEBUG_CONVEX_HULL
+	printf("\n  Result\n");
+	result.print();
+#endif
+}
+
+#ifdef DEBUG_CONVEX_HULL
+void btConvexHullInternal::IntermediateHull::print()
+{
+	printf("    Hull\n");
+	for (Vertex* v = minXy; v; )
+	{
+		printf("      ");
+		v->print();
+		if (v == maxXy)
+		{
+			printf(" maxXy");
+		}
+		if (v == minYx)
+		{
+			printf(" minYx");
+		}
+		if (v == maxYx)
+		{
+			printf(" maxYx");
+		}
+		if (v->next->prev != v)
+		{
+			printf(" Inconsistency");
+		}
+		printf("\n");
+		v = v->next;
+		if (v == minXy)
+		{
+			break;
+		}
+	}
+	if (minXy)
+	{		
+		minXy->copy = (minXy->copy == -1) ? -2 : -1;
+		minXy->printGraph();
+	}
+}
+
+void btConvexHullInternal::Vertex::printGraph()
+{
+	print();
+	printf("\nEdges\n");
+	Edge* e = edges;
+	if (e)
+	{
+		do
+		{
+			e->print();
+			printf("\n");
+			e = e->next;
+		} while (e != edges);
+		do
+		{
+			Vertex* v = e->target;
+			if (v->copy != copy)
+			{
+				v->copy = copy;
+				v->printGraph();
+			}
+			e = e->next;
+		} while (e != edges);
+	}
+}
+#endif
+
+btConvexHullInternal::Orientation btConvexHullInternal::getOrientation(const Edge* prev, const Edge* next, const Point32& s, const Point32& t)
+{
+	btAssert(prev->reverse->target == next->reverse->target);
+	if (prev->next == next)
+	{
+		if (prev->prev == next)
+		{
+			Point64 n = t.cross(s);
+			Point64 m = (*prev->target - *next->reverse->target).cross(*next->target - *next->reverse->target);
+			btAssert(!m.isZero());
+			int64_t dot = n.dot(m);
+			btAssert(dot != 0);
+			return (dot > 0) ? COUNTER_CLOCKWISE : CLOCKWISE;
+		}
+		return COUNTER_CLOCKWISE;
+	}
+	else if (prev->prev == next)
+	{
+		return CLOCKWISE;
+	}
+	else
+	{
+		return NONE;
+	}
+}
+
+btConvexHullInternal::Edge* btConvexHullInternal::findMaxAngle(bool ccw, const Vertex* start, const Point32& s, const Point64& rxs, const Point64& sxrxs, Rational64& minCot)
+{
+	Edge* minEdge = NULL;
+
+#ifdef DEBUG_CONVEX_HULL
+	printf("find max edge for %d\n", start->point.index);
+#endif
+	Edge* e = start->edges;
+	if (e)
+	{
+		do
+		{
+			if (e->copy > mergeStamp)
+			{
+				Point32 t = *e->target - *start;
+				Rational64 cot(t.dot(sxrxs), t.dot(rxs));
+#ifdef DEBUG_CONVEX_HULL
+				printf("      Angle is %f (%d) for ", (float) btAtan(cot.toScalar()), (int) cot.isNaN());
+				e->print();
+#endif
+				if (cot.isNaN())
+				{
+					btAssert(ccw ? (t.dot(s) < 0) : (t.dot(s) > 0));
+				}
+				else
+				{
+					int cmp;
+					if (minEdge == NULL)
+					{
+						minCot = cot;
+						minEdge = e;
+					}
+					else if ((cmp = cot.compare(minCot)) < 0)
+					{
+						minCot = cot;
+						minEdge = e;
+					}
+					else if ((cmp == 0) && (ccw == (getOrientation(minEdge, e, s, t) == COUNTER_CLOCKWISE)))
+					{
+						minEdge = e;
+					}
+				}
+#ifdef DEBUG_CONVEX_HULL
+				printf("\n");
+#endif
+			}
+			e = e->next;
+		} while (e != start->edges);
+	}
+	return minEdge;
+}
+
+void btConvexHullInternal::findEdgeForCoplanarFaces(Vertex* c0, Vertex* c1, Edge*& e0, Edge*& e1, Vertex* stop0, Vertex* stop1)
+{
+	Edge* start0 = e0;
+	Edge* start1 = e1;
+	Point32 et0 = start0 ? start0->target->point : c0->point;
+	Point32 et1 = start1 ? start1->target->point : c1->point;
+	Point32 s = c1->point - c0->point;
+	Point64 normal = ((start0 ? start0 : start1)->target->point - c0->point).cross(s);
+	int64_t dist = c0->point.dot(normal);
+	btAssert(!start1 || (start1->target->point.dot(normal) == dist));
+	Point64 perp = s.cross(normal);
+	btAssert(!perp.isZero());
+	
+#ifdef DEBUG_CONVEX_HULL
+	printf("   Advancing %d %d  (%p %p, %d %d)\n", c0->point.index, c1->point.index, start0, start1, start0 ? start0->target->point.index : -1, start1 ? start1->target->point.index : -1);
+#endif
+
+	int64_t maxDot0 = et0.dot(perp);
+	if (e0)
+	{
+		while (e0->target != stop0)
+		{
+			Edge* e = e0->reverse->prev;
+			if (e->target->point.dot(normal) < dist)
+			{
+				break;
+			}
+			btAssert(e->target->point.dot(normal) == dist);
+			if (e->copy == mergeStamp)
+			{
+				break;
+			}
+			int64_t dot = e->target->point.dot(perp);
+			if (dot <= maxDot0)
+			{
+				break;
+			}
+			maxDot0 = dot;
+			e0 = e;
+			et0 = e->target->point;
+		}
+	}
+	
+	int64_t maxDot1 = et1.dot(perp);
+	if (e1)
+	{
+		while (e1->target != stop1)
+		{
+			Edge* e = e1->reverse->next;
+			if (e->target->point.dot(normal) < dist)
+			{
+				break;
+			}
+			btAssert(e->target->point.dot(normal) == dist);
+			if (e->copy == mergeStamp)
+			{
+				break;
+			}
+			int64_t dot = e->target->point.dot(perp);
+			if (dot <= maxDot1)
+			{
+				break;
+			}
+			maxDot1 = dot;
+			e1 = e;
+			et1 = e->target->point;
+		}
+	}
+
+#ifdef DEBUG_CONVEX_HULL
+	printf("   Starting at %d %d\n", et0.index, et1.index);
+#endif
+
+	int64_t dx = maxDot1 - maxDot0;
+	if (dx > 0)
+	{
+		while (true)
+		{
+			int64_t dy = (et1 - et0).dot(s);
+			
+			if (e0 && (e0->target != stop0))
+			{
+				Edge* f0 = e0->next->reverse;
+				if (f0->copy > mergeStamp)
+				{
+					int64_t dx0 = (f0->target->point - et0).dot(perp);
+					int64_t dy0 = (f0->target->point - et0).dot(s);
+					if ((dx0 == 0) ? (dy0 < 0) : ((dx0 < 0) && (Rational64(dy0, dx0).compare(Rational64(dy, dx)) >= 0)))
+					{
+						et0 = f0->target->point;
+						dx = (et1 - et0).dot(perp);
+						e0 = (e0 == start0) ? NULL : f0;
+						continue;
+					}
+				}
+			}
+			
+			if (e1 && (e1->target != stop1))
+			{
+				Edge* f1 = e1->reverse->next;
+				if (f1->copy > mergeStamp)
+				{
+					Point32 d1 = f1->target->point - et1;
+					if (d1.dot(normal) == 0)
+					{
+						int64_t dx1 = d1.dot(perp);
+						int64_t dy1 = d1.dot(s);
+						int64_t dxn = (f1->target->point - et0).dot(perp);
+						if ((dxn > 0) && ((dx1 == 0) ? (dy1 < 0) : ((dx1 < 0) && (Rational64(dy1, dx1).compare(Rational64(dy, dx)) > 0))))
+						{
+							e1 = f1;
+							et1 = e1->target->point;
+							dx = dxn;
+							continue;
+						}
+					}
+					else
+					{
+						btAssert((e1 == start1) && (d1.dot(normal) < 0));
+					}
+				}
+			}
+
+			break;
+		}
+	}
+	else if (dx < 0)
+	{
+		while (true)
+		{
+			int64_t dy = (et1 - et0).dot(s);
+			
+			if (e1 && (e1->target != stop1))
+			{
+				Edge* f1 = e1->prev->reverse;
+				if (f1->copy > mergeStamp)
+				{
+					int64_t dx1 = (f1->target->point - et1).dot(perp);
+					int64_t dy1 = (f1->target->point - et1).dot(s);
+					if ((dx1 == 0) ? (dy1 > 0) : ((dx1 < 0) && (Rational64(dy1, dx1).compare(Rational64(dy, dx)) <= 0)))
+					{
+						et1 = f1->target->point;
+						dx = (et1 - et0).dot(perp);
+						e1 = (e1 == start1) ? NULL : f1;
+						continue;
+					}
+				}
+			}
+			
+			if (e0 && (e0->target != stop0))
+			{
+				Edge* f0 = e0->reverse->prev;
+				if (f0->copy > mergeStamp)
+				{
+					Point32 d0 = f0->target->point - et0;
+					if (d0.dot(normal) == 0)
+					{
+						int64_t dx0 = d0.dot(perp);
+						int64_t dy0 = d0.dot(s);
+						int64_t dxn = (et1 - f0->target->point).dot(perp);
+						if ((dxn < 0) && ((dx0 == 0) ? (dy0 > 0) : ((dx0 < 0) && (Rational64(dy0, dx0).compare(Rational64(dy, dx)) < 0))))
+						{
+							e0 = f0;
+							et0 = e0->target->point;
+							dx = dxn;
+							continue;
+						}
+					}
+					else
+					{
+						btAssert((e0 == start0) && (d0.dot(normal) < 0));
+					}
+				}
+			}
+
+			break;
+		}
+	}
+#ifdef DEBUG_CONVEX_HULL
+	printf("   Advanced edges to %d %d\n", et0.index, et1.index);
+#endif
+}
+
+
+void btConvexHullInternal::merge(IntermediateHull& h0, IntermediateHull& h1)
+{
+	if (!h1.maxXy)
+	{
+		return;
+	}
+	if (!h0.maxXy)
+	{
+		h0 = h1;
+		return;
+	}
+	
+	mergeStamp--;
+
+	Vertex* c0 = NULL;
+	Edge* toPrev0 = NULL;
+	Edge* firstNew0 = NULL;
+	Edge* pendingHead0 = NULL;
+	Edge* pendingTail0 = NULL;
+	Vertex* c1 = NULL;
+	Edge* toPrev1 = NULL;
+	Edge* firstNew1 = NULL;
+	Edge* pendingHead1 = NULL;
+	Edge* pendingTail1 = NULL;
+	Point32 prevPoint;
+
+	if (mergeProjection(h0, h1, c0, c1))
+	{
+		Point32 s = *c1 - *c0;
+		Point64 normal = Point32(0, 0, -1).cross(s);
+		Point64 t = s.cross(normal);
+		btAssert(!t.isZero());
+
+		Edge* e = c0->edges;
+		Edge* start0 = NULL;
+		if (e)
+		{
+			do
+			{
+				int64_t dot = (*e->target - *c0).dot(normal);
+				btAssert(dot <= 0);
+				if ((dot == 0) && ((*e->target - *c0).dot(t) > 0))
+				{
+					if (!start0 || (getOrientation(start0, e, s, Point32(0, 0, -1)) == CLOCKWISE))
+					{
+						start0 = e;
+					}
+				}
+				e = e->next;
+			} while (e != c0->edges);
+		}
+		
+		e = c1->edges;
+		Edge* start1 = NULL;
+		if (e)
+		{
+			do
+			{
+				int64_t dot = (*e->target - *c1).dot(normal);
+				btAssert(dot <= 0);
+				if ((dot == 0) && ((*e->target - *c1).dot(t) > 0))
+				{
+					if (!start1 || (getOrientation(start1, e, s, Point32(0, 0, -1)) == COUNTER_CLOCKWISE))
+					{
+						start1 = e;
+					}
+				}
+				e = e->next;
+			} while (e != c1->edges);
+		}
+
+		if (start0 || start1)
+		{
+			findEdgeForCoplanarFaces(c0, c1, start0, start1, NULL, NULL);
+			if (start0)
+			{
+				c0 = start0->target;
+			}
+			if (start1)
+			{
+				c1 = start1->target;
+			}
+		}
+
+		prevPoint = c1->point;
+		prevPoint.z++;
+	}
+	else
+	{
+		prevPoint = c1->point;
+		prevPoint.x++;
+	}
+
+	Vertex* first0 = c0;
+	Vertex* first1 = c1;
+	bool firstRun = true;
+
+	while (true)
+	{
+		Point32 s = *c1 - *c0;
+		Point32 r = prevPoint - c0->point;
+		Point64 rxs = r.cross(s);
+		Point64 sxrxs = s.cross(rxs);
+		
+#ifdef DEBUG_CONVEX_HULL
+		printf("\n  Checking %d %d\n", c0->point.index, c1->point.index);
+#endif
+		Rational64 minCot0(0, 0);
+		Edge* min0 = findMaxAngle(false, c0, s, rxs, sxrxs, minCot0);
+		Rational64 minCot1(0, 0);
+		Edge* min1 = findMaxAngle(true, c1, s, rxs, sxrxs, minCot1);
+		if (!min0 && !min1)
+		{
+			Edge* e = newEdgePair(c0, c1);
+			e->link(e);
+			c0->edges = e;
+
+			e = e->reverse;
+			e->link(e);
+			c1->edges = e;
+			return;
+		}
+		else
+		{
+			int cmp = !min0 ? 1 : !min1 ? -1 : minCot0.compare(minCot1);
+#ifdef DEBUG_CONVEX_HULL
+			printf("    -> Result %d\n", cmp);
+#endif
+			if (firstRun || ((cmp >= 0) ? !minCot1.isNegativeInfinity() : !minCot0.isNegativeInfinity()))
+			{
+				Edge* e = newEdgePair(c0, c1);
+				if (pendingTail0)
+				{
+					pendingTail0->prev = e;
+				}
+				else
+				{
+					pendingHead0 = e;
+				}
+				e->next = pendingTail0;
+				pendingTail0 = e;
+
+				e = e->reverse;
+				if (pendingTail1)
+				{
+					pendingTail1->next = e;
+				}
+				else
+				{
+					pendingHead1 = e;
+				}
+				e->prev = pendingTail1;
+				pendingTail1 = e;
+			}
+			
+			Edge* e0 = min0;
+			Edge* e1 = min1;
+
+#ifdef DEBUG_CONVEX_HULL
+			printf("   Found min edges to %d %d\n", e0 ? e0->target->point.index : -1, e1 ? e1->target->point.index : -1);
+#endif
+
+			if (cmp == 0)
+			{
+				findEdgeForCoplanarFaces(c0, c1, e0, e1, NULL, NULL);
+			}
+
+			if ((cmp >= 0) && e1)
+			{
+				if (toPrev1)
+				{
+					for (Edge* e = toPrev1->next, *n = NULL; e != min1; e = n)
+					{
+						n = e->next;
+						removeEdgePair(e);
+					}
+				}
+
+				if (pendingTail1)
+				{
+					if (toPrev1)
+					{
+						toPrev1->link(pendingHead1);
+					}
+					else
+					{
+						min1->prev->link(pendingHead1);
+						firstNew1 = pendingHead1;
+					}
+					pendingTail1->link(min1);
+					pendingHead1 = NULL;
+					pendingTail1 = NULL;
+				}
+				else if (!toPrev1)
+				{
+					firstNew1 = min1;
+				}
+
+				prevPoint = c1->point;
+				c1 = e1->target;
+				toPrev1 = e1->reverse;
+			}
+
+			if ((cmp <= 0) && e0)
+			{
+				if (toPrev0)
+				{
+					for (Edge* e = toPrev0->prev, *n = NULL; e != min0; e = n)
+					{
+						n = e->prev;
+						removeEdgePair(e);
+					}
+				}
+
+				if (pendingTail0)
+				{
+					if (toPrev0)
+					{
+						pendingHead0->link(toPrev0);
+					}
+					else
+					{
+						pendingHead0->link(min0->next);
+						firstNew0 = pendingHead0;
+					}
+					min0->link(pendingTail0);
+					pendingHead0 = NULL;
+					pendingTail0 = NULL;
+				}
+				else if (!toPrev0)
+				{
+					firstNew0 = min0;
+				}
+
+				prevPoint = c0->point;
+				c0 = e0->target;
+				toPrev0 = e0->reverse;
+			}
+		}
+
+		if ((c0 == first0) && (c1 == first1))
+		{
+			if (toPrev0 == NULL)
+			{
+				pendingHead0->link(pendingTail0);
+				c0->edges = pendingTail0;
+			}
+			else
+			{
+				for (Edge* e = toPrev0->prev, *n = NULL; e != firstNew0; e = n)
+				{
+					n = e->prev;
+					removeEdgePair(e);
+				}
+				if (pendingTail0)
+				{
+					pendingHead0->link(toPrev0);
+					firstNew0->link(pendingTail0);
+				}
+			}
+
+			if (toPrev1 == NULL)
+			{
+				pendingTail1->link(pendingHead1);
+				c1->edges = pendingTail1;
+			}
+			else
+			{
+				for (Edge* e = toPrev1->next, *n = NULL; e != firstNew1; e = n)
+				{
+					n = e->next;
+					removeEdgePair(e);
+				}
+				if (pendingTail1)
+				{
+					toPrev1->link(pendingHead1);
+					pendingTail1->link(firstNew1);
+				}
+			}
+			
+			return;
+		}
+
+		firstRun = false;
+	}
+}
+
+
+static bool pointCmp(const btConvexHullInternal::Point32& p, const btConvexHullInternal::Point32& q)
+{
+	return (p.y < q.y) || ((p.y == q.y) && ((p.x < q.x) || ((p.x == q.x) && (p.z < q.z))));
+}
+
+void btConvexHullInternal::compute(const void* coords, bool doubleCoords, int stride, int count)
+{
+	btVector3 min(btScalar(1e30), btScalar(1e30), btScalar(1e30)), max(btScalar(-1e30), btScalar(-1e30), btScalar(-1e30));
+	const char* ptr = (const char*) coords;
+	if (doubleCoords)
+	{
+		for (int i = 0; i < count; i++)
+		{
+			const double* v = (const double*) ptr;
+			btVector3 p((btScalar) v[0], (btScalar) v[1], (btScalar) v[2]);
+			ptr += stride;
+			min.setMin(p);
+			max.setMax(p);
+		}
+	}
+	else
+	{
+		for (int i = 0; i < count; i++)
+		{
+			const float* v = (const float*) ptr;
+			btVector3 p(v[0], v[1], v[2]);
+			ptr += stride;
+			min.setMin(p);
+			max.setMax(p);
+		}
+	}
+
+	btVector3 s = max - min;
+	maxAxis = s.maxAxis();
+	minAxis = s.minAxis();
+	if (minAxis == maxAxis)
+	{
+		minAxis = (maxAxis + 1) % 3;
+	}
+	medAxis = 3 - maxAxis - minAxis;
+
+	s /= btScalar(10216);
+	if (((medAxis + 1) % 3) != maxAxis)
+	{
+		s *= -1;
+	}
+	scaling = s;
+
+	if (s[0] != 0)
+	{
+		s[0] = btScalar(1) / s[0];
+	}
+	if (s[1] != 0)
+	{
+		s[1] = btScalar(1) / s[1];
+	}
+	if (s[2] != 0)
+	{
+		s[2] = btScalar(1) / s[2];
+	}
+
+	center = (min + max) * btScalar(0.5);
+
+	btAlignedObjectArray<Point32> points;
+	points.resize(count);
+	ptr = (const char*) coords;
+	if (doubleCoords)
+	{
+		for (int i = 0; i < count; i++)
+		{
+			const double* v = (const double*) ptr;
+			btVector3 p((btScalar) v[0], (btScalar) v[1], (btScalar) v[2]);
+			ptr += stride;
+			p = (p - center) * s;
+			points[i].x = (int32_t) p[medAxis];
+			points[i].y = (int32_t) p[maxAxis];
+			points[i].z = (int32_t) p[minAxis];
+			points[i].index = i;
+		}
+	}
+	else
+	{
+		for (int i = 0; i < count; i++)
+		{
+			const float* v = (const float*) ptr;
+			btVector3 p(v[0], v[1], v[2]);
+			ptr += stride;
+			p = (p - center) * s;
+			points[i].x = (int32_t) p[medAxis];
+			points[i].y = (int32_t) p[maxAxis];
+			points[i].z = (int32_t) p[minAxis];
+			points[i].index = i;
+		}
+	}
+	points.quickSort(pointCmp);
+
+	vertexPool.reset();
+	vertexPool.setArraySize(count);
+	originalVertices.resize(count);
+	for (int i = 0; i < count; i++)
+	{
+		Vertex* v = vertexPool.newObject();
+		v->edges = NULL;
+		v->point = points[i];
+		v->copy = -1;
+		originalVertices[i] = v;
+	}
+
+	points.clear();
+
+	edgePool.reset();
+	edgePool.setArraySize(6 * count);
+
+	usedEdgePairs = 0;
+	maxUsedEdgePairs = 0;
+
+	mergeStamp = -3;
+
+	IntermediateHull hull;
+	computeInternal(0, count, hull);
+	vertexList = hull.minXy;
+#ifdef DEBUG_CONVEX_HULL
+	printf("max. edges %d (3v = %d)", maxUsedEdgePairs, 3 * count);
+#endif
+}
+
+btVector3 btConvexHullInternal::toBtVector(const Point32& v)
+{
+	btVector3 p;
+	p[medAxis] = btScalar(v.x);
+	p[maxAxis] = btScalar(v.y);
+	p[minAxis] = btScalar(v.z);
+	return p * scaling;
+}
+
+btVector3 btConvexHullInternal::getBtNormal(Face* face)
+{
+	return toBtVector(face->dir0).cross(toBtVector(face->dir1)).normalized();
+}
+
+btVector3 btConvexHullInternal::getCoordinates(const Vertex* v)
+{
+	btVector3 p;
+	p[medAxis] = v->xvalue();
+	p[maxAxis] = v->yvalue();
+	p[minAxis] = v->zvalue();
+	return p * scaling + center;
+}
+
+btScalar btConvexHullInternal::shrink(btScalar amount, btScalar clampAmount)
+{
+	if (!vertexList)
+	{
+		return 0;
+	}
+	int stamp = --mergeStamp;
+	btAlignedObjectArray<Vertex*> stack;
+	vertexList->copy = stamp;
+	stack.push_back(vertexList);
+	btAlignedObjectArray<Face*> faces;
+
+	Point32 ref = vertexList->point;
+	Int128 hullCenterX(0, 0);
+	Int128 hullCenterY(0, 0);
+	Int128 hullCenterZ(0, 0);
+	Int128 volume(0, 0);
+
+	while (stack.size() > 0)
+	{
+		Vertex* v = stack[stack.size() - 1];
+		stack.pop_back();
+		Edge* e = v->edges;
+		if (e)
+		{
+			do
+			{
+				if (e->target->copy != stamp)
+				{
+					e->target->copy = stamp;
+					stack.push_back(e->target);
+				}
+				if (e->copy != stamp)
+				{
+					Face* face = facePool.newObject();
+					face->init(e->target, e->reverse->prev->target, v);
+					faces.push_back(face);
+					Edge* f = e;
+
+					Vertex* a = NULL;
+					Vertex* b = NULL;
+					do
+					{
+						if (a && b)
+						{
+							int64_t vol = (v->point - ref).dot((a->point - ref).cross(b->point - ref));
+							btAssert(vol >= 0);
+							Point32 c = v->point + a->point + b->point + ref;
+							hullCenterX += vol * c.x;
+							hullCenterY += vol * c.y;
+							hullCenterZ += vol * c.z;
+							volume += vol;
+						}
+
+						btAssert(f->copy != stamp);
+						f->copy = stamp;
+						f->face = face;
+
+						a = b;
+						b = f->target;
+
+						f = f->reverse->prev;
+					} while (f != e);
+				}
+				e = e->next;
+			} while (e != v->edges);
+		}
+	}
+
+	if (volume.getSign() <= 0)
+	{
+		return 0;
+	}
+
+	btVector3 hullCenter;
+	hullCenter[medAxis] = hullCenterX.toScalar();
+	hullCenter[maxAxis] = hullCenterY.toScalar();
+	hullCenter[minAxis] = hullCenterZ.toScalar();
+	hullCenter /= 4 * volume.toScalar();
+	hullCenter *= scaling;
+
+	int faceCount = faces.size();
+
+	if (clampAmount > 0)
+	{
+		btScalar minDist = SIMD_INFINITY;
+		for (int i = 0; i < faceCount; i++)
+		{
+			btVector3 normal = getBtNormal(faces[i]);
+			btScalar dist = normal.dot(toBtVector(faces[i]->origin) - hullCenter);
+			if (dist < minDist)
+			{
+				minDist = dist;
+			}
+		}
+		
+		if (minDist <= 0)
+		{
+			return 0;
+		}
+
+		amount = btMin(amount, minDist * clampAmount);
+	}
+
+	unsigned int seed = 243703;
+	for (int i = 0; i < faceCount; i++, seed = 1664525 * seed + 1013904223)
+	{
+		btSwap(faces[i], faces[seed % faceCount]);
+	}
+
+	for (int i = 0; i < faceCount; i++)
+	{
+		if (!shiftFace(faces[i], amount, stack))
+		{
+			return -amount;
+		}
+	}
+
+	return amount;
+}
+
+bool btConvexHullInternal::shiftFace(Face* face, btScalar amount, btAlignedObjectArray<Vertex*> stack)
+{
+	btVector3 origShift = getBtNormal(face) * -amount;
+	if (scaling[0] != 0)
+	{
+		origShift[0] /= scaling[0];
+	}
+	if (scaling[1] != 0)
+	{
+		origShift[1] /= scaling[1];
+	}
+	if (scaling[2] != 0)
+	{
+		origShift[2] /= scaling[2];
+	}
+	Point32 shift((int32_t) origShift[medAxis], (int32_t) origShift[maxAxis], (int32_t) origShift[minAxis]);
+	if (shift.isZero())
+	{
+		return true;
+	}
+	Point64 normal = face->getNormal();
+#ifdef DEBUG_CONVEX_HULL
+	printf("\nShrinking face (%d %d %d) (%d %d %d) (%d %d %d) by (%d %d %d)\n",
+				 face->origin.x, face->origin.y, face->origin.z, face->dir0.x, face->dir0.y, face->dir0.z, face->dir1.x, face->dir1.y, face->dir1.z, shift.x, shift.y, shift.z);
+#endif
+	int64_t origDot = face->origin.dot(normal);
+	Point32 shiftedOrigin = face->origin + shift;
+	int64_t shiftedDot = shiftedOrigin.dot(normal);
+	btAssert(shiftedDot <= origDot);
+	if (shiftedDot >= origDot)
+	{
+		return false;
+	}
+
+	Edge* intersection = NULL;
+
+	Edge* startEdge = face->nearbyVertex->edges;
+#ifdef DEBUG_CONVEX_HULL
+	printf("Start edge is ");
+	startEdge->print();
+	printf(", normal is (%lld %lld %lld), shifted dot is %lld\n", normal.x, normal.y, normal.z, shiftedDot);
+#endif
+	Rational128 optDot = face->nearbyVertex->dot(normal);
+	int cmp = optDot.compare(shiftedDot);
+#ifdef SHOW_ITERATIONS
+	int n = 0;
+#endif
+	if (cmp >= 0)
+	{
+		Edge* e = startEdge;
+		do
+		{
+#ifdef SHOW_ITERATIONS
+			n++;
+#endif
+			Rational128 dot = e->target->dot(normal);
+			btAssert(dot.compare(origDot) <= 0);
+#ifdef DEBUG_CONVEX_HULL
+			printf("Moving downwards, edge is ");
+			e->print();
+			printf(", dot is %f (%f %lld)\n", (float) dot.toScalar(), (float) optDot.toScalar(), shiftedDot);
+#endif
+			if (dot.compare(optDot) < 0)
+			{
+				int c = dot.compare(shiftedDot);
+				optDot = dot;
+				e = e->reverse;
+				startEdge = e;
+				if (c < 0)
+				{
+					intersection = e;
+					break;
+				}
+				cmp = c;
+			}
+			e = e->prev;
+		} while (e != startEdge);
+
+		if (!intersection)
+		{
+			return false;
+		}
+	}
+	else
+	{
+		Edge* e = startEdge;
+		do
+		{
+#ifdef SHOW_ITERATIONS
+			n++;
+#endif
+			Rational128 dot = e->target->dot(normal);
+			btAssert(dot.compare(origDot) <= 0);
+#ifdef DEBUG_CONVEX_HULL
+			printf("Moving upwards, edge is ");
+			e->print();
+			printf(", dot is %f (%f %lld)\n", (float) dot.toScalar(), (float) optDot.toScalar(), shiftedDot);
+#endif
+			if (dot.compare(optDot) > 0)
+			{
+				cmp = dot.compare(shiftedDot);
+				if (cmp >= 0)
+				{
+					intersection = e;
+					break;
+				}
+				optDot = dot;
+				e = e->reverse;
+				startEdge = e;
+			}
+			e = e->prev;
+		} while (e != startEdge);
+		
+		if (!intersection)
+		{
+			return true;
+		}
+	}
+
+#ifdef SHOW_ITERATIONS
+	printf("Needed %d iterations to find initial intersection\n", n);
+#endif
+
+	if (cmp == 0)
+	{
+		Edge* e = intersection->reverse->next;
+#ifdef SHOW_ITERATIONS
+		n = 0;
+#endif
+		while (e->target->dot(normal).compare(shiftedDot) <= 0)
+		{
+#ifdef SHOW_ITERATIONS
+			n++;
+#endif
+			e = e->next;
+			if (e == intersection->reverse)
+			{
+				return true;
+			}
+#ifdef DEBUG_CONVEX_HULL
+			printf("Checking for outwards edge, current edge is ");
+			e->print();
+			printf("\n");
+#endif
+		}
+#ifdef SHOW_ITERATIONS
+		printf("Needed %d iterations to check for complete containment\n", n);
+#endif
+	}
+	
+	Edge* firstIntersection = NULL;
+	Edge* faceEdge = NULL;
+	Edge* firstFaceEdge = NULL;
+
+#ifdef SHOW_ITERATIONS
+	int m = 0;
+#endif
+	while (true)
+	{
+#ifdef SHOW_ITERATIONS
+		m++;
+#endif
+#ifdef DEBUG_CONVEX_HULL
+		printf("Intersecting edge is ");
+		intersection->print();
+		printf("\n");
+#endif
+		if (cmp == 0)
+		{
+			Edge* e = intersection->reverse->next;
+			startEdge = e;
+#ifdef SHOW_ITERATIONS
+			n = 0;
+#endif
+			while (true)
+			{
+#ifdef SHOW_ITERATIONS
+				n++;
+#endif
+				if (e->target->dot(normal).compare(shiftedDot) >= 0)
+				{
+					break;
+				}
+				intersection = e->reverse;
+				e = e->next;
+				if (e == startEdge)
+				{
+					return true;
+				}
+			}
+#ifdef SHOW_ITERATIONS
+			printf("Needed %d iterations to advance intersection\n", n);
+#endif
+		}
+
+#ifdef DEBUG_CONVEX_HULL
+		printf("Advanced intersecting edge to ");
+		intersection->print();
+		printf(", cmp = %d\n", cmp);
+#endif
+
+		if (!firstIntersection)
+		{
+			firstIntersection = intersection;
+		}
+		else if (intersection == firstIntersection)
+		{
+			break;
+		}
+
+		int prevCmp = cmp;
+		Edge* prevIntersection = intersection;
+		Edge* prevFaceEdge = faceEdge;
+
+		Edge* e = intersection->reverse;
+#ifdef SHOW_ITERATIONS
+		n = 0;
+#endif
+		while (true)
+		{
+#ifdef SHOW_ITERATIONS
+			n++;
+#endif
+			e = e->reverse->prev;
+			btAssert(e != intersection->reverse);
+			cmp = e->target->dot(normal).compare(shiftedDot);
+#ifdef DEBUG_CONVEX_HULL
+			printf("Testing edge ");
+			e->print();
+			printf(" -> cmp = %d\n", cmp);
+#endif
+			if (cmp >= 0)
+			{
+				intersection = e;
+				break;
+			}
+		}
+#ifdef SHOW_ITERATIONS
+		printf("Needed %d iterations to find other intersection of face\n", n);
+#endif
+
+		if (cmp > 0)
+		{
+			Vertex* removed = intersection->target;
+			e = intersection->reverse;
+			if (e->prev == e)
+			{
+				removed->edges = NULL;
+			}
+			else
+			{
+				removed->edges = e->prev;
+				e->prev->link(e->next);
+				e->link(e);
+			}
+#ifdef DEBUG_CONVEX_HULL
+			printf("1: Removed part contains (%d %d %d)\n", removed->point.x, removed->point.y, removed->point.z);
+#endif
+			
+			Point64 n0 = intersection->face->getNormal();
+			Point64 n1 = intersection->reverse->face->getNormal();
+			int64_t m00 = face->dir0.dot(n0);
+			int64_t m01 = face->dir1.dot(n0);
+			int64_t m10 = face->dir0.dot(n1);
+			int64_t m11 = face->dir1.dot(n1);
+			int64_t r0 = (intersection->face->origin - shiftedOrigin).dot(n0);
+			int64_t r1 = (intersection->reverse->face->origin - shiftedOrigin).dot(n1);
+			Int128 det = Int128::mul(m00, m11) - Int128::mul(m01, m10);
+			btAssert(det.getSign() != 0);
+			Vertex* v = vertexPool.newObject();
+			v->point.index = -1;
+			v->copy = -1;
+			v->point128 = PointR128(Int128::mul(face->dir0.x * r0, m11) - Int128::mul(face->dir0.x * r1, m01)
+															+ Int128::mul(face->dir1.x * r1, m00) - Int128::mul(face->dir1.x * r0, m10) + det * shiftedOrigin.x,
+															Int128::mul(face->dir0.y * r0, m11) - Int128::mul(face->dir0.y * r1, m01)
+															+ Int128::mul(face->dir1.y * r1, m00) - Int128::mul(face->dir1.y * r0, m10) + det * shiftedOrigin.y,
+															Int128::mul(face->dir0.z * r0, m11) - Int128::mul(face->dir0.z * r1, m01)
+															+ Int128::mul(face->dir1.z * r1, m00) - Int128::mul(face->dir1.z * r0, m10) + det * shiftedOrigin.z,
+															det);
+			v->point.x = (int32_t) v->point128.xvalue();
+			v->point.y = (int32_t) v->point128.yvalue();
+			v->point.z = (int32_t) v->point128.zvalue();
+			intersection->target = v;
+			v->edges = e;
+
+			stack.push_back(v);
+			stack.push_back(removed);
+			stack.push_back(NULL);
+		}
+
+		if (cmp || prevCmp || (prevIntersection->reverse->next->target != intersection->target))
+		{
+			faceEdge = newEdgePair(prevIntersection->target, intersection->target);
+			if (prevCmp == 0)
+			{
+				faceEdge->link(prevIntersection->reverse->next);
+			}
+			if ((prevCmp == 0) || prevFaceEdge)
+			{
+				prevIntersection->reverse->link(faceEdge);
+			}
+			if (cmp == 0)
+			{
+				intersection->reverse->prev->link(faceEdge->reverse);
+			}
+			faceEdge->reverse->link(intersection->reverse);
+		}
+		else
+		{
+			faceEdge = prevIntersection->reverse->next;
+		}
+
+		if (prevFaceEdge)
+		{
+			if (prevCmp > 0)
+			{
+				faceEdge->link(prevFaceEdge->reverse);
+			}
+			else if (faceEdge != prevFaceEdge->reverse)
+			{
+				stack.push_back(prevFaceEdge->target);
+				while (faceEdge->next != prevFaceEdge->reverse)
+				{
+					Vertex* removed = faceEdge->next->target;
+					removeEdgePair(faceEdge->next);
+					stack.push_back(removed);
+#ifdef DEBUG_CONVEX_HULL
+					printf("2: Removed part contains (%d %d %d)\n", removed->point.x, removed->point.y, removed->point.z);
+#endif
+				}
+				stack.push_back(NULL);
+			}
+		}
+		faceEdge->face = face;
+		faceEdge->reverse->face = intersection->face;
+
+		if (!firstFaceEdge)
+		{
+			firstFaceEdge = faceEdge;
+		}
+	}
+#ifdef SHOW_ITERATIONS
+	printf("Needed %d iterations to process all intersections\n", m);
+#endif
+
+	if (cmp > 0)
+	{
+		firstFaceEdge->reverse->target = faceEdge->target;
+		firstIntersection->reverse->link(firstFaceEdge);
+		firstFaceEdge->link(faceEdge->reverse);
+	}
+	else if (firstFaceEdge != faceEdge->reverse)
+	{
+		stack.push_back(faceEdge->target);
+		while (firstFaceEdge->next != faceEdge->reverse)
+		{
+			Vertex* removed = firstFaceEdge->next->target;
+			removeEdgePair(firstFaceEdge->next);
+			stack.push_back(removed);
+#ifdef DEBUG_CONVEX_HULL
+			printf("3: Removed part contains (%d %d %d)\n", removed->point.x, removed->point.y, removed->point.z);
+#endif
+		}
+		stack.push_back(NULL);
+	}
+
+	btAssert(stack.size() > 0);
+	vertexList = stack[0];
+
+#ifdef DEBUG_CONVEX_HULL
+	printf("Removing part\n");
+#endif
+#ifdef SHOW_ITERATIONS
+	n = 0;
+#endif
+	int pos = 0;
+	while (pos < stack.size())
+	{
+		int end = stack.size();
+		while (pos < end)
+		{
+			Vertex* kept = stack[pos++];
+#ifdef DEBUG_CONVEX_HULL
+			kept->print();
+#endif
+			bool deeper = false;
+			Vertex* removed;
+			while ((removed = stack[pos++]) != NULL)
+			{
+#ifdef SHOW_ITERATIONS
+				n++;
+#endif
+				kept->receiveNearbyFaces(removed);
+				while (removed->edges)
+				{
+					if (!deeper)
+					{
+						deeper = true;
+						stack.push_back(kept);
+					}
+					stack.push_back(removed->edges->target);
+					removeEdgePair(removed->edges);
+				}
+			}
+			if (deeper)
+			{
+				stack.push_back(NULL);
+			}
+		}
+	}
+#ifdef SHOW_ITERATIONS
+	printf("Needed %d iterations to remove part\n", n);
+#endif
+
+	stack.resize(0);
+	face->origin = shiftedOrigin;
+
+	return true;
+}
+
+
+static int getVertexCopy(btConvexHullInternal::Vertex* vertex, btAlignedObjectArray<btConvexHullInternal::Vertex*>& vertices)
+{
+	int index = vertex->copy;
+	if (index < 0)
+	{
+		index = vertices.size();
+		vertex->copy = index;
+		vertices.push_back(vertex);
+#ifdef DEBUG_CONVEX_HULL
+		printf("Vertex %d gets index *%d\n", vertex->point.index, index);
+#endif
+	}
+	return index;
+}
+
+btScalar btConvexHullComputer::compute(const void* coords, bool doubleCoords, int stride, int count, btScalar shrink, btScalar shrinkClamp)
+{
+	if (count <= 0)
+	{
+		vertices.clear();
+		edges.clear();
+		faces.clear();
+		return 0;
+	}
+
+	btConvexHullInternal hull;
+	hull.compute(coords, doubleCoords, stride, count);
+
+	btScalar shift = 0;
+	if ((shrink > 0) && ((shift = hull.shrink(shrink, shrinkClamp)) < 0))
+	{
+		vertices.clear();
+		edges.clear();
+		faces.clear();
+		return shift;
+	}
+
+	vertices.resize(0);
+	edges.resize(0);
+	faces.resize(0);
+
+	btAlignedObjectArray<btConvexHullInternal::Vertex*> oldVertices;
+	getVertexCopy(hull.vertexList, oldVertices);
+	int copied = 0;
+	while (copied < oldVertices.size())
+	{
+		btConvexHullInternal::Vertex* v = oldVertices[copied];
+		vertices.push_back(hull.getCoordinates(v));
+		btConvexHullInternal::Edge* firstEdge = v->edges;
+		if (firstEdge)
+		{
+			int firstCopy = -1;
+			int prevCopy = -1;
+			btConvexHullInternal::Edge* e = firstEdge;
+			do
+			{
+				if (e->copy < 0)
+				{
+					int s = edges.size();
+					edges.push_back(Edge());
+					edges.push_back(Edge());
+					Edge* c = &edges[s];
+					Edge* r = &edges[s + 1];
+					e->copy = s;
+					e->reverse->copy = s + 1;
+					c->reverse = 1;
+					r->reverse = -1;
+					c->targetVertex = getVertexCopy(e->target, oldVertices);
+					r->targetVertex = copied;
+#ifdef DEBUG_CONVEX_HULL
+					printf("      CREATE: Vertex *%d has edge to *%d\n", copied, c->getTargetVertex());
+#endif
+				}
+				if (prevCopy >= 0)
+				{
+					edges[e->copy].next = prevCopy - e->copy;
+				}
+				else
+				{
+					firstCopy = e->copy;
+				}
+				prevCopy = e->copy;
+				e = e->next;
+			} while (e != firstEdge);
+			edges[firstCopy].next = prevCopy - firstCopy;
+		}
+		copied++;
+	}
+
+	for (int i = 0; i < copied; i++)
+	{
+		btConvexHullInternal::Vertex* v = oldVertices[i];
+		btConvexHullInternal::Edge* firstEdge = v->edges;
+		if (firstEdge)
+		{
+			btConvexHullInternal::Edge* e = firstEdge;
+			do
+			{
+				if (e->copy >= 0)
+				{
+#ifdef DEBUG_CONVEX_HULL
+					printf("Vertex *%d has edge to *%d\n", i, edges[e->copy].getTargetVertex());
+#endif
+					faces.push_back(e->copy);
+					btConvexHullInternal::Edge* f = e;
+					do
+					{
+#ifdef DEBUG_CONVEX_HULL
+						printf("   Face *%d\n", edges[f->copy].getTargetVertex());
+#endif
+						f->copy = -1;
+						f = f->reverse->prev;
+					} while (f != e);
+				}
+				e = e->next;
+			} while (e != firstEdge);
+		}
+	}
+
+	return shift;
+}
+
+
+
+
+
diff --git a/src/LinearMath/btConvexHullComputer.h b/src/LinearMath/btConvexHullComputer.h
index da96a494d..7240ac4fb 100644
--- a/src/LinearMath/btConvexHullComputer.h
+++ b/src/LinearMath/btConvexHullComputer.h
@@ -1,103 +1,103 @@
-/*
-Copyright (c) 2011 Ole Kniemeyer, MAXON, www.maxon.net
-
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-*/
-
-#ifndef BT_CONVEX_HULL_COMPUTER_H
-#define BT_CONVEX_HULL_COMPUTER_H
-
-#include "btVector3.h"
-#include "btAlignedObjectArray.h"
-
-/// Convex hull implementation based on Preparata and Hong
-/// See http://code.google.com/p/bullet/issues/detail?id=275
-/// Ole Kniemeyer, MAXON Computer GmbH
-class btConvexHullComputer
-{
-	private:
-		btScalar compute(const void* coords, bool doubleCoords, int stride, int count, btScalar shrink, btScalar shrinkClamp);
-
-	public:
-
-		class Edge
-		{
-			private:
-				int next;
-				int reverse;
-				int targetVertex;
-
-				friend class btConvexHullComputer;
-
-			public:
-				int getSourceVertex() const
-				{
-					return (this + reverse)->targetVertex;
-				}
-
-				int getTargetVertex() const
-				{
-					return targetVertex;
-				}
-
-				const Edge* getNextEdgeOfVertex() const // clockwise list of all edges of a vertex
-				{
-					return this + next;
-				}
-
-				const Edge* getNextEdgeOfFace() const // counter-clockwise list of all edges of a face
-				{
-					return (this + reverse)->getNextEdgeOfVertex();
-				}
-
-				const Edge* getReverseEdge() const
-				{
-					return this + reverse;
-				}
-		};
-
-
-		// Vertices of the output hull
-		btAlignedObjectArray<btVector3> vertices;
-
-		// Edges of the output hull
-		btAlignedObjectArray<Edge> edges;
-
-		// Faces of the convex hull. Each entry is an index into the "edges" array pointing to an edge of the face. Faces are planar n-gons
-		btAlignedObjectArray<int> faces;
-
-		/*
-		Compute convex hull of "count" vertices stored in "coords". "stride" is the difference in bytes
-		between the addresses of consecutive vertices. If "shrink" is positive, the convex hull is shrunken
-		by that amount (each face is moved by "shrink" length units towards the center along its normal).
-		If "shrinkClamp" is positive, "shrink" is clamped to not exceed "shrinkClamp * innerRadius", where "innerRadius"
-		is the minimum distance of a face to the center of the convex hull.
-
-		The returned value is the amount by which the hull has been shrunken. If it is negative, the amount was so large
-		that the resulting convex hull is empty.
-
-		The output convex hull can be found in the member variables "vertices", "edges", "faces".
-		*/
-		btScalar compute(const float* coords, int stride, int count, btScalar shrink, btScalar shrinkClamp)
-		{
-			return compute(coords, false, stride, count, shrink, shrinkClamp);
-		}
-
-		// same as above, but double precision
-		btScalar compute(const double* coords, int stride, int count, btScalar shrink, btScalar shrinkClamp)
-		{
-			return compute(coords, true, stride, count, shrink, shrinkClamp);
-		}
-};
-
-
-#endif //BT_CONVEX_HULL_COMPUTER_H
-
+/*
+Copyright (c) 2011 Ole Kniemeyer, MAXON, www.maxon.net
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef BT_CONVEX_HULL_COMPUTER_H
+#define BT_CONVEX_HULL_COMPUTER_H
+
+#include "btVector3.h"
+#include "btAlignedObjectArray.h"
+
+/// Convex hull implementation based on Preparata and Hong
+/// See http://code.google.com/p/bullet/issues/detail?id=275
+/// Ole Kniemeyer, MAXON Computer GmbH
+class btConvexHullComputer
+{
+	private:
+		btScalar compute(const void* coords, bool doubleCoords, int stride, int count, btScalar shrink, btScalar shrinkClamp);
+
+	public:
+
+		class Edge
+		{
+			private:
+				int next;
+				int reverse;
+				int targetVertex;
+
+				friend class btConvexHullComputer;
+
+			public:
+				int getSourceVertex() const
+				{
+					return (this + reverse)->targetVertex;
+				}
+
+				int getTargetVertex() const
+				{
+					return targetVertex;
+				}
+
+				const Edge* getNextEdgeOfVertex() const // clockwise list of all edges of a vertex
+				{
+					return this + next;
+				}
+
+				const Edge* getNextEdgeOfFace() const // counter-clockwise list of all edges of a face
+				{
+					return (this + reverse)->getNextEdgeOfVertex();
+				}
+
+				const Edge* getReverseEdge() const
+				{
+					return this + reverse;
+				}
+		};
+
+
+		// Vertices of the output hull
+		btAlignedObjectArray<btVector3> vertices;
+
+		// Edges of the output hull
+		btAlignedObjectArray<Edge> edges;
+
+		// Faces of the convex hull. Each entry is an index into the "edges" array pointing to an edge of the face. Faces are planar n-gons
+		btAlignedObjectArray<int> faces;
+
+		/*
+		Compute convex hull of "count" vertices stored in "coords". "stride" is the difference in bytes
+		between the addresses of consecutive vertices. If "shrink" is positive, the convex hull is shrunken
+		by that amount (each face is moved by "shrink" length units towards the center along its normal).
+		If "shrinkClamp" is positive, "shrink" is clamped to not exceed "shrinkClamp * innerRadius", where "innerRadius"
+		is the minimum distance of a face to the center of the convex hull.
+
+		The returned value is the amount by which the hull has been shrunken. If it is negative, the amount was so large
+		that the resulting convex hull is empty.
+
+		The output convex hull can be found in the member variables "vertices", "edges", "faces".
+		*/
+		btScalar compute(const float* coords, int stride, int count, btScalar shrink, btScalar shrinkClamp)
+		{
+			return compute(coords, false, stride, count, shrink, shrinkClamp);
+		}
+
+		// same as above, but double precision
+		btScalar compute(const double* coords, int stride, int count, btScalar shrink, btScalar shrinkClamp)
+		{
+			return compute(coords, true, stride, count, shrink, shrinkClamp);
+		}
+};
+
+
+#endif //BT_CONVEX_HULL_COMPUTER_H
+
diff --git a/src/LinearMath/btDefaultMotionState.h b/src/LinearMath/btDefaultMotionState.h
index a6b7ef15a..c90b74923 100644
--- a/src/LinearMath/btDefaultMotionState.h
+++ b/src/LinearMath/btDefaultMotionState.h
@@ -4,13 +4,15 @@
 #include "btMotionState.h"
 
 ///The btDefaultMotionState provides a common implementation to synchronize world transforms with offsets.
-struct	btDefaultMotionState : public btMotionState
+ATTRIBUTE_ALIGNED16(struct)	btDefaultMotionState : public btMotionState
 {
 	btTransform m_graphicsWorldTrans;
 	btTransform	m_centerOfMassOffset;
 	btTransform m_startWorldTrans;
 	void*		m_userPointer;
 
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+
 	btDefaultMotionState(const btTransform& startTrans = btTransform::getIdentity(),const btTransform& centerOfMassOffset = btTransform::getIdentity())
 		: m_graphicsWorldTrans(startTrans),
 		m_centerOfMassOffset(centerOfMassOffset),
diff --git a/src/LinearMath/btGrahamScan2dConvexHull.h b/src/LinearMath/btGrahamScan2dConvexHull.h
index aa933c177..d227148df 100644
--- a/src/LinearMath/btGrahamScan2dConvexHull.h
+++ b/src/LinearMath/btGrahamScan2dConvexHull.h
@@ -70,7 +70,8 @@ inline void GrahamScanConvexHull2D(btAlignedObjectArray<GrahamVector2>& original
 	{
 		const btVector3& left = originalPoints[i];
 		const btVector3& right = originalPoints[0];
-		if (left.x() < right.x() || !(right.x() < left.x()) && left.y() < right.y())
+		if (left.x() < right.x() || 
+            (!(right.x() < left.x()) && left.y() < right.y()))
 		{
 			originalPoints.swap(0,i);
 		}
diff --git a/src/LinearMath/btMatrix3x3.h b/src/LinearMath/btMatrix3x3.h
index d0234a043..d4f5c95aa 100644
--- a/src/LinearMath/btMatrix3x3.h
+++ b/src/LinearMath/btMatrix3x3.h
@@ -18,6 +18,18 @@ subject to the following restrictions:
 
 #include "btVector3.h"
 #include "btQuaternion.h"
+#include <stdio.h>
+
+#ifdef BT_USE_SSE
+//const __m128 ATTRIBUTE_ALIGNED16(v2220) = {2.0f, 2.0f, 2.0f, 0.0f};
+const __m128 ATTRIBUTE_ALIGNED16(vMPPP) = {-0.0f, +0.0f, +0.0f, +0.0f};
+#endif
+
+#if defined(BT_USE_SSE) || defined(BT_USE_NEON)
+const btSimdFloat4 ATTRIBUTE_ALIGNED16(v1000) = {1.0f, 0.0f, 0.0f, 0.0f};
+const btSimdFloat4 ATTRIBUTE_ALIGNED16(v0100) = {0.0f, 1.0f, 0.0f, 0.0f};
+const btSimdFloat4 ATTRIBUTE_ALIGNED16(v0010) = {0.0f, 0.0f, 1.0f, 0.0f};
+#endif
 
 #ifdef BT_USE_DOUBLE_PRECISION
 #define btMatrix3x3Data	btMatrix3x3DoubleData 
@@ -28,7 +40,7 @@ subject to the following restrictions:
 
 /**@brief The btMatrix3x3 class implements a 3x3 rotation matrix, to perform linear algebra in combination with btQuaternion, btTransform and btVector3.
 * Make sure to only include a pure orthogonal matrix without scaling. */
-class btMatrix3x3 {
+ATTRIBUTE_ALIGNED16(class) btMatrix3x3 {
 
 	///Data storage for the matrix, each vector is a row of the matrix
 	btVector3 m_el[3];
@@ -57,6 +69,42 @@ public:
 			yx, yy, yz, 
 			zx, zy, zz);
 	}
+
+#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON)
+	SIMD_FORCE_INLINE btMatrix3x3 (const btSimdFloat4 v0, const btSimdFloat4 v1, const btSimdFloat4 v2 ) 
+	{
+        m_el[0].mVec128 = v0;
+        m_el[1].mVec128 = v1;
+        m_el[2].mVec128 = v2;
+	}
+
+	SIMD_FORCE_INLINE btMatrix3x3 (const btVector3& v0, const btVector3& v1, const btVector3& v2 ) 
+	{
+        m_el[0] = v0;
+        m_el[1] = v1;
+        m_el[2] = v2;
+	}
+
+	// Copy constructor
+	SIMD_FORCE_INLINE btMatrix3x3(const btMatrix3x3& rhs)
+	{
+		m_el[0].mVec128 = rhs.m_el[0].mVec128;
+		m_el[1].mVec128 = rhs.m_el[1].mVec128;
+		m_el[2].mVec128 = rhs.m_el[2].mVec128;
+	}
+
+	// Assignment Operator
+	SIMD_FORCE_INLINE btMatrix3x3& operator=(const btMatrix3x3& m) 
+	{
+		m_el[0].mVec128 = m.m_el[0].mVec128;
+		m_el[1].mVec128 = m.m_el[1].mVec128;
+		m_el[2].mVec128 = m.m_el[2].mVec128;
+		
+		return *this;
+	}
+
+#else
+
 	/** @brief Copy constructor */
 	SIMD_FORCE_INLINE btMatrix3x3 (const btMatrix3x3& other)
 	{
@@ -64,6 +112,7 @@ public:
 		m_el[1] = other.m_el[1];
 		m_el[2] = other.m_el[2];
 	}
+    
 	/** @brief Assignment Operator */
 	SIMD_FORCE_INLINE btMatrix3x3& operator=(const btMatrix3x3& other)
 	{
@@ -73,6 +122,8 @@ public:
 		return *this;
 	}
 
+#endif
+
 	/** @brief Get a column of the matrix as a vector 
 	*  @param i Column number 0 indexed */
 	SIMD_FORCE_INLINE btVector3 getColumn(int i) const
@@ -155,14 +206,69 @@ public:
 		btScalar d = q.length2();
 		btFullAssert(d != btScalar(0.0));
 		btScalar s = btScalar(2.0) / d;
+    
+    #if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+        __m128	vs, Q = q.get128();
+		__m128i Qi = btCastfTo128i(Q);
+        __m128	Y, Z;
+        __m128	V1, V2, V3;
+        __m128	V11, V21, V31;
+        __m128	NQ = _mm_xor_ps(Q, btvMzeroMask);
+		__m128i NQi = btCastfTo128i(NQ);
+        
+        V1 = btCastiTo128f(_mm_shuffle_epi32 (Qi, BT_SHUFFLE(1,0,2,3)));	// Y X Z W
+		V2 = _mm_shuffle_ps(NQ, Q, BT_SHUFFLE(0,0,1,3));     // -X -X  Y  W
+        V3 = btCastiTo128f(_mm_shuffle_epi32 (Qi, BT_SHUFFLE(2,1,0,3)));	// Z Y X W
+        V1 = _mm_xor_ps(V1, vMPPP);	//	change the sign of the first element
+			
+        V11	= btCastiTo128f(_mm_shuffle_epi32 (Qi, BT_SHUFFLE(1,1,0,3)));	// Y Y X W
+		V21 = _mm_unpackhi_ps(Q, Q);                    //  Z  Z  W  W
+		V31 = _mm_shuffle_ps(Q, NQ, BT_SHUFFLE(0,2,0,3));	//  X  Z -X -W
+
+		V2 = V2 * V1;	//
+		V1 = V1 * V11;	//
+		V3 = V3 * V31;	//
+
+        V11 = _mm_shuffle_ps(NQ, Q, BT_SHUFFLE(2,3,1,3));	//	-Z -W  Y  W
+		V11 = V11 * V21;	//
+        V21 = _mm_xor_ps(V21, vMPPP);	//	change the sign of the first element
+		V31 = _mm_shuffle_ps(Q, NQ, BT_SHUFFLE(3,3,1,3));	//	 W  W -Y -W
+        V31 = _mm_xor_ps(V31, vMPPP);	//	change the sign of the first element
+		Y = btCastiTo128f(_mm_shuffle_epi32 (NQi, BT_SHUFFLE(3,2,0,3)));	// -W -Z -X -W
+		Z = btCastiTo128f(_mm_shuffle_epi32 (Qi, BT_SHUFFLE(1,0,1,3)));	//  Y  X  Y  W
+
+		vs = _mm_load_ss(&s);
+		V21 = V21 * Y;
+		V31 = V31 * Z;
+
+		V1 = V1 + V11;
+        V2 = V2 + V21;
+        V3 = V3 + V31;
+
+        vs = bt_splat3_ps(vs, 0);
+            //	s ready
+        V1 = V1 * vs;
+        V2 = V2 * vs;
+        V3 = V3 * vs;
+        
+        V1 = V1 + v1000;
+        V2 = V2 + v0100;
+        V3 = V3 + v0010;
+        
+        m_el[0] = V1; 
+        m_el[1] = V2;
+        m_el[2] = V3;
+    #else    
 		btScalar xs = q.x() * s,   ys = q.y() * s,   zs = q.z() * s;
 		btScalar wx = q.w() * xs,  wy = q.w() * ys,  wz = q.w() * zs;
 		btScalar xx = q.x() * xs,  xy = q.x() * ys,  xz = q.x() * zs;
 		btScalar yy = q.y() * ys,  yz = q.y() * zs,  zz = q.z() * zs;
-		setValue(btScalar(1.0) - (yy + zz), xy - wz, xz + wy,
+		setValue(
+            btScalar(1.0) - (yy + zz), xy - wz, xz + wy,
 			xy + wz, btScalar(1.0) - (xx + zz), yz - wx,
 			xz - wy, yz + wx, btScalar(1.0) - (xx + yy));
-	}
+	#endif
+    }
 
 
 	/** @brief Set the matrix from euler angles using YPR around YXZ respectively
@@ -205,16 +311,29 @@ public:
 	/**@brief Set the matrix to the identity */
 	void setIdentity()
 	{ 
+#if (defined(BT_USE_SSE_IN_API)&& defined (BT_USE_SSE)) || defined(BT_USE_NEON)
+			m_el[0] = v1000; 
+			m_el[1] = v0100;
+			m_el[2] = v0010;
+#else
 		setValue(btScalar(1.0), btScalar(0.0), btScalar(0.0), 
 			btScalar(0.0), btScalar(1.0), btScalar(0.0), 
 			btScalar(0.0), btScalar(0.0), btScalar(1.0)); 
+#endif
 	}
 
 	static const btMatrix3x3&	getIdentity()
 	{
-		static const btMatrix3x3 identityMatrix(btScalar(1.0), btScalar(0.0), btScalar(0.0), 
+#if (defined(BT_USE_SSE_IN_API)&& defined (BT_USE_SSE)) || defined(BT_USE_NEON)
+        static const btMatrix3x3 
+        identityMatrix(v1000, v0100, v0010);
+#else
+		static const btMatrix3x3 
+        identityMatrix(
+            btScalar(1.0), btScalar(0.0), btScalar(0.0), 
 			btScalar(0.0), btScalar(1.0), btScalar(0.0), 
 			btScalar(0.0), btScalar(0.0), btScalar(1.0));
+#endif
 		return identityMatrix;
 	}
 
@@ -222,6 +341,40 @@ public:
 	* @param m The array to be filled */
 	void getOpenGLSubMatrix(btScalar *m) const 
 	{
+#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+        __m128 v0 = m_el[0].mVec128;
+        __m128 v1 = m_el[1].mVec128;
+        __m128 v2 = m_el[2].mVec128;    //  x2 y2 z2 w2
+        __m128 *vm = (__m128 *)m;
+        __m128 vT;
+        
+        v2 = _mm_and_ps(v2, btvFFF0fMask);  //  x2 y2 z2 0
+        
+        vT = _mm_unpackhi_ps(v0, v1);	//	z0 z1 * *
+        v0 = _mm_unpacklo_ps(v0, v1);	//	x0 x1 y0 y1
+
+        v1 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(2, 3, 1, 3) );	// y0 y1 y2 0
+        v0 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(0, 1, 0, 3) );	// x0 x1 x2 0
+        v2 = btCastdTo128f(_mm_move_sd(btCastfTo128d(v2), btCastfTo128d(vT)));	// z0 z1 z2 0
+
+        vm[0] = v0;
+        vm[1] = v1;
+        vm[2] = v2;
+#elif defined(BT_USE_NEON)
+        // note: zeros the w channel. We can preserve it at the cost of two more vtrn instructions.
+        static const uint32x2_t zMask = (const uint32x2_t) {-1, 0 };
+        float32x4_t *vm = (float32x4_t *)m;
+        float32x4x2_t top = vtrnq_f32( m_el[0].mVec128, m_el[1].mVec128 );  // {x0 x1 z0 z1}, {y0 y1 w0 w1}
+        float32x2x2_t bl = vtrn_f32( vget_low_f32(m_el[2].mVec128), vdup_n_f32(0.0f) );       // {x2  0 }, {y2 0}
+        float32x4_t v0 = vcombine_f32( vget_low_f32(top.val[0]), bl.val[0] );
+        float32x4_t v1 = vcombine_f32( vget_low_f32(top.val[1]), bl.val[1] );
+        float32x2_t q = (float32x2_t) vand_u32( (uint32x2_t) vget_high_f32( m_el[2].mVec128), zMask );
+        float32x4_t v2 = vcombine_f32( vget_high_f32(top.val[0]), q );       // z0 z1 z2  0
+
+        vm[0] = v0;
+        vm[1] = v1;
+        vm[2] = v2;
+#else
 		m[0]  = btScalar(m_el[0].x()); 
 		m[1]  = btScalar(m_el[1].x());
 		m[2]  = btScalar(m_el[2].x());
@@ -234,13 +387,67 @@ public:
 		m[9]  = btScalar(m_el[1].z());
 		m[10] = btScalar(m_el[2].z());
 		m[11] = btScalar(0.0); 
+#endif
 	}
 
 	/**@brief Get the matrix represented as a quaternion 
 	* @param q The quaternion which will be set */
 	void getRotation(btQuaternion& q) const
 	{
+#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON)
+        btScalar trace = m_el[0].x() + m_el[1].y() + m_el[2].z();
+        btScalar s, x;
+        
+        union {
+            btSimdFloat4 vec;
+            btScalar f[4];
+        } temp;
+        
+        if (trace > btScalar(0.0)) 
+        {
+            x = trace + btScalar(1.0);
+
+            temp.f[0]=m_el[2].y() - m_el[1].z();
+            temp.f[1]=m_el[0].z() - m_el[2].x();
+            temp.f[2]=m_el[1].x() - m_el[0].y();
+            temp.f[3]=x;
+            //temp.f[3]= s * btScalar(0.5);
+        } 
+        else 
+        {
+            int i, j, k;
+            if(m_el[0].x() < m_el[1].y()) 
+            { 
+                if( m_el[1].y() < m_el[2].z() )
+                    { i = 2; j = 0; k = 1; }
+                else
+                    { i = 1; j = 2; k = 0; }
+            }
+            else
+            {
+                if( m_el[0].x() < m_el[2].z())
+                    { i = 2; j = 0; k = 1; }
+                else
+                    { i = 0; j = 1; k = 2; }
+            }
+
+            x = m_el[i][i] - m_el[j][j] - m_el[k][k] + btScalar(1.0);
+
+            temp.f[3] = (m_el[k][j] - m_el[j][k]);
+            temp.f[j] = (m_el[j][i] + m_el[i][j]);
+            temp.f[k] = (m_el[k][i] + m_el[i][k]);
+            temp.f[i] = x;
+            //temp.f[i] = s * btScalar(0.5);
+        }
+
+        s = btSqrt(x);
+        q.set128(temp.vec);
+        s = btScalar(0.5) / s;
+
+        q *= s;
+#else    
 		btScalar trace = m_el[0].x() + m_el[1].y() + m_el[2].z();
+
 		btScalar temp[4];
 
 		if (trace > btScalar(0.0)) 
@@ -270,6 +477,7 @@ public:
 			temp[k] = (m_el[k][i] + m_el[i][k]) * s;
 		}
 		q.setValue(temp[0],temp[1],temp[2],temp[3]);
+#endif
 	}
 
 	/**@brief Get the matrix represented as euler angles around YXZ, roundtrip with setEulerYPR
@@ -376,9 +584,14 @@ public:
 
 	btMatrix3x3 scaled(const btVector3& s) const
 	{
-		return btMatrix3x3(m_el[0].x() * s.x(), m_el[0].y() * s.y(), m_el[0].z() * s.z(),
+#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON)
+		return btMatrix3x3(m_el[0] * s, m_el[1] * s, m_el[2] * s);
+#else		
+		return btMatrix3x3(
+            m_el[0].x() * s.x(), m_el[0].y() * s.y(), m_el[0].z() * s.z(),
 			m_el[1].x() * s.x(), m_el[1].y() * s.y(), m_el[1].z() * s.z(),
 			m_el[2].x() * s.x(), m_el[2].y() * s.y(), m_el[2].z() * s.z());
+#endif
 	}
 
 	/**@brief Return the determinant of the matrix */
@@ -527,15 +740,101 @@ public:
 SIMD_FORCE_INLINE btMatrix3x3& 
 btMatrix3x3::operator*=(const btMatrix3x3& m)
 {
-	setValue(m.tdotx(m_el[0]), m.tdoty(m_el[0]), m.tdotz(m_el[0]),
+#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+    __m128 rv00, rv01, rv02;
+    __m128 rv10, rv11, rv12;
+    __m128 rv20, rv21, rv22;
+    __m128 mv0, mv1, mv2;
+
+    rv02 = m_el[0].mVec128;
+    rv12 = m_el[1].mVec128;
+    rv22 = m_el[2].mVec128;
+
+    mv0 = _mm_and_ps(m[0].mVec128, btvFFF0fMask); 
+    mv1 = _mm_and_ps(m[1].mVec128, btvFFF0fMask); 
+    mv2 = _mm_and_ps(m[2].mVec128, btvFFF0fMask); 
+    
+    // rv0
+    rv00 = bt_splat_ps(rv02, 0);
+    rv01 = bt_splat_ps(rv02, 1);
+    rv02 = bt_splat_ps(rv02, 2);
+    
+    rv00 = _mm_mul_ps(rv00, mv0);
+    rv01 = _mm_mul_ps(rv01, mv1);
+    rv02 = _mm_mul_ps(rv02, mv2);
+    
+    // rv1
+    rv10 = bt_splat_ps(rv12, 0);
+    rv11 = bt_splat_ps(rv12, 1);
+    rv12 = bt_splat_ps(rv12, 2);
+    
+    rv10 = _mm_mul_ps(rv10, mv0);
+    rv11 = _mm_mul_ps(rv11, mv1);
+    rv12 = _mm_mul_ps(rv12, mv2);
+    
+    // rv2
+    rv20 = bt_splat_ps(rv22, 0);
+    rv21 = bt_splat_ps(rv22, 1);
+    rv22 = bt_splat_ps(rv22, 2);
+    
+    rv20 = _mm_mul_ps(rv20, mv0);
+    rv21 = _mm_mul_ps(rv21, mv1);
+    rv22 = _mm_mul_ps(rv22, mv2);
+
+    rv00 = _mm_add_ps(rv00, rv01);
+    rv10 = _mm_add_ps(rv10, rv11);
+    rv20 = _mm_add_ps(rv20, rv21);
+
+    m_el[0].mVec128 = _mm_add_ps(rv00, rv02);
+    m_el[1].mVec128 = _mm_add_ps(rv10, rv12);
+    m_el[2].mVec128 = _mm_add_ps(rv20, rv22);
+
+#elif defined(BT_USE_NEON)
+
+    float32x4_t rv0, rv1, rv2;
+    float32x4_t v0, v1, v2;
+    float32x4_t mv0, mv1, mv2;
+
+    v0 = m_el[0].mVec128;
+    v1 = m_el[1].mVec128;
+    v2 = m_el[2].mVec128;
+
+    mv0 = (float32x4_t) vandq_s32((int32x4_t)m[0].mVec128, btvFFF0Mask); 
+    mv1 = (float32x4_t) vandq_s32((int32x4_t)m[1].mVec128, btvFFF0Mask); 
+    mv2 = (float32x4_t) vandq_s32((int32x4_t)m[2].mVec128, btvFFF0Mask); 
+    
+    rv0 = vmulq_lane_f32(mv0, vget_low_f32(v0), 0);
+    rv1 = vmulq_lane_f32(mv0, vget_low_f32(v1), 0);
+    rv2 = vmulq_lane_f32(mv0, vget_low_f32(v2), 0);
+    
+    rv0 = vmlaq_lane_f32(rv0, mv1, vget_low_f32(v0), 1);
+    rv1 = vmlaq_lane_f32(rv1, mv1, vget_low_f32(v1), 1);
+    rv2 = vmlaq_lane_f32(rv2, mv1, vget_low_f32(v2), 1);
+    
+    rv0 = vmlaq_lane_f32(rv0, mv2, vget_high_f32(v0), 0);
+    rv1 = vmlaq_lane_f32(rv1, mv2, vget_high_f32(v1), 0);
+    rv2 = vmlaq_lane_f32(rv2, mv2, vget_high_f32(v2), 0);
+
+    m_el[0].mVec128 = rv0;
+    m_el[1].mVec128 = rv1;
+    m_el[2].mVec128 = rv2;
+#else    
+	setValue(
+        m.tdotx(m_el[0]), m.tdoty(m_el[0]), m.tdotz(m_el[0]),
 		m.tdotx(m_el[1]), m.tdoty(m_el[1]), m.tdotz(m_el[1]),
 		m.tdotx(m_el[2]), m.tdoty(m_el[2]), m.tdotz(m_el[2]));
+#endif
 	return *this;
 }
 
 SIMD_FORCE_INLINE btMatrix3x3& 
 btMatrix3x3::operator+=(const btMatrix3x3& m)
 {
+#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON)
+    m_el[0].mVec128 = m_el[0].mVec128 + m.m_el[0].mVec128;
+    m_el[1].mVec128 = m_el[1].mVec128 + m.m_el[1].mVec128;
+    m_el[2].mVec128 = m_el[2].mVec128 + m.m_el[2].mVec128;
+#else
 	setValue(
 		m_el[0][0]+m.m_el[0][0], 
 		m_el[0][1]+m.m_el[0][1],
@@ -546,52 +845,89 @@ btMatrix3x3::operator+=(const btMatrix3x3& m)
 		m_el[2][0]+m.m_el[2][0], 
 		m_el[2][1]+m.m_el[2][1],
 		m_el[2][2]+m.m_el[2][2]);
+#endif
 	return *this;
 }
 
 SIMD_FORCE_INLINE btMatrix3x3
 operator*(const btMatrix3x3& m, const btScalar & k)
 {
+#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
+    __m128 vk = bt_splat_ps(_mm_load_ss((float *)&k), 0x80);
+    return btMatrix3x3(
+                _mm_mul_ps(m[0].mVec128, vk), 
+                _mm_mul_ps(m[1].mVec128, vk), 
+                _mm_mul_ps(m[2].mVec128, vk)); 
+#elif defined(BT_USE_NEON)
+    return btMatrix3x3(
+                vmulq_n_f32(m[0].mVec128, k),
+                vmulq_n_f32(m[1].mVec128, k),
+                vmulq_n_f32(m[2].mVec128, k)); 
+#else
 	return btMatrix3x3(
 		m[0].x()*k,m[0].y()*k,m[0].z()*k,
 		m[1].x()*k,m[1].y()*k,m[1].z()*k,
 		m[2].x()*k,m[2].y()*k,m[2].z()*k);
+#endif
 }
 
- SIMD_FORCE_INLINE btMatrix3x3 
+SIMD_FORCE_INLINE btMatrix3x3 
 operator+(const btMatrix3x3& m1, const btMatrix3x3& m2)
 {
+#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON)
 	return btMatrix3x3(
-	m1[0][0]+m2[0][0], 
-	m1[0][1]+m2[0][1],
-	m1[0][2]+m2[0][2],
-	m1[1][0]+m2[1][0], 
-	m1[1][1]+m2[1][1],
-	m1[1][2]+m2[1][2],
-	m1[2][0]+m2[2][0], 
-	m1[2][1]+m2[2][1],
-	m1[2][2]+m2[2][2]);
+        m1[0].mVec128 + m2[0].mVec128,
+        m1[1].mVec128 + m2[1].mVec128,
+        m1[2].mVec128 + m2[2].mVec128);
+#else
+	return btMatrix3x3(
+        m1[0][0]+m2[0][0], 
+        m1[0][1]+m2[0][1],
+        m1[0][2]+m2[0][2],
+        
+        m1[1][0]+m2[1][0], 
+        m1[1][1]+m2[1][1],
+        m1[1][2]+m2[1][2],
+        
+        m1[2][0]+m2[2][0], 
+        m1[2][1]+m2[2][1],
+        m1[2][2]+m2[2][2]);
+#endif    
 }
 
 SIMD_FORCE_INLINE btMatrix3x3 
 operator-(const btMatrix3x3& m1, const btMatrix3x3& m2)
 {
+#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON)
 	return btMatrix3x3(
-	m1[0][0]-m2[0][0], 
-	m1[0][1]-m2[0][1],
-	m1[0][2]-m2[0][2],
-	m1[1][0]-m2[1][0], 
-	m1[1][1]-m2[1][1],
-	m1[1][2]-m2[1][2],
-	m1[2][0]-m2[2][0], 
-	m1[2][1]-m2[2][1],
-	m1[2][2]-m2[2][2]);
+        m1[0].mVec128 - m2[0].mVec128,
+        m1[1].mVec128 - m2[1].mVec128,
+        m1[2].mVec128 - m2[2].mVec128);
+#else
+	return btMatrix3x3(
+        m1[0][0]-m2[0][0], 
+        m1[0][1]-m2[0][1],
+        m1[0][2]-m2[0][2],
+        
+        m1[1][0]-m2[1][0], 
+        m1[1][1]-m2[1][1],
+        m1[1][2]-m2[1][2],
+        
+        m1[2][0]-m2[2][0], 
+        m1[2][1]-m2[2][1],
+        m1[2][2]-m2[2][2]);
+#endif
 }
 
 
 SIMD_FORCE_INLINE btMatrix3x3& 
 btMatrix3x3::operator-=(const btMatrix3x3& m)
 {
+#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON)
+    m_el[0].mVec128 = m_el[0].mVec128 - m.m_el[0].mVec128;
+    m_el[1].mVec128 = m_el[1].mVec128 - m.m_el[1].mVec128;
+    m_el[2].mVec128 = m_el[2].mVec128 - m.m_el[2].mVec128;
+#else
 	setValue(
 	m_el[0][0]-m.m_el[0][0], 
 	m_el[0][1]-m.m_el[0][1],
@@ -602,6 +938,7 @@ btMatrix3x3::operator-=(const btMatrix3x3& m)
 	m_el[2][0]-m.m_el[2][0], 
 	m_el[2][1]-m.m_el[2][1],
 	m_el[2][2]-m.m_el[2][2]);
+#endif
 	return *this;
 }
 
@@ -616,18 +953,59 @@ btMatrix3x3::determinant() const
 SIMD_FORCE_INLINE btMatrix3x3 
 btMatrix3x3::absolute() const
 {
+#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
+    return btMatrix3x3(
+            _mm_and_ps(m_el[0].mVec128, btvAbsfMask),
+            _mm_and_ps(m_el[1].mVec128, btvAbsfMask),
+            _mm_and_ps(m_el[2].mVec128, btvAbsfMask));
+#elif defined(BT_USE_NEON)
+    return btMatrix3x3(
+            (float32x4_t)vandq_s32((int32x4_t)m_el[0].mVec128, btv3AbsMask),
+            (float32x4_t)vandq_s32((int32x4_t)m_el[1].mVec128, btv3AbsMask),
+            (float32x4_t)vandq_s32((int32x4_t)m_el[2].mVec128, btv3AbsMask));
+#else	
 	return btMatrix3x3(
-		btFabs(m_el[0].x()), btFabs(m_el[0].y()), btFabs(m_el[0].z()),
-		btFabs(m_el[1].x()), btFabs(m_el[1].y()), btFabs(m_el[1].z()),
-		btFabs(m_el[2].x()), btFabs(m_el[2].y()), btFabs(m_el[2].z()));
+            btFabs(m_el[0].x()), btFabs(m_el[0].y()), btFabs(m_el[0].z()),
+            btFabs(m_el[1].x()), btFabs(m_el[1].y()), btFabs(m_el[1].z()),
+            btFabs(m_el[2].x()), btFabs(m_el[2].y()), btFabs(m_el[2].z()));
+#endif
 }
 
 SIMD_FORCE_INLINE btMatrix3x3 
 btMatrix3x3::transpose() const 
 {
-	return btMatrix3x3(m_el[0].x(), m_el[1].x(), m_el[2].x(),
-		m_el[0].y(), m_el[1].y(), m_el[2].y(),
-		m_el[0].z(), m_el[1].z(), m_el[2].z());
+#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
+    __m128 v0 = m_el[0].mVec128;
+    __m128 v1 = m_el[1].mVec128;
+    __m128 v2 = m_el[2].mVec128;    //  x2 y2 z2 w2
+    __m128 vT;
+    
+    v2 = _mm_and_ps(v2, btvFFF0fMask);  //  x2 y2 z2 0
+    
+    vT = _mm_unpackhi_ps(v0, v1);	//	z0 z1 * *
+    v0 = _mm_unpacklo_ps(v0, v1);	//	x0 x1 y0 y1
+
+    v1 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(2, 3, 1, 3) );	// y0 y1 y2 0
+    v0 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(0, 1, 0, 3) );	// x0 x1 x2 0
+    v2 = btCastdTo128f(_mm_move_sd(btCastfTo128d(v2), btCastfTo128d(vT)));	// z0 z1 z2 0
+
+
+    return btMatrix3x3( v0, v1, v2 );
+#elif defined(BT_USE_NEON)
+    // note: zeros the w channel. We can preserve it at the cost of two more vtrn instructions.
+    static const uint32x2_t zMask = (const uint32x2_t) {-1, 0 };
+    float32x4x2_t top = vtrnq_f32( m_el[0].mVec128, m_el[1].mVec128 );  // {x0 x1 z0 z1}, {y0 y1 w0 w1}
+    float32x2x2_t bl = vtrn_f32( vget_low_f32(m_el[2].mVec128), vdup_n_f32(0.0f) );       // {x2  0 }, {y2 0}
+    float32x4_t v0 = vcombine_f32( vget_low_f32(top.val[0]), bl.val[0] );
+    float32x4_t v1 = vcombine_f32( vget_low_f32(top.val[1]), bl.val[1] );
+    float32x2_t q = (float32x2_t) vand_u32( (uint32x2_t) vget_high_f32( m_el[2].mVec128), zMask );
+    float32x4_t v2 = vcombine_f32( vget_high_f32(top.val[0]), q );       // z0 z1 z2  0
+    return btMatrix3x3( v0, v1, v2 ); 
+#else
+	return btMatrix3x3( m_el[0].x(), m_el[1].x(), m_el[2].x(),
+                        m_el[0].y(), m_el[1].y(), m_el[2].y(),
+                        m_el[0].z(), m_el[1].z(), m_el[2].z());
+#endif
 }
 
 SIMD_FORCE_INLINE btMatrix3x3 
@@ -653,7 +1031,47 @@ btMatrix3x3::inverse() const
 SIMD_FORCE_INLINE btMatrix3x3 
 btMatrix3x3::transposeTimes(const btMatrix3x3& m) const
 {
-	return btMatrix3x3(
+#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
+    // zeros w
+//    static const __m128i xyzMask = (const __m128i){ -1ULL, 0xffffffffULL };
+    __m128 row = m_el[0].mVec128;
+    __m128 m0 = _mm_and_ps( m.getRow(0).mVec128, btvFFF0fMask );
+    __m128 m1 = _mm_and_ps( m.getRow(1).mVec128, btvFFF0fMask);
+    __m128 m2 = _mm_and_ps( m.getRow(2).mVec128, btvFFF0fMask );
+    __m128 r0 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0));
+    __m128 r1 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0x55));
+    __m128 r2 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0xaa));
+    row = m_el[1].mVec128;
+    r0 = _mm_add_ps( r0, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0)));
+    r1 = _mm_add_ps( r1, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0x55)));
+    r2 = _mm_add_ps( r2, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0xaa)));
+    row = m_el[2].mVec128;
+    r0 = _mm_add_ps( r0, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0)));
+    r1 = _mm_add_ps( r1, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0x55)));
+    r2 = _mm_add_ps( r2, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0xaa)));
+    return btMatrix3x3( r0, r1, r2 );
+
+#elif defined BT_USE_NEON
+    // zeros w
+    static const uint32x4_t xyzMask = (const uint32x4_t){ -1, -1, -1, 0 };
+    float32x4_t m0 = (float32x4_t) vandq_u32( (uint32x4_t) m.getRow(0).mVec128, xyzMask );
+    float32x4_t m1 = (float32x4_t) vandq_u32( (uint32x4_t) m.getRow(1).mVec128, xyzMask );
+    float32x4_t m2 = (float32x4_t) vandq_u32( (uint32x4_t) m.getRow(2).mVec128, xyzMask );
+    float32x4_t row = m_el[0].mVec128;
+    float32x4_t r0 = vmulq_lane_f32( m0, vget_low_f32(row), 0);
+    float32x4_t r1 = vmulq_lane_f32( m0, vget_low_f32(row), 1);
+    float32x4_t r2 = vmulq_lane_f32( m0, vget_high_f32(row), 0);
+    row = m_el[1].mVec128;
+    r0 = vmlaq_lane_f32( r0, m1, vget_low_f32(row), 0);
+    r1 = vmlaq_lane_f32( r1, m1, vget_low_f32(row), 1);
+    r2 = vmlaq_lane_f32( r2, m1, vget_high_f32(row), 0);
+    row = m_el[2].mVec128;
+    r0 = vmlaq_lane_f32( r0, m2, vget_low_f32(row), 0);
+    r1 = vmlaq_lane_f32( r1, m2, vget_low_f32(row), 1);
+    r2 = vmlaq_lane_f32( r2, m2, vget_high_f32(row), 0);
+    return btMatrix3x3( r0, r1, r2 );
+#else
+    return btMatrix3x3(
 		m_el[0].x() * m[0].x() + m_el[1].x() * m[1].x() + m_el[2].x() * m[2].x(),
 		m_el[0].x() * m[0].y() + m_el[1].x() * m[1].y() + m_el[2].x() * m[2].y(),
 		m_el[0].x() * m[0].z() + m_el[1].x() * m[1].z() + m_el[2].x() * m[2].z(),
@@ -663,38 +1081,196 @@ btMatrix3x3::transposeTimes(const btMatrix3x3& m) const
 		m_el[0].z() * m[0].x() + m_el[1].z() * m[1].x() + m_el[2].z() * m[2].x(),
 		m_el[0].z() * m[0].y() + m_el[1].z() * m[1].y() + m_el[2].z() * m[2].y(),
 		m_el[0].z() * m[0].z() + m_el[1].z() * m[1].z() + m_el[2].z() * m[2].z());
+#endif
 }
 
 SIMD_FORCE_INLINE btMatrix3x3 
 btMatrix3x3::timesTranspose(const btMatrix3x3& m) const
 {
+#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
+    __m128 a0 = m_el[0].mVec128;
+    __m128 a1 = m_el[1].mVec128;
+    __m128 a2 = m_el[2].mVec128;
+    
+    btMatrix3x3 mT = m.transpose(); // we rely on transpose() zeroing w channel so that we don't have to do it here
+    __m128 mx = mT[0].mVec128;
+    __m128 my = mT[1].mVec128;
+    __m128 mz = mT[2].mVec128;
+    
+    __m128 r0 = _mm_mul_ps(mx, _mm_shuffle_ps(a0, a0, 0x00));
+    __m128 r1 = _mm_mul_ps(mx, _mm_shuffle_ps(a1, a1, 0x00));
+    __m128 r2 = _mm_mul_ps(mx, _mm_shuffle_ps(a2, a2, 0x00));
+    r0 = _mm_add_ps(r0, _mm_mul_ps(my, _mm_shuffle_ps(a0, a0, 0x55)));
+    r1 = _mm_add_ps(r1, _mm_mul_ps(my, _mm_shuffle_ps(a1, a1, 0x55)));
+    r2 = _mm_add_ps(r2, _mm_mul_ps(my, _mm_shuffle_ps(a2, a2, 0x55)));
+    r0 = _mm_add_ps(r0, _mm_mul_ps(mz, _mm_shuffle_ps(a0, a0, 0xaa)));
+    r1 = _mm_add_ps(r1, _mm_mul_ps(mz, _mm_shuffle_ps(a1, a1, 0xaa)));
+    r2 = _mm_add_ps(r2, _mm_mul_ps(mz, _mm_shuffle_ps(a2, a2, 0xaa)));
+    return btMatrix3x3( r0, r1, r2);
+            
+#elif defined BT_USE_NEON
+    float32x4_t a0 = m_el[0].mVec128;
+    float32x4_t a1 = m_el[1].mVec128;
+    float32x4_t a2 = m_el[2].mVec128;
+    
+    btMatrix3x3 mT = m.transpose(); // we rely on transpose() zeroing w channel so that we don't have to do it here
+    float32x4_t mx = mT[0].mVec128;
+    float32x4_t my = mT[1].mVec128;
+    float32x4_t mz = mT[2].mVec128;
+    
+    float32x4_t r0 = vmulq_lane_f32( mx, vget_low_f32(a0), 0);
+    float32x4_t r1 = vmulq_lane_f32( mx, vget_low_f32(a1), 0);
+    float32x4_t r2 = vmulq_lane_f32( mx, vget_low_f32(a2), 0);
+    r0 = vmlaq_lane_f32( r0, my, vget_low_f32(a0), 1);
+    r1 = vmlaq_lane_f32( r1, my, vget_low_f32(a1), 1);
+    r2 = vmlaq_lane_f32( r2, my, vget_low_f32(a2), 1);
+    r0 = vmlaq_lane_f32( r0, mz, vget_high_f32(a0), 0);
+    r1 = vmlaq_lane_f32( r1, mz, vget_high_f32(a1), 0);
+    r2 = vmlaq_lane_f32( r2, mz, vget_high_f32(a2), 0);
+    return btMatrix3x3( r0, r1, r2 );
+    
+#else
 	return btMatrix3x3(
 		m_el[0].dot(m[0]), m_el[0].dot(m[1]), m_el[0].dot(m[2]),
 		m_el[1].dot(m[0]), m_el[1].dot(m[1]), m_el[1].dot(m[2]),
 		m_el[2].dot(m[0]), m_el[2].dot(m[1]), m_el[2].dot(m[2]));
-
+#endif
 }
 
 SIMD_FORCE_INLINE btVector3 
 operator*(const btMatrix3x3& m, const btVector3& v) 
 {
+#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON)
+    return v.dot3(m[0], m[1], m[2]);
+#else
 	return btVector3(m[0].dot(v), m[1].dot(v), m[2].dot(v));
+#endif
 }
 
 
 SIMD_FORCE_INLINE btVector3
 operator*(const btVector3& v, const btMatrix3x3& m)
 {
+#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
+
+    const __m128 vv = v.mVec128;
+
+    __m128 c0 = bt_splat_ps( vv, 0);
+    __m128 c1 = bt_splat_ps( vv, 1);
+    __m128 c2 = bt_splat_ps( vv, 2);
+
+    c0 = _mm_mul_ps(c0, _mm_and_ps(m[0].mVec128, btvFFF0fMask) );
+    c1 = _mm_mul_ps(c1, _mm_and_ps(m[1].mVec128, btvFFF0fMask) );
+    c0 = _mm_add_ps(c0, c1);
+    c2 = _mm_mul_ps(c2, _mm_and_ps(m[2].mVec128, btvFFF0fMask) );
+    
+    return btVector3(_mm_add_ps(c0, c2));
+#elif defined(BT_USE_NEON)
+    const float32x4_t vv = v.mVec128;
+    const float32x2_t vlo = vget_low_f32(vv);
+    const float32x2_t vhi = vget_high_f32(vv);
+
+    float32x4_t c0, c1, c2;
+
+    c0 = (float32x4_t) vandq_s32((int32x4_t)m[0].mVec128, btvFFF0Mask);
+    c1 = (float32x4_t) vandq_s32((int32x4_t)m[1].mVec128, btvFFF0Mask);
+    c2 = (float32x4_t) vandq_s32((int32x4_t)m[2].mVec128, btvFFF0Mask);
+
+    c0 = vmulq_lane_f32(c0, vlo, 0);
+    c1 = vmulq_lane_f32(c1, vlo, 1);
+    c2 = vmulq_lane_f32(c2, vhi, 0);
+    c0 = vaddq_f32(c0, c1);
+    c0 = vaddq_f32(c0, c2);
+    
+    return btVector3(c0);
+#else
 	return btVector3(m.tdotx(v), m.tdoty(v), m.tdotz(v));
+#endif
 }
 
 SIMD_FORCE_INLINE btMatrix3x3 
 operator*(const btMatrix3x3& m1, const btMatrix3x3& m2)
 {
+#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
+
+    __m128 m10 = m1[0].mVec128;  
+    __m128 m11 = m1[1].mVec128;
+    __m128 m12 = m1[2].mVec128;
+    
+    __m128 m2v = _mm_and_ps(m2[0].mVec128, btvFFF0fMask);
+    
+    __m128 c0 = bt_splat_ps( m10, 0);
+    __m128 c1 = bt_splat_ps( m11, 0);
+    __m128 c2 = bt_splat_ps( m12, 0);
+    
+    c0 = _mm_mul_ps(c0, m2v);
+    c1 = _mm_mul_ps(c1, m2v);
+    c2 = _mm_mul_ps(c2, m2v);
+    
+    m2v = _mm_and_ps(m2[1].mVec128, btvFFF0fMask);
+    
+    __m128 c0_1 = bt_splat_ps( m10, 1);
+    __m128 c1_1 = bt_splat_ps( m11, 1);
+    __m128 c2_1 = bt_splat_ps( m12, 1);
+    
+    c0_1 = _mm_mul_ps(c0_1, m2v);
+    c1_1 = _mm_mul_ps(c1_1, m2v);
+    c2_1 = _mm_mul_ps(c2_1, m2v);
+    
+    m2v = _mm_and_ps(m2[2].mVec128, btvFFF0fMask);
+    
+    c0 = _mm_add_ps(c0, c0_1);
+    c1 = _mm_add_ps(c1, c1_1);
+    c2 = _mm_add_ps(c2, c2_1);
+    
+    m10 = bt_splat_ps( m10, 2);
+    m11 = bt_splat_ps( m11, 2);
+    m12 = bt_splat_ps( m12, 2);
+    
+    m10 = _mm_mul_ps(m10, m2v);
+    m11 = _mm_mul_ps(m11, m2v);
+    m12 = _mm_mul_ps(m12, m2v);
+    
+    c0 = _mm_add_ps(c0, m10);
+    c1 = _mm_add_ps(c1, m11);
+    c2 = _mm_add_ps(c2, m12);
+    
+    return btMatrix3x3(c0, c1, c2);
+
+#elif defined(BT_USE_NEON)
+
+    float32x4_t rv0, rv1, rv2;
+    float32x4_t v0, v1, v2;
+    float32x4_t mv0, mv1, mv2;
+
+    v0 = m1[0].mVec128;
+    v1 = m1[1].mVec128;
+    v2 = m1[2].mVec128;
+
+    mv0 = (float32x4_t) vandq_s32((int32x4_t)m2[0].mVec128, btvFFF0Mask); 
+    mv1 = (float32x4_t) vandq_s32((int32x4_t)m2[1].mVec128, btvFFF0Mask); 
+    mv2 = (float32x4_t) vandq_s32((int32x4_t)m2[2].mVec128, btvFFF0Mask); 
+    
+    rv0 = vmulq_lane_f32(mv0, vget_low_f32(v0), 0);
+    rv1 = vmulq_lane_f32(mv0, vget_low_f32(v1), 0);
+    rv2 = vmulq_lane_f32(mv0, vget_low_f32(v2), 0);
+    
+    rv0 = vmlaq_lane_f32(rv0, mv1, vget_low_f32(v0), 1);
+    rv1 = vmlaq_lane_f32(rv1, mv1, vget_low_f32(v1), 1);
+    rv2 = vmlaq_lane_f32(rv2, mv1, vget_low_f32(v2), 1);
+    
+    rv0 = vmlaq_lane_f32(rv0, mv2, vget_high_f32(v0), 0);
+    rv1 = vmlaq_lane_f32(rv1, mv2, vget_high_f32(v1), 0);
+    rv2 = vmlaq_lane_f32(rv2, mv2, vget_high_f32(v2), 0);
+
+	return btMatrix3x3(rv0, rv1, rv2);
+        
+#else	
 	return btMatrix3x3(
 		m2.tdotx( m1[0]), m2.tdoty( m1[0]), m2.tdotz( m1[0]),
 		m2.tdotx( m1[1]), m2.tdoty( m1[1]), m2.tdotz( m1[1]),
 		m2.tdotx( m1[2]), m2.tdoty( m1[2]), m2.tdotz( m1[2]));
+#endif
 }
 
 /*
@@ -716,9 +1292,24 @@ m1[0][2] * m2[0][2] + m1[1][2] * m2[1][2] + m1[2][2] * m2[2][2]);
 * It will test all elements are equal.  */
 SIMD_FORCE_INLINE bool operator==(const btMatrix3x3& m1, const btMatrix3x3& m2)
 {
-	return ( m1[0][0] == m2[0][0] && m1[1][0] == m2[1][0] && m1[2][0] == m2[2][0] &&
+#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
+
+    __m128 c0, c1, c2;
+
+    c0 = _mm_cmpeq_ps(m1[0].mVec128, m2[0].mVec128);
+    c1 = _mm_cmpeq_ps(m1[1].mVec128, m2[1].mVec128);
+    c2 = _mm_cmpeq_ps(m1[2].mVec128, m2[2].mVec128);
+    
+    c0 = _mm_and_ps(c0, c1);
+    c0 = _mm_and_ps(c0, c2);
+
+    return (0x7 == _mm_movemask_ps((__m128)c0));
+#else 
+	return 
+    (   m1[0][0] == m2[0][0] && m1[1][0] == m2[1][0] && m1[2][0] == m2[2][0] &&
 		m1[0][1] == m2[0][1] && m1[1][1] == m2[1][1] && m1[2][1] == m2[2][1] &&
 		m1[0][2] == m2[0][2] && m1[1][2] == m2[1][2] && m1[2][2] == m2[2][2] );
+#endif
 }
 
 ///for serialization
diff --git a/src/LinearMath/btQuadWord.h b/src/LinearMath/btQuadWord.h
index d5e9daa45..11067ef47 100644
--- a/src/LinearMath/btQuadWord.h
+++ b/src/LinearMath/btQuadWord.h
@@ -20,6 +20,9 @@ subject to the following restrictions:
 #include "btMinMax.h"
 
 
+
+
+
 #if defined (__CELLOS_LV2) && defined (__SPU__)
 #include <altivec.h>
 #endif
@@ -47,11 +50,53 @@ public:
 	}
 protected:
 #else //__CELLOS_LV2__ __SPU__
+
+#if defined(BT_USE_SSE) || defined(BT_USE_NEON) 
+	union {
+		btSimdFloat4 mVec128;
+		btScalar	m_floats[4];
+	};
+public:
+	SIMD_FORCE_INLINE	btSimdFloat4	get128() const
+	{
+		return mVec128;
+	}
+	SIMD_FORCE_INLINE	void	set128(btSimdFloat4 v128)
+	{
+		mVec128 = v128;
+	}
+#else
 	btScalar	m_floats[4];
+#endif // BT_USE_SSE
+
 #endif //__CELLOS_LV2__ __SPU__
 
 	public:
   
+#if defined(BT_USE_SSE) || defined(BT_USE_NEON)
+
+	// Set Vector 
+	SIMD_FORCE_INLINE btQuadWord(const btSimdFloat4 vec)
+	{
+		mVec128 = vec;
+	}
+
+	// Copy constructor
+	SIMD_FORCE_INLINE btQuadWord(const btQuadWord& rhs)
+	{
+		mVec128 = rhs.mVec128;
+	}
+
+	// Assignment Operator
+	SIMD_FORCE_INLINE btQuadWord& 
+	operator=(const btQuadWord& v) 
+	{
+		mVec128 = v.mVec128;
+		
+		return *this;
+	}
+	
+#endif
 
   /**@brief Return the x value */
 		SIMD_FORCE_INLINE const btScalar& getX() const { return m_floats[0]; }
@@ -60,13 +105,13 @@ protected:
   /**@brief Return the z value */
 		SIMD_FORCE_INLINE const btScalar& getZ() const { return m_floats[2]; }
   /**@brief Set the x value */
-		SIMD_FORCE_INLINE void	setX(btScalar x) { m_floats[0] = x;};
+		SIMD_FORCE_INLINE void	setX(btScalar _x) { m_floats[0] = _x;};
   /**@brief Set the y value */
-		SIMD_FORCE_INLINE void	setY(btScalar y) { m_floats[1] = y;};
+		SIMD_FORCE_INLINE void	setY(btScalar _y) { m_floats[1] = _y;};
   /**@brief Set the z value */
-		SIMD_FORCE_INLINE void	setZ(btScalar z) { m_floats[2] = z;};
+		SIMD_FORCE_INLINE void	setZ(btScalar _z) { m_floats[2] = _z;};
   /**@brief Set the w value */
-		SIMD_FORCE_INLINE void	setW(btScalar w) { m_floats[3] = w;};
+		SIMD_FORCE_INLINE void	setW(btScalar _w) { m_floats[3] = _w;};
   /**@brief Return the x value */
 		SIMD_FORCE_INLINE const btScalar& x() const { return m_floats[0]; }
   /**@brief Return the y value */
@@ -84,7 +129,14 @@ protected:
 
 	SIMD_FORCE_INLINE	bool	operator==(const btQuadWord& other) const
 	{
-		return ((m_floats[3]==other.m_floats[3]) && (m_floats[2]==other.m_floats[2]) && (m_floats[1]==other.m_floats[1]) && (m_floats[0]==other.m_floats[0]));
+#ifdef BT_USE_SSE
+        return (0xf == _mm_movemask_ps((__m128)_mm_cmpeq_ps(mVec128, other.mVec128)));
+#else 
+		return ((m_floats[3]==other.m_floats[3]) && 
+                (m_floats[2]==other.m_floats[2]) && 
+                (m_floats[1]==other.m_floats[1]) && 
+                (m_floats[0]==other.m_floats[0]));
+#endif
 	}
 
 	SIMD_FORCE_INLINE	bool	operator!=(const btQuadWord& other) const
@@ -97,11 +149,11 @@ protected:
    * @param y Value of y
    * @param z Value of z
    */
-		SIMD_FORCE_INLINE void 	setValue(const btScalar& x, const btScalar& y, const btScalar& z)
+		SIMD_FORCE_INLINE void 	setValue(const btScalar& _x, const btScalar& _y, const btScalar& _z)
 		{
-			m_floats[0]=x;
-			m_floats[1]=y;
-			m_floats[2]=z;
+			m_floats[0]=_x;
+			m_floats[1]=_y;
+			m_floats[2]=_z;
 			m_floats[3] = 0.f;
 		}
 
@@ -118,12 +170,12 @@ protected:
    * @param z Value of z
    * @param w Value of w
    */
-		SIMD_FORCE_INLINE void	setValue(const btScalar& x, const btScalar& y, const btScalar& z,const btScalar& w)
+		SIMD_FORCE_INLINE void	setValue(const btScalar& _x, const btScalar& _y, const btScalar& _z,const btScalar& _w)
 		{
-			m_floats[0]=x;
-			m_floats[1]=y;
-			m_floats[2]=z;
-			m_floats[3]=w;
+			m_floats[0]=_x;
+			m_floats[1]=_y;
+			m_floats[2]=_z;
+			m_floats[3]=_w;
 		}
   /**@brief No initialization constructor */
 		SIMD_FORCE_INLINE btQuadWord()
@@ -136,9 +188,9 @@ protected:
    * @param y Value of y
    * @param z Value of z
    */
-		SIMD_FORCE_INLINE btQuadWord(const btScalar& x, const btScalar& y, const btScalar& z)		
+		SIMD_FORCE_INLINE btQuadWord(const btScalar& _x, const btScalar& _y, const btScalar& _z)		
 		{
-			m_floats[0] = x, m_floats[1] = y, m_floats[2] = z, m_floats[3] = 0.0f;
+			m_floats[0] = _x, m_floats[1] = _y, m_floats[2] = _z, m_floats[3] = 0.0f;
 		}
 
 /**@brief Initializing constructor
@@ -147,9 +199,9 @@ protected:
    * @param z Value of z
    * @param w Value of w
    */
-		SIMD_FORCE_INLINE btQuadWord(const btScalar& x, const btScalar& y, const btScalar& z,const btScalar& w) 
+		SIMD_FORCE_INLINE btQuadWord(const btScalar& _x, const btScalar& _y, const btScalar& _z,const btScalar& _w) 
 		{
-			m_floats[0] = x, m_floats[1] = y, m_floats[2] = z, m_floats[3] = w;
+			m_floats[0] = _x, m_floats[1] = _y, m_floats[2] = _z, m_floats[3] = _w;
 		}
 
   /**@brief Set each element to the max of the current values and the values of another btQuadWord
@@ -157,21 +209,33 @@ protected:
    */
 		SIMD_FORCE_INLINE void	setMax(const btQuadWord& other)
 		{
-			btSetMax(m_floats[0], other.m_floats[0]);
+        #ifdef BT_USE_SSE
+            mVec128 = _mm_max_ps(mVec128, other.mVec128);
+        #elif defined(BT_USE_NEON)
+            mVec128 = vmaxq_f32(mVec128, other.mVec128);
+        #else
+        	btSetMax(m_floats[0], other.m_floats[0]);
 			btSetMax(m_floats[1], other.m_floats[1]);
 			btSetMax(m_floats[2], other.m_floats[2]);
 			btSetMax(m_floats[3], other.m_floats[3]);
-		}
+		#endif
+        }
   /**@brief Set each element to the min of the current values and the values of another btQuadWord
    * @param other The other btQuadWord to compare with 
    */
 		SIMD_FORCE_INLINE void	setMin(const btQuadWord& other)
 		{
-			btSetMin(m_floats[0], other.m_floats[0]);
+        #ifdef BT_USE_SSE
+            mVec128 = _mm_min_ps(mVec128, other.mVec128);
+        #elif defined(BT_USE_NEON)
+            mVec128 = vminq_f32(mVec128, other.mVec128);
+        #else
+        	btSetMin(m_floats[0], other.m_floats[0]);
 			btSetMin(m_floats[1], other.m_floats[1]);
 			btSetMin(m_floats[2], other.m_floats[2]);
 			btSetMin(m_floats[3], other.m_floats[3]);
-		}
+		#endif
+        }
 
 
 
diff --git a/src/LinearMath/btQuaternion.h b/src/LinearMath/btQuaternion.h
index ee79f6eae..ed67f2900 100644
--- a/src/LinearMath/btQuaternion.h
+++ b/src/LinearMath/btQuaternion.h
@@ -21,24 +21,65 @@ subject to the following restrictions:
 #include "btVector3.h"
 #include "btQuadWord.h"
 
+
+
+
+
+#ifdef BT_USE_SSE
+
+const __m128 ATTRIBUTE_ALIGNED16(vOnes) = {1.0f, 1.0f, 1.0f, 1.0f};
+
+#endif
+
+#if defined(BT_USE_SSE) || defined(BT_USE_NEON)
+
+const btSimdFloat4 ATTRIBUTE_ALIGNED16(vQInv) = {-0.0f, -0.0f, -0.0f, +0.0f};
+const btSimdFloat4 ATTRIBUTE_ALIGNED16(vPPPM) = {+0.0f, +0.0f, +0.0f, -0.0f};
+
+#endif
+
 /**@brief The btQuaternion implements quaternion to perform linear algebra rotations in combination with btMatrix3x3, btVector3 and btTransform. */
 class btQuaternion : public btQuadWord {
 public:
   /**@brief No initialization constructor */
 	btQuaternion() {}
 
+#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))|| defined(BT_USE_NEON) 
+	// Set Vector 
+	SIMD_FORCE_INLINE btQuaternion(const btSimdFloat4 vec)
+	{
+		mVec128 = vec;
+	}
+
+	// Copy constructor
+	SIMD_FORCE_INLINE btQuaternion(const btQuaternion& rhs)
+	{
+		mVec128 = rhs.mVec128;
+	}
+
+	// Assignment Operator
+	SIMD_FORCE_INLINE btQuaternion& 
+	operator=(const btQuaternion& v) 
+	{
+		mVec128 = v.mVec128;
+		
+		return *this;
+	}
+	
+#endif
+
 	//		template <typename btScalar>
 	//		explicit Quaternion(const btScalar *v) : Tuple4<btScalar>(v) {}
   /**@brief Constructor from scalars */
-	btQuaternion(const btScalar& x, const btScalar& y, const btScalar& z, const btScalar& w) 
-		: btQuadWord(x, y, z, w) 
+	btQuaternion(const btScalar& _x, const btScalar& _y, const btScalar& _z, const btScalar& _w) 
+		: btQuadWord(_x, _y, _z, _w) 
 	{}
   /**@brief Axis angle Constructor
    * @param axis The axis which the rotation is around
    * @param angle The magnitude of the rotation around the angle (Radians) */
-	btQuaternion(const btVector3& axis, const btScalar& angle) 
+	btQuaternion(const btVector3& _axis, const btScalar& _angle) 
 	{ 
-		setRotation(axis, angle); 
+		setRotation(_axis, _angle); 
 	}
   /**@brief Constructor from Euler angles
    * @param yaw Angle around Y unless BT_EULER_DEFAULT_ZYX defined then Z
@@ -55,13 +96,13 @@ public:
   /**@brief Set the rotation using axis angle notation 
    * @param axis The axis around which to rotate
    * @param angle The magnitude of the rotation in Radians */
-	void setRotation(const btVector3& axis, const btScalar& angle)
+	void setRotation(const btVector3& axis, const btScalar& _angle)
 	{
 		btScalar d = axis.length();
 		btAssert(d != btScalar(0.0));
-		btScalar s = btSin(angle * btScalar(0.5)) / d;
+		btScalar s = btSin(_angle * btScalar(0.5)) / d;
 		setValue(axis.x() * s, axis.y() * s, axis.z() * s, 
-			btCos(angle * btScalar(0.5)));
+			btCos(_angle * btScalar(0.5)));
 	}
   /**@brief Set the quaternion using Euler angles
    * @param yaw Angle around Y
@@ -107,7 +148,16 @@ public:
    * @param q The quaternion to add to this one */
 	SIMD_FORCE_INLINE	btQuaternion& operator+=(const btQuaternion& q)
 	{
-		m_floats[0] += q.x(); m_floats[1] += q.y(); m_floats[2] += q.z(); m_floats[3] += q.m_floats[3];
+#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+		mVec128 = _mm_add_ps(mVec128, q.mVec128);
+#elif defined(BT_USE_NEON)
+		mVec128 = vaddq_f32(mVec128, q.mVec128);
+#else	
+		m_floats[0] += q.x(); 
+        m_floats[1] += q.y(); 
+        m_floats[2] += q.z(); 
+        m_floats[3] += q.m_floats[3];
+#endif
 		return *this;
 	}
 
@@ -115,15 +165,35 @@ public:
    * @param q The quaternion to subtract from this one */
 	btQuaternion& operator-=(const btQuaternion& q) 
 	{
-		m_floats[0] -= q.x(); m_floats[1] -= q.y(); m_floats[2] -= q.z(); m_floats[3] -= q.m_floats[3];
-		return *this;
+#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+		mVec128 = _mm_sub_ps(mVec128, q.mVec128);
+#elif defined(BT_USE_NEON)
+		mVec128 = vsubq_f32(mVec128, q.mVec128);
+#else	
+		m_floats[0] -= q.x(); 
+        m_floats[1] -= q.y(); 
+        m_floats[2] -= q.z(); 
+        m_floats[3] -= q.m_floats[3];
+#endif
+        return *this;
 	}
 
   /**@brief Scale this quaternion
    * @param s The scalar to scale by */
 	btQuaternion& operator*=(const btScalar& s)
 	{
-		m_floats[0] *= s; m_floats[1] *= s; m_floats[2] *= s; m_floats[3] *= s;
+#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+		__m128	vs = _mm_load_ss(&s);	//	(S 0 0 0)
+		vs = bt_pshufd_ps(vs, 0);	//	(S S S S)
+		mVec128 = _mm_mul_ps(mVec128, vs);
+#elif defined(BT_USE_NEON)
+		mVec128 = vmulq_n_f32(mVec128, s);
+#else
+		m_floats[0] *= s; 
+        m_floats[1] *= s; 
+        m_floats[2] *= s; 
+        m_floats[3] *= s;
+#endif
 		return *this;
 	}
 
@@ -132,17 +202,111 @@ public:
    * Equivilant to this = this * q */
 	btQuaternion& operator*=(const btQuaternion& q)
 	{
-		setValue(m_floats[3] * q.x() + m_floats[0] * q.m_floats[3] + m_floats[1] * q.z() - m_floats[2] * q.y(),
+#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+		__m128 vQ2 = q.get128();
+		
+		__m128 A1 = bt_pshufd_ps(mVec128, BT_SHUFFLE(0,1,2,0));
+		__m128 B1 = bt_pshufd_ps(vQ2, BT_SHUFFLE(3,3,3,0));
+		
+		A1 = A1 * B1;
+		
+		__m128 A2 = bt_pshufd_ps(mVec128, BT_SHUFFLE(1,2,0,1));
+		__m128 B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(2,0,1,1));
+		
+		A2 = A2 * B2;
+		
+		B1 = bt_pshufd_ps(mVec128, BT_SHUFFLE(2,0,1,2));
+		B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(1,2,0,2));
+		
+		B1 = B1 * B2;	//	A3 *= B3
+		
+		mVec128 = bt_splat_ps(mVec128, 3);	//	A0
+		mVec128 = mVec128 * vQ2;	//	A0 * B0
+		
+		A1 = A1 + A2;	//	AB12
+		mVec128 = mVec128 - B1;	//	AB03 = AB0 - AB3 
+		A1 = _mm_xor_ps(A1, vPPPM);	//	change sign of the last element
+		mVec128 = mVec128+ A1;	//	AB03 + AB12
+
+#elif defined(BT_USE_NEON)     
+
+        float32x4_t vQ1 = mVec128;
+        float32x4_t vQ2 = q.get128();
+        float32x4_t A0, A1, B1, A2, B2, A3, B3;
+        float32x2_t vQ1zx, vQ2wx, vQ1yz, vQ2zx, vQ2yz, vQ2xz;
+        
+        {
+        float32x2x2_t tmp;
+        tmp = vtrn_f32( vget_high_f32(vQ1), vget_low_f32(vQ1) );       // {z x}, {w y}
+        vQ1zx = tmp.val[0];
+
+        tmp = vtrn_f32( vget_high_f32(vQ2), vget_low_f32(vQ2) );       // {z x}, {w y}
+        vQ2zx = tmp.val[0];
+        }
+        vQ2wx = vext_f32(vget_high_f32(vQ2), vget_low_f32(vQ2), 1); 
+
+        vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1);
+
+        vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1);
+        vQ2xz = vext_f32(vQ2zx, vQ2zx, 1);
+
+        A1 = vcombine_f32(vget_low_f32(vQ1), vQ1zx);                    // X Y  z x 
+        B1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ2), 1), vQ2wx); // W W  W X 
+
+        A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1));
+        B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1));
+
+        A3 = vcombine_f32(vQ1zx, vQ1yz);        // Z X Y Z
+        B3 = vcombine_f32(vQ2yz, vQ2xz);        // Y Z x z
+
+        A1 = vmulq_f32(A1, B1);
+        A2 = vmulq_f32(A2, B2);
+        A3 = vmulq_f32(A3, B3);	//	A3 *= B3
+        A0 = vmulq_lane_f32(vQ2, vget_high_f32(vQ1), 1); //	A0 * B0
+
+        A1 = vaddq_f32(A1, A2);	//	AB12 = AB1 + AB2
+        A0 = vsubq_f32(A0, A3);	//	AB03 = AB0 - AB3 
+        
+        //	change the sign of the last element
+        A1 = (btSimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)vPPPM);	
+        A0 = vaddq_f32(A0, A1);	//	AB03 + AB12
+        
+        mVec128 = A0;
+#else
+		setValue(
+            m_floats[3] * q.x() + m_floats[0] * q.m_floats[3] + m_floats[1] * q.z() - m_floats[2] * q.y(),
 			m_floats[3] * q.y() + m_floats[1] * q.m_floats[3] + m_floats[2] * q.x() - m_floats[0] * q.z(),
 			m_floats[3] * q.z() + m_floats[2] * q.m_floats[3] + m_floats[0] * q.y() - m_floats[1] * q.x(),
 			m_floats[3] * q.m_floats[3] - m_floats[0] * q.x() - m_floats[1] * q.y() - m_floats[2] * q.z());
+#endif
 		return *this;
 	}
   /**@brief Return the dot product between this quaternion and another
    * @param q The other quaternion */
 	btScalar dot(const btQuaternion& q) const
 	{
-		return m_floats[0] * q.x() + m_floats[1] * q.y() + m_floats[2] * q.z() + m_floats[3] * q.m_floats[3];
+#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+		__m128	vd;
+		
+		vd = _mm_mul_ps(mVec128, q.mVec128);
+		
+        __m128 t = _mm_movehl_ps(vd, vd);
+		vd = _mm_add_ps(vd, t);
+		t = _mm_shuffle_ps(vd, vd, 0x55);
+		vd = _mm_add_ss(vd, t);
+		
+        return _mm_cvtss_f32(vd);
+#elif defined(BT_USE_NEON)
+		float32x4_t vd = vmulq_f32(mVec128, q.mVec128);
+		float32x2_t x = vpadd_f32(vget_low_f32(vd), vget_high_f32(vd));  
+		x = vpadd_f32(x, x);
+		return vget_lane_f32(x, 0);
+#else    
+		return  m_floats[0] * q.x() + 
+                m_floats[1] * q.y() + 
+                m_floats[2] * q.z() + 
+                m_floats[3] * q.m_floats[3];
+#endif
 	}
 
   /**@brief Return the length squared of the quaternion */
@@ -161,7 +325,25 @@ public:
    * Such that x^2 + y^2 + z^2 +w^2 = 1 */
 	btQuaternion& normalize() 
 	{
+#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+		__m128	vd;
+		
+		vd = _mm_mul_ps(mVec128, mVec128);
+		
+        __m128 t = _mm_movehl_ps(vd, vd);
+		vd = _mm_add_ps(vd, t);
+		t = _mm_shuffle_ps(vd, vd, 0x55);
+		vd = _mm_add_ss(vd, t);
+
+		vd = _mm_sqrt_ss(vd);
+		vd = _mm_div_ss(vOnes, vd);
+        vd = bt_pshufd_ps(vd, 0); // splat
+		mVec128 = _mm_mul_ps(mVec128, vd);
+    
+		return *this;
+#else    
 		return *this /= length();
+#endif
 	}
 
   /**@brief Return a scaled version of this quaternion
@@ -169,10 +351,18 @@ public:
 	SIMD_FORCE_INLINE btQuaternion
 	operator*(const btScalar& s) const
 	{
+#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+		__m128	vs = _mm_load_ss(&s);	//	(S 0 0 0)
+		vs = bt_pshufd_ps(vs, 0x00);	//	(S S S S)
+		
+		return btQuaternion(_mm_mul_ps(mVec128, vs));
+#elif defined(BT_USE_NEON)
+		return btQuaternion(vmulq_n_f32(mVec128, s));
+#else
 		return btQuaternion(x() * s, y() * s, z() * s, m_floats[3] * s);
+#endif
 	}
 
-
   /**@brief Return an inversely scaled versionof this quaternion
    * @param s The inverse scale factor */
 	btQuaternion operator/(const btScalar& s) const
@@ -223,7 +413,13 @@ public:
 	/**@brief Return the inverse of this quaternion */
 	btQuaternion inverse() const
 	{
+#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+		return btQuaternion(_mm_xor_ps(mVec128, vQInv));
+#elif defined(BT_USE_NEON)
+        return btQuaternion((btSimdFloat4)veorq_s32((int32x4_t)mVec128, (int32x4_t)vQInv));
+#else	
 		return btQuaternion(-m_floats[0], -m_floats[1], -m_floats[2], m_floats[3]);
+#endif
 	}
 
   /**@brief Return the sum of this quaternion and the other 
@@ -231,8 +427,14 @@ public:
 	SIMD_FORCE_INLINE btQuaternion
 	operator+(const btQuaternion& q2) const
 	{
+#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+		return btQuaternion(_mm_add_ps(mVec128, q2.mVec128));
+#elif defined(BT_USE_NEON)
+        return btQuaternion(vaddq_f32(mVec128, q2.mVec128));
+#else	
 		const btQuaternion& q1 = *this;
 		return btQuaternion(q1.x() + q2.x(), q1.y() + q2.y(), q1.z() + q2.z(), q1.m_floats[3] + q2.m_floats[3]);
+#endif
 	}
 
   /**@brief Return the difference between this quaternion and the other 
@@ -240,16 +442,28 @@ public:
 	SIMD_FORCE_INLINE btQuaternion
 	operator-(const btQuaternion& q2) const
 	{
+#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+		return btQuaternion(_mm_sub_ps(mVec128, q2.mVec128));
+#elif defined(BT_USE_NEON)
+        return btQuaternion(vsubq_f32(mVec128, q2.mVec128));
+#else	
 		const btQuaternion& q1 = *this;
 		return btQuaternion(q1.x() - q2.x(), q1.y() - q2.y(), q1.z() - q2.z(), q1.m_floats[3] - q2.m_floats[3]);
+#endif
 	}
 
   /**@brief Return the negative of this quaternion 
    * This simply negates each element */
 	SIMD_FORCE_INLINE btQuaternion operator-() const
 	{
+#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+		return btQuaternion(_mm_xor_ps(mVec128, btvMzeroMask));
+#elif defined(BT_USE_NEON)
+		return btQuaternion((btSimdFloat4)veorq_s32((int32x4_t)mVec128, (int32x4_t)btvMzeroMask) );
+#else	
 		const btQuaternion& q2 = *this;
 		return btQuaternion( - q2.x(), - q2.y(),  - q2.z(),  - q2.m_floats[3]);
+#endif
 	}
   /**@todo document this and it's use */
 	SIMD_FORCE_INLINE btQuaternion farthest( const btQuaternion& qd) const 
@@ -323,29 +537,257 @@ public:
 
 /**@brief Return the product of two quaternions */
 SIMD_FORCE_INLINE btQuaternion
-operator*(const btQuaternion& q1, const btQuaternion& q2) {
-	return btQuaternion(q1.w() * q2.x() + q1.x() * q2.w() + q1.y() * q2.z() - q1.z() * q2.y(),
+operator*(const btQuaternion& q1, const btQuaternion& q2) 
+{
+#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+	__m128 vQ1 = q1.get128();
+	__m128 vQ2 = q2.get128();
+	__m128 A0, A1, B1, A2, B2;
+    
+	A1 = bt_pshufd_ps(vQ1, BT_SHUFFLE(0,1,2,0)); // X Y  z x     //      vtrn
+	B1 = bt_pshufd_ps(vQ2, BT_SHUFFLE(3,3,3,0)); // W W  W X     // vdup vext
+
+	A1 = A1 * B1;
+	
+	A2 = bt_pshufd_ps(vQ1, BT_SHUFFLE(1,2,0,1)); // Y Z  X Y     // vext 
+	B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(2,0,1,1)); // z x  Y Y     // vtrn vdup
+
+	A2 = A2 * B2;
+
+	B1 = bt_pshufd_ps(vQ1, BT_SHUFFLE(2,0,1,2)); // z x Y Z      // vtrn vext
+	B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(1,2,0,2)); // Y Z x z      // vext vtrn
+	
+	B1 = B1 * B2;	//	A3 *= B3
+
+	A0 = bt_splat_ps(vQ1, 3);	//	A0
+	A0 = A0 * vQ2;	//	A0 * B0
+
+	A1 = A1 + A2;	//	AB12
+	A0 =  A0 - B1;	//	AB03 = AB0 - AB3 
+	
+    A1 = _mm_xor_ps(A1, vPPPM);	//	change sign of the last element
+	A0 = A0 + A1;	//	AB03 + AB12
+	
+	return btQuaternion(A0);
+
+#elif defined(BT_USE_NEON)     
+
+	float32x4_t vQ1 = q1.get128();
+	float32x4_t vQ2 = q2.get128();
+	float32x4_t A0, A1, B1, A2, B2, A3, B3;
+    float32x2_t vQ1zx, vQ2wx, vQ1yz, vQ2zx, vQ2yz, vQ2xz;
+    
+    {
+    float32x2x2_t tmp;
+    tmp = vtrn_f32( vget_high_f32(vQ1), vget_low_f32(vQ1) );       // {z x}, {w y}
+    vQ1zx = tmp.val[0];
+
+    tmp = vtrn_f32( vget_high_f32(vQ2), vget_low_f32(vQ2) );       // {z x}, {w y}
+    vQ2zx = tmp.val[0];
+    }
+    vQ2wx = vext_f32(vget_high_f32(vQ2), vget_low_f32(vQ2), 1); 
+
+    vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1);
+
+    vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1);
+    vQ2xz = vext_f32(vQ2zx, vQ2zx, 1);
+
+    A1 = vcombine_f32(vget_low_f32(vQ1), vQ1zx);                    // X Y  z x 
+    B1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ2), 1), vQ2wx); // W W  W X 
+
+	A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1));
+    B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1));
+
+    A3 = vcombine_f32(vQ1zx, vQ1yz);        // Z X Y Z
+    B3 = vcombine_f32(vQ2yz, vQ2xz);        // Y Z x z
+
+	A1 = vmulq_f32(A1, B1);
+	A2 = vmulq_f32(A2, B2);
+	A3 = vmulq_f32(A3, B3);	//	A3 *= B3
+	A0 = vmulq_lane_f32(vQ2, vget_high_f32(vQ1), 1); //	A0 * B0
+
+	A1 = vaddq_f32(A1, A2);	//	AB12 = AB1 + AB2
+	A0 = vsubq_f32(A0, A3);	//	AB03 = AB0 - AB3 
+	
+    //	change the sign of the last element
+    A1 = (btSimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)vPPPM);	
+	A0 = vaddq_f32(A0, A1);	//	AB03 + AB12
+	
+	return btQuaternion(A0);
+
+#else
+	return btQuaternion(
+        q1.w() * q2.x() + q1.x() * q2.w() + q1.y() * q2.z() - q1.z() * q2.y(),
 		q1.w() * q2.y() + q1.y() * q2.w() + q1.z() * q2.x() - q1.x() * q2.z(),
 		q1.w() * q2.z() + q1.z() * q2.w() + q1.x() * q2.y() - q1.y() * q2.x(),
 		q1.w() * q2.w() - q1.x() * q2.x() - q1.y() * q2.y() - q1.z() * q2.z()); 
+#endif
 }
 
 SIMD_FORCE_INLINE btQuaternion
 operator*(const btQuaternion& q, const btVector3& w)
 {
-	return btQuaternion( q.w() * w.x() + q.y() * w.z() - q.z() * w.y(),
-		q.w() * w.y() + q.z() * w.x() - q.x() * w.z(),
-		q.w() * w.z() + q.x() * w.y() - q.y() * w.x(),
+#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+	__m128 vQ1 = q.get128();
+	__m128 vQ2 = w.get128();
+	__m128 A1, B1, A2, B2, A3, B3;
+	
+	A1 = bt_pshufd_ps(vQ1, BT_SHUFFLE(3,3,3,0));
+	B1 = bt_pshufd_ps(vQ2, BT_SHUFFLE(0,1,2,0));
+
+	A1 = A1 * B1;
+	
+	A2 = bt_pshufd_ps(vQ1, BT_SHUFFLE(1,2,0,1));
+	B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(2,0,1,1));
+
+	A2 = A2 * B2;
+
+	A3 = bt_pshufd_ps(vQ1, BT_SHUFFLE(2,0,1,2));
+	B3 = bt_pshufd_ps(vQ2, BT_SHUFFLE(1,2,0,2));
+	
+	A3 = A3 * B3;	//	A3 *= B3
+
+	A1 = A1 + A2;	//	AB12
+	A1 = _mm_xor_ps(A1, vPPPM);	//	change sign of the last element
+    A1 = A1 - A3;	//	AB123 = AB12 - AB3 
+	
+	return btQuaternion(A1);
+    
+#elif defined(BT_USE_NEON)     
+
+	float32x4_t vQ1 = q.get128();
+	float32x4_t vQ2 = w.get128();
+	float32x4_t A1, B1, A2, B2, A3, B3;
+    float32x2_t vQ1wx, vQ2zx, vQ1yz, vQ2yz, vQ1zx, vQ2xz;
+    
+    vQ1wx = vext_f32(vget_high_f32(vQ1), vget_low_f32(vQ1), 1); 
+    {
+    float32x2x2_t tmp;
+
+    tmp = vtrn_f32( vget_high_f32(vQ2), vget_low_f32(vQ2) );       // {z x}, {w y}
+    vQ2zx = tmp.val[0];
+
+    tmp = vtrn_f32( vget_high_f32(vQ1), vget_low_f32(vQ1) );       // {z x}, {w y}
+    vQ1zx = tmp.val[0];
+    }
+
+    vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1);
+
+    vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1);
+    vQ2xz = vext_f32(vQ2zx, vQ2zx, 1);
+
+    A1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ1), 1), vQ1wx); // W W  W X 
+    B1 = vcombine_f32(vget_low_f32(vQ2), vQ2zx);                    // X Y  z x 
+
+	A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1));
+    B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1));
+
+    A3 = vcombine_f32(vQ1zx, vQ1yz);        // Z X Y Z
+    B3 = vcombine_f32(vQ2yz, vQ2xz);        // Y Z x z
+
+	A1 = vmulq_f32(A1, B1);
+	A2 = vmulq_f32(A2, B2);
+	A3 = vmulq_f32(A3, B3);	//	A3 *= B3
+
+	A1 = vaddq_f32(A1, A2);	//	AB12 = AB1 + AB2
+	
+    //	change the sign of the last element
+    A1 = (btSimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)vPPPM);	
+	
+    A1 = vsubq_f32(A1, A3);	//	AB123 = AB12 - AB3
+	
+	return btQuaternion(A1);
+    
+#else
+	return btQuaternion( 
+         q.w() * w.x() + q.y() * w.z() - q.z() * w.y(),
+		 q.w() * w.y() + q.z() * w.x() - q.x() * w.z(),
+		 q.w() * w.z() + q.x() * w.y() - q.y() * w.x(),
 		-q.x() * w.x() - q.y() * w.y() - q.z() * w.z()); 
+#endif
 }
 
 SIMD_FORCE_INLINE btQuaternion
 operator*(const btVector3& w, const btQuaternion& q)
 {
-	return btQuaternion( w.x() * q.w() + w.y() * q.z() - w.z() * q.y(),
-		w.y() * q.w() + w.z() * q.x() - w.x() * q.z(),
-		w.z() * q.w() + w.x() * q.y() - w.y() * q.x(),
+#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+	__m128 vQ1 = w.get128();
+	__m128 vQ2 = q.get128();
+	__m128 A1, B1, A2, B2, A3, B3;
+	
+	A1 = bt_pshufd_ps(vQ1, BT_SHUFFLE(0,1,2,0));  // X Y  z x
+	B1 = bt_pshufd_ps(vQ2, BT_SHUFFLE(3,3,3,0));  // W W  W X 
+
+	A1 = A1 * B1;
+	
+	A2 = bt_pshufd_ps(vQ1, BT_SHUFFLE(1,2,0,1));
+	B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(2,0,1,1));
+
+	A2 = A2 *B2;
+
+	A3 = bt_pshufd_ps(vQ1, BT_SHUFFLE(2,0,1,2));
+	B3 = bt_pshufd_ps(vQ2, BT_SHUFFLE(1,2,0,2));
+	
+	A3 = A3 * B3;	//	A3 *= B3
+
+	A1 = A1 + A2;	//	AB12
+	A1 = _mm_xor_ps(A1, vPPPM);	//	change sign of the last element
+	A1 = A1 - A3;	//	AB123 = AB12 - AB3 
+	
+	return btQuaternion(A1);
+
+#elif defined(BT_USE_NEON)     
+
+	float32x4_t vQ1 = w.get128();
+	float32x4_t vQ2 = q.get128();
+	float32x4_t  A1, B1, A2, B2, A3, B3;
+    float32x2_t vQ1zx, vQ2wx, vQ1yz, vQ2zx, vQ2yz, vQ2xz;
+    
+    {
+    float32x2x2_t tmp;
+   
+    tmp = vtrn_f32( vget_high_f32(vQ1), vget_low_f32(vQ1) );       // {z x}, {w y}
+    vQ1zx = tmp.val[0];
+
+    tmp = vtrn_f32( vget_high_f32(vQ2), vget_low_f32(vQ2) );       // {z x}, {w y}
+    vQ2zx = tmp.val[0];
+    }
+    vQ2wx = vext_f32(vget_high_f32(vQ2), vget_low_f32(vQ2), 1); 
+
+    vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1);
+
+    vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1);
+    vQ2xz = vext_f32(vQ2zx, vQ2zx, 1);
+
+    A1 = vcombine_f32(vget_low_f32(vQ1), vQ1zx);                    // X Y  z x 
+    B1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ2), 1), vQ2wx); // W W  W X 
+
+	A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1));
+    B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1));
+
+    A3 = vcombine_f32(vQ1zx, vQ1yz);        // Z X Y Z
+    B3 = vcombine_f32(vQ2yz, vQ2xz);        // Y Z x z
+
+	A1 = vmulq_f32(A1, B1);
+	A2 = vmulq_f32(A2, B2);
+	A3 = vmulq_f32(A3, B3);	//	A3 *= B3
+
+	A1 = vaddq_f32(A1, A2);	//	AB12 = AB1 + AB2
+	
+    //	change the sign of the last element
+    A1 = (btSimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)vPPPM);	
+	
+    A1 = vsubq_f32(A1, A3);	//	AB123 = AB12 - AB3
+	
+	return btQuaternion(A1);
+    
+#else
+	return btQuaternion( 
+        +w.x() * q.w() + w.y() * q.z() - w.z() * q.y(),
+		+w.y() * q.w() + w.z() * q.x() - w.x() * q.z(),
+		+w.z() * q.w() + w.x() * q.y() - w.y() * q.x(),
 		-w.x() * q.x() - w.y() * q.y() - w.z() * q.z()); 
+#endif
 }
 
 /**@brief Calculate the dot product between two quaternions */
@@ -393,7 +835,13 @@ quatRotate(const btQuaternion& rotation, const btVector3& v)
 {
 	btQuaternion q = rotation * v;
 	q *= rotation.inverse();
+#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+	return btVector3(_mm_and_ps(q.get128(), btvFFF0fMask));
+#elif defined(BT_USE_NEON)
+    return btVector3((float32x4_t)vandq_s32((int32x4_t)q.get128(), btvFFF0Mask));
+#else	
 	return btVector3(q.getX(),q.getY(),q.getZ());
+#endif
 }
 
 SIMD_FORCE_INLINE btQuaternion 
diff --git a/src/LinearMath/btScalar.h b/src/LinearMath/btScalar.h
index ecae97224..ac5aac99b 100644
--- a/src/LinearMath/btScalar.h
+++ b/src/LinearMath/btScalar.h
@@ -69,6 +69,15 @@ inline int	btGetVersion()
 
 #if (defined (_WIN32) && (_MSC_VER) && _MSC_VER >= 1400) && (!defined (BT_USE_DOUBLE_PRECISION))
 			#define BT_USE_SSE
+			#ifdef BT_USE_SSE
+			//BT_USE_SSE_IN_API is disabled under Windows by default, because 
+			//it makes it harder to integrate Bullet into your application under Windows 
+			//(structured embedding Bullet structs/classes need to be 16-byte aligned)
+			//with relatively little performance gain
+			//If you are not embedded Bullet data in your classes, or make sure that you align those classes on 16-byte boundaries
+			//you can manually enable this line or set it in the build system for a bit of performance gain (a few percent, dependent on usage)
+			//#define BT_USE_SSE_IN_API
+			#endif //BT_USE_SSE
 			#include <emmintrin.h>
 #endif
 
@@ -143,11 +152,39 @@ inline int	btGetVersion()
 #else
 	//non-windows systems
 
-#if (defined (__APPLE__) && defined (__i386__) && (!defined (BT_USE_DOUBLE_PRECISION)))
-	#define BT_USE_SSE
-	#include <emmintrin.h>
+#if (defined (__APPLE__) && (!defined (BT_USE_DOUBLE_PRECISION)))
+    #if defined (__i386__) || defined (__x86_64__)
+        #define BT_USE_SSE
+		//BT_USE_SSE_IN_API is enabled on Mac OSX by default, because memory is automatically aligned on 16-byte boundaries
+		//if apps run into issues, we will disable the next line
+		#define BT_USE_SSE_IN_API
+        #ifdef BT_USE_SSE
+            // include appropriate SSE level
+            #if defined (__SSE4_1__)
+                #include <smmintrin.h>
+            #elif defined (__SSSE3__)
+                #include <tmmintrin.h>
+            #elif defined (__SSE3__)
+                #include <pmmintrin.h>
+            #else
+                #include <emmintrin.h>
+            #endif
+        #endif //BT_USE_SSE
+    #elif defined( __arm__ )
+        #ifdef __clang__
+        #define BT_USE_NEON 1
+        #if defined BT_USE_NEON && defined (__clang__)
+            #if! defined( ARM_NEON_GCC_COMPATIBILITY )
+                // -DARM_NEON_GCC_COMPATIBILITY=1 changes neon vector types to raw vectors, syntactically similar to SSE and AltiVec
+                // instead of vectors wrapped up in structs. This code base assumes GCC style raw vectors are used.
+                #error The C preprocessor macro ARM_NEON_GCC_COMPATIBILITY must be defined. Pass -DARM_NEON_GCC_COMPATIBILITY=1 to the compiler.
+            #endif//!ARM_NEON_GCC_COMPATIBILITY
+            #include <arm_neon.h>
+        #endif//BT_USE_NEON
+       #endif //__clang__
+    #endif//__arm__
 
-	#define SIMD_FORCE_INLINE inline
+	#define SIMD_FORCE_INLINE inline __attribute__ ((always_inline))
 ///@todo: check out alignment methods for other platforms/compilers
 	#define ATTRIBUTE_ALIGNED16(a) a __attribute__ ((aligned (16)))
 	#define ATTRIBUTE_ALIGNED64(a) a __attribute__ ((aligned (64)))
@@ -210,6 +247,69 @@ typedef float btScalar;
 #define BT_LARGE_FLOAT 1e18f
 #endif
 
+#ifdef BT_USE_SSE
+typedef __m128 btSimdFloat4;
+#endif//BT_USE_SSE
+
+#if defined BT_USE_SSE_IN_API && defined (BT_USE_SSE)
+#ifdef _WIN32
+
+#ifndef BT_NAN
+static int btNanMask = 0x7F800001;
+#define BT_NAN (*(float*)&btNanMask)
+#endif
+
+#ifndef BT_INFINITY
+static  int btInfinityMask = 0x7F800000;
+#define BT_INFINITY (*(float*)&btInfinityMask)
+#endif
+
+inline __m128 operator + (const __m128 A, const __m128 B)
+{
+    return _mm_add_ps(A, B);
+}
+
+inline __m128 operator - (const __m128 A, const __m128 B)
+{
+    return _mm_sub_ps(A, B);
+}
+
+inline __m128 operator * (const __m128 A, const __m128 B)
+{
+    return _mm_mul_ps(A, B);
+}
+
+#define btCastfTo128i(a) (_mm_castps_si128(a))
+#define btCastfTo128d(a) (_mm_castps_pd(a))
+#define btCastiTo128f(a) (_mm_castsi128_ps(a))
+#define btCastdTo128f(a) (_mm_castpd_ps(a))
+#define btCastdTo128i(a) (_mm_castpd_si128(a))
+#define btAssign128(r0,r1,r2,r3) _mm_setr_ps(r0,r1,r2,r3)
+
+#else//_WIN32
+
+#define btCastfTo128i(a) ((__m128i)(a))
+#define btCastfTo128d(a) ((__m128d)(a))
+#define btCastiTo128f(a)  ((__m128) (a))
+#define btCastdTo128f(a) ((__m128) (a))
+#define btCastdTo128i(a) ((__m128i)(a))
+#define btAssign128(r0,r1,r2,r3) (__m128){r0,r1,r2,r3}
+#define BT_INFINITY INFINITY
+#define BT_NAN NAN
+#endif//_WIN32
+#endif //BT_USE_SSE_IN_API
+
+#ifdef BT_USE_NEON
+#include <arm_neon.h>
+
+typedef float32x4_t btSimdFloat4;
+#define BT_INFINITY INFINITY
+#define BT_NAN NAN
+#define btAssign128(r0,r1,r2,r3) (float32x4_t){r0,r1,r2,r3}
+#endif
+
+
+
 
 
 #define BT_DECLARE_ALIGNED_ALLOCATOR() \
diff --git a/src/LinearMath/btTransform.h b/src/LinearMath/btTransform.h
index 5e52d183a..907627379 100644
--- a/src/LinearMath/btTransform.h
+++ b/src/LinearMath/btTransform.h
@@ -31,7 +31,7 @@ subject to the following restrictions:
 
 /**@brief The btTransform class supports rigid transforms with only translation and rotation and no scaling/shear.
  *It can be used in combination with btVector3, btQuaternion and btMatrix3x3 linear algebra classes. */
-class btTransform {
+ATTRIBUTE_ALIGNED16(class) btTransform {
 	
   ///Storage for the rotation
 	btMatrix3x3 m_basis;
@@ -93,9 +93,7 @@ public:
 /**@brief Return the transform of the vector */
 	SIMD_FORCE_INLINE btVector3 operator()(const btVector3& x) const
 	{
-		return btVector3(m_basis[0].dot(x) + m_origin.x(), 
-			m_basis[1].dot(x) + m_origin.y(), 
-			m_basis[2].dot(x) + m_origin.z());
+        return x.dot3(m_basis[0], m_basis[1], m_basis[2]) + m_origin;
 	}
 
   /**@brief Return the transform of the vector */
diff --git a/src/LinearMath/btVector3.cpp b/src/LinearMath/btVector3.cpp
new file mode 100644
index 000000000..260614c16
--- /dev/null
+++ b/src/LinearMath/btVector3.cpp
@@ -0,0 +1,1631 @@
+/*
+ Copyright (c) 2011 Apple Inc.
+ http://continuousphysics.com/Bullet/
+ 
+ This software is provided 'as-is', without any express or implied warranty.
+ In no event will the authors be held liable for any damages arising from the use of this software.
+ Permission is granted to anyone to use this software for any purpose, 
+ including commercial applications, and to alter it and redistribute it freely, 
+ subject to the following restrictions:
+ 
+ 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+ 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+ 3. This notice may not be removed or altered from any source distribution.
+ 
+ This source version has been altered.
+ */
+
+#if defined (_WIN32) || defined (__i386__)
+#define BT_USE_SSE_IN_API
+#endif
+
+#include "btVector3.h"
+
+#if defined (BT_USE_SSE) || defined (BT_USE_NEON)
+
+#ifdef __APPLE__
+#include <stdint.h>
+typedef  float float4 __attribute__ ((vector_size(16)));
+#else
+#define float4 __m128
+#endif
+//typedef  uint32_t uint4 __attribute__ ((vector_size(16)));
+
+
+#if defined BT_USE_SSE || defined _WIN32
+
+#define LOG2_ARRAY_SIZE     6
+#define STACK_ARRAY_COUNT   (1UL << LOG2_ARRAY_SIZE)
+
+#include <emmintrin.h>
+
+long _maxdot_large( const float *vv, const float *vec, unsigned long count, float *dotResult );
+long _maxdot_large( const float *vv, const float *vec, unsigned long count, float *dotResult )
+{
+    const float4 *vertices = (const float4*) vv;
+    static const unsigned char indexTable[16] = {-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 };
+    float4 dotMax = btAssign128( -BT_INFINITY,  -BT_INFINITY,  -BT_INFINITY,  -BT_INFINITY );
+    float4 vvec = _mm_loadu_ps( vec );
+    float4 vHi = btCastiTo128f(_mm_shuffle_epi32( btCastfTo128i( vvec), 0xaa ));          /// zzzz
+    float4 vLo = _mm_movelh_ps( vvec, vvec );                               /// xyxy
+    
+    long maxIndex = -1L;
+    
+    size_t segment = 0;
+    float4 stack_array[ STACK_ARRAY_COUNT ];
+    
+#if DEBUG
+    memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
+#endif
+    
+    size_t index;
+    float4 max;
+    // Faster loop without cleanup code for full tiles
+    for ( segment = 0; segment + STACK_ARRAY_COUNT*4 <= count; segment += STACK_ARRAY_COUNT*4 ) 
+    {
+        max = dotMax;
+        
+        for( index = 0; index < STACK_ARRAY_COUNT; index+= 4 )   
+        { // do four dot products at a time. Carefully avoid touching the w element.
+            float4 v0 = vertices[0];
+            float4 v1 = vertices[1];
+            float4 v2 = vertices[2];
+            float4 v3 = vertices[3];            vertices += 4;
+            
+            float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+            float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+            float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+            float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+            
+            lo0 = lo0*vLo;
+            lo1 = lo1*vLo;
+            float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
+            float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
+            float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+            z = z*vHi;
+            x = x+y;
+            x = x+z;
+            stack_array[index] = x;
+            max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
+            
+            v0 = vertices[0];
+            v1 = vertices[1];
+            v2 = vertices[2];
+            v3 = vertices[3];            vertices += 4;
+            
+            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+            
+            lo0 = lo0*vLo;
+            lo1 = lo1*vLo;
+            z = _mm_shuffle_ps(hi0, hi1, 0x88);
+            x = _mm_shuffle_ps(lo0, lo1, 0x88);
+            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+            z = z*vHi;
+            x = x+y;
+            x = x+z;
+            stack_array[index+1] = x;
+            max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
+            
+            v0 = vertices[0];
+            v1 = vertices[1];
+            v2 = vertices[2];
+            v3 = vertices[3];            vertices += 4;
+            
+            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+            
+            lo0 = lo0*vLo;
+            lo1 = lo1*vLo;
+            z = _mm_shuffle_ps(hi0, hi1, 0x88);
+            x = _mm_shuffle_ps(lo0, lo1, 0x88);
+            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+            z = z*vHi;
+            x = x+y;
+            x = x+z;
+            stack_array[index+2] = x;
+            max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
+            
+            v0 = vertices[0];
+            v1 = vertices[1];
+            v2 = vertices[2];
+            v3 = vertices[3];            vertices += 4;
+            
+            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+            
+            lo0 = lo0*vLo;
+            lo1 = lo1*vLo;
+            z = _mm_shuffle_ps(hi0, hi1, 0x88);
+            x = _mm_shuffle_ps(lo0, lo1, 0x88);
+            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+            z = z*vHi;
+            x = x+y;
+            x = x+z;
+            stack_array[index+3] = x;
+            max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
+            
+            // It is too costly to keep the index of the max here. We will look for it again later.  We save a lot of work this way.
+        }
+        
+        // If we found a new max
+        if( 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(max, dotMax)))
+        { 
+            // copy the new max across all lanes of our max accumulator
+            max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0x4e));
+            max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0xb1));
+            
+            dotMax = max;
+            
+            // find first occurrence of that max  
+            size_t test;
+            for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], max))); index++ )   // local_count must be a multiple of 4
+            {}
+            // record where it is.
+            maxIndex = 4*index + segment + indexTable[test];
+        }
+    }
+    
+    // account for work we've already done
+    count -= segment;
+    
+    // Deal with the last < STACK_ARRAY_COUNT vectors
+    max = dotMax;
+    index = 0;
+    
+    
+    if( btUnlikely( count > 16) )
+    {
+        for( ; index + 4 <= count / 4; index+=4 )   
+        { // do four dot products at a time. Carefully avoid touching the w element.
+            float4 v0 = vertices[0];
+            float4 v1 = vertices[1];
+            float4 v2 = vertices[2];
+            float4 v3 = vertices[3];            vertices += 4;
+            
+            float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+            float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+            float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+            float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+            
+            lo0 = lo0*vLo;
+            lo1 = lo1*vLo;
+            float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
+            float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
+            float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+            z = z*vHi;
+            x = x+y;
+            x = x+z;
+            stack_array[index] = x;
+            max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
+            
+            v0 = vertices[0];
+            v1 = vertices[1];
+            v2 = vertices[2];
+            v3 = vertices[3];            vertices += 4;
+            
+            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+            
+            lo0 = lo0*vLo;
+            lo1 = lo1*vLo;
+            z = _mm_shuffle_ps(hi0, hi1, 0x88);
+            x = _mm_shuffle_ps(lo0, lo1, 0x88);
+            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+            z = z*vHi;
+            x = x+y;
+            x = x+z;
+            stack_array[index+1] = x;
+            max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
+            
+            v0 = vertices[0];
+            v1 = vertices[1];
+            v2 = vertices[2];
+            v3 = vertices[3];            vertices += 4;
+            
+            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+            
+            lo0 = lo0*vLo;
+            lo1 = lo1*vLo;
+            z = _mm_shuffle_ps(hi0, hi1, 0x88);
+            x = _mm_shuffle_ps(lo0, lo1, 0x88);
+            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+            z = z*vHi;
+            x = x+y;
+            x = x+z;
+            stack_array[index+2] = x;
+            max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
+            
+            v0 = vertices[0];
+            v1 = vertices[1];
+            v2 = vertices[2];
+            v3 = vertices[3];            vertices += 4;
+            
+            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+            
+            lo0 = lo0*vLo;
+            lo1 = lo1*vLo;
+            z = _mm_shuffle_ps(hi0, hi1, 0x88);
+            x = _mm_shuffle_ps(lo0, lo1, 0x88);
+            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+            z = z*vHi;
+            x = x+y;
+            x = x+z;
+            stack_array[index+3] = x;
+            max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
+            
+            // It is too costly to keep the index of the max here. We will look for it again later.  We save a lot of work this way.
+        }
+    }
+    
+    size_t localCount = (count & -4L) - 4*index;
+    if( localCount )
+    {
+#ifdef __APPLE__
+        float4 t0, t1, t2, t3, t4;
+        float4 * sap = &stack_array[index + localCount / 4];
+          vertices += localCount;      // counter the offset
+         size_t byteIndex = -(localCount) * sizeof(float);
+        //AT&T Code style assembly
+        asm volatile
+        (   ".align 4                                                                   \n\
+             0: movaps  %[max], %[t2]                            // move max out of the way to avoid propagating NaNs in max \n\
+          movaps  (%[vertices], %[byteIndex], 4),    %[t0]    // vertices[0]      \n\
+          movaps  16(%[vertices], %[byteIndex], 4),  %[t1]    // vertices[1]      \n\
+          movaps  %[t0], %[max]                               // vertices[0]      \n\
+          movlhps %[t1], %[max]                               // x0y0x1y1         \n\
+         movaps  32(%[vertices], %[byteIndex], 4),  %[t3]    // vertices[2]      \n\
+         movaps  48(%[vertices], %[byteIndex], 4),  %[t4]    // vertices[3]      \n\
+          mulps   %[vLo], %[max]                              // x0y0x1y1 * vLo   \n\
+         movhlps %[t0], %[t1]                                // z0w0z1w1         \n\
+         movaps  %[t3], %[t0]                                // vertices[2]      \n\
+         movlhps %[t4], %[t0]                                // x2y2x3y3         \n\
+         mulps   %[vLo], %[t0]                               // x2y2x3y3 * vLo   \n\
+          movhlps %[t3], %[t4]                                // z2w2z3w3         \n\
+          shufps  $0x88, %[t4], %[t1]                         // z0z1z2z3         \n\
+          mulps   %[vHi], %[t1]                               // z0z1z2z3 * vHi   \n\
+         movaps  %[max], %[t3]                               // x0y0x1y1 * vLo   \n\
+         shufps  $0x88, %[t0], %[max]                        // x0x1x2x3 * vLo.x \n\
+         shufps  $0xdd, %[t0], %[t3]                         // y0y1y2y3 * vLo.y \n\
+         addps   %[t3], %[max]                               // x + y            \n\
+         addps   %[t1], %[max]                               // x + y + z        \n\
+         movaps  %[max], (%[sap], %[byteIndex])              // record result for later scrutiny \n\
+         maxps   %[t2], %[max]                               // record max, restore max   \n\
+         add     $16, %[byteIndex]                           // advance loop counter\n\
+         jnz     0b                                          \n\
+     "
+         : [max] "+x" (max), [t0] "=&x" (t0), [t1] "=&x" (t1), [t2] "=&x" (t2), [t3] "=&x" (t3), [t4] "=&x" (t4), [byteIndex] "+r" (byteIndex)
+         : [vLo] "x" (vLo), [vHi] "x" (vHi), [vertices] "r" (vertices), [sap] "r" (sap)
+         : "memory", "cc"
+         );
+        index += localCount/4;
+#else
+        {
+            for( int i=0; i<localCount/4; i++,index++)   
+            { // do four dot products at a time. Carefully avoid touching the w element.
+                float4 v0 = vertices[0];
+                float4 v1 = vertices[1];
+                float4 v2 = vertices[2];
+                float4 v3 = vertices[3];            
+                vertices += 4;
+                
+                float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+                float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+                float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+                float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+                
+                lo0 = lo0*vLo;
+                lo1 = lo1*vLo;
+                float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
+                float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
+                float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+                z = z*vHi;
+                x = x+y;
+                x = x+z;
+                stack_array[index] = x;
+                max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
+            }
+        }
+#endif //__APPLE__
+    }
+
+    // process the last few points
+    if( count & 3 )
+    {
+        float4 v0, v1, v2, x, y, z;
+        switch( count & 3 )
+        {
+            case 3:
+            {
+                v0 = vertices[0];
+                v1 = vertices[1];
+                v2 = vertices[2];
+                
+                // Calculate 3 dot products, transpose, duplicate v2
+                float4 lo0 = _mm_movelh_ps( v0, v1);        // xyxy.lo
+                float4 hi0 = _mm_movehl_ps( v1, v0);        // z?z?.lo
+                lo0 = lo0*vLo;
+                z = _mm_shuffle_ps(hi0, v2,  0xa8 );           // z0z1z2z2
+                z = z*vHi;
+                float4 lo1 = _mm_movelh_ps(v2, v2);          // xyxy
+                lo1 = lo1*vLo;
+                x = _mm_shuffle_ps(lo0, lo1, 0x88);
+                y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+            }
+                break;
+            case 2:
+            {
+                v0 = vertices[0];
+                v1 = vertices[1];
+                float4 xy = _mm_movelh_ps(v0, v1);
+                z = _mm_movehl_ps(v1, v0);
+                xy = xy*vLo;
+                z = _mm_shuffle_ps( z, z,  0xa8);
+                x = _mm_shuffle_ps( xy, xy, 0xa8);
+                y = _mm_shuffle_ps( xy, xy, 0xfd);
+                z = z*vHi;
+            }
+                break;
+            case 1:
+            {
+                float4 xy = vertices[0];
+                z =  _mm_shuffle_ps( xy, xy, 0xaa);
+                xy = xy*vLo;
+                z = z*vHi;
+                x = _mm_shuffle_ps(xy, xy, 0);
+                y = _mm_shuffle_ps(xy, xy, 0x55);
+            }
+                break;
+        }
+        x = x+y;
+        x = x+z;
+        stack_array[index] = x;
+        max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
+        index++;
+    }
+    
+    // if we found a new max. 
+    if( 0 == segment || 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(max, dotMax)))
+    { // we found a new max. Search for it
+      // find max across the max vector, place in all elements of max -- big latency hit here
+        max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0x4e));
+        max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0xb1));
+        
+        // It is slightly faster to do this part in scalar code when count < 8. However, the common case for
+        // this where it actually makes a difference is handled in the early out at the top of the function, 
+        // so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced 
+        // complexity, and removed it.
+        
+        dotMax = max;
+        
+        // scan for the first occurence of max in the array  
+        size_t test;
+        for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], max))); index++ )   // local_count must be a multiple of 4
+        {}
+        maxIndex = 4*index + segment + indexTable[test];
+    }
+    
+    _mm_store_ss( dotResult, dotMax);
+    return maxIndex;
+}
+
+long _mindot_large( const float *vv, const float *vec, unsigned long count, float *dotResult );
+
+long _mindot_large( const float *vv, const float *vec, unsigned long count, float *dotResult )
+{
+    const float4 *vertices = (const float4*) vv;
+    static const unsigned char indexTable[16] = {-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 };
+    float4 dotmin = btAssign128( BT_INFINITY,  BT_INFINITY,  BT_INFINITY,  BT_INFINITY );
+    float4 vvec = _mm_loadu_ps( vec );
+    float4 vHi = btCastiTo128f(_mm_shuffle_epi32( btCastfTo128i( vvec), 0xaa ));          /// zzzz
+    float4 vLo = _mm_movelh_ps( vvec, vvec );                               /// xyxy
+    
+    long minIndex = -1L;
+
+    size_t segment = 0;
+    float4 stack_array[ STACK_ARRAY_COUNT ];
+    
+#if DEBUG
+    memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
+#endif
+    
+    size_t index;
+    float4 min;
+    // Faster loop without cleanup code for full tiles
+    for ( segment = 0; segment + STACK_ARRAY_COUNT*4 <= count; segment += STACK_ARRAY_COUNT*4 ) 
+    {
+        min = dotmin;
+        
+        for( index = 0; index < STACK_ARRAY_COUNT; index+= 4 )   
+        { // do four dot products at a time. Carefully avoid touching the w element.
+            float4 v0 = vertices[0];
+            float4 v1 = vertices[1];
+            float4 v2 = vertices[2];
+            float4 v3 = vertices[3];            vertices += 4;
+            
+            float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+            float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+            float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+            float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+            
+            lo0 = lo0*vLo;
+            lo1 = lo1*vLo;
+            float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
+            float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
+            float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+            z = z*vHi;
+            x = x+y;
+            x = x+z;
+            stack_array[index] = x;
+            min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
+            
+            v0 = vertices[0];
+            v1 = vertices[1];
+            v2 = vertices[2];
+            v3 = vertices[3];            vertices += 4;
+            
+            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+            
+            lo0 = lo0*vLo;
+            lo1 = lo1*vLo;
+            z = _mm_shuffle_ps(hi0, hi1, 0x88);
+            x = _mm_shuffle_ps(lo0, lo1, 0x88);
+            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+            z = z*vHi;
+            x = x+y;
+            x = x+z;
+            stack_array[index+1] = x;
+            min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
+            
+            v0 = vertices[0];
+            v1 = vertices[1];
+            v2 = vertices[2];
+            v3 = vertices[3];            vertices += 4;
+            
+            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+            
+            lo0 = lo0*vLo;
+            lo1 = lo1*vLo;
+            z = _mm_shuffle_ps(hi0, hi1, 0x88);
+            x = _mm_shuffle_ps(lo0, lo1, 0x88);
+            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+            z = z*vHi;
+            x = x+y;
+            x = x+z;
+            stack_array[index+2] = x;
+            min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
+            
+            v0 = vertices[0];
+            v1 = vertices[1];
+            v2 = vertices[2];
+            v3 = vertices[3];            vertices += 4;
+            
+            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+            
+            lo0 = lo0*vLo;
+            lo1 = lo1*vLo;
+            z = _mm_shuffle_ps(hi0, hi1, 0x88);
+            x = _mm_shuffle_ps(lo0, lo1, 0x88);
+            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+            z = z*vHi;
+            x = x+y;
+            x = x+z;
+            stack_array[index+3] = x;
+            min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
+            
+            // It is too costly to keep the index of the min here. We will look for it again later.  We save a lot of work this way.
+        }
+        
+        // If we found a new min
+        if( 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(min, dotmin)))
+        { 
+            // copy the new min across all lanes of our min accumulator
+            min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0x4e));
+            min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0xb1));
+            
+            dotmin = min;
+            
+            // find first occurrence of that min  
+            size_t test;
+            for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], min))); index++ )   // local_count must be a multiple of 4
+            {}
+            // record where it is.
+            minIndex = 4*index + segment + indexTable[test];
+        }
+    }
+    
+    // account for work we've already done
+    count -= segment;
+    
+    // Deal with the last < STACK_ARRAY_COUNT vectors
+    min = dotmin;
+    index = 0;
+    
+    
+    if(btUnlikely( count > 16) )
+    {
+        for( ; index + 4 <= count / 4; index+=4 )   
+        { // do four dot products at a time. Carefully avoid touching the w element.
+            float4 v0 = vertices[0];
+            float4 v1 = vertices[1];
+            float4 v2 = vertices[2];
+            float4 v3 = vertices[3];            vertices += 4;
+            
+            float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+            float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+            float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+            float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+            
+            lo0 = lo0*vLo;
+            lo1 = lo1*vLo;
+            float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
+            float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
+            float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+            z = z*vHi;
+            x = x+y;
+            x = x+z;
+            stack_array[index] = x;
+            min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
+            
+            v0 = vertices[0];
+            v1 = vertices[1];
+            v2 = vertices[2];
+            v3 = vertices[3];            vertices += 4;
+            
+            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+            
+            lo0 = lo0*vLo;
+            lo1 = lo1*vLo;
+            z = _mm_shuffle_ps(hi0, hi1, 0x88);
+            x = _mm_shuffle_ps(lo0, lo1, 0x88);
+            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+            z = z*vHi;
+            x = x+y;
+            x = x+z;
+            stack_array[index+1] = x;
+            min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
+            
+            v0 = vertices[0];
+            v1 = vertices[1];
+            v2 = vertices[2];
+            v3 = vertices[3];            vertices += 4;
+            
+            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+            
+            lo0 = lo0*vLo;
+            lo1 = lo1*vLo;
+            z = _mm_shuffle_ps(hi0, hi1, 0x88);
+            x = _mm_shuffle_ps(lo0, lo1, 0x88);
+            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+            z = z*vHi;
+            x = x+y;
+            x = x+z;
+            stack_array[index+2] = x;
+            min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
+            
+            v0 = vertices[0];
+            v1 = vertices[1];
+            v2 = vertices[2];
+            v3 = vertices[3];            vertices += 4;
+            
+            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+            
+            lo0 = lo0*vLo;
+            lo1 = lo1*vLo;
+            z = _mm_shuffle_ps(hi0, hi1, 0x88);
+            x = _mm_shuffle_ps(lo0, lo1, 0x88);
+            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+            z = z*vHi;
+            x = x+y;
+            x = x+z;
+            stack_array[index+3] = x;
+            min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
+            
+            // It is too costly to keep the index of the min here. We will look for it again later.  We save a lot of work this way.
+        }
+    }
+    
+    size_t localCount = (count & -4L) - 4*index;
+    if( localCount )
+    {
+        
+        
+#ifdef __APPLE__
+        vertices += localCount;      // counter the offset
+        float4 t0, t1, t2, t3, t4;
+        size_t byteIndex = -(localCount) * sizeof(float);
+        float4 * sap = &stack_array[index + localCount / 4];
+        
+        asm volatile
+        (   ".align 4                                                                   \n\
+             0: movaps  %[min], %[t2]                            // move min out of the way to avoid propagating NaNs in min \n\
+             movaps  (%[vertices], %[byteIndex], 4),    %[t0]    // vertices[0]      \n\
+             movaps  16(%[vertices], %[byteIndex], 4),  %[t1]    // vertices[1]      \n\
+             movaps  %[t0], %[min]                               // vertices[0]      \n\
+             movlhps %[t1], %[min]                               // x0y0x1y1         \n\
+             movaps  32(%[vertices], %[byteIndex], 4),  %[t3]    // vertices[2]      \n\
+             movaps  48(%[vertices], %[byteIndex], 4),  %[t4]    // vertices[3]      \n\
+             mulps   %[vLo], %[min]                              // x0y0x1y1 * vLo   \n\
+             movhlps %[t0], %[t1]                                // z0w0z1w1         \n\
+             movaps  %[t3], %[t0]                                // vertices[2]      \n\
+             movlhps %[t4], %[t0]                                // x2y2x3y3         \n\
+             movhlps %[t3], %[t4]                                // z2w2z3w3         \n\
+             mulps   %[vLo], %[t0]                               // x2y2x3y3 * vLo   \n\
+             shufps  $0x88, %[t4], %[t1]                         // z0z1z2z3         \n\
+             mulps   %[vHi], %[t1]                               // z0z1z2z3 * vHi   \n\
+             movaps  %[min], %[t3]                               // x0y0x1y1 * vLo   \n\
+             shufps  $0x88, %[t0], %[min]                        // x0x1x2x3 * vLo.x \n\
+             shufps  $0xdd, %[t0], %[t3]                         // y0y1y2y3 * vLo.y \n\
+             addps   %[t3], %[min]                               // x + y            \n\
+             addps   %[t1], %[min]                               // x + y + z        \n\
+             movaps  %[min], (%[sap], %[byteIndex])              // record result for later scrutiny \n\
+             minps   %[t2], %[min]                               // record min, restore min   \n\
+             add     $16, %[byteIndex]                           // advance loop counter\n\
+             jnz     0b                                          \n\
+             "
+         : [min] "+x" (min), [t0] "=&x" (t0), [t1] "=&x" (t1), [t2] "=&x" (t2), [t3] "=&x" (t3), [t4] "=&x" (t4), [byteIndex] "+r" (byteIndex)
+         : [vLo] "x" (vLo), [vHi] "x" (vHi), [vertices] "r" (vertices), [sap] "r" (sap)
+         : "memory", "cc"
+         );
+        index += localCount/4;
+#else
+        {
+            for( int i=0; i<localCount/4; i++,index++)   
+            { // do four dot products at a time. Carefully avoid touching the w element.
+                float4 v0 = vertices[0];
+                float4 v1 = vertices[1];
+                float4 v2 = vertices[2];
+                float4 v3 = vertices[3];            
+                vertices += 4;
+                
+                float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
+                float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
+                float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
+                float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
+                
+                lo0 = lo0*vLo;
+                lo1 = lo1*vLo;
+                float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
+                float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
+                float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+                z = z*vHi;
+                x = x+y;
+                x = x+z;
+                stack_array[index] = x;
+                min = _mm_min_ps( x, min );         // control the order here so that max is never NaN even if x is nan
+            }
+        }
+
+#endif
+    }
+    
+    // process the last few points
+    if( count & 3 )
+    {
+        float4 v0, v1, v2, x, y, z;
+        switch( count & 3 )
+        {
+            case 3:
+            {
+                v0 = vertices[0];
+                v1 = vertices[1];
+                v2 = vertices[2];
+                
+                // Calculate 3 dot products, transpose, duplicate v2
+                float4 lo0 = _mm_movelh_ps( v0, v1);        // xyxy.lo
+                float4 hi0 = _mm_movehl_ps( v1, v0);        // z?z?.lo
+                lo0 = lo0*vLo;
+                z = _mm_shuffle_ps(hi0, v2,  0xa8 );           // z0z1z2z2
+                z = z*vHi;
+                float4 lo1 = _mm_movelh_ps(v2, v2);          // xyxy
+                lo1 = lo1*vLo;
+                x = _mm_shuffle_ps(lo0, lo1, 0x88);
+                y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+            }
+                break;
+            case 2:
+            {
+                v0 = vertices[0];
+                v1 = vertices[1];
+                float4 xy = _mm_movelh_ps(v0, v1);
+                z = _mm_movehl_ps(v1, v0);
+                xy = xy*vLo;
+                z = _mm_shuffle_ps( z, z,  0xa8);
+                x = _mm_shuffle_ps( xy, xy, 0xa8);
+                y = _mm_shuffle_ps( xy, xy, 0xfd);
+                z = z*vHi;
+            }
+                break;
+            case 1:
+            {
+                float4 xy = vertices[0];
+                z =  _mm_shuffle_ps( xy, xy, 0xaa);
+                xy = xy*vLo;
+                z = z*vHi;
+                x = _mm_shuffle_ps(xy, xy, 0);
+                y = _mm_shuffle_ps(xy, xy, 0x55);
+            }
+                break;
+        }
+        x = x+y;
+        x = x+z;
+        stack_array[index] = x;
+        min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
+        index++;
+    }
+    
+    // if we found a new min. 
+    if( 0 == segment || 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(min, dotmin)))
+    { // we found a new min. Search for it
+      // find min across the min vector, place in all elements of min -- big latency hit here
+        min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0x4e));
+        min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0xb1));
+        
+        // It is slightly faster to do this part in scalar code when count < 8. However, the common case for
+        // this where it actually makes a difference is handled in the early out at the top of the function, 
+        // so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced 
+        // complexity, and removed it.
+        
+        dotmin = min;
+        
+        // scan for the first occurence of min in the array  
+        size_t test;
+        for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], min))); index++ )   // local_count must be a multiple of 4
+        {}
+        minIndex = 4*index + segment + indexTable[test];
+    }
+    
+    _mm_store_ss( dotResult, dotmin);
+    return minIndex;
+}
+
+
+#elif defined BT_USE_NEON
+#define ARM_NEON_GCC_COMPATIBILITY  1
+#include <arm_neon.h>
+
+
+static long _maxdot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult );
+static long _maxdot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult );
+static long _maxdot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult );
+static long _mindot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult );
+static long _mindot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult );
+static long _mindot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult );
+
+long (*_maxdot_large)( const float *vv, const float *vec, unsigned long count, float *dotResult ) = _maxdot_large_sel;
+long (*_mindot_large)( const float *vv, const float *vec, unsigned long count, float *dotResult ) = _mindot_large_sel;
+
+extern "C" {int  _get_cpu_capabilities( void );}
+
+static long _maxdot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult )
+{
+    if( _get_cpu_capabilities() & 0x2000 )
+        _maxdot_large = _maxdot_large_v1;
+    else
+        _maxdot_large = _maxdot_large_v0;
+    
+    return _maxdot_large(vv, vec, count, dotResult);
+}
+
+static long _mindot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult )
+{
+    if( _get_cpu_capabilities() & 0x2000 )
+        _mindot_large = _mindot_large_v1;
+    else
+        _mindot_large = _mindot_large_v0;
+    
+    return _mindot_large(vv, vec, count, dotResult);
+}
+
+
+
+#define vld1q_f32_aligned_postincrement( _ptr ) ({ float32x4_t _r; asm( "vld1.f32  {%0}, [%1, :128]!\n" : "=w" (_r), "+r" (_ptr) ); /*return*/ _r; })
+
+
+long _maxdot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult )
+{
+    unsigned long i = 0;
+    float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
+    float32x2_t vLo = vget_low_f32(vvec);
+    float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
+    float32x2_t dotMaxLo = (float32x2_t) { -BT_INFINITY, -BT_INFINITY };
+    float32x2_t dotMaxHi = (float32x2_t) { -BT_INFINITY, -BT_INFINITY };
+    uint32x2_t indexLo = (uint32x2_t) {0, 1};
+    uint32x2_t indexHi = (uint32x2_t) {2, 3};
+    uint32x2_t iLo = (uint32x2_t) {-1, -1};
+    uint32x2_t iHi = (uint32x2_t) {-1, -1};
+    const uint32x2_t four = (uint32x2_t) {4,4};
+
+    for( ; i+8 <= count; i+= 8 )
+    {
+        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
+        
+        float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
+        float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
+        float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
+        float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
+        
+        float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
+        float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
+        float32x2_t zLo = vmul_f32( z0.val[0], vHi);
+        float32x2_t zHi = vmul_f32( z1.val[0], vHi);
+        
+        float32x2_t rLo = vpadd_f32( xy0, xy1);
+        float32x2_t rHi = vpadd_f32( xy2, xy3);
+        rLo = vadd_f32(rLo, zLo);
+        rHi = vadd_f32(rHi, zHi);
+        
+        uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
+        uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
+        dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
+        dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
+        iLo = vbsl_u32(maskLo, indexLo, iLo);
+        iHi = vbsl_u32(maskHi, indexHi, iHi);
+        indexLo = vadd_u32(indexLo, four); 
+        indexHi = vadd_u32(indexHi, four);
+
+        v0 = vld1q_f32_aligned_postincrement( vv );
+        v1 = vld1q_f32_aligned_postincrement( vv );
+        v2 = vld1q_f32_aligned_postincrement( vv );
+        v3 = vld1q_f32_aligned_postincrement( vv );
+        
+        xy0 = vmul_f32( vget_low_f32(v0), vLo);
+        xy1 = vmul_f32( vget_low_f32(v1), vLo);
+        xy2 = vmul_f32( vget_low_f32(v2), vLo);
+        xy3 = vmul_f32( vget_low_f32(v3), vLo);
+        
+        z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
+        z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
+        zLo = vmul_f32( z0.val[0], vHi);
+        zHi = vmul_f32( z1.val[0], vHi);
+        
+        rLo = vpadd_f32( xy0, xy1);
+        rHi = vpadd_f32( xy2, xy3);
+        rLo = vadd_f32(rLo, zLo);
+        rHi = vadd_f32(rHi, zHi);
+        
+        maskLo = vcgt_f32( rLo, dotMaxLo );
+        maskHi = vcgt_f32( rHi, dotMaxHi );
+        dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
+        dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
+        iLo = vbsl_u32(maskLo, indexLo, iLo);
+        iHi = vbsl_u32(maskHi, indexHi, iHi);
+        indexLo = vadd_u32(indexLo, four);
+        indexHi = vadd_u32(indexHi, four);
+    }
+
+    for( ; i+4 <= count; i+= 4 )
+    {
+        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
+        
+        float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
+        float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
+        float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
+        float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
+        
+        float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
+        float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
+        float32x2_t zLo = vmul_f32( z0.val[0], vHi);
+        float32x2_t zHi = vmul_f32( z1.val[0], vHi);
+        
+        float32x2_t rLo = vpadd_f32( xy0, xy1);
+        float32x2_t rHi = vpadd_f32( xy2, xy3);
+        rLo = vadd_f32(rLo, zLo);
+        rHi = vadd_f32(rHi, zHi);
+        
+        uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
+        uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
+        dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
+        dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
+        iLo = vbsl_u32(maskLo, indexLo, iLo);
+        iHi = vbsl_u32(maskHi, indexHi, iHi);
+        indexLo = vadd_u32(indexLo, four);
+        indexHi = vadd_u32(indexHi, four);
+    }
+    
+    switch( count & 3 )
+    {
+        case 3:
+        {
+            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
+            float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
+            
+            float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
+            float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
+            float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
+            
+            float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
+            float32x2_t zLo = vmul_f32( z0.val[0], vHi);
+            float32x2_t zHi = vmul_f32( vdup_lane_f32(vget_high_f32(v2), 0), vHi);
+            
+            float32x2_t rLo = vpadd_f32( xy0, xy1);
+            float32x2_t rHi = vpadd_f32( xy2, xy2);
+            rLo = vadd_f32(rLo, zLo);
+            rHi = vadd_f32(rHi, zHi);
+            
+            uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
+            uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
+            dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
+            dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
+            iLo = vbsl_u32(maskLo, indexLo, iLo);
+            iHi = vbsl_u32(maskHi, indexHi, iHi);
+        }
+            break;
+        case 2:
+        {
+            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
+            
+            float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
+            float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
+            
+            float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
+            float32x2_t zLo = vmul_f32( z0.val[0], vHi);
+            
+            float32x2_t rLo = vpadd_f32( xy0, xy1);
+            rLo = vadd_f32(rLo, zLo);
+            
+            uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
+            dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
+            iLo = vbsl_u32(maskLo, indexLo, iLo);
+        }
+            break;
+        case 1:
+        {
+            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+            float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
+            float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
+            float32x2_t zLo = vmul_f32( z0, vHi);
+            float32x2_t rLo = vpadd_f32( xy0, xy0);
+            rLo = vadd_f32(rLo, zLo);
+            uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
+            dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
+            iLo = vbsl_u32(maskLo, indexLo, iLo);
+        }
+            break;
+        
+        default:
+            break;
+    }
+    
+    // select best answer between hi and lo results
+    uint32x2_t mask = vcgt_f32( dotMaxHi, dotMaxLo );
+    dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
+    iLo = vbsl_u32(mask, iHi, iLo);
+    
+    // select best answer between even and odd results
+    dotMaxHi = vdup_lane_f32(dotMaxLo, 1);
+    iHi = vdup_lane_u32(iLo, 1);
+    mask = vcgt_f32( dotMaxHi, dotMaxLo );
+    dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
+    iLo = vbsl_u32(mask, iHi, iLo);
+    
+    *dotResult = vget_lane_f32( dotMaxLo, 0);
+    return vget_lane_u32(iLo, 0);
+}
+
+
+long _maxdot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult )
+{
+    float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
+    float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
+    float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
+    const uint32x4_t four = (uint32x4_t){ 4, 4, 4, 4 };
+    uint32x4_t local_index = (uint32x4_t) {0, 1, 2, 3};
+    uint32x4_t index = (uint32x4_t) { -1, -1, -1, -1 };
+    float32x4_t maxDot = (float32x4_t) { -BT_INFINITY, -BT_INFINITY, -BT_INFINITY, -BT_INFINITY };
+    
+    unsigned long i = 0;
+    for( ; i + 8 <= count; i += 8 )
+    {
+        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
+        
+        // the next two lines should resolve to a single vswp d, d
+        float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
+        float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
+        // the next two lines should resolve to a single vswp d, d
+        float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
+        float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
+        
+        xy0 = vmulq_f32(xy0, vLo);
+        xy1 = vmulq_f32(xy1, vLo);
+        
+        float32x4x2_t zb = vuzpq_f32( z0, z1);
+        float32x4_t z = vmulq_f32( zb.val[0], vHi);
+        float32x4x2_t xy = vuzpq_f32( xy0, xy1);
+        float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+        x = vaddq_f32(x, z);
+        
+        uint32x4_t mask = vcgtq_f32(x, maxDot);
+        maxDot = vbslq_f32( mask, x, maxDot);
+        index = vbslq_u32(mask, local_index, index);
+        local_index = vaddq_u32(local_index, four);
+
+        v0 = vld1q_f32_aligned_postincrement( vv );
+        v1 = vld1q_f32_aligned_postincrement( vv );
+        v2 = vld1q_f32_aligned_postincrement( vv );
+        v3 = vld1q_f32_aligned_postincrement( vv );
+        
+        // the next two lines should resolve to a single vswp d, d
+        xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
+        xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
+        // the next two lines should resolve to a single vswp d, d
+        z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
+        z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
+        
+        xy0 = vmulq_f32(xy0, vLo);
+        xy1 = vmulq_f32(xy1, vLo);
+        
+        zb = vuzpq_f32( z0, z1);
+        z = vmulq_f32( zb.val[0], vHi);
+        xy = vuzpq_f32( xy0, xy1);
+        x = vaddq_f32(xy.val[0], xy.val[1]);
+        x = vaddq_f32(x, z);
+        
+        mask = vcgtq_f32(x, maxDot);
+        maxDot = vbslq_f32( mask, x, maxDot);
+        index = vbslq_u32(mask, local_index, index);
+        local_index = vaddq_u32(local_index, four);
+    }
+
+    for( ; i + 4 <= count; i += 4 )
+    {
+        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
+
+        // the next two lines should resolve to a single vswp d, d
+        float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
+        float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
+        // the next two lines should resolve to a single vswp d, d
+        float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
+        float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
+        
+        xy0 = vmulq_f32(xy0, vLo);
+        xy1 = vmulq_f32(xy1, vLo);
+        
+        float32x4x2_t zb = vuzpq_f32( z0, z1);
+        float32x4_t z = vmulq_f32( zb.val[0], vHi);
+        float32x4x2_t xy = vuzpq_f32( xy0, xy1);
+        float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+        x = vaddq_f32(x, z);
+        
+        uint32x4_t mask = vcgtq_f32(x, maxDot);
+        maxDot = vbslq_f32( mask, x, maxDot);
+        index = vbslq_u32(mask, local_index, index);
+        local_index = vaddq_u32(local_index, four);
+    }
+    
+    switch (count & 3) {
+        case 3:
+        {
+            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
+            float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
+            
+            // the next two lines should resolve to a single vswp d, d
+            float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
+            float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v2));
+            // the next two lines should resolve to a single vswp d, d
+            float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
+            float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v2));
+            
+            xy0 = vmulq_f32(xy0, vLo);
+            xy1 = vmulq_f32(xy1, vLo);
+            
+            float32x4x2_t zb = vuzpq_f32( z0, z1);
+            float32x4_t z = vmulq_f32( zb.val[0], vHi);
+            float32x4x2_t xy = vuzpq_f32( xy0, xy1);
+            float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+            x = vaddq_f32(x, z);
+            
+            uint32x4_t mask = vcgtq_f32(x, maxDot);
+            maxDot = vbslq_f32( mask, x, maxDot);
+            index = vbslq_u32(mask, local_index, index);
+            local_index = vaddq_u32(local_index, four);
+        }
+            break;
+
+        case 2:
+        {
+            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
+            
+            // the next two lines should resolve to a single vswp d, d
+            float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
+            // the next two lines should resolve to a single vswp d, d
+            float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
+            
+            xy0 = vmulq_f32(xy0, vLo);
+            
+            float32x4x2_t zb = vuzpq_f32( z0, z0);
+            float32x4_t z = vmulq_f32( zb.val[0], vHi);
+            float32x4x2_t xy = vuzpq_f32( xy0, xy0);
+            float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+            x = vaddq_f32(x, z);
+            
+            uint32x4_t mask = vcgtq_f32(x, maxDot);
+            maxDot = vbslq_f32( mask, x, maxDot);
+            index = vbslq_u32(mask, local_index, index);
+            local_index = vaddq_u32(local_index, four);
+        }
+            break;
+
+        case 1:
+        {
+            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+            
+            // the next two lines should resolve to a single vswp d, d
+            float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v0));
+            // the next two lines should resolve to a single vswp d, d
+            float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0); 
+            
+            xy0 = vmulq_f32(xy0, vLo);
+            
+            z = vmulq_f32( z, vHi);
+            float32x4x2_t xy = vuzpq_f32( xy0, xy0);
+            float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+            x = vaddq_f32(x, z);
+            
+            uint32x4_t mask = vcgtq_f32(x, maxDot);
+            maxDot = vbslq_f32( mask, x, maxDot);
+            index = vbslq_u32(mask, local_index, index);
+            local_index = vaddq_u32(local_index, four);
+        }
+            break;
+
+        default:
+            break;
+    }
+    
+    
+    // select best answer between hi and lo results
+    uint32x2_t mask = vcgt_f32( vget_high_f32(maxDot), vget_low_f32(maxDot));
+    float32x2_t maxDot2 = vbsl_f32(mask, vget_high_f32(maxDot), vget_low_f32(maxDot));
+    uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));
+    
+    // select best answer between even and odd results
+    float32x2_t maxDotO = vdup_lane_f32(maxDot2, 1);
+    uint32x2_t indexHi = vdup_lane_u32(index2, 1);
+    mask = vcgt_f32( maxDotO, maxDot2 );
+    maxDot2 = vbsl_f32(mask, maxDotO, maxDot2);
+    index2 = vbsl_u32(mask, indexHi, index2);
+    
+    *dotResult = vget_lane_f32( maxDot2, 0);
+    return vget_lane_u32(index2, 0);
+    
+}
+
+long _mindot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult )
+{
+    unsigned long i = 0;
+    float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
+    float32x2_t vLo = vget_low_f32(vvec);
+    float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
+    float32x2_t dotMinLo = (float32x2_t) { BT_INFINITY, BT_INFINITY };
+    float32x2_t dotMinHi = (float32x2_t) { BT_INFINITY, BT_INFINITY };
+    uint32x2_t indexLo = (uint32x2_t) {0, 1};
+    uint32x2_t indexHi = (uint32x2_t) {2, 3};
+    uint32x2_t iLo = (uint32x2_t) {-1, -1};
+    uint32x2_t iHi = (uint32x2_t) {-1, -1};
+    const uint32x2_t four = (uint32x2_t) {4,4};
+    
+    for( ; i+8 <= count; i+= 8 )
+    {
+        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
+        
+        float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
+        float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
+        float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
+        float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
+        
+        float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
+        float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
+        float32x2_t zLo = vmul_f32( z0.val[0], vHi);
+        float32x2_t zHi = vmul_f32( z1.val[0], vHi);
+        
+        float32x2_t rLo = vpadd_f32( xy0, xy1);
+        float32x2_t rHi = vpadd_f32( xy2, xy3);
+        rLo = vadd_f32(rLo, zLo);
+        rHi = vadd_f32(rHi, zHi);
+        
+        uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
+        uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
+        dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
+        dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
+        iLo = vbsl_u32(maskLo, indexLo, iLo);
+        iHi = vbsl_u32(maskHi, indexHi, iHi);
+        indexLo = vadd_u32(indexLo, four);
+        indexHi = vadd_u32(indexHi, four);
+        
+        v0 = vld1q_f32_aligned_postincrement( vv );
+        v1 = vld1q_f32_aligned_postincrement( vv );
+        v2 = vld1q_f32_aligned_postincrement( vv );
+        v3 = vld1q_f32_aligned_postincrement( vv );
+        
+        xy0 = vmul_f32( vget_low_f32(v0), vLo);
+        xy1 = vmul_f32( vget_low_f32(v1), vLo);
+        xy2 = vmul_f32( vget_low_f32(v2), vLo);
+        xy3 = vmul_f32( vget_low_f32(v3), vLo);
+        
+        z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
+        z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
+        zLo = vmul_f32( z0.val[0], vHi);
+        zHi = vmul_f32( z1.val[0], vHi);
+        
+        rLo = vpadd_f32( xy0, xy1);
+        rHi = vpadd_f32( xy2, xy3);
+        rLo = vadd_f32(rLo, zLo);
+        rHi = vadd_f32(rHi, zHi);
+        
+        maskLo = vclt_f32( rLo, dotMinLo );
+        maskHi = vclt_f32( rHi, dotMinHi );
+        dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
+        dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
+        iLo = vbsl_u32(maskLo, indexLo, iLo);
+        iHi = vbsl_u32(maskHi, indexHi, iHi);
+        indexLo = vadd_u32(indexLo, four);
+        indexHi = vadd_u32(indexHi, four);
+    }
+
+    for( ; i+4 <= count; i+= 4 )
+    {
+        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
+        
+        float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
+        float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
+        float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
+        float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
+        
+        float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
+        float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
+        float32x2_t zLo = vmul_f32( z0.val[0], vHi);
+        float32x2_t zHi = vmul_f32( z1.val[0], vHi);
+        
+        float32x2_t rLo = vpadd_f32( xy0, xy1);
+        float32x2_t rHi = vpadd_f32( xy2, xy3);
+        rLo = vadd_f32(rLo, zLo);
+        rHi = vadd_f32(rHi, zHi);
+        
+        uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
+        uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
+        dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
+        dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
+        iLo = vbsl_u32(maskLo, indexLo, iLo);
+        iHi = vbsl_u32(maskHi, indexHi, iHi);
+        indexLo = vadd_u32(indexLo, four);
+        indexHi = vadd_u32(indexHi, four);
+    }
+    switch( count & 3 )
+    {
+        case 3:
+        {
+            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
+            float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
+            
+            float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
+            float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
+            float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
+            
+            float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
+            float32x2_t zLo = vmul_f32( z0.val[0], vHi);
+            float32x2_t zHi = vmul_f32( vdup_lane_f32(vget_high_f32(v2), 0), vHi);
+            
+            float32x2_t rLo = vpadd_f32( xy0, xy1);
+            float32x2_t rHi = vpadd_f32( xy2, xy2);
+            rLo = vadd_f32(rLo, zLo);
+            rHi = vadd_f32(rHi, zHi);
+            
+            uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
+            uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
+            dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
+            dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
+            iLo = vbsl_u32(maskLo, indexLo, iLo);
+            iHi = vbsl_u32(maskHi, indexHi, iHi);
+        }
+            break;
+        case 2:
+        {
+            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
+            
+            float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
+            float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
+            
+            float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
+            float32x2_t zLo = vmul_f32( z0.val[0], vHi);
+            
+            float32x2_t rLo = vpadd_f32( xy0, xy1);
+            rLo = vadd_f32(rLo, zLo);
+            
+            uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
+            dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
+            iLo = vbsl_u32(maskLo, indexLo, iLo);
+        }
+            break;
+        case 1:
+        {
+            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+            float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
+            float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
+            float32x2_t zLo = vmul_f32( z0, vHi);
+            float32x2_t rLo = vpadd_f32( xy0, xy0);
+            rLo = vadd_f32(rLo, zLo);
+            uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
+            dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
+            iLo = vbsl_u32(maskLo, indexLo, iLo);
+        }
+            break;
+            
+        default:
+            break;
+    }
+    
+    // select best answer between hi and lo results
+    uint32x2_t mask = vclt_f32( dotMinHi, dotMinLo );
+    dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
+    iLo = vbsl_u32(mask, iHi, iLo);
+    
+    // select best answer between even and odd results
+    dotMinHi = vdup_lane_f32(dotMinLo, 1);
+    iHi = vdup_lane_u32(iLo, 1);
+    mask = vclt_f32( dotMinHi, dotMinLo );
+    dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
+    iLo = vbsl_u32(mask, iHi, iLo);
+    
+    *dotResult = vget_lane_f32( dotMinLo, 0);
+    return vget_lane_u32(iLo, 0);
+}
+
+long _mindot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult )
+{
+    float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
+    float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
+    float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
+    const uint32x4_t four = (uint32x4_t){ 4, 4, 4, 4 };
+    uint32x4_t local_index = (uint32x4_t) {0, 1, 2, 3};
+    uint32x4_t index = (uint32x4_t) { -1, -1, -1, -1 };
+    float32x4_t minDot = (float32x4_t) { BT_INFINITY, BT_INFINITY, BT_INFINITY, BT_INFINITY };
+    
+    unsigned long i = 0;
+    for( ; i + 8 <= count; i += 8 )
+    {
+        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
+        
+        // the next two lines should resolve to a single vswp d, d
+        float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
+        float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
+        // the next two lines should resolve to a single vswp d, d
+        float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
+        float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
+        
+        xy0 = vmulq_f32(xy0, vLo);
+        xy1 = vmulq_f32(xy1, vLo);
+        
+        float32x4x2_t zb = vuzpq_f32( z0, z1);
+        float32x4_t z = vmulq_f32( zb.val[0], vHi);
+        float32x4x2_t xy = vuzpq_f32( xy0, xy1);
+        float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+        x = vaddq_f32(x, z);
+        
+        uint32x4_t mask = vcltq_f32(x, minDot);
+        minDot = vbslq_f32( mask, x, minDot);
+        index = vbslq_u32(mask, local_index, index);
+        local_index = vaddq_u32(local_index, four);
+        
+        v0 = vld1q_f32_aligned_postincrement( vv );
+        v1 = vld1q_f32_aligned_postincrement( vv );
+        v2 = vld1q_f32_aligned_postincrement( vv );
+        v3 = vld1q_f32_aligned_postincrement( vv );
+        
+        // the next two lines should resolve to a single vswp d, d
+        xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
+        xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
+        // the next two lines should resolve to a single vswp d, d
+        z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
+        z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
+        
+        xy0 = vmulq_f32(xy0, vLo);
+        xy1 = vmulq_f32(xy1, vLo);
+        
+        zb = vuzpq_f32( z0, z1);
+        z = vmulq_f32( zb.val[0], vHi);
+        xy = vuzpq_f32( xy0, xy1);
+        x = vaddq_f32(xy.val[0], xy.val[1]);
+        x = vaddq_f32(x, z);
+        
+        mask = vcltq_f32(x, minDot);
+        minDot = vbslq_f32( mask, x, minDot);
+        index = vbslq_u32(mask, local_index, index);
+        local_index = vaddq_u32(local_index, four);
+    }
+    
+    for( ; i + 4 <= count; i += 4 )
+    {
+        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
+        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
+        
+        // the next two lines should resolve to a single vswp d, d
+        float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
+        float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
+        // the next two lines should resolve to a single vswp d, d
+        float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
+        float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
+        
+        xy0 = vmulq_f32(xy0, vLo);
+        xy1 = vmulq_f32(xy1, vLo);
+        
+        float32x4x2_t zb = vuzpq_f32( z0, z1);
+        float32x4_t z = vmulq_f32( zb.val[0], vHi);
+        float32x4x2_t xy = vuzpq_f32( xy0, xy1);
+        float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+        x = vaddq_f32(x, z);
+        
+        uint32x4_t mask = vcltq_f32(x, minDot);
+        minDot = vbslq_f32( mask, x, minDot);
+        index = vbslq_u32(mask, local_index, index);
+        local_index = vaddq_u32(local_index, four);
+    }
+    
+    switch (count & 3) {
+        case 3:
+        {
+            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
+            float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
+            
+            // the next two lines should resolve to a single vswp d, d
+            float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
+            float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v2));
+            // the next two lines should resolve to a single vswp d, d
+            float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
+            float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v2));
+            
+            xy0 = vmulq_f32(xy0, vLo);
+            xy1 = vmulq_f32(xy1, vLo);
+            
+            float32x4x2_t zb = vuzpq_f32( z0, z1);
+            float32x4_t z = vmulq_f32( zb.val[0], vHi);
+            float32x4x2_t xy = vuzpq_f32( xy0, xy1);
+            float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+            x = vaddq_f32(x, z);
+            
+            uint32x4_t mask = vcltq_f32(x, minDot);
+            minDot = vbslq_f32( mask, x, minDot);
+            index = vbslq_u32(mask, local_index, index);
+            local_index = vaddq_u32(local_index, four);
+        }
+            break;
+            
+        case 2:
+        {
+            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
+            
+            // the next two lines should resolve to a single vswp d, d
+            float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
+            // the next two lines should resolve to a single vswp d, d
+            float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
+            
+            xy0 = vmulq_f32(xy0, vLo);
+            
+            float32x4x2_t zb = vuzpq_f32( z0, z0);
+            float32x4_t z = vmulq_f32( zb.val[0], vHi);
+            float32x4x2_t xy = vuzpq_f32( xy0, xy0);
+            float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+            x = vaddq_f32(x, z);
+            
+            uint32x4_t mask = vcltq_f32(x, minDot);
+            minDot = vbslq_f32( mask, x, minDot);
+            index = vbslq_u32(mask, local_index, index);
+            local_index = vaddq_u32(local_index, four);
+        }
+            break;
+            
+        case 1:
+        {
+            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
+            
+            // the next two lines should resolve to a single vswp d, d
+            float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v0));
+            // the next two lines should resolve to a single vswp d, d
+            float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0); 
+            
+            xy0 = vmulq_f32(xy0, vLo);
+            
+            z = vmulq_f32( z, vHi);
+            float32x4x2_t xy = vuzpq_f32( xy0, xy0);
+            float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+            x = vaddq_f32(x, z);
+            
+            uint32x4_t mask = vcltq_f32(x, minDot);
+            minDot = vbslq_f32( mask, x, minDot);
+            index = vbslq_u32(mask, local_index, index);
+            local_index = vaddq_u32(local_index, four);
+        }
+            break;
+            
+        default:
+            break;
+    }
+    
+    
+    // select best answer between hi and lo results
+    uint32x2_t mask = vclt_f32( vget_high_f32(minDot), vget_low_f32(minDot));
+    float32x2_t minDot2 = vbsl_f32(mask, vget_high_f32(minDot), vget_low_f32(minDot));
+    uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));
+    
+    // select best answer between even and odd results
+    float32x2_t minDotO = vdup_lane_f32(minDot2, 1);
+    uint32x2_t indexHi = vdup_lane_u32(index2, 1);
+    mask = vclt_f32( minDotO, minDot2 );
+    minDot2 = vbsl_f32(mask, minDotO, minDot2);
+    index2 = vbsl_u32(mask, indexHi, index2);
+    
+    *dotResult = vget_lane_f32( minDot2, 0);
+    return vget_lane_u32(index2, 0);
+    
+}
+
+#else
+    #error Unhandled __APPLE__ arch
+#endif
+
+#endif  /* __APPLE__ */
+
+
diff --git a/src/LinearMath/btVector3.h b/src/LinearMath/btVector3.h
index d99b7c83a..b2448ca31 100644
--- a/src/LinearMath/btVector3.h
+++ b/src/LinearMath/btVector3.h
@@ -17,9 +17,10 @@ subject to the following restrictions:
 #ifndef BT_VECTOR3_H
 #define BT_VECTOR3_H
 
-
+//#include <stdint.h>
 #include "btScalar.h"
 #include "btMinMax.h"
+#include "btAlignedAllocator.h"
 
 #ifdef BT_USE_DOUBLE_PRECISION
 #define btVector3Data btVector3DoubleData
@@ -29,8 +30,46 @@ subject to the following restrictions:
 #define btVector3DataName "btVector3FloatData"
 #endif //BT_USE_DOUBLE_PRECISION
 
+#if defined BT_USE_SSE
+
+//typedef  uint32_t __m128i __attribute__ ((vector_size(16)));
+
+#ifdef _MSC_VER
+#pragma warning(disable: 4556) // value of intrinsic immediate argument '4294967239' is out of range '0 - 255'
+#endif
 
 
+#define BT_SHUFFLE(x,y,z,w) ((w)<<6 | (z)<<4 | (y)<<2 | (x))
+//#define bt_pshufd_ps( _a, _mask ) (__m128) _mm_shuffle_epi32((__m128i)(_a), (_mask) )
+#define bt_pshufd_ps( _a, _mask ) _mm_shuffle_ps((_a), (_a), (_mask) )
+#define bt_splat3_ps( _a, _i ) bt_pshufd_ps((_a), BT_SHUFFLE(_i,_i,_i, 3) )
+#define bt_splat_ps( _a, _i )  bt_pshufd_ps((_a), BT_SHUFFLE(_i,_i,_i,_i) )
+
+#define btv3AbsiMask (_mm_set_epi32(0x00000000, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF))
+#define btvAbsMask (_mm_set_epi32( 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF))
+#define btvFFF0Mask (_mm_set_epi32(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF))
+#define btv3AbsfMask btCastiTo128f(btv3AbsiMask)
+#define btvFFF0fMask btCastiTo128f(btvFFF0Mask)
+#define btvxyzMaskf btvFFF0fMask
+#define btvAbsfMask btCastiTo128f(btvAbsMask)
+
+
+
+const __m128 ATTRIBUTE_ALIGNED16(btvMzeroMask) = {-0.0f, -0.0f, -0.0f, -0.0f};
+const __m128 ATTRIBUTE_ALIGNED16(v1110) = {1.0f, 1.0f, 1.0f, 0.0f};
+const __m128 ATTRIBUTE_ALIGNED16(vHalf) = {0.5f, 0.5f, 0.5f, 0.5f};
+const __m128 ATTRIBUTE_ALIGNED16(v1_5)  = {1.5f, 1.5f, 1.5f, 1.5f};
+
+#endif
+
+#ifdef BT_USE_NEON
+
+const float32x4_t ATTRIBUTE_ALIGNED16(btvMzeroMask) = (float32x4_t){-0.0f, -0.0f, -0.0f, -0.0f};
+const int32x4_t ATTRIBUTE_ALIGNED16(btvFFF0Mask) = (int32x4_t){0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0};
+const int32x4_t ATTRIBUTE_ALIGNED16(btvAbsMask) = (int32x4_t){0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
+const int32x4_t ATTRIBUTE_ALIGNED16(btv3AbsMask) = (int32x4_t){0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x0};
+
+#endif
 
 /**@brief btVector3 can be used to represent 3D points and vectors.
  * It has an un-used w component to suit 16-byte alignment when btVector3 is stored in containers. This extra component can be used by derived classes (Quaternion?) or by user
@@ -40,6 +79,8 @@ ATTRIBUTE_ALIGNED16(class) btVector3
 {
 public:
 
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+
 #if defined (__SPU__) && defined (__CELLOS_LV2__)
 		btScalar	m_floats[4];
 public:
@@ -49,28 +90,31 @@ public:
 	}
 public:
 #else //__CELLOS_LV2__ __SPU__
-#ifdef BT_USE_SSE // _WIN32
-	union {
-		__m128 mVec128;
-		btScalar	m_floats[4];
-	};
-	SIMD_FORCE_INLINE	__m128	get128() const
-	{
-		return mVec128;
-	}
-	SIMD_FORCE_INLINE	void	set128(__m128 v128)
-	{
-		mVec128 = v128;
-	}
-#else
-	btScalar	m_floats[4];
-#endif
+    #if defined (BT_USE_SSE) || defined(BT_USE_NEON) // _WIN32 || ARM
+        union {
+            btSimdFloat4      mVec128;
+            btScalar	m_floats[4];
+        };
+        SIMD_FORCE_INLINE	btSimdFloat4	get128() const
+        {
+            return mVec128;
+        }
+        SIMD_FORCE_INLINE	void	set128(btSimdFloat4 v128)
+        {
+            mVec128 = v128;
+        }
+    #else
+        btScalar	m_floats[4];
+    #endif
 #endif //__CELLOS_LV2__ __SPU__
 
 	public:
 
   /**@brief No initialization constructor */
-	SIMD_FORCE_INLINE btVector3() {}
+	SIMD_FORCE_INLINE btVector3() 
+	{
+
+	}
 
  
 	
@@ -79,21 +123,50 @@ public:
    * @param y Y value 
    * @param z Z value 
    */
-	SIMD_FORCE_INLINE btVector3(const btScalar& x, const btScalar& y, const btScalar& z)
+	SIMD_FORCE_INLINE btVector3(const btScalar& _x, const btScalar& _y, const btScalar& _z)
 	{
-		m_floats[0] = x;
-		m_floats[1] = y;
-		m_floats[2] = z;
-		m_floats[3] = btScalar(0.);
+		m_floats[0] = _x;
+		m_floats[1] = _y;
+		m_floats[2] = _z;
+		m_floats[3] = btScalar(0.f);
 	}
 
-	
+#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE) )|| defined (BT_USE_NEON)
+	// Set Vector 
+	SIMD_FORCE_INLINE btVector3( btSimdFloat4 v)
+	{
+		mVec128 = v;
+	}
+
+	// Copy constructor
+	SIMD_FORCE_INLINE btVector3(const btVector3& rhs)
+	{
+		mVec128 = rhs.mVec128;
+	}
+
+	// Assignment Operator
+	SIMD_FORCE_INLINE btVector3& 
+	operator=(const btVector3& v) 
+	{
+		mVec128 = v.mVec128;
+		
+		return *this;
+	}
+#endif // #if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON) 
+    
 /**@brief Add a vector to this one 
  * @param The vector to add to this one */
 	SIMD_FORCE_INLINE btVector3& operator+=(const btVector3& v)
 	{
-
-		m_floats[0] += v.m_floats[0]; m_floats[1] += v.m_floats[1];m_floats[2] += v.m_floats[2];
+#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+		mVec128 = _mm_add_ps(mVec128, v.mVec128);
+#elif defined(BT_USE_NEON)
+		mVec128 = vaddq_f32(mVec128, v.mVec128);
+#else
+		m_floats[0] += v.m_floats[0]; 
+		m_floats[1] += v.m_floats[1];
+		m_floats[2] += v.m_floats[2];
+#endif
 		return *this;
 	}
 
@@ -102,14 +175,33 @@ public:
    * @param The vector to subtract */
 	SIMD_FORCE_INLINE btVector3& operator-=(const btVector3& v) 
 	{
-		m_floats[0] -= v.m_floats[0]; m_floats[1] -= v.m_floats[1];m_floats[2] -= v.m_floats[2];
+#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+		mVec128 = _mm_sub_ps(mVec128, v.mVec128);
+#elif defined(BT_USE_NEON)
+		mVec128 = vsubq_f32(mVec128, v.mVec128);
+#else
+		m_floats[0] -= v.m_floats[0]; 
+		m_floats[1] -= v.m_floats[1];
+		m_floats[2] -= v.m_floats[2];
+#endif
 		return *this;
 	}
+	
   /**@brief Scale the vector
    * @param s Scale factor */
 	SIMD_FORCE_INLINE btVector3& operator*=(const btScalar& s)
 	{
-		m_floats[0] *= s; m_floats[1] *= s;m_floats[2] *= s;
+#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+		__m128	vs = _mm_load_ss(&s);	//	(S 0 0 0)
+		vs = bt_pshufd_ps(vs, 0x80);	//	(S S S 0.0)
+		mVec128 = _mm_mul_ps(mVec128, vs);
+#elif defined(BT_USE_NEON)
+		mVec128 = vmulq_n_f32(mVec128, s);
+#else
+		m_floats[0] *= s; 
+		m_floats[1] *= s;
+		m_floats[2] *= s;
+#endif
 		return *this;
 	}
 
@@ -118,14 +210,42 @@ public:
 	SIMD_FORCE_INLINE btVector3& operator/=(const btScalar& s) 
 	{
 		btFullAssert(s != btScalar(0.0));
+
+#if 0 //defined(BT_USE_SSE_IN_API)
+// this code is not faster !
+		__m128 vs = _mm_load_ss(&s);
+		vs = _mm_div_ss(v1110, vs);
+		vs = bt_pshufd_ps(vs, 0x00);	//	(S S S S)
+
+		mVec128 = _mm_mul_ps(mVec128, vs);
+		
+		return *this;
+#else
 		return *this *= btScalar(1.0) / s;
+#endif
 	}
 
   /**@brief Return the dot product
    * @param v The other vector in the dot product */
 	SIMD_FORCE_INLINE btScalar dot(const btVector3& v) const
 	{
-		return m_floats[0] * v.m_floats[0] + m_floats[1] * v.m_floats[1] +m_floats[2] * v.m_floats[2];
+#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)		
+		__m128 vd = _mm_mul_ps(mVec128, v.mVec128);
+		__m128 z = _mm_movehl_ps(vd, vd);
+		__m128 y = _mm_shuffle_ps(vd, vd, 0x55);
+		vd = _mm_add_ss(vd, y);
+		vd = _mm_add_ss(vd, z);
+		return _mm_cvtss_f32(vd);
+#elif defined(BT_USE_NEON)
+		float32x4_t vd = vmulq_f32(mVec128, v.mVec128);
+		float32x2_t x = vpadd_f32(vget_low_f32(vd), vget_low_f32(vd));  
+		x = vadd_f32(x, vget_high_f32(vd));
+		return vget_lane_f32(x, 0);
+#else	
+		return	m_floats[0] * v.m_floats[0] + 
+				m_floats[1] * v.m_floats[1] + 
+				m_floats[2] * v.m_floats[2];
+#endif
 	}
 
   /**@brief Return the length of the vector squared */
@@ -165,7 +285,44 @@ public:
    * x^2 + y^2 + z^2 = 1 */
 	SIMD_FORCE_INLINE btVector3& normalize() 
 	{
+#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)		
+        // dot product first
+		__m128 vd = _mm_mul_ps(mVec128, mVec128);
+		__m128 z = _mm_movehl_ps(vd, vd);
+		__m128 y = _mm_shuffle_ps(vd, vd, 0x55);
+		vd = _mm_add_ss(vd, y);
+		vd = _mm_add_ss(vd, z);
+		
+        #if 0
+        vd = _mm_sqrt_ss(vd);
+		vd = _mm_div_ss(v1110, vd);
+		vd = bt_splat_ps(vd, 0x80);
+		mVec128 = _mm_mul_ps(mVec128, vd);
+        #else
+        
+        // NR step 1/sqrt(x) - vd is x, y is output 
+        y = _mm_rsqrt_ss(vd); // estimate 
+        
+        //  one step NR 
+        z = v1_5;
+        vd = _mm_mul_ss(vd, vHalf); // vd * 0.5	
+        //x2 = vd;
+        vd = _mm_mul_ss(vd, y); // vd * 0.5 * y0
+        vd = _mm_mul_ss(vd, y); // vd * 0.5 * y0 * y0
+        z = _mm_sub_ss(z, vd);  // 1.5 - vd * 0.5 * y0 * y0 
+
+        y = _mm_mul_ss(y, z);   // y0 * (1.5 - vd * 0.5 * y0 * y0)
+
+		y = bt_splat_ps(y, 0x80);
+		mVec128 = _mm_mul_ps(mVec128, y);
+
+        #endif
+
+		
+		return *this;
+#else	
 		return *this /= length();
+#endif
 	}
 
   /**@brief Return a normalized version of this vector */
@@ -184,29 +341,111 @@ public:
 		btFullAssert(s != btScalar(0.0));
 		return btAcos(dot(v) / s);
 	}
+	
   /**@brief Return a vector will the absolute values of each element */
 	SIMD_FORCE_INLINE btVector3 absolute() const 
 	{
+#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE) 
+		return btVector3(_mm_and_ps(mVec128, btv3AbsfMask));
+#elif defined(BT_USE_NEON)
+		return btVector3(vabsq_f32(mVec128));
+#else	
 		return btVector3(
 			btFabs(m_floats[0]), 
 			btFabs(m_floats[1]), 
 			btFabs(m_floats[2]));
+#endif
 	}
+	
   /**@brief Return the cross product between this and another vector 
    * @param v The other vector */
 	SIMD_FORCE_INLINE btVector3 cross(const btVector3& v) const
 	{
+#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+		__m128	T, V;
+		
+		T = bt_pshufd_ps(mVec128, BT_SHUFFLE(1, 2, 0, 3));	//	(Y Z X 0)
+		V = bt_pshufd_ps(v.mVec128, BT_SHUFFLE(1, 2, 0, 3));	//	(Y Z X 0)
+		
+		V = _mm_mul_ps(V, mVec128);
+		T = _mm_mul_ps(T, v.mVec128);
+		V = _mm_sub_ps(V, T);
+		
+		V = bt_pshufd_ps(V, BT_SHUFFLE(1, 2, 0, 3));
+		return btVector3(V);
+#elif defined(BT_USE_NEON)
+		float32x4_t T, V;
+		// form (Y, Z, X, _) of mVec128 and v.mVec128
+		float32x2_t Tlow = vget_low_f32(mVec128);
+		float32x2_t Vlow = vget_low_f32(v.mVec128);
+		T = vcombine_f32(vext_f32(Tlow, vget_high_f32(mVec128), 1), Tlow);
+		V = vcombine_f32(vext_f32(Vlow, vget_high_f32(v.mVec128), 1), Vlow);
+		
+		V = vmulq_f32(V, mVec128);
+		T = vmulq_f32(T, v.mVec128);
+		V = vsubq_f32(V, T);
+		Vlow = vget_low_f32(V);
+		// form (Y, Z, X, _);
+		V = vcombine_f32(vext_f32(Vlow, vget_high_f32(V), 1), Vlow);
+		V = (float32x4_t)vandq_s32((int32x4_t)V, btvFFF0Mask);
+		
+		return btVector3(V);
+#else
 		return btVector3(
-			m_floats[1] * v.m_floats[2] -m_floats[2] * v.m_floats[1],
+			m_floats[1] * v.m_floats[2] - m_floats[2] * v.m_floats[1],
 			m_floats[2] * v.m_floats[0] - m_floats[0] * v.m_floats[2],
 			m_floats[0] * v.m_floats[1] - m_floats[1] * v.m_floats[0]);
+#endif
 	}
 
 	SIMD_FORCE_INLINE btScalar triple(const btVector3& v1, const btVector3& v2) const
 	{
-		return m_floats[0] * (v1.m_floats[1] * v2.m_floats[2] - v1.m_floats[2] * v2.m_floats[1]) + 
+#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+		// cross:
+		__m128 T = _mm_shuffle_ps(v1.mVec128, v1.mVec128, BT_SHUFFLE(1, 2, 0, 3));	//	(Y Z X 0)
+		__m128 V = _mm_shuffle_ps(v2.mVec128, v2.mVec128, BT_SHUFFLE(1, 2, 0, 3));	//	(Y Z X 0)
+		
+		V = _mm_mul_ps(V, v1.mVec128);
+		T = _mm_mul_ps(T, v2.mVec128);
+		V = _mm_sub_ps(V, T);
+		
+		V = _mm_shuffle_ps(V, V, BT_SHUFFLE(1, 2, 0, 3));
+
+		// dot: 
+		V = _mm_mul_ps(V, mVec128);
+		__m128 z = _mm_movehl_ps(V, V);
+		__m128 y = _mm_shuffle_ps(V, V, 0x55);
+		V = _mm_add_ss(V, y);
+		V = _mm_add_ss(V, z);
+		return _mm_cvtss_f32(V);
+
+#elif defined(BT_USE_NEON)
+		// cross:
+		float32x4_t T, V;
+		// form (Y, Z, X, _) of mVec128 and v.mVec128
+		float32x2_t Tlow = vget_low_f32(v1.mVec128);
+		float32x2_t Vlow = vget_low_f32(v2.mVec128);
+		T = vcombine_f32(vext_f32(Tlow, vget_high_f32(v1.mVec128), 1), Tlow);
+		V = vcombine_f32(vext_f32(Vlow, vget_high_f32(v2.mVec128), 1), Vlow);
+		
+		V = vmulq_f32(V, v1.mVec128);
+		T = vmulq_f32(T, v2.mVec128);
+		V = vsubq_f32(V, T);
+		Vlow = vget_low_f32(V);
+		// form (Y, Z, X, _);
+		V = vcombine_f32(vext_f32(Vlow, vget_high_f32(V), 1), Vlow);
+
+		// dot: 
+		V = vmulq_f32(mVec128, V);
+		float32x2_t x = vpadd_f32(vget_low_f32(V), vget_low_f32(V));  
+		x = vadd_f32(x, vget_high_f32(V));
+		return vget_lane_f32(x, 0);
+#else
+		return 
+			m_floats[0] * (v1.m_floats[1] * v2.m_floats[2] - v1.m_floats[2] * v2.m_floats[1]) + 
 			m_floats[1] * (v1.m_floats[2] * v2.m_floats[0] - v1.m_floats[0] * v2.m_floats[2]) + 
 			m_floats[2] * (v1.m_floats[0] * v2.m_floats[1] - v1.m_floats[1] * v2.m_floats[0]);
+#endif
 	}
 
   /**@brief Return the axis with the smallest value 
@@ -235,12 +474,25 @@ public:
 
 	SIMD_FORCE_INLINE void setInterpolate3(const btVector3& v0, const btVector3& v1, btScalar rt)
 	{
+#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+		__m128	vrt = _mm_load_ss(&rt);	//	(rt 0 0 0)
+		vrt = bt_pshufd_ps(vrt, 0x80);	//	(rt rt rt 0.0)
+		
+		mVec128 = _mm_sub_ps(v1.mVec128, v0.mVec128);
+		mVec128 = _mm_mul_ps(mVec128, vrt);
+		mVec128 = _mm_add_ps(mVec128, v0.mVec128);
+#elif defined(BT_USE_NEON)
+		mVec128 = vsubq_f32(v1.mVec128, v0.mVec128);
+		mVec128 = vmulq_n_f32(mVec128, rt);
+		mVec128 = vaddq_f32(mVec128, v0.mVec128);
+#else	
 		btScalar s = btScalar(1.0) - rt;
 		m_floats[0] = s * v0.m_floats[0] + rt * v1.m_floats[0];
 		m_floats[1] = s * v0.m_floats[1] + rt * v1.m_floats[1];
 		m_floats[2] = s * v0.m_floats[2] + rt * v1.m_floats[2];
 		//don't do the unused w component
 		//		m_co[3] = s * v0[3] + rt * v1[3];
+#endif
 	}
 
   /**@brief Return the linear interpolation between this and another vector 
@@ -248,16 +500,41 @@ public:
    * @param t The ration of this to v (t = 0 => return this, t=1 => return other) */
 	SIMD_FORCE_INLINE btVector3 lerp(const btVector3& v, const btScalar& t) const 
 	{
-		return btVector3(m_floats[0] + (v.m_floats[0] - m_floats[0]) * t,
-			m_floats[1] + (v.m_floats[1] - m_floats[1]) * t,
-			m_floats[2] + (v.m_floats[2] -m_floats[2]) * t);
+#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+		__m128	vt = _mm_load_ss(&t);	//	(t 0 0 0)
+		vt = bt_pshufd_ps(vt, 0x80);	//	(rt rt rt 0.0)
+		__m128 vl = _mm_sub_ps(v.mVec128, mVec128);
+		vl = _mm_mul_ps(vl, vt);
+		vl = _mm_add_ps(vl, mVec128);
+		
+		return btVector3(vl);
+#elif defined(BT_USE_NEON)
+		float32x4_t vl = vsubq_f32(v.mVec128, mVec128);
+		vl = vmulq_n_f32(vl, t);
+		vl = vaddq_f32(vl, mVec128);
+		
+		return btVector3(vl);
+#else	
+		return 
+			btVector3(	m_floats[0] + (v.m_floats[0] - m_floats[0]) * t,
+						m_floats[1] + (v.m_floats[1] - m_floats[1]) * t,
+						m_floats[2] + (v.m_floats[2] - m_floats[2]) * t);
+#endif
 	}
 
   /**@brief Elementwise multiply this vector by the other 
    * @param v The other vector */
 	SIMD_FORCE_INLINE btVector3& operator*=(const btVector3& v)
 	{
-		m_floats[0] *= v.m_floats[0]; m_floats[1] *= v.m_floats[1];m_floats[2] *= v.m_floats[2];
+#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+		mVec128 = _mm_mul_ps(mVec128, v.mVec128);
+#elif defined(BT_USE_NEON)
+		mVec128 = vmulq_f32(mVec128, v.mVec128);
+#else	
+		m_floats[0] *= v.m_floats[0]; 
+		m_floats[1] *= v.m_floats[1];
+		m_floats[2] *= v.m_floats[2];
+#endif
 		return *this;
 	}
 
@@ -268,13 +545,13 @@ public:
   /**@brief Return the z value */
 		SIMD_FORCE_INLINE const btScalar& getZ() const { return m_floats[2]; }
   /**@brief Set the x value */
-		SIMD_FORCE_INLINE void	setX(btScalar x) { m_floats[0] = x;};
+		SIMD_FORCE_INLINE void	setX(btScalar _x) { m_floats[0] = _x;};
   /**@brief Set the y value */
-		SIMD_FORCE_INLINE void	setY(btScalar y) { m_floats[1] = y;};
+		SIMD_FORCE_INLINE void	setY(btScalar _y) { m_floats[1] = _y;};
   /**@brief Set the z value */
-		SIMD_FORCE_INLINE void	setZ(btScalar z) {m_floats[2] = z;};
+		SIMD_FORCE_INLINE void	setZ(btScalar _z) { m_floats[2] = _z;};
   /**@brief Set the w value */
-		SIMD_FORCE_INLINE void	setW(btScalar w) { m_floats[3] = w;};
+		SIMD_FORCE_INLINE void	setW(btScalar _w) { m_floats[3] = _w;};
   /**@brief Return the x value */
 		SIMD_FORCE_INLINE const btScalar& x() const { return m_floats[0]; }
   /**@brief Return the y value */
@@ -292,7 +569,14 @@ public:
 
 	SIMD_FORCE_INLINE	bool	operator==(const btVector3& other) const
 	{
-		return ((m_floats[3]==other.m_floats[3]) && (m_floats[2]==other.m_floats[2]) && (m_floats[1]==other.m_floats[1]) && (m_floats[0]==other.m_floats[0]));
+#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+        return (0xf == _mm_movemask_ps((__m128)_mm_cmpeq_ps(mVec128, other.mVec128)));
+#else 
+		return ((m_floats[3]==other.m_floats[3]) && 
+                (m_floats[2]==other.m_floats[2]) && 
+                (m_floats[1]==other.m_floats[1]) && 
+                (m_floats[0]==other.m_floats[0]));
+#endif
 	}
 
 	SIMD_FORCE_INLINE	bool	operator!=(const btVector3& other) const
@@ -300,103 +584,230 @@ public:
 		return !(*this == other);
 	}
 
-	 /**@brief Set each element to the max of the current values and the values of another btVector3
+  /**@brief Set each element to the max of the current values and the values of another btVector3
    * @param other The other btVector3 to compare with 
    */
-		SIMD_FORCE_INLINE void	setMax(const btVector3& other)
-		{
-			btSetMax(m_floats[0], other.m_floats[0]);
-			btSetMax(m_floats[1], other.m_floats[1]);
-			btSetMax(m_floats[2], other.m_floats[2]);
-			btSetMax(m_floats[3], other.w());
-		}
+	SIMD_FORCE_INLINE void	setMax(const btVector3& other)
+	{
+#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+		mVec128 = _mm_max_ps(mVec128, other.mVec128);
+#elif defined(BT_USE_NEON)
+		mVec128 = vmaxq_f32(mVec128, other.mVec128);
+#else
+		btSetMax(m_floats[0], other.m_floats[0]);
+		btSetMax(m_floats[1], other.m_floats[1]);
+		btSetMax(m_floats[2], other.m_floats[2]);
+		btSetMax(m_floats[3], other.w());
+#endif
+	}
+
   /**@brief Set each element to the min of the current values and the values of another btVector3
    * @param other The other btVector3 to compare with 
    */
-		SIMD_FORCE_INLINE void	setMin(const btVector3& other)
-		{
-			btSetMin(m_floats[0], other.m_floats[0]);
-			btSetMin(m_floats[1], other.m_floats[1]);
-			btSetMin(m_floats[2], other.m_floats[2]);
-			btSetMin(m_floats[3], other.w());
-		}
+	SIMD_FORCE_INLINE void	setMin(const btVector3& other)
+	{
+#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+		mVec128 = _mm_min_ps(mVec128, other.mVec128);
+#elif defined(BT_USE_NEON)
+		mVec128 = vminq_f32(mVec128, other.mVec128);
+#else
+		btSetMin(m_floats[0], other.m_floats[0]);
+		btSetMin(m_floats[1], other.m_floats[1]);
+		btSetMin(m_floats[2], other.m_floats[2]);
+		btSetMin(m_floats[3], other.w());
+#endif
+	}
 
-		SIMD_FORCE_INLINE void 	setValue(const btScalar& x, const btScalar& y, const btScalar& z)
-		{
-			m_floats[0]=x;
-			m_floats[1]=y;
-			m_floats[2]=z;
-			m_floats[3] = btScalar(0.);
-		}
+	SIMD_FORCE_INLINE void 	setValue(const btScalar& _x, const btScalar& _y, const btScalar& _z)
+	{
+		m_floats[0]=_x;
+		m_floats[1]=_y;
+		m_floats[2]=_z;
+		m_floats[3] = btScalar(0.f);
+	}
 
-		void	getSkewSymmetricMatrix(btVector3* v0,btVector3* v1,btVector3* v2) const
-		{
-			v0->setValue(0.		,-z()		,y());
-			v1->setValue(z()	,0.			,-x());
-			v2->setValue(-y()	,x()	,0.);
-		}
+	void	getSkewSymmetricMatrix(btVector3* v0,btVector3* v1,btVector3* v2) const
+	{
+#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+ 
+		__m128 V  = _mm_and_ps(mVec128, btvFFF0fMask);
+		__m128 V0 = _mm_xor_ps(btvMzeroMask, V);
+		__m128 V2 = _mm_movelh_ps(V0, V);
+		
+		__m128 V1 = _mm_shuffle_ps(V, V0, 0xCE);
+		
+        V0 = _mm_shuffle_ps(V0, V, 0xDB);
+		V2 = _mm_shuffle_ps(V2, V, 0xF9);
+		
+		v0->mVec128 = V0;
+		v1->mVec128 = V1;
+		v2->mVec128 = V2;
+#else
+		v0->setValue(0.		,-z()		,y());
+		v1->setValue(z()	,0.			,-x());
+		v2->setValue(-y()	,x()	,0.);
+#endif
+	}
 
-		void	setZero()
-		{
-			setValue(btScalar(0.),btScalar(0.),btScalar(0.));
-		}
+	void setZero()
+	{
+#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+		mVec128 = (__m128)_mm_xor_ps(mVec128, mVec128);
+#elif defined(BT_USE_NEON)
+		int32x4_t vi = vdupq_n_s32(0); 
+		mVec128 = vreinterpretq_f32_s32(vi);
+#else	
+		setValue(btScalar(0.),btScalar(0.),btScalar(0.));
+#endif
+	}
 
-		SIMD_FORCE_INLINE bool isZero() const 
-		{
-			return m_floats[0] == btScalar(0) && m_floats[1] == btScalar(0) && m_floats[2] == btScalar(0);
-		}
+	SIMD_FORCE_INLINE bool isZero() const 
+	{
+		return m_floats[0] == btScalar(0) && m_floats[1] == btScalar(0) && m_floats[2] == btScalar(0);
+	}
 
-		SIMD_FORCE_INLINE bool fuzzyZero() const 
-		{
-			return length2() < SIMD_EPSILON;
-		}
+	SIMD_FORCE_INLINE bool fuzzyZero() const 
+	{
+		return length2() < SIMD_EPSILON;
+	}
 
-		SIMD_FORCE_INLINE	void	serialize(struct	btVector3Data& dataOut) const;
+	SIMD_FORCE_INLINE	void	serialize(struct	btVector3Data& dataOut) const;
 
-		SIMD_FORCE_INLINE	void	deSerialize(const struct	btVector3Data& dataIn);
+	SIMD_FORCE_INLINE	void	deSerialize(const struct	btVector3Data& dataIn);
 
-		SIMD_FORCE_INLINE	void	serializeFloat(struct	btVector3FloatData& dataOut) const;
+	SIMD_FORCE_INLINE	void	serializeFloat(struct	btVector3FloatData& dataOut) const;
 
-		SIMD_FORCE_INLINE	void	deSerializeFloat(const struct	btVector3FloatData& dataIn);
+	SIMD_FORCE_INLINE	void	deSerializeFloat(const struct	btVector3FloatData& dataIn);
 
-		SIMD_FORCE_INLINE	void	serializeDouble(struct	btVector3DoubleData& dataOut) const;
+	SIMD_FORCE_INLINE	void	serializeDouble(struct	btVector3DoubleData& dataOut) const;
 
-		SIMD_FORCE_INLINE	void	deSerializeDouble(const struct	btVector3DoubleData& dataIn);
+	SIMD_FORCE_INLINE	void	deSerializeDouble(const struct	btVector3DoubleData& dataIn);
+    
+        /**@brief returns index of maximum dot product between this and vectors in array[]
+         * @param array The other vectors 
+         * @param array_count The number of other vectors 
+         * @param dotOut The maximum dot product */
+        SIMD_FORCE_INLINE   long    maxDot( const btVector3 *array, long array_count, btScalar &dotOut ) const; 
 
+        /**@brief returns index of minimum dot product between this and vectors in array[]
+         * @param array The other vectors 
+         * @param array_count The number of other vectors 
+         * @param dotOut The minimum dot product */    
+        SIMD_FORCE_INLINE   long    minDot( const btVector3 *array, long array_count, btScalar &dotOut ) const; 
+
+    /* create a vector as  btVector3( this->dot( btVector3 v0 ), this->dot( btVector3 v1), this->dot( btVector3 v2 ))  */
+    SIMD_FORCE_INLINE btVector3  dot3( const btVector3 &v0, const btVector3 &v1, const btVector3 &v2 ) const
+    {
+#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+
+        __m128 a0 = _mm_mul_ps( v0.mVec128, this->mVec128 );
+        __m128 a1 = _mm_mul_ps( v1.mVec128, this->mVec128 );
+        __m128 a2 = _mm_mul_ps( v2.mVec128, this->mVec128 );
+        __m128 b0 = _mm_unpacklo_ps( a0, a1 );
+        __m128 b1 = _mm_unpackhi_ps( a0, a1 );
+        __m128 b2 = _mm_unpacklo_ps( a2, _mm_setzero_ps() );
+        __m128 r = _mm_movelh_ps( b0, b2 );
+        r = _mm_add_ps( r, _mm_movehl_ps( b2, b0 ));
+        a2 = _mm_and_ps( a2, btvxyzMaskf);
+        r = _mm_add_ps( r, btCastdTo128f (_mm_move_sd( btCastfTo128d(a2), btCastfTo128d(b1) )));
+        return btVector3(r);
+        
+#elif defined(BT_USE_NEON)
+        static const uint32x4_t xyzMask = (const uint32x4_t){ -1, -1, -1, 0 };
+        float32x4_t a0 = vmulq_f32( v0.mVec128, this->mVec128);
+        float32x4_t a1 = vmulq_f32( v1.mVec128, this->mVec128);
+        float32x4_t a2 = vmulq_f32( v2.mVec128, this->mVec128);
+        float32x2x2_t zLo = vtrn_f32( vget_high_f32(a0), vget_high_f32(a1));
+        a2 = (float32x4_t) vandq_u32((uint32x4_t) a2, xyzMask );
+        float32x2_t b0 = vadd_f32( vpadd_f32( vget_low_f32(a0), vget_low_f32(a1)), zLo.val[0] );
+        float32x2_t b1 = vpadd_f32( vpadd_f32( vget_low_f32(a2), vget_high_f32(a2)), vdup_n_f32(0.0f));
+        return btVector3( vcombine_f32(b0, b1) );
+#else	
+		return btVector3( dot(v0), dot(v1), dot(v2));
+#endif
+    }
 };
 
 /**@brief Return the sum of two vectors (Point symantics)*/
 SIMD_FORCE_INLINE btVector3 
 operator+(const btVector3& v1, const btVector3& v2) 
 {
-	return btVector3(v1.m_floats[0] + v2.m_floats[0], v1.m_floats[1] + v2.m_floats[1], v1.m_floats[2] + v2.m_floats[2]);
+#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+	return btVector3(_mm_add_ps(v1.mVec128, v2.mVec128));
+#elif defined(BT_USE_NEON)
+	return btVector3(vaddq_f32(v1.mVec128, v2.mVec128));
+#else
+	return btVector3(
+			v1.m_floats[0] + v2.m_floats[0], 
+			v1.m_floats[1] + v2.m_floats[1], 
+			v1.m_floats[2] + v2.m_floats[2]);
+#endif
 }
 
 /**@brief Return the elementwise product of two vectors */
 SIMD_FORCE_INLINE btVector3 
 operator*(const btVector3& v1, const btVector3& v2) 
 {
-	return btVector3(v1.m_floats[0] * v2.m_floats[0], v1.m_floats[1] * v2.m_floats[1], v1.m_floats[2] * v2.m_floats[2]);
+#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+	return btVector3(_mm_mul_ps(v1.mVec128, v2.mVec128));
+#elif defined(BT_USE_NEON)
+	return btVector3(vmulq_f32(v1.mVec128, v2.mVec128));
+#else
+	return btVector3(
+			v1.m_floats[0] * v2.m_floats[0], 
+			v1.m_floats[1] * v2.m_floats[1], 
+			v1.m_floats[2] * v2.m_floats[2]);
+#endif
 }
 
 /**@brief Return the difference between two vectors */
 SIMD_FORCE_INLINE btVector3 
 operator-(const btVector3& v1, const btVector3& v2)
 {
-	return btVector3(v1.m_floats[0] - v2.m_floats[0], v1.m_floats[1] - v2.m_floats[1], v1.m_floats[2] - v2.m_floats[2]);
+#if (defined(BT_USE_SSE_IN_API)  && defined(BT_USE_SSE))
+
+	//	without _mm_and_ps this code causes slowdown in Concave moving
+	__m128 r = _mm_sub_ps(v1.mVec128, v2.mVec128);
+	return btVector3(_mm_and_ps(r, btvFFF0fMask));
+#elif defined(BT_USE_NEON)
+	float32x4_t r = vsubq_f32(v1.mVec128, v2.mVec128);
+	return btVector3((float32x4_t)vandq_s32((int32x4_t)r, btvFFF0Mask));
+#else
+	return btVector3(
+			v1.m_floats[0] - v2.m_floats[0], 
+			v1.m_floats[1] - v2.m_floats[1], 
+			v1.m_floats[2] - v2.m_floats[2]);
+#endif
 }
+
 /**@brief Return the negative of the vector */
 SIMD_FORCE_INLINE btVector3 
 operator-(const btVector3& v)
 {
+#if (defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
+	__m128 r = _mm_xor_ps(v.mVec128, btvMzeroMask);
+	return btVector3(_mm_and_ps(r, btvFFF0fMask)); 
+#elif defined(BT_USE_NEON)
+	return btVector3((btSimdFloat4)veorq_s32((int32x4_t)v.mVec128, (int32x4_t)btvMzeroMask));
+#else	
 	return btVector3(-v.m_floats[0], -v.m_floats[1], -v.m_floats[2]);
+#endif
 }
 
 /**@brief Return the vector scaled by s */
 SIMD_FORCE_INLINE btVector3 
 operator*(const btVector3& v, const btScalar& s)
 {
+#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+	__m128	vs = _mm_load_ss(&s);	//	(S 0 0 0)
+	vs = bt_pshufd_ps(vs, 0x80);	//	(S S S 0.0)
+	return btVector3(_mm_mul_ps(v.mVec128, vs));
+#elif defined(BT_USE_NEON)
+	float32x4_t r = vmulq_n_f32(v.mVec128, s);
+	return btVector3((float32x4_t)vandq_s32((int32x4_t)r, btvFFF0Mask));
+#else
 	return btVector3(v.m_floats[0] * s, v.m_floats[1] * s, v.m_floats[2] * s);
+#endif
 }
 
 /**@brief Return the vector scaled by s */
@@ -411,14 +822,46 @@ SIMD_FORCE_INLINE btVector3
 operator/(const btVector3& v, const btScalar& s)
 {
 	btFullAssert(s != btScalar(0.0));
+#if 0 //defined(BT_USE_SSE_IN_API)
+// this code is not faster !
+	__m128 vs = _mm_load_ss(&s);
+    vs = _mm_div_ss(v1110, vs);
+	vs = bt_pshufd_ps(vs, 0x00);	//	(S S S S)
+
+	return btVector3(_mm_mul_ps(v.mVec128, vs));
+#else
 	return v * (btScalar(1.0) / s);
+#endif
 }
 
 /**@brief Return the vector inversely scaled by s */
 SIMD_FORCE_INLINE btVector3
 operator/(const btVector3& v1, const btVector3& v2)
 {
-	return btVector3(v1.m_floats[0] / v2.m_floats[0],v1.m_floats[1] / v2.m_floats[1],v1.m_floats[2] / v2.m_floats[2]);
+#if (defined(BT_USE_SSE_IN_API)&& defined (BT_USE_SSE))
+	__m128 vec = _mm_div_ps(v1.mVec128, v2.mVec128);
+	vec = _mm_and_ps(vec, btvFFF0fMask);
+	return btVector3(vec); 
+#elif defined(BT_USE_NEON)
+	float32x4_t x, y, v, m;
+
+	x = v1.mVec128;
+	y = v2.mVec128;
+	
+	v = vrecpeq_f32(y);			// v ~ 1/y
+	m = vrecpsq_f32(y, v);		// m = (2-v*y)
+	v = vmulq_f32(v, m);		// vv = v*m ~~ 1/y
+	m = vrecpsq_f32(y, v);		// mm = (2-vv*y)
+	v = vmulq_f32(v, x);		// x*vv
+	v = vmulq_f32(v, m);		// (x*vv)*(2-vv*y) = x*(vv(2-vv*y)) ~~~ x/y
+
+	return btVector3(v);
+#else
+	return btVector3(
+			v1.m_floats[0] / v2.m_floats[0], 
+			v1.m_floats[1] / v2.m_floats[1],
+			v1.m_floats[2] / v2.m_floats[2]);
+#endif
 }
 
 /**@brief Return the dot product between two vectors */
@@ -488,22 +931,135 @@ SIMD_FORCE_INLINE btScalar btVector3::distance(const btVector3& v) const
 
 SIMD_FORCE_INLINE btVector3 btVector3::normalized() const
 {
+#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+	btVector3 norm = *this;
+
+	return norm.normalize();
+#else
 	return *this / length();
+#endif
 } 
 
-SIMD_FORCE_INLINE btVector3 btVector3::rotate( const btVector3& wAxis, const btScalar angle ) const
+SIMD_FORCE_INLINE btVector3 btVector3::rotate( const btVector3& wAxis, const btScalar _angle ) const
 {
 	// wAxis must be a unit lenght vector
 
+#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
+
+    __m128 O = _mm_mul_ps(wAxis.mVec128, mVec128);
+	btScalar ssin = btSin( _angle );
+    __m128 C = wAxis.cross( mVec128 ).mVec128;
+	O = _mm_and_ps(O, btvFFF0fMask);
+    btScalar scos = btCos( _angle );
+	
+	__m128 vsin = _mm_load_ss(&ssin);	//	(S 0 0 0)
+    __m128 vcos = _mm_load_ss(&scos);	//	(S 0 0 0)
+	
+	__m128 Y = bt_pshufd_ps(O, 0xC9);	//	(Y Z X 0)
+	__m128 Z = bt_pshufd_ps(O, 0xD2);	//	(Z X Y 0)
+	O = _mm_add_ps(O, Y);
+	vsin = bt_pshufd_ps(vsin, 0x80);	//	(S S S 0)
+	O = _mm_add_ps(O, Z);
+    vcos = bt_pshufd_ps(vcos, 0x80);	//	(S S S 0)
+	
+    vsin = vsin * C; 
+	O = O * wAxis.mVec128; 
+	__m128 X = mVec128 - O; 
+	
+    O = O + vsin;
+	vcos = vcos * X;
+	O = O + vcos;	
+	
+	return btVector3(O);
+#else
 	btVector3 o = wAxis * wAxis.dot( *this );
-	btVector3 x = *this - o;
-	btVector3 y;
+	btVector3 _x = *this - o;
+	btVector3 _y;
 
-	y = wAxis.cross( *this );
+	_y = wAxis.cross( *this );
 
-	return ( o + x * btCos( angle ) + y * btSin( angle ) );
+	return ( o + _x * btCos( _angle ) + _y * btSin( _angle ) );
+#endif
 }
 
+SIMD_FORCE_INLINE   long    btVector3::maxDot( const btVector3 *array, long array_count, btScalar &dotOut ) const
+{
+#if defined (BT_USE_SSE) || defined (BT_USE_NEON)
+    #if defined _WIN32 || defined (BT_USE_SSE)
+        const long scalar_cutoff = 10;
+        long _maxdot_large( const float *array, const float *vec, unsigned long array_count, float *dotOut );
+    #elif defined BT_USE_NEON
+        const long scalar_cutoff = 4;
+        extern long (*_maxdot_large)( const float *array, const float *vec, unsigned long array_count, float *dotOut );
+    #endif
+    if( array_count < scalar_cutoff )
+#else
+	
+#endif//BT_USE_SSE || BT_USE_NEON
+    {
+        btScalar maxDot = -SIMD_INFINITY;
+        int i = 0;
+        int ptIndex = -1;
+        for( i = 0; i < array_count; i++ )
+        {
+            btScalar dot = array[i].dot(*this);
+            
+            if( dot > maxDot )
+            {
+                maxDot = dot;
+                ptIndex = i;
+            }
+        }
+        
+        dotOut = maxDot;
+        return ptIndex;
+    }
+#if defined (BT_USE_SSE) || defined (BT_USE_NEON)
+    return _maxdot_large( (float*) array, (float*) &m_floats[0], array_count, &dotOut );
+#endif
+}
+
+SIMD_FORCE_INLINE   long    btVector3::minDot( const btVector3 *array, long array_count, btScalar &dotOut ) const
+{
+#if defined (BT_USE_SSE) || defined (BT_USE_NEON)
+    #if defined BT_USE_SSE
+        const long scalar_cutoff = 10;
+        long _mindot_large( const float *array, const float *vec, unsigned long array_count, float *dotOut );
+    #elif defined BT_USE_NEON
+        const long scalar_cutoff = 4;
+        extern long (*_mindot_large)( const float *array, const float *vec, unsigned long array_count, float *dotOut );
+    #else
+        #error unhandled arch!
+    #endif
+    
+    if( array_count < scalar_cutoff )
+#endif//BT_USE_SSE || BT_USE_NEON
+    {
+        btScalar  minDot = SIMD_INFINITY;
+        int i = 0;
+        int ptIndex = -1;
+        
+        for( i = 0; i < array_count; i++ )
+        {
+            btScalar dot = array[i].dot(*this);
+            
+            if( dot < minDot )
+            {
+                minDot = dot;
+                ptIndex = i;
+            }
+        }
+        
+        dotOut = minDot;
+        
+        return ptIndex;
+    }
+#if defined (BT_USE_SSE) || defined (BT_USE_NEON)
+    return _mindot_large( (float*) array, (float*) &m_floats[0], array_count, &dotOut );
+#endif
+}
+
+
 class btVector4 : public btVector3
 {
 public:
@@ -511,24 +1067,47 @@ public:
 	SIMD_FORCE_INLINE btVector4() {}
 
 
-	SIMD_FORCE_INLINE btVector4(const btScalar& x, const btScalar& y, const btScalar& z,const btScalar& w) 
-		: btVector3(x,y,z)
+	SIMD_FORCE_INLINE btVector4(const btScalar& _x, const btScalar& _y, const btScalar& _z,const btScalar& _w) 
+		: btVector3(_x,_y,_z)
 	{
-		m_floats[3] = w;
+		m_floats[3] = _w;
 	}
 
+#if (defined (BT_USE_SSE_IN_API)&& defined (BT_USE_SSE)) || defined (BT_USE_NEON) 
+	SIMD_FORCE_INLINE btVector4(const btSimdFloat4 vec)
+	{
+		mVec128 = vec;
+	}
+
+	SIMD_FORCE_INLINE btVector4(const btVector3& rhs)
+	{
+		mVec128 = rhs.mVec128;
+	}
+
+	SIMD_FORCE_INLINE btVector4& 
+	operator=(const btVector4& v) 
+	{
+		mVec128 = v.mVec128;
+		return *this;
+	}
+#endif // #if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON) 
 
 	SIMD_FORCE_INLINE btVector4 absolute4() const 
 	{
+#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE) 
+		return btVector4(_mm_and_ps(mVec128, btvAbsfMask));
+#elif defined(BT_USE_NEON)
+		return btVector4(vabsq_f32(mVec128));
+#else	
 		return btVector4(
 			btFabs(m_floats[0]), 
 			btFabs(m_floats[1]), 
 			btFabs(m_floats[2]),
 			btFabs(m_floats[3]));
+#endif
 	}
 
 
-
 	btScalar	getW() const { return m_floats[3];}
 
 
@@ -556,12 +1135,8 @@ public:
 			maxIndex = 3;
 			maxVal = m_floats[3];
 		}
-		
-		
-		
 
 		return maxIndex;
-
 	}
 
 
@@ -591,7 +1166,6 @@ public:
 		}
 		
 		return minIndex;
-
 	}
 
 
@@ -623,12 +1197,12 @@ public:
    * @param z Value of z
    * @param w Value of w
    */
-		SIMD_FORCE_INLINE void	setValue(const btScalar& x, const btScalar& y, const btScalar& z,const btScalar& w)
+		SIMD_FORCE_INLINE void	setValue(const btScalar& _x, const btScalar& _y, const btScalar& _z,const btScalar& _w)
 		{
-			m_floats[0]=x;
-			m_floats[1]=y;
-			m_floats[2]=z;
-			m_floats[3]=w;
+			m_floats[0]=_x;
+			m_floats[1]=_y;
+			m_floats[2]=_z;
+			m_floats[3]=_w;
 		}
 
 
@@ -762,5 +1336,4 @@ SIMD_FORCE_INLINE void	btVector3::deSerialize(const struct	btVector3Data& dataIn
 		m_floats[i] = dataIn.m_floats[i];
 }
 
-
 #endif //BT_VECTOR3_H
diff --git a/src/MiniCL/MiniCL.cpp b/src/MiniCL/MiniCL.cpp
index 24f6751fc..04e75f5c2 100644
--- a/src/MiniCL/MiniCL.cpp
+++ b/src/MiniCL/MiniCL.cpp
@@ -534,13 +534,15 @@ CL_API_ENTRY cl_kernel CL_API_CALL clCreateKernel(cl_program       program ,
                cl_int *         errcode_ret ) CL_API_SUFFIX__VERSION_1_0
 {
 	MiniCLTaskScheduler* scheduler = (MiniCLTaskScheduler*) program;
-	MiniCLKernel* kernel = new MiniCLKernel();
 	int nameLen = strlen(kernel_name);
 	if(nameLen >= MINI_CL_MAX_KERNEL_NAME)
 	{
 		*errcode_ret = CL_INVALID_KERNEL_NAME;
 		return NULL;
 	}
+
+	MiniCLKernel* kernel = new MiniCLKernel();
+
 	strcpy(kernel->m_name, kernel_name);
 	kernel->m_numArgs = 0;
 
@@ -556,6 +558,7 @@ CL_API_ENTRY cl_kernel CL_API_CALL clCreateKernel(cl_program       program ,
 	if(kernel->registerSelf() == NULL)
 	{
 		*errcode_ret = CL_INVALID_KERNEL_NAME;
+		delete kernel;
 		return NULL;
 	}
 	else
diff --git a/src/vectormath/neon/boolInVec.h b/src/vectormath/neon/boolInVec.h
new file mode 100644
index 000000000..ba16838c0
--- /dev/null
+++ b/src/vectormath/neon/boolInVec.h
@@ -0,0 +1,226 @@
+/*
+   Copyright (C) 2009 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+
+*/
+
+#ifndef _BOOLINVEC_H
+#define _BOOLINVEC_H
+
+#include <math.h>
+namespace Vectormath {
+
+class floatInVec;
+
+//--------------------------------------------------------------------------------------------------
+// boolInVec class
+//
+
+class boolInVec
+{
+private:
+    unsigned int mData;
+
+public:
+    // Default constructor; does no initialization
+    //
+    inline boolInVec( ) { };
+
+    // Construct from a value converted from float
+    //
+    inline boolInVec(floatInVec vec);
+
+    // Explicit cast from bool
+    //
+    explicit inline boolInVec(bool scalar);
+
+    // Explicit cast to bool
+    //
+    inline bool getAsBool() const;
+
+#ifndef _VECTORMATH_NO_SCALAR_CAST
+    // Implicit cast to bool
+    //
+    inline operator bool() const;
+#endif
+
+    // Boolean negation operator
+    //
+    inline const boolInVec operator ! () const;
+
+    // Assignment operator
+    //
+    inline boolInVec& operator = (boolInVec vec);
+
+    // Boolean and assignment operator
+    //
+    inline boolInVec& operator &= (boolInVec vec);
+
+    // Boolean exclusive or assignment operator
+    //
+    inline boolInVec& operator ^= (boolInVec vec);
+
+    // Boolean or assignment operator
+    //
+    inline boolInVec& operator |= (boolInVec vec);
+
+};
+
+// Equal operator
+//
+inline const boolInVec operator == (boolInVec vec0, boolInVec vec1);
+
+// Not equal operator
+//
+inline const boolInVec operator != (boolInVec vec0, boolInVec vec1);
+
+// And operator
+//
+inline const boolInVec operator & (boolInVec vec0, boolInVec vec1);
+
+// Exclusive or operator
+//
+inline const boolInVec operator ^ (boolInVec vec0, boolInVec vec1);
+
+// Or operator
+//
+inline const boolInVec operator | (boolInVec vec0, boolInVec vec1);
+
+// Conditionally select between two values
+//
+inline const boolInVec select(boolInVec vec0, boolInVec vec1, boolInVec select_vec1);
+
+
+} // namespace Vectormath
+
+
+//--------------------------------------------------------------------------------------------------
+// boolInVec implementation
+//
+
+#include "floatInVec.h"
+
+namespace Vectormath {
+
+inline
+boolInVec::boolInVec(floatInVec vec)
+{
+    *this = (vec != floatInVec(0.0f));
+}
+
+inline
+boolInVec::boolInVec(bool scalar)
+{
+    mData = -(int)scalar;
+}
+
+inline
+bool
+boolInVec::getAsBool() const
+{
+    return (mData > 0);
+}
+
+#ifndef _VECTORMATH_NO_SCALAR_CAST
+inline
+boolInVec::operator bool() const
+{
+    return getAsBool();
+}
+#endif
+
+inline
+const boolInVec
+boolInVec::operator ! () const
+{
+    return boolInVec(!mData);
+}
+
+inline
+boolInVec&
+boolInVec::operator = (boolInVec vec)
+{
+    mData = vec.mData;
+    return *this;
+}
+
+inline
+boolInVec&
+boolInVec::operator &= (boolInVec vec)
+{
+    *this = *this & vec;
+    return *this;
+}
+
+inline
+boolInVec&
+boolInVec::operator ^= (boolInVec vec)
+{
+    *this = *this ^ vec;
+    return *this;
+}
+
+inline
+boolInVec&
+boolInVec::operator |= (boolInVec vec)
+{
+    *this = *this | vec;
+    return *this;
+}
+
+inline
+const boolInVec
+operator == (boolInVec vec0, boolInVec vec1)
+{
+    return boolInVec(vec0.getAsBool() == vec1.getAsBool());
+}
+
+inline
+const boolInVec
+operator != (boolInVec vec0, boolInVec vec1)
+{
+    return !(vec0 == vec1);
+}
+
+inline
+const boolInVec
+operator & (boolInVec vec0, boolInVec vec1)
+{
+    return boolInVec(vec0.getAsBool() & vec1.getAsBool());
+}
+
+inline
+const boolInVec
+operator | (boolInVec vec0, boolInVec vec1)
+{
+    return boolInVec(vec0.getAsBool() | vec1.getAsBool());
+}
+
+inline
+const boolInVec
+operator ^ (boolInVec vec0, boolInVec vec1)
+{
+    return boolInVec(vec0.getAsBool() ^ vec1.getAsBool());
+}
+
+inline
+const boolInVec
+select(boolInVec vec0, boolInVec vec1, boolInVec select_vec1)
+{
+    return (select_vec1.getAsBool() == 0) ? vec0 : vec1;
+}
+
+} // namespace Vectormath
+
+#endif // boolInVec_h
+
diff --git a/src/vectormath/neon/floatInVec.h b/src/vectormath/neon/floatInVec.h
new file mode 100644
index 000000000..26147d22b
--- /dev/null
+++ b/src/vectormath/neon/floatInVec.h
@@ -0,0 +1,344 @@
+/*
+   Copyright (C) 2009 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+
+*/
+#ifndef _FLOATINVEC_H
+#define _FLOATINVEC_H
+
+#include <math.h>
+namespace Vectormath {
+
+class boolInVec;
+
+//--------------------------------------------------------------------------------------------------
+// floatInVec class
+//
+
+// A class representing a scalar float value contained in a vector register
+// This class does not support fastmath
+class floatInVec
+{
+private:
+    float mData;
+
+public:
+    // Default constructor; does no initialization
+    //
+    inline floatInVec( ) { };
+
+    // Construct from a value converted from bool
+    //
+    inline floatInVec(boolInVec vec);
+
+    // Explicit cast from float
+    //
+    explicit inline floatInVec(float scalar);
+
+    // Explicit cast to float
+    //
+    inline float getAsFloat() const;
+
+#ifndef _VECTORMATH_NO_SCALAR_CAST
+    // Implicit cast to float
+    //
+    inline operator float() const;
+#endif
+
+    // Post increment (add 1.0f)
+    //
+    inline const floatInVec operator ++ (int);
+
+    // Post decrement (subtract 1.0f)
+    //
+    inline const floatInVec operator -- (int);
+
+    // Pre increment (add 1.0f)
+    //
+    inline floatInVec& operator ++ ();
+
+    // Pre decrement (subtract 1.0f)
+    //
+    inline floatInVec& operator -- ();
+
+    // Negation operator
+    //
+    inline const floatInVec operator - () const;
+
+    // Assignment operator
+    //
+    inline floatInVec& operator = (floatInVec vec);
+
+    // Multiplication assignment operator
+    //
+    inline floatInVec& operator *= (floatInVec vec);
+
+    // Division assignment operator
+    //
+    inline floatInVec& operator /= (floatInVec vec);
+
+    // Addition assignment operator
+    //
+    inline floatInVec& operator += (floatInVec vec);
+
+    // Subtraction assignment operator
+    //
+    inline floatInVec& operator -= (floatInVec vec);
+
+};
+
+// Multiplication operator
+//
+inline const floatInVec operator * (floatInVec vec0, floatInVec vec1);
+
+// Division operator
+//
+inline const floatInVec operator / (floatInVec vec0, floatInVec vec1);
+
+// Addition operator
+//
+inline const floatInVec operator + (floatInVec vec0, floatInVec vec1);
+
+// Subtraction operator
+//
+inline const floatInVec operator - (floatInVec vec0, floatInVec vec1);
+
+// Less than operator
+//
+inline const boolInVec operator < (floatInVec vec0, floatInVec vec1);
+
+// Less than or equal operator
+//
+inline const boolInVec operator <= (floatInVec vec0, floatInVec vec1);
+
+// Greater than operator
+//
+inline const boolInVec operator > (floatInVec vec0, floatInVec vec1);
+
+// Greater than or equal operator
+//
+inline const boolInVec operator >= (floatInVec vec0, floatInVec vec1);
+
+// Equal operator
+//
+inline const boolInVec operator == (floatInVec vec0, floatInVec vec1);
+
+// Not equal operator
+//
+inline const boolInVec operator != (floatInVec vec0, floatInVec vec1);
+
+// Conditionally select between two values
+//
+inline const floatInVec select(floatInVec vec0, floatInVec vec1, boolInVec select_vec1);
+
+
+} // namespace Vectormath
+
+
+//--------------------------------------------------------------------------------------------------
+// floatInVec implementation
+//
+
+#include "boolInVec.h"
+
+namespace Vectormath {
+
+inline
+floatInVec::floatInVec(boolInVec vec)
+{
+    mData = float(vec.getAsBool());
+}
+
+inline
+floatInVec::floatInVec(float scalar)
+{
+    mData = scalar;
+}
+
+inline
+float
+floatInVec::getAsFloat() const
+{
+    return mData;
+}
+
+#ifndef _VECTORMATH_NO_SCALAR_CAST
+inline
+floatInVec::operator float() const
+{
+    return getAsFloat();
+}
+#endif
+
+inline
+const floatInVec
+floatInVec::operator ++ (int)
+{
+    float olddata = mData;
+    operator ++();
+    return floatInVec(olddata);
+}
+
+inline
+const floatInVec
+floatInVec::operator -- (int)
+{
+    float olddata = mData;
+    operator --();
+    return floatInVec(olddata);
+}
+
+inline
+floatInVec&
+floatInVec::operator ++ ()
+{
+    *this += floatInVec(1.0f);
+    return *this;
+}
+
+inline
+floatInVec&
+floatInVec::operator -- ()
+{
+    *this -= floatInVec(1.0f);
+    return *this;
+}
+
+inline
+const floatInVec
+floatInVec::operator - () const
+{
+    return floatInVec(-mData);
+}
+
+inline
+floatInVec&
+floatInVec::operator = (floatInVec vec)
+{
+    mData = vec.mData;
+    return *this;
+}
+
+inline
+floatInVec&
+floatInVec::operator *= (floatInVec vec)
+{
+    *this = *this * vec;
+    return *this;
+}
+
+inline
+floatInVec&
+floatInVec::operator /= (floatInVec vec)
+{
+    *this = *this / vec;
+    return *this;
+}
+
+inline
+floatInVec&
+floatInVec::operator += (floatInVec vec)
+{
+    *this = *this + vec;
+    return *this;
+}
+
+inline
+floatInVec&
+floatInVec::operator -= (floatInVec vec)
+{
+    *this = *this - vec;
+    return *this;
+}
+
+inline
+const floatInVec
+operator * (floatInVec vec0, floatInVec vec1)
+{
+    return floatInVec(vec0.getAsFloat() * vec1.getAsFloat());
+}
+
+inline
+const floatInVec
+operator / (floatInVec num, floatInVec den)
+{
+    return floatInVec(num.getAsFloat() / den.getAsFloat());
+}
+
+inline
+const floatInVec
+operator + (floatInVec vec0, floatInVec vec1)
+{
+    return floatInVec(vec0.getAsFloat() + vec1.getAsFloat());
+}
+
+inline
+const floatInVec
+operator - (floatInVec vec0, floatInVec vec1)
+{
+    return floatInVec(vec0.getAsFloat() - vec1.getAsFloat());
+}
+
+inline
+const boolInVec
+operator < (floatInVec vec0, floatInVec vec1)
+{
+    return boolInVec(vec0.getAsFloat() < vec1.getAsFloat());
+}
+
+inline
+const boolInVec
+operator <= (floatInVec vec0, floatInVec vec1)
+{
+    return !(vec0 > vec1);
+}
+
+inline
+const boolInVec
+operator > (floatInVec vec0, floatInVec vec1)
+{
+    return boolInVec(vec0.getAsFloat() > vec1.getAsFloat());
+}
+
+inline
+const boolInVec
+operator >= (floatInVec vec0, floatInVec vec1)
+{
+    return !(vec0 < vec1);
+}
+
+inline
+const boolInVec
+operator == (floatInVec vec0, floatInVec vec1)
+{
+    return boolInVec(vec0.getAsFloat() == vec1.getAsFloat());
+}
+
+inline
+const boolInVec
+operator != (floatInVec vec0, floatInVec vec1)
+{
+    return !(vec0 == vec1);
+}
+
+inline
+const floatInVec
+select(floatInVec vec0, floatInVec vec1, boolInVec select_vec1)
+{
+    return (select_vec1.getAsBool() == 0) ? vec0 : vec1;
+}
+
+} // namespace Vectormath
+
+#endif // floatInVec_h
+
diff --git a/src/vectormath/neon/mat_aos.h b/src/vectormath/neon/mat_aos.h
new file mode 100644
index 000000000..e61f601c3
--- /dev/null
+++ b/src/vectormath/neon/mat_aos.h
@@ -0,0 +1,1631 @@
+/*
+   Copyright (C) 2009 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+
+*/
+
+#ifndef _VECTORMATH_MAT_AOS_CPP_H
+#define _VECTORMATH_MAT_AOS_CPP_H
+
+namespace Vectormath {
+namespace Aos {
+
+//-----------------------------------------------------------------------------
+// Constants
+
+#define _VECTORMATH_PI_OVER_2 1.570796327f
+
+//-----------------------------------------------------------------------------
+// Definitions
+
+inline Matrix3::Matrix3( const Matrix3 & mat )
+{
+    mCol0 = mat.mCol0;
+    mCol1 = mat.mCol1;
+    mCol2 = mat.mCol2;
+}
+
+inline Matrix3::Matrix3( float scalar )
+{
+    mCol0 = Vector3( scalar );
+    mCol1 = Vector3( scalar );
+    mCol2 = Vector3( scalar );
+}
+
+inline Matrix3::Matrix3( const Quat & unitQuat )
+{
+    float qx, qy, qz, qw, qx2, qy2, qz2, qxqx2, qyqy2, qzqz2, qxqy2, qyqz2, qzqw2, qxqz2, qyqw2, qxqw2;
+    qx = unitQuat.getX();
+    qy = unitQuat.getY();
+    qz = unitQuat.getZ();
+    qw = unitQuat.getW();
+    qx2 = ( qx + qx );
+    qy2 = ( qy + qy );
+    qz2 = ( qz + qz );
+    qxqx2 = ( qx * qx2 );
+    qxqy2 = ( qx * qy2 );
+    qxqz2 = ( qx * qz2 );
+    qxqw2 = ( qw * qx2 );
+    qyqy2 = ( qy * qy2 );
+    qyqz2 = ( qy * qz2 );
+    qyqw2 = ( qw * qy2 );
+    qzqz2 = ( qz * qz2 );
+    qzqw2 = ( qw * qz2 );
+    mCol0 = Vector3( ( ( 1.0f - qyqy2 ) - qzqz2 ), ( qxqy2 + qzqw2 ), ( qxqz2 - qyqw2 ) );
+    mCol1 = Vector3( ( qxqy2 - qzqw2 ), ( ( 1.0f - qxqx2 ) - qzqz2 ), ( qyqz2 + qxqw2 ) );
+    mCol2 = Vector3( ( qxqz2 + qyqw2 ), ( qyqz2 - qxqw2 ), ( ( 1.0f - qxqx2 ) - qyqy2 ) );
+}
+
+inline Matrix3::Matrix3( const Vector3 & _col0, const Vector3 & _col1, const Vector3 & _col2 )
+{
+    mCol0 = _col0;
+    mCol1 = _col1;
+    mCol2 = _col2;
+}
+
+inline Matrix3 & Matrix3::setCol0( const Vector3 & _col0 )
+{
+    mCol0 = _col0;
+    return *this;
+}
+
+inline Matrix3 & Matrix3::setCol1( const Vector3 & _col1 )
+{
+    mCol1 = _col1;
+    return *this;
+}
+
+inline Matrix3 & Matrix3::setCol2( const Vector3 & _col2 )
+{
+    mCol2 = _col2;
+    return *this;
+}
+
+inline Matrix3 & Matrix3::setCol( int col, const Vector3 & vec )
+{
+    *(&mCol0 + col) = vec;
+    return *this;
+}
+
+inline Matrix3 & Matrix3::setRow( int row, const Vector3 & vec )
+{
+    mCol0.setElem( row, vec.getElem( 0 ) );
+    mCol1.setElem( row, vec.getElem( 1 ) );
+    mCol2.setElem( row, vec.getElem( 2 ) );
+    return *this;
+}
+
+inline Matrix3 & Matrix3::setElem( int col, int row, float val )
+{
+    Vector3 tmpV3_0;
+    tmpV3_0 = this->getCol( col );
+    tmpV3_0.setElem( row, val );
+    this->setCol( col, tmpV3_0 );
+    return *this;
+}
+
+inline float Matrix3::getElem( int col, int row ) const
+{
+    return this->getCol( col ).getElem( row );
+}
+
+inline const Vector3 Matrix3::getCol0( ) const
+{
+    return mCol0;
+}
+
+inline const Vector3 Matrix3::getCol1( ) const
+{
+    return mCol1;
+}
+
+inline const Vector3 Matrix3::getCol2( ) const
+{
+    return mCol2;
+}
+
+inline const Vector3 Matrix3::getCol( int col ) const
+{
+    return *(&mCol0 + col);
+}
+
+inline const Vector3 Matrix3::getRow( int row ) const
+{
+    return Vector3( mCol0.getElem( row ), mCol1.getElem( row ), mCol2.getElem( row ) );
+}
+
+inline Vector3 & Matrix3::operator []( int col )
+{
+    return *(&mCol0 + col);
+}
+
+inline const Vector3 Matrix3::operator []( int col ) const
+{
+    return *(&mCol0 + col);
+}
+
+inline Matrix3 & Matrix3::operator =( const Matrix3 & mat )
+{
+    mCol0 = mat.mCol0;
+    mCol1 = mat.mCol1;
+    mCol2 = mat.mCol2;
+    return *this;
+}
+
+inline const Matrix3 transpose( const Matrix3 & mat )
+{
+    return Matrix3(
+        Vector3( mat.getCol0().getX(), mat.getCol1().getX(), mat.getCol2().getX() ),
+        Vector3( mat.getCol0().getY(), mat.getCol1().getY(), mat.getCol2().getY() ),
+        Vector3( mat.getCol0().getZ(), mat.getCol1().getZ(), mat.getCol2().getZ() )
+    );
+}
+
+inline const Matrix3 inverse( const Matrix3 & mat )
+{
+    Vector3 tmp0, tmp1, tmp2;
+    float detinv;
+    tmp0 = cross( mat.getCol1(), mat.getCol2() );
+    tmp1 = cross( mat.getCol2(), mat.getCol0() );
+    tmp2 = cross( mat.getCol0(), mat.getCol1() );
+    detinv = ( 1.0f / dot( mat.getCol2(), tmp2 ) );
+    return Matrix3(
+        Vector3( ( tmp0.getX() * detinv ), ( tmp1.getX() * detinv ), ( tmp2.getX() * detinv ) ),
+        Vector3( ( tmp0.getY() * detinv ), ( tmp1.getY() * detinv ), ( tmp2.getY() * detinv ) ),
+        Vector3( ( tmp0.getZ() * detinv ), ( tmp1.getZ() * detinv ), ( tmp2.getZ() * detinv ) )
+    );
+}
+
+inline float determinant( const Matrix3 & mat )
+{
+    return dot( mat.getCol2(), cross( mat.getCol0(), mat.getCol1() ) );
+}
+
+inline const Matrix3 Matrix3::operator +( const Matrix3 & mat ) const
+{
+    return Matrix3(
+        ( mCol0 + mat.mCol0 ),
+        ( mCol1 + mat.mCol1 ),
+        ( mCol2 + mat.mCol2 )
+    );
+}
+
+inline const Matrix3 Matrix3::operator -( const Matrix3 & mat ) const
+{
+    return Matrix3(
+        ( mCol0 - mat.mCol0 ),
+        ( mCol1 - mat.mCol1 ),
+        ( mCol2 - mat.mCol2 )
+    );
+}
+
+inline Matrix3 & Matrix3::operator +=( const Matrix3 & mat )
+{
+    *this = *this + mat;
+    return *this;
+}
+
+inline Matrix3 & Matrix3::operator -=( const Matrix3 & mat )
+{
+    *this = *this - mat;
+    return *this;
+}
+
+inline const Matrix3 Matrix3::operator -( ) const
+{
+    return Matrix3(
+        ( -mCol0 ),
+        ( -mCol1 ),
+        ( -mCol2 )
+    );
+}
+
+inline const Matrix3 absPerElem( const Matrix3 & mat )
+{
+    return Matrix3(
+        absPerElem( mat.getCol0() ),
+        absPerElem( mat.getCol1() ),
+        absPerElem( mat.getCol2() )
+    );
+}
+
+inline const Matrix3 Matrix3::operator *( float scalar ) const
+{
+    return Matrix3(
+        ( mCol0 * scalar ),
+        ( mCol1 * scalar ),
+        ( mCol2 * scalar )
+    );
+}
+
+inline Matrix3 & Matrix3::operator *=( float scalar )
+{
+    *this = *this * scalar;
+    return *this;
+}
+
+inline const Matrix3 operator *( float scalar, const Matrix3 & mat )
+{
+    return mat * scalar;
+}
+
+inline const Vector3 Matrix3::operator *( const Vector3 & vec ) const
+{
+    return Vector3(
+        ( ( ( mCol0.getX() * vec.getX() ) + ( mCol1.getX() * vec.getY() ) ) + ( mCol2.getX() * vec.getZ() ) ),
+        ( ( ( mCol0.getY() * vec.getX() ) + ( mCol1.getY() * vec.getY() ) ) + ( mCol2.getY() * vec.getZ() ) ),
+        ( ( ( mCol0.getZ() * vec.getX() ) + ( mCol1.getZ() * vec.getY() ) ) + ( mCol2.getZ() * vec.getZ() ) )
+    );
+}
+
+inline const Matrix3 Matrix3::operator *( const Matrix3 & mat ) const
+{
+    return Matrix3(
+        ( *this * mat.mCol0 ),
+        ( *this * mat.mCol1 ),
+        ( *this * mat.mCol2 )
+    );
+}
+
+inline Matrix3 & Matrix3::operator *=( const Matrix3 & mat )
+{
+    *this = *this * mat;
+    return *this;
+}
+
+inline const Matrix3 mulPerElem( const Matrix3 & mat0, const Matrix3 & mat1 )
+{
+    return Matrix3(
+        mulPerElem( mat0.getCol0(), mat1.getCol0() ),
+        mulPerElem( mat0.getCol1(), mat1.getCol1() ),
+        mulPerElem( mat0.getCol2(), mat1.getCol2() )
+    );
+}
+
+inline const Matrix3 Matrix3::identity( )
+{
+    return Matrix3(
+        Vector3::xAxis( ),
+        Vector3::yAxis( ),
+        Vector3::zAxis( )
+    );
+}
+
+inline const Matrix3 Matrix3::rotationX( float radians )
+{
+    float s, c;
+    s = sinf( radians );
+    c = cosf( radians );
+    return Matrix3(
+        Vector3::xAxis( ),
+        Vector3( 0.0f, c, s ),
+        Vector3( 0.0f, -s, c )
+    );
+}
+
+inline const Matrix3 Matrix3::rotationY( float radians )
+{
+    float s, c;
+    s = sinf( radians );
+    c = cosf( radians );
+    return Matrix3(
+        Vector3( c, 0.0f, -s ),
+        Vector3::yAxis( ),
+        Vector3( s, 0.0f, c )
+    );
+}
+
+inline const Matrix3 Matrix3::rotationZ( float radians )
+{
+    float s, c;
+    s = sinf( radians );
+    c = cosf( radians );
+    return Matrix3(
+        Vector3( c, s, 0.0f ),
+        Vector3( -s, c, 0.0f ),
+        Vector3::zAxis( )
+    );
+}
+
+inline const Matrix3 Matrix3::rotationZYX( const Vector3 & radiansXYZ )
+{
+    float sX, cX, sY, cY, sZ, cZ, tmp0, tmp1;
+    sX = sinf( radiansXYZ.getX() );
+    cX = cosf( radiansXYZ.getX() );
+    sY = sinf( radiansXYZ.getY() );
+    cY = cosf( radiansXYZ.getY() );
+    sZ = sinf( radiansXYZ.getZ() );
+    cZ = cosf( radiansXYZ.getZ() );
+    tmp0 = ( cZ * sY );
+    tmp1 = ( sZ * sY );
+    return Matrix3(
+        Vector3( ( cZ * cY ), ( sZ * cY ), -sY ),
+        Vector3( ( ( tmp0 * sX ) - ( sZ * cX ) ), ( ( tmp1 * sX ) + ( cZ * cX ) ), ( cY * sX ) ),
+        Vector3( ( ( tmp0 * cX ) + ( sZ * sX ) ), ( ( tmp1 * cX ) - ( cZ * sX ) ), ( cY * cX ) )
+    );
+}
+
+inline const Matrix3 Matrix3::rotation( float radians, const Vector3 & unitVec )
+{
+    float x, y, z, s, c, oneMinusC, xy, yz, zx;
+    s = sinf( radians );
+    c = cosf( radians );
+    x = unitVec.getX();
+    y = unitVec.getY();
+    z = unitVec.getZ();
+    xy = ( x * y );
+    yz = ( y * z );
+    zx = ( z * x );
+    oneMinusC = ( 1.0f - c );
+    return Matrix3(
+        Vector3( ( ( ( x * x ) * oneMinusC ) + c ), ( ( xy * oneMinusC ) + ( z * s ) ), ( ( zx * oneMinusC ) - ( y * s ) ) ),
+        Vector3( ( ( xy * oneMinusC ) - ( z * s ) ), ( ( ( y * y ) * oneMinusC ) + c ), ( ( yz * oneMinusC ) + ( x * s ) ) ),
+        Vector3( ( ( zx * oneMinusC ) + ( y * s ) ), ( ( yz * oneMinusC ) - ( x * s ) ), ( ( ( z * z ) * oneMinusC ) + c ) )
+    );
+}
+
+inline const Matrix3 Matrix3::rotation( const Quat & unitQuat )
+{
+    return Matrix3( unitQuat );
+}
+
+inline const Matrix3 Matrix3::scale( const Vector3 & scaleVec )
+{
+    return Matrix3(
+        Vector3( scaleVec.getX(), 0.0f, 0.0f ),
+        Vector3( 0.0f, scaleVec.getY(), 0.0f ),
+        Vector3( 0.0f, 0.0f, scaleVec.getZ() )
+    );
+}
+
+inline const Matrix3 appendScale( const Matrix3 & mat, const Vector3 & scaleVec )
+{
+    return Matrix3(
+        ( mat.getCol0() * scaleVec.getX( ) ),
+        ( mat.getCol1() * scaleVec.getY( ) ),
+        ( mat.getCol2() * scaleVec.getZ( ) )
+    );
+}
+
+inline const Matrix3 prependScale( const Vector3 & scaleVec, const Matrix3 & mat )
+{
+    return Matrix3(
+        mulPerElem( mat.getCol0(), scaleVec ),
+        mulPerElem( mat.getCol1(), scaleVec ),
+        mulPerElem( mat.getCol2(), scaleVec )
+    );
+}
+
+inline const Matrix3 select( const Matrix3 & mat0, const Matrix3 & mat1, bool select1 )
+{
+    return Matrix3(
+        select( mat0.getCol0(), mat1.getCol0(), select1 ),
+        select( mat0.getCol1(), mat1.getCol1(), select1 ),
+        select( mat0.getCol2(), mat1.getCol2(), select1 )
+    );
+}
+
+#ifdef _VECTORMATH_DEBUG
+
+inline void print( const Matrix3 & mat )
+{
+    print( mat.getRow( 0 ) );
+    print( mat.getRow( 1 ) );
+    print( mat.getRow( 2 ) );
+}
+
+inline void print( const Matrix3 & mat, const char * name )
+{
+    printf("%s:\n", name);
+    print( mat );
+}
+
+#endif
+
+inline Matrix4::Matrix4( const Matrix4 & mat )
+{
+    mCol0 = mat.mCol0;
+    mCol1 = mat.mCol1;
+    mCol2 = mat.mCol2;
+    mCol3 = mat.mCol3;
+}
+
+inline Matrix4::Matrix4( float scalar )
+{
+    mCol0 = Vector4( scalar );
+    mCol1 = Vector4( scalar );
+    mCol2 = Vector4( scalar );
+    mCol3 = Vector4( scalar );
+}
+
+inline Matrix4::Matrix4( const Transform3 & mat )
+{
+    mCol0 = Vector4( mat.getCol0(), 0.0f );
+    mCol1 = Vector4( mat.getCol1(), 0.0f );
+    mCol2 = Vector4( mat.getCol2(), 0.0f );
+    mCol3 = Vector4( mat.getCol3(), 1.0f );
+}
+
+inline Matrix4::Matrix4( const Vector4 & _col0, const Vector4 & _col1, const Vector4 & _col2, const Vector4 & _col3 )
+{
+    mCol0 = _col0;
+    mCol1 = _col1;
+    mCol2 = _col2;
+    mCol3 = _col3;
+}
+
+inline Matrix4::Matrix4( const Matrix3 & mat, const Vector3 & translateVec )
+{
+    mCol0 = Vector4( mat.getCol0(), 0.0f );
+    mCol1 = Vector4( mat.getCol1(), 0.0f );
+    mCol2 = Vector4( mat.getCol2(), 0.0f );
+    mCol3 = Vector4( translateVec, 1.0f );
+}
+
+inline Matrix4::Matrix4( const Quat & unitQuat, const Vector3 & translateVec )
+{
+    Matrix3 mat;
+    mat = Matrix3( unitQuat );
+    mCol0 = Vector4( mat.getCol0(), 0.0f );
+    mCol1 = Vector4( mat.getCol1(), 0.0f );
+    mCol2 = Vector4( mat.getCol2(), 0.0f );
+    mCol3 = Vector4( translateVec, 1.0f );
+}
+
+inline Matrix4 & Matrix4::setCol0( const Vector4 & _col0 )
+{
+    mCol0 = _col0;
+    return *this;
+}
+
+inline Matrix4 & Matrix4::setCol1( const Vector4 & _col1 )
+{
+    mCol1 = _col1;
+    return *this;
+}
+
+inline Matrix4 & Matrix4::setCol2( const Vector4 & _col2 )
+{
+    mCol2 = _col2;
+    return *this;
+}
+
+inline Matrix4 & Matrix4::setCol3( const Vector4 & _col3 )
+{
+    mCol3 = _col3;
+    return *this;
+}
+
+inline Matrix4 & Matrix4::setCol( int col, const Vector4 & vec )
+{
+    *(&mCol0 + col) = vec;
+    return *this;
+}
+
+inline Matrix4 & Matrix4::setRow( int row, const Vector4 & vec )
+{
+    mCol0.setElem( row, vec.getElem( 0 ) );
+    mCol1.setElem( row, vec.getElem( 1 ) );
+    mCol2.setElem( row, vec.getElem( 2 ) );
+    mCol3.setElem( row, vec.getElem( 3 ) );
+    return *this;
+}
+
+inline Matrix4 & Matrix4::setElem( int col, int row, float val )
+{
+    Vector4 tmpV3_0;
+    tmpV3_0 = this->getCol( col );
+    tmpV3_0.setElem( row, val );
+    this->setCol( col, tmpV3_0 );
+    return *this;
+}
+
+inline float Matrix4::getElem( int col, int row ) const
+{
+    return this->getCol( col ).getElem( row );
+}
+
+inline const Vector4 Matrix4::getCol0( ) const
+{
+    return mCol0;
+}
+
+inline const Vector4 Matrix4::getCol1( ) const
+{
+    return mCol1;
+}
+
+inline const Vector4 Matrix4::getCol2( ) const
+{
+    return mCol2;
+}
+
+inline const Vector4 Matrix4::getCol3( ) const
+{
+    return mCol3;
+}
+
+inline const Vector4 Matrix4::getCol( int col ) const
+{
+    return *(&mCol0 + col);
+}
+
+inline const Vector4 Matrix4::getRow( int row ) const
+{
+    return Vector4( mCol0.getElem( row ), mCol1.getElem( row ), mCol2.getElem( row ), mCol3.getElem( row ) );
+}
+
+inline Vector4 & Matrix4::operator []( int col )
+{
+    return *(&mCol0 + col);
+}
+
+inline const Vector4 Matrix4::operator []( int col ) const
+{
+    return *(&mCol0 + col);
+}
+
+inline Matrix4 & Matrix4::operator =( const Matrix4 & mat )
+{
+    mCol0 = mat.mCol0;
+    mCol1 = mat.mCol1;
+    mCol2 = mat.mCol2;
+    mCol3 = mat.mCol3;
+    return *this;
+}
+
+inline const Matrix4 transpose( const Matrix4 & mat )
+{
+    return Matrix4(
+        Vector4( mat.getCol0().getX(), mat.getCol1().getX(), mat.getCol2().getX(), mat.getCol3().getX() ),
+        Vector4( mat.getCol0().getY(), mat.getCol1().getY(), mat.getCol2().getY(), mat.getCol3().getY() ),
+        Vector4( mat.getCol0().getZ(), mat.getCol1().getZ(), mat.getCol2().getZ(), mat.getCol3().getZ() ),
+        Vector4( mat.getCol0().getW(), mat.getCol1().getW(), mat.getCol2().getW(), mat.getCol3().getW() )
+    );
+}
+
+inline const Matrix4 inverse( const Matrix4 & mat )
+{
+    Vector4 res0, res1, res2, res3;
+    float mA, mB, mC, mD, mE, mF, mG, mH, mI, mJ, mK, mL, mM, mN, mO, mP, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, detInv;
+    mA = mat.getCol0().getX();
+    mB = mat.getCol0().getY();
+    mC = mat.getCol0().getZ();
+    mD = mat.getCol0().getW();
+    mE = mat.getCol1().getX();
+    mF = mat.getCol1().getY();
+    mG = mat.getCol1().getZ();
+    mH = mat.getCol1().getW();
+    mI = mat.getCol2().getX();
+    mJ = mat.getCol2().getY();
+    mK = mat.getCol2().getZ();
+    mL = mat.getCol2().getW();
+    mM = mat.getCol3().getX();
+    mN = mat.getCol3().getY();
+    mO = mat.getCol3().getZ();
+    mP = mat.getCol3().getW();
+    tmp0 = ( ( mK * mD ) - ( mC * mL ) );
+    tmp1 = ( ( mO * mH ) - ( mG * mP ) );
+    tmp2 = ( ( mB * mK ) - ( mJ * mC ) );
+    tmp3 = ( ( mF * mO ) - ( mN * mG ) );
+    tmp4 = ( ( mJ * mD ) - ( mB * mL ) );
+    tmp5 = ( ( mN * mH ) - ( mF * mP ) );
+    res0.setX( ( ( ( mJ * tmp1 ) - ( mL * tmp3 ) ) - ( mK * tmp5 ) ) );
+    res0.setY( ( ( ( mN * tmp0 ) - ( mP * tmp2 ) ) - ( mO * tmp4 ) ) );
+    res0.setZ( ( ( ( mD * tmp3 ) + ( mC * tmp5 ) ) - ( mB * tmp1 ) ) );
+    res0.setW( ( ( ( mH * tmp2 ) + ( mG * tmp4 ) ) - ( mF * tmp0 ) ) );
+    detInv = ( 1.0f / ( ( ( ( mA * res0.getX() ) + ( mE * res0.getY() ) ) + ( mI * res0.getZ() ) ) + ( mM * res0.getW() ) ) );
+    res1.setX( ( mI * tmp1 ) );
+    res1.setY( ( mM * tmp0 ) );
+    res1.setZ( ( mA * tmp1 ) );
+    res1.setW( ( mE * tmp0 ) );
+    res3.setX( ( mI * tmp3 ) );
+    res3.setY( ( mM * tmp2 ) );
+    res3.setZ( ( mA * tmp3 ) );
+    res3.setW( ( mE * tmp2 ) );
+    res2.setX( ( mI * tmp5 ) );
+    res2.setY( ( mM * tmp4 ) );
+    res2.setZ( ( mA * tmp5 ) );
+    res2.setW( ( mE * tmp4 ) );
+    tmp0 = ( ( mI * mB ) - ( mA * mJ ) );
+    tmp1 = ( ( mM * mF ) - ( mE * mN ) );
+    tmp2 = ( ( mI * mD ) - ( mA * mL ) );
+    tmp3 = ( ( mM * mH ) - ( mE * mP ) );
+    tmp4 = ( ( mI * mC ) - ( mA * mK ) );
+    tmp5 = ( ( mM * mG ) - ( mE * mO ) );
+    res2.setX( ( ( ( mL * tmp1 ) - ( mJ * tmp3 ) ) + res2.getX() ) );
+    res2.setY( ( ( ( mP * tmp0 ) - ( mN * tmp2 ) ) + res2.getY() ) );
+    res2.setZ( ( ( ( mB * tmp3 ) - ( mD * tmp1 ) ) - res2.getZ() ) );
+    res2.setW( ( ( ( mF * tmp2 ) - ( mH * tmp0 ) ) - res2.getW() ) );
+    res3.setX( ( ( ( mJ * tmp5 ) - ( mK * tmp1 ) ) + res3.getX() ) );
+    res3.setY( ( ( ( mN * tmp4 ) - ( mO * tmp0 ) ) + res3.getY() ) );
+    res3.setZ( ( ( ( mC * tmp1 ) - ( mB * tmp5 ) ) - res3.getZ() ) );
+    res3.setW( ( ( ( mG * tmp0 ) - ( mF * tmp4 ) ) - res3.getW() ) );
+    res1.setX( ( ( ( mK * tmp3 ) - ( mL * tmp5 ) ) - res1.getX() ) );
+    res1.setY( ( ( ( mO * tmp2 ) - ( mP * tmp4 ) ) - res1.getY() ) );
+    res1.setZ( ( ( ( mD * tmp5 ) - ( mC * tmp3 ) ) + res1.getZ() ) );
+    res1.setW( ( ( ( mH * tmp4 ) - ( mG * tmp2 ) ) + res1.getW() ) );
+    return Matrix4(
+        ( res0 * detInv ),
+        ( res1 * detInv ),
+        ( res2 * detInv ),
+        ( res3 * detInv )
+    );
+}
+
+inline const Matrix4 affineInverse( const Matrix4 & mat )
+{
+    Transform3 affineMat;
+    affineMat.setCol0( mat.getCol0().getXYZ( ) );
+    affineMat.setCol1( mat.getCol1().getXYZ( ) );
+    affineMat.setCol2( mat.getCol2().getXYZ( ) );
+    affineMat.setCol3( mat.getCol3().getXYZ( ) );
+    return Matrix4( inverse( affineMat ) );
+}
+
+inline const Matrix4 orthoInverse( const Matrix4 & mat )
+{
+    Transform3 affineMat;
+    affineMat.setCol0( mat.getCol0().getXYZ( ) );
+    affineMat.setCol1( mat.getCol1().getXYZ( ) );
+    affineMat.setCol2( mat.getCol2().getXYZ( ) );
+    affineMat.setCol3( mat.getCol3().getXYZ( ) );
+    return Matrix4( orthoInverse( affineMat ) );
+}
+
+inline float determinant( const Matrix4 & mat )
+{
+    float dx, dy, dz, dw, mA, mB, mC, mD, mE, mF, mG, mH, mI, mJ, mK, mL, mM, mN, mO, mP, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+    mA = mat.getCol0().getX();
+    mB = mat.getCol0().getY();
+    mC = mat.getCol0().getZ();
+    mD = mat.getCol0().getW();
+    mE = mat.getCol1().getX();
+    mF = mat.getCol1().getY();
+    mG = mat.getCol1().getZ();
+    mH = mat.getCol1().getW();
+    mI = mat.getCol2().getX();
+    mJ = mat.getCol2().getY();
+    mK = mat.getCol2().getZ();
+    mL = mat.getCol2().getW();
+    mM = mat.getCol3().getX();
+    mN = mat.getCol3().getY();
+    mO = mat.getCol3().getZ();
+    mP = mat.getCol3().getW();
+    tmp0 = ( ( mK * mD ) - ( mC * mL ) );
+    tmp1 = ( ( mO * mH ) - ( mG * mP ) );
+    tmp2 = ( ( mB * mK ) - ( mJ * mC ) );
+    tmp3 = ( ( mF * mO ) - ( mN * mG ) );
+    tmp4 = ( ( mJ * mD ) - ( mB * mL ) );
+    tmp5 = ( ( mN * mH ) - ( mF * mP ) );
+    dx = ( ( ( mJ * tmp1 ) - ( mL * tmp3 ) ) - ( mK * tmp5 ) );
+    dy = ( ( ( mN * tmp0 ) - ( mP * tmp2 ) ) - ( mO * tmp4 ) );
+    dz = ( ( ( mD * tmp3 ) + ( mC * tmp5 ) ) - ( mB * tmp1 ) );
+    dw = ( ( ( mH * tmp2 ) + ( mG * tmp4 ) ) - ( mF * tmp0 ) );
+    return ( ( ( ( mA * dx ) + ( mE * dy ) ) + ( mI * dz ) ) + ( mM * dw ) );
+}
+
+inline const Matrix4 Matrix4::operator +( const Matrix4 & mat ) const
+{
+    return Matrix4(
+        ( mCol0 + mat.mCol0 ),
+        ( mCol1 + mat.mCol1 ),
+        ( mCol2 + mat.mCol2 ),
+        ( mCol3 + mat.mCol3 )
+    );
+}
+
+inline const Matrix4 Matrix4::operator -( const Matrix4 & mat ) const
+{
+    return Matrix4(
+        ( mCol0 - mat.mCol0 ),
+        ( mCol1 - mat.mCol1 ),
+        ( mCol2 - mat.mCol2 ),
+        ( mCol3 - mat.mCol3 )
+    );
+}
+
+inline Matrix4 & Matrix4::operator +=( const Matrix4 & mat )
+{
+    *this = *this + mat;
+    return *this;
+}
+
+inline Matrix4 & Matrix4::operator -=( const Matrix4 & mat )
+{
+    *this = *this - mat;
+    return *this;
+}
+
+inline const Matrix4 Matrix4::operator -( ) const
+{
+    return Matrix4(
+        ( -mCol0 ),
+        ( -mCol1 ),
+        ( -mCol2 ),
+        ( -mCol3 )
+    );
+}
+
+inline const Matrix4 absPerElem( const Matrix4 & mat )
+{
+    return Matrix4(
+        absPerElem( mat.getCol0() ),
+        absPerElem( mat.getCol1() ),
+        absPerElem( mat.getCol2() ),
+        absPerElem( mat.getCol3() )
+    );
+}
+
+inline const Matrix4 Matrix4::operator *( float scalar ) const
+{
+    return Matrix4(
+        ( mCol0 * scalar ),
+        ( mCol1 * scalar ),
+        ( mCol2 * scalar ),
+        ( mCol3 * scalar )
+    );
+}
+
+inline Matrix4 & Matrix4::operator *=( float scalar )
+{
+    *this = *this * scalar;
+    return *this;
+}
+
+inline const Matrix4 operator *( float scalar, const Matrix4 & mat )
+{
+    return mat * scalar;
+}
+
+inline const Vector4 Matrix4::operator *( const Vector4 & vec ) const
+{
+    return Vector4(
+        ( ( ( ( mCol0.getX() * vec.getX() ) + ( mCol1.getX() * vec.getY() ) ) + ( mCol2.getX() * vec.getZ() ) ) + ( mCol3.getX() * vec.getW() ) ),
+        ( ( ( ( mCol0.getY() * vec.getX() ) + ( mCol1.getY() * vec.getY() ) ) + ( mCol2.getY() * vec.getZ() ) ) + ( mCol3.getY() * vec.getW() ) ),
+        ( ( ( ( mCol0.getZ() * vec.getX() ) + ( mCol1.getZ() * vec.getY() ) ) + ( mCol2.getZ() * vec.getZ() ) ) + ( mCol3.getZ() * vec.getW() ) ),
+        ( ( ( ( mCol0.getW() * vec.getX() ) + ( mCol1.getW() * vec.getY() ) ) + ( mCol2.getW() * vec.getZ() ) ) + ( mCol3.getW() * vec.getW() ) )
+    );
+}
+
+inline const Vector4 Matrix4::operator *( const Vector3 & vec ) const
+{
+    return Vector4(
+        ( ( ( mCol0.getX() * vec.getX() ) + ( mCol1.getX() * vec.getY() ) ) + ( mCol2.getX() * vec.getZ() ) ),
+        ( ( ( mCol0.getY() * vec.getX() ) + ( mCol1.getY() * vec.getY() ) ) + ( mCol2.getY() * vec.getZ() ) ),
+        ( ( ( mCol0.getZ() * vec.getX() ) + ( mCol1.getZ() * vec.getY() ) ) + ( mCol2.getZ() * vec.getZ() ) ),
+        ( ( ( mCol0.getW() * vec.getX() ) + ( mCol1.getW() * vec.getY() ) ) + ( mCol2.getW() * vec.getZ() ) )
+    );
+}
+
+inline const Vector4 Matrix4::operator *( const Point3 & pnt ) const
+{
+    return Vector4(
+        ( ( ( ( mCol0.getX() * pnt.getX() ) + ( mCol1.getX() * pnt.getY() ) ) + ( mCol2.getX() * pnt.getZ() ) ) + mCol3.getX() ),
+        ( ( ( ( mCol0.getY() * pnt.getX() ) + ( mCol1.getY() * pnt.getY() ) ) + ( mCol2.getY() * pnt.getZ() ) ) + mCol3.getY() ),
+        ( ( ( ( mCol0.getZ() * pnt.getX() ) + ( mCol1.getZ() * pnt.getY() ) ) + ( mCol2.getZ() * pnt.getZ() ) ) + mCol3.getZ() ),
+        ( ( ( ( mCol0.getW() * pnt.getX() ) + ( mCol1.getW() * pnt.getY() ) ) + ( mCol2.getW() * pnt.getZ() ) ) + mCol3.getW() )
+    );
+}
+
+inline const Matrix4 Matrix4::operator *( const Matrix4 & mat ) const
+{
+    return Matrix4(
+        ( *this * mat.mCol0 ),
+        ( *this * mat.mCol1 ),
+        ( *this * mat.mCol2 ),
+        ( *this * mat.mCol3 )
+    );
+}
+
+inline Matrix4 & Matrix4::operator *=( const Matrix4 & mat )
+{
+    *this = *this * mat;
+    return *this;
+}
+
+inline const Matrix4 Matrix4::operator *( const Transform3 & tfrm ) const
+{
+    return Matrix4(
+        ( *this * tfrm.getCol0() ),
+        ( *this * tfrm.getCol1() ),
+        ( *this * tfrm.getCol2() ),
+        ( *this * Point3( tfrm.getCol3() ) )
+    );
+}
+
+inline Matrix4 & Matrix4::operator *=( const Transform3 & tfrm )
+{
+    *this = *this * tfrm;
+    return *this;
+}
+
+inline const Matrix4 mulPerElem( const Matrix4 & mat0, const Matrix4 & mat1 )
+{
+    return Matrix4(
+        mulPerElem( mat0.getCol0(), mat1.getCol0() ),
+        mulPerElem( mat0.getCol1(), mat1.getCol1() ),
+        mulPerElem( mat0.getCol2(), mat1.getCol2() ),
+        mulPerElem( mat0.getCol3(), mat1.getCol3() )
+    );
+}
+
+inline const Matrix4 Matrix4::identity( )
+{
+    return Matrix4(
+        Vector4::xAxis( ),
+        Vector4::yAxis( ),
+        Vector4::zAxis( ),
+        Vector4::wAxis( )
+    );
+}
+
+inline Matrix4 & Matrix4::setUpper3x3( const Matrix3 & mat3 )
+{
+    mCol0.setXYZ( mat3.getCol0() );
+    mCol1.setXYZ( mat3.getCol1() );
+    mCol2.setXYZ( mat3.getCol2() );
+    return *this;
+}
+
+inline const Matrix3 Matrix4::getUpper3x3( ) const
+{
+    return Matrix3(
+        mCol0.getXYZ( ),
+        mCol1.getXYZ( ),
+        mCol2.getXYZ( )
+    );
+}
+
+inline Matrix4 & Matrix4::setTranslation( const Vector3 & translateVec )
+{
+    mCol3.setXYZ( translateVec );
+    return *this;
+}
+
+inline const Vector3 Matrix4::getTranslation( ) const
+{
+    return mCol3.getXYZ( );
+}
+
+inline const Matrix4 Matrix4::rotationX( float radians )
+{
+    float s, c;
+    s = sinf( radians );
+    c = cosf( radians );
+    return Matrix4(
+        Vector4::xAxis( ),
+        Vector4( 0.0f, c, s, 0.0f ),
+        Vector4( 0.0f, -s, c, 0.0f ),
+        Vector4::wAxis( )
+    );
+}
+
+inline const Matrix4 Matrix4::rotationY( float radians )
+{
+    float s, c;
+    s = sinf( radians );
+    c = cosf( radians );
+    return Matrix4(
+        Vector4( c, 0.0f, -s, 0.0f ),
+        Vector4::yAxis( ),
+        Vector4( s, 0.0f, c, 0.0f ),
+        Vector4::wAxis( )
+    );
+}
+
+inline const Matrix4 Matrix4::rotationZ( float radians )
+{
+    float s, c;
+    s = sinf( radians );
+    c = cosf( radians );
+    return Matrix4(
+        Vector4( c, s, 0.0f, 0.0f ),
+        Vector4( -s, c, 0.0f, 0.0f ),
+        Vector4::zAxis( ),
+        Vector4::wAxis( )
+    );
+}
+
+inline const Matrix4 Matrix4::rotationZYX( const Vector3 & radiansXYZ )
+{
+    float sX, cX, sY, cY, sZ, cZ, tmp0, tmp1;
+    sX = sinf( radiansXYZ.getX() );
+    cX = cosf( radiansXYZ.getX() );
+    sY = sinf( radiansXYZ.getY() );
+    cY = cosf( radiansXYZ.getY() );
+    sZ = sinf( radiansXYZ.getZ() );
+    cZ = cosf( radiansXYZ.getZ() );
+    tmp0 = ( cZ * sY );
+    tmp1 = ( sZ * sY );
+    return Matrix4(
+        Vector4( ( cZ * cY ), ( sZ * cY ), -sY, 0.0f ),
+        Vector4( ( ( tmp0 * sX ) - ( sZ * cX ) ), ( ( tmp1 * sX ) + ( cZ * cX ) ), ( cY * sX ), 0.0f ),
+        Vector4( ( ( tmp0 * cX ) + ( sZ * sX ) ), ( ( tmp1 * cX ) - ( cZ * sX ) ), ( cY * cX ), 0.0f ),
+        Vector4::wAxis( )
+    );
+}
+
+inline const Matrix4 Matrix4::rotation( float radians, const Vector3 & unitVec )
+{
+    float x, y, z, s, c, oneMinusC, xy, yz, zx;
+    s = sinf( radians );
+    c = cosf( radians );
+    x = unitVec.getX();
+    y = unitVec.getY();
+    z = unitVec.getZ();
+    xy = ( x * y );
+    yz = ( y * z );
+    zx = ( z * x );
+    oneMinusC = ( 1.0f - c );
+    return Matrix4(
+        Vector4( ( ( ( x * x ) * oneMinusC ) + c ), ( ( xy * oneMinusC ) + ( z * s ) ), ( ( zx * oneMinusC ) - ( y * s ) ), 0.0f ),
+        Vector4( ( ( xy * oneMinusC ) - ( z * s ) ), ( ( ( y * y ) * oneMinusC ) + c ), ( ( yz * oneMinusC ) + ( x * s ) ), 0.0f ),
+        Vector4( ( ( zx * oneMinusC ) + ( y * s ) ), ( ( yz * oneMinusC ) - ( x * s ) ), ( ( ( z * z ) * oneMinusC ) + c ), 0.0f ),
+        Vector4::wAxis( )
+    );
+}
+
+inline const Matrix4 Matrix4::rotation( const Quat & unitQuat )
+{
+    return Matrix4( Transform3::rotation( unitQuat ) );
+}
+
+inline const Matrix4 Matrix4::scale( const Vector3 & scaleVec )
+{
+    return Matrix4(
+        Vector4( scaleVec.getX(), 0.0f, 0.0f, 0.0f ),
+        Vector4( 0.0f, scaleVec.getY(), 0.0f, 0.0f ),
+        Vector4( 0.0f, 0.0f, scaleVec.getZ(), 0.0f ),
+        Vector4::wAxis( )
+    );
+}
+
+inline const Matrix4 appendScale( const Matrix4 & mat, const Vector3 & scaleVec )
+{
+    return Matrix4(
+        ( mat.getCol0() * scaleVec.getX( ) ),
+        ( mat.getCol1() * scaleVec.getY( ) ),
+        ( mat.getCol2() * scaleVec.getZ( ) ),
+        mat.getCol3()
+    );
+}
+
+inline const Matrix4 prependScale( const Vector3 & scaleVec, const Matrix4 & mat )
+{
+    Vector4 scale4;
+    scale4 = Vector4( scaleVec, 1.0f );
+    return Matrix4(
+        mulPerElem( mat.getCol0(), scale4 ),
+        mulPerElem( mat.getCol1(), scale4 ),
+        mulPerElem( mat.getCol2(), scale4 ),
+        mulPerElem( mat.getCol3(), scale4 )
+    );
+}
+
+inline const Matrix4 Matrix4::translation( const Vector3 & translateVec )
+{
+    return Matrix4(
+        Vector4::xAxis( ),
+        Vector4::yAxis( ),
+        Vector4::zAxis( ),
+        Vector4( translateVec, 1.0f )
+    );
+}
+
+inline const Matrix4 Matrix4::lookAt( const Point3 & eyePos, const Point3 & lookAtPos, const Vector3 & upVec )
+{
+    Matrix4 m4EyeFrame;
+    Vector3 v3X, v3Y, v3Z;
+    v3Y = normalize( upVec );
+    v3Z = normalize( ( eyePos - lookAtPos ) );
+    v3X = normalize( cross( v3Y, v3Z ) );
+    v3Y = cross( v3Z, v3X );
+    m4EyeFrame = Matrix4( Vector4( v3X ), Vector4( v3Y ), Vector4( v3Z ), Vector4( eyePos ) );
+    return orthoInverse( m4EyeFrame );
+}
+
+inline const Matrix4 Matrix4::perspective( float fovyRadians, float aspect, float zNear, float zFar )
+{
+    float f, rangeInv;
+    f = tanf( ( (float)( _VECTORMATH_PI_OVER_2 ) - ( 0.5f * fovyRadians ) ) );
+    rangeInv = ( 1.0f / ( zNear - zFar ) );
+    return Matrix4(
+        Vector4( ( f / aspect ), 0.0f, 0.0f, 0.0f ),
+        Vector4( 0.0f, f, 0.0f, 0.0f ),
+        Vector4( 0.0f, 0.0f, ( ( zNear + zFar ) * rangeInv ), -1.0f ),
+        Vector4( 0.0f, 0.0f, ( ( ( zNear * zFar ) * rangeInv ) * 2.0f ), 0.0f )
+    );
+}
+
+inline const Matrix4 Matrix4::frustum( float left, float right, float bottom, float top, float zNear, float zFar )
+{
+    float sum_rl, sum_tb, sum_nf, inv_rl, inv_tb, inv_nf, n2;
+    sum_rl = ( right + left );
+    sum_tb = ( top + bottom );
+    sum_nf = ( zNear + zFar );
+    inv_rl = ( 1.0f / ( right - left ) );
+    inv_tb = ( 1.0f / ( top - bottom ) );
+    inv_nf = ( 1.0f / ( zNear - zFar ) );
+    n2 = ( zNear + zNear );
+    return Matrix4(
+        Vector4( ( n2 * inv_rl ), 0.0f, 0.0f, 0.0f ),
+        Vector4( 0.0f, ( n2 * inv_tb ), 0.0f, 0.0f ),
+        Vector4( ( sum_rl * inv_rl ), ( sum_tb * inv_tb ), ( sum_nf * inv_nf ), -1.0f ),
+        Vector4( 0.0f, 0.0f, ( ( n2 * inv_nf ) * zFar ), 0.0f )
+    );
+}
+
+inline const Matrix4 Matrix4::orthographic( float left, float right, float bottom, float top, float zNear, float zFar )
+{
+    float sum_rl, sum_tb, sum_nf, inv_rl, inv_tb, inv_nf;
+    sum_rl = ( right + left );
+    sum_tb = ( top + bottom );
+    sum_nf = ( zNear + zFar );
+    inv_rl = ( 1.0f / ( right - left ) );
+    inv_tb = ( 1.0f / ( top - bottom ) );
+    inv_nf = ( 1.0f / ( zNear - zFar ) );
+    return Matrix4(
+        Vector4( ( inv_rl + inv_rl ), 0.0f, 0.0f, 0.0f ),
+        Vector4( 0.0f, ( inv_tb + inv_tb ), 0.0f, 0.0f ),
+        Vector4( 0.0f, 0.0f, ( inv_nf + inv_nf ), 0.0f ),
+        Vector4( ( -sum_rl * inv_rl ), ( -sum_tb * inv_tb ), ( sum_nf * inv_nf ), 1.0f )
+    );
+}
+
+inline const Matrix4 select( const Matrix4 & mat0, const Matrix4 & mat1, bool select1 )
+{
+    return Matrix4(
+        select( mat0.getCol0(), mat1.getCol0(), select1 ),
+        select( mat0.getCol1(), mat1.getCol1(), select1 ),
+        select( mat0.getCol2(), mat1.getCol2(), select1 ),
+        select( mat0.getCol3(), mat1.getCol3(), select1 )
+    );
+}
+
+#ifdef _VECTORMATH_DEBUG
+
+inline void print( const Matrix4 & mat )
+{
+    print( mat.getRow( 0 ) );
+    print( mat.getRow( 1 ) );
+    print( mat.getRow( 2 ) );
+    print( mat.getRow( 3 ) );
+}
+
+inline void print( const Matrix4 & mat, const char * name )
+{
+    printf("%s:\n", name);
+    print( mat );
+}
+
+#endif
+
+inline Transform3::Transform3( const Transform3 & tfrm )
+{
+    mCol0 = tfrm.mCol0;
+    mCol1 = tfrm.mCol1;
+    mCol2 = tfrm.mCol2;
+    mCol3 = tfrm.mCol3;
+}
+
+inline Transform3::Transform3( float scalar )
+{
+    mCol0 = Vector3( scalar );
+    mCol1 = Vector3( scalar );
+    mCol2 = Vector3( scalar );
+    mCol3 = Vector3( scalar );
+}
+
+inline Transform3::Transform3( const Vector3 & _col0, const Vector3 & _col1, const Vector3 & _col2, const Vector3 & _col3 )
+{
+    mCol0 = _col0;
+    mCol1 = _col1;
+    mCol2 = _col2;
+    mCol3 = _col3;
+}
+
+inline Transform3::Transform3( const Matrix3 & tfrm, const Vector3 & translateVec )
+{
+    this->setUpper3x3( tfrm );
+    this->setTranslation( translateVec );
+}
+
+inline Transform3::Transform3( const Quat & unitQuat, const Vector3 & translateVec )
+{
+    this->setUpper3x3( Matrix3( unitQuat ) );
+    this->setTranslation( translateVec );
+}
+
+inline Transform3 & Transform3::setCol0( const Vector3 & _col0 )
+{
+    mCol0 = _col0;
+    return *this;
+}
+
+inline Transform3 & Transform3::setCol1( const Vector3 & _col1 )
+{
+    mCol1 = _col1;
+    return *this;
+}
+
+inline Transform3 & Transform3::setCol2( const Vector3 & _col2 )
+{
+    mCol2 = _col2;
+    return *this;
+}
+
+inline Transform3 & Transform3::setCol3( const Vector3 & _col3 )
+{
+    mCol3 = _col3;
+    return *this;
+}
+
+inline Transform3 & Transform3::setCol( int col, const Vector3 & vec )
+{
+    *(&mCol0 + col) = vec;
+    return *this;
+}
+
+inline Transform3 & Transform3::setRow( int row, const Vector4 & vec )
+{
+    mCol0.setElem( row, vec.getElem( 0 ) );
+    mCol1.setElem( row, vec.getElem( 1 ) );
+    mCol2.setElem( row, vec.getElem( 2 ) );
+    mCol3.setElem( row, vec.getElem( 3 ) );
+    return *this;
+}
+
+inline Transform3 & Transform3::setElem( int col, int row, float val )
+{
+    Vector3 tmpV3_0;
+    tmpV3_0 = this->getCol( col );
+    tmpV3_0.setElem( row, val );
+    this->setCol( col, tmpV3_0 );
+    return *this;
+}
+
+inline float Transform3::getElem( int col, int row ) const
+{
+    return this->getCol( col ).getElem( row );
+}
+
+inline const Vector3 Transform3::getCol0( ) const
+{
+    return mCol0;
+}
+
+inline const Vector3 Transform3::getCol1( ) const
+{
+    return mCol1;
+}
+
+inline const Vector3 Transform3::getCol2( ) const
+{
+    return mCol2;
+}
+
+inline const Vector3 Transform3::getCol3( ) const
+{
+    return mCol3;
+}
+
+inline const Vector3 Transform3::getCol( int col ) const
+{
+    return *(&mCol0 + col);
+}
+
+inline const Vector4 Transform3::getRow( int row ) const
+{
+    return Vector4( mCol0.getElem( row ), mCol1.getElem( row ), mCol2.getElem( row ), mCol3.getElem( row ) );
+}
+
+inline Vector3 & Transform3::operator []( int col )
+{
+    return *(&mCol0 + col);
+}
+
+inline const Vector3 Transform3::operator []( int col ) const
+{
+    return *(&mCol0 + col);
+}
+
+inline Transform3 & Transform3::operator =( const Transform3 & tfrm )
+{
+    mCol0 = tfrm.mCol0;
+    mCol1 = tfrm.mCol1;
+    mCol2 = tfrm.mCol2;
+    mCol3 = tfrm.mCol3;
+    return *this;
+}
+
+inline const Transform3 inverse( const Transform3 & tfrm )
+{
+    Vector3 tmp0, tmp1, tmp2, inv0, inv1, inv2;
+    float detinv;
+    tmp0 = cross( tfrm.getCol1(), tfrm.getCol2() );
+    tmp1 = cross( tfrm.getCol2(), tfrm.getCol0() );
+    tmp2 = cross( tfrm.getCol0(), tfrm.getCol1() );
+    detinv = ( 1.0f / dot( tfrm.getCol2(), tmp2 ) );
+    inv0 = Vector3( ( tmp0.getX() * detinv ), ( tmp1.getX() * detinv ), ( tmp2.getX() * detinv ) );
+    inv1 = Vector3( ( tmp0.getY() * detinv ), ( tmp1.getY() * detinv ), ( tmp2.getY() * detinv ) );
+    inv2 = Vector3( ( tmp0.getZ() * detinv ), ( tmp1.getZ() * detinv ), ( tmp2.getZ() * detinv ) );
+    return Transform3(
+        inv0,
+        inv1,
+        inv2,
+        Vector3( ( -( ( inv0 * tfrm.getCol3().getX() ) + ( ( inv1 * tfrm.getCol3().getY() ) + ( inv2 * tfrm.getCol3().getZ() ) ) ) ) )
+    );
+}
+
+inline const Transform3 orthoInverse( const Transform3 & tfrm )
+{
+    Vector3 inv0, inv1, inv2;
+    inv0 = Vector3( tfrm.getCol0().getX(), tfrm.getCol1().getX(), tfrm.getCol2().getX() );
+    inv1 = Vector3( tfrm.getCol0().getY(), tfrm.getCol1().getY(), tfrm.getCol2().getY() );
+    inv2 = Vector3( tfrm.getCol0().getZ(), tfrm.getCol1().getZ(), tfrm.getCol2().getZ() );
+    return Transform3(
+        inv0,
+        inv1,
+        inv2,
+        Vector3( ( -( ( inv0 * tfrm.getCol3().getX() ) + ( ( inv1 * tfrm.getCol3().getY() ) + ( inv2 * tfrm.getCol3().getZ() ) ) ) ) )
+    );
+}
+
+inline const Transform3 absPerElem( const Transform3 & tfrm )
+{
+    return Transform3(
+        absPerElem( tfrm.getCol0() ),
+        absPerElem( tfrm.getCol1() ),
+        absPerElem( tfrm.getCol2() ),
+        absPerElem( tfrm.getCol3() )
+    );
+}
+
+inline const Vector3 Transform3::operator *( const Vector3 & vec ) const
+{
+    return Vector3(
+        ( ( ( mCol0.getX() * vec.getX() ) + ( mCol1.getX() * vec.getY() ) ) + ( mCol2.getX() * vec.getZ() ) ),
+        ( ( ( mCol0.getY() * vec.getX() ) + ( mCol1.getY() * vec.getY() ) ) + ( mCol2.getY() * vec.getZ() ) ),
+        ( ( ( mCol0.getZ() * vec.getX() ) + ( mCol1.getZ() * vec.getY() ) ) + ( mCol2.getZ() * vec.getZ() ) )
+    );
+}
+
+inline const Point3 Transform3::operator *( const Point3 & pnt ) const
+{
+    return Point3(
+        ( ( ( ( mCol0.getX() * pnt.getX() ) + ( mCol1.getX() * pnt.getY() ) ) + ( mCol2.getX() * pnt.getZ() ) ) + mCol3.getX() ),
+        ( ( ( ( mCol0.getY() * pnt.getX() ) + ( mCol1.getY() * pnt.getY() ) ) + ( mCol2.getY() * pnt.getZ() ) ) + mCol3.getY() ),
+        ( ( ( ( mCol0.getZ() * pnt.getX() ) + ( mCol1.getZ() * pnt.getY() ) ) + ( mCol2.getZ() * pnt.getZ() ) ) + mCol3.getZ() )
+    );
+}
+
+inline const Transform3 Transform3::operator *( const Transform3 & tfrm ) const
+{
+    return Transform3(
+        ( *this * tfrm.mCol0 ),
+        ( *this * tfrm.mCol1 ),
+        ( *this * tfrm.mCol2 ),
+        Vector3( ( *this * Point3( tfrm.mCol3 ) ) )
+    );
+}
+
+inline Transform3 & Transform3::operator *=( const Transform3 & tfrm )
+{
+    *this = *this * tfrm;
+    return *this;
+}
+
+inline const Transform3 mulPerElem( const Transform3 & tfrm0, const Transform3 & tfrm1 )
+{
+    return Transform3(
+        mulPerElem( tfrm0.getCol0(), tfrm1.getCol0() ),
+        mulPerElem( tfrm0.getCol1(), tfrm1.getCol1() ),
+        mulPerElem( tfrm0.getCol2(), tfrm1.getCol2() ),
+        mulPerElem( tfrm0.getCol3(), tfrm1.getCol3() )
+    );
+}
+
+inline const Transform3 Transform3::identity( )
+{
+    return Transform3(
+        Vector3::xAxis( ),
+        Vector3::yAxis( ),
+        Vector3::zAxis( ),
+        Vector3( 0.0f )
+    );
+}
+
+inline Transform3 & Transform3::setUpper3x3( const Matrix3 & tfrm )
+{
+    mCol0 = tfrm.getCol0();
+    mCol1 = tfrm.getCol1();
+    mCol2 = tfrm.getCol2();
+    return *this;
+}
+
+inline const Matrix3 Transform3::getUpper3x3( ) const
+{
+    return Matrix3( mCol0, mCol1, mCol2 );
+}
+
+inline Transform3 & Transform3::setTranslation( const Vector3 & translateVec )
+{
+    mCol3 = translateVec;
+    return *this;
+}
+
+inline const Vector3 Transform3::getTranslation( ) const
+{
+    return mCol3;
+}
+
+inline const Transform3 Transform3::rotationX( float radians )
+{
+    float s, c;
+    s = sinf( radians );
+    c = cosf( radians );
+    return Transform3(
+        Vector3::xAxis( ),
+        Vector3( 0.0f, c, s ),
+        Vector3( 0.0f, -s, c ),
+        Vector3( 0.0f )
+    );
+}
+
+inline const Transform3 Transform3::rotationY( float radians )
+{
+    float s, c;
+    s = sinf( radians );
+    c = cosf( radians );
+    return Transform3(
+        Vector3( c, 0.0f, -s ),
+        Vector3::yAxis( ),
+        Vector3( s, 0.0f, c ),
+        Vector3( 0.0f )
+    );
+}
+
+inline const Transform3 Transform3::rotationZ( float radians )
+{
+    float s, c;
+    s = sinf( radians );
+    c = cosf( radians );
+    return Transform3(
+        Vector3( c, s, 0.0f ),
+        Vector3( -s, c, 0.0f ),
+        Vector3::zAxis( ),
+        Vector3( 0.0f )
+    );
+}
+
+inline const Transform3 Transform3::rotationZYX( const Vector3 & radiansXYZ )
+{
+    float sX, cX, sY, cY, sZ, cZ, tmp0, tmp1;
+    sX = sinf( radiansXYZ.getX() );
+    cX = cosf( radiansXYZ.getX() );
+    sY = sinf( radiansXYZ.getY() );
+    cY = cosf( radiansXYZ.getY() );
+    sZ = sinf( radiansXYZ.getZ() );
+    cZ = cosf( radiansXYZ.getZ() );
+    tmp0 = ( cZ * sY );
+    tmp1 = ( sZ * sY );
+    return Transform3(
+        Vector3( ( cZ * cY ), ( sZ * cY ), -sY ),
+        Vector3( ( ( tmp0 * sX ) - ( sZ * cX ) ), ( ( tmp1 * sX ) + ( cZ * cX ) ), ( cY * sX ) ),
+        Vector3( ( ( tmp0 * cX ) + ( sZ * sX ) ), ( ( tmp1 * cX ) - ( cZ * sX ) ), ( cY * cX ) ),
+        Vector3( 0.0f )
+    );
+}
+
+inline const Transform3 Transform3::rotation( float radians, const Vector3 & unitVec )
+{
+    return Transform3( Matrix3::rotation( radians, unitVec ), Vector3( 0.0f ) );
+}
+
+inline const Transform3 Transform3::rotation( const Quat & unitQuat )
+{
+    return Transform3( Matrix3( unitQuat ), Vector3( 0.0f ) );
+}
+
+inline const Transform3 Transform3::scale( const Vector3 & scaleVec )
+{
+    return Transform3(
+        Vector3( scaleVec.getX(), 0.0f, 0.0f ),
+        Vector3( 0.0f, scaleVec.getY(), 0.0f ),
+        Vector3( 0.0f, 0.0f, scaleVec.getZ() ),
+        Vector3( 0.0f )
+    );
+}
+
+inline const Transform3 appendScale( const Transform3 & tfrm, const Vector3 & scaleVec )
+{
+    return Transform3(
+        ( tfrm.getCol0() * scaleVec.getX( ) ),
+        ( tfrm.getCol1() * scaleVec.getY( ) ),
+        ( tfrm.getCol2() * scaleVec.getZ( ) ),
+        tfrm.getCol3()
+    );
+}
+
+inline const Transform3 prependScale( const Vector3 & scaleVec, const Transform3 & tfrm )
+{
+    return Transform3(
+        mulPerElem( tfrm.getCol0(), scaleVec ),
+        mulPerElem( tfrm.getCol1(), scaleVec ),
+        mulPerElem( tfrm.getCol2(), scaleVec ),
+        mulPerElem( tfrm.getCol3(), scaleVec )
+    );
+}
+
+inline const Transform3 Transform3::translation( const Vector3 & translateVec )
+{
+    return Transform3(
+        Vector3::xAxis( ),
+        Vector3::yAxis( ),
+        Vector3::zAxis( ),
+        translateVec
+    );
+}
+
+inline const Transform3 select( const Transform3 & tfrm0, const Transform3 & tfrm1, bool select1 )
+{
+    return Transform3(
+        select( tfrm0.getCol0(), tfrm1.getCol0(), select1 ),
+        select( tfrm0.getCol1(), tfrm1.getCol1(), select1 ),
+        select( tfrm0.getCol2(), tfrm1.getCol2(), select1 ),
+        select( tfrm0.getCol3(), tfrm1.getCol3(), select1 )
+    );
+}
+
+#ifdef _VECTORMATH_DEBUG
+
+inline void print( const Transform3 & tfrm )
+{
+    print( tfrm.getRow( 0 ) );
+    print( tfrm.getRow( 1 ) );
+    print( tfrm.getRow( 2 ) );
+}
+
+inline void print( const Transform3 & tfrm, const char * name )
+{
+    printf("%s:\n", name);
+    print( tfrm );
+}
+
+#endif
+
+inline Quat::Quat( const Matrix3 & tfrm )
+{
+    float trace, radicand, scale, xx, yx, zx, xy, yy, zy, xz, yz, zz, tmpx, tmpy, tmpz, tmpw, qx, qy, qz, qw;
+    int negTrace, ZgtX, ZgtY, YgtX;
+    int largestXorY, largestYorZ, largestZorX;
+
+    xx = tfrm.getCol0().getX();
+    yx = tfrm.getCol0().getY();
+    zx = tfrm.getCol0().getZ();
+    xy = tfrm.getCol1().getX();
+    yy = tfrm.getCol1().getY();
+    zy = tfrm.getCol1().getZ();
+    xz = tfrm.getCol2().getX();
+    yz = tfrm.getCol2().getY();
+    zz = tfrm.getCol2().getZ();
+
+    trace = ( ( xx + yy ) + zz );
+
+    negTrace = ( trace < 0.0f );
+    ZgtX = zz > xx;
+    ZgtY = zz > yy;
+    YgtX = yy > xx;
+    largestXorY = ( !ZgtX || !ZgtY ) && negTrace;
+    largestYorZ = ( YgtX || ZgtX ) && negTrace;
+    largestZorX = ( ZgtY || !YgtX ) && negTrace;
+    
+    if ( largestXorY )
+    {
+        zz = -zz;
+        xy = -xy;
+    }
+    if ( largestYorZ )
+    {
+        xx = -xx;
+        yz = -yz;
+    }
+    if ( largestZorX )
+    {
+        yy = -yy;
+        zx = -zx;
+    }
+
+    radicand = ( ( ( xx + yy ) + zz ) + 1.0f );
+    scale = ( 0.5f * ( 1.0f / sqrtf( radicand ) ) );
+
+    tmpx = ( ( zy - yz ) * scale );
+    tmpy = ( ( xz - zx ) * scale );
+    tmpz = ( ( yx - xy ) * scale );
+    tmpw = ( radicand * scale );
+    qx = tmpx;
+    qy = tmpy;
+    qz = tmpz;
+    qw = tmpw;
+
+    if ( largestXorY )
+    {
+        qx = tmpw;
+        qy = tmpz;
+        qz = tmpy;
+        qw = tmpx;
+    }
+    if ( largestYorZ )
+    {
+        tmpx = qx;
+        tmpz = qz;
+        qx = qy;
+        qy = tmpx;
+        qz = qw;
+        qw = tmpz;
+    }
+
+    mXYZW[0] = qx;
+    mXYZW[1] = qy;
+    mXYZW[2] = qz;
+    mXYZW[3] = qw;
+}
+
+inline const Matrix3 outer( const Vector3 & tfrm0, const Vector3 & tfrm1 )
+{
+    return Matrix3(
+        ( tfrm0 * tfrm1.getX( ) ),
+        ( tfrm0 * tfrm1.getY( ) ),
+        ( tfrm0 * tfrm1.getZ( ) )
+    );
+}
+
+inline const Matrix4 outer( const Vector4 & tfrm0, const Vector4 & tfrm1 )
+{
+    return Matrix4(
+        ( tfrm0 * tfrm1.getX( ) ),
+        ( tfrm0 * tfrm1.getY( ) ),
+        ( tfrm0 * tfrm1.getZ( ) ),
+        ( tfrm0 * tfrm1.getW( ) )
+    );
+}
+
+inline const Vector3 rowMul( const Vector3 & vec, const Matrix3 & mat )
+{
+    return Vector3(
+        ( ( ( vec.getX() * mat.getCol0().getX() ) + ( vec.getY() * mat.getCol0().getY() ) ) + ( vec.getZ() * mat.getCol0().getZ() ) ),
+        ( ( ( vec.getX() * mat.getCol1().getX() ) + ( vec.getY() * mat.getCol1().getY() ) ) + ( vec.getZ() * mat.getCol1().getZ() ) ),
+        ( ( ( vec.getX() * mat.getCol2().getX() ) + ( vec.getY() * mat.getCol2().getY() ) ) + ( vec.getZ() * mat.getCol2().getZ() ) )
+    );
+}
+
+inline const Matrix3 crossMatrix( const Vector3 & vec )
+{
+    return Matrix3(
+        Vector3( 0.0f, vec.getZ(), -vec.getY() ),
+        Vector3( -vec.getZ(), 0.0f, vec.getX() ),
+        Vector3( vec.getY(), -vec.getX(), 0.0f )
+    );
+}
+
+inline const Matrix3 crossMatrixMul( const Vector3 & vec, const Matrix3 & mat )
+{
+    return Matrix3( cross( vec, mat.getCol0() ), cross( vec, mat.getCol1() ), cross( vec, mat.getCol2() ) );
+}
+
+} // namespace Aos
+} // namespace Vectormath
+
+#endif
+
diff --git a/src/vectormath/neon/quat_aos.h b/src/vectormath/neon/quat_aos.h
new file mode 100644
index 000000000..d06184603
--- /dev/null
+++ b/src/vectormath/neon/quat_aos.h
@@ -0,0 +1,413 @@
+/*
+   Copyright (C) 2009 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+
+*/
+
+#ifndef _VECTORMATH_QUAT_AOS_CPP_H
+#define _VECTORMATH_QUAT_AOS_CPP_H
+
+//-----------------------------------------------------------------------------
+// Definitions
+
+#ifndef _VECTORMATH_INTERNAL_FUNCTIONS
+#define _VECTORMATH_INTERNAL_FUNCTIONS
+
+#endif
+
+namespace Vectormath {
+namespace Aos {
+
+    inline Quat::Quat( const Quat & quat )
+    {        
+        vXYZW = quat.vXYZW;
+    }
+    
+    inline Quat::Quat( float _x, float _y, float _z, float _w )
+    {        
+        mXYZW[0] = _x;
+        mXYZW[1] = _y;
+        mXYZW[2] = _z;
+        mXYZW[3] = _w;
+    }
+    
+    inline Quat::Quat( float32x4_t fXYZW )  
+    {        
+        vXYZW = fXYZW;
+    }
+    
+    inline Quat::Quat( const Vector3 & xyz, float _w )
+    {        
+        this->setXYZ( xyz );
+        this->setW( _w );
+    }
+    
+    inline Quat::Quat( const Vector4 & vec )
+    {        
+        mXYZW[0] = vec.getX();
+        mXYZW[1] = vec.getY();
+        mXYZW[2] = vec.getZ();
+        mXYZW[3] = vec.getW();
+    }
+    
+    inline Quat::Quat( float scalar )  
+    {        
+        vXYZW = vdupq_n_f32(scalar);
+    }
+    
+    inline const Quat Quat::identity( )
+    {        
+        return Quat( 0.0f, 0.0f, 0.0f, 1.0f );
+    }
+    
+    inline const Quat lerp( float t, const Quat & quat0, const Quat & quat1 )
+    {        
+        return ( quat0 + ( ( quat1 - quat0 ) * t ) );
+    }
+    
+    inline const Quat slerp( float t, const Quat & unitQuat0, const Quat & unitQuat1 )
+    {
+        Quat start;
+        float recipSinAngle, scale0, scale1, cosAngle, angle;
+        cosAngle = dot( unitQuat0, unitQuat1 );
+        if ( cosAngle < 0.0f ) {
+            cosAngle = -cosAngle;
+            start = ( -unitQuat0 );
+        } else {
+            start = unitQuat0;
+        }
+        if ( cosAngle < _VECTORMATH_SLERP_TOL ) {
+            angle = acosf( cosAngle );
+            recipSinAngle = ( 1.0f / sinf( angle ) );
+            scale0 = ( sinf( ( ( 1.0f - t ) * angle ) ) * recipSinAngle );
+            scale1 = ( sinf( ( t * angle ) ) * recipSinAngle );
+        } else {
+            scale0 = ( 1.0f - t );
+            scale1 = t;
+        }
+        return ( ( start * scale0 ) + ( unitQuat1 * scale1 ) );
+    }
+    
+    inline const Quat squad( float t, const Quat & unitQuat0, const Quat & unitQuat1, const Quat & unitQuat2, const Quat & unitQuat3 )
+    {        
+        Quat tmp0, tmp1;
+        tmp0 = slerp( t, unitQuat0, unitQuat3 );
+        tmp1 = slerp( t, unitQuat1, unitQuat2 );
+        return slerp( ( ( 2.0f * t ) * ( 1.0f - t ) ), tmp0, tmp1 );
+    }
+    
+    inline void loadXYZW( Quat & quat, const float * fptr )
+    {        
+        quat = Quat( fptr[0], fptr[1], fptr[2], fptr[3] );
+    }
+    
+    inline void storeXYZW( const Quat & quat, float * fptr )
+    {        
+        vst1q_f32(fptr, quat.getvXYZW());
+    }
+    
+    inline Quat & Quat::operator =( const Quat & quat )
+    {        
+        vXYZW = quat.getvXYZW();
+        return *this;
+    }
+    
+    inline Quat & Quat::setXYZ( const Vector3 & vec )
+    {        
+        mXYZW[0] = vec.getX();
+        mXYZW[1] = vec.getY();
+        mXYZW[2] = vec.getZ();
+        return *this;
+    }
+    
+    inline const Vector3 Quat::getXYZ( ) const
+    {        
+        return Vector3( mXYZW[0], mXYZW[1], mXYZW[2] );
+    }
+    
+    inline float32x4_t Quat::getvXYZW( ) const
+    {        
+        return vXYZW;
+    }
+    
+    inline Quat & Quat::setX( float _x )
+    {        
+        mXYZW[0] = _x;
+        return *this;
+    }
+    
+    inline float Quat::getX( ) const
+    {        
+        return mXYZW[0];
+    }
+    
+    inline Quat & Quat::setY( float _y )
+    {        
+        mXYZW[1] = _y;
+        return *this;
+    }
+    
+    inline float Quat::getY( ) const
+    {        
+        return mXYZW[1];
+    }
+    
+    inline Quat & Quat::setZ( float _z )
+    {        
+        mXYZW[2] = _z;
+        return *this;
+    }
+    
+    inline float Quat::getZ( ) const
+    {        
+        return mXYZW[2];
+    }
+    
+    inline Quat & Quat::setW( float _w )
+    {        
+        mXYZW[3] = _w;
+        return *this;
+    }
+    
+    inline float Quat::getW( ) const
+    {        
+        return mXYZW[3];
+    }
+    
+    inline Quat & Quat::setElem( int idx, float value )
+    {        
+        *(&mXYZW[0] + idx) = value;
+        return *this;
+    }
+    
+    inline float Quat::getElem( int idx ) const
+    {        
+        return *(&mXYZW[0] + idx);
+    }
+    
+    inline float & Quat::operator []( int idx )
+    {        
+        return *(&mXYZW[0] + idx);
+    }
+    
+    inline float Quat::operator []( int idx ) const
+    {        
+        return *(&mXYZW[0] + idx);
+    }
+    
+    inline const Quat Quat::operator +( const Quat & quat ) const
+    {        
+        return Quat( vaddq_f32(vXYZW, quat.vXYZW) );
+    }
+    
+    inline const Quat Quat::operator -( const Quat & quat ) const
+    {        
+        return Quat( vsubq_f32(vXYZW, quat.vXYZW) );
+    }
+    
+    inline const Quat Quat::operator *( float scalar ) const
+    {        
+        float32x4_t v_scalar = vdupq_n_f32(scalar);
+        return Quat( vmulq_f32(vXYZW, v_scalar) );
+    }
+    
+    inline Quat & Quat::operator +=( const Quat & quat )
+    {        
+        *this = *this + quat;
+        return *this;
+    }
+    
+    inline Quat & Quat::operator -=( const Quat & quat )
+    {
+        *this = *this - quat;
+        return *this;
+    }
+    
+    inline Quat & Quat::operator *=( float scalar )
+    {        
+        *this = *this * scalar;
+        return *this;
+    }
+    
+    inline const Quat Quat::operator /( float scalar ) const
+    {        
+        return Quat(
+                    ( mXYZW[0] / scalar ),
+                    ( mXYZW[1] / scalar ),
+                    ( mXYZW[2] / scalar ),
+                    ( mXYZW[3] / scalar )
+                    );
+    }
+    
+    inline Quat & Quat::operator /=( float scalar )
+    {        
+        *this = *this / scalar;
+        return *this;
+    }
+    
+    inline const Quat Quat::operator -( ) const
+    {        
+        return Quat( vnegq_f32(vXYZW) );
+    }
+    
+    inline const Quat operator *( float scalar, const Quat & quat )
+    {        
+        return quat * scalar;
+    }
+    
+    inline float dot( const Quat & quat0, const Quat & quat1 )
+    {        
+        float result;
+        result = ( quat0.getX() * quat1.getX() );
+        result = ( result + ( quat0.getY() * quat1.getY() ) );
+        result = ( result + ( quat0.getZ() * quat1.getZ() ) );
+        result = ( result + ( quat0.getW() * quat1.getW() ) );
+        return result;
+    }
+    
+    inline float norm( const Quat & quat )
+    {        
+        float result;
+        result = ( quat.getX() * quat.getX() );
+        result = ( result + ( quat.getY() * quat.getY() ) );
+        result = ( result + ( quat.getZ() * quat.getZ() ) );
+        result = ( result + ( quat.getW() * quat.getW() ) );
+        return result;
+    }
+    
+    inline float length( const Quat & quat )
+    {        
+        return ::sqrtf( norm( quat ) );
+    }
+    
+    inline const Quat normalize( const Quat & quat )
+    {        
+        float lenSqr, lenInv;
+        lenSqr = norm( quat );
+        lenInv = ( 1.0f / sqrtf( lenSqr ) );
+        return Quat(
+                    ( quat.getX() * lenInv ),
+                    ( quat.getY() * lenInv ),
+                    ( quat.getZ() * lenInv ),
+                    ( quat.getW() * lenInv )
+                    );
+    }
+    
+    inline const Quat Quat::rotation( const Vector3 & unitVec0, const Vector3 & unitVec1 )
+    {        
+        float cosHalfAngleX2, recipCosHalfAngleX2;
+        cosHalfAngleX2 = sqrtf( ( 2.0f * ( 1.0f + dot( unitVec0, unitVec1 ) ) ) );
+        recipCosHalfAngleX2 = ( 1.0f / cosHalfAngleX2 );
+        return Quat( ( cross( unitVec0, unitVec1 ) * recipCosHalfAngleX2 ), ( cosHalfAngleX2 * 0.5f ) );
+    }
+    
+    inline const Quat Quat::rotation( float radians, const Vector3 & unitVec )
+    {        
+        float s, c, angle;
+        angle = ( radians * 0.5f );
+        s = sinf( angle );
+        c = cosf( angle );
+        return Quat( ( unitVec * s ), c );
+    }
+    
+    inline const Quat Quat::rotationX( float radians )
+    {        
+        float s, c, angle;
+        angle = ( radians * 0.5f );
+        s = sinf( angle );
+        c = cosf( angle );
+        return Quat( s, 0.0f, 0.0f, c );
+    }
+    
+    inline const Quat Quat::rotationY( float radians )
+    {        
+        float s, c, angle;
+        angle = ( radians * 0.5f );
+        s = sinf( angle );
+        c = cosf( angle );
+        return Quat( 0.0f, s, 0.0f, c );
+    }
+    
+    inline const Quat Quat::rotationZ( float radians )
+    {        
+        float s, c, angle;
+        angle = ( radians * 0.5f );
+        s = sinf( angle );
+        c = cosf( angle );
+        return Quat( 0.0f, 0.0f, s, c );
+    }
+    
+    inline const Quat Quat::operator *( const Quat & quat ) const
+    {        
+        return Quat(
+                    ( ( ( ( mXYZW[3] * quat.mXYZW[0] ) + ( mXYZW[0] * quat.mXYZW[3] ) ) + ( mXYZW[1] * quat.mXYZW[2] ) ) - ( mXYZW[2] * quat.mXYZW[1] ) ),
+                    ( ( ( ( mXYZW[3] * quat.mXYZW[1] ) + ( mXYZW[1] * quat.mXYZW[3] ) ) + ( mXYZW[2] * quat.mXYZW[0] ) ) - ( mXYZW[0] * quat.mXYZW[2] ) ),
+                    ( ( ( ( mXYZW[3] * quat.mXYZW[2] ) + ( mXYZW[2] * quat.mXYZW[3] ) ) + ( mXYZW[0] * quat.mXYZW[1] ) ) - ( mXYZW[1] * quat.mXYZW[0] ) ),
+                    ( ( ( ( mXYZW[3] * quat.mXYZW[3] ) - ( mXYZW[0] * quat.mXYZW[0] ) ) - ( mXYZW[1] * quat.mXYZW[1] ) ) - ( mXYZW[2] * quat.mXYZW[2] ) )
+                    );
+    }
+    
+    inline Quat & Quat::operator *=( const Quat & quat )
+    {        
+        *this = *this * quat;
+        return *this;
+    }
+    
+    inline const Vector3 rotate( const Quat & quat, const Vector3 & vec )
+    {
+        float tmpX, tmpY, tmpZ, tmpW;
+        tmpX = ( ( ( quat.getW() * vec.getX() ) + ( quat.getY() * vec.getZ() ) ) - ( quat.getZ() * vec.getY() ) );
+        tmpY = ( ( ( quat.getW() * vec.getY() ) + ( quat.getZ() * vec.getX() ) ) - ( quat.getX() * vec.getZ() ) );
+        tmpZ = ( ( ( quat.getW() * vec.getZ() ) + ( quat.getX() * vec.getY() ) ) - ( quat.getY() * vec.getX() ) );
+        tmpW = ( ( ( quat.getX() * vec.getX() ) + ( quat.getY() * vec.getY() ) ) + ( quat.getZ() * vec.getZ() ) );
+        return Vector3(
+                       ( ( ( ( tmpW * quat.getX() ) + ( tmpX * quat.getW() ) ) - ( tmpY * quat.getZ() ) ) + ( tmpZ * quat.getY() ) ),
+                       ( ( ( ( tmpW * quat.getY() ) + ( tmpY * quat.getW() ) ) - ( tmpZ * quat.getX() ) ) + ( tmpX * quat.getZ() ) ),
+                       ( ( ( ( tmpW * quat.getZ() ) + ( tmpZ * quat.getW() ) ) - ( tmpX * quat.getY() ) ) + ( tmpY * quat.getX() ) )
+                       );
+    }
+    
+    inline const Quat conj( const Quat & quat )
+    {        
+        return Quat( -quat.getX(), -quat.getY(), -quat.getZ(), quat.getW() );
+    }
+    
+    inline const Quat select( const Quat & quat0, const Quat & quat1, bool select1 )
+    {
+        return Quat(
+                    ( select1 )? quat1.getX() : quat0.getX(),
+                    ( select1 )? quat1.getY() : quat0.getY(),
+                    ( select1 )? quat1.getZ() : quat0.getZ(),
+                    ( select1 )? quat1.getW() : quat0.getW()
+                    );
+    }
+
+#ifdef _VECTORMATH_DEBUG
+
+inline void print( const Quat & quat )
+{
+    printf( "( %f %f %f %f )\n", quat.getX(), quat.getY(), quat.getZ(), quat.getW() );
+}
+
+inline void print( const Quat & quat, const char * name )
+{
+    printf( "%s: ( %f %f %f %f )\n", name, quat.getX(), quat.getY(), quat.getZ(), quat.getW() );
+}
+
+#endif
+
+} // namespace Aos
+} // namespace Vectormath
+
+#endif
+
diff --git a/src/vectormath/neon/vec_aos.h b/src/vectormath/neon/vec_aos.h
new file mode 100644
index 000000000..7bcf8dbec
--- /dev/null
+++ b/src/vectormath/neon/vec_aos.h
@@ -0,0 +1,1427 @@
+/*
+   Copyright (C) 2009 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+
+*/
+
+#ifndef _VECTORMATH_VEC_AOS_CPP_H
+#define _VECTORMATH_VEC_AOS_CPP_H
+
+//-----------------------------------------------------------------------------
+// Constants
+
+#define _VECTORMATH_SLERP_TOL 0.999f
+
+//-----------------------------------------------------------------------------
+// Definitions
+
+#ifndef _VECTORMATH_INTERNAL_FUNCTIONS
+#define _VECTORMATH_INTERNAL_FUNCTIONS
+
+#endif
+
+namespace Vectormath {
+namespace Aos {
+
+inline Vector3::Vector3( const Vector3 & vec )
+{
+    mX = vec.mX;
+    mY = vec.mY;
+    mZ = vec.mZ;
+}
+
+inline Vector3::Vector3( float _x, float _y, float _z )
+{
+    mX = _x;
+    mY = _y;
+    mZ = _z;
+}
+
+inline Vector3::Vector3( const Point3 & pnt )
+{
+    mX = pnt.getX();
+    mY = pnt.getY();
+    mZ = pnt.getZ();
+}
+
+inline Vector3::Vector3( float scalar )
+{
+    mX = scalar;
+    mY = scalar;
+    mZ = scalar;
+}
+
+inline const Vector3 Vector3::xAxis( )
+{
+    return Vector3( 1.0f, 0.0f, 0.0f );
+}
+
+inline const Vector3 Vector3::yAxis( )
+{
+    return Vector3( 0.0f, 1.0f, 0.0f );
+}
+
+inline const Vector3 Vector3::zAxis( )
+{
+    return Vector3( 0.0f, 0.0f, 1.0f );
+}
+
+inline const Vector3 lerp( float t, const Vector3 & vec0, const Vector3 & vec1 )
+{
+    return ( vec0 + ( ( vec1 - vec0 ) * t ) );
+}
+
+inline const Vector3 slerp( float t, const Vector3 & unitVec0, const Vector3 & unitVec1 )
+{
+    float recipSinAngle, scale0, scale1, cosAngle, angle;
+    cosAngle = dot( unitVec0, unitVec1 );
+    if ( cosAngle < _VECTORMATH_SLERP_TOL ) {
+        angle = acosf( cosAngle );
+        recipSinAngle = ( 1.0f / sinf( angle ) );
+        scale0 = ( sinf( ( ( 1.0f - t ) * angle ) ) * recipSinAngle );
+        scale1 = ( sinf( ( t * angle ) ) * recipSinAngle );
+    } else {
+        scale0 = ( 1.0f - t );
+        scale1 = t;
+    }
+    return ( ( unitVec0 * scale0 ) + ( unitVec1 * scale1 ) );
+}
+
+inline void loadXYZ( Vector3 & vec, const float * fptr )
+{
+    vec = Vector3( fptr[0], fptr[1], fptr[2] );
+}
+
+inline void storeXYZ( const Vector3 & vec, float * fptr )
+{
+    fptr[0] = vec.getX();
+    fptr[1] = vec.getY();
+    fptr[2] = vec.getZ();
+}
+
+inline void loadHalfFloats( Vector3 & vec, const unsigned short * hfptr )
+{
+    union Data32 {
+        unsigned int u32;
+        float f32;
+    };
+
+    for (int i = 0; i < 3; i++) {
+        unsigned short fp16 = hfptr[i];
+        unsigned int sign = fp16 >> 15;
+        unsigned int exponent = (fp16 >> 10) & ((1 << 5) - 1);
+        unsigned int mantissa = fp16 & ((1 << 10) - 1);
+
+        if (exponent == 0) {
+            // zero
+            mantissa = 0;
+
+        } else if (exponent == 31) {
+            // infinity or nan -> infinity
+            exponent = 255;
+	    mantissa = 0;
+
+        } else {
+            exponent += 127 - 15;
+            mantissa <<= 13;
+        }
+
+        Data32 d;
+        d.u32 = (sign << 31) | (exponent << 23) | mantissa;
+        vec[i] = d.f32;
+    }
+}
+
+inline void storeHalfFloats( const Vector3 & vec, unsigned short * hfptr )
+{
+    union Data32 {
+        unsigned int u32;
+        float f32;
+    };
+
+    for (int i = 0; i < 3; i++) {
+        Data32 d;
+        d.f32 = vec[i];
+
+        unsigned int sign = d.u32 >> 31;
+        unsigned int exponent = (d.u32 >> 23) & ((1 << 8) - 1);
+        unsigned int mantissa = d.u32 & ((1 << 23) - 1);;
+
+        if (exponent == 0) {
+            // zero or denorm -> zero
+            mantissa = 0;
+
+        } else if (exponent == 255 && mantissa != 0) {
+            // nan -> infinity
+            exponent = 31;
+            mantissa = 0;
+
+        } else if (exponent >= 127 - 15 + 31) {
+            // overflow or infinity -> infinity
+            exponent = 31;
+            mantissa = 0;
+
+        } else if (exponent <= 127 - 15) {
+            // underflow -> zero
+            exponent = 0;
+            mantissa = 0;
+
+        } else {
+            exponent -= 127 - 15;
+            mantissa >>= 13;
+        }
+
+        hfptr[i] = (unsigned short)((sign << 15) | (exponent << 10) | mantissa);
+    }
+}
+
+inline Vector3 & Vector3::operator =( const Vector3 & vec )
+{
+    mX = vec.mX;
+    mY = vec.mY;
+    mZ = vec.mZ;
+    return *this;
+}
+
+inline Vector3 & Vector3::setX( float _x )
+{
+    mX = _x;
+    return *this;
+}
+
+inline float Vector3::getX( ) const
+{
+    return mX;
+}
+
+inline Vector3 & Vector3::setY( float _y )
+{
+    mY = _y;
+    return *this;
+}
+
+inline float Vector3::getY( ) const
+{
+    return mY;
+}
+
+inline Vector3 & Vector3::setZ( float _z )
+{
+    mZ = _z;
+    return *this;
+}
+
+inline float Vector3::getZ( ) const
+{
+    return mZ;
+}
+
+inline Vector3 & Vector3::setElem( int idx, float value )
+{
+    *(&mX + idx) = value;
+    return *this;
+}
+
+inline float Vector3::getElem( int idx ) const
+{
+    return *(&mX + idx);
+}
+
+inline float & Vector3::operator []( int idx )
+{
+    return *(&mX + idx);
+}
+
+inline float Vector3::operator []( int idx ) const
+{
+    return *(&mX + idx);
+}
+
+inline const Vector3 Vector3::operator +( const Vector3 & vec ) const
+{
+    return Vector3(
+        ( mX + vec.mX ),
+        ( mY + vec.mY ),
+        ( mZ + vec.mZ )
+    );
+}
+
+inline const Vector3 Vector3::operator -( const Vector3 & vec ) const
+{
+    return Vector3(
+        ( mX - vec.mX ),
+        ( mY - vec.mY ),
+        ( mZ - vec.mZ )
+    );
+}
+
+inline const Point3 Vector3::operator +( const Point3 & pnt ) const
+{
+    return Point3(
+        ( mX + pnt.getX() ),
+        ( mY + pnt.getY() ),
+        ( mZ + pnt.getZ() )
+    );
+}
+
+inline const Vector3 Vector3::operator *( float scalar ) const
+{
+    return Vector3(
+        ( mX * scalar ),
+        ( mY * scalar ),
+        ( mZ * scalar )
+    );
+}
+
+inline Vector3 & Vector3::operator +=( const Vector3 & vec )
+{
+    *this = *this + vec;
+    return *this;
+}
+
+inline Vector3 & Vector3::operator -=( const Vector3 & vec )
+{
+    *this = *this - vec;
+    return *this;
+}
+
+inline Vector3 & Vector3::operator *=( float scalar )
+{
+    *this = *this * scalar;
+    return *this;
+}
+
+inline const Vector3 Vector3::operator /( float scalar ) const
+{
+    return Vector3(
+        ( mX / scalar ),
+        ( mY / scalar ),
+        ( mZ / scalar )
+    );
+}
+
+inline Vector3 & Vector3::operator /=( float scalar )
+{
+    *this = *this / scalar;
+    return *this;
+}
+
+inline const Vector3 Vector3::operator -( ) const
+{
+    return Vector3(
+        -mX,
+        -mY,
+        -mZ
+    );
+}
+
+inline const Vector3 operator *( float scalar, const Vector3 & vec )
+{
+    return vec * scalar;
+}
+
+inline const Vector3 mulPerElem( const Vector3 & vec0, const Vector3 & vec1 )
+{
+    return Vector3(
+        ( vec0.getX() * vec1.getX() ),
+        ( vec0.getY() * vec1.getY() ),
+        ( vec0.getZ() * vec1.getZ() )
+    );
+}
+
+inline const Vector3 divPerElem( const Vector3 & vec0, const Vector3 & vec1 )
+{
+    return Vector3(
+        ( vec0.getX() / vec1.getX() ),
+        ( vec0.getY() / vec1.getY() ),
+        ( vec0.getZ() / vec1.getZ() )
+    );
+}
+
+inline const Vector3 recipPerElem( const Vector3 & vec )
+{
+    return Vector3(
+        ( 1.0f / vec.getX() ),
+        ( 1.0f / vec.getY() ),
+        ( 1.0f / vec.getZ() )
+    );
+}
+
+inline const Vector3 sqrtPerElem( const Vector3 & vec )
+{
+    return Vector3(
+        sqrtf( vec.getX() ),
+        sqrtf( vec.getY() ),
+        sqrtf( vec.getZ() )
+    );
+}
+
+inline const Vector3 rsqrtPerElem( const Vector3 & vec )
+{
+    return Vector3(
+        ( 1.0f / sqrtf( vec.getX() ) ),
+        ( 1.0f / sqrtf( vec.getY() ) ),
+        ( 1.0f / sqrtf( vec.getZ() ) )
+    );
+}
+
+inline const Vector3 absPerElem( const Vector3 & vec )
+{
+    return Vector3(
+        fabsf( vec.getX() ),
+        fabsf( vec.getY() ),
+        fabsf( vec.getZ() )
+    );
+}
+
+inline const Vector3 copySignPerElem( const Vector3 & vec0, const Vector3 & vec1 )
+{
+    return Vector3(
+        ( vec1.getX() < 0.0f )? -fabsf( vec0.getX() ) : fabsf( vec0.getX() ),
+        ( vec1.getY() < 0.0f )? -fabsf( vec0.getY() ) : fabsf( vec0.getY() ),
+        ( vec1.getZ() < 0.0f )? -fabsf( vec0.getZ() ) : fabsf( vec0.getZ() )
+    );
+}
+
+inline const Vector3 maxPerElem( const Vector3 & vec0, const Vector3 & vec1 )
+{
+    return Vector3(
+        (vec0.getX() > vec1.getX())? vec0.getX() : vec1.getX(),
+        (vec0.getY() > vec1.getY())? vec0.getY() : vec1.getY(),
+        (vec0.getZ() > vec1.getZ())? vec0.getZ() : vec1.getZ()
+    );
+}
+
+inline float maxElem( const Vector3 & vec )
+{
+    float result;
+    result = (vec.getX() > vec.getY())? vec.getX() : vec.getY();
+    result = (vec.getZ() > result)? vec.getZ() : result;
+    return result;
+}
+
+inline const Vector3 minPerElem( const Vector3 & vec0, const Vector3 & vec1 )
+{
+    return Vector3(
+        (vec0.getX() < vec1.getX())? vec0.getX() : vec1.getX(),
+        (vec0.getY() < vec1.getY())? vec0.getY() : vec1.getY(),
+        (vec0.getZ() < vec1.getZ())? vec0.getZ() : vec1.getZ()
+    );
+}
+
+inline float minElem( const Vector3 & vec )
+{
+    float result;
+    result = (vec.getX() < vec.getY())? vec.getX() : vec.getY();
+    result = (vec.getZ() < result)? vec.getZ() : result;
+    return result;
+}
+
+inline float sum( const Vector3 & vec )
+{
+    float result;
+    result = ( vec.getX() + vec.getY() );
+    result = ( result + vec.getZ() );
+    return result;
+}
+
+inline float dot( const Vector3 & vec0, const Vector3 & vec1 )
+{
+    float result;
+    result = ( vec0.getX() * vec1.getX() );
+    result = ( result + ( vec0.getY() * vec1.getY() ) );
+    result = ( result + ( vec0.getZ() * vec1.getZ() ) );
+    return result;
+}
+
+inline float lengthSqr( const Vector3 & vec )
+{
+    float result;
+    result = ( vec.getX() * vec.getX() );
+    result = ( result + ( vec.getY() * vec.getY() ) );
+    result = ( result + ( vec.getZ() * vec.getZ() ) );
+    return result;
+}
+
+inline float length( const Vector3 & vec )
+{
+    return ::sqrtf( lengthSqr( vec ) );
+}
+
+inline const Vector3 normalize( const Vector3 & vec )
+{
+    float lenSqr, lenInv;
+    lenSqr = lengthSqr( vec );
+    lenInv = ( 1.0f / sqrtf( lenSqr ) );
+    return Vector3(
+        ( vec.getX() * lenInv ),
+        ( vec.getY() * lenInv ),
+        ( vec.getZ() * lenInv )
+    );
+}
+
+inline const Vector3 cross( const Vector3 & vec0, const Vector3 & vec1 )
+{
+    return Vector3(
+        ( ( vec0.getY() * vec1.getZ() ) - ( vec0.getZ() * vec1.getY() ) ),
+        ( ( vec0.getZ() * vec1.getX() ) - ( vec0.getX() * vec1.getZ() ) ),
+        ( ( vec0.getX() * vec1.getY() ) - ( vec0.getY() * vec1.getX() ) )
+    );
+}
+
+inline const Vector3 select( const Vector3 & vec0, const Vector3 & vec1, bool select1 )
+{
+    return Vector3(
+        ( select1 )? vec1.getX() : vec0.getX(),
+        ( select1 )? vec1.getY() : vec0.getY(),
+        ( select1 )? vec1.getZ() : vec0.getZ()
+    );
+}
+
+#ifdef _VECTORMATH_DEBUG
+
+inline void print( const Vector3 & vec )
+{
+    printf( "( %f %f %f )\n", vec.getX(), vec.getY(), vec.getZ() );
+}
+
+inline void print( const Vector3 & vec, const char * name )
+{
+    printf( "%s: ( %f %f %f )\n", name, vec.getX(), vec.getY(), vec.getZ() );
+}
+
+#endif
+
+inline Vector4::Vector4( const Vector4 & vec )
+{
+    mX = vec.mX;
+    mY = vec.mY;
+    mZ = vec.mZ;
+    mW = vec.mW;
+}
+
+inline Vector4::Vector4( float _x, float _y, float _z, float _w )
+{
+    mX = _x;
+    mY = _y;
+    mZ = _z;
+    mW = _w;
+}
+
+inline Vector4::Vector4( const Vector3 & xyz, float _w )
+{
+    this->setXYZ( xyz );
+    this->setW( _w );
+}
+
+inline Vector4::Vector4( const Vector3 & vec )
+{
+    mX = vec.getX();
+    mY = vec.getY();
+    mZ = vec.getZ();
+    mW = 0.0f;
+}
+
+inline Vector4::Vector4( const Point3 & pnt )
+{
+    mX = pnt.getX();
+    mY = pnt.getY();
+    mZ = pnt.getZ();
+    mW = 1.0f;
+}
+
+inline Vector4::Vector4( const Quat & quat )
+{
+    mX = quat.getX();
+    mY = quat.getY();
+    mZ = quat.getZ();
+    mW = quat.getW();
+}
+
+inline Vector4::Vector4( float scalar )
+{
+    mX = scalar;
+    mY = scalar;
+    mZ = scalar;
+    mW = scalar;
+}
+
+inline const Vector4 Vector4::xAxis( )
+{
+    return Vector4( 1.0f, 0.0f, 0.0f, 0.0f );
+}
+
+inline const Vector4 Vector4::yAxis( )
+{
+    return Vector4( 0.0f, 1.0f, 0.0f, 0.0f );
+}
+
+inline const Vector4 Vector4::zAxis( )
+{
+    return Vector4( 0.0f, 0.0f, 1.0f, 0.0f );
+}
+
+inline const Vector4 Vector4::wAxis( )
+{
+    return Vector4( 0.0f, 0.0f, 0.0f, 1.0f );
+}
+
+inline const Vector4 lerp( float t, const Vector4 & vec0, const Vector4 & vec1 )
+{
+    return ( vec0 + ( ( vec1 - vec0 ) * t ) );
+}
+
+inline const Vector4 slerp( float t, const Vector4 & unitVec0, const Vector4 & unitVec1 )
+{
+    float recipSinAngle, scale0, scale1, cosAngle, angle;
+    cosAngle = dot( unitVec0, unitVec1 );
+    if ( cosAngle < _VECTORMATH_SLERP_TOL ) {
+        angle = acosf( cosAngle );
+        recipSinAngle = ( 1.0f / sinf( angle ) );
+        scale0 = ( sinf( ( ( 1.0f - t ) * angle ) ) * recipSinAngle );
+        scale1 = ( sinf( ( t * angle ) ) * recipSinAngle );
+    } else {
+        scale0 = ( 1.0f - t );
+        scale1 = t;
+    }
+    return ( ( unitVec0 * scale0 ) + ( unitVec1 * scale1 ) );
+}
+
+inline void loadXYZW( Vector4 & vec, const float * fptr )
+{
+    vec = Vector4( fptr[0], fptr[1], fptr[2], fptr[3] );
+}
+
+inline void storeXYZW( const Vector4 & vec, float * fptr )
+{
+    fptr[0] = vec.getX();
+    fptr[1] = vec.getY();
+    fptr[2] = vec.getZ();
+    fptr[3] = vec.getW();
+}
+
+inline void loadHalfFloats( Vector4 & vec, const unsigned short * hfptr )
+{
+    union Data32 {
+        unsigned int u32;
+        float f32;
+    };
+
+    for (int i = 0; i < 4; i++) {
+        unsigned short fp16 = hfptr[i];
+        unsigned int sign = fp16 >> 15;
+        unsigned int exponent = (fp16 >> 10) & ((1 << 5) - 1);
+        unsigned int mantissa = fp16 & ((1 << 10) - 1);
+
+        if (exponent == 0) {
+            // zero
+            mantissa = 0;
+
+        } else if (exponent == 31) {
+            // infinity or nan -> infinity
+            exponent = 255;
+	    mantissa = 0;
+
+        } else {
+            exponent += 127 - 15;
+            mantissa <<= 13;
+        }
+
+        Data32 d;
+        d.u32 = (sign << 31) | (exponent << 23) | mantissa;
+        vec[i] = d.f32;
+    }
+}
+
+inline void storeHalfFloats( const Vector4 & vec, unsigned short * hfptr )
+{
+    union Data32 {
+        unsigned int u32;
+        float f32;
+    };
+
+    for (int i = 0; i < 4; i++) {
+        Data32 d;
+        d.f32 = vec[i];
+
+        unsigned int sign = d.u32 >> 31;
+        unsigned int exponent = (d.u32 >> 23) & ((1 << 8) - 1);
+        unsigned int mantissa = d.u32 & ((1 << 23) - 1);;
+
+        if (exponent == 0) {
+            // zero or denorm -> zero
+            mantissa = 0;
+
+        } else if (exponent == 255 && mantissa != 0) {
+            // nan -> infinity
+            exponent = 31;
+            mantissa = 0;
+
+        } else if (exponent >= 127 - 15 + 31) {
+            // overflow or infinity -> infinity
+            exponent = 31;
+            mantissa = 0;
+
+        } else if (exponent <= 127 - 15) {
+            // underflow -> zero
+            exponent = 0;
+            mantissa = 0;
+
+        } else {
+            exponent -= 127 - 15;
+            mantissa >>= 13;
+        }
+
+        hfptr[i] = (unsigned short)((sign << 15) | (exponent << 10) | mantissa);
+    }
+}
+
+inline Vector4 & Vector4::operator =( const Vector4 & vec )
+{
+    mX = vec.mX;
+    mY = vec.mY;
+    mZ = vec.mZ;
+    mW = vec.mW;
+    return *this;
+}
+
+inline Vector4 & Vector4::setXYZ( const Vector3 & vec )
+{
+    mX = vec.getX();
+    mY = vec.getY();
+    mZ = vec.getZ();
+    return *this;
+}
+
+inline const Vector3 Vector4::getXYZ( ) const
+{
+    return Vector3( mX, mY, mZ );
+}
+
+inline Vector4 & Vector4::setX( float _x )
+{
+    mX = _x;
+    return *this;
+}
+
+inline float Vector4::getX( ) const
+{
+    return mX;
+}
+
+inline Vector4 & Vector4::setY( float _y )
+{
+    mY = _y;
+    return *this;
+}
+
+inline float Vector4::getY( ) const
+{
+    return mY;
+}
+
+inline Vector4 & Vector4::setZ( float _z )
+{
+    mZ = _z;
+    return *this;
+}
+
+inline float Vector4::getZ( ) const
+{
+    return mZ;
+}
+
+inline Vector4 & Vector4::setW( float _w )
+{
+    mW = _w;
+    return *this;
+}
+
+inline float Vector4::getW( ) const
+{
+    return mW;
+}
+
+inline Vector4 & Vector4::setElem( int idx, float value )
+{
+    *(&mX + idx) = value;
+    return *this;
+}
+
+inline float Vector4::getElem( int idx ) const
+{
+    return *(&mX + idx);
+}
+
+inline float & Vector4::operator []( int idx )
+{
+    return *(&mX + idx);
+}
+
+inline float Vector4::operator []( int idx ) const
+{
+    return *(&mX + idx);
+}
+
+inline const Vector4 Vector4::operator +( const Vector4 & vec ) const
+{
+    return Vector4(
+        ( mX + vec.mX ),
+        ( mY + vec.mY ),
+        ( mZ + vec.mZ ),
+        ( mW + vec.mW )
+    );
+}
+
+inline const Vector4 Vector4::operator -( const Vector4 & vec ) const
+{
+    return Vector4(
+        ( mX - vec.mX ),
+        ( mY - vec.mY ),
+        ( mZ - vec.mZ ),
+        ( mW - vec.mW )
+    );
+}
+
+inline const Vector4 Vector4::operator *( float scalar ) const
+{
+    return Vector4(
+        ( mX * scalar ),
+        ( mY * scalar ),
+        ( mZ * scalar ),
+        ( mW * scalar )
+    );
+}
+
+inline Vector4 & Vector4::operator +=( const Vector4 & vec )
+{
+    *this = *this + vec;
+    return *this;
+}
+
+inline Vector4 & Vector4::operator -=( const Vector4 & vec )
+{
+    *this = *this - vec;
+    return *this;
+}
+
+inline Vector4 & Vector4::operator *=( float scalar )
+{
+    *this = *this * scalar;
+    return *this;
+}
+
+inline const Vector4 Vector4::operator /( float scalar ) const
+{
+    return Vector4(
+        ( mX / scalar ),
+        ( mY / scalar ),
+        ( mZ / scalar ),
+        ( mW / scalar )
+    );
+}
+
+inline Vector4 & Vector4::operator /=( float scalar )
+{
+    *this = *this / scalar;
+    return *this;
+}
+
+inline const Vector4 Vector4::operator -( ) const
+{
+    return Vector4(
+        -mX,
+        -mY,
+        -mZ,
+        -mW
+    );
+}
+
+inline const Vector4 operator *( float scalar, const Vector4 & vec )
+{
+    return vec * scalar;
+}
+
+inline const Vector4 mulPerElem( const Vector4 & vec0, const Vector4 & vec1 )
+{
+    return Vector4(
+        ( vec0.getX() * vec1.getX() ),
+        ( vec0.getY() * vec1.getY() ),
+        ( vec0.getZ() * vec1.getZ() ),
+        ( vec0.getW() * vec1.getW() )
+    );
+}
+
+inline const Vector4 divPerElem( const Vector4 & vec0, const Vector4 & vec1 )
+{
+    return Vector4(
+        ( vec0.getX() / vec1.getX() ),
+        ( vec0.getY() / vec1.getY() ),
+        ( vec0.getZ() / vec1.getZ() ),
+        ( vec0.getW() / vec1.getW() )
+    );
+}
+
+inline const Vector4 recipPerElem( const Vector4 & vec )
+{
+    return Vector4(
+        ( 1.0f / vec.getX() ),
+        ( 1.0f / vec.getY() ),
+        ( 1.0f / vec.getZ() ),
+        ( 1.0f / vec.getW() )
+    );
+}
+
+inline const Vector4 sqrtPerElem( const Vector4 & vec )
+{
+    return Vector4(
+        sqrtf( vec.getX() ),
+        sqrtf( vec.getY() ),
+        sqrtf( vec.getZ() ),
+        sqrtf( vec.getW() )
+    );
+}
+
+inline const Vector4 rsqrtPerElem( const Vector4 & vec )
+{
+    return Vector4(
+        ( 1.0f / sqrtf( vec.getX() ) ),
+        ( 1.0f / sqrtf( vec.getY() ) ),
+        ( 1.0f / sqrtf( vec.getZ() ) ),
+        ( 1.0f / sqrtf( vec.getW() ) )
+    );
+}
+
+inline const Vector4 absPerElem( const Vector4 & vec )
+{
+    return Vector4(
+        fabsf( vec.getX() ),
+        fabsf( vec.getY() ),
+        fabsf( vec.getZ() ),
+        fabsf( vec.getW() )
+    );
+}
+
+inline const Vector4 copySignPerElem( const Vector4 & vec0, const Vector4 & vec1 )
+{
+    return Vector4(
+        ( vec1.getX() < 0.0f )? -fabsf( vec0.getX() ) : fabsf( vec0.getX() ),
+        ( vec1.getY() < 0.0f )? -fabsf( vec0.getY() ) : fabsf( vec0.getY() ),
+        ( vec1.getZ() < 0.0f )? -fabsf( vec0.getZ() ) : fabsf( vec0.getZ() ),
+        ( vec1.getW() < 0.0f )? -fabsf( vec0.getW() ) : fabsf( vec0.getW() )
+    );
+}
+
+inline const Vector4 maxPerElem( const Vector4 & vec0, const Vector4 & vec1 )
+{
+    return Vector4(
+        (vec0.getX() > vec1.getX())? vec0.getX() : vec1.getX(),
+        (vec0.getY() > vec1.getY())? vec0.getY() : vec1.getY(),
+        (vec0.getZ() > vec1.getZ())? vec0.getZ() : vec1.getZ(),
+        (vec0.getW() > vec1.getW())? vec0.getW() : vec1.getW()
+    );
+}
+
+inline float maxElem( const Vector4 & vec )
+{
+    float result;
+    result = (vec.getX() > vec.getY())? vec.getX() : vec.getY();
+    result = (vec.getZ() > result)? vec.getZ() : result;
+    result = (vec.getW() > result)? vec.getW() : result;
+    return result;
+}
+
+inline const Vector4 minPerElem( const Vector4 & vec0, const Vector4 & vec1 )
+{
+    return Vector4(
+        (vec0.getX() < vec1.getX())? vec0.getX() : vec1.getX(),
+        (vec0.getY() < vec1.getY())? vec0.getY() : vec1.getY(),
+        (vec0.getZ() < vec1.getZ())? vec0.getZ() : vec1.getZ(),
+        (vec0.getW() < vec1.getW())? vec0.getW() : vec1.getW()
+    );
+}
+
+inline float minElem( const Vector4 & vec )
+{
+    float result;
+    result = (vec.getX() < vec.getY())? vec.getX() : vec.getY();
+    result = (vec.getZ() < result)? vec.getZ() : result;
+    result = (vec.getW() < result)? vec.getW() : result;
+    return result;
+}
+
+inline float sum( const Vector4 & vec )
+{
+    float result;
+    result = ( vec.getX() + vec.getY() );
+    result = ( result + vec.getZ() );
+    result = ( result + vec.getW() );
+    return result;
+}
+
+inline float dot( const Vector4 & vec0, const Vector4 & vec1 )
+{
+    float result;
+    result = ( vec0.getX() * vec1.getX() );
+    result = ( result + ( vec0.getY() * vec1.getY() ) );
+    result = ( result + ( vec0.getZ() * vec1.getZ() ) );
+    result = ( result + ( vec0.getW() * vec1.getW() ) );
+    return result;
+}
+
+inline float lengthSqr( const Vector4 & vec )
+{
+    float result;
+    result = ( vec.getX() * vec.getX() );
+    result = ( result + ( vec.getY() * vec.getY() ) );
+    result = ( result + ( vec.getZ() * vec.getZ() ) );
+    result = ( result + ( vec.getW() * vec.getW() ) );
+    return result;
+}
+
+inline float length( const Vector4 & vec )
+{
+    return ::sqrtf( lengthSqr( vec ) );
+}
+
+inline const Vector4 normalize( const Vector4 & vec )
+{
+    float lenSqr, lenInv;
+    lenSqr = lengthSqr( vec );
+    lenInv = ( 1.0f / sqrtf( lenSqr ) );
+    return Vector4(
+        ( vec.getX() * lenInv ),
+        ( vec.getY() * lenInv ),
+        ( vec.getZ() * lenInv ),
+        ( vec.getW() * lenInv )
+    );
+}
+
+inline const Vector4 select( const Vector4 & vec0, const Vector4 & vec1, bool select1 )
+{
+    return Vector4(
+        ( select1 )? vec1.getX() : vec0.getX(),
+        ( select1 )? vec1.getY() : vec0.getY(),
+        ( select1 )? vec1.getZ() : vec0.getZ(),
+        ( select1 )? vec1.getW() : vec0.getW()
+    );
+}
+
+#ifdef _VECTORMATH_DEBUG
+
+inline void print( const Vector4 & vec )
+{
+    printf( "( %f %f %f %f )\n", vec.getX(), vec.getY(), vec.getZ(), vec.getW() );
+}
+
+inline void print( const Vector4 & vec, const char * name )
+{
+    printf( "%s: ( %f %f %f %f )\n", name, vec.getX(), vec.getY(), vec.getZ(), vec.getW() );
+}
+
+#endif
+
+inline Point3::Point3( const Point3 & pnt )
+{
+    mX = pnt.mX;
+    mY = pnt.mY;
+    mZ = pnt.mZ;
+}
+
+inline Point3::Point3( float _x, float _y, float _z )
+{
+    mX = _x;
+    mY = _y;
+    mZ = _z;
+}
+
+inline Point3::Point3( const Vector3 & vec )
+{
+    mX = vec.getX();
+    mY = vec.getY();
+    mZ = vec.getZ();
+}
+
+inline Point3::Point3( float scalar )
+{
+    mX = scalar;
+    mY = scalar;
+    mZ = scalar;
+}
+
+inline const Point3 lerp( float t, const Point3 & pnt0, const Point3 & pnt1 )
+{
+    return ( pnt0 + ( ( pnt1 - pnt0 ) * t ) );
+}
+
+inline void loadXYZ( Point3 & pnt, const float * fptr )
+{
+    pnt = Point3( fptr[0], fptr[1], fptr[2] );
+}
+
+inline void storeXYZ( const Point3 & pnt, float * fptr )
+{
+    fptr[0] = pnt.getX();
+    fptr[1] = pnt.getY();
+    fptr[2] = pnt.getZ();
+}
+
+inline void loadHalfFloats( Point3 & vec, const unsigned short * hfptr )
+{
+    union Data32 {
+        unsigned int u32;
+        float f32;
+    };
+
+    for (int i = 0; i < 3; i++) {
+        unsigned short fp16 = hfptr[i];
+        unsigned int sign = fp16 >> 15;
+        unsigned int exponent = (fp16 >> 10) & ((1 << 5) - 1);
+        unsigned int mantissa = fp16 & ((1 << 10) - 1);
+
+        if (exponent == 0) {
+            // zero
+            mantissa = 0;
+
+        } else if (exponent == 31) {
+            // infinity or nan -> infinity
+            exponent = 255;
+	    mantissa = 0;
+
+        } else {
+            exponent += 127 - 15;
+            mantissa <<= 13;
+        }
+
+        Data32 d;
+        d.u32 = (sign << 31) | (exponent << 23) | mantissa;
+        vec[i] = d.f32;
+    }
+}
+
+inline void storeHalfFloats( const Point3 & vec, unsigned short * hfptr )
+{
+    union Data32 {
+        unsigned int u32;
+        float f32;
+    };
+
+    for (int i = 0; i < 3; i++) {
+        Data32 d;
+        d.f32 = vec[i];
+
+        unsigned int sign = d.u32 >> 31;
+        unsigned int exponent = (d.u32 >> 23) & ((1 << 8) - 1);
+        unsigned int mantissa = d.u32 & ((1 << 23) - 1);;
+
+        if (exponent == 0) {
+            // zero or denorm -> zero
+            mantissa = 0;
+
+        } else if (exponent == 255 && mantissa != 0) {
+            // nan -> infinity
+            exponent = 31;
+            mantissa = 0;
+
+        } else if (exponent >= 127 - 15 + 31) {
+            // overflow or infinity -> infinity
+            exponent = 31;
+            mantissa = 0;
+
+        } else if (exponent <= 127 - 15) {
+            // underflow -> zero
+            exponent = 0;
+            mantissa = 0;
+
+        } else {
+            exponent -= 127 - 15;
+            mantissa >>= 13;
+        }
+
+        hfptr[i] = (unsigned short)((sign << 15) | (exponent << 10) | mantissa);
+    }
+}
+
+inline Point3 & Point3::operator =( const Point3 & pnt )
+{
+    mX = pnt.mX;
+    mY = pnt.mY;
+    mZ = pnt.mZ;
+    return *this;
+}
+
+inline Point3 & Point3::setX( float _x )
+{
+    mX = _x;
+    return *this;
+}
+
+inline float Point3::getX( ) const
+{
+    return mX;
+}
+
+inline Point3 & Point3::setY( float _y )
+{
+    mY = _y;
+    return *this;
+}
+
+inline float Point3::getY( ) const
+{
+    return mY;
+}
+
+inline Point3 & Point3::setZ( float _z )
+{
+    mZ = _z;
+    return *this;
+}
+
+inline float Point3::getZ( ) const
+{
+    return mZ;
+}
+
+inline Point3 & Point3::setElem( int idx, float value )
+{
+    *(&mX + idx) = value;
+    return *this;
+}
+
+inline float Point3::getElem( int idx ) const
+{
+    return *(&mX + idx);
+}
+
+inline float & Point3::operator []( int idx )
+{
+    return *(&mX + idx);
+}
+
+inline float Point3::operator []( int idx ) const
+{
+    return *(&mX + idx);
+}
+
+inline const Vector3 Point3::operator -( const Point3 & pnt ) const
+{
+    return Vector3(
+        ( mX - pnt.mX ),
+        ( mY - pnt.mY ),
+        ( mZ - pnt.mZ )
+    );
+}
+
+inline const Point3 Point3::operator +( const Vector3 & vec ) const
+{
+    return Point3(
+        ( mX + vec.getX() ),
+        ( mY + vec.getY() ),
+        ( mZ + vec.getZ() )
+    );
+}
+
+inline const Point3 Point3::operator -( const Vector3 & vec ) const
+{
+    return Point3(
+        ( mX - vec.getX() ),
+        ( mY - vec.getY() ),
+        ( mZ - vec.getZ() )
+    );
+}
+
+inline Point3 & Point3::operator +=( const Vector3 & vec )
+{
+    *this = *this + vec;
+    return *this;
+}
+
+inline Point3 & Point3::operator -=( const Vector3 & vec )
+{
+    *this = *this - vec;
+    return *this;
+}
+
+inline const Point3 mulPerElem( const Point3 & pnt0, const Point3 & pnt1 )
+{
+    return Point3(
+        ( pnt0.getX() * pnt1.getX() ),
+        ( pnt0.getY() * pnt1.getY() ),
+        ( pnt0.getZ() * pnt1.getZ() )
+    );
+}
+
+inline const Point3 divPerElem( const Point3 & pnt0, const Point3 & pnt1 )
+{
+    return Point3(
+        ( pnt0.getX() / pnt1.getX() ),
+        ( pnt0.getY() / pnt1.getY() ),
+        ( pnt0.getZ() / pnt1.getZ() )
+    );
+}
+
+inline const Point3 recipPerElem( const Point3 & pnt )
+{
+    return Point3(
+        ( 1.0f / pnt.getX() ),
+        ( 1.0f / pnt.getY() ),
+        ( 1.0f / pnt.getZ() )
+    );
+}
+
+inline const Point3 sqrtPerElem( const Point3 & pnt )
+{
+    return Point3(
+        sqrtf( pnt.getX() ),
+        sqrtf( pnt.getY() ),
+        sqrtf( pnt.getZ() )
+    );
+}
+
+inline const Point3 rsqrtPerElem( const Point3 & pnt )
+{
+    return Point3(
+        ( 1.0f / sqrtf( pnt.getX() ) ),
+        ( 1.0f / sqrtf( pnt.getY() ) ),
+        ( 1.0f / sqrtf( pnt.getZ() ) )
+    );
+}
+
+inline const Point3 absPerElem( const Point3 & pnt )
+{
+    return Point3(
+        fabsf( pnt.getX() ),
+        fabsf( pnt.getY() ),
+        fabsf( pnt.getZ() )
+    );
+}
+
+inline const Point3 copySignPerElem( const Point3 & pnt0, const Point3 & pnt1 )
+{
+    return Point3(
+        ( pnt1.getX() < 0.0f )? -fabsf( pnt0.getX() ) : fabsf( pnt0.getX() ),
+        ( pnt1.getY() < 0.0f )? -fabsf( pnt0.getY() ) : fabsf( pnt0.getY() ),
+        ( pnt1.getZ() < 0.0f )? -fabsf( pnt0.getZ() ) : fabsf( pnt0.getZ() )
+    );
+}
+
+inline const Point3 maxPerElem( const Point3 & pnt0, const Point3 & pnt1 )
+{
+    return Point3(
+        (pnt0.getX() > pnt1.getX())? pnt0.getX() : pnt1.getX(),
+        (pnt0.getY() > pnt1.getY())? pnt0.getY() : pnt1.getY(),
+        (pnt0.getZ() > pnt1.getZ())? pnt0.getZ() : pnt1.getZ()
+    );
+}
+
+inline float maxElem( const Point3 & pnt )
+{
+    float result;
+    result = (pnt.getX() > pnt.getY())? pnt.getX() : pnt.getY();
+    result = (pnt.getZ() > result)? pnt.getZ() : result;
+    return result;
+}
+
+inline const Point3 minPerElem( const Point3 & pnt0, const Point3 & pnt1 )
+{
+    return Point3(
+        (pnt0.getX() < pnt1.getX())? pnt0.getX() : pnt1.getX(),
+        (pnt0.getY() < pnt1.getY())? pnt0.getY() : pnt1.getY(),
+        (pnt0.getZ() < pnt1.getZ())? pnt0.getZ() : pnt1.getZ()
+    );
+}
+
+inline float minElem( const Point3 & pnt )
+{
+    float result;
+    result = (pnt.getX() < pnt.getY())? pnt.getX() : pnt.getY();
+    result = (pnt.getZ() < result)? pnt.getZ() : result;
+    return result;
+}
+
+inline float sum( const Point3 & pnt )
+{
+    float result;
+    result = ( pnt.getX() + pnt.getY() );
+    result = ( result + pnt.getZ() );
+    return result;
+}
+
+inline const Point3 scale( const Point3 & pnt, float scaleVal )
+{
+    return mulPerElem( pnt, Point3( scaleVal ) );
+}
+
+inline const Point3 scale( const Point3 & pnt, const Vector3 & scaleVec )
+{
+    return mulPerElem( pnt, Point3( scaleVec ) );
+}
+
+inline float projection( const Point3 & pnt, const Vector3 & unitVec )
+{
+    float result;
+    result = ( pnt.getX() * unitVec.getX() );
+    result = ( result + ( pnt.getY() * unitVec.getY() ) );
+    result = ( result + ( pnt.getZ() * unitVec.getZ() ) );
+    return result;
+}
+
+inline float distSqrFromOrigin( const Point3 & pnt )
+{
+    return lengthSqr( Vector3( pnt ) );
+}
+
+inline float distFromOrigin( const Point3 & pnt )
+{
+    return length( Vector3( pnt ) );
+}
+
+inline float distSqr( const Point3 & pnt0, const Point3 & pnt1 )
+{
+    return lengthSqr( ( pnt1 - pnt0 ) );
+}
+
+inline float dist( const Point3 & pnt0, const Point3 & pnt1 )
+{
+    return length( ( pnt1 - pnt0 ) );
+}
+
+inline const Point3 select( const Point3 & pnt0, const Point3 & pnt1, bool select1 )
+{
+    return Point3(
+        ( select1 )? pnt1.getX() : pnt0.getX(),
+        ( select1 )? pnt1.getY() : pnt0.getY(),
+        ( select1 )? pnt1.getZ() : pnt0.getZ()
+    );
+}
+
+#ifdef _VECTORMATH_DEBUG
+
+inline void print( const Point3 & pnt )
+{
+    printf( "( %f %f %f )\n", pnt.getX(), pnt.getY(), pnt.getZ() );
+}
+
+inline void print( const Point3 & pnt, const char * name )
+{
+    printf( "%s: ( %f %f %f )\n", name, pnt.getX(), pnt.getY(), pnt.getZ() );
+}
+
+#endif
+
+} // namespace Aos
+} // namespace Vectormath
+
+#endif
+
diff --git a/src/vectormath/neon/vectormath_aos.h b/src/vectormath/neon/vectormath_aos.h
new file mode 100644
index 000000000..97bdc278a
--- /dev/null
+++ b/src/vectormath/neon/vectormath_aos.h
@@ -0,0 +1,1890 @@
+/*
+   Copyright (C) 2009 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+ 
+This source version has been altered.
+
+*/
+
+#ifndef _VECTORMATH_AOS_CPP_H
+#define _VECTORMATH_AOS_CPP_H
+
+#include <math.h>
+
+#ifdef _VECTORMATH_DEBUG
+#include <stdio.h>
+#endif
+
+namespace Vectormath {
+
+namespace Aos {
+
+//-----------------------------------------------------------------------------
+// Forward Declarations
+//
+
+class Vector3;
+class Vector4;
+class Point3;
+class Quat;
+class Matrix3;
+class Matrix4;
+class Transform3;
+
+// A 3-D vector in array-of-structures format
+//
+class Vector3
+{
+    float mX;
+    float mY;
+    float mZ;
+#ifndef __GNUC__
+    float d;
+#endif
+
+public:
+    // Default constructor; does no initialization
+    // 
+    inline Vector3( ) { };
+
+    // Copy a 3-D vector
+    // 
+    inline Vector3( const Vector3 & vec );
+
+    // Construct a 3-D vector from x, y, and z elements
+    // 
+    inline Vector3( float x, float y, float z );
+
+    // Copy elements from a 3-D point into a 3-D vector
+    // 
+    explicit inline Vector3( const Point3 & pnt );
+
+    // Set all elements of a 3-D vector to the same scalar value
+    // 
+    explicit inline Vector3( float scalar );
+
+    // Assign one 3-D vector to another
+    // 
+    inline Vector3 & operator =( const Vector3 & vec );
+
+    // Set the x element of a 3-D vector
+    // 
+    inline Vector3 & setX( float x );
+
+    // Set the y element of a 3-D vector
+    // 
+    inline Vector3 & setY( float y );
+
+    // Set the z element of a 3-D vector
+    // 
+    inline Vector3 & setZ( float z );
+
+    // Get the x element of a 3-D vector
+    // 
+    inline float getX( ) const;
+
+    // Get the y element of a 3-D vector
+    // 
+    inline float getY( ) const;
+
+    // Get the z element of a 3-D vector
+    // 
+    inline float getZ( ) const;
+
+    // Set an x, y, or z element of a 3-D vector by index
+    // 
+    inline Vector3 & setElem( int idx, float value );
+
+    // Get an x, y, or z element of a 3-D vector by index
+    // 
+    inline float getElem( int idx ) const;
+
+    // Subscripting operator to set or get an element
+    // 
+    inline float & operator []( int idx );
+
+    // Subscripting operator to get an element
+    // 
+    inline float operator []( int idx ) const;
+
+    // Add two 3-D vectors
+    // 
+    inline const Vector3 operator +( const Vector3 & vec ) const;
+
+    // Subtract a 3-D vector from another 3-D vector
+    // 
+    inline const Vector3 operator -( const Vector3 & vec ) const;
+
+    // Add a 3-D vector to a 3-D point
+    // 
+    inline const Point3 operator +( const Point3 & pnt ) const;
+
+    // Multiply a 3-D vector by a scalar
+    // 
+    inline const Vector3 operator *( float scalar ) const;
+
+    // Divide a 3-D vector by a scalar
+    // 
+    inline const Vector3 operator /( float scalar ) const;
+
+    // Perform compound assignment and addition with a 3-D vector
+    // 
+    inline Vector3 & operator +=( const Vector3 & vec );
+
+    // Perform compound assignment and subtraction by a 3-D vector
+    // 
+    inline Vector3 & operator -=( const Vector3 & vec );
+
+    // Perform compound assignment and multiplication by a scalar
+    // 
+    inline Vector3 & operator *=( float scalar );
+
+    // Perform compound assignment and division by a scalar
+    // 
+    inline Vector3 & operator /=( float scalar );
+
+    // Negate all elements of a 3-D vector
+    // 
+    inline const Vector3 operator -( ) const;
+
+    // Construct x axis
+    // 
+    static inline const Vector3 xAxis( );
+
+    // Construct y axis
+    // 
+    static inline const Vector3 yAxis( );
+
+    // Construct z axis
+    // 
+    static inline const Vector3 zAxis( );
+
+}
+#ifdef __GNUC__
+__attribute__ ((aligned(16)))
+#endif
+;
+
+// Multiply a 3-D vector by a scalar
+// 
+inline const Vector3 operator *( float scalar, const Vector3 & vec );
+
+// Multiply two 3-D vectors per element
+// 
+inline const Vector3 mulPerElem( const Vector3 & vec0, const Vector3 & vec1 );
+
+// Divide two 3-D vectors per element
+// NOTE: 
+// Floating-point behavior matches standard library function divf4.
+// 
+inline const Vector3 divPerElem( const Vector3 & vec0, const Vector3 & vec1 );
+
+// Compute the reciprocal of a 3-D vector per element
+// NOTE: 
+// Floating-point behavior matches standard library function recipf4.
+// 
+inline const Vector3 recipPerElem( const Vector3 & vec );
+
+// Compute the square root of a 3-D vector per element
+// NOTE: 
+// Floating-point behavior matches standard library function sqrtf4.
+// 
+inline const Vector3 sqrtPerElem( const Vector3 & vec );
+
+// Compute the reciprocal square root of a 3-D vector per element
+// NOTE: 
+// Floating-point behavior matches standard library function rsqrtf4.
+// 
+inline const Vector3 rsqrtPerElem( const Vector3 & vec );
+
+// Compute the absolute value of a 3-D vector per element
+// 
+inline const Vector3 absPerElem( const Vector3 & vec );
+
+// Copy sign from one 3-D vector to another, per element
+// 
+inline const Vector3 copySignPerElem( const Vector3 & vec0, const Vector3 & vec1 );
+
+// Maximum of two 3-D vectors per element
+// 
+inline const Vector3 maxPerElem( const Vector3 & vec0, const Vector3 & vec1 );
+
+// Minimum of two 3-D vectors per element
+// 
+inline const Vector3 minPerElem( const Vector3 & vec0, const Vector3 & vec1 );
+
+// Maximum element of a 3-D vector
+// 
+inline float maxElem( const Vector3 & vec );
+
+// Minimum element of a 3-D vector
+// 
+inline float minElem( const Vector3 & vec );
+
+// Compute the sum of all elements of a 3-D vector
+// 
+inline float sum( const Vector3 & vec );
+
+// Compute the dot product of two 3-D vectors
+// 
+inline float dot( const Vector3 & vec0, const Vector3 & vec1 );
+
+// Compute the square of the length of a 3-D vector
+// 
+inline float lengthSqr( const Vector3 & vec );
+
+// Compute the length of a 3-D vector
+// 
+inline float length( const Vector3 & vec );
+
+// Normalize a 3-D vector
+// NOTE: 
+// The result is unpredictable when all elements of vec are at or near zero.
+// 
+inline const Vector3 normalize( const Vector3 & vec );
+
+// Compute cross product of two 3-D vectors
+// 
+inline const Vector3 cross( const Vector3 & vec0, const Vector3 & vec1 );
+
+// Outer product of two 3-D vectors
+// 
+inline const Matrix3 outer( const Vector3 & vec0, const Vector3 & vec1 );
+
+// Pre-multiply a row vector by a 3x3 matrix
+// 
+inline const Vector3 rowMul( const Vector3 & vec, const Matrix3 & mat );
+
+// Cross-product matrix of a 3-D vector
+// 
+inline const Matrix3 crossMatrix( const Vector3 & vec );
+
+// Create cross-product matrix and multiply
+// NOTE: 
+// Faster than separately creating a cross-product matrix and multiplying.
+// 
+inline const Matrix3 crossMatrixMul( const Vector3 & vec, const Matrix3 & mat );
+
+// Linear interpolation between two 3-D vectors
+// NOTE: 
+// Does not clamp t between 0 and 1.
+// 
+inline const Vector3 lerp( float t, const Vector3 & vec0, const Vector3 & vec1 );
+
+// Spherical linear interpolation between two 3-D vectors
+// NOTE: 
+// The result is unpredictable if the vectors point in opposite directions.
+// Does not clamp t between 0 and 1.
+// 
+inline const Vector3 slerp( float t, const Vector3 & unitVec0, const Vector3 & unitVec1 );
+
+// Conditionally select between two 3-D vectors
+// 
+inline const Vector3 select( const Vector3 & vec0, const Vector3 & vec1, bool select1 );
+
+// Load x, y, and z elements from the first three words of a float array.
+// 
+// 
+inline void loadXYZ( Vector3 & vec, const float * fptr );
+
+// Store x, y, and z elements of a 3-D vector in the first three words of a float array.
+// Memory area of previous 16 bytes and next 32 bytes from fptr might be accessed
+// 
+inline void storeXYZ( const Vector3 & vec, float * fptr );
+
+// Load three-half-floats as a 3-D vector
+// NOTE: 
+// This transformation does not support either denormalized numbers or NaNs.
+// 
+inline void loadHalfFloats( Vector3 & vec, const unsigned short * hfptr );
+
+// Store a 3-D vector as half-floats. Memory area of previous 16 bytes and next 32 bytes from <code><i>hfptr</i></code> might be accessed.
+// NOTE: 
+// This transformation does not support either denormalized numbers or NaNs. Memory area of previous 16 bytes and next 32 bytes from hfptr might be accessed.
+// 
+inline void storeHalfFloats( const Vector3 & vec, unsigned short * hfptr );
+
+#ifdef _VECTORMATH_DEBUG
+
+// Print a 3-D vector
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+inline void print( const Vector3 & vec );
+
+// Print a 3-D vector and an associated string identifier
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+inline void print( const Vector3 & vec, const char * name );
+
+#endif
+
+// A 4-D vector in array-of-structures format
+//
+class Vector4
+{
+    float mX;
+    float mY;
+    float mZ;
+    float mW;
+
+public:
+    // Default constructor; does no initialization
+    // 
+    inline Vector4( ) { };
+
+    // Copy a 4-D vector
+    // 
+    inline Vector4( const Vector4 & vec );
+
+    // Construct a 4-D vector from x, y, z, and w elements
+    // 
+    inline Vector4( float x, float y, float z, float w );
+
+    // Construct a 4-D vector from a 3-D vector and a scalar
+    // 
+    inline Vector4( const Vector3 & xyz, float w );
+
+    // Copy x, y, and z from a 3-D vector into a 4-D vector, and set w to 0
+    // 
+    explicit inline Vector4( const Vector3 & vec );
+
+    // Copy x, y, and z from a 3-D point into a 4-D vector, and set w to 1
+    // 
+    explicit inline Vector4( const Point3 & pnt );
+
+    // Copy elements from a quaternion into a 4-D vector
+    // 
+    explicit inline Vector4( const Quat & quat );
+
+    // Set all elements of a 4-D vector to the same scalar value
+    // 
+    explicit inline Vector4( float scalar );
+
+    // Assign one 4-D vector to another
+    // 
+    inline Vector4 & operator =( const Vector4 & vec );
+
+    // Set the x, y, and z elements of a 4-D vector
+    // NOTE: 
+    // This function does not change the w element.
+    // 
+    inline Vector4 & setXYZ( const Vector3 & vec );
+
+    // Get the x, y, and z elements of a 4-D vector
+    // 
+    inline const Vector3 getXYZ( ) const;
+
+    // Set the x element of a 4-D vector
+    // 
+    inline Vector4 & setX( float x );
+
+    // Set the y element of a 4-D vector
+    // 
+    inline Vector4 & setY( float y );
+
+    // Set the z element of a 4-D vector
+    // 
+    inline Vector4 & setZ( float z );
+
+    // Set the w element of a 4-D vector
+    // 
+    inline Vector4 & setW( float w );
+
+    // Get the x element of a 4-D vector
+    // 
+    inline float getX( ) const;
+
+    // Get the y element of a 4-D vector
+    // 
+    inline float getY( ) const;
+
+    // Get the z element of a 4-D vector
+    // 
+    inline float getZ( ) const;
+
+    // Get the w element of a 4-D vector
+    // 
+    inline float getW( ) const;
+
+    // Set an x, y, z, or w element of a 4-D vector by index
+    // 
+    inline Vector4 & setElem( int idx, float value );
+
+    // Get an x, y, z, or w element of a 4-D vector by index
+    // 
+    inline float getElem( int idx ) const;
+
+    // Subscripting operator to set or get an element
+    // 
+    inline float & operator []( int idx );
+
+    // Subscripting operator to get an element
+    // 
+    inline float operator []( int idx ) const;
+
+    // Add two 4-D vectors
+    // 
+    inline const Vector4 operator +( const Vector4 & vec ) const;
+
+    // Subtract a 4-D vector from another 4-D vector
+    // 
+    inline const Vector4 operator -( const Vector4 & vec ) const;
+
+    // Multiply a 4-D vector by a scalar
+    // 
+    inline const Vector4 operator *( float scalar ) const;
+
+    // Divide a 4-D vector by a scalar
+    // 
+    inline const Vector4 operator /( float scalar ) const;
+
+    // Perform compound assignment and addition with a 4-D vector
+    // 
+    inline Vector4 & operator +=( const Vector4 & vec );
+
+    // Perform compound assignment and subtraction by a 4-D vector
+    // 
+    inline Vector4 & operator -=( const Vector4 & vec );
+
+    // Perform compound assignment and multiplication by a scalar
+    // 
+    inline Vector4 & operator *=( float scalar );
+
+    // Perform compound assignment and division by a scalar
+    // 
+    inline Vector4 & operator /=( float scalar );
+
+    // Negate all elements of a 4-D vector
+    // 
+    inline const Vector4 operator -( ) const;
+
+    // Construct x axis
+    // 
+    static inline const Vector4 xAxis( );
+
+    // Construct y axis
+    // 
+    static inline const Vector4 yAxis( );
+
+    // Construct z axis
+    // 
+    static inline const Vector4 zAxis( );
+
+    // Construct w axis
+    // 
+    static inline const Vector4 wAxis( );
+
+}
+#ifdef __GNUC__
+__attribute__ ((aligned(16)))
+#endif
+;
+
+// Multiply a 4-D vector by a scalar
+// 
+inline const Vector4 operator *( float scalar, const Vector4 & vec );
+
+// Multiply two 4-D vectors per element
+// 
+inline const Vector4 mulPerElem( const Vector4 & vec0, const Vector4 & vec1 );
+
+// Divide two 4-D vectors per element
+// NOTE: 
+// Floating-point behavior matches standard library function divf4.
+// 
+inline const Vector4 divPerElem( const Vector4 & vec0, const Vector4 & vec1 );
+
+// Compute the reciprocal of a 4-D vector per element
+// NOTE: 
+// Floating-point behavior matches standard library function recipf4.
+// 
+inline const Vector4 recipPerElem( const Vector4 & vec );
+
+// Compute the square root of a 4-D vector per element
+// NOTE: 
+// Floating-point behavior matches standard library function sqrtf4.
+// 
+inline const Vector4 sqrtPerElem( const Vector4 & vec );
+
+// Compute the reciprocal square root of a 4-D vector per element
+// NOTE: 
+// Floating-point behavior matches standard library function rsqrtf4.
+// 
+inline const Vector4 rsqrtPerElem( const Vector4 & vec );
+
+// Compute the absolute value of a 4-D vector per element
+// 
+inline const Vector4 absPerElem( const Vector4 & vec );
+
+// Copy sign from one 4-D vector to another, per element
+// 
+inline const Vector4 copySignPerElem( const Vector4 & vec0, const Vector4 & vec1 );
+
+// Maximum of two 4-D vectors per element
+// 
+inline const Vector4 maxPerElem( const Vector4 & vec0, const Vector4 & vec1 );
+
+// Minimum of two 4-D vectors per element
+// 
+inline const Vector4 minPerElem( const Vector4 & vec0, const Vector4 & vec1 );
+
+// Maximum element of a 4-D vector
+// 
+inline float maxElem( const Vector4 & vec );
+
+// Minimum element of a 4-D vector
+// 
+inline float minElem( const Vector4 & vec );
+
+// Compute the sum of all elements of a 4-D vector
+// 
+inline float sum( const Vector4 & vec );
+
+// Compute the dot product of two 4-D vectors
+// 
+inline float dot( const Vector4 & vec0, const Vector4 & vec1 );
+
+// Compute the square of the length of a 4-D vector
+// 
+inline float lengthSqr( const Vector4 & vec );
+
+// Compute the length of a 4-D vector
+// 
+inline float length( const Vector4 & vec );
+
+// Normalize a 4-D vector
+// NOTE: 
+// The result is unpredictable when all elements of vec are at or near zero.
+// 
+inline const Vector4 normalize( const Vector4 & vec );
+
+// Outer product of two 4-D vectors
+// 
+inline const Matrix4 outer( const Vector4 & vec0, const Vector4 & vec1 );
+
+// Linear interpolation between two 4-D vectors
+// NOTE: 
+// Does not clamp t between 0 and 1.
+// 
+inline const Vector4 lerp( float t, const Vector4 & vec0, const Vector4 & vec1 );
+
+// Spherical linear interpolation between two 4-D vectors
+// NOTE: 
+// The result is unpredictable if the vectors point in opposite directions.
+// Does not clamp t between 0 and 1.
+// 
+inline const Vector4 slerp( float t, const Vector4 & unitVec0, const Vector4 & unitVec1 );
+
+// Conditionally select between two 4-D vectors
+// 
+inline const Vector4 select( const Vector4 & vec0, const Vector4 & vec1, bool select1 );
+
+// Load x, y, z, and w elements from the first four words of a float array.
+// 
+// 
+inline void loadXYZW( Vector4 & vec, const float * fptr );
+
+// Store x, y, z, and w elements of a 4-D vector in the first four words of a float array.
+// Memory area of previous 16 bytes and next 32 bytes from fptr might be accessed
+// 
+inline void storeXYZW( const Vector4 & vec, float * fptr );
+
+// Load four-half-floats as a 4-D vector
+// NOTE: 
+// This transformation does not support either denormalized numbers or NaNs.
+// 
+inline void loadHalfFloats( Vector4 & vec, const unsigned short * hfptr );
+
+// Store a 4-D vector as half-floats. Memory area of previous 16 bytes and next 32 bytes from <code><i>hfptr</i></code> might be accessed.
+// NOTE: 
+// This transformation does not support either denormalized numbers or NaNs. Memory area of previous 16 bytes and next 32 bytes from hfptr might be accessed.
+// 
+inline void storeHalfFloats( const Vector4 & vec, unsigned short * hfptr );
+
+#ifdef _VECTORMATH_DEBUG
+
+// Print a 4-D vector
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+inline void print( const Vector4 & vec );
+
+// Print a 4-D vector and an associated string identifier
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+inline void print( const Vector4 & vec, const char * name );
+
+#endif
+
+// A 3-D point in array-of-structures format
+//
+class Point3
+{
+    float mX;
+    float mY;
+    float mZ;
+#ifndef __GNUC__
+    float d;
+#endif
+
+public:
+    // Default constructor; does no initialization
+    // 
+    inline Point3( ) { };
+
+    // Copy a 3-D point
+    // 
+    inline Point3( const Point3 & pnt );
+
+    // Construct a 3-D point from x, y, and z elements
+    // 
+    inline Point3( float x, float y, float z );
+
+    // Copy elements from a 3-D vector into a 3-D point
+    // 
+    explicit inline Point3( const Vector3 & vec );
+
+    // Set all elements of a 3-D point to the same scalar value
+    // 
+    explicit inline Point3( float scalar );
+
+    // Assign one 3-D point to another
+    // 
+    inline Point3 & operator =( const Point3 & pnt );
+
+    // Set the x element of a 3-D point
+    // 
+    inline Point3 & setX( float x );
+
+    // Set the y element of a 3-D point
+    // 
+    inline Point3 & setY( float y );
+
+    // Set the z element of a 3-D point
+    // 
+    inline Point3 & setZ( float z );
+
+    // Get the x element of a 3-D point
+    // 
+    inline float getX( ) const;
+
+    // Get the y element of a 3-D point
+    // 
+    inline float getY( ) const;
+
+    // Get the z element of a 3-D point
+    // 
+    inline float getZ( ) const;
+
+    // Set an x, y, or z element of a 3-D point by index
+    // 
+    inline Point3 & setElem( int idx, float value );
+
+    // Get an x, y, or z element of a 3-D point by index
+    // 
+    inline float getElem( int idx ) const;
+
+    // Subscripting operator to set or get an element
+    // 
+    inline float & operator []( int idx );
+
+    // Subscripting operator to get an element
+    // 
+    inline float operator []( int idx ) const;
+
+    // Subtract a 3-D point from another 3-D point
+    // 
+    inline const Vector3 operator -( const Point3 & pnt ) const;
+
+    // Add a 3-D point to a 3-D vector
+    // 
+    inline const Point3 operator +( const Vector3 & vec ) const;
+
+    // Subtract a 3-D vector from a 3-D point
+    // 
+    inline const Point3 operator -( const Vector3 & vec ) const;
+
+    // Perform compound assignment and addition with a 3-D vector
+    // 
+    inline Point3 & operator +=( const Vector3 & vec );
+
+    // Perform compound assignment and subtraction by a 3-D vector
+    // 
+    inline Point3 & operator -=( const Vector3 & vec );
+
+}
+#ifdef __GNUC__
+__attribute__ ((aligned(16)))
+#endif
+;
+
+// Multiply two 3-D points per element
+// 
+inline const Point3 mulPerElem( const Point3 & pnt0, const Point3 & pnt1 );
+
+// Divide two 3-D points per element
+// NOTE: 
+// Floating-point behavior matches standard library function divf4.
+// 
+inline const Point3 divPerElem( const Point3 & pnt0, const Point3 & pnt1 );
+
+// Compute the reciprocal of a 3-D point per element
+// NOTE: 
+// Floating-point behavior matches standard library function recipf4.
+// 
+inline const Point3 recipPerElem( const Point3 & pnt );
+
+// Compute the square root of a 3-D point per element
+// NOTE: 
+// Floating-point behavior matches standard library function sqrtf4.
+// 
+inline const Point3 sqrtPerElem( const Point3 & pnt );
+
+// Compute the reciprocal square root of a 3-D point per element
+// NOTE: 
+// Floating-point behavior matches standard library function rsqrtf4.
+// 
+inline const Point3 rsqrtPerElem( const Point3 & pnt );
+
+// Compute the absolute value of a 3-D point per element
+// 
+inline const Point3 absPerElem( const Point3 & pnt );
+
+// Copy sign from one 3-D point to another, per element
+// 
+inline const Point3 copySignPerElem( const Point3 & pnt0, const Point3 & pnt1 );
+
+// Maximum of two 3-D points per element
+// 
+inline const Point3 maxPerElem( const Point3 & pnt0, const Point3 & pnt1 );
+
+// Minimum of two 3-D points per element
+// 
+inline const Point3 minPerElem( const Point3 & pnt0, const Point3 & pnt1 );
+
+// Maximum element of a 3-D point
+// 
+inline float maxElem( const Point3 & pnt );
+
+// Minimum element of a 3-D point
+// 
+inline float minElem( const Point3 & pnt );
+
+// Compute the sum of all elements of a 3-D point
+// 
+inline float sum( const Point3 & pnt );
+
+// Apply uniform scale to a 3-D point
+// 
+inline const Point3 scale( const Point3 & pnt, float scaleVal );
+
+// Apply non-uniform scale to a 3-D point
+// 
+inline const Point3 scale( const Point3 & pnt, const Vector3 & scaleVec );
+
+// Scalar projection of a 3-D point on a unit-length 3-D vector
+// 
+inline float projection( const Point3 & pnt, const Vector3 & unitVec );
+
+// Compute the square of the distance of a 3-D point from the coordinate-system origin
+// 
+inline float distSqrFromOrigin( const Point3 & pnt );
+
+// Compute the distance of a 3-D point from the coordinate-system origin
+// 
+inline float distFromOrigin( const Point3 & pnt );
+
+// Compute the square of the distance between two 3-D points
+// 
+inline float distSqr( const Point3 & pnt0, const Point3 & pnt1 );
+
+// Compute the distance between two 3-D points
+// 
+inline float dist( const Point3 & pnt0, const Point3 & pnt1 );
+
+// Linear interpolation between two 3-D points
+// NOTE: 
+// Does not clamp t between 0 and 1.
+// 
+inline const Point3 lerp( float t, const Point3 & pnt0, const Point3 & pnt1 );
+
+// Conditionally select between two 3-D points
+// 
+inline const Point3 select( const Point3 & pnt0, const Point3 & pnt1, bool select1 );
+
+// Load x, y, and z elements from the first three words of a float array.
+// 
+// 
+inline void loadXYZ( Point3 & pnt, const float * fptr );
+
+// Store x, y, and z elements of a 3-D point in the first three words of a float array.
+// Memory area of previous 16 bytes and next 32 bytes from fptr might be accessed
+// 
+inline void storeXYZ( const Point3 & pnt, float * fptr );
+
+// Load three-half-floats as a 3-D point
+// NOTE: 
+// This transformation does not support either denormalized numbers or NaNs.
+// 
+inline void loadHalfFloats( Point3 & pnt, const unsigned short * hfptr );
+
+// Store a 3-D point as half-floats. Memory area of previous 16 bytes and next 32 bytes from <code><i>hfptr</i></code> might be accessed.
+// NOTE: 
+// This transformation does not support either denormalized numbers or NaNs. Memory area of previous 16 bytes and next 32 bytes from hfptr might be accessed.
+// 
+inline void storeHalfFloats( const Point3 & pnt, unsigned short * hfptr );
+
+#ifdef _VECTORMATH_DEBUG
+
+// Print a 3-D point
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+inline void print( const Point3 & pnt );
+
+// Print a 3-D point and an associated string identifier
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+inline void print( const Point3 & pnt, const char * name );
+
+#endif
+
+// A quaternion in array-of-structures format
+//
+class Quat
+{
+#if defined( __APPLE__ ) && defined( BT_USE_NEON )
+    union{
+        float32x4_t vXYZW;
+        float mXYZW[4];
+    };
+#else
+    float mX;
+    float mY;
+    float mZ;
+    float mW;
+#endif
+
+public:
+    // Default constructor; does no initialization
+    // 
+    inline Quat( ) { };
+
+    // Copy a quaternion
+    // 
+    inline Quat( const Quat & quat );
+
+    // Construct a quaternion from x, y, z, and w elements
+    // 
+    inline Quat( float x, float y, float z, float w );
+    
+    // Construct a quaternion from vector of x, y, z, and w elements
+    // 
+    inline Quat( float32x4_t fXYZW );
+
+    // Construct a quaternion from a 3-D vector and a scalar
+    // 
+    inline Quat( const Vector3 & xyz, float w );
+
+    // Copy elements from a 4-D vector into a quaternion
+    // 
+    explicit inline Quat( const Vector4 & vec );
+
+    // Convert a rotation matrix to a unit-length quaternion
+    // 
+    explicit inline Quat( const Matrix3 & rotMat );
+
+    // Set all elements of a quaternion to the same scalar value
+    // 
+    explicit inline Quat( float scalar );
+    
+    // Assign one quaternion to another
+    // 
+    inline Quat & operator =( const Quat & quat );
+
+    // Set the x, y, and z elements of a quaternion
+    // NOTE: 
+    // This function does not change the w element.
+    // 
+    inline Quat & setXYZ( const Vector3 & vec );
+
+    // Get the x, y, and z elements of a quaternion
+    // 
+    inline const Vector3 getXYZ( ) const;
+
+    // Set the x element of a quaternion
+    // 
+    inline Quat & setX( float x );
+
+    // Set the y element of a quaternion
+    // 
+    inline Quat & setY( float y );
+
+    // Set the z element of a quaternion
+    // 
+    inline Quat & setZ( float z );
+
+    // Set the w element of a quaternion
+    // 
+    inline Quat & setW( float w );
+
+#if defined( __APPLE__ ) && defined( BT_USE_NEON )
+    inline float32x4_t getvXYZW( ) const;
+#endif
+    
+    // Get the x element of a quaternion
+    // 
+    inline float getX( ) const;
+
+    // Get the y element of a quaternion
+    // 
+    inline float getY( ) const;
+
+    // Get the z element of a quaternion
+    // 
+    inline float getZ( ) const;
+
+    // Get the w element of a quaternion
+    // 
+    inline float getW( ) const;
+
+    // Set an x, y, z, or w element of a quaternion by index
+    // 
+    inline Quat & setElem( int idx, float value );
+
+    // Get an x, y, z, or w element of a quaternion by index
+    // 
+    inline float getElem( int idx ) const;
+
+    // Subscripting operator to set or get an element
+    // 
+    inline float & operator []( int idx );
+
+    // Subscripting operator to get an element
+    // 
+    inline float operator []( int idx ) const;
+
+    // Add two quaternions
+    // 
+    inline const Quat operator +( const Quat & quat ) const;
+
+    // Subtract a quaternion from another quaternion
+    // 
+    inline const Quat operator -( const Quat & quat ) const;
+
+    // Multiply two quaternions
+    // 
+    inline const Quat operator *( const Quat & quat ) const;
+
+    // Multiply a quaternion by a scalar
+    // 
+    inline const Quat operator *( float scalar ) const;
+
+    // Divide a quaternion by a scalar
+    // 
+    inline const Quat operator /( float scalar ) const;
+
+    // Perform compound assignment and addition with a quaternion
+    // 
+    inline Quat & operator +=( const Quat & quat );
+
+    // Perform compound assignment and subtraction by a quaternion
+    // 
+    inline Quat & operator -=( const Quat & quat );
+
+    // Perform compound assignment and multiplication by a quaternion
+    // 
+    inline Quat & operator *=( const Quat & quat );
+
+    // Perform compound assignment and multiplication by a scalar
+    // 
+    inline Quat & operator *=( float scalar );
+
+    // Perform compound assignment and division by a scalar
+    // 
+    inline Quat & operator /=( float scalar );
+
+    // Negate all elements of a quaternion
+    // 
+    inline const Quat operator -( ) const;
+
+    // Construct an identity quaternion
+    // 
+    static inline const Quat identity( );
+
+    // Construct a quaternion to rotate between two unit-length 3-D vectors
+    // NOTE: 
+    // The result is unpredictable if unitVec0 and unitVec1 point in opposite directions.
+    // 
+    static inline const Quat rotation( const Vector3 & unitVec0, const Vector3 & unitVec1 );
+
+    // Construct a quaternion to rotate around a unit-length 3-D vector
+    // 
+    static inline const Quat rotation( float radians, const Vector3 & unitVec );
+
+    // Construct a quaternion to rotate around the x axis
+    // 
+    static inline const Quat rotationX( float radians );
+
+    // Construct a quaternion to rotate around the y axis
+    // 
+    static inline const Quat rotationY( float radians );
+
+    // Construct a quaternion to rotate around the z axis
+    // 
+    static inline const Quat rotationZ( float radians );
+
+}
+#ifdef __GNUC__
+__attribute__ ((aligned(16)))
+#endif
+;
+
+// Multiply a quaternion by a scalar
+// 
+inline const Quat operator *( float scalar, const Quat & quat );
+
+// Compute the conjugate of a quaternion
+// 
+inline const Quat conj( const Quat & quat );
+
+// Use a unit-length quaternion to rotate a 3-D vector
+// 
+inline const Vector3 rotate( const Quat & unitQuat, const Vector3 & vec );
+
+// Compute the dot product of two quaternions
+// 
+inline float dot( const Quat & quat0, const Quat & quat1 );
+
+// Compute the norm of a quaternion
+// 
+inline float norm( const Quat & quat );
+
+// Compute the length of a quaternion
+// 
+inline float length( const Quat & quat );
+
+// Normalize a quaternion
+// NOTE: 
+// The result is unpredictable when all elements of quat are at or near zero.
+// 
+inline const Quat normalize( const Quat & quat );
+
+// Linear interpolation between two quaternions
+// NOTE: 
+// Does not clamp t between 0 and 1.
+// 
+inline const Quat lerp( float t, const Quat & quat0, const Quat & quat1 );
+
+// Spherical linear interpolation between two quaternions
+// NOTE: 
+// Interpolates along the shortest path between orientations.
+// Does not clamp t between 0 and 1.
+// 
+inline const Quat slerp( float t, const Quat & unitQuat0, const Quat & unitQuat1 );
+
+// Spherical quadrangle interpolation
+// 
+inline const Quat squad( float t, const Quat & unitQuat0, const Quat & unitQuat1, const Quat & unitQuat2, const Quat & unitQuat3 );
+
+// Conditionally select between two quaternions
+// 
+inline const Quat select( const Quat & quat0, const Quat & quat1, bool select1 );
+
+// Load x, y, z, and w elements from the first four words of a float array.
+// 
+// 
+inline void loadXYZW( Quat & quat, const float * fptr );
+
+// Store x, y, z, and w elements of a quaternion in the first four words of a float array.
+// Memory area of previous 16 bytes and next 32 bytes from fptr might be accessed
+// 
+inline void storeXYZW( const Quat & quat, float * fptr );
+
+#ifdef _VECTORMATH_DEBUG
+
+// Print a quaternion
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+inline void print( const Quat & quat );
+
+// Print a quaternion and an associated string identifier
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+inline void print( const Quat & quat, const char * name );
+
+#endif
+
+// A 3x3 matrix in array-of-structures format
+//
+class Matrix3
+{
+    Vector3 mCol0;
+    Vector3 mCol1;
+    Vector3 mCol2;
+
+public:
+    // Default constructor; does no initialization
+    // 
+    inline Matrix3( ) { };
+
+    // Copy a 3x3 matrix
+    // 
+    inline Matrix3( const Matrix3 & mat );
+
+    // Construct a 3x3 matrix containing the specified columns
+    // 
+    inline Matrix3( const Vector3 & col0, const Vector3 & col1, const Vector3 & col2 );
+
+    // Construct a 3x3 rotation matrix from a unit-length quaternion
+    // 
+    explicit inline Matrix3( const Quat & unitQuat );
+
+    // Set all elements of a 3x3 matrix to the same scalar value
+    // 
+    explicit inline Matrix3( float scalar );
+
+    // Assign one 3x3 matrix to another
+    // 
+    inline Matrix3 & operator =( const Matrix3 & mat );
+
+    // Set column 0 of a 3x3 matrix
+    // 
+    inline Matrix3 & setCol0( const Vector3 & col0 );
+
+    // Set column 1 of a 3x3 matrix
+    // 
+    inline Matrix3 & setCol1( const Vector3 & col1 );
+
+    // Set column 2 of a 3x3 matrix
+    // 
+    inline Matrix3 & setCol2( const Vector3 & col2 );
+
+    // Get column 0 of a 3x3 matrix
+    // 
+    inline const Vector3 getCol0( ) const;
+
+    // Get column 1 of a 3x3 matrix
+    // 
+    inline const Vector3 getCol1( ) const;
+
+    // Get column 2 of a 3x3 matrix
+    // 
+    inline const Vector3 getCol2( ) const;
+
+    // Set the column of a 3x3 matrix referred to by the specified index
+    // 
+    inline Matrix3 & setCol( int col, const Vector3 & vec );
+
+    // Set the row of a 3x3 matrix referred to by the specified index
+    // 
+    inline Matrix3 & setRow( int row, const Vector3 & vec );
+
+    // Get the column of a 3x3 matrix referred to by the specified index
+    // 
+    inline const Vector3 getCol( int col ) const;
+
+    // Get the row of a 3x3 matrix referred to by the specified index
+    // 
+    inline const Vector3 getRow( int row ) const;
+
+    // Subscripting operator to set or get a column
+    // 
+    inline Vector3 & operator []( int col );
+
+    // Subscripting operator to get a column
+    // 
+    inline const Vector3 operator []( int col ) const;
+
+    // Set the element of a 3x3 matrix referred to by column and row indices
+    // 
+    inline Matrix3 & setElem( int col, int row, float val );
+
+    // Get the element of a 3x3 matrix referred to by column and row indices
+    // 
+    inline float getElem( int col, int row ) const;
+
+    // Add two 3x3 matrices
+    // 
+    inline const Matrix3 operator +( const Matrix3 & mat ) const;
+
+    // Subtract a 3x3 matrix from another 3x3 matrix
+    // 
+    inline const Matrix3 operator -( const Matrix3 & mat ) const;
+
+    // Negate all elements of a 3x3 matrix
+    // 
+    inline const Matrix3 operator -( ) const;
+
+    // Multiply a 3x3 matrix by a scalar
+    // 
+    inline const Matrix3 operator *( float scalar ) const;
+
+    // Multiply a 3x3 matrix by a 3-D vector
+    // 
+    inline const Vector3 operator *( const Vector3 & vec ) const;
+
+    // Multiply two 3x3 matrices
+    // 
+    inline const Matrix3 operator *( const Matrix3 & mat ) const;
+
+    // Perform compound assignment and addition with a 3x3 matrix
+    // 
+    inline Matrix3 & operator +=( const Matrix3 & mat );
+
+    // Perform compound assignment and subtraction by a 3x3 matrix
+    // 
+    inline Matrix3 & operator -=( const Matrix3 & mat );
+
+    // Perform compound assignment and multiplication by a scalar
+    // 
+    inline Matrix3 & operator *=( float scalar );
+
+    // Perform compound assignment and multiplication by a 3x3 matrix
+    // 
+    inline Matrix3 & operator *=( const Matrix3 & mat );
+
+    // Construct an identity 3x3 matrix
+    // 
+    static inline const Matrix3 identity( );
+
+    // Construct a 3x3 matrix to rotate around the x axis
+    // 
+    static inline const Matrix3 rotationX( float radians );
+
+    // Construct a 3x3 matrix to rotate around the y axis
+    // 
+    static inline const Matrix3 rotationY( float radians );
+
+    // Construct a 3x3 matrix to rotate around the z axis
+    // 
+    static inline const Matrix3 rotationZ( float radians );
+
+    // Construct a 3x3 matrix to rotate around the x, y, and z axes
+    // 
+    static inline const Matrix3 rotationZYX( const Vector3 & radiansXYZ );
+
+    // Construct a 3x3 matrix to rotate around a unit-length 3-D vector
+    // 
+    static inline const Matrix3 rotation( float radians, const Vector3 & unitVec );
+
+    // Construct a rotation matrix from a unit-length quaternion
+    // 
+    static inline const Matrix3 rotation( const Quat & unitQuat );
+
+    // Construct a 3x3 matrix to perform scaling
+    // 
+    static inline const Matrix3 scale( const Vector3 & scaleVec );
+
+};
+// Multiply a 3x3 matrix by a scalar
+// 
+inline const Matrix3 operator *( float scalar, const Matrix3 & mat );
+
+// Append (post-multiply) a scale transformation to a 3x3 matrix
+// NOTE: 
+// Faster than creating and multiplying a scale transformation matrix.
+// 
+inline const Matrix3 appendScale( const Matrix3 & mat, const Vector3 & scaleVec );
+
+// Prepend (pre-multiply) a scale transformation to a 3x3 matrix
+// NOTE: 
+// Faster than creating and multiplying a scale transformation matrix.
+// 
+inline const Matrix3 prependScale( const Vector3 & scaleVec, const Matrix3 & mat );
+
+// Multiply two 3x3 matrices per element
+// 
+inline const Matrix3 mulPerElem( const Matrix3 & mat0, const Matrix3 & mat1 );
+
+// Compute the absolute value of a 3x3 matrix per element
+// 
+inline const Matrix3 absPerElem( const Matrix3 & mat );
+
+// Transpose of a 3x3 matrix
+// 
+inline const Matrix3 transpose( const Matrix3 & mat );
+
+// Compute the inverse of a 3x3 matrix
+// NOTE: 
+// Result is unpredictable when the determinant of mat is equal to or near 0.
+// 
+inline const Matrix3 inverse( const Matrix3 & mat );
+
+// Determinant of a 3x3 matrix
+// 
+inline float determinant( const Matrix3 & mat );
+
+// Conditionally select between two 3x3 matrices
+// 
+inline const Matrix3 select( const Matrix3 & mat0, const Matrix3 & mat1, bool select1 );
+
+#ifdef _VECTORMATH_DEBUG
+
+// Print a 3x3 matrix
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+inline void print( const Matrix3 & mat );
+
+// Print a 3x3 matrix and an associated string identifier
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+inline void print( const Matrix3 & mat, const char * name );
+
+#endif
+
+// A 4x4 matrix in array-of-structures format
+//
+class Matrix4
+{
+    Vector4 mCol0;
+    Vector4 mCol1;
+    Vector4 mCol2;
+    Vector4 mCol3;
+
+public:
+    // Default constructor; does no initialization
+    // 
+    inline Matrix4( ) { };
+
+    // Copy a 4x4 matrix
+    // 
+    inline Matrix4( const Matrix4 & mat );
+
+    // Construct a 4x4 matrix containing the specified columns
+    // 
+    inline Matrix4( const Vector4 & col0, const Vector4 & col1, const Vector4 & col2, const Vector4 & col3 );
+
+    // Construct a 4x4 matrix from a 3x4 transformation matrix
+    // 
+    explicit inline Matrix4( const Transform3 & mat );
+
+    // Construct a 4x4 matrix from a 3x3 matrix and a 3-D vector
+    // 
+    inline Matrix4( const Matrix3 & mat, const Vector3 & translateVec );
+
+    // Construct a 4x4 matrix from a unit-length quaternion and a 3-D vector
+    // 
+    inline Matrix4( const Quat & unitQuat, const Vector3 & translateVec );
+
+    // Set all elements of a 4x4 matrix to the same scalar value
+    // 
+    explicit inline Matrix4( float scalar );
+
+    // Assign one 4x4 matrix to another
+    // 
+    inline Matrix4 & operator =( const Matrix4 & mat );
+
+    // Set the upper-left 3x3 submatrix
+    // NOTE: 
+    // This function does not change the bottom row elements.
+    // 
+    inline Matrix4 & setUpper3x3( const Matrix3 & mat3 );
+
+    // Get the upper-left 3x3 submatrix of a 4x4 matrix
+    // 
+    inline const Matrix3 getUpper3x3( ) const;
+
+    // Set translation component
+    // NOTE: 
+    // This function does not change the bottom row elements.
+    // 
+    inline Matrix4 & setTranslation( const Vector3 & translateVec );
+
+    // Get the translation component of a 4x4 matrix
+    // 
+    inline const Vector3 getTranslation( ) const;
+
+    // Set column 0 of a 4x4 matrix
+    // 
+    inline Matrix4 & setCol0( const Vector4 & col0 );
+
+    // Set column 1 of a 4x4 matrix
+    // 
+    inline Matrix4 & setCol1( const Vector4 & col1 );
+
+    // Set column 2 of a 4x4 matrix
+    // 
+    inline Matrix4 & setCol2( const Vector4 & col2 );
+
+    // Set column 3 of a 4x4 matrix
+    // 
+    inline Matrix4 & setCol3( const Vector4 & col3 );
+
+    // Get column 0 of a 4x4 matrix
+    // 
+    inline const Vector4 getCol0( ) const;
+
+    // Get column 1 of a 4x4 matrix
+    // 
+    inline const Vector4 getCol1( ) const;
+
+    // Get column 2 of a 4x4 matrix
+    // 
+    inline const Vector4 getCol2( ) const;
+
+    // Get column 3 of a 4x4 matrix
+    // 
+    inline const Vector4 getCol3( ) const;
+
+    // Set the column of a 4x4 matrix referred to by the specified index
+    // 
+    inline Matrix4 & setCol( int col, const Vector4 & vec );
+
+    // Set the row of a 4x4 matrix referred to by the specified index
+    // 
+    inline Matrix4 & setRow( int row, const Vector4 & vec );
+
+    // Get the column of a 4x4 matrix referred to by the specified index
+    // 
+    inline const Vector4 getCol( int col ) const;
+
+    // Get the row of a 4x4 matrix referred to by the specified index
+    // 
+    inline const Vector4 getRow( int row ) const;
+
+    // Subscripting operator to set or get a column
+    // 
+    inline Vector4 & operator []( int col );
+
+    // Subscripting operator to get a column
+    // 
+    inline const Vector4 operator []( int col ) const;
+
+    // Set the element of a 4x4 matrix referred to by column and row indices
+    // 
+    inline Matrix4 & setElem( int col, int row, float val );
+
+    // Get the element of a 4x4 matrix referred to by column and row indices
+    // 
+    inline float getElem( int col, int row ) const;
+
+    // Add two 4x4 matrices
+    // 
+    inline const Matrix4 operator +( const Matrix4 & mat ) const;
+
+    // Subtract a 4x4 matrix from another 4x4 matrix
+    // 
+    inline const Matrix4 operator -( const Matrix4 & mat ) const;
+
+    // Negate all elements of a 4x4 matrix
+    // 
+    inline const Matrix4 operator -( ) const;
+
+    // Multiply a 4x4 matrix by a scalar
+    // 
+    inline const Matrix4 operator *( float scalar ) const;
+
+    // Multiply a 4x4 matrix by a 4-D vector
+    // 
+    inline const Vector4 operator *( const Vector4 & vec ) const;
+
+    // Multiply a 4x4 matrix by a 3-D vector
+    // 
+    inline const Vector4 operator *( const Vector3 & vec ) const;
+
+    // Multiply a 4x4 matrix by a 3-D point
+    // 
+    inline const Vector4 operator *( const Point3 & pnt ) const;
+
+    // Multiply two 4x4 matrices
+    // 
+    inline const Matrix4 operator *( const Matrix4 & mat ) const;
+
+    // Multiply a 4x4 matrix by a 3x4 transformation matrix
+    // 
+    inline const Matrix4 operator *( const Transform3 & tfrm ) const;
+
+    // Perform compound assignment and addition with a 4x4 matrix
+    // 
+    inline Matrix4 & operator +=( const Matrix4 & mat );
+
+    // Perform compound assignment and subtraction by a 4x4 matrix
+    // 
+    inline Matrix4 & operator -=( const Matrix4 & mat );
+
+    // Perform compound assignment and multiplication by a scalar
+    // 
+    inline Matrix4 & operator *=( float scalar );
+
+    // Perform compound assignment and multiplication by a 4x4 matrix
+    // 
+    inline Matrix4 & operator *=( const Matrix4 & mat );
+
+    // Perform compound assignment and multiplication by a 3x4 transformation matrix
+    // 
+    inline Matrix4 & operator *=( const Transform3 & tfrm );
+
+    // Construct an identity 4x4 matrix
+    // 
+    static inline const Matrix4 identity( );
+
+    // Construct a 4x4 matrix to rotate around the x axis
+    // 
+    static inline const Matrix4 rotationX( float radians );
+
+    // Construct a 4x4 matrix to rotate around the y axis
+    // 
+    static inline const Matrix4 rotationY( float radians );
+
+    // Construct a 4x4 matrix to rotate around the z axis
+    // 
+    static inline const Matrix4 rotationZ( float radians );
+
+    // Construct a 4x4 matrix to rotate around the x, y, and z axes
+    // 
+    static inline const Matrix4 rotationZYX( const Vector3 & radiansXYZ );
+
+    // Construct a 4x4 matrix to rotate around a unit-length 3-D vector
+    // 
+    static inline const Matrix4 rotation( float radians, const Vector3 & unitVec );
+
+    // Construct a rotation matrix from a unit-length quaternion
+    // 
+    static inline const Matrix4 rotation( const Quat & unitQuat );
+
+    // Construct a 4x4 matrix to perform scaling
+    // 
+    static inline const Matrix4 scale( const Vector3 & scaleVec );
+
+    // Construct a 4x4 matrix to perform translation
+    // 
+    static inline const Matrix4 translation( const Vector3 & translateVec );
+
+    // Construct viewing matrix based on eye position, position looked at, and up direction
+    // 
+    static inline const Matrix4 lookAt( const Point3 & eyePos, const Point3 & lookAtPos, const Vector3 & upVec );
+
+    // Construct a perspective projection matrix
+    // 
+    static inline const Matrix4 perspective( float fovyRadians, float aspect, float zNear, float zFar );
+
+    // Construct a perspective projection matrix based on frustum
+    // 
+    static inline const Matrix4 frustum( float left, float right, float bottom, float top, float zNear, float zFar );
+
+    // Construct an orthographic projection matrix
+    // 
+    static inline const Matrix4 orthographic( float left, float right, float bottom, float top, float zNear, float zFar );
+
+};
+// Multiply a 4x4 matrix by a scalar
+// 
+inline const Matrix4 operator *( float scalar, const Matrix4 & mat );
+
+// Append (post-multiply) a scale transformation to a 4x4 matrix
+// NOTE: 
+// Faster than creating and multiplying a scale transformation matrix.
+// 
+inline const Matrix4 appendScale( const Matrix4 & mat, const Vector3 & scaleVec );
+
+// Prepend (pre-multiply) a scale transformation to a 4x4 matrix
+// NOTE: 
+// Faster than creating and multiplying a scale transformation matrix.
+// 
+inline const Matrix4 prependScale( const Vector3 & scaleVec, const Matrix4 & mat );
+
+// Multiply two 4x4 matrices per element
+// 
+inline const Matrix4 mulPerElem( const Matrix4 & mat0, const Matrix4 & mat1 );
+
+// Compute the absolute value of a 4x4 matrix per element
+// 
+inline const Matrix4 absPerElem( const Matrix4 & mat );
+
+// Transpose of a 4x4 matrix
+// 
+inline const Matrix4 transpose( const Matrix4 & mat );
+
+// Compute the inverse of a 4x4 matrix
+// NOTE: 
+// Result is unpredictable when the determinant of mat is equal to or near 0.
+// 
+inline const Matrix4 inverse( const Matrix4 & mat );
+
+// Compute the inverse of a 4x4 matrix, which is expected to be an affine matrix
+// NOTE: 
+// This can be used to achieve better performance than a general inverse when the specified 4x4 matrix meets the given restrictions.  The result is unpredictable when the determinant of mat is equal to or near 0.
+// 
+inline const Matrix4 affineInverse( const Matrix4 & mat );
+
+// Compute the inverse of a 4x4 matrix, which is expected to be an affine matrix with an orthogonal upper-left 3x3 submatrix
+// NOTE: 
+// This can be used to achieve better performance than a general inverse when the specified 4x4 matrix meets the given restrictions.
+// 
+inline const Matrix4 orthoInverse( const Matrix4 & mat );
+
+// Determinant of a 4x4 matrix
+// 
+inline float determinant( const Matrix4 & mat );
+
+// Conditionally select between two 4x4 matrices
+// 
+inline const Matrix4 select( const Matrix4 & mat0, const Matrix4 & mat1, bool select1 );
+
+#ifdef _VECTORMATH_DEBUG
+
+// Print a 4x4 matrix
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+inline void print( const Matrix4 & mat );
+
+// Print a 4x4 matrix and an associated string identifier
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+inline void print( const Matrix4 & mat, const char * name );
+
+#endif
+
+// A 3x4 transformation matrix in array-of-structures format
+//
+class Transform3
+{
+    Vector3 mCol0;
+    Vector3 mCol1;
+    Vector3 mCol2;
+    Vector3 mCol3;
+
+public:
+    // Default constructor; does no initialization
+    // 
+    inline Transform3( ) { };
+
+    // Copy a 3x4 transformation matrix
+    // 
+    inline Transform3( const Transform3 & tfrm );
+
+    // Construct a 3x4 transformation matrix containing the specified columns
+    // 
+    inline Transform3( const Vector3 & col0, const Vector3 & col1, const Vector3 & col2, const Vector3 & col3 );
+
+    // Construct a 3x4 transformation matrix from a 3x3 matrix and a 3-D vector
+    // 
+    inline Transform3( const Matrix3 & tfrm, const Vector3 & translateVec );
+
+    // Construct a 3x4 transformation matrix from a unit-length quaternion and a 3-D vector
+    // 
+    inline Transform3( const Quat & unitQuat, const Vector3 & translateVec );
+
+    // Set all elements of a 3x4 transformation matrix to the same scalar value
+    // 
+    explicit inline Transform3( float scalar );
+
+    // Assign one 3x4 transformation matrix to another
+    // 
+    inline Transform3 & operator =( const Transform3 & tfrm );
+
+    // Set the upper-left 3x3 submatrix
+    // 
+    inline Transform3 & setUpper3x3( const Matrix3 & mat3 );
+
+    // Get the upper-left 3x3 submatrix of a 3x4 transformation matrix
+    // 
+    inline const Matrix3 getUpper3x3( ) const;
+
+    // Set translation component
+    // 
+    inline Transform3 & setTranslation( const Vector3 & translateVec );
+
+    // Get the translation component of a 3x4 transformation matrix
+    // 
+    inline const Vector3 getTranslation( ) const;
+
+    // Set column 0 of a 3x4 transformation matrix
+    // 
+    inline Transform3 & setCol0( const Vector3 & col0 );
+
+    // Set column 1 of a 3x4 transformation matrix
+    // 
+    inline Transform3 & setCol1( const Vector3 & col1 );
+
+    // Set column 2 of a 3x4 transformation matrix
+    // 
+    inline Transform3 & setCol2( const Vector3 & col2 );
+
+    // Set column 3 of a 3x4 transformation matrix
+    // 
+    inline Transform3 & setCol3( const Vector3 & col3 );
+
+    // Get column 0 of a 3x4 transformation matrix
+    // 
+    inline const Vector3 getCol0( ) const;
+
+    // Get column 1 of a 3x4 transformation matrix
+    // 
+    inline const Vector3 getCol1( ) const;
+
+    // Get column 2 of a 3x4 transformation matrix
+    // 
+    inline const Vector3 getCol2( ) const;
+
+    // Get column 3 of a 3x4 transformation matrix
+    // 
+    inline const Vector3 getCol3( ) const;
+
+    // Set the column of a 3x4 transformation matrix referred to by the specified index
+    // 
+    inline Transform3 & setCol( int col, const Vector3 & vec );
+
+    // Set the row of a 3x4 transformation matrix referred to by the specified index
+    // 
+    inline Transform3 & setRow( int row, const Vector4 & vec );
+
+    // Get the column of a 3x4 transformation matrix referred to by the specified index
+    // 
+    inline const Vector3 getCol( int col ) const;
+
+    // Get the row of a 3x4 transformation matrix referred to by the specified index
+    // 
+    inline const Vector4 getRow( int row ) const;
+
+    // Subscripting operator to set or get a column
+    // 
+    inline Vector3 & operator []( int col );
+
+    // Subscripting operator to get a column
+    // 
+    inline const Vector3 operator []( int col ) const;
+
+    // Set the element of a 3x4 transformation matrix referred to by column and row indices
+    // 
+    inline Transform3 & setElem( int col, int row, float val );
+
+    // Get the element of a 3x4 transformation matrix referred to by column and row indices
+    // 
+    inline float getElem( int col, int row ) const;
+
+    // Multiply a 3x4 transformation matrix by a 3-D vector
+    // 
+    inline const Vector3 operator *( const Vector3 & vec ) const;
+
+    // Multiply a 3x4 transformation matrix by a 3-D point
+    // 
+    inline const Point3 operator *( const Point3 & pnt ) const;
+
+    // Multiply two 3x4 transformation matrices
+    // 
+    inline const Transform3 operator *( const Transform3 & tfrm ) const;
+
+    // Perform compound assignment and multiplication by a 3x4 transformation matrix
+    // 
+    inline Transform3 & operator *=( const Transform3 & tfrm );
+
+    // Construct an identity 3x4 transformation matrix
+    // 
+    static inline const Transform3 identity( );
+
+    // Construct a 3x4 transformation matrix to rotate around the x axis
+    // 
+    static inline const Transform3 rotationX( float radians );
+
+    // Construct a 3x4 transformation matrix to rotate around the y axis
+    // 
+    static inline const Transform3 rotationY( float radians );
+
+    // Construct a 3x4 transformation matrix to rotate around the z axis
+    // 
+    static inline const Transform3 rotationZ( float radians );
+
+    // Construct a 3x4 transformation matrix to rotate around the x, y, and z axes
+    // 
+    static inline const Transform3 rotationZYX( const Vector3 & radiansXYZ );
+
+    // Construct a 3x4 transformation matrix to rotate around a unit-length 3-D vector
+    // 
+    static inline const Transform3 rotation( float radians, const Vector3 & unitVec );
+
+    // Construct a rotation matrix from a unit-length quaternion
+    // 
+    static inline const Transform3 rotation( const Quat & unitQuat );
+
+    // Construct a 3x4 transformation matrix to perform scaling
+    // 
+    static inline const Transform3 scale( const Vector3 & scaleVec );
+
+    // Construct a 3x4 transformation matrix to perform translation
+    // 
+    static inline const Transform3 translation( const Vector3 & translateVec );
+
+};
+// Append (post-multiply) a scale transformation to a 3x4 transformation matrix
+// NOTE: 
+// Faster than creating and multiplying a scale transformation matrix.
+// 
+inline const Transform3 appendScale( const Transform3 & tfrm, const Vector3 & scaleVec );
+
+// Prepend (pre-multiply) a scale transformation to a 3x4 transformation matrix
+// NOTE: 
+// Faster than creating and multiplying a scale transformation matrix.
+// 
+inline const Transform3 prependScale( const Vector3 & scaleVec, const Transform3 & tfrm );
+
+// Multiply two 3x4 transformation matrices per element
+// 
+inline const Transform3 mulPerElem( const Transform3 & tfrm0, const Transform3 & tfrm1 );
+
+// Compute the absolute value of a 3x4 transformation matrix per element
+// 
+inline const Transform3 absPerElem( const Transform3 & tfrm );
+
+// Inverse of a 3x4 transformation matrix
+// NOTE: 
+// Result is unpredictable when the determinant of the left 3x3 submatrix is equal to or near 0.
+// 
+inline const Transform3 inverse( const Transform3 & tfrm );
+
+// Compute the inverse of a 3x4 transformation matrix, expected to have an orthogonal upper-left 3x3 submatrix
+// NOTE: 
+// This can be used to achieve better performance than a general inverse when the specified 3x4 transformation matrix meets the given restrictions.
+// 
+inline const Transform3 orthoInverse( const Transform3 & tfrm );
+
+// Conditionally select between two 3x4 transformation matrices
+// 
+inline const Transform3 select( const Transform3 & tfrm0, const Transform3 & tfrm1, bool select1 );
+
+#ifdef _VECTORMATH_DEBUG
+
+// Print a 3x4 transformation matrix
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+inline void print( const Transform3 & tfrm );
+
+// Print a 3x4 transformation matrix and an associated string identifier
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+inline void print( const Transform3 & tfrm, const char * name );
+
+#endif
+
+} // namespace Aos
+} // namespace Vectormath
+
+#include "vec_aos.h"
+#include "quat_aos.h"
+#include "mat_aos.h"
+
+#endif
+
diff --git a/src/vectormath/sse/boolInVec.h b/src/vectormath/sse/boolInVec.h
index d18cb15ce..d21d25cbb 100644
--- a/src/vectormath/sse/boolInVec.h
+++ b/src/vectormath/sse/boolInVec.h
@@ -1,247 +1,247 @@
-/*
-   Copyright (C) 2006, 2007 Sony Computer Entertainment Inc.
-   All rights reserved.
-
-   Redistribution and use in source and binary forms,
-   with or without modification, are permitted provided that the
-   following conditions are met:
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    * Neither the name of the Sony Computer Entertainment Inc nor the names
-      of its contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-   POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#ifndef _BOOLINVEC_H
-#define _BOOLINVEC_H
-
-#include <math.h>
-
-namespace Vectormath {
-
-class floatInVec;
-
-//--------------------------------------------------------------------------------------------------
-// boolInVec class
-//
-
-class boolInVec
-{
-    private:
-        __m128 mData;
-
-        inline boolInVec(__m128 vec);
-    public:
-        inline boolInVec() {}
-
-        // matches standard type conversions
-        //
-        inline boolInVec(const floatInVec &vec);
-
-        // explicit cast from bool
-        //
-        explicit inline boolInVec(bool scalar);
-
-#ifdef _VECTORMATH_NO_SCALAR_CAST
-        // explicit cast to bool
-        // 
-        inline bool getAsBool() const;
-#else
-        // implicit cast to bool
-        // 
-        inline operator bool() const;
-#endif
-        
-        // get vector data
-        // bool value is splatted across all word slots of vector as 0 (false) or -1 (true)
-        //
-        inline __m128 get128() const;
-
-        // operators
-        //
-        inline const boolInVec operator ! () const;
-        inline boolInVec& operator = (const boolInVec &vec);
-        inline boolInVec& operator &= (const boolInVec &vec);
-        inline boolInVec& operator ^= (const boolInVec &vec);
-        inline boolInVec& operator |= (const boolInVec &vec);
-
-        // friend functions
-        //
-        friend inline const boolInVec operator == (const boolInVec &vec0, const boolInVec &vec1);
-        friend inline const boolInVec operator != (const boolInVec &vec0, const boolInVec &vec1);
-        friend inline const boolInVec operator < (const floatInVec &vec0, const floatInVec &vec1);
-        friend inline const boolInVec operator <= (const floatInVec &vec0, const floatInVec &vec1);
-        friend inline const boolInVec operator > (const floatInVec &vec0, const floatInVec &vec1);
-        friend inline const boolInVec operator >= (const floatInVec &vec0, const floatInVec &vec1);
-        friend inline const boolInVec operator == (const floatInVec &vec0, const floatInVec &vec1);
-        friend inline const boolInVec operator != (const floatInVec &vec0, const floatInVec &vec1);
-        friend inline const boolInVec operator & (const boolInVec &vec0, const boolInVec &vec1);
-        friend inline const boolInVec operator ^ (const boolInVec &vec0, const boolInVec &vec1);
-        friend inline const boolInVec operator | (const boolInVec &vec0, const boolInVec &vec1);
-        friend inline const boolInVec select(const boolInVec &vec0, const boolInVec &vec1, const boolInVec &select_vec1);
-};
-
-//--------------------------------------------------------------------------------------------------
-// boolInVec functions
-//
-
-// operators
-//
-inline const boolInVec operator == (const boolInVec &vec0, const boolInVec &vec1);
-inline const boolInVec operator != (const boolInVec &vec0, const boolInVec &vec1);
-inline const boolInVec operator & (const boolInVec &vec0, const boolInVec &vec1);
-inline const boolInVec operator ^ (const boolInVec &vec0, const boolInVec &vec1);
-inline const boolInVec operator | (const boolInVec &vec0, const boolInVec &vec1);
-
-// select between vec0 and vec1 using boolInVec.
-// false selects vec0, true selects vec1
-//
-inline const boolInVec select(const boolInVec &vec0, const boolInVec &vec1, const boolInVec &select_vec1);
-
-} // namespace Vectormath
-
-//--------------------------------------------------------------------------------------------------
-// boolInVec implementation
-//
-
-#include "floatInVec.h"
-
-namespace Vectormath {
-
-inline
-boolInVec::boolInVec(__m128 vec)
-{
-    mData = vec;
-}
-
-inline
-boolInVec::boolInVec(const floatInVec &vec)
-{
-    *this = (vec != floatInVec(0.0f));
-}
-
-inline
-boolInVec::boolInVec(bool scalar)
-{
-    unsigned int mask = -(int)scalar;
-	mData = _mm_set1_ps(*(float *)&mask); // TODO: Union
-}
-
-#ifdef _VECTORMATH_NO_SCALAR_CAST
-inline
-bool
-boolInVec::getAsBool() const
-#else
-inline
-boolInVec::operator bool() const
-#endif
-{
-	return *(bool *)&mData;
-}
-
-inline
-__m128
-boolInVec::get128() const
-{
-    return mData;
-}
-
-inline
-const boolInVec
-boolInVec::operator ! () const
-{
-    return boolInVec(_mm_andnot_ps(mData, _mm_cmpneq_ps(_mm_setzero_ps(),_mm_setzero_ps())));
-}
-
-inline
-boolInVec&
-boolInVec::operator = (const boolInVec &vec)
-{
-    mData = vec.mData;
-    return *this;
-}
-
-inline
-boolInVec&
-boolInVec::operator &= (const boolInVec &vec)
-{
-    *this = *this & vec;
-    return *this;
-}
-
-inline
-boolInVec&
-boolInVec::operator ^= (const boolInVec &vec)
-{
-    *this = *this ^ vec;
-    return *this;
-}
-
-inline
-boolInVec&
-boolInVec::operator |= (const boolInVec &vec)
-{
-    *this = *this | vec;
-    return *this;
-}
-
-inline
-const boolInVec
-operator == (const boolInVec &vec0, const boolInVec &vec1)
-{
-	return boolInVec(_mm_cmpeq_ps(vec0.get128(), vec1.get128()));
-}
-
-inline
-const boolInVec
-operator != (const boolInVec &vec0, const boolInVec &vec1)
-{
-	return boolInVec(_mm_cmpneq_ps(vec0.get128(), vec1.get128()));
-}
-    
-inline
-const boolInVec
-operator & (const boolInVec &vec0, const boolInVec &vec1)
-{
-	return boolInVec(_mm_and_ps(vec0.get128(), vec1.get128()));
-}
-
-inline
-const boolInVec
-operator | (const boolInVec &vec0, const boolInVec &vec1)
-{
-	return boolInVec(_mm_or_ps(vec0.get128(), vec1.get128()));
-}
-
-inline
-const boolInVec
-operator ^ (const boolInVec &vec0, const boolInVec &vec1)
-{
-	return boolInVec(_mm_xor_ps(vec0.get128(), vec1.get128()));
-}
-
-inline
-const boolInVec
-select(const boolInVec &vec0, const boolInVec &vec1, const boolInVec &select_vec1)
-{
-	return boolInVec(vec_sel(vec0.get128(), vec1.get128(), select_vec1.get128()));
-}
- 
-} // namespace Vectormath
-
-#endif // boolInVec_h
+/*
+   Copyright (C) 2006, 2007 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+   Redistribution and use in source and binary forms,
+   with or without modification, are permitted provided that the
+   following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the Sony Computer Entertainment Inc nor the names
+      of its contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _BOOLINVEC_H
+#define _BOOLINVEC_H
+
+#include <math.h>
+
+namespace Vectormath {
+
+class floatInVec;
+
+//--------------------------------------------------------------------------------------------------
+// boolInVec class
+//
+
+class boolInVec
+{
+    private:
+        __m128 mData;
+
+        inline boolInVec(__m128 vec);
+    public:
+        inline boolInVec() {}
+
+        // matches standard type conversions
+        //
+        inline boolInVec(const floatInVec &vec);
+
+        // explicit cast from bool
+        //
+        explicit inline boolInVec(bool scalar);
+
+#ifdef _VECTORMATH_NO_SCALAR_CAST
+        // explicit cast to bool
+        // 
+        inline bool getAsBool() const;
+#else
+        // implicit cast to bool
+        // 
+        inline operator bool() const;
+#endif
+        
+        // get vector data
+        // bool value is splatted across all word slots of vector as 0 (false) or -1 (true)
+        //
+        inline __m128 get128() const;
+
+        // operators
+        //
+        inline const boolInVec operator ! () const;
+        inline boolInVec& operator = (const boolInVec &vec);
+        inline boolInVec& operator &= (const boolInVec &vec);
+        inline boolInVec& operator ^= (const boolInVec &vec);
+        inline boolInVec& operator |= (const boolInVec &vec);
+
+        // friend functions
+        //
+        friend inline const boolInVec operator == (const boolInVec &vec0, const boolInVec &vec1);
+        friend inline const boolInVec operator != (const boolInVec &vec0, const boolInVec &vec1);
+        friend inline const boolInVec operator < (const floatInVec &vec0, const floatInVec &vec1);
+        friend inline const boolInVec operator <= (const floatInVec &vec0, const floatInVec &vec1);
+        friend inline const boolInVec operator > (const floatInVec &vec0, const floatInVec &vec1);
+        friend inline const boolInVec operator >= (const floatInVec &vec0, const floatInVec &vec1);
+        friend inline const boolInVec operator == (const floatInVec &vec0, const floatInVec &vec1);
+        friend inline const boolInVec operator != (const floatInVec &vec0, const floatInVec &vec1);
+        friend inline const boolInVec operator & (const boolInVec &vec0, const boolInVec &vec1);
+        friend inline const boolInVec operator ^ (const boolInVec &vec0, const boolInVec &vec1);
+        friend inline const boolInVec operator | (const boolInVec &vec0, const boolInVec &vec1);
+        friend inline const boolInVec select(const boolInVec &vec0, const boolInVec &vec1, const boolInVec &select_vec1);
+};
+
+//--------------------------------------------------------------------------------------------------
+// boolInVec functions
+//
+
+// operators
+//
+inline const boolInVec operator == (const boolInVec &vec0, const boolInVec &vec1);
+inline const boolInVec operator != (const boolInVec &vec0, const boolInVec &vec1);
+inline const boolInVec operator & (const boolInVec &vec0, const boolInVec &vec1);
+inline const boolInVec operator ^ (const boolInVec &vec0, const boolInVec &vec1);
+inline const boolInVec operator | (const boolInVec &vec0, const boolInVec &vec1);
+
+// select between vec0 and vec1 using boolInVec.
+// false selects vec0, true selects vec1
+//
+inline const boolInVec select(const boolInVec &vec0, const boolInVec &vec1, const boolInVec &select_vec1);
+
+} // namespace Vectormath
+
+//--------------------------------------------------------------------------------------------------
+// boolInVec implementation
+//
+
+#include "floatInVec.h"
+
+namespace Vectormath {
+
+inline
+boolInVec::boolInVec(__m128 vec)
+{
+    mData = vec;
+}
+
+inline
+boolInVec::boolInVec(const floatInVec &vec)
+{
+    *this = (vec != floatInVec(0.0f));
+}
+
+inline
+boolInVec::boolInVec(bool scalar)
+{
+    unsigned int mask = -(int)scalar;
+	mData = _mm_set1_ps(*(float *)&mask); // TODO: Union
+}
+
+#ifdef _VECTORMATH_NO_SCALAR_CAST
+inline
+bool
+boolInVec::getAsBool() const
+#else
+inline
+boolInVec::operator bool() const
+#endif
+{
+	return *(bool *)&mData;
+}
+
+inline
+__m128
+boolInVec::get128() const
+{
+    return mData;
+}
+
+inline
+const boolInVec
+boolInVec::operator ! () const
+{
+    return boolInVec(_mm_andnot_ps(mData, _mm_cmpneq_ps(_mm_setzero_ps(),_mm_setzero_ps())));
+}
+
+inline
+boolInVec&
+boolInVec::operator = (const boolInVec &vec)
+{
+    mData = vec.mData;
+    return *this;
+}
+
+inline
+boolInVec&
+boolInVec::operator &= (const boolInVec &vec)
+{
+    *this = *this & vec;
+    return *this;
+}
+
+inline
+boolInVec&
+boolInVec::operator ^= (const boolInVec &vec)
+{
+    *this = *this ^ vec;
+    return *this;
+}
+
+inline
+boolInVec&
+boolInVec::operator |= (const boolInVec &vec)
+{
+    *this = *this | vec;
+    return *this;
+}
+
+inline
+const boolInVec
+operator == (const boolInVec &vec0, const boolInVec &vec1)
+{
+	return boolInVec(_mm_cmpeq_ps(vec0.get128(), vec1.get128()));
+}
+
+inline
+const boolInVec
+operator != (const boolInVec &vec0, const boolInVec &vec1)
+{
+	return boolInVec(_mm_cmpneq_ps(vec0.get128(), vec1.get128()));
+}
+    
+inline
+const boolInVec
+operator & (const boolInVec &vec0, const boolInVec &vec1)
+{
+	return boolInVec(_mm_and_ps(vec0.get128(), vec1.get128()));
+}
+
+inline
+const boolInVec
+operator | (const boolInVec &vec0, const boolInVec &vec1)
+{
+	return boolInVec(_mm_or_ps(vec0.get128(), vec1.get128()));
+}
+
+inline
+const boolInVec
+operator ^ (const boolInVec &vec0, const boolInVec &vec1)
+{
+	return boolInVec(_mm_xor_ps(vec0.get128(), vec1.get128()));
+}
+
+inline
+const boolInVec
+select(const boolInVec &vec0, const boolInVec &vec1, const boolInVec &select_vec1)
+{
+	return boolInVec(vec_sel(vec0.get128(), vec1.get128(), select_vec1.get128()));
+}
+ 
+} // namespace Vectormath
+
+#endif // boolInVec_h
diff --git a/src/vectormath/sse/floatInVec.h b/src/vectormath/sse/floatInVec.h
index 6443865b1..e8ac5959e 100644
--- a/src/vectormath/sse/floatInVec.h
+++ b/src/vectormath/sse/floatInVec.h
@@ -1,340 +1,340 @@
-/*
-   Copyright (C) 2006, 2007 Sony Computer Entertainment Inc.
-   All rights reserved.
-
-   Redistribution and use in source and binary forms,
-   with or without modification, are permitted provided that the
-   following conditions are met:
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    * Neither the name of the Sony Computer Entertainment Inc nor the names
-      of its contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-   POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#ifndef _FLOATINVEC_H
-#define _FLOATINVEC_H
-
-#include <math.h>
-#include <xmmintrin.h>
-
-namespace Vectormath {
-
-class boolInVec;
-
-//--------------------------------------------------------------------------------------------------
-// floatInVec class
-//
-
-class floatInVec
-{
-    private:
-        __m128 mData;
-
-    public:
-        inline floatInVec(__m128 vec);
-
-        inline floatInVec() {}
-
-        // matches standard type conversions
-        //
-        inline floatInVec(const boolInVec &vec);
-
-        // construct from a slot of __m128
-        //
-        inline floatInVec(__m128 vec, int slot);
-        
-        // explicit cast from float
-        //
-        explicit inline floatInVec(float scalar);
-
-#ifdef _VECTORMATH_NO_SCALAR_CAST
-        // explicit cast to float
-        // 
-        inline float getAsFloat() const;
-#else
-        // implicit cast to float
-        //
-        inline operator float() const;
-#endif
-
-        // get vector data
-        // float value is splatted across all word slots of vector
-        //
-        inline __m128 get128() const;
-
-        // operators
-        // 
-        inline const floatInVec operator ++ (int);
-        inline const floatInVec operator -- (int);
-        inline floatInVec& operator ++ ();
-        inline floatInVec& operator -- ();
-        inline const floatInVec operator - () const;
-        inline floatInVec& operator = (const floatInVec &vec);
-        inline floatInVec& operator *= (const floatInVec &vec);
-        inline floatInVec& operator /= (const floatInVec &vec);
-        inline floatInVec& operator += (const floatInVec &vec);
-        inline floatInVec& operator -= (const floatInVec &vec);
-
-        // friend functions
-        //
-        friend inline const floatInVec operator * (const floatInVec &vec0, const floatInVec &vec1);
-        friend inline const floatInVec operator / (const floatInVec &vec0, const floatInVec &vec1);
-        friend inline const floatInVec operator + (const floatInVec &vec0, const floatInVec &vec1);
-        friend inline const floatInVec operator - (const floatInVec &vec0, const floatInVec &vec1);
-        friend inline const floatInVec select(const floatInVec &vec0, const floatInVec &vec1, boolInVec select_vec1);
-};
-
-//--------------------------------------------------------------------------------------------------
-// floatInVec functions
-//
-
-// operators
-// 
-inline const floatInVec operator * (const floatInVec &vec0, const floatInVec &vec1);
-inline const floatInVec operator / (const floatInVec &vec0, const floatInVec &vec1);
-inline const floatInVec operator + (const floatInVec &vec0, const floatInVec &vec1);
-inline const floatInVec operator - (const floatInVec &vec0, const floatInVec &vec1);
-inline const boolInVec operator < (const floatInVec &vec0, const floatInVec &vec1);
-inline const boolInVec operator <= (const floatInVec &vec0, const floatInVec &vec1);
-inline const boolInVec operator > (const floatInVec &vec0, const floatInVec &vec1);
-inline const boolInVec operator >= (const floatInVec &vec0, const floatInVec &vec1);
-inline const boolInVec operator == (const floatInVec &vec0, const floatInVec &vec1);
-inline const boolInVec operator != (const floatInVec &vec0, const floatInVec &vec1);
-
-// select between vec0 and vec1 using boolInVec.
-// false selects vec0, true selects vec1
-//
-inline const floatInVec select(const floatInVec &vec0, const floatInVec &vec1, const boolInVec &select_vec1);
-
-} // namespace Vectormath
-
-//--------------------------------------------------------------------------------------------------
-// floatInVec implementation
-//
-
-#include "boolInVec.h"
-
-namespace Vectormath {
-
-inline
-floatInVec::floatInVec(__m128 vec)
-{
-    mData = vec;
-}
-
-inline
-floatInVec::floatInVec(const boolInVec &vec)
-{
-	mData = vec_sel(_mm_setzero_ps(), _mm_set1_ps(1.0f), vec.get128());
-}
-
-inline
-floatInVec::floatInVec(__m128 vec, int slot)
-{
-	SSEFloat v;
-	v.m128 = vec;
-	mData = _mm_set1_ps(v.f[slot]);
-}
-
-inline
-floatInVec::floatInVec(float scalar)
-{
-	mData = _mm_set1_ps(scalar);
-}
-
-#ifdef _VECTORMATH_NO_SCALAR_CAST
-inline
-float
-floatInVec::getAsFloat() const
-#else
-inline
-floatInVec::operator float() const
-#endif
-{
-    return *((float *)&mData);
-}
-
-inline
-__m128
-floatInVec::get128() const
-{
-    return mData;
-}
-
-inline
-const floatInVec
-floatInVec::operator ++ (int)
-{
-    __m128 olddata = mData;
-    operator ++();
-    return floatInVec(olddata);
-}
-
-inline
-const floatInVec
-floatInVec::operator -- (int)
-{
-    __m128 olddata = mData;
-    operator --();
-    return floatInVec(olddata);
-}
-
-inline
-floatInVec&
-floatInVec::operator ++ ()
-{
-    *this += floatInVec(_mm_set1_ps(1.0f));
-    return *this;
-}
-
-inline
-floatInVec&
-floatInVec::operator -- ()
-{
-    *this -= floatInVec(_mm_set1_ps(1.0f));
-    return *this;
-}
-
-inline
-const floatInVec
-floatInVec::operator - () const
-{
-    return floatInVec(_mm_sub_ps(_mm_setzero_ps(), mData));
-}
-
-inline
-floatInVec&
-floatInVec::operator = (const floatInVec &vec)
-{
-    mData = vec.mData;
-    return *this;
-}
-
-inline
-floatInVec&
-floatInVec::operator *= (const floatInVec &vec)
-{
-    *this = *this * vec;
-    return *this;
-}
-
-inline
-floatInVec&
-floatInVec::operator /= (const floatInVec &vec)
-{
-    *this = *this / vec;
-    return *this;
-}
-
-inline
-floatInVec&
-floatInVec::operator += (const floatInVec &vec)
-{
-    *this = *this + vec;
-    return *this;
-}
-
-inline
-floatInVec&
-floatInVec::operator -= (const floatInVec &vec)
-{
-    *this = *this - vec;
-    return *this;
-}
-
-inline
-const floatInVec
-operator * (const floatInVec &vec0, const floatInVec &vec1)
-{
-    return floatInVec(_mm_mul_ps(vec0.get128(), vec1.get128()));
-}
-
-inline
-const floatInVec
-operator / (const floatInVec &num, const floatInVec &den)
-{
-    return floatInVec(_mm_div_ps(num.get128(), den.get128()));
-}
-
-inline
-const floatInVec
-operator + (const floatInVec &vec0, const floatInVec &vec1)
-{
-    return floatInVec(_mm_add_ps(vec0.get128(), vec1.get128()));
-}
-
-inline
-const floatInVec
-operator - (const floatInVec &vec0, const floatInVec &vec1)
-{
-    return floatInVec(_mm_sub_ps(vec0.get128(), vec1.get128()));
-}
-
-inline
-const boolInVec
-operator < (const floatInVec &vec0, const floatInVec &vec1)
-{
-    return boolInVec(_mm_cmpgt_ps(vec1.get128(), vec0.get128()));
-}
-
-inline
-const boolInVec
-operator <= (const floatInVec &vec0, const floatInVec &vec1)
-{
-    return boolInVec(_mm_cmpge_ps(vec1.get128(), vec0.get128()));
-}
-
-inline
-const boolInVec
-operator > (const floatInVec &vec0, const floatInVec &vec1)
-{
-    return boolInVec(_mm_cmpgt_ps(vec0.get128(), vec1.get128()));
-}
-
-inline
-const boolInVec
-operator >= (const floatInVec &vec0, const floatInVec &vec1)
-{
-    return boolInVec(_mm_cmpge_ps(vec0.get128(), vec1.get128()));
-}
-
-inline
-const boolInVec
-operator == (const floatInVec &vec0, const floatInVec &vec1)
-{
-    return boolInVec(_mm_cmpeq_ps(vec0.get128(), vec1.get128()));
-}
-
-inline
-const boolInVec
-operator != (const floatInVec &vec0, const floatInVec &vec1)
-{
-    return boolInVec(_mm_cmpneq_ps(vec0.get128(), vec1.get128()));
-}
-    
-inline
-const floatInVec
-select(const floatInVec &vec0, const floatInVec &vec1, const boolInVec &select_vec1)
-{
-    return floatInVec(vec_sel(vec0.get128(), vec1.get128(), select_vec1.get128()));
-}
-
-} // namespace Vectormath
-
-#endif // floatInVec_h
+/*
+   Copyright (C) 2006, 2007 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+   Redistribution and use in source and binary forms,
+   with or without modification, are permitted provided that the
+   following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the Sony Computer Entertainment Inc nor the names
+      of its contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _FLOATINVEC_H
+#define _FLOATINVEC_H
+
+#include <math.h>
+#include <xmmintrin.h>
+
+namespace Vectormath {
+
+class boolInVec;
+
+//--------------------------------------------------------------------------------------------------
+// floatInVec class
+//
+
+class floatInVec
+{
+    private:
+        __m128 mData;
+
+    public:
+        inline floatInVec(__m128 vec);
+
+        inline floatInVec() {}
+
+        // matches standard type conversions
+        //
+        inline floatInVec(const boolInVec &vec);
+
+        // construct from a slot of __m128
+        //
+        inline floatInVec(__m128 vec, int slot);
+        
+        // explicit cast from float
+        //
+        explicit inline floatInVec(float scalar);
+
+#ifdef _VECTORMATH_NO_SCALAR_CAST
+        // explicit cast to float
+        // 
+        inline float getAsFloat() const;
+#else
+        // implicit cast to float
+        //
+        inline operator float() const;
+#endif
+
+        // get vector data
+        // float value is splatted across all word slots of vector
+        //
+        inline __m128 get128() const;
+
+        // operators
+        // 
+        inline const floatInVec operator ++ (int);
+        inline const floatInVec operator -- (int);
+        inline floatInVec& operator ++ ();
+        inline floatInVec& operator -- ();
+        inline const floatInVec operator - () const;
+        inline floatInVec& operator = (const floatInVec &vec);
+        inline floatInVec& operator *= (const floatInVec &vec);
+        inline floatInVec& operator /= (const floatInVec &vec);
+        inline floatInVec& operator += (const floatInVec &vec);
+        inline floatInVec& operator -= (const floatInVec &vec);
+
+        // friend functions
+        //
+        friend inline const floatInVec operator * (const floatInVec &vec0, const floatInVec &vec1);
+        friend inline const floatInVec operator / (const floatInVec &vec0, const floatInVec &vec1);
+        friend inline const floatInVec operator + (const floatInVec &vec0, const floatInVec &vec1);
+        friend inline const floatInVec operator - (const floatInVec &vec0, const floatInVec &vec1);
+        friend inline const floatInVec select(const floatInVec &vec0, const floatInVec &vec1, boolInVec select_vec1);
+};
+
+//--------------------------------------------------------------------------------------------------
+// floatInVec functions
+//
+
+// operators
+// 
+inline const floatInVec operator * (const floatInVec &vec0, const floatInVec &vec1);
+inline const floatInVec operator / (const floatInVec &vec0, const floatInVec &vec1);
+inline const floatInVec operator + (const floatInVec &vec0, const floatInVec &vec1);
+inline const floatInVec operator - (const floatInVec &vec0, const floatInVec &vec1);
+inline const boolInVec operator < (const floatInVec &vec0, const floatInVec &vec1);
+inline const boolInVec operator <= (const floatInVec &vec0, const floatInVec &vec1);
+inline const boolInVec operator > (const floatInVec &vec0, const floatInVec &vec1);
+inline const boolInVec operator >= (const floatInVec &vec0, const floatInVec &vec1);
+inline const boolInVec operator == (const floatInVec &vec0, const floatInVec &vec1);
+inline const boolInVec operator != (const floatInVec &vec0, const floatInVec &vec1);
+
+// select between vec0 and vec1 using boolInVec.
+// false selects vec0, true selects vec1
+//
+inline const floatInVec select(const floatInVec &vec0, const floatInVec &vec1, const boolInVec &select_vec1);
+
+} // namespace Vectormath
+
+//--------------------------------------------------------------------------------------------------
+// floatInVec implementation
+//
+
+#include "boolInVec.h"
+
+namespace Vectormath {
+
+inline
+floatInVec::floatInVec(__m128 vec)
+{
+    mData = vec;
+}
+
+inline
+floatInVec::floatInVec(const boolInVec &vec)
+{
+	mData = vec_sel(_mm_setzero_ps(), _mm_set1_ps(1.0f), vec.get128());
+}
+
+inline
+floatInVec::floatInVec(__m128 vec, int slot)
+{
+	SSEFloat v;
+	v.m128 = vec;
+	mData = _mm_set1_ps(v.f[slot]);
+}
+
+inline
+floatInVec::floatInVec(float scalar)
+{
+	mData = _mm_set1_ps(scalar);
+}
+
+#ifdef _VECTORMATH_NO_SCALAR_CAST
+inline
+float
+floatInVec::getAsFloat() const
+#else
+inline
+floatInVec::operator float() const
+#endif
+{
+    return *((float *)&mData);
+}
+
+inline
+__m128
+floatInVec::get128() const
+{
+    return mData;
+}
+
+inline
+const floatInVec
+floatInVec::operator ++ (int)
+{
+    __m128 olddata = mData;
+    operator ++();
+    return floatInVec(olddata);
+}
+
+inline
+const floatInVec
+floatInVec::operator -- (int)
+{
+    __m128 olddata = mData;
+    operator --();
+    return floatInVec(olddata);
+}
+
+inline
+floatInVec&
+floatInVec::operator ++ ()
+{
+    *this += floatInVec(_mm_set1_ps(1.0f));
+    return *this;
+}
+
+inline
+floatInVec&
+floatInVec::operator -- ()
+{
+    *this -= floatInVec(_mm_set1_ps(1.0f));
+    return *this;
+}
+
+inline
+const floatInVec
+floatInVec::operator - () const
+{
+    return floatInVec(_mm_sub_ps(_mm_setzero_ps(), mData));
+}
+
+inline
+floatInVec&
+floatInVec::operator = (const floatInVec &vec)
+{
+    mData = vec.mData;
+    return *this;
+}
+
+inline
+floatInVec&
+floatInVec::operator *= (const floatInVec &vec)
+{
+    *this = *this * vec;
+    return *this;
+}
+
+inline
+floatInVec&
+floatInVec::operator /= (const floatInVec &vec)
+{
+    *this = *this / vec;
+    return *this;
+}
+
+inline
+floatInVec&
+floatInVec::operator += (const floatInVec &vec)
+{
+    *this = *this + vec;
+    return *this;
+}
+
+inline
+floatInVec&
+floatInVec::operator -= (const floatInVec &vec)
+{
+    *this = *this - vec;
+    return *this;
+}
+
+inline
+const floatInVec
+operator * (const floatInVec &vec0, const floatInVec &vec1)
+{
+    return floatInVec(_mm_mul_ps(vec0.get128(), vec1.get128()));
+}
+
+inline
+const floatInVec
+operator / (const floatInVec &num, const floatInVec &den)
+{
+    return floatInVec(_mm_div_ps(num.get128(), den.get128()));
+}
+
+inline
+const floatInVec
+operator + (const floatInVec &vec0, const floatInVec &vec1)
+{
+    return floatInVec(_mm_add_ps(vec0.get128(), vec1.get128()));
+}
+
+inline
+const floatInVec
+operator - (const floatInVec &vec0, const floatInVec &vec1)
+{
+    return floatInVec(_mm_sub_ps(vec0.get128(), vec1.get128()));
+}
+
+inline
+const boolInVec
+operator < (const floatInVec &vec0, const floatInVec &vec1)
+{
+    return boolInVec(_mm_cmpgt_ps(vec1.get128(), vec0.get128()));
+}
+
+inline
+const boolInVec
+operator <= (const floatInVec &vec0, const floatInVec &vec1)
+{
+    return boolInVec(_mm_cmpge_ps(vec1.get128(), vec0.get128()));
+}
+
+inline
+const boolInVec
+operator > (const floatInVec &vec0, const floatInVec &vec1)
+{
+    return boolInVec(_mm_cmpgt_ps(vec0.get128(), vec1.get128()));
+}
+
+inline
+const boolInVec
+operator >= (const floatInVec &vec0, const floatInVec &vec1)
+{
+    return boolInVec(_mm_cmpge_ps(vec0.get128(), vec1.get128()));
+}
+
+inline
+const boolInVec
+operator == (const floatInVec &vec0, const floatInVec &vec1)
+{
+    return boolInVec(_mm_cmpeq_ps(vec0.get128(), vec1.get128()));
+}
+
+inline
+const boolInVec
+operator != (const floatInVec &vec0, const floatInVec &vec1)
+{
+    return boolInVec(_mm_cmpneq_ps(vec0.get128(), vec1.get128()));
+}
+    
+inline
+const floatInVec
+select(const floatInVec &vec0, const floatInVec &vec1, const boolInVec &select_vec1)
+{
+    return floatInVec(vec_sel(vec0.get128(), vec1.get128(), select_vec1.get128()));
+}
+
+} // namespace Vectormath
+
+#endif // floatInVec_h
diff --git a/src/vectormath/sse/mat_aos.h b/src/vectormath/sse/mat_aos.h
index fe6aa62b9..a2c66cc5f 100644
--- a/src/vectormath/sse/mat_aos.h
+++ b/src/vectormath/sse/mat_aos.h
@@ -1,2190 +1,2190 @@
-/*
-   Copyright (C) 2006, 2007 Sony Computer Entertainment Inc.
-   All rights reserved.
-
-   Redistribution and use in source and binary forms,
-   with or without modification, are permitted provided that the
-   following conditions are met:
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    * Neither the name of the Sony Computer Entertainment Inc nor the names
-      of its contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-   POSSIBILITY OF SUCH DAMAGE.
-*/
-
-
-#ifndef _VECTORMATH_MAT_AOS_CPP_H
-#define _VECTORMATH_MAT_AOS_CPP_H
-
-namespace Vectormath {
-namespace Aos {
-
-//-----------------------------------------------------------------------------
-// Constants
-// for shuffles, words are labeled [x,y,z,w] [a,b,c,d]
-
-#define _VECTORMATH_PERM_ZBWX ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Z, _VECTORMATH_PERM_B, _VECTORMATH_PERM_W, _VECTORMATH_PERM_X })
-#define _VECTORMATH_PERM_XCYX ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_X, _VECTORMATH_PERM_C, _VECTORMATH_PERM_Y, _VECTORMATH_PERM_X })
-#define _VECTORMATH_PERM_XYAB ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_X, _VECTORMATH_PERM_Y, _VECTORMATH_PERM_A, _VECTORMATH_PERM_B })
-#define _VECTORMATH_PERM_ZWCD ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Z, _VECTORMATH_PERM_W, _VECTORMATH_PERM_C, _VECTORMATH_PERM_D })
-#define _VECTORMATH_PERM_XZBX ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_X, _VECTORMATH_PERM_Z, _VECTORMATH_PERM_B, _VECTORMATH_PERM_X })     
-#define _VECTORMATH_PERM_CXXX ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_C, _VECTORMATH_PERM_X, _VECTORMATH_PERM_X, _VECTORMATH_PERM_X })
-#define _VECTORMATH_PERM_YAXX ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Y, _VECTORMATH_PERM_A, _VECTORMATH_PERM_X, _VECTORMATH_PERM_X })
-#define _VECTORMATH_PERM_XAZC ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_X, _VECTORMATH_PERM_A, _VECTORMATH_PERM_Z, _VECTORMATH_PERM_C })
-#define _VECTORMATH_PERM_YXWZ ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Y, _VECTORMATH_PERM_X, _VECTORMATH_PERM_W, _VECTORMATH_PERM_Z })
-#define _VECTORMATH_PERM_YBWD ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Y, _VECTORMATH_PERM_B, _VECTORMATH_PERM_W, _VECTORMATH_PERM_D })
-#define _VECTORMATH_PERM_XYCX ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_X, _VECTORMATH_PERM_Y, _VECTORMATH_PERM_C, _VECTORMATH_PERM_X })
-#define _VECTORMATH_PERM_YCXY ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Y, _VECTORMATH_PERM_C, _VECTORMATH_PERM_X, _VECTORMATH_PERM_Y })
-#define _VECTORMATH_PERM_CXYC ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_C, _VECTORMATH_PERM_X, _VECTORMATH_PERM_Y, _VECTORMATH_PERM_C })
-#define _VECTORMATH_PERM_ZAYX ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Z, _VECTORMATH_PERM_A, _VECTORMATH_PERM_Y, _VECTORMATH_PERM_X })
-#define _VECTORMATH_PERM_BZXX ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_B, _VECTORMATH_PERM_Z, _VECTORMATH_PERM_X, _VECTORMATH_PERM_X })
-#define _VECTORMATH_PERM_XZYA ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_X, _VECTORMATH_PERM_Z, _VECTORMATH_PERM_Y, _VECTORMATH_PERM_A })
-#define _VECTORMATH_PERM_ZXXB ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Z, _VECTORMATH_PERM_X, _VECTORMATH_PERM_X, _VECTORMATH_PERM_B })
-#define _VECTORMATH_PERM_YXXC ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Y, _VECTORMATH_PERM_X, _VECTORMATH_PERM_X, _VECTORMATH_PERM_C })
-#define _VECTORMATH_PERM_BBYX ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_B, _VECTORMATH_PERM_B, _VECTORMATH_PERM_Y, _VECTORMATH_PERM_X })
-#define _VECTORMATH_PI_OVER_2 1.570796327f
-
-//-----------------------------------------------------------------------------
-// Definitions
-
-VECTORMATH_FORCE_INLINE Matrix3::Matrix3( const Matrix3 & mat )
-{
-    mCol0 = mat.mCol0;
-    mCol1 = mat.mCol1;
-    mCol2 = mat.mCol2;
-}
-
-VECTORMATH_FORCE_INLINE Matrix3::Matrix3( float scalar )
-{
-    mCol0 = Vector3( scalar );
-    mCol1 = Vector3( scalar );
-    mCol2 = Vector3( scalar );
-}
-
-VECTORMATH_FORCE_INLINE Matrix3::Matrix3( const floatInVec &scalar )
-{
-    mCol0 = Vector3( scalar );
-    mCol1 = Vector3( scalar );
-    mCol2 = Vector3( scalar );
-}
-
-VECTORMATH_FORCE_INLINE Matrix3::Matrix3( const Quat &unitQuat )
-{
-    __m128 xyzw_2, wwww, yzxw, zxyw, yzxw_2, zxyw_2;
-    __m128 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
-	VM_ATTRIBUTE_ALIGN16 unsigned int sx[4] = {0xffffffff, 0, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int sz[4] = {0, 0, 0xffffffff, 0};
-	__m128 select_x = _mm_load_ps((float *)sx);
-	__m128 select_z = _mm_load_ps((float *)sz);
-
-    xyzw_2 = _mm_add_ps( unitQuat.get128(), unitQuat.get128() );
-    wwww = _mm_shuffle_ps( unitQuat.get128(), unitQuat.get128(), _MM_SHUFFLE(3,3,3,3) );
-	yzxw = _mm_shuffle_ps( unitQuat.get128(), unitQuat.get128(), _MM_SHUFFLE(3,0,2,1) );
-	zxyw = _mm_shuffle_ps( unitQuat.get128(), unitQuat.get128(), _MM_SHUFFLE(3,1,0,2) );
-    yzxw_2 = _mm_shuffle_ps( xyzw_2, xyzw_2, _MM_SHUFFLE(3,0,2,1) );
-    zxyw_2 = _mm_shuffle_ps( xyzw_2, xyzw_2, _MM_SHUFFLE(3,1,0,2) );
-
-    tmp0 = _mm_mul_ps( yzxw_2, wwww );									// tmp0 = 2yw, 2zw, 2xw, 2w2
-	tmp1 = _mm_sub_ps( _mm_set1_ps(1.0f), _mm_mul_ps(yzxw, yzxw_2) );	// tmp1 = 1 - 2y2, 1 - 2z2, 1 - 2x2, 1 - 2w2
-    tmp2 = _mm_mul_ps( yzxw, xyzw_2 );									// tmp2 = 2xy, 2yz, 2xz, 2w2
-    tmp0 = _mm_add_ps( _mm_mul_ps(zxyw, xyzw_2), tmp0 );				// tmp0 = 2yw + 2zx, 2zw + 2xy, 2xw + 2yz, 2w2 + 2w2
-    tmp1 = _mm_sub_ps( tmp1, _mm_mul_ps(zxyw, zxyw_2) );				// tmp1 = 1 - 2y2 - 2z2, 1 - 2z2 - 2x2, 1 - 2x2 - 2y2, 1 - 2w2 - 2w2
-    tmp2 = _mm_sub_ps( tmp2, _mm_mul_ps(zxyw_2, wwww) );				// tmp2 = 2xy - 2zw, 2yz - 2xw, 2xz - 2yw, 2w2 -2w2
-
-    tmp3 = vec_sel( tmp0, tmp1, select_x );
-    tmp4 = vec_sel( tmp1, tmp2, select_x );
-    tmp5 = vec_sel( tmp2, tmp0, select_x );
-    mCol0 = Vector3( vec_sel( tmp3, tmp2, select_z ) );
-    mCol1 = Vector3( vec_sel( tmp4, tmp0, select_z ) );
-    mCol2 = Vector3( vec_sel( tmp5, tmp1, select_z ) );
-}
-
-VECTORMATH_FORCE_INLINE Matrix3::Matrix3( const Vector3 &_col0, const Vector3 &_col1, const Vector3 &_col2 )
-{
-    mCol0 = _col0;
-    mCol1 = _col1;
-    mCol2 = _col2;
-}
-
-VECTORMATH_FORCE_INLINE Matrix3 & Matrix3::setCol0( const Vector3 &_col0 )
-{
-    mCol0 = _col0;
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Matrix3 & Matrix3::setCol1( const Vector3 &_col1 )
-{
-    mCol1 = _col1;
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Matrix3 & Matrix3::setCol2( const Vector3 &_col2 )
-{
-    mCol2 = _col2;
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Matrix3 & Matrix3::setCol( int col, const Vector3 &vec )
-{
-    *(&mCol0 + col) = vec;
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Matrix3 & Matrix3::setRow( int row, const Vector3 &vec )
-{
-    mCol0.setElem( row, vec.getElem( 0 ) );
-    mCol1.setElem( row, vec.getElem( 1 ) );
-    mCol2.setElem( row, vec.getElem( 2 ) );
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Matrix3 & Matrix3::setElem( int col, int row, float val )
-{
-    (*this)[col].setElem(row, val);
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Matrix3 & Matrix3::setElem( int col, int row, const floatInVec &val )
-{
-    Vector3 tmpV3_0;
-    tmpV3_0 = this->getCol( col );
-    tmpV3_0.setElem( row, val );
-    this->setCol( col, tmpV3_0 );
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE const floatInVec Matrix3::getElem( int col, int row ) const
-{
-    return this->getCol( col ).getElem( row );
-}
-
-VECTORMATH_FORCE_INLINE const Vector3 Matrix3::getCol0( ) const
-{
-    return mCol0;
-}
-
-VECTORMATH_FORCE_INLINE const Vector3 Matrix3::getCol1( ) const
-{
-    return mCol1;
-}
-
-VECTORMATH_FORCE_INLINE const Vector3 Matrix3::getCol2( ) const
-{
-    return mCol2;
-}
-
-VECTORMATH_FORCE_INLINE const Vector3 Matrix3::getCol( int col ) const
-{
-    return *(&mCol0 + col);
-}
-
-VECTORMATH_FORCE_INLINE const Vector3 Matrix3::getRow( int row ) const
-{
-    return Vector3( mCol0.getElem( row ), mCol1.getElem( row ), mCol2.getElem( row ) );
-}
-
-VECTORMATH_FORCE_INLINE Vector3 & Matrix3::operator []( int col )
-{
-    return *(&mCol0 + col);
-}
-
-VECTORMATH_FORCE_INLINE const Vector3 Matrix3::operator []( int col ) const
-{
-    return *(&mCol0 + col);
-}
-
-VECTORMATH_FORCE_INLINE Matrix3 & Matrix3::operator =( const Matrix3 & mat )
-{
-    mCol0 = mat.mCol0;
-    mCol1 = mat.mCol1;
-    mCol2 = mat.mCol2;
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE const Matrix3 transpose( const Matrix3 & mat )
-{
-    __m128 tmp0, tmp1, res0, res1, res2;
-    tmp0 = vec_mergeh( mat.getCol0().get128(), mat.getCol2().get128() );
-    tmp1 = vec_mergel( mat.getCol0().get128(), mat.getCol2().get128() );
-    res0 = vec_mergeh( tmp0, mat.getCol1().get128() );
-    //res1 = vec_perm( tmp0, mat.getCol1().get128(), _VECTORMATH_PERM_ZBWX );
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
-	res1 = _mm_shuffle_ps( tmp0, tmp0, _MM_SHUFFLE(0,3,2,2));
-	res1 = vec_sel(res1, mat.getCol1().get128(), select_y);
-    //res2 = vec_perm( tmp1, mat.getCol1().get128(), _VECTORMATH_PERM_XCYX );
-	res2 = _mm_shuffle_ps( tmp1, tmp1, _MM_SHUFFLE(0,1,1,0));
-	res2 = vec_sel(res2, vec_splat(mat.getCol1().get128(), 2), select_y);
-    return Matrix3(
-        Vector3( res0 ),
-        Vector3( res1 ),
-        Vector3( res2 )
-    );
-}
-
-VECTORMATH_FORCE_INLINE const Matrix3 inverse( const Matrix3 & mat )
-{
-    __m128 tmp0, tmp1, tmp2, tmp3, tmp4, dot, invdet, inv0, inv1, inv2;
-    tmp2 = _vmathVfCross( mat.getCol0().get128(), mat.getCol1().get128() );
-    tmp0 = _vmathVfCross( mat.getCol1().get128(), mat.getCol2().get128() );
-    tmp1 = _vmathVfCross( mat.getCol2().get128(), mat.getCol0().get128() );
-    dot = _vmathVfDot3( tmp2, mat.getCol2().get128() );
-    dot = vec_splat( dot, 0 );
-    invdet = recipf4( dot );
-    tmp3 = vec_mergeh( tmp0, tmp2 );
-    tmp4 = vec_mergel( tmp0, tmp2 );
-    inv0 = vec_mergeh( tmp3, tmp1 );
-    //inv1 = vec_perm( tmp3, tmp1, _VECTORMATH_PERM_ZBWX );
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
-	inv1 = _mm_shuffle_ps( tmp3, tmp3, _MM_SHUFFLE(0,3,2,2));
-	inv1 = vec_sel(inv1, tmp1, select_y);
-    //inv2 = vec_perm( tmp4, tmp1, _VECTORMATH_PERM_XCYX );
-	inv2 = _mm_shuffle_ps( tmp4, tmp4, _MM_SHUFFLE(0,1,1,0));
-	inv2 = vec_sel(inv2, vec_splat(tmp1, 2), select_y);
-    inv0 = vec_mul( inv0, invdet );
-    inv1 = vec_mul( inv1, invdet );
-	inv2 = vec_mul( inv2, invdet );
-    return Matrix3(
-        Vector3( inv0 ),
-        Vector3( inv1 ),
-        Vector3( inv2 )
-    );
-}
-
-VECTORMATH_FORCE_INLINE const floatInVec determinant( const Matrix3 & mat )
-{
-    return dot( mat.getCol2(), cross( mat.getCol0(), mat.getCol1() ) );
-}
-
-VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::operator +( const Matrix3 & mat ) const
-{
-    return Matrix3(
-        ( mCol0 + mat.mCol0 ),
-        ( mCol1 + mat.mCol1 ),
-        ( mCol2 + mat.mCol2 )
-    );
-}
-
-VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::operator -( const Matrix3 & mat ) const
-{
-    return Matrix3(
-        ( mCol0 - mat.mCol0 ),
-        ( mCol1 - mat.mCol1 ),
-        ( mCol2 - mat.mCol2 )
-    );
-}
-
-VECTORMATH_FORCE_INLINE Matrix3 & Matrix3::operator +=( const Matrix3 & mat )
-{
-    *this = *this + mat;
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Matrix3 & Matrix3::operator -=( const Matrix3 & mat )
-{
-    *this = *this - mat;
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::operator -( ) const
-{
-    return Matrix3(
-        ( -mCol0 ),
-        ( -mCol1 ),
-        ( -mCol2 )
-    );
-}
-
-VECTORMATH_FORCE_INLINE const Matrix3 absPerElem( const Matrix3 & mat )
-{
-    return Matrix3(
-        absPerElem( mat.getCol0() ),
-        absPerElem( mat.getCol1() ),
-        absPerElem( mat.getCol2() )
-    );
-}
-
-VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::operator *( float scalar ) const
-{
-    return *this * floatInVec(scalar);
-}
-
-VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::operator *( const floatInVec &scalar ) const
-{
-    return Matrix3(
-        ( mCol0 * scalar ),
-        ( mCol1 * scalar ),
-        ( mCol2 * scalar )
-    );
-}
-
-VECTORMATH_FORCE_INLINE Matrix3 & Matrix3::operator *=( float scalar )
-{
-    return *this *= floatInVec(scalar);
-}
-
-VECTORMATH_FORCE_INLINE Matrix3 & Matrix3::operator *=( const floatInVec &scalar )
-{
-    *this = *this * scalar;
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE const Matrix3 operator *( float scalar, const Matrix3 & mat )
-{
-    return floatInVec(scalar) * mat;
-}
-
-VECTORMATH_FORCE_INLINE const Matrix3 operator *( const floatInVec &scalar, const Matrix3 & mat )
-{
-    return mat * scalar;
-}
-
-VECTORMATH_FORCE_INLINE const Vector3 Matrix3::operator *( const Vector3 &vec ) const
-{
-    __m128 res;
-    __m128 xxxx, yyyy, zzzz;
-    xxxx = vec_splat( vec.get128(), 0 );
-    yyyy = vec_splat( vec.get128(), 1 );
-    zzzz = vec_splat( vec.get128(), 2 );
-    res = vec_mul( mCol0.get128(), xxxx );
-    res = vec_madd( mCol1.get128(), yyyy, res );
-    res = vec_madd( mCol2.get128(), zzzz, res );
-    return Vector3( res );
-}
-
-VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::operator *( const Matrix3 & mat ) const
-{
-    return Matrix3(
-        ( *this * mat.mCol0 ),
-        ( *this * mat.mCol1 ),
-        ( *this * mat.mCol2 )
-    );
-}
-
-VECTORMATH_FORCE_INLINE Matrix3 & Matrix3::operator *=( const Matrix3 & mat )
-{
-    *this = *this * mat;
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE const Matrix3 mulPerElem( const Matrix3 & mat0, const Matrix3 & mat1 )
-{
-    return Matrix3(
-        mulPerElem( mat0.getCol0(), mat1.getCol0() ),
-        mulPerElem( mat0.getCol1(), mat1.getCol1() ),
-        mulPerElem( mat0.getCol2(), mat1.getCol2() )
-    );
-}
-
-VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::identity( )
-{
-    return Matrix3(
-        Vector3::xAxis( ),
-        Vector3::yAxis( ),
-        Vector3::zAxis( )
-    );
-}
-
-VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::rotationX( float radians )
-{
-    return rotationX( floatInVec(radians) );
-}
-
-VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::rotationX( const floatInVec &radians )
-{
-    __m128 s, c, res1, res2;
-    __m128 zero;
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
-    zero = _mm_setzero_ps();
-    sincosf4( radians.get128(), &s, &c );
-    res1 = vec_sel( zero, c, select_y );
-    res1 = vec_sel( res1, s, select_z );
-    res2 = vec_sel( zero, negatef4(s), select_y );
-    res2 = vec_sel( res2, c, select_z );
-    return Matrix3(
-        Vector3::xAxis( ),
-        Vector3( res1 ),
-        Vector3( res2 )
-    );
-}
-
-VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::rotationY( float radians )
-{
-    return rotationY( floatInVec(radians) );
-}
-
-VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::rotationY( const floatInVec &radians )
-{
-    __m128 s, c, res0, res2;
-    __m128 zero;
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
-    zero = _mm_setzero_ps();
-    sincosf4( radians.get128(), &s, &c );
-    res0 = vec_sel( zero, c, select_x );
-    res0 = vec_sel( res0, negatef4(s), select_z );
-    res2 = vec_sel( zero, s, select_x );
-    res2 = vec_sel( res2, c, select_z );
-    return Matrix3(
-        Vector3( res0 ),
-        Vector3::yAxis( ),
-        Vector3( res2 )
-	);
-}
-
-VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::rotationZ( float radians )
-{
-    return rotationZ( floatInVec(radians) );
-}
-
-VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::rotationZ( const floatInVec &radians )
-{
-    __m128 s, c, res0, res1;
-    __m128 zero;
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
-    zero = _mm_setzero_ps();
-    sincosf4( radians.get128(), &s, &c );
-    res0 = vec_sel( zero, c, select_x );
-    res0 = vec_sel( res0, s, select_y );
-    res1 = vec_sel( zero, negatef4(s), select_x );
-    res1 = vec_sel( res1, c, select_y );
-    return Matrix3(
-        Vector3( res0 ),
-        Vector3( res1 ),
-        Vector3::zAxis( )
-	);
-}
-
-VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::rotationZYX( const Vector3 &radiansXYZ )
-{
-    __m128 angles, s, negS, c, X0, X1, Y0, Y1, Z0, Z1, tmp;
-    angles = Vector4( radiansXYZ, 0.0f ).get128();
-    sincosf4( angles, &s, &c );
-    negS = negatef4( s );
-    Z0 = vec_mergel( c, s );
-    Z1 = vec_mergel( negS, c );
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_xyz[4] = {0xffffffff, 0xffffffff, 0xffffffff, 0};
-    Z1 = vec_and( Z1, _mm_load_ps( (float *)select_xyz ) );
-	Y0 = _mm_shuffle_ps( c, negS, _MM_SHUFFLE(0,1,1,1) );
-	Y1 = _mm_shuffle_ps( s, c, _MM_SHUFFLE(0,1,1,1) );
-    X0 = vec_splat( s, 0 );
-    X1 = vec_splat( c, 0 );
-    tmp = vec_mul( Z0, Y1 );
-    return Matrix3(
-        Vector3( vec_mul( Z0, Y0 ) ),
-        Vector3( vec_madd( Z1, X1, vec_mul( tmp, X0 ) ) ),
-        Vector3( vec_nmsub( Z1, X0, vec_mul( tmp, X1 ) ) )
-    );
-}
-
-VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::rotation( float radians, const Vector3 &unitVec )
-{
-    return rotation( floatInVec(radians), unitVec );
-}
-
-VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::rotation( const floatInVec &radians, const Vector3 &unitVec )
-{
-    __m128 axis, s, c, oneMinusC, axisS, negAxisS, xxxx, yyyy, zzzz, tmp0, tmp1, tmp2;
-    axis = unitVec.get128();
-    sincosf4( radians.get128(), &s, &c );
-    xxxx = vec_splat( axis, 0 );
-    yyyy = vec_splat( axis, 1 );
-    zzzz = vec_splat( axis, 2 );
-    oneMinusC = vec_sub( _mm_set1_ps(1.0f), c );
-    axisS = vec_mul( axis, s );
-    negAxisS = negatef4( axisS );
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
-    //tmp0 = vec_perm( axisS, negAxisS, _VECTORMATH_PERM_XZBX );
-	tmp0 = _mm_shuffle_ps( axisS, axisS, _MM_SHUFFLE(0,0,2,0) );
-	tmp0 = vec_sel(tmp0, vec_splat(negAxisS, 1), select_z);
-    //tmp1 = vec_perm( axisS, negAxisS, _VECTORMATH_PERM_CXXX );
-	tmp1 = vec_sel( vec_splat(axisS, 0), vec_splat(negAxisS, 2), select_x );
-    //tmp2 = vec_perm( axisS, negAxisS, _VECTORMATH_PERM_YAXX );
-	tmp2 = _mm_shuffle_ps( axisS, axisS, _MM_SHUFFLE(0,0,0,1) );
-	tmp2 = vec_sel(tmp2, vec_splat(negAxisS, 0), select_y);
-    tmp0 = vec_sel( tmp0, c, select_x );
-    tmp1 = vec_sel( tmp1, c, select_y );
-    tmp2 = vec_sel( tmp2, c, select_z );
-    return Matrix3(
-        Vector3( vec_madd( vec_mul( axis, xxxx ), oneMinusC, tmp0 ) ),
-        Vector3( vec_madd( vec_mul( axis, yyyy ), oneMinusC, tmp1 ) ),
-        Vector3( vec_madd( vec_mul( axis, zzzz ), oneMinusC, tmp2 ) )
-    );
-}
-
-VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::rotation( const Quat &unitQuat )
-{
-    return Matrix3( unitQuat );
-}
-
-VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::scale( const Vector3 &scaleVec )
-{
-    __m128 zero = _mm_setzero_ps();
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
-    return Matrix3(
-        Vector3( vec_sel( zero, scaleVec.get128(), select_x ) ),
-        Vector3( vec_sel( zero, scaleVec.get128(), select_y ) ),
-        Vector3( vec_sel( zero, scaleVec.get128(), select_z ) )
-    );
-}
-
-VECTORMATH_FORCE_INLINE const Matrix3 appendScale( const Matrix3 & mat, const Vector3 &scaleVec )
-{
-    return Matrix3(
-        ( mat.getCol0() * scaleVec.getX( ) ),
-        ( mat.getCol1() * scaleVec.getY( ) ),
-        ( mat.getCol2() * scaleVec.getZ( ) )
-    );
-}
-
-VECTORMATH_FORCE_INLINE const Matrix3 prependScale( const Vector3 &scaleVec, const Matrix3 & mat )
-{
-    return Matrix3(
-        mulPerElem( mat.getCol0(), scaleVec ),
-        mulPerElem( mat.getCol1(), scaleVec ),
-        mulPerElem( mat.getCol2(), scaleVec )
-    );
-}
-
-VECTORMATH_FORCE_INLINE const Matrix3 select( const Matrix3 & mat0, const Matrix3 & mat1, bool select1 )
-{
-    return Matrix3(
-        select( mat0.getCol0(), mat1.getCol0(), select1 ),
-        select( mat0.getCol1(), mat1.getCol1(), select1 ),
-        select( mat0.getCol2(), mat1.getCol2(), select1 )
-    );
-}
-
-VECTORMATH_FORCE_INLINE const Matrix3 select( const Matrix3 & mat0, const Matrix3 & mat1, const boolInVec &select1 )
-{
-    return Matrix3(
-        select( mat0.getCol0(), mat1.getCol0(), select1 ),
-        select( mat0.getCol1(), mat1.getCol1(), select1 ),
-        select( mat0.getCol2(), mat1.getCol2(), select1 )
-    );
-}
-
-#ifdef _VECTORMATH_DEBUG
-
-VECTORMATH_FORCE_INLINE void print( const Matrix3 & mat )
-{
-    print( mat.getRow( 0 ) );
-    print( mat.getRow( 1 ) );
-    print( mat.getRow( 2 ) );
-}
-
-VECTORMATH_FORCE_INLINE void print( const Matrix3 & mat, const char * name )
-{
-    printf("%s:\n", name);
-    print( mat );
-}
-
-#endif
-
-VECTORMATH_FORCE_INLINE Matrix4::Matrix4( const Matrix4 & mat )
-{
-    mCol0 = mat.mCol0;
-    mCol1 = mat.mCol1;
-    mCol2 = mat.mCol2;
-    mCol3 = mat.mCol3;
-}
-
-VECTORMATH_FORCE_INLINE Matrix4::Matrix4( float scalar )
-{
-    mCol0 = Vector4( scalar );
-    mCol1 = Vector4( scalar );
-    mCol2 = Vector4( scalar );
-    mCol3 = Vector4( scalar );
-}
-
-VECTORMATH_FORCE_INLINE Matrix4::Matrix4( const floatInVec &scalar )
-{
-    mCol0 = Vector4( scalar );
-    mCol1 = Vector4( scalar );
-    mCol2 = Vector4( scalar );
-    mCol3 = Vector4( scalar );
-}
-
-VECTORMATH_FORCE_INLINE Matrix4::Matrix4( const Transform3 & mat )
-{
-    mCol0 = Vector4( mat.getCol0(), 0.0f );
-    mCol1 = Vector4( mat.getCol1(), 0.0f );
-    mCol2 = Vector4( mat.getCol2(), 0.0f );
-    mCol3 = Vector4( mat.getCol3(), 1.0f );
-}
-
-VECTORMATH_FORCE_INLINE Matrix4::Matrix4( const Vector4 &_col0, const Vector4 &_col1, const Vector4 &_col2, const Vector4 &_col3 )
-{
-    mCol0 = _col0;
-    mCol1 = _col1;
-    mCol2 = _col2;
-    mCol3 = _col3;
-}
-
-VECTORMATH_FORCE_INLINE Matrix4::Matrix4( const Matrix3 & mat, const Vector3 &translateVec )
-{
-    mCol0 = Vector4( mat.getCol0(), 0.0f );
-    mCol1 = Vector4( mat.getCol1(), 0.0f );
-    mCol2 = Vector4( mat.getCol2(), 0.0f );
-    mCol3 = Vector4( translateVec, 1.0f );
-}
-
-VECTORMATH_FORCE_INLINE Matrix4::Matrix4( const Quat &unitQuat, const Vector3 &translateVec )
-{
-    Matrix3 mat;
-    mat = Matrix3( unitQuat );
-    mCol0 = Vector4( mat.getCol0(), 0.0f );
-    mCol1 = Vector4( mat.getCol1(), 0.0f );
-    mCol2 = Vector4( mat.getCol2(), 0.0f );
-    mCol3 = Vector4( translateVec, 1.0f );
-}
-
-VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::setCol0( const Vector4 &_col0 )
-{
-    mCol0 = _col0;
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::setCol1( const Vector4 &_col1 )
-{
-    mCol1 = _col1;
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::setCol2( const Vector4 &_col2 )
-{
-    mCol2 = _col2;
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::setCol3( const Vector4 &_col3 )
-{
-    mCol3 = _col3;
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::setCol( int col, const Vector4 &vec )
-{
-    *(&mCol0 + col) = vec;
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::setRow( int row, const Vector4 &vec )
-{
-    mCol0.setElem( row, vec.getElem( 0 ) );
-    mCol1.setElem( row, vec.getElem( 1 ) );
-    mCol2.setElem( row, vec.getElem( 2 ) );
-    mCol3.setElem( row, vec.getElem( 3 ) );
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::setElem( int col, int row, float val )
-{
-    (*this)[col].setElem(row, val);
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::setElem( int col, int row, const floatInVec &val )
-{
-    Vector4 tmpV3_0;
-    tmpV3_0 = this->getCol( col );
-    tmpV3_0.setElem( row, val );
-    this->setCol( col, tmpV3_0 );
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE const floatInVec Matrix4::getElem( int col, int row ) const
-{
-    return this->getCol( col ).getElem( row );
-}
-
-VECTORMATH_FORCE_INLINE const Vector4 Matrix4::getCol0( ) const
-{
-    return mCol0;
-}
-
-VECTORMATH_FORCE_INLINE const Vector4 Matrix4::getCol1( ) const
-{
-    return mCol1;
-}
-
-VECTORMATH_FORCE_INLINE const Vector4 Matrix4::getCol2( ) const
-{
-    return mCol2;
-}
-
-VECTORMATH_FORCE_INLINE const Vector4 Matrix4::getCol3( ) const
-{
-    return mCol3;
-}
-
-VECTORMATH_FORCE_INLINE const Vector4 Matrix4::getCol( int col ) const
-{
-    return *(&mCol0 + col);
-}
-
-VECTORMATH_FORCE_INLINE const Vector4 Matrix4::getRow( int row ) const
-{
-    return Vector4( mCol0.getElem( row ), mCol1.getElem( row ), mCol2.getElem( row ), mCol3.getElem( row ) );
-}
-
-VECTORMATH_FORCE_INLINE Vector4 & Matrix4::operator []( int col )
-{
-    return *(&mCol0 + col);
-}
-
-VECTORMATH_FORCE_INLINE const Vector4 Matrix4::operator []( int col ) const
-{
-    return *(&mCol0 + col);
-}
-
-VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::operator =( const Matrix4 & mat )
-{
-    mCol0 = mat.mCol0;
-    mCol1 = mat.mCol1;
-    mCol2 = mat.mCol2;
-    mCol3 = mat.mCol3;
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE const Matrix4 transpose( const Matrix4 & mat )
-{
-    __m128 tmp0, tmp1, tmp2, tmp3, res0, res1, res2, res3;
-    tmp0 = vec_mergeh( mat.getCol0().get128(), mat.getCol2().get128() );
-    tmp1 = vec_mergeh( mat.getCol1().get128(), mat.getCol3().get128() );
-    tmp2 = vec_mergel( mat.getCol0().get128(), mat.getCol2().get128() );
-    tmp3 = vec_mergel( mat.getCol1().get128(), mat.getCol3().get128() );
-    res0 = vec_mergeh( tmp0, tmp1 );
-    res1 = vec_mergel( tmp0, tmp1 );
-    res2 = vec_mergeh( tmp2, tmp3 );
-    res3 = vec_mergel( tmp2, tmp3 );
-    return Matrix4(
-        Vector4( res0 ),
-        Vector4( res1 ),
-        Vector4( res2 ),
-        Vector4( res3 )
-    );
-}
-
-// TODO: Tidy
-static VM_ATTRIBUTE_ALIGN16 const unsigned int _vmathPNPN[4] = {0x00000000, 0x80000000, 0x00000000, 0x80000000};
-static VM_ATTRIBUTE_ALIGN16 const unsigned int _vmathNPNP[4] = {0x80000000, 0x00000000, 0x80000000, 0x00000000};
-static VM_ATTRIBUTE_ALIGN16 const float _vmathZERONE[4] = {1.0f, 0.0f, 0.0f, 1.0f};
-
-VECTORMATH_FORCE_INLINE const Matrix4 inverse( const Matrix4 & mat )
-{
-	__m128 Va,Vb,Vc;
-	__m128 r1,r2,r3,tt,tt2;
-	__m128 sum,Det,RDet;
-	__m128 trns0,trns1,trns2,trns3;
-
-	__m128 _L1 = mat.getCol0().get128();
-	__m128 _L2 = mat.getCol1().get128();
-	__m128 _L3 = mat.getCol2().get128();
-	__m128 _L4 = mat.getCol3().get128();
-	// Calculating the minterms for the first line.
-
-	// _mm_ror_ps is just a macro using _mm_shuffle_ps().
-	tt = _L4; tt2 = _mm_ror_ps(_L3,1); 
-	Vc = _mm_mul_ps(tt2,_mm_ror_ps(tt,0));					// V3'dot V4
-	Va = _mm_mul_ps(tt2,_mm_ror_ps(tt,2));					// V3'dot V4"
-	Vb = _mm_mul_ps(tt2,_mm_ror_ps(tt,3));					// V3' dot V4^
-
-	r1 = _mm_sub_ps(_mm_ror_ps(Va,1),_mm_ror_ps(Vc,2));		// V3" dot V4^ - V3^ dot V4"
-	r2 = _mm_sub_ps(_mm_ror_ps(Vb,2),_mm_ror_ps(Vb,0));		// V3^ dot V4' - V3' dot V4^
-	r3 = _mm_sub_ps(_mm_ror_ps(Va,0),_mm_ror_ps(Vc,1));		// V3' dot V4" - V3" dot V4'
-
-	tt = _L2;
-	Va = _mm_ror_ps(tt,1);		sum = _mm_mul_ps(Va,r1);
-	Vb = _mm_ror_ps(tt,2);		sum = _mm_add_ps(sum,_mm_mul_ps(Vb,r2));
-	Vc = _mm_ror_ps(tt,3);		sum = _mm_add_ps(sum,_mm_mul_ps(Vc,r3));
-
-	// Calculating the determinant.
-	Det = _mm_mul_ps(sum,_L1);
-	Det = _mm_add_ps(Det,_mm_movehl_ps(Det,Det));
-
-	const __m128 Sign_PNPN = _mm_load_ps((float *)_vmathPNPN);
-	const __m128 Sign_NPNP = _mm_load_ps((float *)_vmathNPNP);
-
-	__m128 mtL1 = _mm_xor_ps(sum,Sign_PNPN);
-
-	// Calculating the minterms of the second line (using previous results).
-	tt = _mm_ror_ps(_L1,1);		sum = _mm_mul_ps(tt,r1);
-	tt = _mm_ror_ps(tt,1);		sum = _mm_add_ps(sum,_mm_mul_ps(tt,r2));
-	tt = _mm_ror_ps(tt,1);		sum = _mm_add_ps(sum,_mm_mul_ps(tt,r3));
-	__m128 mtL2 = _mm_xor_ps(sum,Sign_NPNP);
-
-	// Testing the determinant.
-	Det = _mm_sub_ss(Det,_mm_shuffle_ps(Det,Det,1));
-
-	// Calculating the minterms of the third line.
-	tt = _mm_ror_ps(_L1,1);
-	Va = _mm_mul_ps(tt,Vb);									// V1' dot V2"
-	Vb = _mm_mul_ps(tt,Vc);									// V1' dot V2^
-	Vc = _mm_mul_ps(tt,_L2);								// V1' dot V2
-
-	r1 = _mm_sub_ps(_mm_ror_ps(Va,1),_mm_ror_ps(Vc,2));		// V1" dot V2^ - V1^ dot V2"
-	r2 = _mm_sub_ps(_mm_ror_ps(Vb,2),_mm_ror_ps(Vb,0));		// V1^ dot V2' - V1' dot V2^
-	r3 = _mm_sub_ps(_mm_ror_ps(Va,0),_mm_ror_ps(Vc,1));		// V1' dot V2" - V1" dot V2'
-
-	tt = _mm_ror_ps(_L4,1);		sum = _mm_mul_ps(tt,r1);
-	tt = _mm_ror_ps(tt,1);		sum = _mm_add_ps(sum,_mm_mul_ps(tt,r2));
-	tt = _mm_ror_ps(tt,1);		sum = _mm_add_ps(sum,_mm_mul_ps(tt,r3));
-	__m128 mtL3 = _mm_xor_ps(sum,Sign_PNPN);
-
-	// Dividing is FASTER than rcp_nr! (Because rcp_nr causes many register-memory RWs).
-	RDet = _mm_div_ss(_mm_load_ss((float *)&_vmathZERONE), Det); // TODO: just 1.0f?
-	RDet = _mm_shuffle_ps(RDet,RDet,0x00);
-
-	// Devide the first 12 minterms with the determinant.
-	mtL1 = _mm_mul_ps(mtL1, RDet);
-	mtL2 = _mm_mul_ps(mtL2, RDet);
-	mtL3 = _mm_mul_ps(mtL3, RDet);
-
-	// Calculate the minterms of the forth line and devide by the determinant.
-	tt = _mm_ror_ps(_L3,1);		sum = _mm_mul_ps(tt,r1);
-	tt = _mm_ror_ps(tt,1);		sum = _mm_add_ps(sum,_mm_mul_ps(tt,r2));
-	tt = _mm_ror_ps(tt,1);		sum = _mm_add_ps(sum,_mm_mul_ps(tt,r3));
-	__m128 mtL4 = _mm_xor_ps(sum,Sign_NPNP);
-	mtL4 = _mm_mul_ps(mtL4, RDet);
-
-	// Now we just have to transpose the minterms matrix.
-	trns0 = _mm_unpacklo_ps(mtL1,mtL2);
-	trns1 = _mm_unpacklo_ps(mtL3,mtL4);
-	trns2 = _mm_unpackhi_ps(mtL1,mtL2);
-	trns3 = _mm_unpackhi_ps(mtL3,mtL4);
-	_L1 = _mm_movelh_ps(trns0,trns1);
-	_L2 = _mm_movehl_ps(trns1,trns0);
-	_L3 = _mm_movelh_ps(trns2,trns3);
-	_L4 = _mm_movehl_ps(trns3,trns2);
-
-    return Matrix4(
-        Vector4( _L1 ),
-        Vector4( _L2 ),
-        Vector4( _L3 ),
-        Vector4( _L4 )
-    );
-}
-
-VECTORMATH_FORCE_INLINE const Matrix4 affineInverse( const Matrix4 & mat )
-{
-    Transform3 affineMat;
-    affineMat.setCol0( mat.getCol0().getXYZ( ) );
-    affineMat.setCol1( mat.getCol1().getXYZ( ) );
-    affineMat.setCol2( mat.getCol2().getXYZ( ) );
-    affineMat.setCol3( mat.getCol3().getXYZ( ) );
-    return Matrix4( inverse( affineMat ) );
-}
-
-VECTORMATH_FORCE_INLINE const Matrix4 orthoInverse( const Matrix4 & mat )
-{
-    Transform3 affineMat;
-    affineMat.setCol0( mat.getCol0().getXYZ( ) );
-    affineMat.setCol1( mat.getCol1().getXYZ( ) );
-    affineMat.setCol2( mat.getCol2().getXYZ( ) );
-    affineMat.setCol3( mat.getCol3().getXYZ( ) );
-    return Matrix4( orthoInverse( affineMat ) );
-}
-
-VECTORMATH_FORCE_INLINE const floatInVec determinant( const Matrix4 & mat )
-{
-	__m128 Va,Vb,Vc;
-	__m128 r1,r2,r3,tt,tt2;
-	__m128 sum,Det;
-
-	__m128 _L1 = mat.getCol0().get128();
-	__m128 _L2 = mat.getCol1().get128();
-	__m128 _L3 = mat.getCol2().get128();
-	__m128 _L4 = mat.getCol3().get128();
-	// Calculating the minterms for the first line.
-
-	// _mm_ror_ps is just a macro using _mm_shuffle_ps().
-	tt = _L4; tt2 = _mm_ror_ps(_L3,1); 
-	Vc = _mm_mul_ps(tt2,_mm_ror_ps(tt,0));					// V3' dot V4
-	Va = _mm_mul_ps(tt2,_mm_ror_ps(tt,2));					// V3' dot V4"
-	Vb = _mm_mul_ps(tt2,_mm_ror_ps(tt,3));					// V3' dot V4^
-
-	r1 = _mm_sub_ps(_mm_ror_ps(Va,1),_mm_ror_ps(Vc,2));		// V3" dot V4^ - V3^ dot V4"
-	r2 = _mm_sub_ps(_mm_ror_ps(Vb,2),_mm_ror_ps(Vb,0));		// V3^ dot V4' - V3' dot V4^
-	r3 = _mm_sub_ps(_mm_ror_ps(Va,0),_mm_ror_ps(Vc,1));		// V3' dot V4" - V3" dot V4'
-
-	tt = _L2;
-	Va = _mm_ror_ps(tt,1);		sum = _mm_mul_ps(Va,r1);
-	Vb = _mm_ror_ps(tt,2);		sum = _mm_add_ps(sum,_mm_mul_ps(Vb,r2));
-	Vc = _mm_ror_ps(tt,3);		sum = _mm_add_ps(sum,_mm_mul_ps(Vc,r3));
-
-	// Calculating the determinant.
-	Det = _mm_mul_ps(sum,_L1);
-	Det = _mm_add_ps(Det,_mm_movehl_ps(Det,Det));
-
-	// Calculating the minterms of the second line (using previous results).
-	tt = _mm_ror_ps(_L1,1);		sum = _mm_mul_ps(tt,r1);
-	tt = _mm_ror_ps(tt,1);		sum = _mm_add_ps(sum,_mm_mul_ps(tt,r2));
-	tt = _mm_ror_ps(tt,1);		sum = _mm_add_ps(sum,_mm_mul_ps(tt,r3));
-
-	// Testing the determinant.
-	Det = _mm_sub_ss(Det,_mm_shuffle_ps(Det,Det,1));
-	return floatInVec(Det, 0);
-}
-
-VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::operator +( const Matrix4 & mat ) const
-{
-    return Matrix4(
-        ( mCol0 + mat.mCol0 ),
-        ( mCol1 + mat.mCol1 ),
-        ( mCol2 + mat.mCol2 ),
-        ( mCol3 + mat.mCol3 )
-    );
-}
-
-VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::operator -( const Matrix4 & mat ) const
-{
-    return Matrix4(
-        ( mCol0 - mat.mCol0 ),
-        ( mCol1 - mat.mCol1 ),
-        ( mCol2 - mat.mCol2 ),
-        ( mCol3 - mat.mCol3 )
-    );
-}
-
-VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::operator +=( const Matrix4 & mat )
-{
-    *this = *this + mat;
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::operator -=( const Matrix4 & mat )
-{
-    *this = *this - mat;
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::operator -( ) const
-{
-    return Matrix4(
-        ( -mCol0 ),
-        ( -mCol1 ),
-        ( -mCol2 ),
-        ( -mCol3 )
-    );
-}
-
-VECTORMATH_FORCE_INLINE const Matrix4 absPerElem( const Matrix4 & mat )
-{
-    return Matrix4(
-        absPerElem( mat.getCol0() ),
-        absPerElem( mat.getCol1() ),
-        absPerElem( mat.getCol2() ),
-        absPerElem( mat.getCol3() )
-    );
-}
-
-VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::operator *( float scalar ) const
-{
-    return *this * floatInVec(scalar);
-}
-
-VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::operator *( const floatInVec &scalar ) const
-{
-    return Matrix4(
-        ( mCol0 * scalar ),
-        ( mCol1 * scalar ),
-        ( mCol2 * scalar ),
-        ( mCol3 * scalar )
-    );
-}
-
-VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::operator *=( float scalar )
-{
-    return *this *= floatInVec(scalar);
-}
-
-VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::operator *=( const floatInVec &scalar )
-{
-    *this = *this * scalar;
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE const Matrix4 operator *( float scalar, const Matrix4 & mat )
-{
-    return floatInVec(scalar) * mat;
-}
-
-VECTORMATH_FORCE_INLINE const Matrix4 operator *( const floatInVec &scalar, const Matrix4 & mat )
-{
-    return mat * scalar;
-}
-
-VECTORMATH_FORCE_INLINE const Vector4 Matrix4::operator *( const Vector4 &vec ) const
-{
-    return Vector4(
-		_mm_add_ps(
-			_mm_add_ps(_mm_mul_ps(mCol0.get128(), _mm_shuffle_ps(vec.get128(), vec.get128(), _MM_SHUFFLE(0,0,0,0))), _mm_mul_ps(mCol1.get128(), _mm_shuffle_ps(vec.get128(), vec.get128(), _MM_SHUFFLE(1,1,1,1)))),
-			_mm_add_ps(_mm_mul_ps(mCol2.get128(), _mm_shuffle_ps(vec.get128(), vec.get128(), _MM_SHUFFLE(2,2,2,2))), _mm_mul_ps(mCol3.get128(), _mm_shuffle_ps(vec.get128(), vec.get128(), _MM_SHUFFLE(3,3,3,3)))))
-		);
-}
-
-VECTORMATH_FORCE_INLINE const Vector4 Matrix4::operator *( const Vector3 &vec ) const
-{
-    return Vector4(
-		_mm_add_ps(
-			_mm_add_ps(_mm_mul_ps(mCol0.get128(), _mm_shuffle_ps(vec.get128(), vec.get128(), _MM_SHUFFLE(0,0,0,0))), _mm_mul_ps(mCol1.get128(), _mm_shuffle_ps(vec.get128(), vec.get128(), _MM_SHUFFLE(1,1,1,1)))),
-			_mm_mul_ps(mCol2.get128(), _mm_shuffle_ps(vec.get128(), vec.get128(), _MM_SHUFFLE(2,2,2,2))))
-		);
-}
-
-VECTORMATH_FORCE_INLINE const Vector4 Matrix4::operator *( const Point3 &pnt ) const
-{
-    return Vector4(
-		_mm_add_ps(
-			_mm_add_ps(_mm_mul_ps(mCol0.get128(), _mm_shuffle_ps(pnt.get128(), pnt.get128(), _MM_SHUFFLE(0,0,0,0))), _mm_mul_ps(mCol1.get128(), _mm_shuffle_ps(pnt.get128(), pnt.get128(), _MM_SHUFFLE(1,1,1,1)))),
-			_mm_add_ps(_mm_mul_ps(mCol2.get128(), _mm_shuffle_ps(pnt.get128(), pnt.get128(), _MM_SHUFFLE(2,2,2,2))), mCol3.get128()))
-		);
-}
-
-VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::operator *( const Matrix4 & mat ) const
-{
-    return Matrix4(
-        ( *this * mat.mCol0 ),
-        ( *this * mat.mCol1 ),
-        ( *this * mat.mCol2 ),
-        ( *this * mat.mCol3 )
-    );
-}
-
-VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::operator *=( const Matrix4 & mat )
-{
-    *this = *this * mat;
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::operator *( const Transform3 & tfrm ) const
-{
-    return Matrix4(
-        ( *this * tfrm.getCol0() ),
-        ( *this * tfrm.getCol1() ),
-        ( *this * tfrm.getCol2() ),
-        ( *this * Point3( tfrm.getCol3() ) )
-    );
-}
-
-VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::operator *=( const Transform3 & tfrm )
-{
-    *this = *this * tfrm;
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE const Matrix4 mulPerElem( const Matrix4 & mat0, const Matrix4 & mat1 )
-{
-    return Matrix4(
-        mulPerElem( mat0.getCol0(), mat1.getCol0() ),
-        mulPerElem( mat0.getCol1(), mat1.getCol1() ),
-        mulPerElem( mat0.getCol2(), mat1.getCol2() ),
-        mulPerElem( mat0.getCol3(), mat1.getCol3() )
-    );
-}
-
-VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::identity( )
-{
-    return Matrix4(
-        Vector4::xAxis( ),
-        Vector4::yAxis( ),
-        Vector4::zAxis( ),
-        Vector4::wAxis( )
-    );
-}
-
-VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::setUpper3x3( const Matrix3 & mat3 )
-{
-    mCol0.setXYZ( mat3.getCol0() );
-    mCol1.setXYZ( mat3.getCol1() );
-    mCol2.setXYZ( mat3.getCol2() );
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE const Matrix3 Matrix4::getUpper3x3( ) const
-{
-    return Matrix3(
-        mCol0.getXYZ( ),
-        mCol1.getXYZ( ),
-        mCol2.getXYZ( )
-    );
-}
-
-VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::setTranslation( const Vector3 &translateVec )
-{
-    mCol3.setXYZ( translateVec );
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE const Vector3 Matrix4::getTranslation( ) const
-{
-    return mCol3.getXYZ( );
-}
-
-VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::rotationX( float radians )
-{
-    return rotationX( floatInVec(radians) );
-}
-
-VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::rotationX( const floatInVec &radians )
-{
-    __m128 s, c, res1, res2;
-    __m128 zero;
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
-    zero = _mm_setzero_ps();
-    sincosf4( radians.get128(), &s, &c );
-    res1 = vec_sel( zero, c, select_y );
-    res1 = vec_sel( res1, s, select_z );
-    res2 = vec_sel( zero, negatef4(s), select_y );
-    res2 = vec_sel( res2, c, select_z );
-    return Matrix4(
-        Vector4::xAxis( ),
-        Vector4( res1 ),
-        Vector4( res2 ),
-        Vector4::wAxis( )
-    );
-}
-
-VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::rotationY( float radians )
-{
-    return rotationY( floatInVec(radians) );
-}
-
-VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::rotationY( const floatInVec &radians )
-{
-    __m128 s, c, res0, res2;
-    __m128 zero;
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
-    zero = _mm_setzero_ps();
-    sincosf4( radians.get128(), &s, &c );
-    res0 = vec_sel( zero, c, select_x );
-    res0 = vec_sel( res0, negatef4(s), select_z );
-    res2 = vec_sel( zero, s, select_x );
-    res2 = vec_sel( res2, c, select_z );
-    return Matrix4(
-        Vector4( res0 ),
-        Vector4::yAxis( ),
-        Vector4( res2 ),
-        Vector4::wAxis( )
-    );
-}
-
-VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::rotationZ( float radians )
-{
-    return rotationZ( floatInVec(radians) );
-}
-
-VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::rotationZ( const floatInVec &radians )
-{
-    __m128 s, c, res0, res1;
-    __m128 zero;
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
-    zero = _mm_setzero_ps();
-    sincosf4( radians.get128(), &s, &c );
-    res0 = vec_sel( zero, c, select_x );
-    res0 = vec_sel( res0, s, select_y );
-    res1 = vec_sel( zero, negatef4(s), select_x );
-    res1 = vec_sel( res1, c, select_y );
-    return Matrix4(
-        Vector4( res0 ),
-        Vector4( res1 ),
-        Vector4::zAxis( ),
-        Vector4::wAxis( )
-    );
-}
-
-VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::rotationZYX( const Vector3 &radiansXYZ )
-{
-    __m128 angles, s, negS, c, X0, X1, Y0, Y1, Z0, Z1, tmp;
-    angles = Vector4( radiansXYZ, 0.0f ).get128();
-    sincosf4( angles, &s, &c );
-    negS = negatef4( s );
-    Z0 = vec_mergel( c, s );
-    Z1 = vec_mergel( negS, c );
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_xyz[4] = {0xffffffff, 0xffffffff, 0xffffffff, 0};
-    Z1 = vec_and( Z1, _mm_load_ps( (float *)select_xyz ) );
-	Y0 = _mm_shuffle_ps( c, negS, _MM_SHUFFLE(0,1,1,1) );
-	Y1 = _mm_shuffle_ps( s, c, _MM_SHUFFLE(0,1,1,1) );
-    X0 = vec_splat( s, 0 );
-    X1 = vec_splat( c, 0 );
-    tmp = vec_mul( Z0, Y1 );
-    return Matrix4(
-        Vector4( vec_mul( Z0, Y0 ) ),
-        Vector4( vec_madd( Z1, X1, vec_mul( tmp, X0 ) ) ),
-        Vector4( vec_nmsub( Z1, X0, vec_mul( tmp, X1 ) ) ),
-        Vector4::wAxis( )
-    );
-}
-
-VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::rotation( float radians, const Vector3 &unitVec )
-{
-    return rotation( floatInVec(radians), unitVec );
-}
-
-VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::rotation( const floatInVec &radians, const Vector3 &unitVec )
-{
-    __m128 axis, s, c, oneMinusC, axisS, negAxisS, xxxx, yyyy, zzzz, tmp0, tmp1, tmp2;
-    axis = unitVec.get128();
-    sincosf4( radians.get128(), &s, &c );
-    xxxx = vec_splat( axis, 0 );
-    yyyy = vec_splat( axis, 1 );
-    zzzz = vec_splat( axis, 2 );
-    oneMinusC = vec_sub( _mm_set1_ps(1.0f), c );
-    axisS = vec_mul( axis, s );
-    negAxisS = negatef4( axisS );
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
-    //tmp0 = vec_perm( axisS, negAxisS, _VECTORMATH_PERM_XZBX );
-	tmp0 = _mm_shuffle_ps( axisS, axisS, _MM_SHUFFLE(0,0,2,0) );
-	tmp0 = vec_sel(tmp0, vec_splat(negAxisS, 1), select_z);
-    //tmp1 = vec_perm( axisS, negAxisS, _VECTORMATH_PERM_CXXX );
-	tmp1 = vec_sel( vec_splat(axisS, 0), vec_splat(negAxisS, 2), select_x );
-    //tmp2 = vec_perm( axisS, negAxisS, _VECTORMATH_PERM_YAXX );
-	tmp2 = _mm_shuffle_ps( axisS, axisS, _MM_SHUFFLE(0,0,0,1) );
-	tmp2 = vec_sel(tmp2, vec_splat(negAxisS, 0), select_y);
-    tmp0 = vec_sel( tmp0, c, select_x );
-    tmp1 = vec_sel( tmp1, c, select_y );
-    tmp2 = vec_sel( tmp2, c, select_z );
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_xyz[4] = {0xffffffff, 0xffffffff, 0xffffffff, 0};
-    axis = vec_and( axis, _mm_load_ps( (float *)select_xyz ) );
-    tmp0 = vec_and( tmp0, _mm_load_ps( (float *)select_xyz ) );
-    tmp1 = vec_and( tmp1, _mm_load_ps( (float *)select_xyz ) );
-    tmp2 = vec_and( tmp2, _mm_load_ps( (float *)select_xyz ) );
-    return Matrix4(
-        Vector4( vec_madd( vec_mul( axis, xxxx ), oneMinusC, tmp0 ) ),
-        Vector4( vec_madd( vec_mul( axis, yyyy ), oneMinusC, tmp1 ) ),
-        Vector4( vec_madd( vec_mul( axis, zzzz ), oneMinusC, tmp2 ) ),
-        Vector4::wAxis( )
-    );
-}
-
-VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::rotation( const Quat &unitQuat )
-{
-    return Matrix4( Transform3::rotation( unitQuat ) );
-}
-
-VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::scale( const Vector3 &scaleVec )
-{
-    __m128 zero = _mm_setzero_ps();
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
-    return Matrix4(
-        Vector4( vec_sel( zero, scaleVec.get128(), select_x ) ),
-        Vector4( vec_sel( zero, scaleVec.get128(), select_y ) ),
-        Vector4( vec_sel( zero, scaleVec.get128(), select_z ) ),
-        Vector4::wAxis( )
-    );
-}
-
-VECTORMATH_FORCE_INLINE const Matrix4 appendScale( const Matrix4 & mat, const Vector3 &scaleVec )
-{
-    return Matrix4(
-        ( mat.getCol0() * scaleVec.getX( ) ),
-        ( mat.getCol1() * scaleVec.getY( ) ),
-        ( mat.getCol2() * scaleVec.getZ( ) ),
-        mat.getCol3()
-    );
-}
-
-VECTORMATH_FORCE_INLINE const Matrix4 prependScale( const Vector3 &scaleVec, const Matrix4 & mat )
-{
-    Vector4 scale4;
-    scale4 = Vector4( scaleVec, 1.0f );
-    return Matrix4(
-        mulPerElem( mat.getCol0(), scale4 ),
-        mulPerElem( mat.getCol1(), scale4 ),
-        mulPerElem( mat.getCol2(), scale4 ),
-        mulPerElem( mat.getCol3(), scale4 )
-    );
-}
-
-VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::translation( const Vector3 &translateVec )
-{
-    return Matrix4(
-        Vector4::xAxis( ),
-        Vector4::yAxis( ),
-        Vector4::zAxis( ),
-        Vector4( translateVec, 1.0f )
-    );
-}
-
-VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::lookAt( const Point3 &eyePos, const Point3 &lookAtPos, const Vector3 &upVec )
-{
-    Matrix4 m4EyeFrame;
-    Vector3 v3X, v3Y, v3Z;
-    v3Y = normalize( upVec );
-    v3Z = normalize( ( eyePos - lookAtPos ) );
-    v3X = normalize( cross( v3Y, v3Z ) );
-    v3Y = cross( v3Z, v3X );
-    m4EyeFrame = Matrix4( Vector4( v3X ), Vector4( v3Y ), Vector4( v3Z ), Vector4( eyePos ) );
-    return orthoInverse( m4EyeFrame );
-}
-
-VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::perspective( float fovyRadians, float aspect, float zNear, float zFar )
-{
-    float f, rangeInv;
-    __m128 zero, col0, col1, col2, col3;
-    union { __m128 v; float s[4]; } tmp;
-    f = tanf( _VECTORMATH_PI_OVER_2 - fovyRadians * 0.5f );
-    rangeInv = 1.0f / ( zNear - zFar );
-    zero = _mm_setzero_ps();
-    tmp.v = zero;
-    tmp.s[0] = f / aspect;
-    col0 = tmp.v;
-    tmp.v = zero;
-    tmp.s[1] = f;
-    col1 = tmp.v;
-    tmp.v = zero;
-    tmp.s[2] = ( zNear + zFar ) * rangeInv;
-    tmp.s[3] = -1.0f;
-    col2 = tmp.v;
-    tmp.v = zero;
-    tmp.s[2] = zNear * zFar * rangeInv * 2.0f;
-    col3 = tmp.v;
-    return Matrix4(
-        Vector4( col0 ),
-        Vector4( col1 ),
-        Vector4( col2 ),
-        Vector4( col3 )
-    );
-}
-
-VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::frustum( float left, float right, float bottom, float top, float zNear, float zFar )
-{
-    /* function implementation based on code from STIDC SDK:           */
-    /* --------------------------------------------------------------  */
-    /* PLEASE DO NOT MODIFY THIS SECTION                               */
-    /* This prolog section is automatically generated.                 */
-    /*                                                                 */
-    /* (C)Copyright                                                    */
-    /* Sony Computer Entertainment, Inc.,                              */
-    /* Toshiba Corporation,                                            */
-    /* International Business Machines Corporation,                    */
-    /* 2001,2002.                                                      */
-    /* S/T/I Confidential Information                                  */
-    /* --------------------------------------------------------------  */
-    __m128 lbf, rtn;
-    __m128 diff, sum, inv_diff;
-    __m128 diagonal, column, near2;
-    __m128 zero = _mm_setzero_ps();
-    union { __m128 v; float s[4]; } l, f, r, n, b, t; // TODO: Union?
-    l.s[0] = left;
-    f.s[0] = zFar;
-    r.s[0] = right;
-    n.s[0] = zNear;
-    b.s[0] = bottom;
-    t.s[0] = top;
-    lbf = vec_mergeh( l.v, f.v );
-    rtn = vec_mergeh( r.v, n.v );
-    lbf = vec_mergeh( lbf, b.v );
-    rtn = vec_mergeh( rtn, t.v );
-    diff = vec_sub( rtn, lbf );
-    sum  = vec_add( rtn, lbf );
-    inv_diff = recipf4( diff );
-    near2 = vec_splat( n.v, 0 );
-    near2 = vec_add( near2, near2 );
-    diagonal = vec_mul( near2, inv_diff );
-    column = vec_mul( sum, inv_diff );
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_w[4] = {0, 0, 0, 0xffffffff};
-    return Matrix4(
-        Vector4( vec_sel( zero, diagonal, select_x ) ),
-        Vector4( vec_sel( zero, diagonal, select_y ) ),
-        Vector4( vec_sel( column, _mm_set1_ps(-1.0f), select_w ) ),
-        Vector4( vec_sel( zero, vec_mul( diagonal, vec_splat( f.v, 0 ) ), select_z ) )
-	);
-}
-
-VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::orthographic( float left, float right, float bottom, float top, float zNear, float zFar )
-{
-    /* function implementation based on code from STIDC SDK:           */
-    /* --------------------------------------------------------------  */
-    /* PLEASE DO NOT MODIFY THIS SECTION                               */
-    /* This prolog section is automatically generated.                 */
-    /*                                                                 */
-    /* (C)Copyright                                                    */
-    /* Sony Computer Entertainment, Inc.,                              */
-    /* Toshiba Corporation,                                            */
-    /* International Business Machines Corporation,                    */
-    /* 2001,2002.                                                      */
-    /* S/T/I Confidential Information                                  */
-    /* --------------------------------------------------------------  */
-    __m128 lbf, rtn;
-    __m128 diff, sum, inv_diff, neg_inv_diff;
-    __m128 diagonal, column;
-    __m128 zero = _mm_setzero_ps();
-    union { __m128 v; float s[4]; } l, f, r, n, b, t;
-    l.s[0] = left;
-    f.s[0] = zFar;
-    r.s[0] = right;
-    n.s[0] = zNear;
-    b.s[0] = bottom;
-    t.s[0] = top;
-    lbf = vec_mergeh( l.v, f.v );
-    rtn = vec_mergeh( r.v, n.v );
-    lbf = vec_mergeh( lbf, b.v );
-    rtn = vec_mergeh( rtn, t.v );
-    diff = vec_sub( rtn, lbf );
-    sum  = vec_add( rtn, lbf );
-    inv_diff = recipf4( diff );
-    neg_inv_diff = negatef4( inv_diff );
-    diagonal = vec_add( inv_diff, inv_diff );
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_w[4] = {0, 0, 0, 0xffffffff};
-    column = vec_mul( sum, vec_sel( neg_inv_diff, inv_diff, select_z ) ); // TODO: no madds with zero
-    return Matrix4(
-        Vector4( vec_sel( zero, diagonal, select_x ) ),
-        Vector4( vec_sel( zero, diagonal, select_y ) ),
-        Vector4( vec_sel( zero, diagonal, select_z ) ),
-        Vector4( vec_sel( column, _mm_set1_ps(1.0f), select_w ) )
-    );
-}
-
-VECTORMATH_FORCE_INLINE const Matrix4 select( const Matrix4 & mat0, const Matrix4 & mat1, bool select1 )
-{
-    return Matrix4(
-        select( mat0.getCol0(), mat1.getCol0(), select1 ),
-        select( mat0.getCol1(), mat1.getCol1(), select1 ),
-        select( mat0.getCol2(), mat1.getCol2(), select1 ),
-        select( mat0.getCol3(), mat1.getCol3(), select1 )
-    );
-}
-
-VECTORMATH_FORCE_INLINE const Matrix4 select( const Matrix4 & mat0, const Matrix4 & mat1, const boolInVec &select1 )
-{
-    return Matrix4(
-        select( mat0.getCol0(), mat1.getCol0(), select1 ),
-        select( mat0.getCol1(), mat1.getCol1(), select1 ),
-        select( mat0.getCol2(), mat1.getCol2(), select1 ),
-        select( mat0.getCol3(), mat1.getCol3(), select1 )
-    );
-}
-
-#ifdef _VECTORMATH_DEBUG
-
-VECTORMATH_FORCE_INLINE void print( const Matrix4 & mat )
-{
-    print( mat.getRow( 0 ) );
-    print( mat.getRow( 1 ) );
-    print( mat.getRow( 2 ) );
-    print( mat.getRow( 3 ) );
-}
-
-VECTORMATH_FORCE_INLINE void print( const Matrix4 & mat, const char * name )
-{
-    printf("%s:\n", name);
-    print( mat );
-}
-
-#endif
-
-VECTORMATH_FORCE_INLINE Transform3::Transform3( const Transform3 & tfrm )
-{
-    mCol0 = tfrm.mCol0;
-    mCol1 = tfrm.mCol1;
-    mCol2 = tfrm.mCol2;
-    mCol3 = tfrm.mCol3;
-}
-
-VECTORMATH_FORCE_INLINE Transform3::Transform3( float scalar )
-{
-    mCol0 = Vector3( scalar );
-    mCol1 = Vector3( scalar );
-    mCol2 = Vector3( scalar );
-    mCol3 = Vector3( scalar );
-}
-
-VECTORMATH_FORCE_INLINE Transform3::Transform3( const floatInVec &scalar )
-{
-    mCol0 = Vector3( scalar );
-    mCol1 = Vector3( scalar );
-    mCol2 = Vector3( scalar );
-    mCol3 = Vector3( scalar );
-}
-
-VECTORMATH_FORCE_INLINE Transform3::Transform3( const Vector3 &_col0, const Vector3 &_col1, const Vector3 &_col2, const Vector3 &_col3 )
-{
-    mCol0 = _col0;
-    mCol1 = _col1;
-    mCol2 = _col2;
-    mCol3 = _col3;
-}
-
-VECTORMATH_FORCE_INLINE Transform3::Transform3( const Matrix3 & tfrm, const Vector3 &translateVec )
-{
-    this->setUpper3x3( tfrm );
-    this->setTranslation( translateVec );
-}
-
-VECTORMATH_FORCE_INLINE Transform3::Transform3( const Quat &unitQuat, const Vector3 &translateVec )
-{
-    this->setUpper3x3( Matrix3( unitQuat ) );
-    this->setTranslation( translateVec );
-}
-
-VECTORMATH_FORCE_INLINE Transform3 & Transform3::setCol0( const Vector3 &_col0 )
-{
-    mCol0 = _col0;
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Transform3 & Transform3::setCol1( const Vector3 &_col1 )
-{
-    mCol1 = _col1;
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Transform3 & Transform3::setCol2( const Vector3 &_col2 )
-{
-    mCol2 = _col2;
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Transform3 & Transform3::setCol3( const Vector3 &_col3 )
-{
-    mCol3 = _col3;
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Transform3 & Transform3::setCol( int col, const Vector3 &vec )
-{
-    *(&mCol0 + col) = vec;
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Transform3 & Transform3::setRow( int row, const Vector4 &vec )
-{
-    mCol0.setElem( row, vec.getElem( 0 ) );
-    mCol1.setElem( row, vec.getElem( 1 ) );
-    mCol2.setElem( row, vec.getElem( 2 ) );
-    mCol3.setElem( row, vec.getElem( 3 ) );
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Transform3 & Transform3::setElem( int col, int row, float val )
-{
-    (*this)[col].setElem(row, val);
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Transform3 & Transform3::setElem( int col, int row, const floatInVec &val )
-{
-    Vector3 tmpV3_0;
-    tmpV3_0 = this->getCol( col );
-    tmpV3_0.setElem( row, val );
-    this->setCol( col, tmpV3_0 );
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE const floatInVec Transform3::getElem( int col, int row ) const
-{
-    return this->getCol( col ).getElem( row );
-}
-
-VECTORMATH_FORCE_INLINE const Vector3 Transform3::getCol0( ) const
-{
-    return mCol0;
-}
-
-VECTORMATH_FORCE_INLINE const Vector3 Transform3::getCol1( ) const
-{
-    return mCol1;
-}
-
-VECTORMATH_FORCE_INLINE const Vector3 Transform3::getCol2( ) const
-{
-    return mCol2;
-}
-
-VECTORMATH_FORCE_INLINE const Vector3 Transform3::getCol3( ) const
-{
-    return mCol3;
-}
-
-VECTORMATH_FORCE_INLINE const Vector3 Transform3::getCol( int col ) const
-{
-    return *(&mCol0 + col);
-}
-
-VECTORMATH_FORCE_INLINE const Vector4 Transform3::getRow( int row ) const
-{
-    return Vector4( mCol0.getElem( row ), mCol1.getElem( row ), mCol2.getElem( row ), mCol3.getElem( row ) );
-}
-
-VECTORMATH_FORCE_INLINE Vector3 & Transform3::operator []( int col )
-{
-    return *(&mCol0 + col);
-}
-
-VECTORMATH_FORCE_INLINE const Vector3 Transform3::operator []( int col ) const
-{
-    return *(&mCol0 + col);
-}
-
-VECTORMATH_FORCE_INLINE Transform3 & Transform3::operator =( const Transform3 & tfrm )
-{
-    mCol0 = tfrm.mCol0;
-    mCol1 = tfrm.mCol1;
-    mCol2 = tfrm.mCol2;
-    mCol3 = tfrm.mCol3;
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE const Transform3 inverse( const Transform3 & tfrm )
-{
-    __m128 inv0, inv1, inv2, inv3;
-    __m128 tmp0, tmp1, tmp2, tmp3, tmp4, dot, invdet;
-    __m128 xxxx, yyyy, zzzz;
-    tmp2 = _vmathVfCross( tfrm.getCol0().get128(), tfrm.getCol1().get128() );
-    tmp0 = _vmathVfCross( tfrm.getCol1().get128(), tfrm.getCol2().get128() );
-    tmp1 = _vmathVfCross( tfrm.getCol2().get128(), tfrm.getCol0().get128() );
-    inv3 = negatef4( tfrm.getCol3().get128() );
-    dot = _vmathVfDot3( tmp2, tfrm.getCol2().get128() );
-    dot = vec_splat( dot, 0 );
-    invdet = recipf4( dot );
-    tmp3 = vec_mergeh( tmp0, tmp2 );
-    tmp4 = vec_mergel( tmp0, tmp2 );
-    inv0 = vec_mergeh( tmp3, tmp1 );
-    xxxx = vec_splat( inv3, 0 );
-    //inv1 = vec_perm( tmp3, tmp1, _VECTORMATH_PERM_ZBWX );
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
-	inv1 = _mm_shuffle_ps( tmp3, tmp3, _MM_SHUFFLE(0,3,2,2));
-	inv1 = vec_sel(inv1, tmp1, select_y);
-    //inv2 = vec_perm( tmp4, tmp1, _VECTORMATH_PERM_XCYX );
-	inv2 = _mm_shuffle_ps( tmp4, tmp4, _MM_SHUFFLE(0,1,1,0));
-	inv2 = vec_sel(inv2, vec_splat(tmp1, 2), select_y);
-    yyyy = vec_splat( inv3, 1 );
-    zzzz = vec_splat( inv3, 2 );
-    inv3 = vec_mul( inv0, xxxx );
-    inv3 = vec_madd( inv1, yyyy, inv3 );
-    inv3 = vec_madd( inv2, zzzz, inv3 );
-    inv0 = vec_mul( inv0, invdet );
-    inv1 = vec_mul( inv1, invdet );
-    inv2 = vec_mul( inv2, invdet );
-    inv3 = vec_mul( inv3, invdet );
-    return Transform3(
-        Vector3( inv0 ),
-        Vector3( inv1 ),
-        Vector3( inv2 ),
-        Vector3( inv3 )
-    );
-}
-
-VECTORMATH_FORCE_INLINE const Transform3 orthoInverse( const Transform3 & tfrm )
-{
-    __m128 inv0, inv1, inv2, inv3;
-    __m128 tmp0, tmp1;
-    __m128 xxxx, yyyy, zzzz;
-    tmp0 = vec_mergeh( tfrm.getCol0().get128(), tfrm.getCol2().get128() );
-    tmp1 = vec_mergel( tfrm.getCol0().get128(), tfrm.getCol2().get128() );
-    inv3 = negatef4( tfrm.getCol3().get128() );
-    inv0 = vec_mergeh( tmp0, tfrm.getCol1().get128() );
-    xxxx = vec_splat( inv3, 0 );
-    //inv1 = vec_perm( tmp0, tfrm.getCol1().get128(), _VECTORMATH_PERM_ZBWX );
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
-	inv1 = _mm_shuffle_ps( tmp0, tmp0, _MM_SHUFFLE(0,3,2,2));
-	inv1 = vec_sel(inv1, tfrm.getCol1().get128(), select_y);
-    //inv2 = vec_perm( tmp1, tfrm.getCol1().get128(), _VECTORMATH_PERM_XCYX );
-	inv2 = _mm_shuffle_ps( tmp1, tmp1, _MM_SHUFFLE(0,1,1,0));
-	inv2 = vec_sel(inv2, vec_splat(tfrm.getCol1().get128(), 2), select_y);
-    yyyy = vec_splat( inv3, 1 );
-    zzzz = vec_splat( inv3, 2 );
-    inv3 = vec_mul( inv0, xxxx );
-    inv3 = vec_madd( inv1, yyyy, inv3 );
-    inv3 = vec_madd( inv2, zzzz, inv3 );
-    return Transform3(
-        Vector3( inv0 ),
-        Vector3( inv1 ),
-        Vector3( inv2 ),
-        Vector3( inv3 )
-    );
-}
-
-VECTORMATH_FORCE_INLINE const Transform3 absPerElem( const Transform3 & tfrm )
-{
-    return Transform3(
-        absPerElem( tfrm.getCol0() ),
-        absPerElem( tfrm.getCol1() ),
-        absPerElem( tfrm.getCol2() ),
-        absPerElem( tfrm.getCol3() )
-    );
-}
-
-VECTORMATH_FORCE_INLINE const Vector3 Transform3::operator *( const Vector3 &vec ) const
-{
-    __m128 res;
-    __m128 xxxx, yyyy, zzzz;
-    xxxx = vec_splat( vec.get128(), 0 );
-    yyyy = vec_splat( vec.get128(), 1 );
-    zzzz = vec_splat( vec.get128(), 2 );
-    res = vec_mul( mCol0.get128(), xxxx );
-    res = vec_madd( mCol1.get128(), yyyy, res );
-    res = vec_madd( mCol2.get128(), zzzz, res );
-    return Vector3( res );
-}
-
-VECTORMATH_FORCE_INLINE const Point3 Transform3::operator *( const Point3 &pnt ) const
-{
-    __m128 tmp0, tmp1, res;
-    __m128 xxxx, yyyy, zzzz;
-    xxxx = vec_splat( pnt.get128(), 0 );
-    yyyy = vec_splat( pnt.get128(), 1 );
-    zzzz = vec_splat( pnt.get128(), 2 );
-    tmp0 = vec_mul( mCol0.get128(), xxxx );
-    tmp1 = vec_mul( mCol1.get128(), yyyy );
-    tmp0 = vec_madd( mCol2.get128(), zzzz, tmp0 );
-    tmp1 = vec_add( mCol3.get128(), tmp1 );
-    res = vec_add( tmp0, tmp1 );
-    return Point3( res );
-}
-
-VECTORMATH_FORCE_INLINE const Transform3 Transform3::operator *( const Transform3 & tfrm ) const
-{
-    return Transform3(
-        ( *this * tfrm.mCol0 ),
-        ( *this * tfrm.mCol1 ),
-        ( *this * tfrm.mCol2 ),
-        Vector3( ( *this * Point3( tfrm.mCol3 ) ) )
-    );
-}
-
-VECTORMATH_FORCE_INLINE Transform3 & Transform3::operator *=( const Transform3 & tfrm )
-{
-    *this = *this * tfrm;
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE const Transform3 mulPerElem( const Transform3 & tfrm0, const Transform3 & tfrm1 )
-{
-    return Transform3(
-        mulPerElem( tfrm0.getCol0(), tfrm1.getCol0() ),
-        mulPerElem( tfrm0.getCol1(), tfrm1.getCol1() ),
-        mulPerElem( tfrm0.getCol2(), tfrm1.getCol2() ),
-        mulPerElem( tfrm0.getCol3(), tfrm1.getCol3() )
-    );
-}
-
-VECTORMATH_FORCE_INLINE const Transform3 Transform3::identity( )
-{
-    return Transform3(
-        Vector3::xAxis( ),
-        Vector3::yAxis( ),
-        Vector3::zAxis( ),
-        Vector3( 0.0f )
-    );
-}
-
-VECTORMATH_FORCE_INLINE Transform3 & Transform3::setUpper3x3( const Matrix3 & tfrm )
-{
-    mCol0 = tfrm.getCol0();
-    mCol1 = tfrm.getCol1();
-    mCol2 = tfrm.getCol2();
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE const Matrix3 Transform3::getUpper3x3( ) const
-{
-    return Matrix3( mCol0, mCol1, mCol2 );
-}
-
-VECTORMATH_FORCE_INLINE Transform3 & Transform3::setTranslation( const Vector3 &translateVec )
-{
-    mCol3 = translateVec;
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE const Vector3 Transform3::getTranslation( ) const
-{
-    return mCol3;
-}
-
-VECTORMATH_FORCE_INLINE const Transform3 Transform3::rotationX( float radians )
-{
-    return rotationX( floatInVec(radians) );
-}
-
-VECTORMATH_FORCE_INLINE const Transform3 Transform3::rotationX( const floatInVec &radians )
-{
-    __m128 s, c, res1, res2;
-    __m128 zero;
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
-    zero = _mm_setzero_ps();
-    sincosf4( radians.get128(), &s, &c );
-    res1 = vec_sel( zero, c, select_y );
-    res1 = vec_sel( res1, s, select_z );
-    res2 = vec_sel( zero, negatef4(s), select_y );
-    res2 = vec_sel( res2, c, select_z );
-    return Transform3(
-        Vector3::xAxis( ),
-        Vector3( res1 ),
-        Vector3( res2 ),
-        Vector3( _mm_setzero_ps() )
-    );
-}
-
-VECTORMATH_FORCE_INLINE const Transform3 Transform3::rotationY( float radians )
-{
-    return rotationY( floatInVec(radians) );
-}
-
-VECTORMATH_FORCE_INLINE const Transform3 Transform3::rotationY( const floatInVec &radians )
-{
-    __m128 s, c, res0, res2;
-    __m128 zero;
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
-    zero = _mm_setzero_ps();
-    sincosf4( radians.get128(), &s, &c );
-    res0 = vec_sel( zero, c, select_x );
-    res0 = vec_sel( res0, negatef4(s), select_z );
-    res2 = vec_sel( zero, s, select_x );
-    res2 = vec_sel( res2, c, select_z );
-    return Transform3(
-        Vector3( res0 ),
-        Vector3::yAxis( ),
-        Vector3( res2 ),
-        Vector3( 0.0f )
-    );
-}
-
-VECTORMATH_FORCE_INLINE const Transform3 Transform3::rotationZ( float radians )
-{
-    return rotationZ( floatInVec(radians) );
-}
-
-VECTORMATH_FORCE_INLINE const Transform3 Transform3::rotationZ( const floatInVec &radians )
-{
-    __m128 s, c, res0, res1;
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
-    __m128 zero = _mm_setzero_ps();
-    sincosf4( radians.get128(), &s, &c );
-    res0 = vec_sel( zero, c, select_x );
-    res0 = vec_sel( res0, s, select_y );
-    res1 = vec_sel( zero, negatef4(s), select_x );
-    res1 = vec_sel( res1, c, select_y );
-    return Transform3(
-        Vector3( res0 ),
-        Vector3( res1 ),
-        Vector3::zAxis( ),
-        Vector3( 0.0f )
-    );
-}
-
-VECTORMATH_FORCE_INLINE const Transform3 Transform3::rotationZYX( const Vector3 &radiansXYZ )
-{
-    __m128 angles, s, negS, c, X0, X1, Y0, Y1, Z0, Z1, tmp;
-    angles = Vector4( radiansXYZ, 0.0f ).get128();
-    sincosf4( angles, &s, &c );
-    negS = negatef4( s );
-    Z0 = vec_mergel( c, s );
-    Z1 = vec_mergel( negS, c );
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_xyz[4] = {0xffffffff, 0xffffffff, 0xffffffff, 0};
-    Z1 = vec_and( Z1, _mm_load_ps( (float *)select_xyz ) );
-	Y0 = _mm_shuffle_ps( c, negS, _MM_SHUFFLE(0,1,1,1) );
-	Y1 = _mm_shuffle_ps( s, c, _MM_SHUFFLE(0,1,1,1) );
-    X0 = vec_splat( s, 0 );
-    X1 = vec_splat( c, 0 );
-    tmp = vec_mul( Z0, Y1 );
-    return Transform3(
-        Vector3( vec_mul( Z0, Y0 ) ),
-        Vector3( vec_madd( Z1, X1, vec_mul( tmp, X0 ) ) ),
-        Vector3( vec_nmsub( Z1, X0, vec_mul( tmp, X1 ) ) ),
-        Vector3( 0.0f )
-    );
-}
-
-VECTORMATH_FORCE_INLINE const Transform3 Transform3::rotation( float radians, const Vector3 &unitVec )
-{
-    return rotation( floatInVec(radians), unitVec );
-}
-
-VECTORMATH_FORCE_INLINE const Transform3 Transform3::rotation( const floatInVec &radians, const Vector3 &unitVec )
-{
-    return Transform3( Matrix3::rotation( radians, unitVec ), Vector3( 0.0f ) );
-}
-
-VECTORMATH_FORCE_INLINE const Transform3 Transform3::rotation( const Quat &unitQuat )
-{
-    return Transform3( Matrix3( unitQuat ), Vector3( 0.0f ) );
-}
-
-VECTORMATH_FORCE_INLINE const Transform3 Transform3::scale( const Vector3 &scaleVec )
-{
-    __m128 zero = _mm_setzero_ps();
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
-    return Transform3(
-        Vector3( vec_sel( zero, scaleVec.get128(), select_x ) ),
-        Vector3( vec_sel( zero, scaleVec.get128(), select_y ) ),
-        Vector3( vec_sel( zero, scaleVec.get128(), select_z ) ),
-        Vector3( 0.0f )
-    );
-}
-
-VECTORMATH_FORCE_INLINE const Transform3 appendScale( const Transform3 & tfrm, const Vector3 &scaleVec )
-{
-    return Transform3(
-        ( tfrm.getCol0() * scaleVec.getX( ) ),
-        ( tfrm.getCol1() * scaleVec.getY( ) ),
-        ( tfrm.getCol2() * scaleVec.getZ( ) ),
-        tfrm.getCol3()
-    );
-}
-
-VECTORMATH_FORCE_INLINE const Transform3 prependScale( const Vector3 &scaleVec, const Transform3 & tfrm )
-{
-    return Transform3(
-        mulPerElem( tfrm.getCol0(), scaleVec ),
-        mulPerElem( tfrm.getCol1(), scaleVec ),
-        mulPerElem( tfrm.getCol2(), scaleVec ),
-        mulPerElem( tfrm.getCol3(), scaleVec )
-    );
-}
-
-VECTORMATH_FORCE_INLINE const Transform3 Transform3::translation( const Vector3 &translateVec )
-{
-    return Transform3(
-        Vector3::xAxis( ),
-        Vector3::yAxis( ),
-        Vector3::zAxis( ),
-        translateVec
-    );
-}
-
-VECTORMATH_FORCE_INLINE const Transform3 select( const Transform3 & tfrm0, const Transform3 & tfrm1, bool select1 )
-{
-    return Transform3(
-        select( tfrm0.getCol0(), tfrm1.getCol0(), select1 ),
-        select( tfrm0.getCol1(), tfrm1.getCol1(), select1 ),
-        select( tfrm0.getCol2(), tfrm1.getCol2(), select1 ),
-        select( tfrm0.getCol3(), tfrm1.getCol3(), select1 )
-    );
-}
-
-VECTORMATH_FORCE_INLINE const Transform3 select( const Transform3 & tfrm0, const Transform3 & tfrm1, const boolInVec &select1 )
-{
-    return Transform3(
-        select( tfrm0.getCol0(), tfrm1.getCol0(), select1 ),
-        select( tfrm0.getCol1(), tfrm1.getCol1(), select1 ),
-        select( tfrm0.getCol2(), tfrm1.getCol2(), select1 ),
-        select( tfrm0.getCol3(), tfrm1.getCol3(), select1 )
-    );
-}
-
-#ifdef _VECTORMATH_DEBUG
-
-VECTORMATH_FORCE_INLINE void print( const Transform3 & tfrm )
-{
-    print( tfrm.getRow( 0 ) );
-    print( tfrm.getRow( 1 ) );
-    print( tfrm.getRow( 2 ) );
-}
-
-VECTORMATH_FORCE_INLINE void print( const Transform3 & tfrm, const char * name )
-{
-    printf("%s:\n", name);
-    print( tfrm );
-}
-
-#endif
-
-VECTORMATH_FORCE_INLINE Quat::Quat( const Matrix3 & tfrm )
-{
-    __m128 res;
-    __m128 col0, col1, col2;
-    __m128 xx_yy, xx_yy_zz_xx, yy_zz_xx_yy, zz_xx_yy_zz, diagSum, diagDiff;
-    __m128 zy_xz_yx, yz_zx_xy, sum, diff;
-    __m128 radicand, invSqrt, scale;
-    __m128 res0, res1, res2, res3;
-    __m128 xx, yy, zz;
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_w[4] = {0, 0, 0, 0xffffffff};
-
-    col0 = tfrm.getCol0().get128();
-    col1 = tfrm.getCol1().get128();
-    col2 = tfrm.getCol2().get128();
-
-    /* four cases: */
-    /* trace > 0 */
-    /* else */
-    /*    xx largest diagonal element */
-    /*    yy largest diagonal element */
-    /*    zz largest diagonal element */
-
-    /* compute quaternion for each case */
-
-    xx_yy = vec_sel( col0, col1, select_y );
-    //xx_yy_zz_xx = vec_perm( xx_yy, col2, _VECTORMATH_PERM_XYCX );
-    //yy_zz_xx_yy = vec_perm( xx_yy, col2, _VECTORMATH_PERM_YCXY );
-    //zz_xx_yy_zz = vec_perm( xx_yy, col2, _VECTORMATH_PERM_CXYC );
-    xx_yy_zz_xx = _mm_shuffle_ps( xx_yy, xx_yy, _MM_SHUFFLE(0,0,1,0) );
-    xx_yy_zz_xx = vec_sel( xx_yy_zz_xx, col2, select_z ); // TODO: Ck
-    yy_zz_xx_yy = _mm_shuffle_ps( xx_yy_zz_xx, xx_yy_zz_xx, _MM_SHUFFLE(1,0,2,1) );
-    zz_xx_yy_zz = _mm_shuffle_ps( xx_yy_zz_xx, xx_yy_zz_xx, _MM_SHUFFLE(2,1,0,2) );
-
-    diagSum = vec_add( vec_add( xx_yy_zz_xx, yy_zz_xx_yy ), zz_xx_yy_zz );
-    diagDiff = vec_sub( vec_sub( xx_yy_zz_xx, yy_zz_xx_yy ), zz_xx_yy_zz );
-    radicand = vec_add( vec_sel( diagDiff, diagSum, select_w ), _mm_set1_ps(1.0f) );
- //   invSqrt = rsqrtf4( radicand );
-	invSqrt = newtonrapson_rsqrt4( radicand );
-
-	
-
-    zy_xz_yx = vec_sel( col0, col1, select_z );									// zy_xz_yx = 00 01 12 03
-    //zy_xz_yx = vec_perm( zy_xz_yx, col2, _VECTORMATH_PERM_ZAYX );
-	zy_xz_yx = _mm_shuffle_ps( zy_xz_yx, zy_xz_yx, _MM_SHUFFLE(0,1,2,2) );		// zy_xz_yx = 12 12 01 00
-    zy_xz_yx = vec_sel( zy_xz_yx, vec_splat(col2, 0), select_y );				// zy_xz_yx = 12 20 01 00
-    yz_zx_xy = vec_sel( col0, col1, select_x );									// yz_zx_xy = 10 01 02 03
-    //yz_zx_xy = vec_perm( yz_zx_xy, col2, _VECTORMATH_PERM_BZXX );
-	yz_zx_xy = _mm_shuffle_ps( yz_zx_xy, yz_zx_xy, _MM_SHUFFLE(0,0,2,0) );		// yz_zx_xy = 10 02 10 10
-	yz_zx_xy = vec_sel( yz_zx_xy, vec_splat(col2, 1), select_x );				// yz_zx_xy = 21 02 10 10
-
-    sum = vec_add( zy_xz_yx, yz_zx_xy );
-    diff = vec_sub( zy_xz_yx, yz_zx_xy );
-
-    scale = vec_mul( invSqrt, _mm_set1_ps(0.5f) );
-
-    //res0 = vec_perm( sum, diff, _VECTORMATH_PERM_XZYA );
-	res0 = _mm_shuffle_ps( sum, sum, _MM_SHUFFLE(0,1,2,0) );
-	res0 = vec_sel( res0, vec_splat(diff, 0), select_w );  // TODO: Ck
-    //res1 = vec_perm( sum, diff, _VECTORMATH_PERM_ZXXB );
-	res1 = _mm_shuffle_ps( sum, sum, _MM_SHUFFLE(0,0,0,2) );
-	res1 = vec_sel( res1, vec_splat(diff, 1), select_w );  // TODO: Ck
-    //res2 = vec_perm( sum, diff, _VECTORMATH_PERM_YXXC );
-	res2 = _mm_shuffle_ps( sum, sum, _MM_SHUFFLE(0,0,0,1) );
-	res2 = vec_sel( res2, vec_splat(diff, 2), select_w );  // TODO: Ck
-    res3 = diff;
-    res0 = vec_sel( res0, radicand, select_x );
-    res1 = vec_sel( res1, radicand, select_y );
-    res2 = vec_sel( res2, radicand, select_z );
-    res3 = vec_sel( res3, radicand, select_w );
-    res0 = vec_mul( res0, vec_splat( scale, 0 ) );
-    res1 = vec_mul( res1, vec_splat( scale, 1 ) );
-    res2 = vec_mul( res2, vec_splat( scale, 2 ) );
-    res3 = vec_mul( res3, vec_splat( scale, 3 ) );
-
-    /* determine case and select answer */
-
-    xx = vec_splat( col0, 0 );
-    yy = vec_splat( col1, 1 );
-    zz = vec_splat( col2, 2 );
-    res = vec_sel( res0, res1, vec_cmpgt( yy, xx ) );
-    res = vec_sel( res, res2, vec_and( vec_cmpgt( zz, xx ), vec_cmpgt( zz, yy ) ) );
-    res = vec_sel( res, res3, vec_cmpgt( vec_splat( diagSum, 0 ), _mm_setzero_ps() ) );
-    mVec128 = res;
-}
-
-VECTORMATH_FORCE_INLINE const Matrix3 outer( const Vector3 &tfrm0, const Vector3 &tfrm1 )
-{
-    return Matrix3(
-        ( tfrm0 * tfrm1.getX( ) ),
-        ( tfrm0 * tfrm1.getY( ) ),
-        ( tfrm0 * tfrm1.getZ( ) )
-    );
-}
-
-VECTORMATH_FORCE_INLINE const Matrix4 outer( const Vector4 &tfrm0, const Vector4 &tfrm1 )
-{
-    return Matrix4(
-        ( tfrm0 * tfrm1.getX( ) ),
-        ( tfrm0 * tfrm1.getY( ) ),
-        ( tfrm0 * tfrm1.getZ( ) ),
-        ( tfrm0 * tfrm1.getW( ) )
-    );
-}
-
-VECTORMATH_FORCE_INLINE const Vector3 rowMul( const Vector3 &vec, const Matrix3 & mat )
-{
-    __m128 tmp0, tmp1, mcol0, mcol1, mcol2, res;
-    __m128 xxxx, yyyy, zzzz;
-    tmp0 = vec_mergeh( mat.getCol0().get128(), mat.getCol2().get128() );
-    tmp1 = vec_mergel( mat.getCol0().get128(), mat.getCol2().get128() );
-    xxxx = vec_splat( vec.get128(), 0 );
-    mcol0 = vec_mergeh( tmp0, mat.getCol1().get128() );
-    //mcol1 = vec_perm( tmp0, mat.getCol1().get128(), _VECTORMATH_PERM_ZBWX );
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
-	mcol1 = _mm_shuffle_ps( tmp0, tmp0, _MM_SHUFFLE(0,3,2,2));
-	mcol1 = vec_sel(mcol1, mat.getCol1().get128(), select_y);
-    //mcol2 = vec_perm( tmp1, mat.getCol1().get128(), _VECTORMATH_PERM_XCYX );
-	mcol2 = _mm_shuffle_ps( tmp1, tmp1, _MM_SHUFFLE(0,1,1,0));
-	mcol2 = vec_sel(mcol2, vec_splat(mat.getCol1().get128(), 2), select_y);
-    yyyy = vec_splat( vec.get128(), 1 );
-    res = vec_mul( mcol0, xxxx );
-    zzzz = vec_splat( vec.get128(), 2 );
-    res = vec_madd( mcol1, yyyy, res );
-    res = vec_madd( mcol2, zzzz, res );
-    return Vector3( res );
-}
-
-VECTORMATH_FORCE_INLINE const Matrix3 crossMatrix( const Vector3 &vec )
-{
-    __m128 neg, res0, res1, res2;
-    neg = negatef4( vec.get128() );
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
-    //res0 = vec_perm( vec.get128(), neg, _VECTORMATH_PERM_XZBX );
-	res0 = _mm_shuffle_ps( vec.get128(), vec.get128(), _MM_SHUFFLE(0,2,2,0) );
-	res0 = vec_sel(res0, vec_splat(neg, 1), select_z);
-    //res1 = vec_perm( vec.get128(), neg, _VECTORMATH_PERM_CXXX );
-	res1 = vec_sel(vec_splat(vec.get128(), 0), vec_splat(neg, 2), select_x);
-    //res2 = vec_perm( vec.get128(), neg, _VECTORMATH_PERM_YAXX );
-	res2 = _mm_shuffle_ps( vec.get128(), vec.get128(), _MM_SHUFFLE(0,0,1,1) );
-	res2 = vec_sel(res2, vec_splat(neg, 0), select_y);
-	VM_ATTRIBUTE_ALIGN16 unsigned int filter_x[4] = {0, 0xffffffff, 0xffffffff, 0xffffffff};
-	VM_ATTRIBUTE_ALIGN16 unsigned int filter_y[4] = {0xffffffff, 0, 0xffffffff, 0xffffffff};
-	VM_ATTRIBUTE_ALIGN16 unsigned int filter_z[4] = {0xffffffff, 0xffffffff, 0, 0xffffffff};
-    res0 = vec_and( res0, _mm_load_ps((float *)filter_x ) );
-    res1 = vec_and( res1, _mm_load_ps((float *)filter_y ) );
-    res2 = vec_and( res2, _mm_load_ps((float *)filter_z ) ); // TODO: Use selects?
-    return Matrix3(
-        Vector3( res0 ),
-        Vector3( res1 ),
-        Vector3( res2 )
-    );
-}
-
-VECTORMATH_FORCE_INLINE const Matrix3 crossMatrixMul( const Vector3 &vec, const Matrix3 & mat )
-{
-    return Matrix3( cross( vec, mat.getCol0() ), cross( vec, mat.getCol1() ), cross( vec, mat.getCol2() ) );
-}
-
-} // namespace Aos
-} // namespace Vectormath
-
-#endif
+/*
+   Copyright (C) 2006, 2007 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+   Redistribution and use in source and binary forms,
+   with or without modification, are permitted provided that the
+   following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the Sony Computer Entertainment Inc nor the names
+      of its contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
+*/
+
+
+#ifndef _VECTORMATH_MAT_AOS_CPP_H
+#define _VECTORMATH_MAT_AOS_CPP_H
+
+namespace Vectormath {
+namespace Aos {
+
+//-----------------------------------------------------------------------------
+// Constants
+// for shuffles, words are labeled [x,y,z,w] [a,b,c,d]
+
+#define _VECTORMATH_PERM_ZBWX ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Z, _VECTORMATH_PERM_B, _VECTORMATH_PERM_W, _VECTORMATH_PERM_X })
+#define _VECTORMATH_PERM_XCYX ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_X, _VECTORMATH_PERM_C, _VECTORMATH_PERM_Y, _VECTORMATH_PERM_X })
+#define _VECTORMATH_PERM_XYAB ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_X, _VECTORMATH_PERM_Y, _VECTORMATH_PERM_A, _VECTORMATH_PERM_B })
+#define _VECTORMATH_PERM_ZWCD ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Z, _VECTORMATH_PERM_W, _VECTORMATH_PERM_C, _VECTORMATH_PERM_D })
+#define _VECTORMATH_PERM_XZBX ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_X, _VECTORMATH_PERM_Z, _VECTORMATH_PERM_B, _VECTORMATH_PERM_X })     
+#define _VECTORMATH_PERM_CXXX ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_C, _VECTORMATH_PERM_X, _VECTORMATH_PERM_X, _VECTORMATH_PERM_X })
+#define _VECTORMATH_PERM_YAXX ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Y, _VECTORMATH_PERM_A, _VECTORMATH_PERM_X, _VECTORMATH_PERM_X })
+#define _VECTORMATH_PERM_XAZC ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_X, _VECTORMATH_PERM_A, _VECTORMATH_PERM_Z, _VECTORMATH_PERM_C })
+#define _VECTORMATH_PERM_YXWZ ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Y, _VECTORMATH_PERM_X, _VECTORMATH_PERM_W, _VECTORMATH_PERM_Z })
+#define _VECTORMATH_PERM_YBWD ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Y, _VECTORMATH_PERM_B, _VECTORMATH_PERM_W, _VECTORMATH_PERM_D })
+#define _VECTORMATH_PERM_XYCX ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_X, _VECTORMATH_PERM_Y, _VECTORMATH_PERM_C, _VECTORMATH_PERM_X })
+#define _VECTORMATH_PERM_YCXY ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Y, _VECTORMATH_PERM_C, _VECTORMATH_PERM_X, _VECTORMATH_PERM_Y })
+#define _VECTORMATH_PERM_CXYC ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_C, _VECTORMATH_PERM_X, _VECTORMATH_PERM_Y, _VECTORMATH_PERM_C })
+#define _VECTORMATH_PERM_ZAYX ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Z, _VECTORMATH_PERM_A, _VECTORMATH_PERM_Y, _VECTORMATH_PERM_X })
+#define _VECTORMATH_PERM_BZXX ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_B, _VECTORMATH_PERM_Z, _VECTORMATH_PERM_X, _VECTORMATH_PERM_X })
+#define _VECTORMATH_PERM_XZYA ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_X, _VECTORMATH_PERM_Z, _VECTORMATH_PERM_Y, _VECTORMATH_PERM_A })
+#define _VECTORMATH_PERM_ZXXB ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Z, _VECTORMATH_PERM_X, _VECTORMATH_PERM_X, _VECTORMATH_PERM_B })
+#define _VECTORMATH_PERM_YXXC ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Y, _VECTORMATH_PERM_X, _VECTORMATH_PERM_X, _VECTORMATH_PERM_C })
+#define _VECTORMATH_PERM_BBYX ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_B, _VECTORMATH_PERM_B, _VECTORMATH_PERM_Y, _VECTORMATH_PERM_X })
+#define _VECTORMATH_PI_OVER_2 1.570796327f
+
+//-----------------------------------------------------------------------------
+// Definitions
+
+VECTORMATH_FORCE_INLINE Matrix3::Matrix3( const Matrix3 & mat )
+{
+    mCol0 = mat.mCol0;
+    mCol1 = mat.mCol1;
+    mCol2 = mat.mCol2;
+}
+
+VECTORMATH_FORCE_INLINE Matrix3::Matrix3( float scalar )
+{
+    mCol0 = Vector3( scalar );
+    mCol1 = Vector3( scalar );
+    mCol2 = Vector3( scalar );
+}
+
+VECTORMATH_FORCE_INLINE Matrix3::Matrix3( const floatInVec &scalar )
+{
+    mCol0 = Vector3( scalar );
+    mCol1 = Vector3( scalar );
+    mCol2 = Vector3( scalar );
+}
+
+VECTORMATH_FORCE_INLINE Matrix3::Matrix3( const Quat &unitQuat )
+{
+    __m128 xyzw_2, wwww, yzxw, zxyw, yzxw_2, zxyw_2;
+    __m128 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+	VM_ATTRIBUTE_ALIGN16 unsigned int sx[4] = {0xffffffff, 0, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int sz[4] = {0, 0, 0xffffffff, 0};
+	__m128 select_x = _mm_load_ps((float *)sx);
+	__m128 select_z = _mm_load_ps((float *)sz);
+
+    xyzw_2 = _mm_add_ps( unitQuat.get128(), unitQuat.get128() );
+    wwww = _mm_shuffle_ps( unitQuat.get128(), unitQuat.get128(), _MM_SHUFFLE(3,3,3,3) );
+	yzxw = _mm_shuffle_ps( unitQuat.get128(), unitQuat.get128(), _MM_SHUFFLE(3,0,2,1) );
+	zxyw = _mm_shuffle_ps( unitQuat.get128(), unitQuat.get128(), _MM_SHUFFLE(3,1,0,2) );
+    yzxw_2 = _mm_shuffle_ps( xyzw_2, xyzw_2, _MM_SHUFFLE(3,0,2,1) );
+    zxyw_2 = _mm_shuffle_ps( xyzw_2, xyzw_2, _MM_SHUFFLE(3,1,0,2) );
+
+    tmp0 = _mm_mul_ps( yzxw_2, wwww );									// tmp0 = 2yw, 2zw, 2xw, 2w2
+	tmp1 = _mm_sub_ps( _mm_set1_ps(1.0f), _mm_mul_ps(yzxw, yzxw_2) );	// tmp1 = 1 - 2y2, 1 - 2z2, 1 - 2x2, 1 - 2w2
+    tmp2 = _mm_mul_ps( yzxw, xyzw_2 );									// tmp2 = 2xy, 2yz, 2xz, 2w2
+    tmp0 = _mm_add_ps( _mm_mul_ps(zxyw, xyzw_2), tmp0 );				// tmp0 = 2yw + 2zx, 2zw + 2xy, 2xw + 2yz, 2w2 + 2w2
+    tmp1 = _mm_sub_ps( tmp1, _mm_mul_ps(zxyw, zxyw_2) );				// tmp1 = 1 - 2y2 - 2z2, 1 - 2z2 - 2x2, 1 - 2x2 - 2y2, 1 - 2w2 - 2w2
+    tmp2 = _mm_sub_ps( tmp2, _mm_mul_ps(zxyw_2, wwww) );				// tmp2 = 2xy - 2zw, 2yz - 2xw, 2xz - 2yw, 2w2 -2w2
+
+    tmp3 = vec_sel( tmp0, tmp1, select_x );
+    tmp4 = vec_sel( tmp1, tmp2, select_x );
+    tmp5 = vec_sel( tmp2, tmp0, select_x );
+    mCol0 = Vector3( vec_sel( tmp3, tmp2, select_z ) );
+    mCol1 = Vector3( vec_sel( tmp4, tmp0, select_z ) );
+    mCol2 = Vector3( vec_sel( tmp5, tmp1, select_z ) );
+}
+
+VECTORMATH_FORCE_INLINE Matrix3::Matrix3( const Vector3 &_col0, const Vector3 &_col1, const Vector3 &_col2 )
+{
+    mCol0 = _col0;
+    mCol1 = _col1;
+    mCol2 = _col2;
+}
+
+VECTORMATH_FORCE_INLINE Matrix3 & Matrix3::setCol0( const Vector3 &_col0 )
+{
+    mCol0 = _col0;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Matrix3 & Matrix3::setCol1( const Vector3 &_col1 )
+{
+    mCol1 = _col1;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Matrix3 & Matrix3::setCol2( const Vector3 &_col2 )
+{
+    mCol2 = _col2;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Matrix3 & Matrix3::setCol( int col, const Vector3 &vec )
+{
+    *(&mCol0 + col) = vec;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Matrix3 & Matrix3::setRow( int row, const Vector3 &vec )
+{
+    mCol0.setElem( row, vec.getElem( 0 ) );
+    mCol1.setElem( row, vec.getElem( 1 ) );
+    mCol2.setElem( row, vec.getElem( 2 ) );
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Matrix3 & Matrix3::setElem( int col, int row, float val )
+{
+    (*this)[col].setElem(row, val);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Matrix3 & Matrix3::setElem( int col, int row, const floatInVec &val )
+{
+    Vector3 tmpV3_0;
+    tmpV3_0 = this->getCol( col );
+    tmpV3_0.setElem( row, val );
+    this->setCol( col, tmpV3_0 );
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec Matrix3::getElem( int col, int row ) const
+{
+    return this->getCol( col ).getElem( row );
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Matrix3::getCol0( ) const
+{
+    return mCol0;
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Matrix3::getCol1( ) const
+{
+    return mCol1;
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Matrix3::getCol2( ) const
+{
+    return mCol2;
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Matrix3::getCol( int col ) const
+{
+    return *(&mCol0 + col);
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Matrix3::getRow( int row ) const
+{
+    return Vector3( mCol0.getElem( row ), mCol1.getElem( row ), mCol2.getElem( row ) );
+}
+
+VECTORMATH_FORCE_INLINE Vector3 & Matrix3::operator []( int col )
+{
+    return *(&mCol0 + col);
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Matrix3::operator []( int col ) const
+{
+    return *(&mCol0 + col);
+}
+
+VECTORMATH_FORCE_INLINE Matrix3 & Matrix3::operator =( const Matrix3 & mat )
+{
+    mCol0 = mat.mCol0;
+    mCol1 = mat.mCol1;
+    mCol2 = mat.mCol2;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 transpose( const Matrix3 & mat )
+{
+    __m128 tmp0, tmp1, res0, res1, res2;
+    tmp0 = vec_mergeh( mat.getCol0().get128(), mat.getCol2().get128() );
+    tmp1 = vec_mergel( mat.getCol0().get128(), mat.getCol2().get128() );
+    res0 = vec_mergeh( tmp0, mat.getCol1().get128() );
+    //res1 = vec_perm( tmp0, mat.getCol1().get128(), _VECTORMATH_PERM_ZBWX );
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+	res1 = _mm_shuffle_ps( tmp0, tmp0, _MM_SHUFFLE(0,3,2,2));
+	res1 = vec_sel(res1, mat.getCol1().get128(), select_y);
+    //res2 = vec_perm( tmp1, mat.getCol1().get128(), _VECTORMATH_PERM_XCYX );
+	res2 = _mm_shuffle_ps( tmp1, tmp1, _MM_SHUFFLE(0,1,1,0));
+	res2 = vec_sel(res2, vec_splat(mat.getCol1().get128(), 2), select_y);
+    return Matrix3(
+        Vector3( res0 ),
+        Vector3( res1 ),
+        Vector3( res2 )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 inverse( const Matrix3 & mat )
+{
+    __m128 tmp0, tmp1, tmp2, tmp3, tmp4, dot, invdet, inv0, inv1, inv2;
+    tmp2 = _vmathVfCross( mat.getCol0().get128(), mat.getCol1().get128() );
+    tmp0 = _vmathVfCross( mat.getCol1().get128(), mat.getCol2().get128() );
+    tmp1 = _vmathVfCross( mat.getCol2().get128(), mat.getCol0().get128() );
+    dot = _vmathVfDot3( tmp2, mat.getCol2().get128() );
+    dot = vec_splat( dot, 0 );
+    invdet = recipf4( dot );
+    tmp3 = vec_mergeh( tmp0, tmp2 );
+    tmp4 = vec_mergel( tmp0, tmp2 );
+    inv0 = vec_mergeh( tmp3, tmp1 );
+    //inv1 = vec_perm( tmp3, tmp1, _VECTORMATH_PERM_ZBWX );
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+	inv1 = _mm_shuffle_ps( tmp3, tmp3, _MM_SHUFFLE(0,3,2,2));
+	inv1 = vec_sel(inv1, tmp1, select_y);
+    //inv2 = vec_perm( tmp4, tmp1, _VECTORMATH_PERM_XCYX );
+	inv2 = _mm_shuffle_ps( tmp4, tmp4, _MM_SHUFFLE(0,1,1,0));
+	inv2 = vec_sel(inv2, vec_splat(tmp1, 2), select_y);
+    inv0 = vec_mul( inv0, invdet );
+    inv1 = vec_mul( inv1, invdet );
+	inv2 = vec_mul( inv2, invdet );
+    return Matrix3(
+        Vector3( inv0 ),
+        Vector3( inv1 ),
+        Vector3( inv2 )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec determinant( const Matrix3 & mat )
+{
+    return dot( mat.getCol2(), cross( mat.getCol0(), mat.getCol1() ) );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::operator +( const Matrix3 & mat ) const
+{
+    return Matrix3(
+        ( mCol0 + mat.mCol0 ),
+        ( mCol1 + mat.mCol1 ),
+        ( mCol2 + mat.mCol2 )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::operator -( const Matrix3 & mat ) const
+{
+    return Matrix3(
+        ( mCol0 - mat.mCol0 ),
+        ( mCol1 - mat.mCol1 ),
+        ( mCol2 - mat.mCol2 )
+    );
+}
+
+VECTORMATH_FORCE_INLINE Matrix3 & Matrix3::operator +=( const Matrix3 & mat )
+{
+    *this = *this + mat;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Matrix3 & Matrix3::operator -=( const Matrix3 & mat )
+{
+    *this = *this - mat;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::operator -( ) const
+{
+    return Matrix3(
+        ( -mCol0 ),
+        ( -mCol1 ),
+        ( -mCol2 )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 absPerElem( const Matrix3 & mat )
+{
+    return Matrix3(
+        absPerElem( mat.getCol0() ),
+        absPerElem( mat.getCol1() ),
+        absPerElem( mat.getCol2() )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::operator *( float scalar ) const
+{
+    return *this * floatInVec(scalar);
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::operator *( const floatInVec &scalar ) const
+{
+    return Matrix3(
+        ( mCol0 * scalar ),
+        ( mCol1 * scalar ),
+        ( mCol2 * scalar )
+    );
+}
+
+VECTORMATH_FORCE_INLINE Matrix3 & Matrix3::operator *=( float scalar )
+{
+    return *this *= floatInVec(scalar);
+}
+
+VECTORMATH_FORCE_INLINE Matrix3 & Matrix3::operator *=( const floatInVec &scalar )
+{
+    *this = *this * scalar;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 operator *( float scalar, const Matrix3 & mat )
+{
+    return floatInVec(scalar) * mat;
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 operator *( const floatInVec &scalar, const Matrix3 & mat )
+{
+    return mat * scalar;
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Matrix3::operator *( const Vector3 &vec ) const
+{
+    __m128 res;
+    __m128 xxxx, yyyy, zzzz;
+    xxxx = vec_splat( vec.get128(), 0 );
+    yyyy = vec_splat( vec.get128(), 1 );
+    zzzz = vec_splat( vec.get128(), 2 );
+    res = vec_mul( mCol0.get128(), xxxx );
+    res = vec_madd( mCol1.get128(), yyyy, res );
+    res = vec_madd( mCol2.get128(), zzzz, res );
+    return Vector3( res );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::operator *( const Matrix3 & mat ) const
+{
+    return Matrix3(
+        ( *this * mat.mCol0 ),
+        ( *this * mat.mCol1 ),
+        ( *this * mat.mCol2 )
+    );
+}
+
+VECTORMATH_FORCE_INLINE Matrix3 & Matrix3::operator *=( const Matrix3 & mat )
+{
+    *this = *this * mat;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 mulPerElem( const Matrix3 & mat0, const Matrix3 & mat1 )
+{
+    return Matrix3(
+        mulPerElem( mat0.getCol0(), mat1.getCol0() ),
+        mulPerElem( mat0.getCol1(), mat1.getCol1() ),
+        mulPerElem( mat0.getCol2(), mat1.getCol2() )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::identity( )
+{
+    return Matrix3(
+        Vector3::xAxis( ),
+        Vector3::yAxis( ),
+        Vector3::zAxis( )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::rotationX( float radians )
+{
+    return rotationX( floatInVec(radians) );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::rotationX( const floatInVec &radians )
+{
+    __m128 s, c, res1, res2;
+    __m128 zero;
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
+    zero = _mm_setzero_ps();
+    sincosf4( radians.get128(), &s, &c );
+    res1 = vec_sel( zero, c, select_y );
+    res1 = vec_sel( res1, s, select_z );
+    res2 = vec_sel( zero, negatef4(s), select_y );
+    res2 = vec_sel( res2, c, select_z );
+    return Matrix3(
+        Vector3::xAxis( ),
+        Vector3( res1 ),
+        Vector3( res2 )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::rotationY( float radians )
+{
+    return rotationY( floatInVec(radians) );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::rotationY( const floatInVec &radians )
+{
+    __m128 s, c, res0, res2;
+    __m128 zero;
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
+    zero = _mm_setzero_ps();
+    sincosf4( radians.get128(), &s, &c );
+    res0 = vec_sel( zero, c, select_x );
+    res0 = vec_sel( res0, negatef4(s), select_z );
+    res2 = vec_sel( zero, s, select_x );
+    res2 = vec_sel( res2, c, select_z );
+    return Matrix3(
+        Vector3( res0 ),
+        Vector3::yAxis( ),
+        Vector3( res2 )
+	);
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::rotationZ( float radians )
+{
+    return rotationZ( floatInVec(radians) );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::rotationZ( const floatInVec &radians )
+{
+    __m128 s, c, res0, res1;
+    __m128 zero;
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+    zero = _mm_setzero_ps();
+    sincosf4( radians.get128(), &s, &c );
+    res0 = vec_sel( zero, c, select_x );
+    res0 = vec_sel( res0, s, select_y );
+    res1 = vec_sel( zero, negatef4(s), select_x );
+    res1 = vec_sel( res1, c, select_y );
+    return Matrix3(
+        Vector3( res0 ),
+        Vector3( res1 ),
+        Vector3::zAxis( )
+	);
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::rotationZYX( const Vector3 &radiansXYZ )
+{
+    __m128 angles, s, negS, c, X0, X1, Y0, Y1, Z0, Z1, tmp;
+    angles = Vector4( radiansXYZ, 0.0f ).get128();
+    sincosf4( angles, &s, &c );
+    negS = negatef4( s );
+    Z0 = vec_mergel( c, s );
+    Z1 = vec_mergel( negS, c );
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_xyz[4] = {0xffffffff, 0xffffffff, 0xffffffff, 0};
+    Z1 = vec_and( Z1, _mm_load_ps( (float *)select_xyz ) );
+	Y0 = _mm_shuffle_ps( c, negS, _MM_SHUFFLE(0,1,1,1) );
+	Y1 = _mm_shuffle_ps( s, c, _MM_SHUFFLE(0,1,1,1) );
+    X0 = vec_splat( s, 0 );
+    X1 = vec_splat( c, 0 );
+    tmp = vec_mul( Z0, Y1 );
+    return Matrix3(
+        Vector3( vec_mul( Z0, Y0 ) ),
+        Vector3( vec_madd( Z1, X1, vec_mul( tmp, X0 ) ) ),
+        Vector3( vec_nmsub( Z1, X0, vec_mul( tmp, X1 ) ) )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::rotation( float radians, const Vector3 &unitVec )
+{
+    return rotation( floatInVec(radians), unitVec );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::rotation( const floatInVec &radians, const Vector3 &unitVec )
+{
+    __m128 axis, s, c, oneMinusC, axisS, negAxisS, xxxx, yyyy, zzzz, tmp0, tmp1, tmp2;
+    axis = unitVec.get128();
+    sincosf4( radians.get128(), &s, &c );
+    xxxx = vec_splat( axis, 0 );
+    yyyy = vec_splat( axis, 1 );
+    zzzz = vec_splat( axis, 2 );
+    oneMinusC = vec_sub( _mm_set1_ps(1.0f), c );
+    axisS = vec_mul( axis, s );
+    negAxisS = negatef4( axisS );
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
+    //tmp0 = vec_perm( axisS, negAxisS, _VECTORMATH_PERM_XZBX );
+	tmp0 = _mm_shuffle_ps( axisS, axisS, _MM_SHUFFLE(0,0,2,0) );
+	tmp0 = vec_sel(tmp0, vec_splat(negAxisS, 1), select_z);
+    //tmp1 = vec_perm( axisS, negAxisS, _VECTORMATH_PERM_CXXX );
+	tmp1 = vec_sel( vec_splat(axisS, 0), vec_splat(negAxisS, 2), select_x );
+    //tmp2 = vec_perm( axisS, negAxisS, _VECTORMATH_PERM_YAXX );
+	tmp2 = _mm_shuffle_ps( axisS, axisS, _MM_SHUFFLE(0,0,0,1) );
+	tmp2 = vec_sel(tmp2, vec_splat(negAxisS, 0), select_y);
+    tmp0 = vec_sel( tmp0, c, select_x );
+    tmp1 = vec_sel( tmp1, c, select_y );
+    tmp2 = vec_sel( tmp2, c, select_z );
+    return Matrix3(
+        Vector3( vec_madd( vec_mul( axis, xxxx ), oneMinusC, tmp0 ) ),
+        Vector3( vec_madd( vec_mul( axis, yyyy ), oneMinusC, tmp1 ) ),
+        Vector3( vec_madd( vec_mul( axis, zzzz ), oneMinusC, tmp2 ) )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::rotation( const Quat &unitQuat )
+{
+    return Matrix3( unitQuat );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::scale( const Vector3 &scaleVec )
+{
+    __m128 zero = _mm_setzero_ps();
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
+    return Matrix3(
+        Vector3( vec_sel( zero, scaleVec.get128(), select_x ) ),
+        Vector3( vec_sel( zero, scaleVec.get128(), select_y ) ),
+        Vector3( vec_sel( zero, scaleVec.get128(), select_z ) )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 appendScale( const Matrix3 & mat, const Vector3 &scaleVec )
+{
+    return Matrix3(
+        ( mat.getCol0() * scaleVec.getX( ) ),
+        ( mat.getCol1() * scaleVec.getY( ) ),
+        ( mat.getCol2() * scaleVec.getZ( ) )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 prependScale( const Vector3 &scaleVec, const Matrix3 & mat )
+{
+    return Matrix3(
+        mulPerElem( mat.getCol0(), scaleVec ),
+        mulPerElem( mat.getCol1(), scaleVec ),
+        mulPerElem( mat.getCol2(), scaleVec )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 select( const Matrix3 & mat0, const Matrix3 & mat1, bool select1 )
+{
+    return Matrix3(
+        select( mat0.getCol0(), mat1.getCol0(), select1 ),
+        select( mat0.getCol1(), mat1.getCol1(), select1 ),
+        select( mat0.getCol2(), mat1.getCol2(), select1 )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 select( const Matrix3 & mat0, const Matrix3 & mat1, const boolInVec &select1 )
+{
+    return Matrix3(
+        select( mat0.getCol0(), mat1.getCol0(), select1 ),
+        select( mat0.getCol1(), mat1.getCol1(), select1 ),
+        select( mat0.getCol2(), mat1.getCol2(), select1 )
+    );
+}
+
+#ifdef _VECTORMATH_DEBUG
+
+VECTORMATH_FORCE_INLINE void print( const Matrix3 & mat )
+{
+    print( mat.getRow( 0 ) );
+    print( mat.getRow( 1 ) );
+    print( mat.getRow( 2 ) );
+}
+
+VECTORMATH_FORCE_INLINE void print( const Matrix3 & mat, const char * name )
+{
+    printf("%s:\n", name);
+    print( mat );
+}
+
+#endif
+
+VECTORMATH_FORCE_INLINE Matrix4::Matrix4( const Matrix4 & mat )
+{
+    mCol0 = mat.mCol0;
+    mCol1 = mat.mCol1;
+    mCol2 = mat.mCol2;
+    mCol3 = mat.mCol3;
+}
+
+VECTORMATH_FORCE_INLINE Matrix4::Matrix4( float scalar )
+{
+    mCol0 = Vector4( scalar );
+    mCol1 = Vector4( scalar );
+    mCol2 = Vector4( scalar );
+    mCol3 = Vector4( scalar );
+}
+
+VECTORMATH_FORCE_INLINE Matrix4::Matrix4( const floatInVec &scalar )
+{
+    mCol0 = Vector4( scalar );
+    mCol1 = Vector4( scalar );
+    mCol2 = Vector4( scalar );
+    mCol3 = Vector4( scalar );
+}
+
+VECTORMATH_FORCE_INLINE Matrix4::Matrix4( const Transform3 & mat )
+{
+    mCol0 = Vector4( mat.getCol0(), 0.0f );
+    mCol1 = Vector4( mat.getCol1(), 0.0f );
+    mCol2 = Vector4( mat.getCol2(), 0.0f );
+    mCol3 = Vector4( mat.getCol3(), 1.0f );
+}
+
+VECTORMATH_FORCE_INLINE Matrix4::Matrix4( const Vector4 &_col0, const Vector4 &_col1, const Vector4 &_col2, const Vector4 &_col3 )
+{
+    mCol0 = _col0;
+    mCol1 = _col1;
+    mCol2 = _col2;
+    mCol3 = _col3;
+}
+
+VECTORMATH_FORCE_INLINE Matrix4::Matrix4( const Matrix3 & mat, const Vector3 &translateVec )
+{
+    mCol0 = Vector4( mat.getCol0(), 0.0f );
+    mCol1 = Vector4( mat.getCol1(), 0.0f );
+    mCol2 = Vector4( mat.getCol2(), 0.0f );
+    mCol3 = Vector4( translateVec, 1.0f );
+}
+
+VECTORMATH_FORCE_INLINE Matrix4::Matrix4( const Quat &unitQuat, const Vector3 &translateVec )
+{
+    Matrix3 mat;
+    mat = Matrix3( unitQuat );
+    mCol0 = Vector4( mat.getCol0(), 0.0f );
+    mCol1 = Vector4( mat.getCol1(), 0.0f );
+    mCol2 = Vector4( mat.getCol2(), 0.0f );
+    mCol3 = Vector4( translateVec, 1.0f );
+}
+
+VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::setCol0( const Vector4 &_col0 )
+{
+    mCol0 = _col0;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::setCol1( const Vector4 &_col1 )
+{
+    mCol1 = _col1;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::setCol2( const Vector4 &_col2 )
+{
+    mCol2 = _col2;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::setCol3( const Vector4 &_col3 )
+{
+    mCol3 = _col3;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::setCol( int col, const Vector4 &vec )
+{
+    *(&mCol0 + col) = vec;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::setRow( int row, const Vector4 &vec )
+{
+    mCol0.setElem( row, vec.getElem( 0 ) );
+    mCol1.setElem( row, vec.getElem( 1 ) );
+    mCol2.setElem( row, vec.getElem( 2 ) );
+    mCol3.setElem( row, vec.getElem( 3 ) );
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::setElem( int col, int row, float val )
+{
+    (*this)[col].setElem(row, val);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::setElem( int col, int row, const floatInVec &val )
+{
+    Vector4 tmpV3_0;
+    tmpV3_0 = this->getCol( col );
+    tmpV3_0.setElem( row, val );
+    this->setCol( col, tmpV3_0 );
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec Matrix4::getElem( int col, int row ) const
+{
+    return this->getCol( col ).getElem( row );
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 Matrix4::getCol0( ) const
+{
+    return mCol0;
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 Matrix4::getCol1( ) const
+{
+    return mCol1;
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 Matrix4::getCol2( ) const
+{
+    return mCol2;
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 Matrix4::getCol3( ) const
+{
+    return mCol3;
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 Matrix4::getCol( int col ) const
+{
+    return *(&mCol0 + col);
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 Matrix4::getRow( int row ) const
+{
+    return Vector4( mCol0.getElem( row ), mCol1.getElem( row ), mCol2.getElem( row ), mCol3.getElem( row ) );
+}
+
+VECTORMATH_FORCE_INLINE Vector4 & Matrix4::operator []( int col )
+{
+    return *(&mCol0 + col);
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 Matrix4::operator []( int col ) const
+{
+    return *(&mCol0 + col);
+}
+
+VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::operator =( const Matrix4 & mat )
+{
+    mCol0 = mat.mCol0;
+    mCol1 = mat.mCol1;
+    mCol2 = mat.mCol2;
+    mCol3 = mat.mCol3;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 transpose( const Matrix4 & mat )
+{
+    __m128 tmp0, tmp1, tmp2, tmp3, res0, res1, res2, res3;
+    tmp0 = vec_mergeh( mat.getCol0().get128(), mat.getCol2().get128() );
+    tmp1 = vec_mergeh( mat.getCol1().get128(), mat.getCol3().get128() );
+    tmp2 = vec_mergel( mat.getCol0().get128(), mat.getCol2().get128() );
+    tmp3 = vec_mergel( mat.getCol1().get128(), mat.getCol3().get128() );
+    res0 = vec_mergeh( tmp0, tmp1 );
+    res1 = vec_mergel( tmp0, tmp1 );
+    res2 = vec_mergeh( tmp2, tmp3 );
+    res3 = vec_mergel( tmp2, tmp3 );
+    return Matrix4(
+        Vector4( res0 ),
+        Vector4( res1 ),
+        Vector4( res2 ),
+        Vector4( res3 )
+    );
+}
+
+// TODO: Tidy
+static VM_ATTRIBUTE_ALIGN16 const unsigned int _vmathPNPN[4] = {0x00000000, 0x80000000, 0x00000000, 0x80000000};
+static VM_ATTRIBUTE_ALIGN16 const unsigned int _vmathNPNP[4] = {0x80000000, 0x00000000, 0x80000000, 0x00000000};
+static VM_ATTRIBUTE_ALIGN16 const float _vmathZERONE[4] = {1.0f, 0.0f, 0.0f, 1.0f};
+
+VECTORMATH_FORCE_INLINE const Matrix4 inverse( const Matrix4 & mat )
+{
+	__m128 Va,Vb,Vc;
+	__m128 r1,r2,r3,tt,tt2;
+	__m128 sum,Det,RDet;
+	__m128 trns0,trns1,trns2,trns3;
+
+	__m128 _L1 = mat.getCol0().get128();
+	__m128 _L2 = mat.getCol1().get128();
+	__m128 _L3 = mat.getCol2().get128();
+	__m128 _L4 = mat.getCol3().get128();
+	// Calculating the minterms for the first line.
+
+	// _mm_ror_ps is just a macro using _mm_shuffle_ps().
+	tt = _L4; tt2 = _mm_ror_ps(_L3,1); 
+	Vc = _mm_mul_ps(tt2,_mm_ror_ps(tt,0));					// V3'dot V4
+	Va = _mm_mul_ps(tt2,_mm_ror_ps(tt,2));					// V3'dot V4"
+	Vb = _mm_mul_ps(tt2,_mm_ror_ps(tt,3));					// V3' dot V4^
+
+	r1 = _mm_sub_ps(_mm_ror_ps(Va,1),_mm_ror_ps(Vc,2));		// V3" dot V4^ - V3^ dot V4"
+	r2 = _mm_sub_ps(_mm_ror_ps(Vb,2),_mm_ror_ps(Vb,0));		// V3^ dot V4' - V3' dot V4^
+	r3 = _mm_sub_ps(_mm_ror_ps(Va,0),_mm_ror_ps(Vc,1));		// V3' dot V4" - V3" dot V4'
+
+	tt = _L2;
+	Va = _mm_ror_ps(tt,1);		sum = _mm_mul_ps(Va,r1);
+	Vb = _mm_ror_ps(tt,2);		sum = _mm_add_ps(sum,_mm_mul_ps(Vb,r2));
+	Vc = _mm_ror_ps(tt,3);		sum = _mm_add_ps(sum,_mm_mul_ps(Vc,r3));
+
+	// Calculating the determinant.
+	Det = _mm_mul_ps(sum,_L1);
+	Det = _mm_add_ps(Det,_mm_movehl_ps(Det,Det));
+
+	const __m128 Sign_PNPN = _mm_load_ps((float *)_vmathPNPN);
+	const __m128 Sign_NPNP = _mm_load_ps((float *)_vmathNPNP);
+
+	__m128 mtL1 = _mm_xor_ps(sum,Sign_PNPN);
+
+	// Calculating the minterms of the second line (using previous results).
+	tt = _mm_ror_ps(_L1,1);		sum = _mm_mul_ps(tt,r1);
+	tt = _mm_ror_ps(tt,1);		sum = _mm_add_ps(sum,_mm_mul_ps(tt,r2));
+	tt = _mm_ror_ps(tt,1);		sum = _mm_add_ps(sum,_mm_mul_ps(tt,r3));
+	__m128 mtL2 = _mm_xor_ps(sum,Sign_NPNP);
+
+	// Testing the determinant.
+	Det = _mm_sub_ss(Det,_mm_shuffle_ps(Det,Det,1));
+
+	// Calculating the minterms of the third line.
+	tt = _mm_ror_ps(_L1,1);
+	Va = _mm_mul_ps(tt,Vb);									// V1' dot V2"
+	Vb = _mm_mul_ps(tt,Vc);									// V1' dot V2^
+	Vc = _mm_mul_ps(tt,_L2);								// V1' dot V2
+
+	r1 = _mm_sub_ps(_mm_ror_ps(Va,1),_mm_ror_ps(Vc,2));		// V1" dot V2^ - V1^ dot V2"
+	r2 = _mm_sub_ps(_mm_ror_ps(Vb,2),_mm_ror_ps(Vb,0));		// V1^ dot V2' - V1' dot V2^
+	r3 = _mm_sub_ps(_mm_ror_ps(Va,0),_mm_ror_ps(Vc,1));		// V1' dot V2" - V1" dot V2'
+
+	tt = _mm_ror_ps(_L4,1);		sum = _mm_mul_ps(tt,r1);
+	tt = _mm_ror_ps(tt,1);		sum = _mm_add_ps(sum,_mm_mul_ps(tt,r2));
+	tt = _mm_ror_ps(tt,1);		sum = _mm_add_ps(sum,_mm_mul_ps(tt,r3));
+	__m128 mtL3 = _mm_xor_ps(sum,Sign_PNPN);
+
+	// Dividing is FASTER than rcp_nr! (Because rcp_nr causes many register-memory RWs).
+	RDet = _mm_div_ss(_mm_load_ss((float *)&_vmathZERONE), Det); // TODO: just 1.0f?
+	RDet = _mm_shuffle_ps(RDet,RDet,0x00);
+
+	// Devide the first 12 minterms with the determinant.
+	mtL1 = _mm_mul_ps(mtL1, RDet);
+	mtL2 = _mm_mul_ps(mtL2, RDet);
+	mtL3 = _mm_mul_ps(mtL3, RDet);
+
+	// Calculate the minterms of the forth line and devide by the determinant.
+	tt = _mm_ror_ps(_L3,1);		sum = _mm_mul_ps(tt,r1);
+	tt = _mm_ror_ps(tt,1);		sum = _mm_add_ps(sum,_mm_mul_ps(tt,r2));
+	tt = _mm_ror_ps(tt,1);		sum = _mm_add_ps(sum,_mm_mul_ps(tt,r3));
+	__m128 mtL4 = _mm_xor_ps(sum,Sign_NPNP);
+	mtL4 = _mm_mul_ps(mtL4, RDet);
+
+	// Now we just have to transpose the minterms matrix.
+	trns0 = _mm_unpacklo_ps(mtL1,mtL2);
+	trns1 = _mm_unpacklo_ps(mtL3,mtL4);
+	trns2 = _mm_unpackhi_ps(mtL1,mtL2);
+	trns3 = _mm_unpackhi_ps(mtL3,mtL4);
+	_L1 = _mm_movelh_ps(trns0,trns1);
+	_L2 = _mm_movehl_ps(trns1,trns0);
+	_L3 = _mm_movelh_ps(trns2,trns3);
+	_L4 = _mm_movehl_ps(trns3,trns2);
+
+    return Matrix4(
+        Vector4( _L1 ),
+        Vector4( _L2 ),
+        Vector4( _L3 ),
+        Vector4( _L4 )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 affineInverse( const Matrix4 & mat )
+{
+    Transform3 affineMat;
+    affineMat.setCol0( mat.getCol0().getXYZ( ) );
+    affineMat.setCol1( mat.getCol1().getXYZ( ) );
+    affineMat.setCol2( mat.getCol2().getXYZ( ) );
+    affineMat.setCol3( mat.getCol3().getXYZ( ) );
+    return Matrix4( inverse( affineMat ) );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 orthoInverse( const Matrix4 & mat )
+{
+    Transform3 affineMat;
+    affineMat.setCol0( mat.getCol0().getXYZ( ) );
+    affineMat.setCol1( mat.getCol1().getXYZ( ) );
+    affineMat.setCol2( mat.getCol2().getXYZ( ) );
+    affineMat.setCol3( mat.getCol3().getXYZ( ) );
+    return Matrix4( orthoInverse( affineMat ) );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec determinant( const Matrix4 & mat )
+{
+	__m128 Va,Vb,Vc;
+	__m128 r1,r2,r3,tt,tt2;
+	__m128 sum,Det;
+
+	__m128 _L1 = mat.getCol0().get128();
+	__m128 _L2 = mat.getCol1().get128();
+	__m128 _L3 = mat.getCol2().get128();
+	__m128 _L4 = mat.getCol3().get128();
+	// Calculating the minterms for the first line.
+
+	// _mm_ror_ps is just a macro using _mm_shuffle_ps().
+	tt = _L4; tt2 = _mm_ror_ps(_L3,1); 
+	Vc = _mm_mul_ps(tt2,_mm_ror_ps(tt,0));					// V3' dot V4
+	Va = _mm_mul_ps(tt2,_mm_ror_ps(tt,2));					// V3' dot V4"
+	Vb = _mm_mul_ps(tt2,_mm_ror_ps(tt,3));					// V3' dot V4^
+
+	r1 = _mm_sub_ps(_mm_ror_ps(Va,1),_mm_ror_ps(Vc,2));		// V3" dot V4^ - V3^ dot V4"
+	r2 = _mm_sub_ps(_mm_ror_ps(Vb,2),_mm_ror_ps(Vb,0));		// V3^ dot V4' - V3' dot V4^
+	r3 = _mm_sub_ps(_mm_ror_ps(Va,0),_mm_ror_ps(Vc,1));		// V3' dot V4" - V3" dot V4'
+
+	tt = _L2;
+	Va = _mm_ror_ps(tt,1);		sum = _mm_mul_ps(Va,r1);
+	Vb = _mm_ror_ps(tt,2);		sum = _mm_add_ps(sum,_mm_mul_ps(Vb,r2));
+	Vc = _mm_ror_ps(tt,3);		sum = _mm_add_ps(sum,_mm_mul_ps(Vc,r3));
+
+	// Calculating the determinant.
+	Det = _mm_mul_ps(sum,_L1);
+	Det = _mm_add_ps(Det,_mm_movehl_ps(Det,Det));
+
+	// Calculating the minterms of the second line (using previous results).
+	tt = _mm_ror_ps(_L1,1);		sum = _mm_mul_ps(tt,r1);
+	tt = _mm_ror_ps(tt,1);		sum = _mm_add_ps(sum,_mm_mul_ps(tt,r2));
+	tt = _mm_ror_ps(tt,1);		sum = _mm_add_ps(sum,_mm_mul_ps(tt,r3));
+
+	// Testing the determinant.
+	Det = _mm_sub_ss(Det,_mm_shuffle_ps(Det,Det,1));
+	return floatInVec(Det, 0);
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::operator +( const Matrix4 & mat ) const
+{
+    return Matrix4(
+        ( mCol0 + mat.mCol0 ),
+        ( mCol1 + mat.mCol1 ),
+        ( mCol2 + mat.mCol2 ),
+        ( mCol3 + mat.mCol3 )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::operator -( const Matrix4 & mat ) const
+{
+    return Matrix4(
+        ( mCol0 - mat.mCol0 ),
+        ( mCol1 - mat.mCol1 ),
+        ( mCol2 - mat.mCol2 ),
+        ( mCol3 - mat.mCol3 )
+    );
+}
+
+VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::operator +=( const Matrix4 & mat )
+{
+    *this = *this + mat;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::operator -=( const Matrix4 & mat )
+{
+    *this = *this - mat;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::operator -( ) const
+{
+    return Matrix4(
+        ( -mCol0 ),
+        ( -mCol1 ),
+        ( -mCol2 ),
+        ( -mCol3 )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 absPerElem( const Matrix4 & mat )
+{
+    return Matrix4(
+        absPerElem( mat.getCol0() ),
+        absPerElem( mat.getCol1() ),
+        absPerElem( mat.getCol2() ),
+        absPerElem( mat.getCol3() )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::operator *( float scalar ) const
+{
+    return *this * floatInVec(scalar);
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::operator *( const floatInVec &scalar ) const
+{
+    return Matrix4(
+        ( mCol0 * scalar ),
+        ( mCol1 * scalar ),
+        ( mCol2 * scalar ),
+        ( mCol3 * scalar )
+    );
+}
+
+VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::operator *=( float scalar )
+{
+    return *this *= floatInVec(scalar);
+}
+
+VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::operator *=( const floatInVec &scalar )
+{
+    *this = *this * scalar;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 operator *( float scalar, const Matrix4 & mat )
+{
+    return floatInVec(scalar) * mat;
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 operator *( const floatInVec &scalar, const Matrix4 & mat )
+{
+    return mat * scalar;
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 Matrix4::operator *( const Vector4 &vec ) const
+{
+    return Vector4(
+		_mm_add_ps(
+			_mm_add_ps(_mm_mul_ps(mCol0.get128(), _mm_shuffle_ps(vec.get128(), vec.get128(), _MM_SHUFFLE(0,0,0,0))), _mm_mul_ps(mCol1.get128(), _mm_shuffle_ps(vec.get128(), vec.get128(), _MM_SHUFFLE(1,1,1,1)))),
+			_mm_add_ps(_mm_mul_ps(mCol2.get128(), _mm_shuffle_ps(vec.get128(), vec.get128(), _MM_SHUFFLE(2,2,2,2))), _mm_mul_ps(mCol3.get128(), _mm_shuffle_ps(vec.get128(), vec.get128(), _MM_SHUFFLE(3,3,3,3)))))
+		);
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 Matrix4::operator *( const Vector3 &vec ) const
+{
+    return Vector4(
+		_mm_add_ps(
+			_mm_add_ps(_mm_mul_ps(mCol0.get128(), _mm_shuffle_ps(vec.get128(), vec.get128(), _MM_SHUFFLE(0,0,0,0))), _mm_mul_ps(mCol1.get128(), _mm_shuffle_ps(vec.get128(), vec.get128(), _MM_SHUFFLE(1,1,1,1)))),
+			_mm_mul_ps(mCol2.get128(), _mm_shuffle_ps(vec.get128(), vec.get128(), _MM_SHUFFLE(2,2,2,2))))
+		);
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 Matrix4::operator *( const Point3 &pnt ) const
+{
+    return Vector4(
+		_mm_add_ps(
+			_mm_add_ps(_mm_mul_ps(mCol0.get128(), _mm_shuffle_ps(pnt.get128(), pnt.get128(), _MM_SHUFFLE(0,0,0,0))), _mm_mul_ps(mCol1.get128(), _mm_shuffle_ps(pnt.get128(), pnt.get128(), _MM_SHUFFLE(1,1,1,1)))),
+			_mm_add_ps(_mm_mul_ps(mCol2.get128(), _mm_shuffle_ps(pnt.get128(), pnt.get128(), _MM_SHUFFLE(2,2,2,2))), mCol3.get128()))
+		);
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::operator *( const Matrix4 & mat ) const
+{
+    return Matrix4(
+        ( *this * mat.mCol0 ),
+        ( *this * mat.mCol1 ),
+        ( *this * mat.mCol2 ),
+        ( *this * mat.mCol3 )
+    );
+}
+
+VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::operator *=( const Matrix4 & mat )
+{
+    *this = *this * mat;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::operator *( const Transform3 & tfrm ) const
+{
+    return Matrix4(
+        ( *this * tfrm.getCol0() ),
+        ( *this * tfrm.getCol1() ),
+        ( *this * tfrm.getCol2() ),
+        ( *this * Point3( tfrm.getCol3() ) )
+    );
+}
+
+VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::operator *=( const Transform3 & tfrm )
+{
+    *this = *this * tfrm;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 mulPerElem( const Matrix4 & mat0, const Matrix4 & mat1 )
+{
+    return Matrix4(
+        mulPerElem( mat0.getCol0(), mat1.getCol0() ),
+        mulPerElem( mat0.getCol1(), mat1.getCol1() ),
+        mulPerElem( mat0.getCol2(), mat1.getCol2() ),
+        mulPerElem( mat0.getCol3(), mat1.getCol3() )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::identity( )
+{
+    return Matrix4(
+        Vector4::xAxis( ),
+        Vector4::yAxis( ),
+        Vector4::zAxis( ),
+        Vector4::wAxis( )
+    );
+}
+
+VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::setUpper3x3( const Matrix3 & mat3 )
+{
+    mCol0.setXYZ( mat3.getCol0() );
+    mCol1.setXYZ( mat3.getCol1() );
+    mCol2.setXYZ( mat3.getCol2() );
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 Matrix4::getUpper3x3( ) const
+{
+    return Matrix3(
+        mCol0.getXYZ( ),
+        mCol1.getXYZ( ),
+        mCol2.getXYZ( )
+    );
+}
+
+VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::setTranslation( const Vector3 &translateVec )
+{
+    mCol3.setXYZ( translateVec );
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Matrix4::getTranslation( ) const
+{
+    return mCol3.getXYZ( );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::rotationX( float radians )
+{
+    return rotationX( floatInVec(radians) );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::rotationX( const floatInVec &radians )
+{
+    __m128 s, c, res1, res2;
+    __m128 zero;
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
+    zero = _mm_setzero_ps();
+    sincosf4( radians.get128(), &s, &c );
+    res1 = vec_sel( zero, c, select_y );
+    res1 = vec_sel( res1, s, select_z );
+    res2 = vec_sel( zero, negatef4(s), select_y );
+    res2 = vec_sel( res2, c, select_z );
+    return Matrix4(
+        Vector4::xAxis( ),
+        Vector4( res1 ),
+        Vector4( res2 ),
+        Vector4::wAxis( )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::rotationY( float radians )
+{
+    return rotationY( floatInVec(radians) );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::rotationY( const floatInVec &radians )
+{
+    __m128 s, c, res0, res2;
+    __m128 zero;
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
+    zero = _mm_setzero_ps();
+    sincosf4( radians.get128(), &s, &c );
+    res0 = vec_sel( zero, c, select_x );
+    res0 = vec_sel( res0, negatef4(s), select_z );
+    res2 = vec_sel( zero, s, select_x );
+    res2 = vec_sel( res2, c, select_z );
+    return Matrix4(
+        Vector4( res0 ),
+        Vector4::yAxis( ),
+        Vector4( res2 ),
+        Vector4::wAxis( )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::rotationZ( float radians )
+{
+    return rotationZ( floatInVec(radians) );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::rotationZ( const floatInVec &radians )
+{
+    __m128 s, c, res0, res1;
+    __m128 zero;
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+    zero = _mm_setzero_ps();
+    sincosf4( radians.get128(), &s, &c );
+    res0 = vec_sel( zero, c, select_x );
+    res0 = vec_sel( res0, s, select_y );
+    res1 = vec_sel( zero, negatef4(s), select_x );
+    res1 = vec_sel( res1, c, select_y );
+    return Matrix4(
+        Vector4( res0 ),
+        Vector4( res1 ),
+        Vector4::zAxis( ),
+        Vector4::wAxis( )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::rotationZYX( const Vector3 &radiansXYZ )
+{
+    __m128 angles, s, negS, c, X0, X1, Y0, Y1, Z0, Z1, tmp;
+    angles = Vector4( radiansXYZ, 0.0f ).get128();
+    sincosf4( angles, &s, &c );
+    negS = negatef4( s );
+    Z0 = vec_mergel( c, s );
+    Z1 = vec_mergel( negS, c );
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_xyz[4] = {0xffffffff, 0xffffffff, 0xffffffff, 0};
+    Z1 = vec_and( Z1, _mm_load_ps( (float *)select_xyz ) );
+	Y0 = _mm_shuffle_ps( c, negS, _MM_SHUFFLE(0,1,1,1) );
+	Y1 = _mm_shuffle_ps( s, c, _MM_SHUFFLE(0,1,1,1) );
+    X0 = vec_splat( s, 0 );
+    X1 = vec_splat( c, 0 );
+    tmp = vec_mul( Z0, Y1 );
+    return Matrix4(
+        Vector4( vec_mul( Z0, Y0 ) ),
+        Vector4( vec_madd( Z1, X1, vec_mul( tmp, X0 ) ) ),
+        Vector4( vec_nmsub( Z1, X0, vec_mul( tmp, X1 ) ) ),
+        Vector4::wAxis( )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::rotation( float radians, const Vector3 &unitVec )
+{
+    return rotation( floatInVec(radians), unitVec );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::rotation( const floatInVec &radians, const Vector3 &unitVec )
+{
+    __m128 axis, s, c, oneMinusC, axisS, negAxisS, xxxx, yyyy, zzzz, tmp0, tmp1, tmp2;
+    axis = unitVec.get128();
+    sincosf4( radians.get128(), &s, &c );
+    xxxx = vec_splat( axis, 0 );
+    yyyy = vec_splat( axis, 1 );
+    zzzz = vec_splat( axis, 2 );
+    oneMinusC = vec_sub( _mm_set1_ps(1.0f), c );
+    axisS = vec_mul( axis, s );
+    negAxisS = negatef4( axisS );
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
+    //tmp0 = vec_perm( axisS, negAxisS, _VECTORMATH_PERM_XZBX );
+	tmp0 = _mm_shuffle_ps( axisS, axisS, _MM_SHUFFLE(0,0,2,0) );
+	tmp0 = vec_sel(tmp0, vec_splat(negAxisS, 1), select_z);
+    //tmp1 = vec_perm( axisS, negAxisS, _VECTORMATH_PERM_CXXX );
+	tmp1 = vec_sel( vec_splat(axisS, 0), vec_splat(negAxisS, 2), select_x );
+    //tmp2 = vec_perm( axisS, negAxisS, _VECTORMATH_PERM_YAXX );
+	tmp2 = _mm_shuffle_ps( axisS, axisS, _MM_SHUFFLE(0,0,0,1) );
+	tmp2 = vec_sel(tmp2, vec_splat(negAxisS, 0), select_y);
+    tmp0 = vec_sel( tmp0, c, select_x );
+    tmp1 = vec_sel( tmp1, c, select_y );
+    tmp2 = vec_sel( tmp2, c, select_z );
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_xyz[4] = {0xffffffff, 0xffffffff, 0xffffffff, 0};
+    axis = vec_and( axis, _mm_load_ps( (float *)select_xyz ) );
+    tmp0 = vec_and( tmp0, _mm_load_ps( (float *)select_xyz ) );
+    tmp1 = vec_and( tmp1, _mm_load_ps( (float *)select_xyz ) );
+    tmp2 = vec_and( tmp2, _mm_load_ps( (float *)select_xyz ) );
+    return Matrix4(
+        Vector4( vec_madd( vec_mul( axis, xxxx ), oneMinusC, tmp0 ) ),
+        Vector4( vec_madd( vec_mul( axis, yyyy ), oneMinusC, tmp1 ) ),
+        Vector4( vec_madd( vec_mul( axis, zzzz ), oneMinusC, tmp2 ) ),
+        Vector4::wAxis( )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::rotation( const Quat &unitQuat )
+{
+    return Matrix4( Transform3::rotation( unitQuat ) );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::scale( const Vector3 &scaleVec )
+{
+    __m128 zero = _mm_setzero_ps();
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
+    return Matrix4(
+        Vector4( vec_sel( zero, scaleVec.get128(), select_x ) ),
+        Vector4( vec_sel( zero, scaleVec.get128(), select_y ) ),
+        Vector4( vec_sel( zero, scaleVec.get128(), select_z ) ),
+        Vector4::wAxis( )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 appendScale( const Matrix4 & mat, const Vector3 &scaleVec )
+{
+    return Matrix4(
+        ( mat.getCol0() * scaleVec.getX( ) ),
+        ( mat.getCol1() * scaleVec.getY( ) ),
+        ( mat.getCol2() * scaleVec.getZ( ) ),
+        mat.getCol3()
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 prependScale( const Vector3 &scaleVec, const Matrix4 & mat )
+{
+    Vector4 scale4;
+    scale4 = Vector4( scaleVec, 1.0f );
+    return Matrix4(
+        mulPerElem( mat.getCol0(), scale4 ),
+        mulPerElem( mat.getCol1(), scale4 ),
+        mulPerElem( mat.getCol2(), scale4 ),
+        mulPerElem( mat.getCol3(), scale4 )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::translation( const Vector3 &translateVec )
+{
+    return Matrix4(
+        Vector4::xAxis( ),
+        Vector4::yAxis( ),
+        Vector4::zAxis( ),
+        Vector4( translateVec, 1.0f )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::lookAt( const Point3 &eyePos, const Point3 &lookAtPos, const Vector3 &upVec )
+{
+    Matrix4 m4EyeFrame;
+    Vector3 v3X, v3Y, v3Z;
+    v3Y = normalize( upVec );
+    v3Z = normalize( ( eyePos - lookAtPos ) );
+    v3X = normalize( cross( v3Y, v3Z ) );
+    v3Y = cross( v3Z, v3X );
+    m4EyeFrame = Matrix4( Vector4( v3X ), Vector4( v3Y ), Vector4( v3Z ), Vector4( eyePos ) );
+    return orthoInverse( m4EyeFrame );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::perspective( float fovyRadians, float aspect, float zNear, float zFar )
+{
+    float f, rangeInv;
+    __m128 zero, col0, col1, col2, col3;
+    union { __m128 v; float s[4]; } tmp;
+    f = tanf( _VECTORMATH_PI_OVER_2 - fovyRadians * 0.5f );
+    rangeInv = 1.0f / ( zNear - zFar );
+    zero = _mm_setzero_ps();
+    tmp.v = zero;
+    tmp.s[0] = f / aspect;
+    col0 = tmp.v;
+    tmp.v = zero;
+    tmp.s[1] = f;
+    col1 = tmp.v;
+    tmp.v = zero;
+    tmp.s[2] = ( zNear + zFar ) * rangeInv;
+    tmp.s[3] = -1.0f;
+    col2 = tmp.v;
+    tmp.v = zero;
+    tmp.s[2] = zNear * zFar * rangeInv * 2.0f;
+    col3 = tmp.v;
+    return Matrix4(
+        Vector4( col0 ),
+        Vector4( col1 ),
+        Vector4( col2 ),
+        Vector4( col3 )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::frustum( float left, float right, float bottom, float top, float zNear, float zFar )
+{
+    /* function implementation based on code from STIDC SDK:           */
+    /* --------------------------------------------------------------  */
+    /* PLEASE DO NOT MODIFY THIS SECTION                               */
+    /* This prolog section is automatically generated.                 */
+    /*                                                                 */
+    /* (C)Copyright                                                    */
+    /* Sony Computer Entertainment, Inc.,                              */
+    /* Toshiba Corporation,                                            */
+    /* International Business Machines Corporation,                    */
+    /* 2001,2002.                                                      */
+    /* S/T/I Confidential Information                                  */
+    /* --------------------------------------------------------------  */
+    __m128 lbf, rtn;
+    __m128 diff, sum, inv_diff;
+    __m128 diagonal, column, near2;
+    __m128 zero = _mm_setzero_ps();
+    union { __m128 v; float s[4]; } l, f, r, n, b, t; // TODO: Union?
+    l.s[0] = left;
+    f.s[0] = zFar;
+    r.s[0] = right;
+    n.s[0] = zNear;
+    b.s[0] = bottom;
+    t.s[0] = top;
+    lbf = vec_mergeh( l.v, f.v );
+    rtn = vec_mergeh( r.v, n.v );
+    lbf = vec_mergeh( lbf, b.v );
+    rtn = vec_mergeh( rtn, t.v );
+    diff = vec_sub( rtn, lbf );
+    sum  = vec_add( rtn, lbf );
+    inv_diff = recipf4( diff );
+    near2 = vec_splat( n.v, 0 );
+    near2 = vec_add( near2, near2 );
+    diagonal = vec_mul( near2, inv_diff );
+    column = vec_mul( sum, inv_diff );
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_w[4] = {0, 0, 0, 0xffffffff};
+    return Matrix4(
+        Vector4( vec_sel( zero, diagonal, select_x ) ),
+        Vector4( vec_sel( zero, diagonal, select_y ) ),
+        Vector4( vec_sel( column, _mm_set1_ps(-1.0f), select_w ) ),
+        Vector4( vec_sel( zero, vec_mul( diagonal, vec_splat( f.v, 0 ) ), select_z ) )
+	);
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::orthographic( float left, float right, float bottom, float top, float zNear, float zFar )
+{
+    /* function implementation based on code from STIDC SDK:           */
+    /* --------------------------------------------------------------  */
+    /* PLEASE DO NOT MODIFY THIS SECTION                               */
+    /* This prolog section is automatically generated.                 */
+    /*                                                                 */
+    /* (C)Copyright                                                    */
+    /* Sony Computer Entertainment, Inc.,                              */
+    /* Toshiba Corporation,                                            */
+    /* International Business Machines Corporation,                    */
+    /* 2001,2002.                                                      */
+    /* S/T/I Confidential Information                                  */
+    /* --------------------------------------------------------------  */
+    __m128 lbf, rtn;
+    __m128 diff, sum, inv_diff, neg_inv_diff;
+    __m128 diagonal, column;
+    __m128 zero = _mm_setzero_ps();
+    union { __m128 v; float s[4]; } l, f, r, n, b, t;
+    l.s[0] = left;
+    f.s[0] = zFar;
+    r.s[0] = right;
+    n.s[0] = zNear;
+    b.s[0] = bottom;
+    t.s[0] = top;
+    lbf = vec_mergeh( l.v, f.v );
+    rtn = vec_mergeh( r.v, n.v );
+    lbf = vec_mergeh( lbf, b.v );
+    rtn = vec_mergeh( rtn, t.v );
+    diff = vec_sub( rtn, lbf );
+    sum  = vec_add( rtn, lbf );
+    inv_diff = recipf4( diff );
+    neg_inv_diff = negatef4( inv_diff );
+    diagonal = vec_add( inv_diff, inv_diff );
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_w[4] = {0, 0, 0, 0xffffffff};
+    column = vec_mul( sum, vec_sel( neg_inv_diff, inv_diff, select_z ) ); // TODO: no madds with zero
+    return Matrix4(
+        Vector4( vec_sel( zero, diagonal, select_x ) ),
+        Vector4( vec_sel( zero, diagonal, select_y ) ),
+        Vector4( vec_sel( zero, diagonal, select_z ) ),
+        Vector4( vec_sel( column, _mm_set1_ps(1.0f), select_w ) )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 select( const Matrix4 & mat0, const Matrix4 & mat1, bool select1 )
+{
+    return Matrix4(
+        select( mat0.getCol0(), mat1.getCol0(), select1 ),
+        select( mat0.getCol1(), mat1.getCol1(), select1 ),
+        select( mat0.getCol2(), mat1.getCol2(), select1 ),
+        select( mat0.getCol3(), mat1.getCol3(), select1 )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 select( const Matrix4 & mat0, const Matrix4 & mat1, const boolInVec &select1 )
+{
+    return Matrix4(
+        select( mat0.getCol0(), mat1.getCol0(), select1 ),
+        select( mat0.getCol1(), mat1.getCol1(), select1 ),
+        select( mat0.getCol2(), mat1.getCol2(), select1 ),
+        select( mat0.getCol3(), mat1.getCol3(), select1 )
+    );
+}
+
+#ifdef _VECTORMATH_DEBUG
+
+VECTORMATH_FORCE_INLINE void print( const Matrix4 & mat )
+{
+    print( mat.getRow( 0 ) );
+    print( mat.getRow( 1 ) );
+    print( mat.getRow( 2 ) );
+    print( mat.getRow( 3 ) );
+}
+
+VECTORMATH_FORCE_INLINE void print( const Matrix4 & mat, const char * name )
+{
+    printf("%s:\n", name);
+    print( mat );
+}
+
+#endif
+
+VECTORMATH_FORCE_INLINE Transform3::Transform3( const Transform3 & tfrm )
+{
+    mCol0 = tfrm.mCol0;
+    mCol1 = tfrm.mCol1;
+    mCol2 = tfrm.mCol2;
+    mCol3 = tfrm.mCol3;
+}
+
+VECTORMATH_FORCE_INLINE Transform3::Transform3( float scalar )
+{
+    mCol0 = Vector3( scalar );
+    mCol1 = Vector3( scalar );
+    mCol2 = Vector3( scalar );
+    mCol3 = Vector3( scalar );
+}
+
+VECTORMATH_FORCE_INLINE Transform3::Transform3( const floatInVec &scalar )
+{
+    mCol0 = Vector3( scalar );
+    mCol1 = Vector3( scalar );
+    mCol2 = Vector3( scalar );
+    mCol3 = Vector3( scalar );
+}
+
+VECTORMATH_FORCE_INLINE Transform3::Transform3( const Vector3 &_col0, const Vector3 &_col1, const Vector3 &_col2, const Vector3 &_col3 )
+{
+    mCol0 = _col0;
+    mCol1 = _col1;
+    mCol2 = _col2;
+    mCol3 = _col3;
+}
+
+VECTORMATH_FORCE_INLINE Transform3::Transform3( const Matrix3 & tfrm, const Vector3 &translateVec )
+{
+    this->setUpper3x3( tfrm );
+    this->setTranslation( translateVec );
+}
+
+VECTORMATH_FORCE_INLINE Transform3::Transform3( const Quat &unitQuat, const Vector3 &translateVec )
+{
+    this->setUpper3x3( Matrix3( unitQuat ) );
+    this->setTranslation( translateVec );
+}
+
+VECTORMATH_FORCE_INLINE Transform3 & Transform3::setCol0( const Vector3 &_col0 )
+{
+    mCol0 = _col0;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Transform3 & Transform3::setCol1( const Vector3 &_col1 )
+{
+    mCol1 = _col1;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Transform3 & Transform3::setCol2( const Vector3 &_col2 )
+{
+    mCol2 = _col2;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Transform3 & Transform3::setCol3( const Vector3 &_col3 )
+{
+    mCol3 = _col3;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Transform3 & Transform3::setCol( int col, const Vector3 &vec )
+{
+    *(&mCol0 + col) = vec;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Transform3 & Transform3::setRow( int row, const Vector4 &vec )
+{
+    mCol0.setElem( row, vec.getElem( 0 ) );
+    mCol1.setElem( row, vec.getElem( 1 ) );
+    mCol2.setElem( row, vec.getElem( 2 ) );
+    mCol3.setElem( row, vec.getElem( 3 ) );
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Transform3 & Transform3::setElem( int col, int row, float val )
+{
+    (*this)[col].setElem(row, val);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Transform3 & Transform3::setElem( int col, int row, const floatInVec &val )
+{
+    Vector3 tmpV3_0;
+    tmpV3_0 = this->getCol( col );
+    tmpV3_0.setElem( row, val );
+    this->setCol( col, tmpV3_0 );
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec Transform3::getElem( int col, int row ) const
+{
+    return this->getCol( col ).getElem( row );
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Transform3::getCol0( ) const
+{
+    return mCol0;
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Transform3::getCol1( ) const
+{
+    return mCol1;
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Transform3::getCol2( ) const
+{
+    return mCol2;
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Transform3::getCol3( ) const
+{
+    return mCol3;
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Transform3::getCol( int col ) const
+{
+    return *(&mCol0 + col);
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 Transform3::getRow( int row ) const
+{
+    return Vector4( mCol0.getElem( row ), mCol1.getElem( row ), mCol2.getElem( row ), mCol3.getElem( row ) );
+}
+
+VECTORMATH_FORCE_INLINE Vector3 & Transform3::operator []( int col )
+{
+    return *(&mCol0 + col);
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Transform3::operator []( int col ) const
+{
+    return *(&mCol0 + col);
+}
+
+VECTORMATH_FORCE_INLINE Transform3 & Transform3::operator =( const Transform3 & tfrm )
+{
+    mCol0 = tfrm.mCol0;
+    mCol1 = tfrm.mCol1;
+    mCol2 = tfrm.mCol2;
+    mCol3 = tfrm.mCol3;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const Transform3 inverse( const Transform3 & tfrm )
+{
+    __m128 inv0, inv1, inv2, inv3;
+    __m128 tmp0, tmp1, tmp2, tmp3, tmp4, dot, invdet;
+    __m128 xxxx, yyyy, zzzz;
+    tmp2 = _vmathVfCross( tfrm.getCol0().get128(), tfrm.getCol1().get128() );
+    tmp0 = _vmathVfCross( tfrm.getCol1().get128(), tfrm.getCol2().get128() );
+    tmp1 = _vmathVfCross( tfrm.getCol2().get128(), tfrm.getCol0().get128() );
+    inv3 = negatef4( tfrm.getCol3().get128() );
+    dot = _vmathVfDot3( tmp2, tfrm.getCol2().get128() );
+    dot = vec_splat( dot, 0 );
+    invdet = recipf4( dot );
+    tmp3 = vec_mergeh( tmp0, tmp2 );
+    tmp4 = vec_mergel( tmp0, tmp2 );
+    inv0 = vec_mergeh( tmp3, tmp1 );
+    xxxx = vec_splat( inv3, 0 );
+    //inv1 = vec_perm( tmp3, tmp1, _VECTORMATH_PERM_ZBWX );
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+	inv1 = _mm_shuffle_ps( tmp3, tmp3, _MM_SHUFFLE(0,3,2,2));
+	inv1 = vec_sel(inv1, tmp1, select_y);
+    //inv2 = vec_perm( tmp4, tmp1, _VECTORMATH_PERM_XCYX );
+	inv2 = _mm_shuffle_ps( tmp4, tmp4, _MM_SHUFFLE(0,1,1,0));
+	inv2 = vec_sel(inv2, vec_splat(tmp1, 2), select_y);
+    yyyy = vec_splat( inv3, 1 );
+    zzzz = vec_splat( inv3, 2 );
+    inv3 = vec_mul( inv0, xxxx );
+    inv3 = vec_madd( inv1, yyyy, inv3 );
+    inv3 = vec_madd( inv2, zzzz, inv3 );
+    inv0 = vec_mul( inv0, invdet );
+    inv1 = vec_mul( inv1, invdet );
+    inv2 = vec_mul( inv2, invdet );
+    inv3 = vec_mul( inv3, invdet );
+    return Transform3(
+        Vector3( inv0 ),
+        Vector3( inv1 ),
+        Vector3( inv2 ),
+        Vector3( inv3 )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Transform3 orthoInverse( const Transform3 & tfrm )
+{
+    __m128 inv0, inv1, inv2, inv3;
+    __m128 tmp0, tmp1;
+    __m128 xxxx, yyyy, zzzz;
+    tmp0 = vec_mergeh( tfrm.getCol0().get128(), tfrm.getCol2().get128() );
+    tmp1 = vec_mergel( tfrm.getCol0().get128(), tfrm.getCol2().get128() );
+    inv3 = negatef4( tfrm.getCol3().get128() );
+    inv0 = vec_mergeh( tmp0, tfrm.getCol1().get128() );
+    xxxx = vec_splat( inv3, 0 );
+    //inv1 = vec_perm( tmp0, tfrm.getCol1().get128(), _VECTORMATH_PERM_ZBWX );
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+	inv1 = _mm_shuffle_ps( tmp0, tmp0, _MM_SHUFFLE(0,3,2,2));
+	inv1 = vec_sel(inv1, tfrm.getCol1().get128(), select_y);
+    //inv2 = vec_perm( tmp1, tfrm.getCol1().get128(), _VECTORMATH_PERM_XCYX );
+	inv2 = _mm_shuffle_ps( tmp1, tmp1, _MM_SHUFFLE(0,1,1,0));
+	inv2 = vec_sel(inv2, vec_splat(tfrm.getCol1().get128(), 2), select_y);
+    yyyy = vec_splat( inv3, 1 );
+    zzzz = vec_splat( inv3, 2 );
+    inv3 = vec_mul( inv0, xxxx );
+    inv3 = vec_madd( inv1, yyyy, inv3 );
+    inv3 = vec_madd( inv2, zzzz, inv3 );
+    return Transform3(
+        Vector3( inv0 ),
+        Vector3( inv1 ),
+        Vector3( inv2 ),
+        Vector3( inv3 )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Transform3 absPerElem( const Transform3 & tfrm )
+{
+    return Transform3(
+        absPerElem( tfrm.getCol0() ),
+        absPerElem( tfrm.getCol1() ),
+        absPerElem( tfrm.getCol2() ),
+        absPerElem( tfrm.getCol3() )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Transform3::operator *( const Vector3 &vec ) const
+{
+    __m128 res;
+    __m128 xxxx, yyyy, zzzz;
+    xxxx = vec_splat( vec.get128(), 0 );
+    yyyy = vec_splat( vec.get128(), 1 );
+    zzzz = vec_splat( vec.get128(), 2 );
+    res = vec_mul( mCol0.get128(), xxxx );
+    res = vec_madd( mCol1.get128(), yyyy, res );
+    res = vec_madd( mCol2.get128(), zzzz, res );
+    return Vector3( res );
+}
+
+VECTORMATH_FORCE_INLINE const Point3 Transform3::operator *( const Point3 &pnt ) const
+{
+    __m128 tmp0, tmp1, res;
+    __m128 xxxx, yyyy, zzzz;
+    xxxx = vec_splat( pnt.get128(), 0 );
+    yyyy = vec_splat( pnt.get128(), 1 );
+    zzzz = vec_splat( pnt.get128(), 2 );
+    tmp0 = vec_mul( mCol0.get128(), xxxx );
+    tmp1 = vec_mul( mCol1.get128(), yyyy );
+    tmp0 = vec_madd( mCol2.get128(), zzzz, tmp0 );
+    tmp1 = vec_add( mCol3.get128(), tmp1 );
+    res = vec_add( tmp0, tmp1 );
+    return Point3( res );
+}
+
+VECTORMATH_FORCE_INLINE const Transform3 Transform3::operator *( const Transform3 & tfrm ) const
+{
+    return Transform3(
+        ( *this * tfrm.mCol0 ),
+        ( *this * tfrm.mCol1 ),
+        ( *this * tfrm.mCol2 ),
+        Vector3( ( *this * Point3( tfrm.mCol3 ) ) )
+    );
+}
+
+VECTORMATH_FORCE_INLINE Transform3 & Transform3::operator *=( const Transform3 & tfrm )
+{
+    *this = *this * tfrm;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const Transform3 mulPerElem( const Transform3 & tfrm0, const Transform3 & tfrm1 )
+{
+    return Transform3(
+        mulPerElem( tfrm0.getCol0(), tfrm1.getCol0() ),
+        mulPerElem( tfrm0.getCol1(), tfrm1.getCol1() ),
+        mulPerElem( tfrm0.getCol2(), tfrm1.getCol2() ),
+        mulPerElem( tfrm0.getCol3(), tfrm1.getCol3() )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Transform3 Transform3::identity( )
+{
+    return Transform3(
+        Vector3::xAxis( ),
+        Vector3::yAxis( ),
+        Vector3::zAxis( ),
+        Vector3( 0.0f )
+    );
+}
+
+VECTORMATH_FORCE_INLINE Transform3 & Transform3::setUpper3x3( const Matrix3 & tfrm )
+{
+    mCol0 = tfrm.getCol0();
+    mCol1 = tfrm.getCol1();
+    mCol2 = tfrm.getCol2();
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 Transform3::getUpper3x3( ) const
+{
+    return Matrix3( mCol0, mCol1, mCol2 );
+}
+
+VECTORMATH_FORCE_INLINE Transform3 & Transform3::setTranslation( const Vector3 &translateVec )
+{
+    mCol3 = translateVec;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Transform3::getTranslation( ) const
+{
+    return mCol3;
+}
+
+VECTORMATH_FORCE_INLINE const Transform3 Transform3::rotationX( float radians )
+{
+    return rotationX( floatInVec(radians) );
+}
+
+VECTORMATH_FORCE_INLINE const Transform3 Transform3::rotationX( const floatInVec &radians )
+{
+    __m128 s, c, res1, res2;
+    __m128 zero;
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
+    zero = _mm_setzero_ps();
+    sincosf4( radians.get128(), &s, &c );
+    res1 = vec_sel( zero, c, select_y );
+    res1 = vec_sel( res1, s, select_z );
+    res2 = vec_sel( zero, negatef4(s), select_y );
+    res2 = vec_sel( res2, c, select_z );
+    return Transform3(
+        Vector3::xAxis( ),
+        Vector3( res1 ),
+        Vector3( res2 ),
+        Vector3( _mm_setzero_ps() )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Transform3 Transform3::rotationY( float radians )
+{
+    return rotationY( floatInVec(radians) );
+}
+
+VECTORMATH_FORCE_INLINE const Transform3 Transform3::rotationY( const floatInVec &radians )
+{
+    __m128 s, c, res0, res2;
+    __m128 zero;
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
+    zero = _mm_setzero_ps();
+    sincosf4( radians.get128(), &s, &c );
+    res0 = vec_sel( zero, c, select_x );
+    res0 = vec_sel( res0, negatef4(s), select_z );
+    res2 = vec_sel( zero, s, select_x );
+    res2 = vec_sel( res2, c, select_z );
+    return Transform3(
+        Vector3( res0 ),
+        Vector3::yAxis( ),
+        Vector3( res2 ),
+        Vector3( 0.0f )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Transform3 Transform3::rotationZ( float radians )
+{
+    return rotationZ( floatInVec(radians) );
+}
+
+VECTORMATH_FORCE_INLINE const Transform3 Transform3::rotationZ( const floatInVec &radians )
+{
+    __m128 s, c, res0, res1;
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+    __m128 zero = _mm_setzero_ps();
+    sincosf4( radians.get128(), &s, &c );
+    res0 = vec_sel( zero, c, select_x );
+    res0 = vec_sel( res0, s, select_y );
+    res1 = vec_sel( zero, negatef4(s), select_x );
+    res1 = vec_sel( res1, c, select_y );
+    return Transform3(
+        Vector3( res0 ),
+        Vector3( res1 ),
+        Vector3::zAxis( ),
+        Vector3( 0.0f )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Transform3 Transform3::rotationZYX( const Vector3 &radiansXYZ )
+{
+    __m128 angles, s, negS, c, X0, X1, Y0, Y1, Z0, Z1, tmp;
+    angles = Vector4( radiansXYZ, 0.0f ).get128();
+    sincosf4( angles, &s, &c );
+    negS = negatef4( s );
+    Z0 = vec_mergel( c, s );
+    Z1 = vec_mergel( negS, c );
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_xyz[4] = {0xffffffff, 0xffffffff, 0xffffffff, 0};
+    Z1 = vec_and( Z1, _mm_load_ps( (float *)select_xyz ) );
+	Y0 = _mm_shuffle_ps( c, negS, _MM_SHUFFLE(0,1,1,1) );
+	Y1 = _mm_shuffle_ps( s, c, _MM_SHUFFLE(0,1,1,1) );
+    X0 = vec_splat( s, 0 );
+    X1 = vec_splat( c, 0 );
+    tmp = vec_mul( Z0, Y1 );
+    return Transform3(
+        Vector3( vec_mul( Z0, Y0 ) ),
+        Vector3( vec_madd( Z1, X1, vec_mul( tmp, X0 ) ) ),
+        Vector3( vec_nmsub( Z1, X0, vec_mul( tmp, X1 ) ) ),
+        Vector3( 0.0f )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Transform3 Transform3::rotation( float radians, const Vector3 &unitVec )
+{
+    return rotation( floatInVec(radians), unitVec );
+}
+
+VECTORMATH_FORCE_INLINE const Transform3 Transform3::rotation( const floatInVec &radians, const Vector3 &unitVec )
+{
+    return Transform3( Matrix3::rotation( radians, unitVec ), Vector3( 0.0f ) );
+}
+
+VECTORMATH_FORCE_INLINE const Transform3 Transform3::rotation( const Quat &unitQuat )
+{
+    return Transform3( Matrix3( unitQuat ), Vector3( 0.0f ) );
+}
+
+VECTORMATH_FORCE_INLINE const Transform3 Transform3::scale( const Vector3 &scaleVec )
+{
+    __m128 zero = _mm_setzero_ps();
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
+    return Transform3(
+        Vector3( vec_sel( zero, scaleVec.get128(), select_x ) ),
+        Vector3( vec_sel( zero, scaleVec.get128(), select_y ) ),
+        Vector3( vec_sel( zero, scaleVec.get128(), select_z ) ),
+        Vector3( 0.0f )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Transform3 appendScale( const Transform3 & tfrm, const Vector3 &scaleVec )
+{
+    return Transform3(
+        ( tfrm.getCol0() * scaleVec.getX( ) ),
+        ( tfrm.getCol1() * scaleVec.getY( ) ),
+        ( tfrm.getCol2() * scaleVec.getZ( ) ),
+        tfrm.getCol3()
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Transform3 prependScale( const Vector3 &scaleVec, const Transform3 & tfrm )
+{
+    return Transform3(
+        mulPerElem( tfrm.getCol0(), scaleVec ),
+        mulPerElem( tfrm.getCol1(), scaleVec ),
+        mulPerElem( tfrm.getCol2(), scaleVec ),
+        mulPerElem( tfrm.getCol3(), scaleVec )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Transform3 Transform3::translation( const Vector3 &translateVec )
+{
+    return Transform3(
+        Vector3::xAxis( ),
+        Vector3::yAxis( ),
+        Vector3::zAxis( ),
+        translateVec
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Transform3 select( const Transform3 & tfrm0, const Transform3 & tfrm1, bool select1 )
+{
+    return Transform3(
+        select( tfrm0.getCol0(), tfrm1.getCol0(), select1 ),
+        select( tfrm0.getCol1(), tfrm1.getCol1(), select1 ),
+        select( tfrm0.getCol2(), tfrm1.getCol2(), select1 ),
+        select( tfrm0.getCol3(), tfrm1.getCol3(), select1 )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Transform3 select( const Transform3 & tfrm0, const Transform3 & tfrm1, const boolInVec &select1 )
+{
+    return Transform3(
+        select( tfrm0.getCol0(), tfrm1.getCol0(), select1 ),
+        select( tfrm0.getCol1(), tfrm1.getCol1(), select1 ),
+        select( tfrm0.getCol2(), tfrm1.getCol2(), select1 ),
+        select( tfrm0.getCol3(), tfrm1.getCol3(), select1 )
+    );
+}
+
+#ifdef _VECTORMATH_DEBUG
+
+VECTORMATH_FORCE_INLINE void print( const Transform3 & tfrm )
+{
+    print( tfrm.getRow( 0 ) );
+    print( tfrm.getRow( 1 ) );
+    print( tfrm.getRow( 2 ) );
+}
+
+VECTORMATH_FORCE_INLINE void print( const Transform3 & tfrm, const char * name )
+{
+    printf("%s:\n", name);
+    print( tfrm );
+}
+
+#endif
+
+VECTORMATH_FORCE_INLINE Quat::Quat( const Matrix3 & tfrm )
+{
+    __m128 res;
+    __m128 col0, col1, col2;
+    __m128 xx_yy, xx_yy_zz_xx, yy_zz_xx_yy, zz_xx_yy_zz, diagSum, diagDiff;
+    __m128 zy_xz_yx, yz_zx_xy, sum, diff;
+    __m128 radicand, invSqrt, scale;
+    __m128 res0, res1, res2, res3;
+    __m128 xx, yy, zz;
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_w[4] = {0, 0, 0, 0xffffffff};
+
+    col0 = tfrm.getCol0().get128();
+    col1 = tfrm.getCol1().get128();
+    col2 = tfrm.getCol2().get128();
+
+    /* four cases: */
+    /* trace > 0 */
+    /* else */
+    /*    xx largest diagonal element */
+    /*    yy largest diagonal element */
+    /*    zz largest diagonal element */
+
+    /* compute quaternion for each case */
+
+    xx_yy = vec_sel( col0, col1, select_y );
+    //xx_yy_zz_xx = vec_perm( xx_yy, col2, _VECTORMATH_PERM_XYCX );
+    //yy_zz_xx_yy = vec_perm( xx_yy, col2, _VECTORMATH_PERM_YCXY );
+    //zz_xx_yy_zz = vec_perm( xx_yy, col2, _VECTORMATH_PERM_CXYC );
+    xx_yy_zz_xx = _mm_shuffle_ps( xx_yy, xx_yy, _MM_SHUFFLE(0,0,1,0) );
+    xx_yy_zz_xx = vec_sel( xx_yy_zz_xx, col2, select_z ); // TODO: Ck
+    yy_zz_xx_yy = _mm_shuffle_ps( xx_yy_zz_xx, xx_yy_zz_xx, _MM_SHUFFLE(1,0,2,1) );
+    zz_xx_yy_zz = _mm_shuffle_ps( xx_yy_zz_xx, xx_yy_zz_xx, _MM_SHUFFLE(2,1,0,2) );
+
+    diagSum = vec_add( vec_add( xx_yy_zz_xx, yy_zz_xx_yy ), zz_xx_yy_zz );
+    diagDiff = vec_sub( vec_sub( xx_yy_zz_xx, yy_zz_xx_yy ), zz_xx_yy_zz );
+    radicand = vec_add( vec_sel( diagDiff, diagSum, select_w ), _mm_set1_ps(1.0f) );
+ //   invSqrt = rsqrtf4( radicand );
+	invSqrt = newtonrapson_rsqrt4( radicand );
+
+	
+
+    zy_xz_yx = vec_sel( col0, col1, select_z );									// zy_xz_yx = 00 01 12 03
+    //zy_xz_yx = vec_perm( zy_xz_yx, col2, _VECTORMATH_PERM_ZAYX );
+	zy_xz_yx = _mm_shuffle_ps( zy_xz_yx, zy_xz_yx, _MM_SHUFFLE(0,1,2,2) );		// zy_xz_yx = 12 12 01 00
+    zy_xz_yx = vec_sel( zy_xz_yx, vec_splat(col2, 0), select_y );				// zy_xz_yx = 12 20 01 00
+    yz_zx_xy = vec_sel( col0, col1, select_x );									// yz_zx_xy = 10 01 02 03
+    //yz_zx_xy = vec_perm( yz_zx_xy, col2, _VECTORMATH_PERM_BZXX );
+	yz_zx_xy = _mm_shuffle_ps( yz_zx_xy, yz_zx_xy, _MM_SHUFFLE(0,0,2,0) );		// yz_zx_xy = 10 02 10 10
+	yz_zx_xy = vec_sel( yz_zx_xy, vec_splat(col2, 1), select_x );				// yz_zx_xy = 21 02 10 10
+
+    sum = vec_add( zy_xz_yx, yz_zx_xy );
+    diff = vec_sub( zy_xz_yx, yz_zx_xy );
+
+    scale = vec_mul( invSqrt, _mm_set1_ps(0.5f) );
+
+    //res0 = vec_perm( sum, diff, _VECTORMATH_PERM_XZYA );
+	res0 = _mm_shuffle_ps( sum, sum, _MM_SHUFFLE(0,1,2,0) );
+	res0 = vec_sel( res0, vec_splat(diff, 0), select_w );  // TODO: Ck
+    //res1 = vec_perm( sum, diff, _VECTORMATH_PERM_ZXXB );
+	res1 = _mm_shuffle_ps( sum, sum, _MM_SHUFFLE(0,0,0,2) );
+	res1 = vec_sel( res1, vec_splat(diff, 1), select_w );  // TODO: Ck
+    //res2 = vec_perm( sum, diff, _VECTORMATH_PERM_YXXC );
+	res2 = _mm_shuffle_ps( sum, sum, _MM_SHUFFLE(0,0,0,1) );
+	res2 = vec_sel( res2, vec_splat(diff, 2), select_w );  // TODO: Ck
+    res3 = diff;
+    res0 = vec_sel( res0, radicand, select_x );
+    res1 = vec_sel( res1, radicand, select_y );
+    res2 = vec_sel( res2, radicand, select_z );
+    res3 = vec_sel( res3, radicand, select_w );
+    res0 = vec_mul( res0, vec_splat( scale, 0 ) );
+    res1 = vec_mul( res1, vec_splat( scale, 1 ) );
+    res2 = vec_mul( res2, vec_splat( scale, 2 ) );
+    res3 = vec_mul( res3, vec_splat( scale, 3 ) );
+
+    /* determine case and select answer */
+
+    xx = vec_splat( col0, 0 );
+    yy = vec_splat( col1, 1 );
+    zz = vec_splat( col2, 2 );
+    res = vec_sel( res0, res1, vec_cmpgt( yy, xx ) );
+    res = vec_sel( res, res2, vec_and( vec_cmpgt( zz, xx ), vec_cmpgt( zz, yy ) ) );
+    res = vec_sel( res, res3, vec_cmpgt( vec_splat( diagSum, 0 ), _mm_setzero_ps() ) );
+    mVec128 = res;
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 outer( const Vector3 &tfrm0, const Vector3 &tfrm1 )
+{
+    return Matrix3(
+        ( tfrm0 * tfrm1.getX( ) ),
+        ( tfrm0 * tfrm1.getY( ) ),
+        ( tfrm0 * tfrm1.getZ( ) )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 outer( const Vector4 &tfrm0, const Vector4 &tfrm1 )
+{
+    return Matrix4(
+        ( tfrm0 * tfrm1.getX( ) ),
+        ( tfrm0 * tfrm1.getY( ) ),
+        ( tfrm0 * tfrm1.getZ( ) ),
+        ( tfrm0 * tfrm1.getW( ) )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 rowMul( const Vector3 &vec, const Matrix3 & mat )
+{
+    __m128 tmp0, tmp1, mcol0, mcol1, mcol2, res;
+    __m128 xxxx, yyyy, zzzz;
+    tmp0 = vec_mergeh( mat.getCol0().get128(), mat.getCol2().get128() );
+    tmp1 = vec_mergel( mat.getCol0().get128(), mat.getCol2().get128() );
+    xxxx = vec_splat( vec.get128(), 0 );
+    mcol0 = vec_mergeh( tmp0, mat.getCol1().get128() );
+    //mcol1 = vec_perm( tmp0, mat.getCol1().get128(), _VECTORMATH_PERM_ZBWX );
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+	mcol1 = _mm_shuffle_ps( tmp0, tmp0, _MM_SHUFFLE(0,3,2,2));
+	mcol1 = vec_sel(mcol1, mat.getCol1().get128(), select_y);
+    //mcol2 = vec_perm( tmp1, mat.getCol1().get128(), _VECTORMATH_PERM_XCYX );
+	mcol2 = _mm_shuffle_ps( tmp1, tmp1, _MM_SHUFFLE(0,1,1,0));
+	mcol2 = vec_sel(mcol2, vec_splat(mat.getCol1().get128(), 2), select_y);
+    yyyy = vec_splat( vec.get128(), 1 );
+    res = vec_mul( mcol0, xxxx );
+    zzzz = vec_splat( vec.get128(), 2 );
+    res = vec_madd( mcol1, yyyy, res );
+    res = vec_madd( mcol2, zzzz, res );
+    return Vector3( res );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 crossMatrix( const Vector3 &vec )
+{
+    __m128 neg, res0, res1, res2;
+    neg = negatef4( vec.get128() );
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
+    //res0 = vec_perm( vec.get128(), neg, _VECTORMATH_PERM_XZBX );
+	res0 = _mm_shuffle_ps( vec.get128(), vec.get128(), _MM_SHUFFLE(0,2,2,0) );
+	res0 = vec_sel(res0, vec_splat(neg, 1), select_z);
+    //res1 = vec_perm( vec.get128(), neg, _VECTORMATH_PERM_CXXX );
+	res1 = vec_sel(vec_splat(vec.get128(), 0), vec_splat(neg, 2), select_x);
+    //res2 = vec_perm( vec.get128(), neg, _VECTORMATH_PERM_YAXX );
+	res2 = _mm_shuffle_ps( vec.get128(), vec.get128(), _MM_SHUFFLE(0,0,1,1) );
+	res2 = vec_sel(res2, vec_splat(neg, 0), select_y);
+	VM_ATTRIBUTE_ALIGN16 unsigned int filter_x[4] = {0, 0xffffffff, 0xffffffff, 0xffffffff};
+	VM_ATTRIBUTE_ALIGN16 unsigned int filter_y[4] = {0xffffffff, 0, 0xffffffff, 0xffffffff};
+	VM_ATTRIBUTE_ALIGN16 unsigned int filter_z[4] = {0xffffffff, 0xffffffff, 0, 0xffffffff};
+    res0 = vec_and( res0, _mm_load_ps((float *)filter_x ) );
+    res1 = vec_and( res1, _mm_load_ps((float *)filter_y ) );
+    res2 = vec_and( res2, _mm_load_ps((float *)filter_z ) ); // TODO: Use selects?
+    return Matrix3(
+        Vector3( res0 ),
+        Vector3( res1 ),
+        Vector3( res2 )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 crossMatrixMul( const Vector3 &vec, const Matrix3 & mat )
+{
+    return Matrix3( cross( vec, mat.getCol0() ), cross( vec, mat.getCol1() ), cross( vec, mat.getCol2() ) );
+}
+
+} // namespace Aos
+} // namespace Vectormath
+
+#endif
diff --git a/src/vectormath/sse/quat_aos.h b/src/vectormath/sse/quat_aos.h
index d7091ecf8..7eac59fe5 100644
--- a/src/vectormath/sse/quat_aos.h
+++ b/src/vectormath/sse/quat_aos.h
@@ -1,579 +1,579 @@
-/*
-   Copyright (C) 2006, 2007 Sony Computer Entertainment Inc.
-   All rights reserved.
-
-   Redistribution and use in source and binary forms,
-   with or without modification, are permitted provided that the
-   following conditions are met:
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    * Neither the name of the Sony Computer Entertainment Inc nor the names
-      of its contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-   POSSIBILITY OF SUCH DAMAGE.
-*/
-
-
-#ifndef _VECTORMATH_QUAT_AOS_CPP_H
-#define _VECTORMATH_QUAT_AOS_CPP_H
-
-//-----------------------------------------------------------------------------
-// Definitions
-
-#ifndef _VECTORMATH_INTERNAL_FUNCTIONS
-#define _VECTORMATH_INTERNAL_FUNCTIONS
-
-#endif
-
-namespace Vectormath {
-namespace Aos {
-
-VECTORMATH_FORCE_INLINE void Quat::set128(vec_float4 vec)
-{
-    mVec128 = vec;
-}
-
-VECTORMATH_FORCE_INLINE Quat::Quat( const floatInVec &_x, const floatInVec &_y, const floatInVec &_z, const floatInVec &_w )
-{
-	mVec128 = _mm_unpacklo_ps(
-		_mm_unpacklo_ps( _x.get128(), _z.get128() ),
-		_mm_unpacklo_ps( _y.get128(), _w.get128() ) );
-}
-
-VECTORMATH_FORCE_INLINE Quat::Quat( const Vector3 &xyz, float _w )
-{
-    mVec128 = xyz.get128();
-    _vmathVfSetElement(mVec128, _w, 3);
-}
-
-
-
-VECTORMATH_FORCE_INLINE  Quat::Quat(const Quat& quat)
-{
-	mVec128 = quat.get128();
-}
-
-VECTORMATH_FORCE_INLINE Quat::Quat( float _x, float _y, float _z, float _w )
-{
-	mVec128 = _mm_setr_ps(_x, _y, _z, _w);
-}
-
-
-
-
-
-VECTORMATH_FORCE_INLINE Quat::Quat( const Vector3 &xyz, const floatInVec &_w )
-{
-    mVec128 = xyz.get128();
-    mVec128 = _vmathVfInsert(mVec128, _w.get128(), 3);
-}
-
-VECTORMATH_FORCE_INLINE Quat::Quat( const Vector4 &vec )
-{
-    mVec128 = vec.get128();
-}
-
-VECTORMATH_FORCE_INLINE Quat::Quat( float scalar )
-{
-    mVec128 = floatInVec(scalar).get128();
-}
-
-VECTORMATH_FORCE_INLINE Quat::Quat( const floatInVec &scalar )
-{
-    mVec128 = scalar.get128();
-}
-
-VECTORMATH_FORCE_INLINE Quat::Quat( __m128 vf4 )
-{
-    mVec128 = vf4;
-}
-
-VECTORMATH_FORCE_INLINE const Quat Quat::identity( )
-{
-    return Quat( _VECTORMATH_UNIT_0001 );
-}
-
-VECTORMATH_FORCE_INLINE const Quat lerp( float t, const Quat &quat0, const Quat &quat1 )
-{
-    return lerp( floatInVec(t), quat0, quat1 );
-}
-
-VECTORMATH_FORCE_INLINE const Quat lerp( const floatInVec &t, const Quat &quat0, const Quat &quat1 )
-{
-    return ( quat0 + ( ( quat1 - quat0 ) * t ) );
-}
-
-VECTORMATH_FORCE_INLINE const Quat slerp( float t, const Quat &unitQuat0, const Quat &unitQuat1 )
-{
-    return slerp( floatInVec(t), unitQuat0, unitQuat1 );
-}
-
-VECTORMATH_FORCE_INLINE const Quat slerp( const floatInVec &t, const Quat &unitQuat0, const Quat &unitQuat1 )
-{
-    Quat start;
-    vec_float4 scales, scale0, scale1, cosAngle, angle, tttt, oneMinusT, angles, sines;
-    __m128 selectMask;
-    cosAngle = _vmathVfDot4( unitQuat0.get128(), unitQuat1.get128() );
-    selectMask = (__m128)vec_cmpgt( _mm_setzero_ps(), cosAngle );
-    cosAngle = vec_sel( cosAngle, negatef4( cosAngle ), selectMask );
-    start = Quat( vec_sel( unitQuat0.get128(), negatef4( unitQuat0.get128() ), selectMask ) );
-    selectMask = (__m128)vec_cmpgt( _mm_set1_ps(_VECTORMATH_SLERP_TOL), cosAngle );
-    angle = acosf4( cosAngle );
-    tttt = t.get128();
-    oneMinusT = vec_sub( _mm_set1_ps(1.0f), tttt );
-    angles = vec_mergeh( _mm_set1_ps(1.0f), tttt );
-    angles = vec_mergeh( angles, oneMinusT );
-    angles = vec_madd( angles, angle, _mm_setzero_ps() );
-    sines = sinf4( angles );
-    scales = _mm_div_ps( sines, vec_splat( sines, 0 ) );
-    scale0 = vec_sel( oneMinusT, vec_splat( scales, 1 ), selectMask );
-    scale1 = vec_sel( tttt, vec_splat( scales, 2 ), selectMask );
-    return Quat( vec_madd( start.get128(), scale0, vec_mul( unitQuat1.get128(), scale1 ) ) );
-}
-
-VECTORMATH_FORCE_INLINE const Quat squad( float t, const Quat &unitQuat0, const Quat &unitQuat1, const Quat &unitQuat2, const Quat &unitQuat3 )
-{
-    return squad( floatInVec(t), unitQuat0, unitQuat1, unitQuat2, unitQuat3 );
-}
-
-VECTORMATH_FORCE_INLINE const Quat squad( const floatInVec &t, const Quat &unitQuat0, const Quat &unitQuat1, const Quat &unitQuat2, const Quat &unitQuat3 )
-{
-    return slerp( ( ( floatInVec(2.0f) * t ) * ( floatInVec(1.0f) - t ) ), slerp( t, unitQuat0, unitQuat3 ), slerp( t, unitQuat1, unitQuat2 ) );
-}
-
-VECTORMATH_FORCE_INLINE __m128 Quat::get128( ) const
-{
-    return mVec128;
-}
-
-VECTORMATH_FORCE_INLINE Quat & Quat::operator =( const Quat &quat )
-{
-    mVec128 = quat.mVec128;
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Quat & Quat::setXYZ( const Vector3 &vec )
-{
-	VM_ATTRIBUTE_ALIGN16 unsigned int sw[4] = {0, 0, 0, 0xffffffff};
-	mVec128 = vec_sel( vec.get128(), mVec128, sw );
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE const Vector3 Quat::getXYZ( ) const
-{
-    return Vector3( mVec128 );
-}
-
-VECTORMATH_FORCE_INLINE Quat & Quat::setX( float _x )
-{
-    _vmathVfSetElement(mVec128, _x, 0);
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Quat & Quat::setX( const floatInVec &_x )
-{
-    mVec128 = _vmathVfInsert(mVec128, _x.get128(), 0);
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE const floatInVec Quat::getX( ) const
-{
-    return floatInVec( mVec128, 0 );
-}
-
-VECTORMATH_FORCE_INLINE Quat & Quat::setY( float _y )
-{
-    _vmathVfSetElement(mVec128, _y, 1);
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Quat & Quat::setY( const floatInVec &_y )
-{
-    mVec128 = _vmathVfInsert(mVec128, _y.get128(), 1);
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE const floatInVec Quat::getY( ) const
-{
-    return floatInVec( mVec128, 1 );
-}
-
-VECTORMATH_FORCE_INLINE Quat & Quat::setZ( float _z )
-{
-    _vmathVfSetElement(mVec128, _z, 2);
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Quat & Quat::setZ( const floatInVec &_z )
-{
-    mVec128 = _vmathVfInsert(mVec128, _z.get128(), 2);
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE const floatInVec Quat::getZ( ) const
-{
-    return floatInVec( mVec128, 2 );
-}
-
-VECTORMATH_FORCE_INLINE Quat & Quat::setW( float _w )
-{
-    _vmathVfSetElement(mVec128, _w, 3);
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Quat & Quat::setW( const floatInVec &_w )
-{
-    mVec128 = _vmathVfInsert(mVec128, _w.get128(), 3);
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE const floatInVec Quat::getW( ) const
-{
-    return floatInVec( mVec128, 3 );
-}
-
-VECTORMATH_FORCE_INLINE Quat & Quat::setElem( int idx, float value )
-{
-    _vmathVfSetElement(mVec128, value, idx);
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Quat & Quat::setElem( int idx, const floatInVec &value )
-{
-    mVec128 = _vmathVfInsert(mVec128, value.get128(), idx);
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE const floatInVec Quat::getElem( int idx ) const
-{
-    return floatInVec( mVec128, idx );
-}
-
-VECTORMATH_FORCE_INLINE VecIdx Quat::operator []( int idx )
-{
-    return VecIdx( mVec128, idx );
-}
-
-VECTORMATH_FORCE_INLINE const floatInVec Quat::operator []( int idx ) const
-{
-    return floatInVec( mVec128, idx );
-}
-
-VECTORMATH_FORCE_INLINE const Quat Quat::operator +( const Quat &quat ) const
-{
-    return Quat( _mm_add_ps( mVec128, quat.mVec128 ) );
-}
-
-
-VECTORMATH_FORCE_INLINE const Quat Quat::operator -( const Quat &quat ) const
-{
-    return Quat( _mm_sub_ps( mVec128, quat.mVec128 ) );
-}
-
-VECTORMATH_FORCE_INLINE const Quat Quat::operator *( float scalar ) const
-{
-    return *this * floatInVec(scalar);
-}
-
-VECTORMATH_FORCE_INLINE const Quat Quat::operator *( const floatInVec &scalar ) const
-{
-    return Quat( _mm_mul_ps( mVec128, scalar.get128() ) );
-}
-
-VECTORMATH_FORCE_INLINE Quat & Quat::operator +=( const Quat &quat )
-{
-    *this = *this + quat;
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Quat & Quat::operator -=( const Quat &quat )
-{
-    *this = *this - quat;
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Quat & Quat::operator *=( float scalar )
-{
-    *this = *this * scalar;
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Quat & Quat::operator *=( const floatInVec &scalar )
-{
-    *this = *this * scalar;
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE const Quat Quat::operator /( float scalar ) const
-{
-    return *this / floatInVec(scalar);
-}
-
-VECTORMATH_FORCE_INLINE const Quat Quat::operator /( const floatInVec &scalar ) const
-{
-    return Quat( _mm_div_ps( mVec128, scalar.get128() ) );
-}
-
-VECTORMATH_FORCE_INLINE Quat & Quat::operator /=( float scalar )
-{
-    *this = *this / scalar;
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Quat & Quat::operator /=( const floatInVec &scalar )
-{
-    *this = *this / scalar;
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE const Quat Quat::operator -( ) const
-{
-	return Quat(_mm_sub_ps( _mm_setzero_ps(), mVec128 ) );
-}
-
-VECTORMATH_FORCE_INLINE const Quat operator *( float scalar, const Quat &quat )
-{
-    return floatInVec(scalar) * quat;
-}
-
-VECTORMATH_FORCE_INLINE const Quat operator *( const floatInVec &scalar, const Quat &quat )
-{
-    return quat * scalar;
-}
-
-VECTORMATH_FORCE_INLINE const floatInVec dot( const Quat &quat0, const Quat &quat1 )
-{
-    return floatInVec( _vmathVfDot4( quat0.get128(), quat1.get128() ), 0 );
-}
-
-VECTORMATH_FORCE_INLINE const floatInVec norm( const Quat &quat )
-{
-    return floatInVec(  _vmathVfDot4( quat.get128(), quat.get128() ), 0 );
-}
-
-VECTORMATH_FORCE_INLINE const floatInVec length( const Quat &quat )
-{
-    return floatInVec(  _mm_sqrt_ps(_vmathVfDot4( quat.get128(), quat.get128() )), 0 );
-}
-
-VECTORMATH_FORCE_INLINE const Quat normalize( const Quat &quat )
-{
-	vec_float4 dot =_vmathVfDot4( quat.get128(), quat.get128());
-    return Quat( _mm_mul_ps( quat.get128(), newtonrapson_rsqrt4( dot ) ) );
-}
-
-
-VECTORMATH_FORCE_INLINE const Quat Quat::rotation( const Vector3 &unitVec0, const Vector3 &unitVec1 )
-{
-    Vector3 crossVec;
-    __m128 cosAngle, cosAngleX2Plus2, recipCosHalfAngleX2, cosHalfAngleX2, res;
-    cosAngle = _vmathVfDot3( unitVec0.get128(), unitVec1.get128() );
-    cosAngleX2Plus2 = vec_madd( cosAngle, _mm_set1_ps(2.0f), _mm_set1_ps(2.0f) );
-    recipCosHalfAngleX2 = _mm_rsqrt_ps( cosAngleX2Plus2 );
-    cosHalfAngleX2 = vec_mul( recipCosHalfAngleX2, cosAngleX2Plus2 );
-    crossVec = cross( unitVec0, unitVec1 );
-    res = vec_mul( crossVec.get128(), recipCosHalfAngleX2 );
-	VM_ATTRIBUTE_ALIGN16 unsigned int sw[4] = {0, 0, 0, 0xffffffff};
-    res = vec_sel( res, vec_mul( cosHalfAngleX2, _mm_set1_ps(0.5f) ), sw );
-    return Quat( res );
-}
-
-VECTORMATH_FORCE_INLINE const Quat Quat::rotation( float radians, const Vector3 &unitVec )
-{
-    return rotation( floatInVec(radians), unitVec );
-}
-
-VECTORMATH_FORCE_INLINE const Quat Quat::rotation( const floatInVec &radians, const Vector3 &unitVec )
-{
-    __m128 s, c, angle, res;
-    angle = vec_mul( radians.get128(), _mm_set1_ps(0.5f) );
-    sincosf4( angle, &s, &c );
-	VM_ATTRIBUTE_ALIGN16 unsigned int sw[4] = {0, 0, 0, 0xffffffff};
-    res = vec_sel( vec_mul( unitVec.get128(), s ), c, sw );
-    return Quat( res );
-}
-
-VECTORMATH_FORCE_INLINE const Quat Quat::rotationX( float radians )
-{
-    return rotationX( floatInVec(radians) );
-}
-
-VECTORMATH_FORCE_INLINE const Quat Quat::rotationX( const floatInVec &radians )
-{
-    __m128 s, c, angle, res;
-    angle = vec_mul( radians.get128(), _mm_set1_ps(0.5f) );
-    sincosf4( angle, &s, &c );
-	VM_ATTRIBUTE_ALIGN16 unsigned int xsw[4] = {0xffffffff, 0, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int wsw[4] = {0, 0, 0, 0xffffffff};
-    res = vec_sel( _mm_setzero_ps(), s, xsw );
-    res = vec_sel( res, c, wsw );
-    return Quat( res );
-}
-
-VECTORMATH_FORCE_INLINE const Quat Quat::rotationY( float radians )
-{
-    return rotationY( floatInVec(radians) );
-}
-
-VECTORMATH_FORCE_INLINE const Quat Quat::rotationY( const floatInVec &radians )
-{
-    __m128 s, c, angle, res;
-    angle = vec_mul( radians.get128(), _mm_set1_ps(0.5f) );
-    sincosf4( angle, &s, &c );
-	VM_ATTRIBUTE_ALIGN16 unsigned int ysw[4] = {0, 0xffffffff, 0, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int wsw[4] = {0, 0, 0, 0xffffffff};
-    res = vec_sel( _mm_setzero_ps(), s, ysw );
-    res = vec_sel( res, c, wsw );
-    return Quat( res );
-}
-
-VECTORMATH_FORCE_INLINE const Quat Quat::rotationZ( float radians )
-{
-    return rotationZ( floatInVec(radians) );
-}
-
-VECTORMATH_FORCE_INLINE const Quat Quat::rotationZ( const floatInVec &radians )
-{
-    __m128 s, c, angle, res;
-    angle = vec_mul( radians.get128(), _mm_set1_ps(0.5f) );
-    sincosf4( angle, &s, &c );
-	VM_ATTRIBUTE_ALIGN16 unsigned int zsw[4] = {0, 0, 0xffffffff, 0};
-	VM_ATTRIBUTE_ALIGN16 unsigned int wsw[4] = {0, 0, 0, 0xffffffff};
-    res = vec_sel( _mm_setzero_ps(), s, zsw );
-    res = vec_sel( res, c, wsw );
-    return Quat( res );
-}
-
-VECTORMATH_FORCE_INLINE const Quat Quat::operator *( const Quat &quat ) const
-{
-    __m128 ldata, rdata, qv, tmp0, tmp1, tmp2, tmp3;
-    __m128 product, l_wxyz, r_wxyz, xy, qw;
-    ldata = mVec128;
-    rdata = quat.mVec128;
-    tmp0 = _mm_shuffle_ps( ldata, ldata, _MM_SHUFFLE(3,0,2,1) );
-    tmp1 = _mm_shuffle_ps( rdata, rdata, _MM_SHUFFLE(3,1,0,2) );
-    tmp2 = _mm_shuffle_ps( ldata, ldata, _MM_SHUFFLE(3,1,0,2) );
-    tmp3 = _mm_shuffle_ps( rdata, rdata, _MM_SHUFFLE(3,0,2,1) );
-    qv = vec_mul( vec_splat( ldata, 3 ), rdata );
-    qv = vec_madd( vec_splat( rdata, 3 ), ldata, qv );
-    qv = vec_madd( tmp0, tmp1, qv );
-    qv = vec_nmsub( tmp2, tmp3, qv );
-    product = vec_mul( ldata, rdata );
-    l_wxyz = vec_sld( ldata, ldata, 12 );
-    r_wxyz = vec_sld( rdata, rdata, 12 );
-    qw = vec_nmsub( l_wxyz, r_wxyz, product );
-    xy = vec_madd( l_wxyz, r_wxyz, product );
-    qw = vec_sub( qw, vec_sld( xy, xy, 8 ) );
-	VM_ATTRIBUTE_ALIGN16 unsigned int sw[4] = {0, 0, 0, 0xffffffff};
-    return Quat( vec_sel( qv, qw, sw ) );
-}
-
-VECTORMATH_FORCE_INLINE Quat & Quat::operator *=( const Quat &quat )
-{
-    *this = *this * quat;
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE const Vector3 rotate( const Quat &quat, const Vector3 &vec )
-{    __m128 qdata, vdata, product, tmp0, tmp1, tmp2, tmp3, wwww, qv, qw, res;
-    qdata = quat.get128();
-    vdata = vec.get128();
-    tmp0 = _mm_shuffle_ps( qdata, qdata, _MM_SHUFFLE(3,0,2,1) );
-    tmp1 = _mm_shuffle_ps( vdata, vdata, _MM_SHUFFLE(3,1,0,2) );
-    tmp2 = _mm_shuffle_ps( qdata, qdata, _MM_SHUFFLE(3,1,0,2) );
-    tmp3 = _mm_shuffle_ps( vdata, vdata, _MM_SHUFFLE(3,0,2,1) );
-    wwww = vec_splat( qdata, 3 );
-    qv = vec_mul( wwww, vdata );
-    qv = vec_madd( tmp0, tmp1, qv );
-    qv = vec_nmsub( tmp2, tmp3, qv );
-    product = vec_mul( qdata, vdata );
-    qw = vec_madd( vec_sld( qdata, qdata, 4 ), vec_sld( vdata, vdata, 4 ), product );
-    qw = vec_add( vec_sld( product, product, 8 ), qw );
-    tmp1 = _mm_shuffle_ps( qv, qv, _MM_SHUFFLE(3,1,0,2) );
-    tmp3 = _mm_shuffle_ps( qv, qv, _MM_SHUFFLE(3,0,2,1) );
-    res = vec_mul( vec_splat( qw, 0 ), qdata );
-    res = vec_madd( wwww, qv, res );
-    res = vec_madd( tmp0, tmp1, res );
-    res = vec_nmsub( tmp2, tmp3, res );
-    return Vector3( res );
-}
-
-VECTORMATH_FORCE_INLINE const Quat conj( const Quat &quat )
-{
-	VM_ATTRIBUTE_ALIGN16 unsigned int sw[4] = {0x80000000,0x80000000,0x80000000,0};
-    return Quat( vec_xor( quat.get128(), _mm_load_ps((float *)sw) ) );
-}
-
-VECTORMATH_FORCE_INLINE const Quat select( const Quat &quat0, const Quat &quat1, bool select1 )
-{
-    return select( quat0, quat1, boolInVec(select1) );
-}
-
-//VECTORMATH_FORCE_INLINE const Quat select( const Quat &quat0, const Quat &quat1, const boolInVec &select1 )
-//{
-//    return Quat( vec_sel( quat0.get128(), quat1.get128(), select1.get128() ) );
-//}
-
-VECTORMATH_FORCE_INLINE void loadXYZW(Quat& quat, const float* fptr)
-{
-#ifdef USE_SSE3_LDDQU
-	quat = Quat(	SSEFloat(_mm_lddqu_si128((const __m128i*)((float*)(fptr)))).m128		);
-#else
-	SSEFloat fl;
-	fl.f[0] = fptr[0];
-	fl.f[1] = fptr[1];
-	fl.f[2] = fptr[2];
-	fl.f[3] = fptr[3];
-    quat = Quat(	fl.m128);
-#endif
-    
-
-}
-
-VECTORMATH_FORCE_INLINE void storeXYZW(const Quat& quat, float* fptr)
-{
-	fptr[0] = quat.getX();
-	fptr[1] = quat.getY();
-	fptr[2] = quat.getZ();
-	fptr[3] = quat.getW();
-//    _mm_storeu_ps((float*)quat.get128(),fptr);
-}
-
-
-
-#ifdef _VECTORMATH_DEBUG
-
-VECTORMATH_FORCE_INLINE void print( const Quat &quat )
-{
-    union { __m128 v; float s[4]; } tmp;
-    tmp.v = quat.get128();
-    printf( "( %f %f %f %f )\n", tmp.s[0], tmp.s[1], tmp.s[2], tmp.s[3] );
-}
-
-VECTORMATH_FORCE_INLINE void print( const Quat &quat, const char * name )
-{
-    union { __m128 v; float s[4]; } tmp;
-    tmp.v = quat.get128();
-    printf( "%s: ( %f %f %f %f )\n", name, tmp.s[0], tmp.s[1], tmp.s[2], tmp.s[3] );
-}
-
-#endif
-
-} // namespace Aos
-} // namespace Vectormath
-
-#endif
+/*
+   Copyright (C) 2006, 2007 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+   Redistribution and use in source and binary forms,
+   with or without modification, are permitted provided that the
+   following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the Sony Computer Entertainment Inc nor the names
+      of its contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
+*/
+
+
+#ifndef _VECTORMATH_QUAT_AOS_CPP_H
+#define _VECTORMATH_QUAT_AOS_CPP_H
+
+//-----------------------------------------------------------------------------
+// Definitions
+
+#ifndef _VECTORMATH_INTERNAL_FUNCTIONS
+#define _VECTORMATH_INTERNAL_FUNCTIONS
+
+#endif
+
+namespace Vectormath {
+namespace Aos {
+
+VECTORMATH_FORCE_INLINE void Quat::set128(vec_float4 vec)
+{
+    mVec128 = vec;
+}
+
+VECTORMATH_FORCE_INLINE Quat::Quat( const floatInVec &_x, const floatInVec &_y, const floatInVec &_z, const floatInVec &_w )
+{
+	mVec128 = _mm_unpacklo_ps(
+		_mm_unpacklo_ps( _x.get128(), _z.get128() ),
+		_mm_unpacklo_ps( _y.get128(), _w.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE Quat::Quat( const Vector3 &xyz, float _w )
+{
+    mVec128 = xyz.get128();
+    _vmathVfSetElement(mVec128, _w, 3);
+}
+
+
+
+VECTORMATH_FORCE_INLINE  Quat::Quat(const Quat& quat)
+{
+	mVec128 = quat.get128();
+}
+
+VECTORMATH_FORCE_INLINE Quat::Quat( float _x, float _y, float _z, float _w )
+{
+	mVec128 = _mm_setr_ps(_x, _y, _z, _w);
+}
+
+
+
+
+
+VECTORMATH_FORCE_INLINE Quat::Quat( const Vector3 &xyz, const floatInVec &_w )
+{
+    mVec128 = xyz.get128();
+    mVec128 = _vmathVfInsert(mVec128, _w.get128(), 3);
+}
+
+VECTORMATH_FORCE_INLINE Quat::Quat( const Vector4 &vec )
+{
+    mVec128 = vec.get128();
+}
+
+VECTORMATH_FORCE_INLINE Quat::Quat( float scalar )
+{
+    mVec128 = floatInVec(scalar).get128();
+}
+
+VECTORMATH_FORCE_INLINE Quat::Quat( const floatInVec &scalar )
+{
+    mVec128 = scalar.get128();
+}
+
+VECTORMATH_FORCE_INLINE Quat::Quat( __m128 vf4 )
+{
+    mVec128 = vf4;
+}
+
+VECTORMATH_FORCE_INLINE const Quat Quat::identity( )
+{
+    return Quat( _VECTORMATH_UNIT_0001 );
+}
+
+VECTORMATH_FORCE_INLINE const Quat lerp( float t, const Quat &quat0, const Quat &quat1 )
+{
+    return lerp( floatInVec(t), quat0, quat1 );
+}
+
+VECTORMATH_FORCE_INLINE const Quat lerp( const floatInVec &t, const Quat &quat0, const Quat &quat1 )
+{
+    return ( quat0 + ( ( quat1 - quat0 ) * t ) );
+}
+
+VECTORMATH_FORCE_INLINE const Quat slerp( float t, const Quat &unitQuat0, const Quat &unitQuat1 )
+{
+    return slerp( floatInVec(t), unitQuat0, unitQuat1 );
+}
+
+VECTORMATH_FORCE_INLINE const Quat slerp( const floatInVec &t, const Quat &unitQuat0, const Quat &unitQuat1 )
+{
+    Quat start;
+    vec_float4 scales, scale0, scale1, cosAngle, angle, tttt, oneMinusT, angles, sines;
+    __m128 selectMask;
+    cosAngle = _vmathVfDot4( unitQuat0.get128(), unitQuat1.get128() );
+    selectMask = (__m128)vec_cmpgt( _mm_setzero_ps(), cosAngle );
+    cosAngle = vec_sel( cosAngle, negatef4( cosAngle ), selectMask );
+    start = Quat( vec_sel( unitQuat0.get128(), negatef4( unitQuat0.get128() ), selectMask ) );
+    selectMask = (__m128)vec_cmpgt( _mm_set1_ps(_VECTORMATH_SLERP_TOL), cosAngle );
+    angle = acosf4( cosAngle );
+    tttt = t.get128();
+    oneMinusT = vec_sub( _mm_set1_ps(1.0f), tttt );
+    angles = vec_mergeh( _mm_set1_ps(1.0f), tttt );
+    angles = vec_mergeh( angles, oneMinusT );
+    angles = vec_madd( angles, angle, _mm_setzero_ps() );
+    sines = sinf4( angles );
+    scales = _mm_div_ps( sines, vec_splat( sines, 0 ) );
+    scale0 = vec_sel( oneMinusT, vec_splat( scales, 1 ), selectMask );
+    scale1 = vec_sel( tttt, vec_splat( scales, 2 ), selectMask );
+    return Quat( vec_madd( start.get128(), scale0, vec_mul( unitQuat1.get128(), scale1 ) ) );
+}
+
+VECTORMATH_FORCE_INLINE const Quat squad( float t, const Quat &unitQuat0, const Quat &unitQuat1, const Quat &unitQuat2, const Quat &unitQuat3 )
+{
+    return squad( floatInVec(t), unitQuat0, unitQuat1, unitQuat2, unitQuat3 );
+}
+
+VECTORMATH_FORCE_INLINE const Quat squad( const floatInVec &t, const Quat &unitQuat0, const Quat &unitQuat1, const Quat &unitQuat2, const Quat &unitQuat3 )
+{
+    return slerp( ( ( floatInVec(2.0f) * t ) * ( floatInVec(1.0f) - t ) ), slerp( t, unitQuat0, unitQuat3 ), slerp( t, unitQuat1, unitQuat2 ) );
+}
+
+VECTORMATH_FORCE_INLINE __m128 Quat::get128( ) const
+{
+    return mVec128;
+}
+
+VECTORMATH_FORCE_INLINE Quat & Quat::operator =( const Quat &quat )
+{
+    mVec128 = quat.mVec128;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Quat & Quat::setXYZ( const Vector3 &vec )
+{
+	VM_ATTRIBUTE_ALIGN16 unsigned int sw[4] = {0, 0, 0, 0xffffffff};
+	mVec128 = vec_sel( vec.get128(), mVec128, sw );
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Quat::getXYZ( ) const
+{
+    return Vector3( mVec128 );
+}
+
+VECTORMATH_FORCE_INLINE Quat & Quat::setX( float _x )
+{
+    _vmathVfSetElement(mVec128, _x, 0);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Quat & Quat::setX( const floatInVec &_x )
+{
+    mVec128 = _vmathVfInsert(mVec128, _x.get128(), 0);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec Quat::getX( ) const
+{
+    return floatInVec( mVec128, 0 );
+}
+
+VECTORMATH_FORCE_INLINE Quat & Quat::setY( float _y )
+{
+    _vmathVfSetElement(mVec128, _y, 1);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Quat & Quat::setY( const floatInVec &_y )
+{
+    mVec128 = _vmathVfInsert(mVec128, _y.get128(), 1);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec Quat::getY( ) const
+{
+    return floatInVec( mVec128, 1 );
+}
+
+VECTORMATH_FORCE_INLINE Quat & Quat::setZ( float _z )
+{
+    _vmathVfSetElement(mVec128, _z, 2);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Quat & Quat::setZ( const floatInVec &_z )
+{
+    mVec128 = _vmathVfInsert(mVec128, _z.get128(), 2);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec Quat::getZ( ) const
+{
+    return floatInVec( mVec128, 2 );
+}
+
+VECTORMATH_FORCE_INLINE Quat & Quat::setW( float _w )
+{
+    _vmathVfSetElement(mVec128, _w, 3);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Quat & Quat::setW( const floatInVec &_w )
+{
+    mVec128 = _vmathVfInsert(mVec128, _w.get128(), 3);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec Quat::getW( ) const
+{
+    return floatInVec( mVec128, 3 );
+}
+
+VECTORMATH_FORCE_INLINE Quat & Quat::setElem( int idx, float value )
+{
+    _vmathVfSetElement(mVec128, value, idx);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Quat & Quat::setElem( int idx, const floatInVec &value )
+{
+    mVec128 = _vmathVfInsert(mVec128, value.get128(), idx);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec Quat::getElem( int idx ) const
+{
+    return floatInVec( mVec128, idx );
+}
+
+VECTORMATH_FORCE_INLINE VecIdx Quat::operator []( int idx )
+{
+    return VecIdx( mVec128, idx );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec Quat::operator []( int idx ) const
+{
+    return floatInVec( mVec128, idx );
+}
+
+VECTORMATH_FORCE_INLINE const Quat Quat::operator +( const Quat &quat ) const
+{
+    return Quat( _mm_add_ps( mVec128, quat.mVec128 ) );
+}
+
+
+VECTORMATH_FORCE_INLINE const Quat Quat::operator -( const Quat &quat ) const
+{
+    return Quat( _mm_sub_ps( mVec128, quat.mVec128 ) );
+}
+
+VECTORMATH_FORCE_INLINE const Quat Quat::operator *( float scalar ) const
+{
+    return *this * floatInVec(scalar);
+}
+
+VECTORMATH_FORCE_INLINE const Quat Quat::operator *( const floatInVec &scalar ) const
+{
+    return Quat( _mm_mul_ps( mVec128, scalar.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE Quat & Quat::operator +=( const Quat &quat )
+{
+    *this = *this + quat;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Quat & Quat::operator -=( const Quat &quat )
+{
+    *this = *this - quat;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Quat & Quat::operator *=( float scalar )
+{
+    *this = *this * scalar;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Quat & Quat::operator *=( const floatInVec &scalar )
+{
+    *this = *this * scalar;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const Quat Quat::operator /( float scalar ) const
+{
+    return *this / floatInVec(scalar);
+}
+
+VECTORMATH_FORCE_INLINE const Quat Quat::operator /( const floatInVec &scalar ) const
+{
+    return Quat( _mm_div_ps( mVec128, scalar.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE Quat & Quat::operator /=( float scalar )
+{
+    *this = *this / scalar;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Quat & Quat::operator /=( const floatInVec &scalar )
+{
+    *this = *this / scalar;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const Quat Quat::operator -( ) const
+{
+	return Quat(_mm_sub_ps( _mm_setzero_ps(), mVec128 ) );
+}
+
+VECTORMATH_FORCE_INLINE const Quat operator *( float scalar, const Quat &quat )
+{
+    return floatInVec(scalar) * quat;
+}
+
+VECTORMATH_FORCE_INLINE const Quat operator *( const floatInVec &scalar, const Quat &quat )
+{
+    return quat * scalar;
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec dot( const Quat &quat0, const Quat &quat1 )
+{
+    return floatInVec( _vmathVfDot4( quat0.get128(), quat1.get128() ), 0 );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec norm( const Quat &quat )
+{
+    return floatInVec(  _vmathVfDot4( quat.get128(), quat.get128() ), 0 );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec length( const Quat &quat )
+{
+    return floatInVec(  _mm_sqrt_ps(_vmathVfDot4( quat.get128(), quat.get128() )), 0 );
+}
+
+VECTORMATH_FORCE_INLINE const Quat normalize( const Quat &quat )
+{
+	vec_float4 dot =_vmathVfDot4( quat.get128(), quat.get128());
+    return Quat( _mm_mul_ps( quat.get128(), newtonrapson_rsqrt4( dot ) ) );
+}
+
+
+VECTORMATH_FORCE_INLINE const Quat Quat::rotation( const Vector3 &unitVec0, const Vector3 &unitVec1 )
+{
+    Vector3 crossVec;
+    __m128 cosAngle, cosAngleX2Plus2, recipCosHalfAngleX2, cosHalfAngleX2, res;
+    cosAngle = _vmathVfDot3( unitVec0.get128(), unitVec1.get128() );
+    cosAngleX2Plus2 = vec_madd( cosAngle, _mm_set1_ps(2.0f), _mm_set1_ps(2.0f) );
+    recipCosHalfAngleX2 = _mm_rsqrt_ps( cosAngleX2Plus2 );
+    cosHalfAngleX2 = vec_mul( recipCosHalfAngleX2, cosAngleX2Plus2 );
+    crossVec = cross( unitVec0, unitVec1 );
+    res = vec_mul( crossVec.get128(), recipCosHalfAngleX2 );
+	VM_ATTRIBUTE_ALIGN16 unsigned int sw[4] = {0, 0, 0, 0xffffffff};
+    res = vec_sel( res, vec_mul( cosHalfAngleX2, _mm_set1_ps(0.5f) ), sw );
+    return Quat( res );
+}
+
+VECTORMATH_FORCE_INLINE const Quat Quat::rotation( float radians, const Vector3 &unitVec )
+{
+    return rotation( floatInVec(radians), unitVec );
+}
+
+VECTORMATH_FORCE_INLINE const Quat Quat::rotation( const floatInVec &radians, const Vector3 &unitVec )
+{
+    __m128 s, c, angle, res;
+    angle = vec_mul( radians.get128(), _mm_set1_ps(0.5f) );
+    sincosf4( angle, &s, &c );
+	VM_ATTRIBUTE_ALIGN16 unsigned int sw[4] = {0, 0, 0, 0xffffffff};
+    res = vec_sel( vec_mul( unitVec.get128(), s ), c, sw );
+    return Quat( res );
+}
+
+VECTORMATH_FORCE_INLINE const Quat Quat::rotationX( float radians )
+{
+    return rotationX( floatInVec(radians) );
+}
+
+VECTORMATH_FORCE_INLINE const Quat Quat::rotationX( const floatInVec &radians )
+{
+    __m128 s, c, angle, res;
+    angle = vec_mul( radians.get128(), _mm_set1_ps(0.5f) );
+    sincosf4( angle, &s, &c );
+	VM_ATTRIBUTE_ALIGN16 unsigned int xsw[4] = {0xffffffff, 0, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int wsw[4] = {0, 0, 0, 0xffffffff};
+    res = vec_sel( _mm_setzero_ps(), s, xsw );
+    res = vec_sel( res, c, wsw );
+    return Quat( res );
+}
+
+VECTORMATH_FORCE_INLINE const Quat Quat::rotationY( float radians )
+{
+    return rotationY( floatInVec(radians) );
+}
+
+VECTORMATH_FORCE_INLINE const Quat Quat::rotationY( const floatInVec &radians )
+{
+    __m128 s, c, angle, res;
+    angle = vec_mul( radians.get128(), _mm_set1_ps(0.5f) );
+    sincosf4( angle, &s, &c );
+	VM_ATTRIBUTE_ALIGN16 unsigned int ysw[4] = {0, 0xffffffff, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int wsw[4] = {0, 0, 0, 0xffffffff};
+    res = vec_sel( _mm_setzero_ps(), s, ysw );
+    res = vec_sel( res, c, wsw );
+    return Quat( res );
+}
+
+VECTORMATH_FORCE_INLINE const Quat Quat::rotationZ( float radians )
+{
+    return rotationZ( floatInVec(radians) );
+}
+
+VECTORMATH_FORCE_INLINE const Quat Quat::rotationZ( const floatInVec &radians )
+{
+    __m128 s, c, angle, res;
+    angle = vec_mul( radians.get128(), _mm_set1_ps(0.5f) );
+    sincosf4( angle, &s, &c );
+	VM_ATTRIBUTE_ALIGN16 unsigned int zsw[4] = {0, 0, 0xffffffff, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int wsw[4] = {0, 0, 0, 0xffffffff};
+    res = vec_sel( _mm_setzero_ps(), s, zsw );
+    res = vec_sel( res, c, wsw );
+    return Quat( res );
+}
+
+VECTORMATH_FORCE_INLINE const Quat Quat::operator *( const Quat &quat ) const
+{
+    __m128 ldata, rdata, qv, tmp0, tmp1, tmp2, tmp3;
+    __m128 product, l_wxyz, r_wxyz, xy, qw;
+    ldata = mVec128;
+    rdata = quat.mVec128;
+    tmp0 = _mm_shuffle_ps( ldata, ldata, _MM_SHUFFLE(3,0,2,1) );
+    tmp1 = _mm_shuffle_ps( rdata, rdata, _MM_SHUFFLE(3,1,0,2) );
+    tmp2 = _mm_shuffle_ps( ldata, ldata, _MM_SHUFFLE(3,1,0,2) );
+    tmp3 = _mm_shuffle_ps( rdata, rdata, _MM_SHUFFLE(3,0,2,1) );
+    qv = vec_mul( vec_splat( ldata, 3 ), rdata );
+    qv = vec_madd( vec_splat( rdata, 3 ), ldata, qv );
+    qv = vec_madd( tmp0, tmp1, qv );
+    qv = vec_nmsub( tmp2, tmp3, qv );
+    product = vec_mul( ldata, rdata );
+    l_wxyz = vec_sld( ldata, ldata, 12 );
+    r_wxyz = vec_sld( rdata, rdata, 12 );
+    qw = vec_nmsub( l_wxyz, r_wxyz, product );
+    xy = vec_madd( l_wxyz, r_wxyz, product );
+    qw = vec_sub( qw, vec_sld( xy, xy, 8 ) );
+	VM_ATTRIBUTE_ALIGN16 unsigned int sw[4] = {0, 0, 0, 0xffffffff};
+    return Quat( vec_sel( qv, qw, sw ) );
+}
+
+VECTORMATH_FORCE_INLINE Quat & Quat::operator *=( const Quat &quat )
+{
+    *this = *this * quat;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 rotate( const Quat &quat, const Vector3 &vec )
+{    __m128 qdata, vdata, product, tmp0, tmp1, tmp2, tmp3, wwww, qv, qw, res;
+    qdata = quat.get128();
+    vdata = vec.get128();
+    tmp0 = _mm_shuffle_ps( qdata, qdata, _MM_SHUFFLE(3,0,2,1) );
+    tmp1 = _mm_shuffle_ps( vdata, vdata, _MM_SHUFFLE(3,1,0,2) );
+    tmp2 = _mm_shuffle_ps( qdata, qdata, _MM_SHUFFLE(3,1,0,2) );
+    tmp3 = _mm_shuffle_ps( vdata, vdata, _MM_SHUFFLE(3,0,2,1) );
+    wwww = vec_splat( qdata, 3 );
+    qv = vec_mul( wwww, vdata );
+    qv = vec_madd( tmp0, tmp1, qv );
+    qv = vec_nmsub( tmp2, tmp3, qv );
+    product = vec_mul( qdata, vdata );
+    qw = vec_madd( vec_sld( qdata, qdata, 4 ), vec_sld( vdata, vdata, 4 ), product );
+    qw = vec_add( vec_sld( product, product, 8 ), qw );
+    tmp1 = _mm_shuffle_ps( qv, qv, _MM_SHUFFLE(3,1,0,2) );
+    tmp3 = _mm_shuffle_ps( qv, qv, _MM_SHUFFLE(3,0,2,1) );
+    res = vec_mul( vec_splat( qw, 0 ), qdata );
+    res = vec_madd( wwww, qv, res );
+    res = vec_madd( tmp0, tmp1, res );
+    res = vec_nmsub( tmp2, tmp3, res );
+    return Vector3( res );
+}
+
+VECTORMATH_FORCE_INLINE const Quat conj( const Quat &quat )
+{
+	VM_ATTRIBUTE_ALIGN16 unsigned int sw[4] = {0x80000000,0x80000000,0x80000000,0};
+    return Quat( vec_xor( quat.get128(), _mm_load_ps((float *)sw) ) );
+}
+
+VECTORMATH_FORCE_INLINE const Quat select( const Quat &quat0, const Quat &quat1, bool select1 )
+{
+    return select( quat0, quat1, boolInVec(select1) );
+}
+
+//VECTORMATH_FORCE_INLINE const Quat select( const Quat &quat0, const Quat &quat1, const boolInVec &select1 )
+//{
+//    return Quat( vec_sel( quat0.get128(), quat1.get128(), select1.get128() ) );
+//}
+
+VECTORMATH_FORCE_INLINE void loadXYZW(Quat& quat, const float* fptr)
+{
+#ifdef USE_SSE3_LDDQU
+	quat = Quat(	SSEFloat(_mm_lddqu_si128((const __m128i*)((float*)(fptr)))).m128		);
+#else
+	SSEFloat fl;
+	fl.f[0] = fptr[0];
+	fl.f[1] = fptr[1];
+	fl.f[2] = fptr[2];
+	fl.f[3] = fptr[3];
+    quat = Quat(	fl.m128);
+#endif
+    
+
+}
+
+VECTORMATH_FORCE_INLINE void storeXYZW(const Quat& quat, float* fptr)
+{
+	fptr[0] = quat.getX();
+	fptr[1] = quat.getY();
+	fptr[2] = quat.getZ();
+	fptr[3] = quat.getW();
+//    _mm_storeu_ps((float*)quat.get128(),fptr);
+}
+
+
+
+#ifdef _VECTORMATH_DEBUG
+
+VECTORMATH_FORCE_INLINE void print( const Quat &quat )
+{
+    union { __m128 v; float s[4]; } tmp;
+    tmp.v = quat.get128();
+    printf( "( %f %f %f %f )\n", tmp.s[0], tmp.s[1], tmp.s[2], tmp.s[3] );
+}
+
+VECTORMATH_FORCE_INLINE void print( const Quat &quat, const char * name )
+{
+    union { __m128 v; float s[4]; } tmp;
+    tmp.v = quat.get128();
+    printf( "%s: ( %f %f %f %f )\n", name, tmp.s[0], tmp.s[1], tmp.s[2], tmp.s[3] );
+}
+
+#endif
+
+} // namespace Aos
+} // namespace Vectormath
+
+#endif
diff --git a/src/vectormath/sse/vec_aos.h b/src/vectormath/sse/vec_aos.h
index 05ebac55a..35aeeaf16 100644
--- a/src/vectormath/sse/vec_aos.h
+++ b/src/vectormath/sse/vec_aos.h
@@ -1,1455 +1,1455 @@
-/*
-   Copyright (C) 2006, 2007 Sony Computer Entertainment Inc.
-   All rights reserved.
-
-   Redistribution and use in source and binary forms,
-   with or without modification, are permitted provided that the
-   following conditions are met:
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    * Neither the name of the Sony Computer Entertainment Inc nor the names
-      of its contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-   POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#ifndef _VECTORMATH_VEC_AOS_CPP_H
-#define _VECTORMATH_VEC_AOS_CPP_H
-
-//-----------------------------------------------------------------------------
-// Constants
-// for permutes words are labeled [x,y,z,w] [a,b,c,d]
-
-#define _VECTORMATH_PERM_X 0x00010203
-#define _VECTORMATH_PERM_Y 0x04050607
-#define _VECTORMATH_PERM_Z 0x08090a0b
-#define _VECTORMATH_PERM_W 0x0c0d0e0f
-#define _VECTORMATH_PERM_A 0x10111213
-#define _VECTORMATH_PERM_B 0x14151617
-#define _VECTORMATH_PERM_C 0x18191a1b
-#define _VECTORMATH_PERM_D 0x1c1d1e1f
-#define _VECTORMATH_PERM_XYZA (vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_X, _VECTORMATH_PERM_Y, _VECTORMATH_PERM_Z, _VECTORMATH_PERM_A }
-#define _VECTORMATH_PERM_ZXYW (vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Z, _VECTORMATH_PERM_X, _VECTORMATH_PERM_Y, _VECTORMATH_PERM_W }
-#define _VECTORMATH_PERM_YZXW (vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Y, _VECTORMATH_PERM_Z, _VECTORMATH_PERM_X, _VECTORMATH_PERM_W }
-#define _VECTORMATH_PERM_YZAB (vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Y, _VECTORMATH_PERM_Z, _VECTORMATH_PERM_A, _VECTORMATH_PERM_B }
-#define _VECTORMATH_PERM_ZABC (vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Z, _VECTORMATH_PERM_A, _VECTORMATH_PERM_B, _VECTORMATH_PERM_C }
-#define _VECTORMATH_PERM_XYAW (vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_X, _VECTORMATH_PERM_Y, _VECTORMATH_PERM_A, _VECTORMATH_PERM_W }
-#define _VECTORMATH_PERM_XAZW (vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_X, _VECTORMATH_PERM_A, _VECTORMATH_PERM_Z, _VECTORMATH_PERM_W }
-#define _VECTORMATH_MASK_0xF000 (vec_uint4){ 0xffffffff, 0, 0, 0 }
-#define _VECTORMATH_MASK_0x0F00 (vec_uint4){ 0, 0xffffffff, 0, 0 }
-#define _VECTORMATH_MASK_0x00F0 (vec_uint4){ 0, 0, 0xffffffff, 0 }
-#define _VECTORMATH_MASK_0x000F (vec_uint4){ 0, 0, 0, 0xffffffff }
-#define _VECTORMATH_UNIT_1000 _mm_setr_ps(1.0f,0.0f,0.0f,0.0f) // (__m128){ 1.0f, 0.0f, 0.0f, 0.0f }
-#define _VECTORMATH_UNIT_0100 _mm_setr_ps(0.0f,1.0f,0.0f,0.0f) // (__m128){ 0.0f, 1.0f, 0.0f, 0.0f }
-#define _VECTORMATH_UNIT_0010 _mm_setr_ps(0.0f,0.0f,1.0f,0.0f) // (__m128){ 0.0f, 0.0f, 1.0f, 0.0f }
-#define _VECTORMATH_UNIT_0001 _mm_setr_ps(0.0f,0.0f,0.0f,1.0f) // (__m128){ 0.0f, 0.0f, 0.0f, 1.0f }
-#define _VECTORMATH_SLERP_TOL 0.999f
-//_VECTORMATH_SLERP_TOLF
-
-//-----------------------------------------------------------------------------
-// Definitions
-
-#ifndef _VECTORMATH_INTERNAL_FUNCTIONS
-#define _VECTORMATH_INTERNAL_FUNCTIONS
-
-#define     _vmath_shufps(a, b, immx, immy, immz, immw) _mm_shuffle_ps(a, b, _MM_SHUFFLE(immw, immz, immy, immx))
-static VECTORMATH_FORCE_INLINE __m128 _vmathVfDot3( __m128 vec0, __m128 vec1 )
-{
-	__m128 result = _mm_mul_ps( vec0, vec1);
-    return _mm_add_ps( vec_splat( result, 0 ), _mm_add_ps( vec_splat( result, 1 ), vec_splat( result, 2 ) ) );
-}
-
-static VECTORMATH_FORCE_INLINE __m128 _vmathVfDot4( __m128 vec0, __m128 vec1 )
-{
-    __m128 result = _mm_mul_ps(vec0, vec1);
-	return _mm_add_ps(_mm_shuffle_ps(result, result, _MM_SHUFFLE(0,0,0,0)),
-			_mm_add_ps(_mm_shuffle_ps(result, result, _MM_SHUFFLE(1,1,1,1)),
-			_mm_add_ps(_mm_shuffle_ps(result, result, _MM_SHUFFLE(2,2,2,2)), _mm_shuffle_ps(result, result, _MM_SHUFFLE(3,3,3,3)))));
-}
-
-static VECTORMATH_FORCE_INLINE __m128 _vmathVfCross( __m128 vec0, __m128 vec1 )
-{
-    __m128 tmp0, tmp1, tmp2, tmp3, result;
-    tmp0 = _mm_shuffle_ps( vec0, vec0, _MM_SHUFFLE(3,0,2,1) );
-    tmp1 = _mm_shuffle_ps( vec1, vec1, _MM_SHUFFLE(3,1,0,2) );
-    tmp2 = _mm_shuffle_ps( vec0, vec0, _MM_SHUFFLE(3,1,0,2) );
-    tmp3 = _mm_shuffle_ps( vec1, vec1, _MM_SHUFFLE(3,0,2,1) );
-    result = vec_mul( tmp0, tmp1 );
-    result = vec_nmsub( tmp2, tmp3, result );
-    return result;
-}
-/*
-static VECTORMATH_FORCE_INLINE vec_uint4 _vmathVfToHalfFloatsUnpacked(__m128 v)
-{
-#if 0
-    vec_int4 bexp;
-    vec_uint4 mant, sign, hfloat;
-    vec_uint4 notZero, isInf;
-    const vec_uint4 hfloatInf = (vec_uint4)(0x00007c00u);
-    const vec_uint4 mergeMant = (vec_uint4)(0x000003ffu);
-    const vec_uint4 mergeSign = (vec_uint4)(0x00008000u);
-
-    sign = vec_sr((vec_uint4)v, (vec_uint4)16);
-    mant = vec_sr((vec_uint4)v, (vec_uint4)13);
-    bexp = vec_and(vec_sr((vec_int4)v, (vec_uint4)23), (vec_int4)0xff);
-
-    notZero = (vec_uint4)vec_cmpgt(bexp, (vec_int4)112);
-    isInf = (vec_uint4)vec_cmpgt(bexp, (vec_int4)142);
-
-    bexp = _mm_add_ps(bexp, (vec_int4)-112);
-    bexp = vec_sl(bexp, (vec_uint4)10);
-
-    hfloat = vec_sel((vec_uint4)bexp, mant, mergeMant);
-    hfloat = vec_sel((vec_uint4)(0), hfloat, notZero);
-    hfloat = vec_sel(hfloat, hfloatInf, isInf);
-    hfloat = vec_sel(hfloat, sign, mergeSign);
-
-    return hfloat;
-#else
-	assert(0);
-	return _mm_setzero_ps();
-#endif
-}
-
-static VECTORMATH_FORCE_INLINE vec_ushort8 _vmath2VfToHalfFloats(__m128 u, __m128 v)
-{
-#if 0
-    vec_uint4 hfloat_u, hfloat_v;
-    const vec_uchar16 pack = (vec_uchar16){2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31};
-    hfloat_u = _vmathVfToHalfFloatsUnpacked(u);
-    hfloat_v = _vmathVfToHalfFloatsUnpacked(v);
-    return (vec_ushort8)vec_perm(hfloat_u, hfloat_v, pack);
-#else
-	assert(0);
-	return _mm_setzero_si128();
-#endif
-}
-*/
-
-static VECTORMATH_FORCE_INLINE __m128 _vmathVfInsert(__m128 dst, __m128 src, int slot)
-{
-	SSEFloat s;
-	s.m128 = src;
-	SSEFloat d;
-	d.m128 = dst;
-	d.f[slot] = s.f[slot];
-	return d.m128;
-}
-
-#define _vmathVfSetElement(vec, scalar, slot) ((float *)&(vec))[slot] = scalar
-
-static VECTORMATH_FORCE_INLINE __m128 _vmathVfSplatScalar(float scalar)
-{
-	return _mm_set1_ps(scalar);
-}
-
-#endif
-
-namespace Vectormath {
-namespace Aos {
-
-	
-#ifdef _VECTORMATH_NO_SCALAR_CAST
-VECTORMATH_FORCE_INLINE VecIdx::operator floatInVec() const
-{
-    return floatInVec(ref, i);
-}
-
-VECTORMATH_FORCE_INLINE float VecIdx::getAsFloat() const
-#else
-VECTORMATH_FORCE_INLINE VecIdx::operator float() const
-#endif
-{
-    return ((float *)&ref)[i];
-}
-
-VECTORMATH_FORCE_INLINE float VecIdx::operator =( float scalar )
-{
-    _vmathVfSetElement(ref, scalar, i);
-    return scalar;
-}
-
-VECTORMATH_FORCE_INLINE floatInVec VecIdx::operator =( const floatInVec &scalar )
-{
-    ref = _vmathVfInsert(ref, scalar.get128(), i);
-    return scalar;
-}
-
-VECTORMATH_FORCE_INLINE floatInVec VecIdx::operator =( const VecIdx& scalar )
-{
-    return *this = floatInVec(scalar.ref, scalar.i);
-}
-
-VECTORMATH_FORCE_INLINE floatInVec VecIdx::operator *=( float scalar )
-{
-    return *this *= floatInVec(scalar);
-}
-
-VECTORMATH_FORCE_INLINE floatInVec VecIdx::operator *=( const floatInVec &scalar )
-{
-    return *this = floatInVec(ref, i) * scalar;
-}
-
-VECTORMATH_FORCE_INLINE floatInVec VecIdx::operator /=( float scalar )
-{
-    return *this /= floatInVec(scalar);
-}
-
-inline floatInVec VecIdx::operator /=( const floatInVec &scalar )
-{
-    return *this = floatInVec(ref, i) / scalar;
-}
-
-VECTORMATH_FORCE_INLINE floatInVec VecIdx::operator +=( float scalar )
-{
-    return *this += floatInVec(scalar);
-}
-
-VECTORMATH_FORCE_INLINE floatInVec VecIdx::operator +=( const floatInVec &scalar )
-{
-    return *this = floatInVec(ref, i) + scalar;
-}
-
-VECTORMATH_FORCE_INLINE floatInVec VecIdx::operator -=( float scalar )
-{
-    return *this -= floatInVec(scalar);
-}
-
-VECTORMATH_FORCE_INLINE floatInVec VecIdx::operator -=( const floatInVec &scalar )
-{
-    return *this = floatInVec(ref, i) - scalar;
-}
-
-VECTORMATH_FORCE_INLINE Vector3::Vector3(const Vector3& vec)
-{
-    set128(vec.get128());
-}
-
-VECTORMATH_FORCE_INLINE void Vector3::set128(vec_float4 vec)
-{
-    mVec128 = vec;
-}
-
-
-VECTORMATH_FORCE_INLINE Vector3::Vector3( float _x, float _y, float _z )
-{
-    mVec128 = _mm_setr_ps(_x, _y, _z, 0.0f);
-}
-
-VECTORMATH_FORCE_INLINE Vector3::Vector3( const floatInVec &_x, const floatInVec &_y, const floatInVec &_z )
-{
-	__m128 xz = _mm_unpacklo_ps( _x.get128(), _z.get128() );
-	mVec128 = _mm_unpacklo_ps( xz, _y.get128() );
-}
-
-VECTORMATH_FORCE_INLINE Vector3::Vector3( const Point3 &pnt )
-{
-    mVec128 = pnt.get128();
-}
-
-VECTORMATH_FORCE_INLINE Vector3::Vector3( float scalar )
-{
-    mVec128 = floatInVec(scalar).get128();
-}
-
-VECTORMATH_FORCE_INLINE Vector3::Vector3( const floatInVec &scalar )
-{
-    mVec128 = scalar.get128();
-}
-
-VECTORMATH_FORCE_INLINE Vector3::Vector3( __m128 vf4 )
-{
-    mVec128 = vf4;
-}
-
-VECTORMATH_FORCE_INLINE const Vector3 Vector3::xAxis( )
-{
-    return Vector3( _VECTORMATH_UNIT_1000 );
-}
-
-VECTORMATH_FORCE_INLINE const Vector3 Vector3::yAxis( )
-{
-    return Vector3( _VECTORMATH_UNIT_0100 );
-}
-
-VECTORMATH_FORCE_INLINE const Vector3 Vector3::zAxis( )
-{
-    return Vector3( _VECTORMATH_UNIT_0010 );
-}
-
-VECTORMATH_FORCE_INLINE const Vector3 lerp( float t, const Vector3 &vec0, const Vector3 &vec1 )
-{
-    return lerp( floatInVec(t), vec0, vec1 );
-}
-
-VECTORMATH_FORCE_INLINE const Vector3 lerp( const floatInVec &t, const Vector3 &vec0, const Vector3 &vec1 )
-{
-    return ( vec0 + ( ( vec1 - vec0 ) * t ) );
-}
-
-VECTORMATH_FORCE_INLINE const Vector3 slerp( float t, const Vector3 &unitVec0, const Vector3 &unitVec1 )
-{
-    return slerp( floatInVec(t), unitVec0, unitVec1 );
-}
-
-VECTORMATH_FORCE_INLINE const Vector3 slerp( const floatInVec &t, const Vector3 &unitVec0, const Vector3 &unitVec1 )
-{
-    __m128 scales, scale0, scale1, cosAngle, angle, tttt, oneMinusT, angles, sines;
-    cosAngle = _vmathVfDot3( unitVec0.get128(), unitVec1.get128() );
-    __m128 selectMask = _mm_cmpgt_ps( _mm_set1_ps(_VECTORMATH_SLERP_TOL), cosAngle );
-    angle = acosf4( cosAngle );
-    tttt = t.get128();
-    oneMinusT = _mm_sub_ps( _mm_set1_ps(1.0f), tttt );
-    angles = _mm_unpacklo_ps( _mm_set1_ps(1.0f), tttt ); // angles = 1, t, 1, t
-    angles = _mm_unpacklo_ps( angles, oneMinusT );		// angles = 1, 1-t, t, 1-t
-    angles = _mm_mul_ps( angles, angle );
-    sines = sinf4( angles );
-    scales = _mm_div_ps( sines, vec_splat( sines, 0 ) );
-    scale0 = vec_sel( oneMinusT, vec_splat( scales, 1 ), selectMask );
-    scale1 = vec_sel( tttt, vec_splat( scales, 2 ), selectMask );
-    return Vector3( vec_madd( unitVec0.get128(), scale0, _mm_mul_ps( unitVec1.get128(), scale1 ) ) );
-}
-
-VECTORMATH_FORCE_INLINE __m128 Vector3::get128( ) const
-{
-    return mVec128;
-}
-
-VECTORMATH_FORCE_INLINE void loadXYZ(Point3& vec, const float* fptr)
-{
-#ifdef USE_SSE3_LDDQU
-	vec = Point3(	SSEFloat(_mm_lddqu_si128((const __m128i*)((float*)(fptr)))).m128 );
-#else
-	SSEFloat fl;
-	fl.f[0] = fptr[0];
-	fl.f[1] = fptr[1];
-	fl.f[2] = fptr[2];
-	fl.f[3] = fptr[3];
-    vec = Point3(	fl.m128);
-#endif //USE_SSE3_LDDQU
-	
-}
-
-
-
-VECTORMATH_FORCE_INLINE void loadXYZ(Vector3& vec, const float* fptr)
-{
-#ifdef USE_SSE3_LDDQU
-	vec = Vector3(	SSEFloat(_mm_lddqu_si128((const __m128i*)((float*)(fptr)))).m128 );
-#else
-	SSEFloat fl;
-	fl.f[0] = fptr[0];
-	fl.f[1] = fptr[1];
-	fl.f[2] = fptr[2];
-	fl.f[3] = fptr[3];
-    vec = Vector3(	fl.m128);
-#endif //USE_SSE3_LDDQU
-	
-}
-
-VECTORMATH_FORCE_INLINE void storeXYZ( const Vector3 &vec, __m128 * quad )
-{
-	__m128 dstVec = *quad;
-	VM_ATTRIBUTE_ALIGN16  unsigned int sw[4] = {0, 0, 0, 0xffffffff}; // TODO: Centralize
-	dstVec = vec_sel(vec.get128(), dstVec, sw);
-	*quad = dstVec;
-}
-
-VECTORMATH_FORCE_INLINE void storeXYZ(const Point3& vec, float* fptr)
-{
-	fptr[0] = vec.getX();
-	fptr[1] = vec.getY();
-	fptr[2] = vec.getZ();
-}
-
-VECTORMATH_FORCE_INLINE void storeXYZ(const Vector3& vec, float* fptr)
-{
-	fptr[0] = vec.getX();
-	fptr[1] = vec.getY();
-	fptr[2] = vec.getZ();
-}
-
-
-VECTORMATH_FORCE_INLINE void loadXYZArray( Vector3 & vec0, Vector3 & vec1, Vector3 & vec2, Vector3 & vec3, const __m128 * threeQuads )
-{
-	const float *quads = (float *)threeQuads;
-    vec0 = Vector3(  _mm_load_ps(quads) );
-    vec1 = Vector3( _mm_loadu_ps(quads + 3) );
-    vec2 = Vector3( _mm_loadu_ps(quads + 6) );
-    vec3 = Vector3( _mm_loadu_ps(quads + 9) );
-}
-
-VECTORMATH_FORCE_INLINE void storeXYZArray( const Vector3 &vec0, const Vector3 &vec1, const Vector3 &vec2, const Vector3 &vec3, __m128 * threeQuads )
-{
-	__m128 xxxx = _mm_shuffle_ps( vec1.get128(), vec1.get128(), _MM_SHUFFLE(0, 0, 0, 0) );
-	__m128 zzzz = _mm_shuffle_ps( vec2.get128(), vec2.get128(), _MM_SHUFFLE(2, 2, 2, 2) );
-	VM_ATTRIBUTE_ALIGN16 unsigned int xsw[4] = {0, 0, 0, 0xffffffff};
-	VM_ATTRIBUTE_ALIGN16 unsigned int zsw[4] = {0xffffffff, 0, 0, 0};
-	threeQuads[0] = vec_sel( vec0.get128(), xxxx, xsw );
-    threeQuads[1] = _mm_shuffle_ps( vec1.get128(), vec2.get128(), _MM_SHUFFLE(1, 0, 2, 1) );
-    threeQuads[2] = vec_sel( _mm_shuffle_ps( vec3.get128(), vec3.get128(), _MM_SHUFFLE(2, 1, 0, 3) ), zzzz, zsw );
-}
-/*
-VECTORMATH_FORCE_INLINE void storeHalfFloats( const Vector3 &vec0, const Vector3 &vec1, const Vector3 &vec2, const Vector3 &vec3, const Vector3 &vec4, const Vector3 &vec5, const Vector3 &vec6, const Vector3 &vec7, vec_ushort8 * threeQuads )
-{
-	assert(0);
-#if 0
-    __m128 xyz0[3];
-    __m128 xyz1[3];
-    storeXYZArray( vec0, vec1, vec2, vec3, xyz0 );
-    storeXYZArray( vec4, vec5, vec6, vec7, xyz1 );
-    threeQuads[0] = _vmath2VfToHalfFloats(xyz0[0], xyz0[1]);
-    threeQuads[1] = _vmath2VfToHalfFloats(xyz0[2], xyz1[0]);
-    threeQuads[2] = _vmath2VfToHalfFloats(xyz1[1], xyz1[2]);
-#endif
-}
-*/
-VECTORMATH_FORCE_INLINE Vector3 & Vector3::operator =( const Vector3 &vec )
-{
-    mVec128 = vec.mVec128;
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Vector3 & Vector3::setX( float _x )
-{
-    _vmathVfSetElement(mVec128, _x, 0);
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Vector3 & Vector3::setX( const floatInVec &_x )
-{
-    mVec128 = _vmathVfInsert(mVec128, _x.get128(), 0);
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE const floatInVec Vector3::getX( ) const
-{
-    return floatInVec( mVec128, 0 );
-}
-
-VECTORMATH_FORCE_INLINE Vector3 & Vector3::setY( float _y )
-{
-    _vmathVfSetElement(mVec128, _y, 1);
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Vector3 & Vector3::setY( const floatInVec &_y )
-{
-    mVec128 = _vmathVfInsert(mVec128, _y.get128(), 1);
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE const floatInVec Vector3::getY( ) const
-{
-    return floatInVec( mVec128, 1 );
-}
-
-VECTORMATH_FORCE_INLINE Vector3 & Vector3::setZ( float _z )
-{
-    _vmathVfSetElement(mVec128, _z, 2);
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Vector3 & Vector3::setZ( const floatInVec &_z )
-{
-    mVec128 = _vmathVfInsert(mVec128, _z.get128(), 2);
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE const floatInVec Vector3::getZ( ) const
-{
-    return floatInVec( mVec128, 2 );
-}
-
-VECTORMATH_FORCE_INLINE Vector3 & Vector3::setElem( int idx, float value )
-{
-    _vmathVfSetElement(mVec128, value, idx);
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Vector3 & Vector3::setElem( int idx, const floatInVec &value )
-{
-    mVec128 = _vmathVfInsert(mVec128, value.get128(), idx);
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE const floatInVec Vector3::getElem( int idx ) const
-{
-    return floatInVec( mVec128, idx );
-}
-
-VECTORMATH_FORCE_INLINE VecIdx Vector3::operator []( int idx )
-{
-    return VecIdx( mVec128, idx );
-}
-
-VECTORMATH_FORCE_INLINE const floatInVec Vector3::operator []( int idx ) const
-{
-    return floatInVec( mVec128, idx );
-}
-
-VECTORMATH_FORCE_INLINE const Vector3 Vector3::operator +( const Vector3 &vec ) const
-{
-    return Vector3( _mm_add_ps( mVec128, vec.mVec128 ) );
-}
-
-VECTORMATH_FORCE_INLINE const Vector3 Vector3::operator -( const Vector3 &vec ) const
-{
-    return Vector3( _mm_sub_ps( mVec128, vec.mVec128 ) );
-}
-
-VECTORMATH_FORCE_INLINE const Point3 Vector3::operator +( const Point3 &pnt ) const
-{
-    return Point3( _mm_add_ps( mVec128, pnt.get128() ) );
-}
-
-VECTORMATH_FORCE_INLINE const Vector3 Vector3::operator *( float scalar ) const
-{
-    return *this * floatInVec(scalar);
-}
-
-VECTORMATH_FORCE_INLINE const Vector3 Vector3::operator *( const floatInVec &scalar ) const
-{
-    return Vector3( _mm_mul_ps( mVec128, scalar.get128() ) );
-}
-
-VECTORMATH_FORCE_INLINE Vector3 & Vector3::operator +=( const Vector3 &vec )
-{
-    *this = *this + vec;
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Vector3 & Vector3::operator -=( const Vector3 &vec )
-{
-    *this = *this - vec;
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Vector3 & Vector3::operator *=( float scalar )
-{
-    *this = *this * scalar;
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Vector3 & Vector3::operator *=( const floatInVec &scalar )
-{
-    *this = *this * scalar;
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE const Vector3 Vector3::operator /( float scalar ) const
-{
-    return *this / floatInVec(scalar);
-}
-
-VECTORMATH_FORCE_INLINE const Vector3 Vector3::operator /( const floatInVec &scalar ) const
-{
-    return Vector3( _mm_div_ps( mVec128, scalar.get128() ) );
-}
-
-VECTORMATH_FORCE_INLINE Vector3 & Vector3::operator /=( float scalar )
-{
-    *this = *this / scalar;
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Vector3 & Vector3::operator /=( const floatInVec &scalar )
-{
-    *this = *this / scalar;
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE const Vector3 Vector3::operator -( ) const
-{
-	//return Vector3(_mm_sub_ps( _mm_setzero_ps(), mVec128 ) );
-
-	VM_ATTRIBUTE_ALIGN16 static const int array[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
-	__m128 NEG_MASK = SSEFloat(*(const vec_float4*)array).vf;
-	return Vector3(_mm_xor_ps(get128(),NEG_MASK));
-}
-
-VECTORMATH_FORCE_INLINE const Vector3 operator *( float scalar, const Vector3 &vec )
-{
-    return floatInVec(scalar) * vec;
-}
-
-VECTORMATH_FORCE_INLINE const Vector3 operator *( const floatInVec &scalar, const Vector3 &vec )
-{
-    return vec * scalar;
-}
-
-VECTORMATH_FORCE_INLINE const Vector3 mulPerElem( const Vector3 &vec0, const Vector3 &vec1 )
-{
-    return Vector3( _mm_mul_ps( vec0.get128(), vec1.get128() ) );
-}
-
-VECTORMATH_FORCE_INLINE const Vector3 divPerElem( const Vector3 &vec0, const Vector3 &vec1 )
-{
-    return Vector3( _mm_div_ps( vec0.get128(), vec1.get128() ) );
-}
-
-VECTORMATH_FORCE_INLINE const Vector3 recipPerElem( const Vector3 &vec )
-{
-    return Vector3( _mm_rcp_ps( vec.get128() ) );
-}
-
-VECTORMATH_FORCE_INLINE const Vector3 absPerElem( const Vector3 &vec )
-{
-    return Vector3( fabsf4( vec.get128() ) );
-}
-
-VECTORMATH_FORCE_INLINE const Vector3 copySignPerElem( const Vector3 &vec0, const Vector3 &vec1 )
-{
-	__m128 vmask = toM128(0x7fffffff);
-	return Vector3( _mm_or_ps(
-		_mm_and_ps   ( vmask, vec0.get128() ),			// Value
-		_mm_andnot_ps( vmask, vec1.get128() ) ) );		// Signs
-}
-
-VECTORMATH_FORCE_INLINE const Vector3 maxPerElem( const Vector3 &vec0, const Vector3 &vec1 )
-{
-    return Vector3( _mm_max_ps( vec0.get128(), vec1.get128() ) );
-}
-
-VECTORMATH_FORCE_INLINE const floatInVec maxElem( const Vector3 &vec )
-{
-    return floatInVec( _mm_max_ps( _mm_max_ps( vec_splat( vec.get128(), 0 ), vec_splat( vec.get128(), 1 ) ), vec_splat( vec.get128(), 2 ) ) );
-}
-
-VECTORMATH_FORCE_INLINE const Vector3 minPerElem( const Vector3 &vec0, const Vector3 &vec1 )
-{
-    return Vector3( _mm_min_ps( vec0.get128(), vec1.get128() ) );
-}
-
-VECTORMATH_FORCE_INLINE const floatInVec minElem( const Vector3 &vec )
-{
-    return floatInVec( _mm_min_ps( _mm_min_ps( vec_splat( vec.get128(), 0 ), vec_splat( vec.get128(), 1 ) ), vec_splat( vec.get128(), 2 ) ) );
-}
-
-VECTORMATH_FORCE_INLINE const floatInVec sum( const Vector3 &vec )
-{
-    return floatInVec( _mm_add_ps( _mm_add_ps( vec_splat( vec.get128(), 0 ), vec_splat( vec.get128(), 1 ) ), vec_splat( vec.get128(), 2 ) ) );
-}
-
-VECTORMATH_FORCE_INLINE const floatInVec dot( const Vector3 &vec0, const Vector3 &vec1 )
-{
-    return floatInVec( _vmathVfDot3( vec0.get128(), vec1.get128() ), 0 );
-}
-
-VECTORMATH_FORCE_INLINE const floatInVec lengthSqr( const Vector3 &vec )
-{
-    return floatInVec(  _vmathVfDot3( vec.get128(), vec.get128() ), 0 );
-}
-
-VECTORMATH_FORCE_INLINE const floatInVec length( const Vector3 &vec )
-{
-    return floatInVec(  _mm_sqrt_ps(_vmathVfDot3( vec.get128(), vec.get128() )), 0 );
-}
-
-
-VECTORMATH_FORCE_INLINE const Vector3 normalizeApprox( const Vector3 &vec )
-{
-    return Vector3( _mm_mul_ps( vec.get128(), _mm_rsqrt_ps( _vmathVfDot3( vec.get128(), vec.get128() ) ) ) );
-}
-
-VECTORMATH_FORCE_INLINE const Vector3 normalize( const Vector3 &vec )
-{
-	return Vector3( _mm_mul_ps( vec.get128(), newtonrapson_rsqrt4( _vmathVfDot3( vec.get128(), vec.get128() ) ) ) );
-}
-
-VECTORMATH_FORCE_INLINE const Vector3 cross( const Vector3 &vec0, const Vector3 &vec1 )
-{
-    return Vector3( _vmathVfCross( vec0.get128(), vec1.get128() ) );
-}
-
-VECTORMATH_FORCE_INLINE const Vector3 select( const Vector3 &vec0, const Vector3 &vec1, bool select1 )
-{
-    return select( vec0, vec1, boolInVec(select1) );
-}
-
-
-VECTORMATH_FORCE_INLINE  const Vector4 select(const Vector4& vec0, const Vector4& vec1, const boolInVec& select1)
-{
-    return Vector4(vec_sel(vec0.get128(), vec1.get128(), select1.get128()));
-}
-
-#ifdef _VECTORMATH_DEBUG
-
-VECTORMATH_FORCE_INLINE void print( const Vector3 &vec )
-{
-    union { __m128 v; float s[4]; } tmp;
-    tmp.v = vec.get128();
-    printf( "( %f %f %f )\n", tmp.s[0], tmp.s[1], tmp.s[2] );
-}
-
-VECTORMATH_FORCE_INLINE void print( const Vector3 &vec, const char * name )
-{
-    union { __m128 v; float s[4]; } tmp;
-    tmp.v = vec.get128();
-    printf( "%s: ( %f %f %f )\n", name, tmp.s[0], tmp.s[1], tmp.s[2] );
-}
-
-#endif
-
-VECTORMATH_FORCE_INLINE Vector4::Vector4( float _x, float _y, float _z, float _w )
-{
-    mVec128 = _mm_setr_ps(_x, _y, _z, _w); 
- }
-
-VECTORMATH_FORCE_INLINE Vector4::Vector4( const floatInVec &_x, const floatInVec &_y, const floatInVec &_z, const floatInVec &_w )
-{
-	mVec128 = _mm_unpacklo_ps(
-		_mm_unpacklo_ps( _x.get128(), _z.get128() ),
-		_mm_unpacklo_ps( _y.get128(), _w.get128() ) );
-}
-
-VECTORMATH_FORCE_INLINE Vector4::Vector4( const Vector3 &xyz, float _w )
-{
-    mVec128 = xyz.get128();
-    _vmathVfSetElement(mVec128, _w, 3);
-}
-
-VECTORMATH_FORCE_INLINE Vector4::Vector4( const Vector3 &xyz, const floatInVec &_w )
-{
-    mVec128 = xyz.get128();
-    mVec128 = _vmathVfInsert(mVec128, _w.get128(), 3);
-}
-
-VECTORMATH_FORCE_INLINE Vector4::Vector4( const Vector3 &vec )
-{
-    mVec128 = vec.get128();
-    mVec128 = _vmathVfInsert(mVec128, _mm_setzero_ps(), 3);
-}
-
-VECTORMATH_FORCE_INLINE Vector4::Vector4( const Point3 &pnt )
-{
-    mVec128 = pnt.get128();
-    mVec128 = _vmathVfInsert(mVec128, _mm_set1_ps(1.0f), 3);
-}
-
-VECTORMATH_FORCE_INLINE Vector4::Vector4( const Quat &quat )
-{
-    mVec128 = quat.get128();
-}
-
-VECTORMATH_FORCE_INLINE Vector4::Vector4( float scalar )
-{
-    mVec128 = floatInVec(scalar).get128();
-}
-
-VECTORMATH_FORCE_INLINE Vector4::Vector4( const floatInVec &scalar )
-{
-    mVec128 = scalar.get128();
-}
-
-VECTORMATH_FORCE_INLINE Vector4::Vector4( __m128 vf4 )
-{
-    mVec128 = vf4;
-}
-
-VECTORMATH_FORCE_INLINE const Vector4 Vector4::xAxis( )
-{
-    return Vector4( _VECTORMATH_UNIT_1000 );
-}
-
-VECTORMATH_FORCE_INLINE const Vector4 Vector4::yAxis( )
-{
-    return Vector4( _VECTORMATH_UNIT_0100 );
-}
-
-VECTORMATH_FORCE_INLINE const Vector4 Vector4::zAxis( )
-{
-    return Vector4( _VECTORMATH_UNIT_0010 );
-}
-
-VECTORMATH_FORCE_INLINE const Vector4 Vector4::wAxis( )
-{
-    return Vector4( _VECTORMATH_UNIT_0001 );
-}
-
-VECTORMATH_FORCE_INLINE const Vector4 lerp( float t, const Vector4 &vec0, const Vector4 &vec1 )
-{
-    return lerp( floatInVec(t), vec0, vec1 );
-}
-
-VECTORMATH_FORCE_INLINE const Vector4 lerp( const floatInVec &t, const Vector4 &vec0, const Vector4 &vec1 )
-{
-    return ( vec0 + ( ( vec1 - vec0 ) * t ) );
-}
-
-VECTORMATH_FORCE_INLINE const Vector4 slerp( float t, const Vector4 &unitVec0, const Vector4 &unitVec1 )
-{
-    return slerp( floatInVec(t), unitVec0, unitVec1 );
-}
-
-VECTORMATH_FORCE_INLINE const Vector4 slerp( const floatInVec &t, const Vector4 &unitVec0, const Vector4 &unitVec1 )
-{
-    __m128 scales, scale0, scale1, cosAngle, angle, tttt, oneMinusT, angles, sines;
-    cosAngle = _vmathVfDot4( unitVec0.get128(), unitVec1.get128() );
-    __m128 selectMask = _mm_cmpgt_ps( _mm_set1_ps(_VECTORMATH_SLERP_TOL), cosAngle );
-    angle = acosf4( cosAngle );
-    tttt = t.get128();
-    oneMinusT = _mm_sub_ps( _mm_set1_ps(1.0f), tttt );
-    angles = _mm_unpacklo_ps( _mm_set1_ps(1.0f), tttt ); // angles = 1, t, 1, t
-    angles = _mm_unpacklo_ps( angles, oneMinusT );		// angles = 1, 1-t, t, 1-t
-    angles = _mm_mul_ps( angles, angle );
-    sines = sinf4( angles );
-    scales = _mm_div_ps( sines, vec_splat( sines, 0 ) );
-    scale0 = vec_sel( oneMinusT, vec_splat( scales, 1 ), selectMask );
-    scale1 = vec_sel( tttt, vec_splat( scales, 2 ), selectMask );
-    return Vector4( vec_madd( unitVec0.get128(), scale0, _mm_mul_ps( unitVec1.get128(), scale1 ) ) );
-}
-
-VECTORMATH_FORCE_INLINE __m128 Vector4::get128( ) const
-{
-    return mVec128;
-}
-/*
-VECTORMATH_FORCE_INLINE void storeHalfFloats( const Vector4 &vec0, const Vector4 &vec1, const Vector4 &vec2, const Vector4 &vec3, vec_ushort8 * twoQuads )
-{
-    twoQuads[0] = _vmath2VfToHalfFloats(vec0.get128(), vec1.get128());
-    twoQuads[1] = _vmath2VfToHalfFloats(vec2.get128(), vec3.get128());
-}
-*/
-VECTORMATH_FORCE_INLINE Vector4 & Vector4::operator =( const Vector4 &vec )
-{
-    mVec128 = vec.mVec128;
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Vector4 & Vector4::setXYZ( const Vector3 &vec )
-{
-	VM_ATTRIBUTE_ALIGN16 unsigned int sw[4] = {0, 0, 0, 0xffffffff};
-	mVec128 = vec_sel( vec.get128(), mVec128, sw );
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE const Vector3 Vector4::getXYZ( ) const
-{
-    return Vector3( mVec128 );
-}
-
-VECTORMATH_FORCE_INLINE Vector4 & Vector4::setX( float _x )
-{
-    _vmathVfSetElement(mVec128, _x, 0);
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Vector4 & Vector4::setX( const floatInVec &_x )
-{
-    mVec128 = _vmathVfInsert(mVec128, _x.get128(), 0);
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE const floatInVec Vector4::getX( ) const
-{
-    return floatInVec( mVec128, 0 );
-}
-
-VECTORMATH_FORCE_INLINE Vector4 & Vector4::setY( float _y )
-{
-    _vmathVfSetElement(mVec128, _y, 1);
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Vector4 & Vector4::setY( const floatInVec &_y )
-{
-    mVec128 = _vmathVfInsert(mVec128, _y.get128(), 1);
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE const floatInVec Vector4::getY( ) const
-{
-    return floatInVec( mVec128, 1 );
-}
-
-VECTORMATH_FORCE_INLINE Vector4 & Vector4::setZ( float _z )
-{
-    _vmathVfSetElement(mVec128, _z, 2);
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Vector4 & Vector4::setZ( const floatInVec &_z )
-{
-    mVec128 = _vmathVfInsert(mVec128, _z.get128(), 2);
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE const floatInVec Vector4::getZ( ) const
-{
-    return floatInVec( mVec128, 2 );
-}
-
-VECTORMATH_FORCE_INLINE Vector4 & Vector4::setW( float _w )
-{
-    _vmathVfSetElement(mVec128, _w, 3);
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Vector4 & Vector4::setW( const floatInVec &_w )
-{
-    mVec128 = _vmathVfInsert(mVec128, _w.get128(), 3);
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE const floatInVec Vector4::getW( ) const
-{
-    return floatInVec( mVec128, 3 );
-}
-
-VECTORMATH_FORCE_INLINE Vector4 & Vector4::setElem( int idx, float value )
-{
-    _vmathVfSetElement(mVec128, value, idx);
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Vector4 & Vector4::setElem( int idx, const floatInVec &value )
-{
-    mVec128 = _vmathVfInsert(mVec128, value.get128(), idx);
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE const floatInVec Vector4::getElem( int idx ) const
-{
-    return floatInVec( mVec128, idx );
-}
-
-VECTORMATH_FORCE_INLINE VecIdx Vector4::operator []( int idx )
-{
-    return VecIdx( mVec128, idx );
-}
-
-VECTORMATH_FORCE_INLINE const floatInVec Vector4::operator []( int idx ) const
-{
-    return floatInVec( mVec128, idx );
-}
-
-VECTORMATH_FORCE_INLINE const Vector4 Vector4::operator +( const Vector4 &vec ) const
-{
-    return Vector4( _mm_add_ps( mVec128, vec.mVec128 ) );
-}
-
-VECTORMATH_FORCE_INLINE const Vector4 Vector4::operator -( const Vector4 &vec ) const
-{
-    return Vector4( _mm_sub_ps( mVec128, vec.mVec128 ) );
-}
-
-VECTORMATH_FORCE_INLINE const Vector4 Vector4::operator *( float scalar ) const
-{
-    return *this * floatInVec(scalar);
-}
-
-VECTORMATH_FORCE_INLINE const Vector4 Vector4::operator *( const floatInVec &scalar ) const
-{
-    return Vector4( _mm_mul_ps( mVec128, scalar.get128() ) );
-}
-
-VECTORMATH_FORCE_INLINE Vector4 & Vector4::operator +=( const Vector4 &vec )
-{
-    *this = *this + vec;
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Vector4 & Vector4::operator -=( const Vector4 &vec )
-{
-    *this = *this - vec;
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Vector4 & Vector4::operator *=( float scalar )
-{
-    *this = *this * scalar;
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Vector4 & Vector4::operator *=( const floatInVec &scalar )
-{
-    *this = *this * scalar;
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE const Vector4 Vector4::operator /( float scalar ) const
-{
-    return *this / floatInVec(scalar);
-}
-
-VECTORMATH_FORCE_INLINE const Vector4 Vector4::operator /( const floatInVec &scalar ) const
-{
-    return Vector4( _mm_div_ps( mVec128, scalar.get128() ) );
-}
-
-VECTORMATH_FORCE_INLINE Vector4 & Vector4::operator /=( float scalar )
-{
-    *this = *this / scalar;
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Vector4 & Vector4::operator /=( const floatInVec &scalar )
-{
-    *this = *this / scalar;
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE const Vector4 Vector4::operator -( ) const
-{
-	return Vector4(_mm_sub_ps( _mm_setzero_ps(), mVec128 ) );
-}
-
-VECTORMATH_FORCE_INLINE const Vector4 operator *( float scalar, const Vector4 &vec )
-{
-    return floatInVec(scalar) * vec;
-}
-
-VECTORMATH_FORCE_INLINE const Vector4 operator *( const floatInVec &scalar, const Vector4 &vec )
-{
-    return vec * scalar;
-}
-
-VECTORMATH_FORCE_INLINE const Vector4 mulPerElem( const Vector4 &vec0, const Vector4 &vec1 )
-{
-    return Vector4( _mm_mul_ps( vec0.get128(), vec1.get128() ) );
-}
-
-VECTORMATH_FORCE_INLINE const Vector4 divPerElem( const Vector4 &vec0, const Vector4 &vec1 )
-{
-    return Vector4( _mm_div_ps( vec0.get128(), vec1.get128() ) );
-}
-
-VECTORMATH_FORCE_INLINE const Vector4 recipPerElem( const Vector4 &vec )
-{
-    return Vector4( _mm_rcp_ps( vec.get128() ) );
-}
-
-VECTORMATH_FORCE_INLINE const Vector4 absPerElem( const Vector4 &vec )
-{
-    return Vector4( fabsf4( vec.get128() ) );
-}
-
-VECTORMATH_FORCE_INLINE const Vector4 copySignPerElem( const Vector4 &vec0, const Vector4 &vec1 )
-{
-	__m128 vmask = toM128(0x7fffffff);
-	return Vector4( _mm_or_ps(
-		_mm_and_ps   ( vmask, vec0.get128() ),			// Value
-		_mm_andnot_ps( vmask, vec1.get128() ) ) );		// Signs
-}
-
-VECTORMATH_FORCE_INLINE const Vector4 maxPerElem( const Vector4 &vec0, const Vector4 &vec1 )
-{
-    return Vector4( _mm_max_ps( vec0.get128(), vec1.get128() ) );
-}
-
-VECTORMATH_FORCE_INLINE const floatInVec maxElem( const Vector4 &vec )
-{
-    return floatInVec( _mm_max_ps(
-		_mm_max_ps( vec_splat( vec.get128(), 0 ), vec_splat( vec.get128(), 1 ) ),
-		_mm_max_ps( vec_splat( vec.get128(), 2 ), vec_splat( vec.get128(), 3 ) ) ) );
-}
-
-VECTORMATH_FORCE_INLINE const Vector4 minPerElem( const Vector4 &vec0, const Vector4 &vec1 )
-{
-    return Vector4( _mm_min_ps( vec0.get128(), vec1.get128() ) );
-}
-
-VECTORMATH_FORCE_INLINE const floatInVec minElem( const Vector4 &vec )
-{
-    return floatInVec( _mm_min_ps(
-		_mm_min_ps( vec_splat( vec.get128(), 0 ), vec_splat( vec.get128(), 1 ) ),
-		_mm_min_ps( vec_splat( vec.get128(), 2 ), vec_splat( vec.get128(), 3 ) ) ) );
-}
-
-VECTORMATH_FORCE_INLINE const floatInVec sum( const Vector4 &vec )
-{
-    return floatInVec( _mm_add_ps(
-		_mm_add_ps( vec_splat( vec.get128(), 0 ), vec_splat( vec.get128(), 1 ) ),
-		_mm_add_ps( vec_splat( vec.get128(), 2 ), vec_splat( vec.get128(), 3 ) ) ) );
-}
-
-VECTORMATH_FORCE_INLINE const floatInVec dot( const Vector4 &vec0, const Vector4 &vec1 )
-{
-    return floatInVec( _vmathVfDot4( vec0.get128(), vec1.get128() ), 0 );
-}
-
-VECTORMATH_FORCE_INLINE const floatInVec lengthSqr( const Vector4 &vec )
-{
-    return floatInVec(  _vmathVfDot4( vec.get128(), vec.get128() ), 0 );
-}
-
-VECTORMATH_FORCE_INLINE const floatInVec length( const Vector4 &vec )
-{
-    return floatInVec(  _mm_sqrt_ps(_vmathVfDot4( vec.get128(), vec.get128() )), 0 );
-}
-
-VECTORMATH_FORCE_INLINE const Vector4 normalizeApprox( const Vector4 &vec )
-{
-    return Vector4( _mm_mul_ps( vec.get128(), _mm_rsqrt_ps( _vmathVfDot4( vec.get128(), vec.get128() ) ) ) );
-}
-
-VECTORMATH_FORCE_INLINE const Vector4 normalize( const Vector4 &vec )
-{
-    return Vector4( _mm_mul_ps( vec.get128(), newtonrapson_rsqrt4( _vmathVfDot4( vec.get128(), vec.get128() ) ) ) );
-}
-
-VECTORMATH_FORCE_INLINE const Vector4 select( const Vector4 &vec0, const Vector4 &vec1, bool select1 )
-{
-    return select( vec0, vec1, boolInVec(select1) );
-}
-
-
-#ifdef _VECTORMATH_DEBUG
-
-VECTORMATH_FORCE_INLINE void print( const Vector4 &vec )
-{
-    union { __m128 v; float s[4]; } tmp;
-    tmp.v = vec.get128();
-    printf( "( %f %f %f %f )\n", tmp.s[0], tmp.s[1], tmp.s[2], tmp.s[3] );
-}
-
-VECTORMATH_FORCE_INLINE void print( const Vector4 &vec, const char * name )
-{
-    union { __m128 v; float s[4]; } tmp;
-    tmp.v = vec.get128();
-    printf( "%s: ( %f %f %f %f )\n", name, tmp.s[0], tmp.s[1], tmp.s[2], tmp.s[3] );
-}
-
-#endif
-
-VECTORMATH_FORCE_INLINE Point3::Point3( float _x, float _y, float _z )
-{
-    mVec128 = _mm_setr_ps(_x, _y, _z, 0.0f);
-}
-
-VECTORMATH_FORCE_INLINE Point3::Point3( const floatInVec &_x, const floatInVec &_y, const floatInVec &_z )
-{
-	mVec128 = _mm_unpacklo_ps( _mm_unpacklo_ps( _x.get128(), _z.get128() ), _y.get128() );
-}
-
-VECTORMATH_FORCE_INLINE Point3::Point3( const Vector3 &vec )
-{
-    mVec128 = vec.get128();
-}
-
-VECTORMATH_FORCE_INLINE Point3::Point3( float scalar )
-{
-    mVec128 = floatInVec(scalar).get128();
-}
-
-VECTORMATH_FORCE_INLINE Point3::Point3( const floatInVec &scalar )
-{
-    mVec128 = scalar.get128();
-}
-
-VECTORMATH_FORCE_INLINE Point3::Point3( __m128 vf4 )
-{
-    mVec128 = vf4;
-}
-
-VECTORMATH_FORCE_INLINE const Point3 lerp( float t, const Point3 &pnt0, const Point3 &pnt1 )
-{
-    return lerp( floatInVec(t), pnt0, pnt1 );
-}
-
-VECTORMATH_FORCE_INLINE const Point3 lerp( const floatInVec &t, const Point3 &pnt0, const Point3 &pnt1 )
-{
-    return ( pnt0 + ( ( pnt1 - pnt0 ) * t ) );
-}
-
-VECTORMATH_FORCE_INLINE __m128 Point3::get128( ) const
-{
-    return mVec128;
-}
-
-VECTORMATH_FORCE_INLINE void storeXYZ( const Point3 &pnt, __m128 * quad )
-{
-    __m128 dstVec = *quad;
-	VM_ATTRIBUTE_ALIGN16 unsigned int sw[4] = {0, 0, 0, 0xffffffff}; // TODO: Centralize
-    dstVec = vec_sel(pnt.get128(), dstVec, sw);
-    *quad = dstVec;
-}
-
-VECTORMATH_FORCE_INLINE void loadXYZArray( Point3 & pnt0, Point3 & pnt1, Point3 & pnt2, Point3 & pnt3, const __m128 * threeQuads )
-{
-	const float *quads = (float *)threeQuads;
-    pnt0 = Point3(  _mm_load_ps(quads) );
-    pnt1 = Point3( _mm_loadu_ps(quads + 3) );
-    pnt2 = Point3( _mm_loadu_ps(quads + 6) );
-    pnt3 = Point3( _mm_loadu_ps(quads + 9) );
-}
-
-VECTORMATH_FORCE_INLINE void storeXYZArray( const Point3 &pnt0, const Point3 &pnt1, const Point3 &pnt2, const Point3 &pnt3, __m128 * threeQuads )
-{
-	__m128 xxxx = _mm_shuffle_ps( pnt1.get128(), pnt1.get128(), _MM_SHUFFLE(0, 0, 0, 0) );
-	__m128 zzzz = _mm_shuffle_ps( pnt2.get128(), pnt2.get128(), _MM_SHUFFLE(2, 2, 2, 2) );
-	VM_ATTRIBUTE_ALIGN16 unsigned int xsw[4] = {0, 0, 0, 0xffffffff};
-	VM_ATTRIBUTE_ALIGN16 unsigned int zsw[4] = {0xffffffff, 0, 0, 0};
-	threeQuads[0] = vec_sel( pnt0.get128(), xxxx, xsw );
-    threeQuads[1] = _mm_shuffle_ps( pnt1.get128(), pnt2.get128(), _MM_SHUFFLE(1, 0, 2, 1) );
-    threeQuads[2] = vec_sel( _mm_shuffle_ps( pnt3.get128(), pnt3.get128(), _MM_SHUFFLE(2, 1, 0, 3) ), zzzz, zsw );
-}
-/*
-VECTORMATH_FORCE_INLINE void storeHalfFloats( const Point3 &pnt0, const Point3 &pnt1, const Point3 &pnt2, const Point3 &pnt3, const Point3 &pnt4, const Point3 &pnt5, const Point3 &pnt6, const Point3 &pnt7, vec_ushort8 * threeQuads )
-{
-#if 0
-    __m128 xyz0[3];
-    __m128 xyz1[3];
-    storeXYZArray( pnt0, pnt1, pnt2, pnt3, xyz0 );
-    storeXYZArray( pnt4, pnt5, pnt6, pnt7, xyz1 );
-    threeQuads[0] = _vmath2VfToHalfFloats(xyz0[0], xyz0[1]);
-    threeQuads[1] = _vmath2VfToHalfFloats(xyz0[2], xyz1[0]);
-    threeQuads[2] = _vmath2VfToHalfFloats(xyz1[1], xyz1[2]);
-#else
-	assert(0);
-#endif
-}
-*/
-VECTORMATH_FORCE_INLINE Point3 & Point3::operator =( const Point3 &pnt )
-{
-    mVec128 = pnt.mVec128;
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Point3 & Point3::setX( float _x )
-{
-    _vmathVfSetElement(mVec128, _x, 0);
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Point3 & Point3::setX( const floatInVec &_x )
-{
-    mVec128 = _vmathVfInsert(mVec128, _x.get128(), 0);
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE const floatInVec Point3::getX( ) const
-{
-    return floatInVec( mVec128, 0 );
-}
-
-VECTORMATH_FORCE_INLINE Point3 & Point3::setY( float _y )
-{
-    _vmathVfSetElement(mVec128, _y, 1);
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Point3 & Point3::setY( const floatInVec &_y )
-{
-    mVec128 = _vmathVfInsert(mVec128, _y.get128(), 1);
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE const floatInVec Point3::getY( ) const
-{
-    return floatInVec( mVec128, 1 );
-}
-
-VECTORMATH_FORCE_INLINE Point3 & Point3::setZ( float _z )
-{
-    _vmathVfSetElement(mVec128, _z, 2);
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Point3 & Point3::setZ( const floatInVec &_z )
-{
-    mVec128 = _vmathVfInsert(mVec128, _z.get128(), 2);
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE const floatInVec Point3::getZ( ) const
-{
-    return floatInVec( mVec128, 2 );
-}
-
-VECTORMATH_FORCE_INLINE Point3 & Point3::setElem( int idx, float value )
-{
-    _vmathVfSetElement(mVec128, value, idx);
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Point3 & Point3::setElem( int idx, const floatInVec &value )
-{
-    mVec128 = _vmathVfInsert(mVec128, value.get128(), idx);
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE const floatInVec Point3::getElem( int idx ) const
-{
-    return floatInVec( mVec128, idx );
-}
-
-VECTORMATH_FORCE_INLINE VecIdx Point3::operator []( int idx )
-{
-    return VecIdx( mVec128, idx );
-}
-
-VECTORMATH_FORCE_INLINE const floatInVec Point3::operator []( int idx ) const
-{
-    return floatInVec( mVec128, idx );
-}
-
-VECTORMATH_FORCE_INLINE const Vector3 Point3::operator -( const Point3 &pnt ) const
-{
-    return Vector3( _mm_sub_ps( mVec128, pnt.mVec128 ) );
-}
-
-VECTORMATH_FORCE_INLINE const Point3 Point3::operator +( const Vector3 &vec ) const
-{
-    return Point3( _mm_add_ps( mVec128, vec.get128() ) );
-}
-
-VECTORMATH_FORCE_INLINE const Point3 Point3::operator -( const Vector3 &vec ) const
-{
-    return Point3( _mm_sub_ps( mVec128, vec.get128() ) );
-}
-
-VECTORMATH_FORCE_INLINE Point3 & Point3::operator +=( const Vector3 &vec )
-{
-    *this = *this + vec;
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE Point3 & Point3::operator -=( const Vector3 &vec )
-{
-    *this = *this - vec;
-    return *this;
-}
-
-VECTORMATH_FORCE_INLINE const Point3 mulPerElem( const Point3 &pnt0, const Point3 &pnt1 )
-{
-    return Point3( _mm_mul_ps( pnt0.get128(), pnt1.get128() ) );
-}
-
-VECTORMATH_FORCE_INLINE const Point3 divPerElem( const Point3 &pnt0, const Point3 &pnt1 )
-{
-    return Point3( _mm_div_ps( pnt0.get128(), pnt1.get128() ) );
-}
-
-VECTORMATH_FORCE_INLINE const Point3 recipPerElem( const Point3 &pnt )
-{
-    return Point3( _mm_rcp_ps( pnt.get128() ) );
-}
-
-VECTORMATH_FORCE_INLINE const Point3 absPerElem( const Point3 &pnt )
-{
-    return Point3( fabsf4( pnt.get128() ) );
-}
-
-VECTORMATH_FORCE_INLINE const Point3 copySignPerElem( const Point3 &pnt0, const Point3 &pnt1 )
-{
-	__m128 vmask = toM128(0x7fffffff);
-	return Point3( _mm_or_ps(
-		_mm_and_ps   ( vmask, pnt0.get128() ),			// Value
-		_mm_andnot_ps( vmask, pnt1.get128() ) ) );		// Signs
-}
-
-VECTORMATH_FORCE_INLINE const Point3 maxPerElem( const Point3 &pnt0, const Point3 &pnt1 )
-{
-    return Point3( _mm_max_ps( pnt0.get128(), pnt1.get128() ) );
-}
-
-VECTORMATH_FORCE_INLINE const floatInVec maxElem( const Point3 &pnt )
-{
-    return floatInVec( _mm_max_ps( _mm_max_ps( vec_splat( pnt.get128(), 0 ), vec_splat( pnt.get128(), 1 ) ), vec_splat( pnt.get128(), 2 ) ) );
-}
-
-VECTORMATH_FORCE_INLINE const Point3 minPerElem( const Point3 &pnt0, const Point3 &pnt1 )
-{
-    return Point3( _mm_min_ps( pnt0.get128(), pnt1.get128() ) );
-}
-
-VECTORMATH_FORCE_INLINE const floatInVec minElem( const Point3 &pnt )
-{
-    return floatInVec( _mm_min_ps( _mm_min_ps( vec_splat( pnt.get128(), 0 ), vec_splat( pnt.get128(), 1 ) ), vec_splat( pnt.get128(), 2 ) ) );
-}
-
-VECTORMATH_FORCE_INLINE const floatInVec sum( const Point3 &pnt )
-{
-    return floatInVec( _mm_add_ps( _mm_add_ps( vec_splat( pnt.get128(), 0 ), vec_splat( pnt.get128(), 1 ) ), vec_splat( pnt.get128(), 2 ) ) );
-}
-
-VECTORMATH_FORCE_INLINE const Point3 scale( const Point3 &pnt, float scaleVal )
-{
-    return scale( pnt, floatInVec( scaleVal ) );
-}
-
-VECTORMATH_FORCE_INLINE const Point3 scale( const Point3 &pnt, const floatInVec &scaleVal )
-{
-    return mulPerElem( pnt, Point3( scaleVal ) );
-}
-
-VECTORMATH_FORCE_INLINE const Point3 scale( const Point3 &pnt, const Vector3 &scaleVec )
-{
-    return mulPerElem( pnt, Point3( scaleVec ) );
-}
-
-VECTORMATH_FORCE_INLINE const floatInVec projection( const Point3 &pnt, const Vector3 &unitVec )
-{
-    return floatInVec( _vmathVfDot3( pnt.get128(), unitVec.get128() ), 0 );
-}
-
-VECTORMATH_FORCE_INLINE const floatInVec distSqrFromOrigin( const Point3 &pnt )
-{
-    return lengthSqr( Vector3( pnt ) );
-}
-
-VECTORMATH_FORCE_INLINE const floatInVec distFromOrigin( const Point3 &pnt )
-{
-    return length( Vector3( pnt ) );
-}
-
-VECTORMATH_FORCE_INLINE const floatInVec distSqr( const Point3 &pnt0, const Point3 &pnt1 )
-{
-    return lengthSqr( ( pnt1 - pnt0 ) );
-}
-
-VECTORMATH_FORCE_INLINE const floatInVec dist( const Point3 &pnt0, const Point3 &pnt1 )
-{
-    return length( ( pnt1 - pnt0 ) );
-}
-
-VECTORMATH_FORCE_INLINE const Point3 select( const Point3 &pnt0, const Point3 &pnt1, bool select1 )
-{
-    return select( pnt0, pnt1, boolInVec(select1) );
-}
-
-VECTORMATH_FORCE_INLINE const Point3 select( const Point3 &pnt0, const Point3 &pnt1, const boolInVec &select1 )
-{
-    return Point3( vec_sel( pnt0.get128(), pnt1.get128(), select1.get128() ) );
-}
-
-
-
-#ifdef _VECTORMATH_DEBUG
-
-VECTORMATH_FORCE_INLINE void print( const Point3 &pnt )
-{
-    union { __m128 v; float s[4]; } tmp;
-    tmp.v = pnt.get128();
-    printf( "( %f %f %f )\n", tmp.s[0], tmp.s[1], tmp.s[2] );
-}
-
-VECTORMATH_FORCE_INLINE void print( const Point3 &pnt, const char * name )
-{
-    union { __m128 v; float s[4]; } tmp;
-    tmp.v = pnt.get128();
-    printf( "%s: ( %f %f %f )\n", name, tmp.s[0], tmp.s[1], tmp.s[2] );
-}
-
-#endif
-
-} // namespace Aos
-} // namespace Vectormath
-
-#endif
+/*
+   Copyright (C) 2006, 2007 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+   Redistribution and use in source and binary forms,
+   with or without modification, are permitted provided that the
+   following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the Sony Computer Entertainment Inc nor the names
+      of its contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _VECTORMATH_VEC_AOS_CPP_H
+#define _VECTORMATH_VEC_AOS_CPP_H
+
+//-----------------------------------------------------------------------------
+// Constants
+// for permutes words are labeled [x,y,z,w] [a,b,c,d]
+
+#define _VECTORMATH_PERM_X 0x00010203
+#define _VECTORMATH_PERM_Y 0x04050607
+#define _VECTORMATH_PERM_Z 0x08090a0b
+#define _VECTORMATH_PERM_W 0x0c0d0e0f
+#define _VECTORMATH_PERM_A 0x10111213
+#define _VECTORMATH_PERM_B 0x14151617
+#define _VECTORMATH_PERM_C 0x18191a1b
+#define _VECTORMATH_PERM_D 0x1c1d1e1f
+#define _VECTORMATH_PERM_XYZA (vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_X, _VECTORMATH_PERM_Y, _VECTORMATH_PERM_Z, _VECTORMATH_PERM_A }
+#define _VECTORMATH_PERM_ZXYW (vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Z, _VECTORMATH_PERM_X, _VECTORMATH_PERM_Y, _VECTORMATH_PERM_W }
+#define _VECTORMATH_PERM_YZXW (vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Y, _VECTORMATH_PERM_Z, _VECTORMATH_PERM_X, _VECTORMATH_PERM_W }
+#define _VECTORMATH_PERM_YZAB (vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Y, _VECTORMATH_PERM_Z, _VECTORMATH_PERM_A, _VECTORMATH_PERM_B }
+#define _VECTORMATH_PERM_ZABC (vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Z, _VECTORMATH_PERM_A, _VECTORMATH_PERM_B, _VECTORMATH_PERM_C }
+#define _VECTORMATH_PERM_XYAW (vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_X, _VECTORMATH_PERM_Y, _VECTORMATH_PERM_A, _VECTORMATH_PERM_W }
+#define _VECTORMATH_PERM_XAZW (vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_X, _VECTORMATH_PERM_A, _VECTORMATH_PERM_Z, _VECTORMATH_PERM_W }
+#define _VECTORMATH_MASK_0xF000 (vec_uint4){ 0xffffffff, 0, 0, 0 }
+#define _VECTORMATH_MASK_0x0F00 (vec_uint4){ 0, 0xffffffff, 0, 0 }
+#define _VECTORMATH_MASK_0x00F0 (vec_uint4){ 0, 0, 0xffffffff, 0 }
+#define _VECTORMATH_MASK_0x000F (vec_uint4){ 0, 0, 0, 0xffffffff }
+#define _VECTORMATH_UNIT_1000 _mm_setr_ps(1.0f,0.0f,0.0f,0.0f) // (__m128){ 1.0f, 0.0f, 0.0f, 0.0f }
+#define _VECTORMATH_UNIT_0100 _mm_setr_ps(0.0f,1.0f,0.0f,0.0f) // (__m128){ 0.0f, 1.0f, 0.0f, 0.0f }
+#define _VECTORMATH_UNIT_0010 _mm_setr_ps(0.0f,0.0f,1.0f,0.0f) // (__m128){ 0.0f, 0.0f, 1.0f, 0.0f }
+#define _VECTORMATH_UNIT_0001 _mm_setr_ps(0.0f,0.0f,0.0f,1.0f) // (__m128){ 0.0f, 0.0f, 0.0f, 1.0f }
+#define _VECTORMATH_SLERP_TOL 0.999f
+//_VECTORMATH_SLERP_TOLF
+
+//-----------------------------------------------------------------------------
+// Definitions
+
+#ifndef _VECTORMATH_INTERNAL_FUNCTIONS
+#define _VECTORMATH_INTERNAL_FUNCTIONS
+
+#define     _vmath_shufps(a, b, immx, immy, immz, immw) _mm_shuffle_ps(a, b, _MM_SHUFFLE(immw, immz, immy, immx))
+static VECTORMATH_FORCE_INLINE __m128 _vmathVfDot3( __m128 vec0, __m128 vec1 )
+{
+	__m128 result = _mm_mul_ps( vec0, vec1);
+    return _mm_add_ps( vec_splat( result, 0 ), _mm_add_ps( vec_splat( result, 1 ), vec_splat( result, 2 ) ) );
+}
+
+static VECTORMATH_FORCE_INLINE __m128 _vmathVfDot4( __m128 vec0, __m128 vec1 )
+{
+    __m128 result = _mm_mul_ps(vec0, vec1);
+	return _mm_add_ps(_mm_shuffle_ps(result, result, _MM_SHUFFLE(0,0,0,0)),
+			_mm_add_ps(_mm_shuffle_ps(result, result, _MM_SHUFFLE(1,1,1,1)),
+			_mm_add_ps(_mm_shuffle_ps(result, result, _MM_SHUFFLE(2,2,2,2)), _mm_shuffle_ps(result, result, _MM_SHUFFLE(3,3,3,3)))));
+}
+
+static VECTORMATH_FORCE_INLINE __m128 _vmathVfCross( __m128 vec0, __m128 vec1 )
+{
+    __m128 tmp0, tmp1, tmp2, tmp3, result;
+    tmp0 = _mm_shuffle_ps( vec0, vec0, _MM_SHUFFLE(3,0,2,1) );
+    tmp1 = _mm_shuffle_ps( vec1, vec1, _MM_SHUFFLE(3,1,0,2) );
+    tmp2 = _mm_shuffle_ps( vec0, vec0, _MM_SHUFFLE(3,1,0,2) );
+    tmp3 = _mm_shuffle_ps( vec1, vec1, _MM_SHUFFLE(3,0,2,1) );
+    result = vec_mul( tmp0, tmp1 );
+    result = vec_nmsub( tmp2, tmp3, result );
+    return result;
+}
+/*
+static VECTORMATH_FORCE_INLINE vec_uint4 _vmathVfToHalfFloatsUnpacked(__m128 v)
+{
+#if 0
+    vec_int4 bexp;
+    vec_uint4 mant, sign, hfloat;
+    vec_uint4 notZero, isInf;
+    const vec_uint4 hfloatInf = (vec_uint4)(0x00007c00u);
+    const vec_uint4 mergeMant = (vec_uint4)(0x000003ffu);
+    const vec_uint4 mergeSign = (vec_uint4)(0x00008000u);
+
+    sign = vec_sr((vec_uint4)v, (vec_uint4)16);
+    mant = vec_sr((vec_uint4)v, (vec_uint4)13);
+    bexp = vec_and(vec_sr((vec_int4)v, (vec_uint4)23), (vec_int4)0xff);
+
+    notZero = (vec_uint4)vec_cmpgt(bexp, (vec_int4)112);
+    isInf = (vec_uint4)vec_cmpgt(bexp, (vec_int4)142);
+
+    bexp = _mm_add_ps(bexp, (vec_int4)-112);
+    bexp = vec_sl(bexp, (vec_uint4)10);
+
+    hfloat = vec_sel((vec_uint4)bexp, mant, mergeMant);
+    hfloat = vec_sel((vec_uint4)(0), hfloat, notZero);
+    hfloat = vec_sel(hfloat, hfloatInf, isInf);
+    hfloat = vec_sel(hfloat, sign, mergeSign);
+
+    return hfloat;
+#else
+	assert(0);
+	return _mm_setzero_ps();
+#endif
+}
+
+static VECTORMATH_FORCE_INLINE vec_ushort8 _vmath2VfToHalfFloats(__m128 u, __m128 v)
+{
+#if 0
+    vec_uint4 hfloat_u, hfloat_v;
+    const vec_uchar16 pack = (vec_uchar16){2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31};
+    hfloat_u = _vmathVfToHalfFloatsUnpacked(u);
+    hfloat_v = _vmathVfToHalfFloatsUnpacked(v);
+    return (vec_ushort8)vec_perm(hfloat_u, hfloat_v, pack);
+#else
+	assert(0);
+	return _mm_setzero_si128();
+#endif
+}
+*/
+
+static VECTORMATH_FORCE_INLINE __m128 _vmathVfInsert(__m128 dst, __m128 src, int slot)
+{
+	SSEFloat s;
+	s.m128 = src;
+	SSEFloat d;
+	d.m128 = dst;
+	d.f[slot] = s.f[slot];
+	return d.m128;
+}
+
+#define _vmathVfSetElement(vec, scalar, slot) ((float *)&(vec))[slot] = scalar
+
+static VECTORMATH_FORCE_INLINE __m128 _vmathVfSplatScalar(float scalar)
+{
+	return _mm_set1_ps(scalar);
+}
+
+#endif
+
+namespace Vectormath {
+namespace Aos {
+
+	
+#ifdef _VECTORMATH_NO_SCALAR_CAST
+VECTORMATH_FORCE_INLINE VecIdx::operator floatInVec() const
+{
+    return floatInVec(ref, i);
+}
+
+VECTORMATH_FORCE_INLINE float VecIdx::getAsFloat() const
+#else
+VECTORMATH_FORCE_INLINE VecIdx::operator float() const
+#endif
+{
+    return ((float *)&ref)[i];
+}
+
+VECTORMATH_FORCE_INLINE float VecIdx::operator =( float scalar )
+{
+    _vmathVfSetElement(ref, scalar, i);
+    return scalar;
+}
+
+VECTORMATH_FORCE_INLINE floatInVec VecIdx::operator =( const floatInVec &scalar )
+{
+    ref = _vmathVfInsert(ref, scalar.get128(), i);
+    return scalar;
+}
+
+VECTORMATH_FORCE_INLINE floatInVec VecIdx::operator =( const VecIdx& scalar )
+{
+    return *this = floatInVec(scalar.ref, scalar.i);
+}
+
+VECTORMATH_FORCE_INLINE floatInVec VecIdx::operator *=( float scalar )
+{
+    return *this *= floatInVec(scalar);
+}
+
+VECTORMATH_FORCE_INLINE floatInVec VecIdx::operator *=( const floatInVec &scalar )
+{
+    return *this = floatInVec(ref, i) * scalar;
+}
+
+VECTORMATH_FORCE_INLINE floatInVec VecIdx::operator /=( float scalar )
+{
+    return *this /= floatInVec(scalar);
+}
+
+inline floatInVec VecIdx::operator /=( const floatInVec &scalar )
+{
+    return *this = floatInVec(ref, i) / scalar;
+}
+
+VECTORMATH_FORCE_INLINE floatInVec VecIdx::operator +=( float scalar )
+{
+    return *this += floatInVec(scalar);
+}
+
+VECTORMATH_FORCE_INLINE floatInVec VecIdx::operator +=( const floatInVec &scalar )
+{
+    return *this = floatInVec(ref, i) + scalar;
+}
+
+VECTORMATH_FORCE_INLINE floatInVec VecIdx::operator -=( float scalar )
+{
+    return *this -= floatInVec(scalar);
+}
+
+VECTORMATH_FORCE_INLINE floatInVec VecIdx::operator -=( const floatInVec &scalar )
+{
+    return *this = floatInVec(ref, i) - scalar;
+}
+
+VECTORMATH_FORCE_INLINE Vector3::Vector3(const Vector3& vec)
+{
+    set128(vec.get128());
+}
+
+VECTORMATH_FORCE_INLINE void Vector3::set128(vec_float4 vec)
+{
+    mVec128 = vec;
+}
+
+
+VECTORMATH_FORCE_INLINE Vector3::Vector3( float _x, float _y, float _z )
+{
+    mVec128 = _mm_setr_ps(_x, _y, _z, 0.0f);
+}
+
+VECTORMATH_FORCE_INLINE Vector3::Vector3( const floatInVec &_x, const floatInVec &_y, const floatInVec &_z )
+{
+	__m128 xz = _mm_unpacklo_ps( _x.get128(), _z.get128() );
+	mVec128 = _mm_unpacklo_ps( xz, _y.get128() );
+}
+
+VECTORMATH_FORCE_INLINE Vector3::Vector3( const Point3 &pnt )
+{
+    mVec128 = pnt.get128();
+}
+
+VECTORMATH_FORCE_INLINE Vector3::Vector3( float scalar )
+{
+    mVec128 = floatInVec(scalar).get128();
+}
+
+VECTORMATH_FORCE_INLINE Vector3::Vector3( const floatInVec &scalar )
+{
+    mVec128 = scalar.get128();
+}
+
+VECTORMATH_FORCE_INLINE Vector3::Vector3( __m128 vf4 )
+{
+    mVec128 = vf4;
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Vector3::xAxis( )
+{
+    return Vector3( _VECTORMATH_UNIT_1000 );
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Vector3::yAxis( )
+{
+    return Vector3( _VECTORMATH_UNIT_0100 );
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Vector3::zAxis( )
+{
+    return Vector3( _VECTORMATH_UNIT_0010 );
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 lerp( float t, const Vector3 &vec0, const Vector3 &vec1 )
+{
+    return lerp( floatInVec(t), vec0, vec1 );
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 lerp( const floatInVec &t, const Vector3 &vec0, const Vector3 &vec1 )
+{
+    return ( vec0 + ( ( vec1 - vec0 ) * t ) );
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 slerp( float t, const Vector3 &unitVec0, const Vector3 &unitVec1 )
+{
+    return slerp( floatInVec(t), unitVec0, unitVec1 );
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 slerp( const floatInVec &t, const Vector3 &unitVec0, const Vector3 &unitVec1 )
+{
+    __m128 scales, scale0, scale1, cosAngle, angle, tttt, oneMinusT, angles, sines;
+    cosAngle = _vmathVfDot3( unitVec0.get128(), unitVec1.get128() );
+    __m128 selectMask = _mm_cmpgt_ps( _mm_set1_ps(_VECTORMATH_SLERP_TOL), cosAngle );
+    angle = acosf4( cosAngle );
+    tttt = t.get128();
+    oneMinusT = _mm_sub_ps( _mm_set1_ps(1.0f), tttt );
+    angles = _mm_unpacklo_ps( _mm_set1_ps(1.0f), tttt ); // angles = 1, t, 1, t
+    angles = _mm_unpacklo_ps( angles, oneMinusT );		// angles = 1, 1-t, t, 1-t
+    angles = _mm_mul_ps( angles, angle );
+    sines = sinf4( angles );
+    scales = _mm_div_ps( sines, vec_splat( sines, 0 ) );
+    scale0 = vec_sel( oneMinusT, vec_splat( scales, 1 ), selectMask );
+    scale1 = vec_sel( tttt, vec_splat( scales, 2 ), selectMask );
+    return Vector3( vec_madd( unitVec0.get128(), scale0, _mm_mul_ps( unitVec1.get128(), scale1 ) ) );
+}
+
+VECTORMATH_FORCE_INLINE __m128 Vector3::get128( ) const
+{
+    return mVec128;
+}
+
+VECTORMATH_FORCE_INLINE void loadXYZ(Point3& vec, const float* fptr)
+{
+#ifdef USE_SSE3_LDDQU
+	vec = Point3(	SSEFloat(_mm_lddqu_si128((const __m128i*)((float*)(fptr)))).m128 );
+#else
+	SSEFloat fl;
+	fl.f[0] = fptr[0];
+	fl.f[1] = fptr[1];
+	fl.f[2] = fptr[2];
+	fl.f[3] = fptr[3];
+    vec = Point3(	fl.m128);
+#endif //USE_SSE3_LDDQU
+	
+}
+
+
+
+VECTORMATH_FORCE_INLINE void loadXYZ(Vector3& vec, const float* fptr)
+{
+#ifdef USE_SSE3_LDDQU
+	vec = Vector3(	SSEFloat(_mm_lddqu_si128((const __m128i*)((float*)(fptr)))).m128 );
+#else
+	SSEFloat fl;
+	fl.f[0] = fptr[0];
+	fl.f[1] = fptr[1];
+	fl.f[2] = fptr[2];
+	fl.f[3] = fptr[3];
+    vec = Vector3(	fl.m128);
+#endif //USE_SSE3_LDDQU
+	
+}
+
+VECTORMATH_FORCE_INLINE void storeXYZ( const Vector3 &vec, __m128 * quad )
+{
+	__m128 dstVec = *quad;
+	VM_ATTRIBUTE_ALIGN16  unsigned int sw[4] = {0, 0, 0, 0xffffffff}; // TODO: Centralize
+	dstVec = vec_sel(vec.get128(), dstVec, sw);
+	*quad = dstVec;
+}
+
+VECTORMATH_FORCE_INLINE void storeXYZ(const Point3& vec, float* fptr)
+{
+	fptr[0] = vec.getX();
+	fptr[1] = vec.getY();
+	fptr[2] = vec.getZ();
+}
+
+VECTORMATH_FORCE_INLINE void storeXYZ(const Vector3& vec, float* fptr)
+{
+	fptr[0] = vec.getX();
+	fptr[1] = vec.getY();
+	fptr[2] = vec.getZ();
+}
+
+
+VECTORMATH_FORCE_INLINE void loadXYZArray( Vector3 & vec0, Vector3 & vec1, Vector3 & vec2, Vector3 & vec3, const __m128 * threeQuads )
+{
+	const float *quads = (float *)threeQuads;
+    vec0 = Vector3(  _mm_load_ps(quads) );
+    vec1 = Vector3( _mm_loadu_ps(quads + 3) );
+    vec2 = Vector3( _mm_loadu_ps(quads + 6) );
+    vec3 = Vector3( _mm_loadu_ps(quads + 9) );
+}
+
+VECTORMATH_FORCE_INLINE void storeXYZArray( const Vector3 &vec0, const Vector3 &vec1, const Vector3 &vec2, const Vector3 &vec3, __m128 * threeQuads )
+{
+	__m128 xxxx = _mm_shuffle_ps( vec1.get128(), vec1.get128(), _MM_SHUFFLE(0, 0, 0, 0) );
+	__m128 zzzz = _mm_shuffle_ps( vec2.get128(), vec2.get128(), _MM_SHUFFLE(2, 2, 2, 2) );
+	VM_ATTRIBUTE_ALIGN16 unsigned int xsw[4] = {0, 0, 0, 0xffffffff};
+	VM_ATTRIBUTE_ALIGN16 unsigned int zsw[4] = {0xffffffff, 0, 0, 0};
+	threeQuads[0] = vec_sel( vec0.get128(), xxxx, xsw );
+    threeQuads[1] = _mm_shuffle_ps( vec1.get128(), vec2.get128(), _MM_SHUFFLE(1, 0, 2, 1) );
+    threeQuads[2] = vec_sel( _mm_shuffle_ps( vec3.get128(), vec3.get128(), _MM_SHUFFLE(2, 1, 0, 3) ), zzzz, zsw );
+}
+/*
+VECTORMATH_FORCE_INLINE void storeHalfFloats( const Vector3 &vec0, const Vector3 &vec1, const Vector3 &vec2, const Vector3 &vec3, const Vector3 &vec4, const Vector3 &vec5, const Vector3 &vec6, const Vector3 &vec7, vec_ushort8 * threeQuads )
+{
+	assert(0);
+#if 0
+    __m128 xyz0[3];
+    __m128 xyz1[3];
+    storeXYZArray( vec0, vec1, vec2, vec3, xyz0 );
+    storeXYZArray( vec4, vec5, vec6, vec7, xyz1 );
+    threeQuads[0] = _vmath2VfToHalfFloats(xyz0[0], xyz0[1]);
+    threeQuads[1] = _vmath2VfToHalfFloats(xyz0[2], xyz1[0]);
+    threeQuads[2] = _vmath2VfToHalfFloats(xyz1[1], xyz1[2]);
+#endif
+}
+*/
+VECTORMATH_FORCE_INLINE Vector3 & Vector3::operator =( const Vector3 &vec )
+{
+    mVec128 = vec.mVec128;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Vector3 & Vector3::setX( float _x )
+{
+    _vmathVfSetElement(mVec128, _x, 0);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Vector3 & Vector3::setX( const floatInVec &_x )
+{
+    mVec128 = _vmathVfInsert(mVec128, _x.get128(), 0);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec Vector3::getX( ) const
+{
+    return floatInVec( mVec128, 0 );
+}
+
+VECTORMATH_FORCE_INLINE Vector3 & Vector3::setY( float _y )
+{
+    _vmathVfSetElement(mVec128, _y, 1);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Vector3 & Vector3::setY( const floatInVec &_y )
+{
+    mVec128 = _vmathVfInsert(mVec128, _y.get128(), 1);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec Vector3::getY( ) const
+{
+    return floatInVec( mVec128, 1 );
+}
+
+VECTORMATH_FORCE_INLINE Vector3 & Vector3::setZ( float _z )
+{
+    _vmathVfSetElement(mVec128, _z, 2);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Vector3 & Vector3::setZ( const floatInVec &_z )
+{
+    mVec128 = _vmathVfInsert(mVec128, _z.get128(), 2);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec Vector3::getZ( ) const
+{
+    return floatInVec( mVec128, 2 );
+}
+
+VECTORMATH_FORCE_INLINE Vector3 & Vector3::setElem( int idx, float value )
+{
+    _vmathVfSetElement(mVec128, value, idx);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Vector3 & Vector3::setElem( int idx, const floatInVec &value )
+{
+    mVec128 = _vmathVfInsert(mVec128, value.get128(), idx);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec Vector3::getElem( int idx ) const
+{
+    return floatInVec( mVec128, idx );
+}
+
+VECTORMATH_FORCE_INLINE VecIdx Vector3::operator []( int idx )
+{
+    return VecIdx( mVec128, idx );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec Vector3::operator []( int idx ) const
+{
+    return floatInVec( mVec128, idx );
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Vector3::operator +( const Vector3 &vec ) const
+{
+    return Vector3( _mm_add_ps( mVec128, vec.mVec128 ) );
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Vector3::operator -( const Vector3 &vec ) const
+{
+    return Vector3( _mm_sub_ps( mVec128, vec.mVec128 ) );
+}
+
+VECTORMATH_FORCE_INLINE const Point3 Vector3::operator +( const Point3 &pnt ) const
+{
+    return Point3( _mm_add_ps( mVec128, pnt.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Vector3::operator *( float scalar ) const
+{
+    return *this * floatInVec(scalar);
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Vector3::operator *( const floatInVec &scalar ) const
+{
+    return Vector3( _mm_mul_ps( mVec128, scalar.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE Vector3 & Vector3::operator +=( const Vector3 &vec )
+{
+    *this = *this + vec;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Vector3 & Vector3::operator -=( const Vector3 &vec )
+{
+    *this = *this - vec;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Vector3 & Vector3::operator *=( float scalar )
+{
+    *this = *this * scalar;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Vector3 & Vector3::operator *=( const floatInVec &scalar )
+{
+    *this = *this * scalar;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Vector3::operator /( float scalar ) const
+{
+    return *this / floatInVec(scalar);
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Vector3::operator /( const floatInVec &scalar ) const
+{
+    return Vector3( _mm_div_ps( mVec128, scalar.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE Vector3 & Vector3::operator /=( float scalar )
+{
+    *this = *this / scalar;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Vector3 & Vector3::operator /=( const floatInVec &scalar )
+{
+    *this = *this / scalar;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Vector3::operator -( ) const
+{
+	//return Vector3(_mm_sub_ps( _mm_setzero_ps(), mVec128 ) );
+
+	VM_ATTRIBUTE_ALIGN16 static const int array[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
+	__m128 NEG_MASK = SSEFloat(*(const vec_float4*)array).vf;
+	return Vector3(_mm_xor_ps(get128(),NEG_MASK));
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 operator *( float scalar, const Vector3 &vec )
+{
+    return floatInVec(scalar) * vec;
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 operator *( const floatInVec &scalar, const Vector3 &vec )
+{
+    return vec * scalar;
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 mulPerElem( const Vector3 &vec0, const Vector3 &vec1 )
+{
+    return Vector3( _mm_mul_ps( vec0.get128(), vec1.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 divPerElem( const Vector3 &vec0, const Vector3 &vec1 )
+{
+    return Vector3( _mm_div_ps( vec0.get128(), vec1.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 recipPerElem( const Vector3 &vec )
+{
+    return Vector3( _mm_rcp_ps( vec.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 absPerElem( const Vector3 &vec )
+{
+    return Vector3( fabsf4( vec.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 copySignPerElem( const Vector3 &vec0, const Vector3 &vec1 )
+{
+	__m128 vmask = toM128(0x7fffffff);
+	return Vector3( _mm_or_ps(
+		_mm_and_ps   ( vmask, vec0.get128() ),			// Value
+		_mm_andnot_ps( vmask, vec1.get128() ) ) );		// Signs
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 maxPerElem( const Vector3 &vec0, const Vector3 &vec1 )
+{
+    return Vector3( _mm_max_ps( vec0.get128(), vec1.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec maxElem( const Vector3 &vec )
+{
+    return floatInVec( _mm_max_ps( _mm_max_ps( vec_splat( vec.get128(), 0 ), vec_splat( vec.get128(), 1 ) ), vec_splat( vec.get128(), 2 ) ) );
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 minPerElem( const Vector3 &vec0, const Vector3 &vec1 )
+{
+    return Vector3( _mm_min_ps( vec0.get128(), vec1.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec minElem( const Vector3 &vec )
+{
+    return floatInVec( _mm_min_ps( _mm_min_ps( vec_splat( vec.get128(), 0 ), vec_splat( vec.get128(), 1 ) ), vec_splat( vec.get128(), 2 ) ) );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec sum( const Vector3 &vec )
+{
+    return floatInVec( _mm_add_ps( _mm_add_ps( vec_splat( vec.get128(), 0 ), vec_splat( vec.get128(), 1 ) ), vec_splat( vec.get128(), 2 ) ) );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec dot( const Vector3 &vec0, const Vector3 &vec1 )
+{
+    return floatInVec( _vmathVfDot3( vec0.get128(), vec1.get128() ), 0 );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec lengthSqr( const Vector3 &vec )
+{
+    return floatInVec(  _vmathVfDot3( vec.get128(), vec.get128() ), 0 );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec length( const Vector3 &vec )
+{
+    return floatInVec(  _mm_sqrt_ps(_vmathVfDot3( vec.get128(), vec.get128() )), 0 );
+}
+
+
+VECTORMATH_FORCE_INLINE const Vector3 normalizeApprox( const Vector3 &vec )
+{
+    return Vector3( _mm_mul_ps( vec.get128(), _mm_rsqrt_ps( _vmathVfDot3( vec.get128(), vec.get128() ) ) ) );
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 normalize( const Vector3 &vec )
+{
+	return Vector3( _mm_mul_ps( vec.get128(), newtonrapson_rsqrt4( _vmathVfDot3( vec.get128(), vec.get128() ) ) ) );
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 cross( const Vector3 &vec0, const Vector3 &vec1 )
+{
+    return Vector3( _vmathVfCross( vec0.get128(), vec1.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 select( const Vector3 &vec0, const Vector3 &vec1, bool select1 )
+{
+    return select( vec0, vec1, boolInVec(select1) );
+}
+
+
+VECTORMATH_FORCE_INLINE  const Vector4 select(const Vector4& vec0, const Vector4& vec1, const boolInVec& select1)
+{
+    return Vector4(vec_sel(vec0.get128(), vec1.get128(), select1.get128()));
+}
+
+#ifdef _VECTORMATH_DEBUG
+
+VECTORMATH_FORCE_INLINE void print( const Vector3 &vec )
+{
+    union { __m128 v; float s[4]; } tmp;
+    tmp.v = vec.get128();
+    printf( "( %f %f %f )\n", tmp.s[0], tmp.s[1], tmp.s[2] );
+}
+
+VECTORMATH_FORCE_INLINE void print( const Vector3 &vec, const char * name )
+{
+    union { __m128 v; float s[4]; } tmp;
+    tmp.v = vec.get128();
+    printf( "%s: ( %f %f %f )\n", name, tmp.s[0], tmp.s[1], tmp.s[2] );
+}
+
+#endif
+
+VECTORMATH_FORCE_INLINE Vector4::Vector4( float _x, float _y, float _z, float _w )
+{
+    mVec128 = _mm_setr_ps(_x, _y, _z, _w); 
+ }
+
+VECTORMATH_FORCE_INLINE Vector4::Vector4( const floatInVec &_x, const floatInVec &_y, const floatInVec &_z, const floatInVec &_w )
+{
+	mVec128 = _mm_unpacklo_ps(
+		_mm_unpacklo_ps( _x.get128(), _z.get128() ),
+		_mm_unpacklo_ps( _y.get128(), _w.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE Vector4::Vector4( const Vector3 &xyz, float _w )
+{
+    mVec128 = xyz.get128();
+    _vmathVfSetElement(mVec128, _w, 3);
+}
+
+VECTORMATH_FORCE_INLINE Vector4::Vector4( const Vector3 &xyz, const floatInVec &_w )
+{
+    mVec128 = xyz.get128();
+    mVec128 = _vmathVfInsert(mVec128, _w.get128(), 3);
+}
+
+VECTORMATH_FORCE_INLINE Vector4::Vector4( const Vector3 &vec )
+{
+    mVec128 = vec.get128();
+    mVec128 = _vmathVfInsert(mVec128, _mm_setzero_ps(), 3);
+}
+
+VECTORMATH_FORCE_INLINE Vector4::Vector4( const Point3 &pnt )
+{
+    mVec128 = pnt.get128();
+    mVec128 = _vmathVfInsert(mVec128, _mm_set1_ps(1.0f), 3);
+}
+
+VECTORMATH_FORCE_INLINE Vector4::Vector4( const Quat &quat )
+{
+    mVec128 = quat.get128();
+}
+
+VECTORMATH_FORCE_INLINE Vector4::Vector4( float scalar )
+{
+    mVec128 = floatInVec(scalar).get128();
+}
+
+VECTORMATH_FORCE_INLINE Vector4::Vector4( const floatInVec &scalar )
+{
+    mVec128 = scalar.get128();
+}
+
+VECTORMATH_FORCE_INLINE Vector4::Vector4( __m128 vf4 )
+{
+    mVec128 = vf4;
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 Vector4::xAxis( )
+{
+    return Vector4( _VECTORMATH_UNIT_1000 );
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 Vector4::yAxis( )
+{
+    return Vector4( _VECTORMATH_UNIT_0100 );
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 Vector4::zAxis( )
+{
+    return Vector4( _VECTORMATH_UNIT_0010 );
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 Vector4::wAxis( )
+{
+    return Vector4( _VECTORMATH_UNIT_0001 );
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 lerp( float t, const Vector4 &vec0, const Vector4 &vec1 )
+{
+    return lerp( floatInVec(t), vec0, vec1 );
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 lerp( const floatInVec &t, const Vector4 &vec0, const Vector4 &vec1 )
+{
+    return ( vec0 + ( ( vec1 - vec0 ) * t ) );
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 slerp( float t, const Vector4 &unitVec0, const Vector4 &unitVec1 )
+{
+    return slerp( floatInVec(t), unitVec0, unitVec1 );
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 slerp( const floatInVec &t, const Vector4 &unitVec0, const Vector4 &unitVec1 )
+{
+    __m128 scales, scale0, scale1, cosAngle, angle, tttt, oneMinusT, angles, sines;
+    cosAngle = _vmathVfDot4( unitVec0.get128(), unitVec1.get128() );
+    __m128 selectMask = _mm_cmpgt_ps( _mm_set1_ps(_VECTORMATH_SLERP_TOL), cosAngle );
+    angle = acosf4( cosAngle );
+    tttt = t.get128();
+    oneMinusT = _mm_sub_ps( _mm_set1_ps(1.0f), tttt );
+    angles = _mm_unpacklo_ps( _mm_set1_ps(1.0f), tttt ); // angles = 1, t, 1, t
+    angles = _mm_unpacklo_ps( angles, oneMinusT );		// angles = 1, 1-t, t, 1-t
+    angles = _mm_mul_ps( angles, angle );
+    sines = sinf4( angles );
+    scales = _mm_div_ps( sines, vec_splat( sines, 0 ) );
+    scale0 = vec_sel( oneMinusT, vec_splat( scales, 1 ), selectMask );
+    scale1 = vec_sel( tttt, vec_splat( scales, 2 ), selectMask );
+    return Vector4( vec_madd( unitVec0.get128(), scale0, _mm_mul_ps( unitVec1.get128(), scale1 ) ) );
+}
+
+VECTORMATH_FORCE_INLINE __m128 Vector4::get128( ) const
+{
+    return mVec128;
+}
+/*
+VECTORMATH_FORCE_INLINE void storeHalfFloats( const Vector4 &vec0, const Vector4 &vec1, const Vector4 &vec2, const Vector4 &vec3, vec_ushort8 * twoQuads )
+{
+    twoQuads[0] = _vmath2VfToHalfFloats(vec0.get128(), vec1.get128());
+    twoQuads[1] = _vmath2VfToHalfFloats(vec2.get128(), vec3.get128());
+}
+*/
+VECTORMATH_FORCE_INLINE Vector4 & Vector4::operator =( const Vector4 &vec )
+{
+    mVec128 = vec.mVec128;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Vector4 & Vector4::setXYZ( const Vector3 &vec )
+{
+	VM_ATTRIBUTE_ALIGN16 unsigned int sw[4] = {0, 0, 0, 0xffffffff};
+	mVec128 = vec_sel( vec.get128(), mVec128, sw );
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Vector4::getXYZ( ) const
+{
+    return Vector3( mVec128 );
+}
+
+VECTORMATH_FORCE_INLINE Vector4 & Vector4::setX( float _x )
+{
+    _vmathVfSetElement(mVec128, _x, 0);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Vector4 & Vector4::setX( const floatInVec &_x )
+{
+    mVec128 = _vmathVfInsert(mVec128, _x.get128(), 0);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec Vector4::getX( ) const
+{
+    return floatInVec( mVec128, 0 );
+}
+
+VECTORMATH_FORCE_INLINE Vector4 & Vector4::setY( float _y )
+{
+    _vmathVfSetElement(mVec128, _y, 1);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Vector4 & Vector4::setY( const floatInVec &_y )
+{
+    mVec128 = _vmathVfInsert(mVec128, _y.get128(), 1);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec Vector4::getY( ) const
+{
+    return floatInVec( mVec128, 1 );
+}
+
+VECTORMATH_FORCE_INLINE Vector4 & Vector4::setZ( float _z )
+{
+    _vmathVfSetElement(mVec128, _z, 2);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Vector4 & Vector4::setZ( const floatInVec &_z )
+{
+    mVec128 = _vmathVfInsert(mVec128, _z.get128(), 2);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec Vector4::getZ( ) const
+{
+    return floatInVec( mVec128, 2 );
+}
+
+VECTORMATH_FORCE_INLINE Vector4 & Vector4::setW( float _w )
+{
+    _vmathVfSetElement(mVec128, _w, 3);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Vector4 & Vector4::setW( const floatInVec &_w )
+{
+    mVec128 = _vmathVfInsert(mVec128, _w.get128(), 3);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec Vector4::getW( ) const
+{
+    return floatInVec( mVec128, 3 );
+}
+
+VECTORMATH_FORCE_INLINE Vector4 & Vector4::setElem( int idx, float value )
+{
+    _vmathVfSetElement(mVec128, value, idx);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Vector4 & Vector4::setElem( int idx, const floatInVec &value )
+{
+    mVec128 = _vmathVfInsert(mVec128, value.get128(), idx);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec Vector4::getElem( int idx ) const
+{
+    return floatInVec( mVec128, idx );
+}
+
+VECTORMATH_FORCE_INLINE VecIdx Vector4::operator []( int idx )
+{
+    return VecIdx( mVec128, idx );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec Vector4::operator []( int idx ) const
+{
+    return floatInVec( mVec128, idx );
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 Vector4::operator +( const Vector4 &vec ) const
+{
+    return Vector4( _mm_add_ps( mVec128, vec.mVec128 ) );
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 Vector4::operator -( const Vector4 &vec ) const
+{
+    return Vector4( _mm_sub_ps( mVec128, vec.mVec128 ) );
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 Vector4::operator *( float scalar ) const
+{
+    return *this * floatInVec(scalar);
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 Vector4::operator *( const floatInVec &scalar ) const
+{
+    return Vector4( _mm_mul_ps( mVec128, scalar.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE Vector4 & Vector4::operator +=( const Vector4 &vec )
+{
+    *this = *this + vec;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Vector4 & Vector4::operator -=( const Vector4 &vec )
+{
+    *this = *this - vec;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Vector4 & Vector4::operator *=( float scalar )
+{
+    *this = *this * scalar;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Vector4 & Vector4::operator *=( const floatInVec &scalar )
+{
+    *this = *this * scalar;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 Vector4::operator /( float scalar ) const
+{
+    return *this / floatInVec(scalar);
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 Vector4::operator /( const floatInVec &scalar ) const
+{
+    return Vector4( _mm_div_ps( mVec128, scalar.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE Vector4 & Vector4::operator /=( float scalar )
+{
+    *this = *this / scalar;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Vector4 & Vector4::operator /=( const floatInVec &scalar )
+{
+    *this = *this / scalar;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 Vector4::operator -( ) const
+{
+	return Vector4(_mm_sub_ps( _mm_setzero_ps(), mVec128 ) );
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 operator *( float scalar, const Vector4 &vec )
+{
+    return floatInVec(scalar) * vec;
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 operator *( const floatInVec &scalar, const Vector4 &vec )
+{
+    return vec * scalar;
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 mulPerElem( const Vector4 &vec0, const Vector4 &vec1 )
+{
+    return Vector4( _mm_mul_ps( vec0.get128(), vec1.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 divPerElem( const Vector4 &vec0, const Vector4 &vec1 )
+{
+    return Vector4( _mm_div_ps( vec0.get128(), vec1.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 recipPerElem( const Vector4 &vec )
+{
+    return Vector4( _mm_rcp_ps( vec.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 absPerElem( const Vector4 &vec )
+{
+    return Vector4( fabsf4( vec.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 copySignPerElem( const Vector4 &vec0, const Vector4 &vec1 )
+{
+	__m128 vmask = toM128(0x7fffffff);
+	return Vector4( _mm_or_ps(
+		_mm_and_ps   ( vmask, vec0.get128() ),			// Value
+		_mm_andnot_ps( vmask, vec1.get128() ) ) );		// Signs
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 maxPerElem( const Vector4 &vec0, const Vector4 &vec1 )
+{
+    return Vector4( _mm_max_ps( vec0.get128(), vec1.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec maxElem( const Vector4 &vec )
+{
+    return floatInVec( _mm_max_ps(
+		_mm_max_ps( vec_splat( vec.get128(), 0 ), vec_splat( vec.get128(), 1 ) ),
+		_mm_max_ps( vec_splat( vec.get128(), 2 ), vec_splat( vec.get128(), 3 ) ) ) );
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 minPerElem( const Vector4 &vec0, const Vector4 &vec1 )
+{
+    return Vector4( _mm_min_ps( vec0.get128(), vec1.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec minElem( const Vector4 &vec )
+{
+    return floatInVec( _mm_min_ps(
+		_mm_min_ps( vec_splat( vec.get128(), 0 ), vec_splat( vec.get128(), 1 ) ),
+		_mm_min_ps( vec_splat( vec.get128(), 2 ), vec_splat( vec.get128(), 3 ) ) ) );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec sum( const Vector4 &vec )
+{
+    return floatInVec( _mm_add_ps(
+		_mm_add_ps( vec_splat( vec.get128(), 0 ), vec_splat( vec.get128(), 1 ) ),
+		_mm_add_ps( vec_splat( vec.get128(), 2 ), vec_splat( vec.get128(), 3 ) ) ) );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec dot( const Vector4 &vec0, const Vector4 &vec1 )
+{
+    return floatInVec( _vmathVfDot4( vec0.get128(), vec1.get128() ), 0 );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec lengthSqr( const Vector4 &vec )
+{
+    return floatInVec(  _vmathVfDot4( vec.get128(), vec.get128() ), 0 );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec length( const Vector4 &vec )
+{
+    return floatInVec(  _mm_sqrt_ps(_vmathVfDot4( vec.get128(), vec.get128() )), 0 );
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 normalizeApprox( const Vector4 &vec )
+{
+    return Vector4( _mm_mul_ps( vec.get128(), _mm_rsqrt_ps( _vmathVfDot4( vec.get128(), vec.get128() ) ) ) );
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 normalize( const Vector4 &vec )
+{
+    return Vector4( _mm_mul_ps( vec.get128(), newtonrapson_rsqrt4( _vmathVfDot4( vec.get128(), vec.get128() ) ) ) );
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 select( const Vector4 &vec0, const Vector4 &vec1, bool select1 )
+{
+    return select( vec0, vec1, boolInVec(select1) );
+}
+
+
+#ifdef _VECTORMATH_DEBUG
+
+VECTORMATH_FORCE_INLINE void print( const Vector4 &vec )
+{
+    union { __m128 v; float s[4]; } tmp;
+    tmp.v = vec.get128();
+    printf( "( %f %f %f %f )\n", tmp.s[0], tmp.s[1], tmp.s[2], tmp.s[3] );
+}
+
+VECTORMATH_FORCE_INLINE void print( const Vector4 &vec, const char * name )
+{
+    union { __m128 v; float s[4]; } tmp;
+    tmp.v = vec.get128();
+    printf( "%s: ( %f %f %f %f )\n", name, tmp.s[0], tmp.s[1], tmp.s[2], tmp.s[3] );
+}
+
+#endif
+
+VECTORMATH_FORCE_INLINE Point3::Point3( float _x, float _y, float _z )
+{
+    mVec128 = _mm_setr_ps(_x, _y, _z, 0.0f);
+}
+
+VECTORMATH_FORCE_INLINE Point3::Point3( const floatInVec &_x, const floatInVec &_y, const floatInVec &_z )
+{
+	mVec128 = _mm_unpacklo_ps( _mm_unpacklo_ps( _x.get128(), _z.get128() ), _y.get128() );
+}
+
+VECTORMATH_FORCE_INLINE Point3::Point3( const Vector3 &vec )
+{
+    mVec128 = vec.get128();
+}
+
+VECTORMATH_FORCE_INLINE Point3::Point3( float scalar )
+{
+    mVec128 = floatInVec(scalar).get128();
+}
+
+VECTORMATH_FORCE_INLINE Point3::Point3( const floatInVec &scalar )
+{
+    mVec128 = scalar.get128();
+}
+
+VECTORMATH_FORCE_INLINE Point3::Point3( __m128 vf4 )
+{
+    mVec128 = vf4;
+}
+
+VECTORMATH_FORCE_INLINE const Point3 lerp( float t, const Point3 &pnt0, const Point3 &pnt1 )
+{
+    return lerp( floatInVec(t), pnt0, pnt1 );
+}
+
+VECTORMATH_FORCE_INLINE const Point3 lerp( const floatInVec &t, const Point3 &pnt0, const Point3 &pnt1 )
+{
+    return ( pnt0 + ( ( pnt1 - pnt0 ) * t ) );
+}
+
+VECTORMATH_FORCE_INLINE __m128 Point3::get128( ) const
+{
+    return mVec128;
+}
+
+VECTORMATH_FORCE_INLINE void storeXYZ( const Point3 &pnt, __m128 * quad )
+{
+    __m128 dstVec = *quad;
+	VM_ATTRIBUTE_ALIGN16 unsigned int sw[4] = {0, 0, 0, 0xffffffff}; // TODO: Centralize
+    dstVec = vec_sel(pnt.get128(), dstVec, sw);
+    *quad = dstVec;
+}
+
+VECTORMATH_FORCE_INLINE void loadXYZArray( Point3 & pnt0, Point3 & pnt1, Point3 & pnt2, Point3 & pnt3, const __m128 * threeQuads )
+{
+	const float *quads = (float *)threeQuads;
+    pnt0 = Point3(  _mm_load_ps(quads) );
+    pnt1 = Point3( _mm_loadu_ps(quads + 3) );
+    pnt2 = Point3( _mm_loadu_ps(quads + 6) );
+    pnt3 = Point3( _mm_loadu_ps(quads + 9) );
+}
+
+VECTORMATH_FORCE_INLINE void storeXYZArray( const Point3 &pnt0, const Point3 &pnt1, const Point3 &pnt2, const Point3 &pnt3, __m128 * threeQuads )
+{
+	__m128 xxxx = _mm_shuffle_ps( pnt1.get128(), pnt1.get128(), _MM_SHUFFLE(0, 0, 0, 0) );
+	__m128 zzzz = _mm_shuffle_ps( pnt2.get128(), pnt2.get128(), _MM_SHUFFLE(2, 2, 2, 2) );
+	VM_ATTRIBUTE_ALIGN16 unsigned int xsw[4] = {0, 0, 0, 0xffffffff};
+	VM_ATTRIBUTE_ALIGN16 unsigned int zsw[4] = {0xffffffff, 0, 0, 0};
+	threeQuads[0] = vec_sel( pnt0.get128(), xxxx, xsw );
+    threeQuads[1] = _mm_shuffle_ps( pnt1.get128(), pnt2.get128(), _MM_SHUFFLE(1, 0, 2, 1) );
+    threeQuads[2] = vec_sel( _mm_shuffle_ps( pnt3.get128(), pnt3.get128(), _MM_SHUFFLE(2, 1, 0, 3) ), zzzz, zsw );
+}
+/*
+VECTORMATH_FORCE_INLINE void storeHalfFloats( const Point3 &pnt0, const Point3 &pnt1, const Point3 &pnt2, const Point3 &pnt3, const Point3 &pnt4, const Point3 &pnt5, const Point3 &pnt6, const Point3 &pnt7, vec_ushort8 * threeQuads )
+{
+#if 0
+    __m128 xyz0[3];
+    __m128 xyz1[3];
+    storeXYZArray( pnt0, pnt1, pnt2, pnt3, xyz0 );
+    storeXYZArray( pnt4, pnt5, pnt6, pnt7, xyz1 );
+    threeQuads[0] = _vmath2VfToHalfFloats(xyz0[0], xyz0[1]);
+    threeQuads[1] = _vmath2VfToHalfFloats(xyz0[2], xyz1[0]);
+    threeQuads[2] = _vmath2VfToHalfFloats(xyz1[1], xyz1[2]);
+#else
+	assert(0);
+#endif
+}
+*/
+VECTORMATH_FORCE_INLINE Point3 & Point3::operator =( const Point3 &pnt )
+{
+    mVec128 = pnt.mVec128;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Point3 & Point3::setX( float _x )
+{
+    _vmathVfSetElement(mVec128, _x, 0);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Point3 & Point3::setX( const floatInVec &_x )
+{
+    mVec128 = _vmathVfInsert(mVec128, _x.get128(), 0);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec Point3::getX( ) const
+{
+    return floatInVec( mVec128, 0 );
+}
+
+VECTORMATH_FORCE_INLINE Point3 & Point3::setY( float _y )
+{
+    _vmathVfSetElement(mVec128, _y, 1);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Point3 & Point3::setY( const floatInVec &_y )
+{
+    mVec128 = _vmathVfInsert(mVec128, _y.get128(), 1);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec Point3::getY( ) const
+{
+    return floatInVec( mVec128, 1 );
+}
+
+VECTORMATH_FORCE_INLINE Point3 & Point3::setZ( float _z )
+{
+    _vmathVfSetElement(mVec128, _z, 2);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Point3 & Point3::setZ( const floatInVec &_z )
+{
+    mVec128 = _vmathVfInsert(mVec128, _z.get128(), 2);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec Point3::getZ( ) const
+{
+    return floatInVec( mVec128, 2 );
+}
+
+VECTORMATH_FORCE_INLINE Point3 & Point3::setElem( int idx, float value )
+{
+    _vmathVfSetElement(mVec128, value, idx);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Point3 & Point3::setElem( int idx, const floatInVec &value )
+{
+    mVec128 = _vmathVfInsert(mVec128, value.get128(), idx);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec Point3::getElem( int idx ) const
+{
+    return floatInVec( mVec128, idx );
+}
+
+VECTORMATH_FORCE_INLINE VecIdx Point3::operator []( int idx )
+{
+    return VecIdx( mVec128, idx );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec Point3::operator []( int idx ) const
+{
+    return floatInVec( mVec128, idx );
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Point3::operator -( const Point3 &pnt ) const
+{
+    return Vector3( _mm_sub_ps( mVec128, pnt.mVec128 ) );
+}
+
+VECTORMATH_FORCE_INLINE const Point3 Point3::operator +( const Vector3 &vec ) const
+{
+    return Point3( _mm_add_ps( mVec128, vec.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE const Point3 Point3::operator -( const Vector3 &vec ) const
+{
+    return Point3( _mm_sub_ps( mVec128, vec.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE Point3 & Point3::operator +=( const Vector3 &vec )
+{
+    *this = *this + vec;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Point3 & Point3::operator -=( const Vector3 &vec )
+{
+    *this = *this - vec;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const Point3 mulPerElem( const Point3 &pnt0, const Point3 &pnt1 )
+{
+    return Point3( _mm_mul_ps( pnt0.get128(), pnt1.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE const Point3 divPerElem( const Point3 &pnt0, const Point3 &pnt1 )
+{
+    return Point3( _mm_div_ps( pnt0.get128(), pnt1.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE const Point3 recipPerElem( const Point3 &pnt )
+{
+    return Point3( _mm_rcp_ps( pnt.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE const Point3 absPerElem( const Point3 &pnt )
+{
+    return Point3( fabsf4( pnt.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE const Point3 copySignPerElem( const Point3 &pnt0, const Point3 &pnt1 )
+{
+	__m128 vmask = toM128(0x7fffffff);
+	return Point3( _mm_or_ps(
+		_mm_and_ps   ( vmask, pnt0.get128() ),			// Value
+		_mm_andnot_ps( vmask, pnt1.get128() ) ) );		// Signs
+}
+
+VECTORMATH_FORCE_INLINE const Point3 maxPerElem( const Point3 &pnt0, const Point3 &pnt1 )
+{
+    return Point3( _mm_max_ps( pnt0.get128(), pnt1.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec maxElem( const Point3 &pnt )
+{
+    return floatInVec( _mm_max_ps( _mm_max_ps( vec_splat( pnt.get128(), 0 ), vec_splat( pnt.get128(), 1 ) ), vec_splat( pnt.get128(), 2 ) ) );
+}
+
+VECTORMATH_FORCE_INLINE const Point3 minPerElem( const Point3 &pnt0, const Point3 &pnt1 )
+{
+    return Point3( _mm_min_ps( pnt0.get128(), pnt1.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec minElem( const Point3 &pnt )
+{
+    return floatInVec( _mm_min_ps( _mm_min_ps( vec_splat( pnt.get128(), 0 ), vec_splat( pnt.get128(), 1 ) ), vec_splat( pnt.get128(), 2 ) ) );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec sum( const Point3 &pnt )
+{
+    return floatInVec( _mm_add_ps( _mm_add_ps( vec_splat( pnt.get128(), 0 ), vec_splat( pnt.get128(), 1 ) ), vec_splat( pnt.get128(), 2 ) ) );
+}
+
+VECTORMATH_FORCE_INLINE const Point3 scale( const Point3 &pnt, float scaleVal )
+{
+    return scale( pnt, floatInVec( scaleVal ) );
+}
+
+VECTORMATH_FORCE_INLINE const Point3 scale( const Point3 &pnt, const floatInVec &scaleVal )
+{
+    return mulPerElem( pnt, Point3( scaleVal ) );
+}
+
+VECTORMATH_FORCE_INLINE const Point3 scale( const Point3 &pnt, const Vector3 &scaleVec )
+{
+    return mulPerElem( pnt, Point3( scaleVec ) );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec projection( const Point3 &pnt, const Vector3 &unitVec )
+{
+    return floatInVec( _vmathVfDot3( pnt.get128(), unitVec.get128() ), 0 );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec distSqrFromOrigin( const Point3 &pnt )
+{
+    return lengthSqr( Vector3( pnt ) );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec distFromOrigin( const Point3 &pnt )
+{
+    return length( Vector3( pnt ) );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec distSqr( const Point3 &pnt0, const Point3 &pnt1 )
+{
+    return lengthSqr( ( pnt1 - pnt0 ) );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec dist( const Point3 &pnt0, const Point3 &pnt1 )
+{
+    return length( ( pnt1 - pnt0 ) );
+}
+
+VECTORMATH_FORCE_INLINE const Point3 select( const Point3 &pnt0, const Point3 &pnt1, bool select1 )
+{
+    return select( pnt0, pnt1, boolInVec(select1) );
+}
+
+VECTORMATH_FORCE_INLINE const Point3 select( const Point3 &pnt0, const Point3 &pnt1, const boolInVec &select1 )
+{
+    return Point3( vec_sel( pnt0.get128(), pnt1.get128(), select1.get128() ) );
+}
+
+
+
+#ifdef _VECTORMATH_DEBUG
+
+VECTORMATH_FORCE_INLINE void print( const Point3 &pnt )
+{
+    union { __m128 v; float s[4]; } tmp;
+    tmp.v = pnt.get128();
+    printf( "( %f %f %f )\n", tmp.s[0], tmp.s[1], tmp.s[2] );
+}
+
+VECTORMATH_FORCE_INLINE void print( const Point3 &pnt, const char * name )
+{
+    union { __m128 v; float s[4]; } tmp;
+    tmp.v = pnt.get128();
+    printf( "%s: ( %f %f %f )\n", name, tmp.s[0], tmp.s[1], tmp.s[2] );
+}
+
+#endif
+
+} // namespace Aos
+} // namespace Vectormath
+
+#endif
diff --git a/src/vectormath/sse/vecidx_aos.h b/src/vectormath/sse/vecidx_aos.h
index 32e837a52..8ba4b1d75 100644
--- a/src/vectormath/sse/vecidx_aos.h
+++ b/src/vectormath/sse/vecidx_aos.h
@@ -1,80 +1,80 @@
-/*
-   Copyright (C) 2006, 2007 Sony Computer Entertainment Inc.
-   All rights reserved.
-
-   Redistribution and use in source and binary forms,
-   with or without modification, are permitted provided that the
-   following conditions are met:
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    * Neither the name of the Sony Computer Entertainment Inc nor the names
-      of its contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-   POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#ifndef _VECTORMATH_VECIDX_AOS_H
-#define _VECTORMATH_VECIDX_AOS_H
-
-
-#include "floatInVec.h"
-
-namespace Vectormath {
-namespace Aos {
-
-//-----------------------------------------------------------------------------
-// VecIdx 
-// Used in setting elements of Vector3, Vector4, Point3, or Quat with the 
-// subscripting operator.
-//
-
-VM_ATTRIBUTE_ALIGNED_CLASS16 (class) VecIdx
-{
-private:
-   __m128 &ref;
-   int i;
-public:
-    inline VecIdx( __m128& vec, int idx ): ref(vec) { i = idx; }
-
-    // implicitly casts to float unless _VECTORMATH_NO_SCALAR_CAST defined
-    // in which case, implicitly casts to floatInVec, and one must call
-    // getAsFloat to convert to float.
-    //
-#ifdef _VECTORMATH_NO_SCALAR_CAST
-    inline operator floatInVec() const;
-    inline float getAsFloat() const;
-#else
-    inline operator float() const;
-#endif
-
-    inline float operator =( float scalar );
-    inline floatInVec operator =( const floatInVec &scalar );
-    inline floatInVec operator =( const VecIdx& scalar );
-    inline floatInVec operator *=( float scalar );
-    inline floatInVec operator *=( const floatInVec &scalar );
-    inline floatInVec operator /=( float scalar );
-    inline floatInVec operator /=( const floatInVec &scalar );
-    inline floatInVec operator +=( float scalar );
-    inline floatInVec operator +=( const floatInVec &scalar );
-    inline floatInVec operator -=( float scalar );
-    inline floatInVec operator -=( const floatInVec &scalar );
-};
-
-} // namespace Aos
-} // namespace Vectormath
-
-#endif
+/*
+   Copyright (C) 2006, 2007 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+   Redistribution and use in source and binary forms,
+   with or without modification, are permitted provided that the
+   following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the Sony Computer Entertainment Inc nor the names
+      of its contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _VECTORMATH_VECIDX_AOS_H
+#define _VECTORMATH_VECIDX_AOS_H
+
+
+#include "floatInVec.h"
+
+namespace Vectormath {
+namespace Aos {
+
+//-----------------------------------------------------------------------------
+// VecIdx 
+// Used in setting elements of Vector3, Vector4, Point3, or Quat with the 
+// subscripting operator.
+//
+
+VM_ATTRIBUTE_ALIGNED_CLASS16 (class) VecIdx
+{
+private:
+   __m128 &ref;
+   int i;
+public:
+    inline VecIdx( __m128& vec, int idx ): ref(vec) { i = idx; }
+
+    // implicitly casts to float unless _VECTORMATH_NO_SCALAR_CAST defined
+    // in which case, implicitly casts to floatInVec, and one must call
+    // getAsFloat to convert to float.
+    //
+#ifdef _VECTORMATH_NO_SCALAR_CAST
+    inline operator floatInVec() const;
+    inline float getAsFloat() const;
+#else
+    inline operator float() const;
+#endif
+
+    inline float operator =( float scalar );
+    inline floatInVec operator =( const floatInVec &scalar );
+    inline floatInVec operator =( const VecIdx& scalar );
+    inline floatInVec operator *=( float scalar );
+    inline floatInVec operator *=( const floatInVec &scalar );
+    inline floatInVec operator /=( float scalar );
+    inline floatInVec operator /=( const floatInVec &scalar );
+    inline floatInVec operator +=( float scalar );
+    inline floatInVec operator +=( const floatInVec &scalar );
+    inline floatInVec operator -=( float scalar );
+    inline floatInVec operator -=( const floatInVec &scalar );
+};
+
+} // namespace Aos
+} // namespace Vectormath
+
+#endif
diff --git a/src/vectormath/sse/vectormath_aos.h b/src/vectormath/sse/vectormath_aos.h
index b802810df..be5ae8c6e 100644
--- a/src/vectormath/sse/vectormath_aos.h
+++ b/src/vectormath/sse/vectormath_aos.h
@@ -1,2547 +1,2547 @@
-/*
-   Copyright (C) 2006, 2007 Sony Computer Entertainment Inc.
-   All rights reserved.
-
-   Redistribution and use in source and binary forms,
-   with or without modification, are permitted provided that the
-   following conditions are met:
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    * Neither the name of the Sony Computer Entertainment Inc nor the names
-      of its contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-   POSSIBILITY OF SUCH DAMAGE.
-*/
-
-
-#ifndef _VECTORMATH_AOS_CPP_SSE_H
-#define _VECTORMATH_AOS_CPP_SSE_H
-
-#include <math.h>
-#include <xmmintrin.h>
-#include <emmintrin.h>
-#include <assert.h>
-
-#define Vector3Ref Vector3&
-#define QuatRef	Quat&
-#define Matrix3Ref Matrix3&
-
-#if (defined (_WIN32) && (_MSC_VER) && _MSC_VER >= 1400)
-	#define USE_SSE3_LDDQU
-
-	#define VM_ATTRIBUTE_ALIGNED_CLASS16(a) __declspec(align(16)) a
-	#define VM_ATTRIBUTE_ALIGN16 __declspec(align(16))
-	#define VECTORMATH_FORCE_INLINE __forceinline 
-#else
-	#define VM_ATTRIBUTE_ALIGNED_CLASS16(a) a __attribute__ ((aligned (16)))	
-	#define VM_ATTRIBUTE_ALIGN16 __attribute__ ((aligned (16)))	
-	#define VECTORMATH_FORCE_INLINE inline 
-	#ifdef __SSE3__
-		#define USE_SSE3_LDDQU
-	#endif //__SSE3__
-#endif//_WIN32
-
-
-#ifdef USE_SSE3_LDDQU
-#include <pmmintrin.h>//_mm_lddqu_si128
-#endif //USE_SSE3_LDDQU
-
-
-// TODO: Tidy
-typedef __m128 vec_float4;
-typedef __m128 vec_uint4;
-typedef __m128 vec_int4;
-typedef __m128i vec_uchar16;
-typedef __m128i vec_ushort8;
-
-#define vec_splat(x, e) _mm_shuffle_ps(x, x, _MM_SHUFFLE(e,e,e,e))
-
-#define _mm_ror_ps(vec,i)	\
-	(((i)%4) ? (_mm_shuffle_ps(vec,vec, _MM_SHUFFLE((unsigned char)(i+3)%4,(unsigned char)(i+2)%4,(unsigned char)(i+1)%4,(unsigned char)(i+0)%4))) : (vec))
-#define _mm_rol_ps(vec,i)	\
-	(((i)%4) ? (_mm_shuffle_ps(vec,vec, _MM_SHUFFLE((unsigned char)(7-i)%4,(unsigned char)(6-i)%4,(unsigned char)(5-i)%4,(unsigned char)(4-i)%4))) : (vec))
-
-#define vec_sld(vec,vec2,x) _mm_ror_ps(vec, ((x)/4))
-
-#define _mm_abs_ps(vec)		_mm_andnot_ps(_MASKSIGN_,vec)
-#define _mm_neg_ps(vec)		_mm_xor_ps(_MASKSIGN_,vec)
-
-#define vec_madd(a, b, c) _mm_add_ps(c, _mm_mul_ps(a, b) )
-
-union SSEFloat
-{
-	__m128i vi;
-	__m128 m128;
-	__m128 vf;
-	unsigned int	ui[4];
-	unsigned short s[8];
-	float f[4];
-	SSEFloat(__m128 v) : m128(v) {}
-    SSEFloat(__m128i v) : vi(v) {}
-	SSEFloat() {}//uninitialized
-};
-
-static VECTORMATH_FORCE_INLINE __m128 vec_sel(__m128 a, __m128 b, __m128 mask)
-{
-	return _mm_or_ps(_mm_and_ps(mask, b), _mm_andnot_ps(mask, a));
-}
-static VECTORMATH_FORCE_INLINE __m128 vec_sel(__m128 a, __m128 b, const unsigned int *_mask)
-{
-	return vec_sel(a, b, _mm_load_ps((float *)_mask));
-}
-static VECTORMATH_FORCE_INLINE __m128 vec_sel(__m128 a, __m128 b, unsigned int _mask)
-{
-	return vec_sel(a, b, _mm_set1_ps(*(float *)&_mask));
-}
-
-static VECTORMATH_FORCE_INLINE __m128 toM128(unsigned int x)
-{
-    return _mm_set1_ps( *(float *)&x );
-}
-
-static VECTORMATH_FORCE_INLINE __m128 fabsf4(__m128 x)
-{
-    return _mm_and_ps( x, toM128( 0x7fffffff ) );
-}
-/*
-union SSE64
-{
-	__m128 m128;
-	struct
-	{
-		__m64 m01;
-		__m64 m23;
-	} m64;
-};
-
-static VECTORMATH_FORCE_INLINE __m128 vec_cts(__m128 x, int a)
-{
-	assert(a == 0); // Only 2^0 supported
-	(void)a;
-	SSE64 sse64;
-	sse64.m64.m01 = _mm_cvttps_pi32(x);
-	sse64.m64.m23 = _mm_cvttps_pi32(_mm_ror_ps(x,2));
-	_mm_empty();
-    return sse64.m128;
-}
-
-static VECTORMATH_FORCE_INLINE __m128 vec_ctf(__m128 x, int a)
-{
-	assert(a == 0); // Only 2^0 supported
-	(void)a;
-	SSE64 sse64;
-	sse64.m128 = x;
-	__m128 result =_mm_movelh_ps(
-		_mm_cvt_pi2ps(_mm_setzero_ps(), sse64.m64.m01),
-		_mm_cvt_pi2ps(_mm_setzero_ps(), sse64.m64.m23));
-	_mm_empty();
-	return result;
-}
-*/
-static VECTORMATH_FORCE_INLINE __m128 vec_cts(__m128 x, int a)
-{
-	assert(a == 0); // Only 2^0 supported
-	(void)a;
-	__m128i result = _mm_cvtps_epi32(x);
-    return (__m128 &)result;
-}
-
-static VECTORMATH_FORCE_INLINE __m128 vec_ctf(__m128 x, int a)
-{
-	assert(a == 0); // Only 2^0 supported
-	(void)a;
-	return _mm_cvtepi32_ps((__m128i &)x);
-}
-
-#define vec_nmsub(a,b,c) _mm_sub_ps( c, _mm_mul_ps( a, b ) )
-#define vec_sub(a,b) _mm_sub_ps( a, b )
-#define vec_add(a,b) _mm_add_ps( a, b )
-#define vec_mul(a,b) _mm_mul_ps( a, b )
-#define vec_xor(a,b) _mm_xor_ps( a, b )
-#define vec_and(a,b) _mm_and_ps( a, b )
-#define vec_cmpeq(a,b) _mm_cmpeq_ps( a, b )
-#define vec_cmpgt(a,b) _mm_cmpgt_ps( a, b )
-
-#define vec_mergeh(a,b) _mm_unpacklo_ps( a, b )
-#define vec_mergel(a,b) _mm_unpackhi_ps( a, b )
-
-#define vec_andc(a,b) _mm_andnot_ps( b, a )
-
-#define sqrtf4(x) _mm_sqrt_ps( x )
-#define rsqrtf4(x) _mm_rsqrt_ps( x )
-#define recipf4(x) _mm_rcp_ps( x )
-#define negatef4(x) _mm_sub_ps( _mm_setzero_ps(), x )
-
-static VECTORMATH_FORCE_INLINE __m128 newtonrapson_rsqrt4( const __m128 v )
-{   
-#define _half4 _mm_setr_ps(.5f,.5f,.5f,.5f) 
-#define _three _mm_setr_ps(3.f,3.f,3.f,3.f)
-const __m128 approx = _mm_rsqrt_ps( v );   
-const __m128 muls = _mm_mul_ps(_mm_mul_ps(v, approx), approx);   
-return _mm_mul_ps(_mm_mul_ps(_half4, approx), _mm_sub_ps(_three, muls) );
-}
-
-static VECTORMATH_FORCE_INLINE __m128 acosf4(__m128 x)
-{
-    __m128 xabs = fabsf4(x);
-	__m128 select = _mm_cmplt_ps( x, _mm_setzero_ps() );
-    __m128 t1 = sqrtf4(vec_sub(_mm_set1_ps(1.0f), xabs));
-    
-    /* Instruction counts can be reduced if the polynomial was
-     * computed entirely from nested (dependent) fma's. However, 
-     * to reduce the number of pipeline stalls, the polygon is evaluated 
-     * in two halves (hi amd lo). 
-     */
-    __m128 xabs2 = _mm_mul_ps(xabs,  xabs);
-    __m128 xabs4 = _mm_mul_ps(xabs2, xabs2);
-    __m128 hi = vec_madd(vec_madd(vec_madd(_mm_set1_ps(-0.0012624911f),
-		xabs, _mm_set1_ps(0.0066700901f)),
-			xabs, _mm_set1_ps(-0.0170881256f)),
-				xabs, _mm_set1_ps( 0.0308918810f));
-    __m128 lo = vec_madd(vec_madd(vec_madd(_mm_set1_ps(-0.0501743046f),
-		xabs, _mm_set1_ps(0.0889789874f)),
-			xabs, _mm_set1_ps(-0.2145988016f)),
-				xabs, _mm_set1_ps( 1.5707963050f));
-    
-    __m128 result = vec_madd(hi, xabs4, lo);
-    
-    // Adjust the result if x is negactive.
-    return vec_sel(
-		vec_mul(t1, result),									// Positive
-		vec_nmsub(t1, result, _mm_set1_ps(3.1415926535898f)),	// Negative
-		select);
-}
-
-static VECTORMATH_FORCE_INLINE __m128 sinf4(vec_float4 x)
-{
-
-//
-// Common constants used to evaluate sinf4/cosf4/tanf4
-//
-#define _SINCOS_CC0  -0.0013602249f
-#define _SINCOS_CC1   0.0416566950f
-#define _SINCOS_CC2  -0.4999990225f
-#define _SINCOS_SC0  -0.0001950727f
-#define _SINCOS_SC1   0.0083320758f
-#define _SINCOS_SC2  -0.1666665247f
-
-#define _SINCOS_KC1  1.57079625129f
-#define _SINCOS_KC2  7.54978995489e-8f
-
-    vec_float4 xl,xl2,xl3,res;
-
-    // Range reduction using : xl = angle * TwoOverPi;
-    //  
-    xl = vec_mul(x, _mm_set1_ps(0.63661977236f));
-
-    // Find the quadrant the angle falls in
-    // using:  q = (int) (ceil(abs(xl))*sign(xl))
-    //
-    vec_int4 q = vec_cts(xl,0);
-
-    // Compute an offset based on the quadrant that the angle falls in
-    // 
-    vec_int4 offset = _mm_and_ps(q,toM128(0x3));
-
-    // Remainder in range [-pi/4..pi/4]
-    //
-    vec_float4 qf = vec_ctf(q,0);
-    xl  = vec_nmsub(qf,_mm_set1_ps(_SINCOS_KC2),vec_nmsub(qf,_mm_set1_ps(_SINCOS_KC1),x));
-    
-    // Compute x^2 and x^3
-    //
-    xl2 = vec_mul(xl,xl);
-    xl3 = vec_mul(xl2,xl);
-    
-    // Compute both the sin and cos of the angles
-    // using a polynomial expression:
-    //   cx = 1.0f + xl2 * ((C0 * xl2 + C1) * xl2 + C2), and
-    //   sx = xl + xl3 * ((S0 * xl2 + S1) * xl2 + S2)
-    //
-    
-    vec_float4 cx =
-		vec_madd(
-			vec_madd(
-				vec_madd(_mm_set1_ps(_SINCOS_CC0),xl2,_mm_set1_ps(_SINCOS_CC1)),xl2,_mm_set1_ps(_SINCOS_CC2)),xl2,_mm_set1_ps(1.0f));
-    vec_float4 sx =
-		vec_madd(
-			vec_madd(
-				vec_madd(_mm_set1_ps(_SINCOS_SC0),xl2,_mm_set1_ps(_SINCOS_SC1)),xl2,_mm_set1_ps(_SINCOS_SC2)),xl3,xl);
-
-    // Use the cosine when the offset is odd and the sin
-    // when the offset is even
-    //
-    res = vec_sel(cx,sx,vec_cmpeq(vec_and(offset,
-                                          toM128(0x1)),
-										  _mm_setzero_ps()));
-
-    // Flip the sign of the result when (offset mod 4) = 1 or 2
-    //
-    return vec_sel(
-		vec_xor(toM128(0x80000000U), res),	// Negative
-		res,								// Positive
-		vec_cmpeq(vec_and(offset,toM128(0x2)),_mm_setzero_ps()));
-}
-
-static VECTORMATH_FORCE_INLINE void sincosf4(vec_float4 x, vec_float4* s, vec_float4* c)
-{
-    vec_float4 xl,xl2,xl3;
-    vec_int4   offsetSin, offsetCos;
-
-    // Range reduction using : xl = angle * TwoOverPi;
-    //  
-    xl = vec_mul(x, _mm_set1_ps(0.63661977236f));
-
-    // Find the quadrant the angle falls in
-    // using:  q = (int) (ceil(abs(xl))*sign(xl))
-    //
-    //vec_int4 q = vec_cts(vec_add(xl,vec_sel(_mm_set1_ps(0.5f),xl,(0x80000000))),0);
-    vec_int4 q = vec_cts(xl,0);
-     
-    // Compute the offset based on the quadrant that the angle falls in.
-    // Add 1 to the offset for the cosine. 
-    //
-    offsetSin = vec_and(q,toM128((int)0x3));
-	__m128i temp = _mm_add_epi32(_mm_set1_epi32(1),(__m128i &)offsetSin);
-	offsetCos = (__m128 &)temp;
-
-    // Remainder in range [-pi/4..pi/4]
-    //
-    vec_float4 qf = vec_ctf(q,0);
-    xl  = vec_nmsub(qf,_mm_set1_ps(_SINCOS_KC2),vec_nmsub(qf,_mm_set1_ps(_SINCOS_KC1),x));
-    
-    // Compute x^2 and x^3
-    //
-    xl2 = vec_mul(xl,xl);
-    xl3 = vec_mul(xl2,xl);
-    
-    // Compute both the sin and cos of the angles
-    // using a polynomial expression:
-    //   cx = 1.0f + xl2 * ((C0 * xl2 + C1) * xl2 + C2), and
-    //   sx = xl + xl3 * ((S0 * xl2 + S1) * xl2 + S2)
-    //
-    vec_float4 cx =
-		vec_madd(
-			vec_madd(
-				vec_madd(_mm_set1_ps(_SINCOS_CC0),xl2,_mm_set1_ps(_SINCOS_CC1)),xl2,_mm_set1_ps(_SINCOS_CC2)),xl2,_mm_set1_ps(1.0f));
-    vec_float4 sx =
-		vec_madd(
-			vec_madd(
-				vec_madd(_mm_set1_ps(_SINCOS_SC0),xl2,_mm_set1_ps(_SINCOS_SC1)),xl2,_mm_set1_ps(_SINCOS_SC2)),xl3,xl);
-
-    // Use the cosine when the offset is odd and the sin
-    // when the offset is even
-    //
-    vec_uint4 sinMask = (vec_uint4)vec_cmpeq(vec_and(offsetSin,toM128(0x1)),_mm_setzero_ps());
-    vec_uint4 cosMask = (vec_uint4)vec_cmpeq(vec_and(offsetCos,toM128(0x1)),_mm_setzero_ps());    
-    *s = vec_sel(cx,sx,sinMask);
-    *c = vec_sel(cx,sx,cosMask);
-
-    // Flip the sign of the result when (offset mod 4) = 1 or 2
-    //
-    sinMask = vec_cmpeq(vec_and(offsetSin,toM128(0x2)),_mm_setzero_ps());
-    cosMask = vec_cmpeq(vec_and(offsetCos,toM128(0x2)),_mm_setzero_ps());
-    
-    *s = vec_sel((vec_float4)vec_xor(toM128(0x80000000),(vec_uint4)*s),*s,sinMask);
-    *c = vec_sel((vec_float4)vec_xor(toM128(0x80000000),(vec_uint4)*c),*c,cosMask);    
-}
-
-#include "vecidx_aos.h"
-#include "floatInVec.h"
-#include "boolInVec.h"
-
-#ifdef _VECTORMATH_DEBUG
-#include <stdio.h>
-#endif
-namespace Vectormath {
-
-namespace Aos {
-
-//-----------------------------------------------------------------------------
-// Forward Declarations
-//
-
-class Vector3;
-class Vector4;
-class Point3;
-class Quat;
-class Matrix3;
-class Matrix4;
-class Transform3;
-
-// A 3-D vector in array-of-structures format
-//
-class Vector3
-{
-    __m128 mVec128;
-
-	VECTORMATH_FORCE_INLINE void set128(vec_float4 vec);
-	 
-	 VECTORMATH_FORCE_INLINE  vec_float4& get128Ref();
-
-public:
-    // Default constructor; does no initialization
-    // 
-    VECTORMATH_FORCE_INLINE Vector3( ) { };
-
-	// Default copy constructor
-    // 
-	VECTORMATH_FORCE_INLINE Vector3(const Vector3& vec);
-
-    // Construct a 3-D vector from x, y, and z elements
-    // 
-    VECTORMATH_FORCE_INLINE Vector3( float x, float y, float z );
-
-    // Construct a 3-D vector from x, y, and z elements (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Vector3( const floatInVec &x, const floatInVec &y, const floatInVec &z );
-
-    // Copy elements from a 3-D point into a 3-D vector
-    // 
-    explicit VECTORMATH_FORCE_INLINE Vector3( const Point3 &pnt );
-
-    // Set all elements of a 3-D vector to the same scalar value
-    // 
-    explicit VECTORMATH_FORCE_INLINE Vector3( float scalar );
-
-    // Set all elements of a 3-D vector to the same scalar value (scalar data contained in vector data type)
-    // 
-    explicit VECTORMATH_FORCE_INLINE Vector3( const floatInVec &scalar );
-
-    // Set vector float data in a 3-D vector
-    // 
-    explicit VECTORMATH_FORCE_INLINE Vector3( __m128 vf4 );
-
-    // Get vector float data from a 3-D vector
-    // 
-    VECTORMATH_FORCE_INLINE __m128 get128( ) const;
-
-    // Assign one 3-D vector to another
-    // 
-    VECTORMATH_FORCE_INLINE Vector3 & operator =( const Vector3 &vec );
-
-    // Set the x element of a 3-D vector
-    // 
-    VECTORMATH_FORCE_INLINE Vector3 & setX( float x );
-
-    // Set the y element of a 3-D vector
-    // 
-    VECTORMATH_FORCE_INLINE Vector3 & setY( float y );
-
-    // Set the z element of a 3-D vector
-    // 
-    VECTORMATH_FORCE_INLINE Vector3 & setZ( float z );
-
-    // Set the x element of a 3-D vector (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Vector3 & setX( const floatInVec &x );
-
-    // Set the y element of a 3-D vector (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Vector3 & setY( const floatInVec &y );
-
-    // Set the z element of a 3-D vector (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Vector3 & setZ( const floatInVec &z );
-
-    // Get the x element of a 3-D vector
-    // 
-    VECTORMATH_FORCE_INLINE const floatInVec getX( ) const;
-
-    // Get the y element of a 3-D vector
-    // 
-    VECTORMATH_FORCE_INLINE const floatInVec getY( ) const;
-
-    // Get the z element of a 3-D vector
-    // 
-    VECTORMATH_FORCE_INLINE const floatInVec getZ( ) const;
-
-    // Set an x, y, or z element of a 3-D vector by index
-    // 
-    VECTORMATH_FORCE_INLINE Vector3 & setElem( int idx, float value );
-
-    // Set an x, y, or z element of a 3-D vector by index (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Vector3 & setElem( int idx, const floatInVec &value );
-
-    // Get an x, y, or z element of a 3-D vector by index
-    // 
-    VECTORMATH_FORCE_INLINE const floatInVec getElem( int idx ) const;
-
-    // Subscripting operator to set or get an element
-    // 
-    VECTORMATH_FORCE_INLINE VecIdx operator []( int idx );
-
-    // Subscripting operator to get an element
-    // 
-    VECTORMATH_FORCE_INLINE const floatInVec operator []( int idx ) const;
-
-    // Add two 3-D vectors
-    // 
-    VECTORMATH_FORCE_INLINE const Vector3 operator +( const Vector3 &vec ) const;
-
-    // Subtract a 3-D vector from another 3-D vector
-    // 
-    VECTORMATH_FORCE_INLINE const Vector3 operator -( const Vector3 &vec ) const;
-
-    // Add a 3-D vector to a 3-D point
-    // 
-    VECTORMATH_FORCE_INLINE const Point3 operator +( const Point3 &pnt ) const;
-
-    // Multiply a 3-D vector by a scalar
-    // 
-    VECTORMATH_FORCE_INLINE const Vector3 operator *( float scalar ) const;
-
-    // Divide a 3-D vector by a scalar
-    // 
-    VECTORMATH_FORCE_INLINE const Vector3 operator /( float scalar ) const;
-
-    // Multiply a 3-D vector by a scalar (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE const Vector3 operator *( const floatInVec &scalar ) const;
-
-    // Divide a 3-D vector by a scalar (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE const Vector3 operator /( const floatInVec &scalar ) const;
-
-    // Perform compound assignment and addition with a 3-D vector
-    // 
-    VECTORMATH_FORCE_INLINE Vector3 & operator +=( const Vector3 &vec );
-
-    // Perform compound assignment and subtraction by a 3-D vector
-    // 
-    VECTORMATH_FORCE_INLINE Vector3 & operator -=( const Vector3 &vec );
-
-    // Perform compound assignment and multiplication by a scalar
-    // 
-    VECTORMATH_FORCE_INLINE Vector3 & operator *=( float scalar );
-
-    // Perform compound assignment and division by a scalar
-    // 
-    VECTORMATH_FORCE_INLINE Vector3 & operator /=( float scalar );
-
-    // Perform compound assignment and multiplication by a scalar (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Vector3 & operator *=( const floatInVec &scalar );
-
-    // Perform compound assignment and division by a scalar (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Vector3 & operator /=( const floatInVec &scalar );
-
-    // Negate all elements of a 3-D vector
-    // 
-    VECTORMATH_FORCE_INLINE const Vector3 operator -( ) const;
-
-    // Construct x axis
-    // 
-    static VECTORMATH_FORCE_INLINE const Vector3 xAxis( );
-
-    // Construct y axis
-    // 
-    static VECTORMATH_FORCE_INLINE const Vector3 yAxis( );
-
-    // Construct z axis
-    // 
-    static VECTORMATH_FORCE_INLINE const Vector3 zAxis( );
-
-};
-
-// Multiply a 3-D vector by a scalar
-// 
-VECTORMATH_FORCE_INLINE const Vector3 operator *( float scalar, const Vector3 &vec );
-
-// Multiply a 3-D vector by a scalar (scalar data contained in vector data type)
-// 
-VECTORMATH_FORCE_INLINE const Vector3 operator *( const floatInVec &scalar, const Vector3 &vec );
-
-// Multiply two 3-D vectors per element
-// 
-VECTORMATH_FORCE_INLINE const Vector3 mulPerElem( const Vector3 &vec0, const Vector3 &vec1 );
-
-// Divide two 3-D vectors per element
-// NOTE: 
-// Floating-point behavior matches standard library function divf4.
-// 
-VECTORMATH_FORCE_INLINE const Vector3 divPerElem( const Vector3 &vec0, const Vector3 &vec1 );
-
-// Compute the reciprocal of a 3-D vector per element
-// NOTE: 
-// Floating-point behavior matches standard library function recipf4.
-// 
-VECTORMATH_FORCE_INLINE const Vector3 recipPerElem( const Vector3 &vec );
-
-// Compute the absolute value of a 3-D vector per element
-// 
-VECTORMATH_FORCE_INLINE const Vector3 absPerElem( const Vector3 &vec );
-
-// Copy sign from one 3-D vector to another, per element
-// 
-VECTORMATH_FORCE_INLINE const Vector3 copySignPerElem( const Vector3 &vec0, const Vector3 &vec1 );
-
-// Maximum of two 3-D vectors per element
-// 
-VECTORMATH_FORCE_INLINE const Vector3 maxPerElem( const Vector3 &vec0, const Vector3 &vec1 );
-
-// Minimum of two 3-D vectors per element
-// 
-VECTORMATH_FORCE_INLINE const Vector3 minPerElem( const Vector3 &vec0, const Vector3 &vec1 );
-
-// Maximum element of a 3-D vector
-// 
-VECTORMATH_FORCE_INLINE const floatInVec maxElem( const Vector3 &vec );
-
-// Minimum element of a 3-D vector
-// 
-VECTORMATH_FORCE_INLINE const floatInVec minElem( const Vector3 &vec );
-
-// Compute the sum of all elements of a 3-D vector
-// 
-VECTORMATH_FORCE_INLINE const floatInVec sum( const Vector3 &vec );
-
-// Compute the dot product of two 3-D vectors
-// 
-VECTORMATH_FORCE_INLINE const floatInVec dot( const Vector3 &vec0, const Vector3 &vec1 );
-
-// Compute the square of the length of a 3-D vector
-// 
-VECTORMATH_FORCE_INLINE const floatInVec lengthSqr( const Vector3 &vec );
-
-// Compute the length of a 3-D vector
-// 
-VECTORMATH_FORCE_INLINE const floatInVec length( const Vector3 &vec );
-
-// Normalize a 3-D vector
-// NOTE: 
-// The result is unpredictable when all elements of vec are at or near zero.
-// 
-VECTORMATH_FORCE_INLINE const Vector3 normalize( const Vector3 &vec );
-
-// Compute cross product of two 3-D vectors
-// 
-VECTORMATH_FORCE_INLINE const Vector3 cross( const Vector3 &vec0, const Vector3 &vec1 );
-
-// Outer product of two 3-D vectors
-// 
-VECTORMATH_FORCE_INLINE const Matrix3 outer( const Vector3 &vec0, const Vector3 &vec1 );
-
-// Pre-multiply a row vector by a 3x3 matrix
-// NOTE: 
-// Slower than column post-multiply.
-// 
-VECTORMATH_FORCE_INLINE const Vector3 rowMul( const Vector3 &vec, const Matrix3 & mat );
-
-// Cross-product matrix of a 3-D vector
-// 
-VECTORMATH_FORCE_INLINE const Matrix3 crossMatrix( const Vector3 &vec );
-
-// Create cross-product matrix and multiply
-// NOTE: 
-// Faster than separately creating a cross-product matrix and multiplying.
-// 
-VECTORMATH_FORCE_INLINE const Matrix3 crossMatrixMul( const Vector3 &vec, const Matrix3 & mat );
-
-// Linear interpolation between two 3-D vectors
-// NOTE: 
-// Does not clamp t between 0 and 1.
-// 
-VECTORMATH_FORCE_INLINE const Vector3 lerp( float t, const Vector3 &vec0, const Vector3 &vec1 );
-
-// Linear interpolation between two 3-D vectors (scalar data contained in vector data type)
-// NOTE: 
-// Does not clamp t between 0 and 1.
-// 
-VECTORMATH_FORCE_INLINE const Vector3 lerp( const floatInVec &t, const Vector3 &vec0, const Vector3 &vec1 );
-
-// Spherical linear interpolation between two 3-D vectors
-// NOTE: 
-// The result is unpredictable if the vectors point in opposite directions.
-// Does not clamp t between 0 and 1.
-// 
-VECTORMATH_FORCE_INLINE const Vector3 slerp( float t, const Vector3 &unitVec0, const Vector3 &unitVec1 );
-
-// Spherical linear interpolation between two 3-D vectors (scalar data contained in vector data type)
-// NOTE: 
-// The result is unpredictable if the vectors point in opposite directions.
-// Does not clamp t between 0 and 1.
-// 
-VECTORMATH_FORCE_INLINE const Vector3 slerp( const floatInVec &t, const Vector3 &unitVec0, const Vector3 &unitVec1 );
-
-// Conditionally select between two 3-D vectors
-// NOTE: 
-// This function uses a conditional select instruction to avoid a branch.
-// However, the transfer of select1 to a VMX register may use more processing time than a branch.
-// Use the boolInVec version for better performance.
-// 
-VECTORMATH_FORCE_INLINE const Vector3 select( const Vector3 &vec0, const Vector3 &vec1, bool select1 );
-
-// Conditionally select between two 3-D vectors (scalar data contained in vector data type)
-// NOTE: 
-// This function uses a conditional select instruction to avoid a branch.
-// 
-VECTORMATH_FORCE_INLINE const Vector3 select( const Vector3 &vec0, const Vector3 &vec1, const boolInVec &select1 );
-
-// Store x, y, and z elements of 3-D vector in first three words of a quadword, preserving fourth word
-// 
-VECTORMATH_FORCE_INLINE void storeXYZ( const Vector3 &vec, __m128 * quad );
-
-// Load four three-float 3-D vectors, stored in three quadwords
-// 
-VECTORMATH_FORCE_INLINE void loadXYZArray( Vector3 & vec0, Vector3 & vec1, Vector3 & vec2, Vector3 & vec3, const __m128 * threeQuads );
-
-// Store four 3-D vectors in three quadwords
-// 
-VECTORMATH_FORCE_INLINE void storeXYZArray( const Vector3 &vec0, const Vector3 &vec1, const Vector3 &vec2, const Vector3 &vec3, __m128 * threeQuads );
-
-// Store eight 3-D vectors as half-floats
-// 
-VECTORMATH_FORCE_INLINE void storeHalfFloats( const Vector3 &vec0, const Vector3 &vec1, const Vector3 &vec2, const Vector3 &vec3, const Vector3 &vec4, const Vector3 &vec5, const Vector3 &vec6, const Vector3 &vec7, vec_ushort8 * threeQuads );
-
-#ifdef _VECTORMATH_DEBUG
-
-// Print a 3-D vector
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-// 
-VECTORMATH_FORCE_INLINE void print( const Vector3 &vec );
-
-// Print a 3-D vector and an associated string identifier
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-// 
-VECTORMATH_FORCE_INLINE void print( const Vector3 &vec, const char * name );
-
-#endif
-
-// A 4-D vector in array-of-structures format
-//
-class Vector4
-{
-    __m128 mVec128;
-
-public:
-    // Default constructor; does no initialization
-    // 
-    VECTORMATH_FORCE_INLINE Vector4( ) { };
-
-    // Construct a 4-D vector from x, y, z, and w elements
-    // 
-    VECTORMATH_FORCE_INLINE Vector4( float x, float y, float z, float w );
-
-    // Construct a 4-D vector from x, y, z, and w elements (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Vector4( const floatInVec &x, const floatInVec &y, const floatInVec &z, const floatInVec &w );
-
-    // Construct a 4-D vector from a 3-D vector and a scalar
-    // 
-    VECTORMATH_FORCE_INLINE Vector4( const Vector3 &xyz, float w );
-
-    // Construct a 4-D vector from a 3-D vector and a scalar (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Vector4( const Vector3 &xyz, const floatInVec &w );
-
-    // Copy x, y, and z from a 3-D vector into a 4-D vector, and set w to 0
-    // 
-    explicit VECTORMATH_FORCE_INLINE Vector4( const Vector3 &vec );
-
-    // Copy x, y, and z from a 3-D point into a 4-D vector, and set w to 1
-    // 
-    explicit VECTORMATH_FORCE_INLINE Vector4( const Point3 &pnt );
-
-    // Copy elements from a quaternion into a 4-D vector
-    // 
-    explicit VECTORMATH_FORCE_INLINE Vector4( const Quat &quat );
-
-    // Set all elements of a 4-D vector to the same scalar value
-    // 
-    explicit VECTORMATH_FORCE_INLINE Vector4( float scalar );
-
-    // Set all elements of a 4-D vector to the same scalar value (scalar data contained in vector data type)
-    // 
-    explicit VECTORMATH_FORCE_INLINE Vector4( const floatInVec &scalar );
-
-    // Set vector float data in a 4-D vector
-    // 
-    explicit VECTORMATH_FORCE_INLINE Vector4( __m128 vf4 );
-
-    // Get vector float data from a 4-D vector
-    // 
-    VECTORMATH_FORCE_INLINE __m128 get128( ) const;
-
-    // Assign one 4-D vector to another
-    // 
-    VECTORMATH_FORCE_INLINE Vector4 & operator =( const Vector4 &vec );
-
-    // Set the x, y, and z elements of a 4-D vector
-    // NOTE: 
-    // This function does not change the w element.
-    // 
-    VECTORMATH_FORCE_INLINE Vector4 & setXYZ( const Vector3 &vec );
-
-    // Get the x, y, and z elements of a 4-D vector
-    // 
-    VECTORMATH_FORCE_INLINE const Vector3 getXYZ( ) const;
-
-    // Set the x element of a 4-D vector
-    // 
-    VECTORMATH_FORCE_INLINE Vector4 & setX( float x );
-
-    // Set the y element of a 4-D vector
-    // 
-    VECTORMATH_FORCE_INLINE Vector4 & setY( float y );
-
-    // Set the z element of a 4-D vector
-    // 
-    VECTORMATH_FORCE_INLINE Vector4 & setZ( float z );
-
-    // Set the w element of a 4-D vector
-    // 
-    VECTORMATH_FORCE_INLINE Vector4 & setW( float w );
-
-    // Set the x element of a 4-D vector (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Vector4 & setX( const floatInVec &x );
-
-    // Set the y element of a 4-D vector (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Vector4 & setY( const floatInVec &y );
-
-    // Set the z element of a 4-D vector (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Vector4 & setZ( const floatInVec &z );
-
-    // Set the w element of a 4-D vector (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Vector4 & setW( const floatInVec &w );
-
-    // Get the x element of a 4-D vector
-    // 
-    VECTORMATH_FORCE_INLINE const floatInVec getX( ) const;
-
-    // Get the y element of a 4-D vector
-    // 
-    VECTORMATH_FORCE_INLINE const floatInVec getY( ) const;
-
-    // Get the z element of a 4-D vector
-    // 
-    VECTORMATH_FORCE_INLINE const floatInVec getZ( ) const;
-
-    // Get the w element of a 4-D vector
-    // 
-    VECTORMATH_FORCE_INLINE const floatInVec getW( ) const;
-
-    // Set an x, y, z, or w element of a 4-D vector by index
-    // 
-    VECTORMATH_FORCE_INLINE Vector4 & setElem( int idx, float value );
-
-    // Set an x, y, z, or w element of a 4-D vector by index (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Vector4 & setElem( int idx, const floatInVec &value );
-
-    // Get an x, y, z, or w element of a 4-D vector by index
-    // 
-    VECTORMATH_FORCE_INLINE const floatInVec getElem( int idx ) const;
-
-    // Subscripting operator to set or get an element
-    // 
-    VECTORMATH_FORCE_INLINE VecIdx operator []( int idx );
-
-    // Subscripting operator to get an element
-    // 
-    VECTORMATH_FORCE_INLINE const floatInVec operator []( int idx ) const;
-
-    // Add two 4-D vectors
-    // 
-    VECTORMATH_FORCE_INLINE const Vector4 operator +( const Vector4 &vec ) const;
-
-    // Subtract a 4-D vector from another 4-D vector
-    // 
-    VECTORMATH_FORCE_INLINE const Vector4 operator -( const Vector4 &vec ) const;
-
-    // Multiply a 4-D vector by a scalar
-    // 
-    VECTORMATH_FORCE_INLINE const Vector4 operator *( float scalar ) const;
-
-    // Divide a 4-D vector by a scalar
-    // 
-    VECTORMATH_FORCE_INLINE const Vector4 operator /( float scalar ) const;
-
-    // Multiply a 4-D vector by a scalar (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE const Vector4 operator *( const floatInVec &scalar ) const;
-
-    // Divide a 4-D vector by a scalar (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE const Vector4 operator /( const floatInVec &scalar ) const;
-
-    // Perform compound assignment and addition with a 4-D vector
-    // 
-    VECTORMATH_FORCE_INLINE Vector4 & operator +=( const Vector4 &vec );
-
-    // Perform compound assignment and subtraction by a 4-D vector
-    // 
-    VECTORMATH_FORCE_INLINE Vector4 & operator -=( const Vector4 &vec );
-
-    // Perform compound assignment and multiplication by a scalar
-    // 
-    VECTORMATH_FORCE_INLINE Vector4 & operator *=( float scalar );
-
-    // Perform compound assignment and division by a scalar
-    // 
-    VECTORMATH_FORCE_INLINE Vector4 & operator /=( float scalar );
-
-    // Perform compound assignment and multiplication by a scalar (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Vector4 & operator *=( const floatInVec &scalar );
-
-    // Perform compound assignment and division by a scalar (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Vector4 & operator /=( const floatInVec &scalar );
-
-    // Negate all elements of a 4-D vector
-    // 
-    VECTORMATH_FORCE_INLINE const Vector4 operator -( ) const;
-
-    // Construct x axis
-    // 
-    static VECTORMATH_FORCE_INLINE const Vector4 xAxis( );
-
-    // Construct y axis
-    // 
-    static VECTORMATH_FORCE_INLINE const Vector4 yAxis( );
-
-    // Construct z axis
-    // 
-    static VECTORMATH_FORCE_INLINE const Vector4 zAxis( );
-
-    // Construct w axis
-    // 
-    static VECTORMATH_FORCE_INLINE const Vector4 wAxis( );
-
-};
-
-// Multiply a 4-D vector by a scalar
-// 
-VECTORMATH_FORCE_INLINE const Vector4 operator *( float scalar, const Vector4 &vec );
-
-// Multiply a 4-D vector by a scalar (scalar data contained in vector data type)
-// 
-VECTORMATH_FORCE_INLINE const Vector4 operator *( const floatInVec &scalar, const Vector4 &vec );
-
-// Multiply two 4-D vectors per element
-// 
-VECTORMATH_FORCE_INLINE const Vector4 mulPerElem( const Vector4 &vec0, const Vector4 &vec1 );
-
-// Divide two 4-D vectors per element
-// NOTE: 
-// Floating-point behavior matches standard library function divf4.
-// 
-VECTORMATH_FORCE_INLINE const Vector4 divPerElem( const Vector4 &vec0, const Vector4 &vec1 );
-
-// Compute the reciprocal of a 4-D vector per element
-// NOTE: 
-// Floating-point behavior matches standard library function recipf4.
-// 
-VECTORMATH_FORCE_INLINE const Vector4 recipPerElem( const Vector4 &vec );
-
-// Compute the absolute value of a 4-D vector per element
-// 
-VECTORMATH_FORCE_INLINE const Vector4 absPerElem( const Vector4 &vec );
-
-// Copy sign from one 4-D vector to another, per element
-// 
-VECTORMATH_FORCE_INLINE const Vector4 copySignPerElem( const Vector4 &vec0, const Vector4 &vec1 );
-
-// Maximum of two 4-D vectors per element
-// 
-VECTORMATH_FORCE_INLINE const Vector4 maxPerElem( const Vector4 &vec0, const Vector4 &vec1 );
-
-// Minimum of two 4-D vectors per element
-// 
-VECTORMATH_FORCE_INLINE const Vector4 minPerElem( const Vector4 &vec0, const Vector4 &vec1 );
-
-// Maximum element of a 4-D vector
-// 
-VECTORMATH_FORCE_INLINE const floatInVec maxElem( const Vector4 &vec );
-
-// Minimum element of a 4-D vector
-// 
-VECTORMATH_FORCE_INLINE const floatInVec minElem( const Vector4 &vec );
-
-// Compute the sum of all elements of a 4-D vector
-// 
-VECTORMATH_FORCE_INLINE const floatInVec sum( const Vector4 &vec );
-
-// Compute the dot product of two 4-D vectors
-// 
-VECTORMATH_FORCE_INLINE const floatInVec dot( const Vector4 &vec0, const Vector4 &vec1 );
-
-// Compute the square of the length of a 4-D vector
-// 
-VECTORMATH_FORCE_INLINE const floatInVec lengthSqr( const Vector4 &vec );
-
-// Compute the length of a 4-D vector
-// 
-VECTORMATH_FORCE_INLINE const floatInVec length( const Vector4 &vec );
-
-// Normalize a 4-D vector
-// NOTE: 
-// The result is unpredictable when all elements of vec are at or near zero.
-// 
-VECTORMATH_FORCE_INLINE const Vector4 normalize( const Vector4 &vec );
-
-// Outer product of two 4-D vectors
-// 
-VECTORMATH_FORCE_INLINE const Matrix4 outer( const Vector4 &vec0, const Vector4 &vec1 );
-
-// Linear interpolation between two 4-D vectors
-// NOTE: 
-// Does not clamp t between 0 and 1.
-// 
-VECTORMATH_FORCE_INLINE const Vector4 lerp( float t, const Vector4 &vec0, const Vector4 &vec1 );
-
-// Linear interpolation between two 4-D vectors (scalar data contained in vector data type)
-// NOTE: 
-// Does not clamp t between 0 and 1.
-// 
-VECTORMATH_FORCE_INLINE const Vector4 lerp( const floatInVec &t, const Vector4 &vec0, const Vector4 &vec1 );
-
-// Spherical linear interpolation between two 4-D vectors
-// NOTE: 
-// The result is unpredictable if the vectors point in opposite directions.
-// Does not clamp t between 0 and 1.
-// 
-VECTORMATH_FORCE_INLINE const Vector4 slerp( float t, const Vector4 &unitVec0, const Vector4 &unitVec1 );
-
-// Spherical linear interpolation between two 4-D vectors (scalar data contained in vector data type)
-// NOTE: 
-// The result is unpredictable if the vectors point in opposite directions.
-// Does not clamp t between 0 and 1.
-// 
-VECTORMATH_FORCE_INLINE const Vector4 slerp( const floatInVec &t, const Vector4 &unitVec0, const Vector4 &unitVec1 );
-
-// Conditionally select between two 4-D vectors
-// NOTE: 
-// This function uses a conditional select instruction to avoid a branch.
-// However, the transfer of select1 to a VMX register may use more processing time than a branch.
-// Use the boolInVec version for better performance.
-// 
-VECTORMATH_FORCE_INLINE const Vector4 select( const Vector4 &vec0, const Vector4 &vec1, bool select1 );
-
-// Conditionally select between two 4-D vectors (scalar data contained in vector data type)
-// NOTE: 
-// This function uses a conditional select instruction to avoid a branch.
-// 
-VECTORMATH_FORCE_INLINE const Vector4 select( const Vector4 &vec0, const Vector4 &vec1, const boolInVec &select1 );
-
-// Store four 4-D vectors as half-floats
-// 
-VECTORMATH_FORCE_INLINE void storeHalfFloats( const Vector4 &vec0, const Vector4 &vec1, const Vector4 &vec2, const Vector4 &vec3, vec_ushort8 * twoQuads );
-
-#ifdef _VECTORMATH_DEBUG
-
-// Print a 4-D vector
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-// 
-VECTORMATH_FORCE_INLINE void print( const Vector4 &vec );
-
-// Print a 4-D vector and an associated string identifier
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-// 
-VECTORMATH_FORCE_INLINE void print( const Vector4 &vec, const char * name );
-
-#endif
-
-// A 3-D point in array-of-structures format
-//
-class Point3
-{
-    __m128 mVec128;
-
-public:
-    // Default constructor; does no initialization
-    // 
-    VECTORMATH_FORCE_INLINE Point3( ) { };
-
-    // Construct a 3-D point from x, y, and z elements
-    // 
-    VECTORMATH_FORCE_INLINE Point3( float x, float y, float z );
-
-    // Construct a 3-D point from x, y, and z elements (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Point3( const floatInVec &x, const floatInVec &y, const floatInVec &z );
-
-    // Copy elements from a 3-D vector into a 3-D point
-    // 
-    explicit VECTORMATH_FORCE_INLINE Point3( const Vector3 &vec );
-
-    // Set all elements of a 3-D point to the same scalar value
-    // 
-    explicit VECTORMATH_FORCE_INLINE Point3( float scalar );
-
-    // Set all elements of a 3-D point to the same scalar value (scalar data contained in vector data type)
-    // 
-    explicit VECTORMATH_FORCE_INLINE Point3( const floatInVec &scalar );
-
-    // Set vector float data in a 3-D point
-    // 
-    explicit VECTORMATH_FORCE_INLINE Point3( __m128 vf4 );
-
-    // Get vector float data from a 3-D point
-    // 
-    VECTORMATH_FORCE_INLINE __m128 get128( ) const;
-
-    // Assign one 3-D point to another
-    // 
-    VECTORMATH_FORCE_INLINE Point3 & operator =( const Point3 &pnt );
-
-    // Set the x element of a 3-D point
-    // 
-    VECTORMATH_FORCE_INLINE Point3 & setX( float x );
-
-    // Set the y element of a 3-D point
-    // 
-    VECTORMATH_FORCE_INLINE Point3 & setY( float y );
-
-    // Set the z element of a 3-D point
-    // 
-    VECTORMATH_FORCE_INLINE Point3 & setZ( float z );
-
-    // Set the x element of a 3-D point (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Point3 & setX( const floatInVec &x );
-
-    // Set the y element of a 3-D point (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Point3 & setY( const floatInVec &y );
-
-    // Set the z element of a 3-D point (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Point3 & setZ( const floatInVec &z );
-
-    // Get the x element of a 3-D point
-    // 
-    VECTORMATH_FORCE_INLINE const floatInVec getX( ) const;
-
-    // Get the y element of a 3-D point
-    // 
-    VECTORMATH_FORCE_INLINE const floatInVec getY( ) const;
-
-    // Get the z element of a 3-D point
-    // 
-    VECTORMATH_FORCE_INLINE const floatInVec getZ( ) const;
-
-    // Set an x, y, or z element of a 3-D point by index
-    // 
-    VECTORMATH_FORCE_INLINE Point3 & setElem( int idx, float value );
-
-    // Set an x, y, or z element of a 3-D point by index (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Point3 & setElem( int idx, const floatInVec &value );
-
-    // Get an x, y, or z element of a 3-D point by index
-    // 
-    VECTORMATH_FORCE_INLINE const floatInVec getElem( int idx ) const;
-
-    // Subscripting operator to set or get an element
-    // 
-    VECTORMATH_FORCE_INLINE VecIdx operator []( int idx );
-
-    // Subscripting operator to get an element
-    // 
-    VECTORMATH_FORCE_INLINE const floatInVec operator []( int idx ) const;
-
-    // Subtract a 3-D point from another 3-D point
-    // 
-    VECTORMATH_FORCE_INLINE const Vector3 operator -( const Point3 &pnt ) const;
-
-    // Add a 3-D point to a 3-D vector
-    // 
-    VECTORMATH_FORCE_INLINE const Point3 operator +( const Vector3 &vec ) const;
-
-    // Subtract a 3-D vector from a 3-D point
-    // 
-    VECTORMATH_FORCE_INLINE const Point3 operator -( const Vector3 &vec ) const;
-
-    // Perform compound assignment and addition with a 3-D vector
-    // 
-    VECTORMATH_FORCE_INLINE Point3 & operator +=( const Vector3 &vec );
-
-    // Perform compound assignment and subtraction by a 3-D vector
-    // 
-    VECTORMATH_FORCE_INLINE Point3 & operator -=( const Vector3 &vec );
-
-};
-
-// Multiply two 3-D points per element
-// 
-VECTORMATH_FORCE_INLINE const Point3 mulPerElem( const Point3 &pnt0, const Point3 &pnt1 );
-
-// Divide two 3-D points per element
-// NOTE: 
-// Floating-point behavior matches standard library function divf4.
-// 
-VECTORMATH_FORCE_INLINE const Point3 divPerElem( const Point3 &pnt0, const Point3 &pnt1 );
-
-// Compute the reciprocal of a 3-D point per element
-// NOTE: 
-// Floating-point behavior matches standard library function recipf4.
-// 
-VECTORMATH_FORCE_INLINE const Point3 recipPerElem( const Point3 &pnt );
-
-// Compute the absolute value of a 3-D point per element
-// 
-VECTORMATH_FORCE_INLINE const Point3 absPerElem( const Point3 &pnt );
-
-// Copy sign from one 3-D point to another, per element
-// 
-VECTORMATH_FORCE_INLINE const Point3 copySignPerElem( const Point3 &pnt0, const Point3 &pnt1 );
-
-// Maximum of two 3-D points per element
-// 
-VECTORMATH_FORCE_INLINE const Point3 maxPerElem( const Point3 &pnt0, const Point3 &pnt1 );
-
-// Minimum of two 3-D points per element
-// 
-VECTORMATH_FORCE_INLINE const Point3 minPerElem( const Point3 &pnt0, const Point3 &pnt1 );
-
-// Maximum element of a 3-D point
-// 
-VECTORMATH_FORCE_INLINE const floatInVec maxElem( const Point3 &pnt );
-
-// Minimum element of a 3-D point
-// 
-VECTORMATH_FORCE_INLINE const floatInVec minElem( const Point3 &pnt );
-
-// Compute the sum of all elements of a 3-D point
-// 
-VECTORMATH_FORCE_INLINE const floatInVec sum( const Point3 &pnt );
-
-// Apply uniform scale to a 3-D point
-// 
-VECTORMATH_FORCE_INLINE const Point3 scale( const Point3 &pnt, float scaleVal );
-
-// Apply uniform scale to a 3-D point (scalar data contained in vector data type)
-// 
-VECTORMATH_FORCE_INLINE const Point3 scale( const Point3 &pnt, const floatInVec &scaleVal );
-
-// Apply non-uniform scale to a 3-D point
-// 
-VECTORMATH_FORCE_INLINE const Point3 scale( const Point3 &pnt, const Vector3 &scaleVec );
-
-// Scalar projection of a 3-D point on a unit-length 3-D vector
-// 
-VECTORMATH_FORCE_INLINE const floatInVec projection( const Point3 &pnt, const Vector3 &unitVec );
-
-// Compute the square of the distance of a 3-D point from the coordinate-system origin
-// 
-VECTORMATH_FORCE_INLINE const floatInVec distSqrFromOrigin( const Point3 &pnt );
-
-// Compute the distance of a 3-D point from the coordinate-system origin
-// 
-VECTORMATH_FORCE_INLINE const floatInVec distFromOrigin( const Point3 &pnt );
-
-// Compute the square of the distance between two 3-D points
-// 
-VECTORMATH_FORCE_INLINE const floatInVec distSqr( const Point3 &pnt0, const Point3 &pnt1 );
-
-// Compute the distance between two 3-D points
-// 
-VECTORMATH_FORCE_INLINE const floatInVec dist( const Point3 &pnt0, const Point3 &pnt1 );
-
-// Linear interpolation between two 3-D points
-// NOTE: 
-// Does not clamp t between 0 and 1.
-// 
-VECTORMATH_FORCE_INLINE const Point3 lerp( float t, const Point3 &pnt0, const Point3 &pnt1 );
-
-// Linear interpolation between two 3-D points (scalar data contained in vector data type)
-// NOTE: 
-// Does not clamp t between 0 and 1.
-// 
-VECTORMATH_FORCE_INLINE const Point3 lerp( const floatInVec &t, const Point3 &pnt0, const Point3 &pnt1 );
-
-// Conditionally select between two 3-D points
-// NOTE: 
-// This function uses a conditional select instruction to avoid a branch.
-// However, the transfer of select1 to a VMX register may use more processing time than a branch.
-// Use the boolInVec version for better performance.
-// 
-VECTORMATH_FORCE_INLINE const Point3 select( const Point3 &pnt0, const Point3 &pnt1, bool select1 );
-
-// Conditionally select between two 3-D points (scalar data contained in vector data type)
-// NOTE: 
-// This function uses a conditional select instruction to avoid a branch.
-// 
-VECTORMATH_FORCE_INLINE const Point3 select( const Point3 &pnt0, const Point3 &pnt1, const boolInVec &select1 );
-
-// Store x, y, and z elements of 3-D point in first three words of a quadword, preserving fourth word
-// 
-VECTORMATH_FORCE_INLINE void storeXYZ( const Point3 &pnt, __m128 * quad );
-
-// Load four three-float 3-D points, stored in three quadwords
-// 
-VECTORMATH_FORCE_INLINE void loadXYZArray( Point3 & pnt0, Point3 & pnt1, Point3 & pnt2, Point3 & pnt3, const __m128 * threeQuads );
-
-// Store four 3-D points in three quadwords
-// 
-VECTORMATH_FORCE_INLINE void storeXYZArray( const Point3 &pnt0, const Point3 &pnt1, const Point3 &pnt2, const Point3 &pnt3, __m128 * threeQuads );
-
-// Store eight 3-D points as half-floats
-// 
-VECTORMATH_FORCE_INLINE void storeHalfFloats( const Point3 &pnt0, const Point3 &pnt1, const Point3 &pnt2, const Point3 &pnt3, const Point3 &pnt4, const Point3 &pnt5, const Point3 &pnt6, const Point3 &pnt7, vec_ushort8 * threeQuads );
-
-#ifdef _VECTORMATH_DEBUG
-
-// Print a 3-D point
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-// 
-VECTORMATH_FORCE_INLINE void print( const Point3 &pnt );
-
-// Print a 3-D point and an associated string identifier
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-// 
-VECTORMATH_FORCE_INLINE void print( const Point3 &pnt, const char * name );
-
-#endif
-
-// A quaternion in array-of-structures format
-//
-class Quat
-{
-    __m128 mVec128;
-
-public:
-    // Default constructor; does no initialization
-    // 
-    VECTORMATH_FORCE_INLINE Quat( ) { };
-
-	VECTORMATH_FORCE_INLINE  Quat(const Quat& quat);
-
-    // Construct a quaternion from x, y, z, and w elements
-    // 
-    VECTORMATH_FORCE_INLINE Quat( float x, float y, float z, float w );
-
-    // Construct a quaternion from x, y, z, and w elements (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Quat( const floatInVec &x, const floatInVec &y, const floatInVec &z, const floatInVec &w );
-
-    // Construct a quaternion from a 3-D vector and a scalar
-    // 
-    VECTORMATH_FORCE_INLINE Quat( const Vector3 &xyz, float w );
-
-    // Construct a quaternion from a 3-D vector and a scalar (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Quat( const Vector3 &xyz, const floatInVec &w );
-
-    // Copy elements from a 4-D vector into a quaternion
-    // 
-    explicit VECTORMATH_FORCE_INLINE Quat( const Vector4 &vec );
-
-    // Convert a rotation matrix to a unit-length quaternion
-    // 
-    explicit VECTORMATH_FORCE_INLINE Quat( const Matrix3 & rotMat );
-
-    // Set all elements of a quaternion to the same scalar value
-    // 
-    explicit VECTORMATH_FORCE_INLINE Quat( float scalar );
-
-    // Set all elements of a quaternion to the same scalar value (scalar data contained in vector data type)
-    // 
-    explicit VECTORMATH_FORCE_INLINE Quat( const floatInVec &scalar );
-
-    // Set vector float data in a quaternion
-    // 
-    explicit VECTORMATH_FORCE_INLINE Quat( __m128 vf4 );
-
-    // Get vector float data from a quaternion
-    // 
-    VECTORMATH_FORCE_INLINE __m128 get128( ) const;
-
-	// Set a quaterion from vector float data
-    //
-	VECTORMATH_FORCE_INLINE void set128(vec_float4 vec);
-
-    // Assign one quaternion to another
-    // 
-    VECTORMATH_FORCE_INLINE Quat & operator =( const Quat &quat );
-
-    // Set the x, y, and z elements of a quaternion
-    // NOTE: 
-    // This function does not change the w element.
-    // 
-    VECTORMATH_FORCE_INLINE Quat & setXYZ( const Vector3 &vec );
-
-    // Get the x, y, and z elements of a quaternion
-    // 
-    VECTORMATH_FORCE_INLINE const Vector3 getXYZ( ) const;
-
-    // Set the x element of a quaternion
-    // 
-    VECTORMATH_FORCE_INLINE Quat & setX( float x );
-
-    // Set the y element of a quaternion
-    // 
-    VECTORMATH_FORCE_INLINE Quat & setY( float y );
-
-    // Set the z element of a quaternion
-    // 
-    VECTORMATH_FORCE_INLINE Quat & setZ( float z );
-
-    // Set the w element of a quaternion
-    // 
-    VECTORMATH_FORCE_INLINE Quat & setW( float w );
-
-    // Set the x element of a quaternion (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Quat & setX( const floatInVec &x );
-
-    // Set the y element of a quaternion (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Quat & setY( const floatInVec &y );
-
-    // Set the z element of a quaternion (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Quat & setZ( const floatInVec &z );
-
-    // Set the w element of a quaternion (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Quat & setW( const floatInVec &w );
-
-    // Get the x element of a quaternion
-    // 
-    VECTORMATH_FORCE_INLINE const floatInVec getX( ) const;
-
-    // Get the y element of a quaternion
-    // 
-    VECTORMATH_FORCE_INLINE const floatInVec getY( ) const;
-
-    // Get the z element of a quaternion
-    // 
-    VECTORMATH_FORCE_INLINE const floatInVec getZ( ) const;
-
-    // Get the w element of a quaternion
-    // 
-    VECTORMATH_FORCE_INLINE const floatInVec getW( ) const;
-
-    // Set an x, y, z, or w element of a quaternion by index
-    // 
-    VECTORMATH_FORCE_INLINE Quat & setElem( int idx, float value );
-
-    // Set an x, y, z, or w element of a quaternion by index (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Quat & setElem( int idx, const floatInVec &value );
-
-    // Get an x, y, z, or w element of a quaternion by index
-    // 
-    VECTORMATH_FORCE_INLINE const floatInVec getElem( int idx ) const;
-
-    // Subscripting operator to set or get an element
-    // 
-    VECTORMATH_FORCE_INLINE VecIdx operator []( int idx );
-
-    // Subscripting operator to get an element
-    // 
-    VECTORMATH_FORCE_INLINE const floatInVec operator []( int idx ) const;
-
-    // Add two quaternions
-    // 
-    VECTORMATH_FORCE_INLINE const Quat operator +( const Quat &quat ) const;
-
-    // Subtract a quaternion from another quaternion
-    // 
-    VECTORMATH_FORCE_INLINE const Quat operator -( const Quat &quat ) const;
-
-    // Multiply two quaternions
-    // 
-    VECTORMATH_FORCE_INLINE const Quat operator *( const Quat &quat ) const;
-
-    // Multiply a quaternion by a scalar
-    // 
-    VECTORMATH_FORCE_INLINE const Quat operator *( float scalar ) const;
-
-    // Divide a quaternion by a scalar
-    // 
-    VECTORMATH_FORCE_INLINE const Quat operator /( float scalar ) const;
-
-    // Multiply a quaternion by a scalar (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE const Quat operator *( const floatInVec &scalar ) const;
-
-    // Divide a quaternion by a scalar (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE const Quat operator /( const floatInVec &scalar ) const;
-
-    // Perform compound assignment and addition with a quaternion
-    // 
-    VECTORMATH_FORCE_INLINE Quat & operator +=( const Quat &quat );
-
-    // Perform compound assignment and subtraction by a quaternion
-    // 
-    VECTORMATH_FORCE_INLINE Quat & operator -=( const Quat &quat );
-
-    // Perform compound assignment and multiplication by a quaternion
-    // 
-    VECTORMATH_FORCE_INLINE Quat & operator *=( const Quat &quat );
-
-    // Perform compound assignment and multiplication by a scalar
-    // 
-    VECTORMATH_FORCE_INLINE Quat & operator *=( float scalar );
-
-    // Perform compound assignment and division by a scalar
-    // 
-    VECTORMATH_FORCE_INLINE Quat & operator /=( float scalar );
-
-    // Perform compound assignment and multiplication by a scalar (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Quat & operator *=( const floatInVec &scalar );
-
-    // Perform compound assignment and division by a scalar (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Quat & operator /=( const floatInVec &scalar );
-
-    // Negate all elements of a quaternion
-    // 
-    VECTORMATH_FORCE_INLINE const Quat operator -( ) const;
-
-    // Construct an identity quaternion
-    // 
-    static VECTORMATH_FORCE_INLINE const Quat identity( );
-
-    // Construct a quaternion to rotate between two unit-length 3-D vectors
-    // NOTE: 
-    // The result is unpredictable if unitVec0 and unitVec1 point in opposite directions.
-    // 
-    static VECTORMATH_FORCE_INLINE const Quat rotation( const Vector3 &unitVec0, const Vector3 &unitVec1 );
-
-    // Construct a quaternion to rotate around a unit-length 3-D vector
-    // 
-    static VECTORMATH_FORCE_INLINE const Quat rotation( float radians, const Vector3 &unitVec );
-
-    // Construct a quaternion to rotate around a unit-length 3-D vector (scalar data contained in vector data type)
-    // 
-    static VECTORMATH_FORCE_INLINE const Quat rotation( const floatInVec &radians, const Vector3 &unitVec );
-
-    // Construct a quaternion to rotate around the x axis
-    // 
-    static VECTORMATH_FORCE_INLINE const Quat rotationX( float radians );
-
-    // Construct a quaternion to rotate around the y axis
-    // 
-    static VECTORMATH_FORCE_INLINE const Quat rotationY( float radians );
-
-    // Construct a quaternion to rotate around the z axis
-    // 
-    static VECTORMATH_FORCE_INLINE const Quat rotationZ( float radians );
-
-    // Construct a quaternion to rotate around the x axis (scalar data contained in vector data type)
-    // 
-    static VECTORMATH_FORCE_INLINE const Quat rotationX( const floatInVec &radians );
-
-    // Construct a quaternion to rotate around the y axis (scalar data contained in vector data type)
-    // 
-    static VECTORMATH_FORCE_INLINE const Quat rotationY( const floatInVec &radians );
-
-    // Construct a quaternion to rotate around the z axis (scalar data contained in vector data type)
-    // 
-    static VECTORMATH_FORCE_INLINE const Quat rotationZ( const floatInVec &radians );
-
-};
-
-// Multiply a quaternion by a scalar
-// 
-VECTORMATH_FORCE_INLINE const Quat operator *( float scalar, const Quat &quat );
-
-// Multiply a quaternion by a scalar (scalar data contained in vector data type)
-// 
-VECTORMATH_FORCE_INLINE const Quat operator *( const floatInVec &scalar, const Quat &quat );
-
-// Compute the conjugate of a quaternion
-// 
-VECTORMATH_FORCE_INLINE const Quat conj( const Quat &quat );
-
-// Use a unit-length quaternion to rotate a 3-D vector
-// 
-VECTORMATH_FORCE_INLINE const Vector3 rotate( const Quat &unitQuat, const Vector3 &vec );
-
-// Compute the dot product of two quaternions
-// 
-VECTORMATH_FORCE_INLINE const floatInVec dot( const Quat &quat0, const Quat &quat1 );
-
-// Compute the norm of a quaternion
-// 
-VECTORMATH_FORCE_INLINE const floatInVec norm( const Quat &quat );
-
-// Compute the length of a quaternion
-// 
-VECTORMATH_FORCE_INLINE const floatInVec length( const Quat &quat );
-
-// Normalize a quaternion
-// NOTE: 
-// The result is unpredictable when all elements of quat are at or near zero.
-// 
-VECTORMATH_FORCE_INLINE const Quat normalize( const Quat &quat );
-
-// Linear interpolation between two quaternions
-// NOTE: 
-// Does not clamp t between 0 and 1.
-// 
-VECTORMATH_FORCE_INLINE const Quat lerp( float t, const Quat &quat0, const Quat &quat1 );
-
-// Linear interpolation between two quaternions (scalar data contained in vector data type)
-// NOTE: 
-// Does not clamp t between 0 and 1.
-// 
-VECTORMATH_FORCE_INLINE const Quat lerp( const floatInVec &t, const Quat &quat0, const Quat &quat1 );
-
-// Spherical linear interpolation between two quaternions
-// NOTE: 
-// Interpolates along the shortest path between orientations.
-// Does not clamp t between 0 and 1.
-// 
-VECTORMATH_FORCE_INLINE const Quat slerp( float t, const Quat &unitQuat0, const Quat &unitQuat1 );
-
-// Spherical linear interpolation between two quaternions (scalar data contained in vector data type)
-// NOTE: 
-// Interpolates along the shortest path between orientations.
-// Does not clamp t between 0 and 1.
-// 
-VECTORMATH_FORCE_INLINE const Quat slerp( const floatInVec &t, const Quat &unitQuat0, const Quat &unitQuat1 );
-
-// Spherical quadrangle interpolation
-// 
-VECTORMATH_FORCE_INLINE const Quat squad( float t, const Quat &unitQuat0, const Quat &unitQuat1, const Quat &unitQuat2, const Quat &unitQuat3 );
-
-// Spherical quadrangle interpolation (scalar data contained in vector data type)
-// 
-VECTORMATH_FORCE_INLINE const Quat squad( const floatInVec &t, const Quat &unitQuat0, const Quat &unitQuat1, const Quat &unitQuat2, const Quat &unitQuat3 );
-
-// Conditionally select between two quaternions
-// NOTE: 
-// This function uses a conditional select instruction to avoid a branch.
-// However, the transfer of select1 to a VMX register may use more processing time than a branch.
-// Use the boolInVec version for better performance.
-// 
-VECTORMATH_FORCE_INLINE const Quat select( const Quat &quat0, const Quat &quat1, bool select1 );
-
-// Conditionally select between two quaternions (scalar data contained in vector data type)
-// NOTE: 
-// This function uses a conditional select instruction to avoid a branch.
-// 
-VECTORMATH_FORCE_INLINE const Quat select( const Quat &quat0, const Quat &quat1, const boolInVec &select1 );
-
-#ifdef _VECTORMATH_DEBUG
-
-// Print a quaternion
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-// 
-VECTORMATH_FORCE_INLINE void print( const Quat &quat );
-
-// Print a quaternion and an associated string identifier
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-// 
-VECTORMATH_FORCE_INLINE void print( const Quat &quat, const char * name );
-
-#endif
-
-// A 3x3 matrix in array-of-structures format
-//
-class Matrix3
-{
-    Vector3 mCol0;
-    Vector3 mCol1;
-    Vector3 mCol2;
-
-public:
-    // Default constructor; does no initialization
-    // 
-    VECTORMATH_FORCE_INLINE Matrix3( ) { };
-
-    // Copy a 3x3 matrix
-    // 
-    VECTORMATH_FORCE_INLINE Matrix3( const Matrix3 & mat );
-
-    // Construct a 3x3 matrix containing the specified columns
-    // 
-    VECTORMATH_FORCE_INLINE Matrix3( const Vector3 &col0, const Vector3 &col1, const Vector3 &col2 );
-
-    // Construct a 3x3 rotation matrix from a unit-length quaternion
-    // 
-    explicit VECTORMATH_FORCE_INLINE Matrix3( const Quat &unitQuat );
-
-    // Set all elements of a 3x3 matrix to the same scalar value
-    // 
-    explicit VECTORMATH_FORCE_INLINE Matrix3( float scalar );
-
-    // Set all elements of a 3x3 matrix to the same scalar value (scalar data contained in vector data type)
-    // 
-    explicit VECTORMATH_FORCE_INLINE Matrix3( const floatInVec &scalar );
-
-    // Assign one 3x3 matrix to another
-    // 
-    VECTORMATH_FORCE_INLINE Matrix3 & operator =( const Matrix3 & mat );
-
-    // Set column 0 of a 3x3 matrix
-    // 
-    VECTORMATH_FORCE_INLINE Matrix3 & setCol0( const Vector3 &col0 );
-
-    // Set column 1 of a 3x3 matrix
-    // 
-    VECTORMATH_FORCE_INLINE Matrix3 & setCol1( const Vector3 &col1 );
-
-    // Set column 2 of a 3x3 matrix
-    // 
-    VECTORMATH_FORCE_INLINE Matrix3 & setCol2( const Vector3 &col2 );
-
-    // Get column 0 of a 3x3 matrix
-    // 
-    VECTORMATH_FORCE_INLINE const Vector3 getCol0( ) const;
-
-    // Get column 1 of a 3x3 matrix
-    // 
-    VECTORMATH_FORCE_INLINE const Vector3 getCol1( ) const;
-
-    // Get column 2 of a 3x3 matrix
-    // 
-    VECTORMATH_FORCE_INLINE const Vector3 getCol2( ) const;
-
-    // Set the column of a 3x3 matrix referred to by the specified index
-    // 
-    VECTORMATH_FORCE_INLINE Matrix3 & setCol( int col, const Vector3 &vec );
-
-    // Set the row of a 3x3 matrix referred to by the specified index
-    // 
-    VECTORMATH_FORCE_INLINE Matrix3 & setRow( int row, const Vector3 &vec );
-
-    // Get the column of a 3x3 matrix referred to by the specified index
-    // 
-    VECTORMATH_FORCE_INLINE const Vector3 getCol( int col ) const;
-
-    // Get the row of a 3x3 matrix referred to by the specified index
-    // 
-    VECTORMATH_FORCE_INLINE const Vector3 getRow( int row ) const;
-
-    // Subscripting operator to set or get a column
-    // 
-    VECTORMATH_FORCE_INLINE Vector3 & operator []( int col );
-
-    // Subscripting operator to get a column
-    // 
-    VECTORMATH_FORCE_INLINE const Vector3 operator []( int col ) const;
-
-    // Set the element of a 3x3 matrix referred to by column and row indices
-    // 
-    VECTORMATH_FORCE_INLINE Matrix3 & setElem( int col, int row, float val );
-
-    // Set the element of a 3x3 matrix referred to by column and row indices (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Matrix3 & setElem( int col, int row, const floatInVec &val );
-
-    // Get the element of a 3x3 matrix referred to by column and row indices
-    // 
-    VECTORMATH_FORCE_INLINE const floatInVec getElem( int col, int row ) const;
-
-    // Add two 3x3 matrices
-    // 
-    VECTORMATH_FORCE_INLINE const Matrix3 operator +( const Matrix3 & mat ) const;
-
-    // Subtract a 3x3 matrix from another 3x3 matrix
-    // 
-    VECTORMATH_FORCE_INLINE const Matrix3 operator -( const Matrix3 & mat ) const;
-
-    // Negate all elements of a 3x3 matrix
-    // 
-    VECTORMATH_FORCE_INLINE const Matrix3 operator -( ) const;
-
-    // Multiply a 3x3 matrix by a scalar
-    // 
-    VECTORMATH_FORCE_INLINE const Matrix3 operator *( float scalar ) const;
-
-    // Multiply a 3x3 matrix by a scalar (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE const Matrix3 operator *( const floatInVec &scalar ) const;
-
-    // Multiply a 3x3 matrix by a 3-D vector
-    // 
-    VECTORMATH_FORCE_INLINE const Vector3 operator *( const Vector3 &vec ) const;
-
-    // Multiply two 3x3 matrices
-    // 
-    VECTORMATH_FORCE_INLINE const Matrix3 operator *( const Matrix3 & mat ) const;
-
-    // Perform compound assignment and addition with a 3x3 matrix
-    // 
-    VECTORMATH_FORCE_INLINE Matrix3 & operator +=( const Matrix3 & mat );
-
-    // Perform compound assignment and subtraction by a 3x3 matrix
-    // 
-    VECTORMATH_FORCE_INLINE Matrix3 & operator -=( const Matrix3 & mat );
-
-    // Perform compound assignment and multiplication by a scalar
-    // 
-    VECTORMATH_FORCE_INLINE Matrix3 & operator *=( float scalar );
-
-    // Perform compound assignment and multiplication by a scalar (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Matrix3 & operator *=( const floatInVec &scalar );
-
-    // Perform compound assignment and multiplication by a 3x3 matrix
-    // 
-    VECTORMATH_FORCE_INLINE Matrix3 & operator *=( const Matrix3 & mat );
-
-    // Construct an identity 3x3 matrix
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix3 identity( );
-
-    // Construct a 3x3 matrix to rotate around the x axis
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix3 rotationX( float radians );
-
-    // Construct a 3x3 matrix to rotate around the y axis
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix3 rotationY( float radians );
-
-    // Construct a 3x3 matrix to rotate around the z axis
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix3 rotationZ( float radians );
-
-    // Construct a 3x3 matrix to rotate around the x axis (scalar data contained in vector data type)
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix3 rotationX( const floatInVec &radians );
-
-    // Construct a 3x3 matrix to rotate around the y axis (scalar data contained in vector data type)
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix3 rotationY( const floatInVec &radians );
-
-    // Construct a 3x3 matrix to rotate around the z axis (scalar data contained in vector data type)
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix3 rotationZ( const floatInVec &radians );
-
-    // Construct a 3x3 matrix to rotate around the x, y, and z axes
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix3 rotationZYX( const Vector3 &radiansXYZ );
-
-    // Construct a 3x3 matrix to rotate around a unit-length 3-D vector
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix3 rotation( float radians, const Vector3 &unitVec );
-
-    // Construct a 3x3 matrix to rotate around a unit-length 3-D vector (scalar data contained in vector data type)
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix3 rotation( const floatInVec &radians, const Vector3 &unitVec );
-
-    // Construct a rotation matrix from a unit-length quaternion
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix3 rotation( const Quat &unitQuat );
-
-    // Construct a 3x3 matrix to perform scaling
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix3 scale( const Vector3 &scaleVec );
-
-};
-// Multiply a 3x3 matrix by a scalar
-// 
-VECTORMATH_FORCE_INLINE const Matrix3 operator *( float scalar, const Matrix3 & mat );
-
-// Multiply a 3x3 matrix by a scalar (scalar data contained in vector data type)
-// 
-VECTORMATH_FORCE_INLINE const Matrix3 operator *( const floatInVec &scalar, const Matrix3 & mat );
-
-// Append (post-multiply) a scale transformation to a 3x3 matrix
-// NOTE: 
-// Faster than creating and multiplying a scale transformation matrix.
-// 
-VECTORMATH_FORCE_INLINE const Matrix3 appendScale( const Matrix3 & mat, const Vector3 &scaleVec );
-
-// Prepend (pre-multiply) a scale transformation to a 3x3 matrix
-// NOTE: 
-// Faster than creating and multiplying a scale transformation matrix.
-// 
-VECTORMATH_FORCE_INLINE const Matrix3 prependScale( const Vector3 &scaleVec, const Matrix3 & mat );
-
-// Multiply two 3x3 matrices per element
-// 
-VECTORMATH_FORCE_INLINE const Matrix3 mulPerElem( const Matrix3 & mat0, const Matrix3 & mat1 );
-
-// Compute the absolute value of a 3x3 matrix per element
-// 
-VECTORMATH_FORCE_INLINE const Matrix3 absPerElem( const Matrix3 & mat );
-
-// Transpose of a 3x3 matrix
-// 
-VECTORMATH_FORCE_INLINE const Matrix3 transpose( const Matrix3 & mat );
-
-// Compute the inverse of a 3x3 matrix
-// NOTE: 
-// Result is unpredictable when the determinant of mat is equal to or near 0.
-// 
-VECTORMATH_FORCE_INLINE const Matrix3 inverse( const Matrix3 & mat );
-
-// Determinant of a 3x3 matrix
-// 
-VECTORMATH_FORCE_INLINE const floatInVec determinant( const Matrix3 & mat );
-
-// Conditionally select between two 3x3 matrices
-// NOTE: 
-// This function uses a conditional select instruction to avoid a branch.
-// However, the transfer of select1 to a VMX register may use more processing time than a branch.
-// Use the boolInVec version for better performance.
-// 
-VECTORMATH_FORCE_INLINE const Matrix3 select( const Matrix3 & mat0, const Matrix3 & mat1, bool select1 );
-
-// Conditionally select between two 3x3 matrices (scalar data contained in vector data type)
-// NOTE: 
-// This function uses a conditional select instruction to avoid a branch.
-// 
-VECTORMATH_FORCE_INLINE const Matrix3 select( const Matrix3 & mat0, const Matrix3 & mat1, const boolInVec &select1 );
-
-#ifdef _VECTORMATH_DEBUG
-
-// Print a 3x3 matrix
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-// 
-VECTORMATH_FORCE_INLINE void print( const Matrix3 & mat );
-
-// Print a 3x3 matrix and an associated string identifier
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-// 
-VECTORMATH_FORCE_INLINE void print( const Matrix3 & mat, const char * name );
-
-#endif
-
-// A 4x4 matrix in array-of-structures format
-//
-class Matrix4
-{
-    Vector4 mCol0;
-    Vector4 mCol1;
-    Vector4 mCol2;
-    Vector4 mCol3;
-
-public:
-    // Default constructor; does no initialization
-    // 
-    VECTORMATH_FORCE_INLINE Matrix4( ) { };
-
-    // Copy a 4x4 matrix
-    // 
-    VECTORMATH_FORCE_INLINE Matrix4( const Matrix4 & mat );
-
-    // Construct a 4x4 matrix containing the specified columns
-    // 
-    VECTORMATH_FORCE_INLINE Matrix4( const Vector4 &col0, const Vector4 &col1, const Vector4 &col2, const Vector4 &col3 );
-
-    // Construct a 4x4 matrix from a 3x4 transformation matrix
-    // 
-    explicit VECTORMATH_FORCE_INLINE Matrix4( const Transform3 & mat );
-
-    // Construct a 4x4 matrix from a 3x3 matrix and a 3-D vector
-    // 
-    VECTORMATH_FORCE_INLINE Matrix4( const Matrix3 & mat, const Vector3 &translateVec );
-
-    // Construct a 4x4 matrix from a unit-length quaternion and a 3-D vector
-    // 
-    VECTORMATH_FORCE_INLINE Matrix4( const Quat &unitQuat, const Vector3 &translateVec );
-
-    // Set all elements of a 4x4 matrix to the same scalar value
-    // 
-    explicit VECTORMATH_FORCE_INLINE Matrix4( float scalar );
-
-    // Set all elements of a 4x4 matrix to the same scalar value (scalar data contained in vector data type)
-    // 
-    explicit VECTORMATH_FORCE_INLINE Matrix4( const floatInVec &scalar );
-
-    // Assign one 4x4 matrix to another
-    // 
-    VECTORMATH_FORCE_INLINE Matrix4 & operator =( const Matrix4 & mat );
-
-    // Set the upper-left 3x3 submatrix
-    // NOTE: 
-    // This function does not change the bottom row elements.
-    // 
-    VECTORMATH_FORCE_INLINE Matrix4 & setUpper3x3( const Matrix3 & mat3 );
-
-    // Get the upper-left 3x3 submatrix of a 4x4 matrix
-    // 
-    VECTORMATH_FORCE_INLINE const Matrix3 getUpper3x3( ) const;
-
-    // Set translation component
-    // NOTE: 
-    // This function does not change the bottom row elements.
-    // 
-    VECTORMATH_FORCE_INLINE Matrix4 & setTranslation( const Vector3 &translateVec );
-
-    // Get the translation component of a 4x4 matrix
-    // 
-    VECTORMATH_FORCE_INLINE const Vector3 getTranslation( ) const;
-
-    // Set column 0 of a 4x4 matrix
-    // 
-    VECTORMATH_FORCE_INLINE Matrix4 & setCol0( const Vector4 &col0 );
-
-    // Set column 1 of a 4x4 matrix
-    // 
-    VECTORMATH_FORCE_INLINE Matrix4 & setCol1( const Vector4 &col1 );
-
-    // Set column 2 of a 4x4 matrix
-    // 
-    VECTORMATH_FORCE_INLINE Matrix4 & setCol2( const Vector4 &col2 );
-
-    // Set column 3 of a 4x4 matrix
-    // 
-    VECTORMATH_FORCE_INLINE Matrix4 & setCol3( const Vector4 &col3 );
-
-    // Get column 0 of a 4x4 matrix
-    // 
-    VECTORMATH_FORCE_INLINE const Vector4 getCol0( ) const;
-
-    // Get column 1 of a 4x4 matrix
-    // 
-    VECTORMATH_FORCE_INLINE const Vector4 getCol1( ) const;
-
-    // Get column 2 of a 4x4 matrix
-    // 
-    VECTORMATH_FORCE_INLINE const Vector4 getCol2( ) const;
-
-    // Get column 3 of a 4x4 matrix
-    // 
-    VECTORMATH_FORCE_INLINE const Vector4 getCol3( ) const;
-
-    // Set the column of a 4x4 matrix referred to by the specified index
-    // 
-    VECTORMATH_FORCE_INLINE Matrix4 & setCol( int col, const Vector4 &vec );
-
-    // Set the row of a 4x4 matrix referred to by the specified index
-    // 
-    VECTORMATH_FORCE_INLINE Matrix4 & setRow( int row, const Vector4 &vec );
-
-    // Get the column of a 4x4 matrix referred to by the specified index
-    // 
-    VECTORMATH_FORCE_INLINE const Vector4 getCol( int col ) const;
-
-    // Get the row of a 4x4 matrix referred to by the specified index
-    // 
-    VECTORMATH_FORCE_INLINE const Vector4 getRow( int row ) const;
-
-    // Subscripting operator to set or get a column
-    // 
-    VECTORMATH_FORCE_INLINE Vector4 & operator []( int col );
-
-    // Subscripting operator to get a column
-    // 
-    VECTORMATH_FORCE_INLINE const Vector4 operator []( int col ) const;
-
-    // Set the element of a 4x4 matrix referred to by column and row indices
-    // 
-    VECTORMATH_FORCE_INLINE Matrix4 & setElem( int col, int row, float val );
-
-    // Set the element of a 4x4 matrix referred to by column and row indices (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Matrix4 & setElem( int col, int row, const floatInVec &val );
-
-    // Get the element of a 4x4 matrix referred to by column and row indices
-    // 
-    VECTORMATH_FORCE_INLINE const floatInVec getElem( int col, int row ) const;
-
-    // Add two 4x4 matrices
-    // 
-    VECTORMATH_FORCE_INLINE const Matrix4 operator +( const Matrix4 & mat ) const;
-
-    // Subtract a 4x4 matrix from another 4x4 matrix
-    // 
-    VECTORMATH_FORCE_INLINE const Matrix4 operator -( const Matrix4 & mat ) const;
-
-    // Negate all elements of a 4x4 matrix
-    // 
-    VECTORMATH_FORCE_INLINE const Matrix4 operator -( ) const;
-
-    // Multiply a 4x4 matrix by a scalar
-    // 
-    VECTORMATH_FORCE_INLINE const Matrix4 operator *( float scalar ) const;
-
-    // Multiply a 4x4 matrix by a scalar (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE const Matrix4 operator *( const floatInVec &scalar ) const;
-
-    // Multiply a 4x4 matrix by a 4-D vector
-    // 
-    VECTORMATH_FORCE_INLINE const Vector4 operator *( const Vector4 &vec ) const;
-
-    // Multiply a 4x4 matrix by a 3-D vector
-    // 
-    VECTORMATH_FORCE_INLINE const Vector4 operator *( const Vector3 &vec ) const;
-
-    // Multiply a 4x4 matrix by a 3-D point
-    // 
-    VECTORMATH_FORCE_INLINE const Vector4 operator *( const Point3 &pnt ) const;
-
-    // Multiply two 4x4 matrices
-    // 
-    VECTORMATH_FORCE_INLINE const Matrix4 operator *( const Matrix4 & mat ) const;
-
-    // Multiply a 4x4 matrix by a 3x4 transformation matrix
-    // 
-    VECTORMATH_FORCE_INLINE const Matrix4 operator *( const Transform3 & tfrm ) const;
-
-    // Perform compound assignment and addition with a 4x4 matrix
-    // 
-    VECTORMATH_FORCE_INLINE Matrix4 & operator +=( const Matrix4 & mat );
-
-    // Perform compound assignment and subtraction by a 4x4 matrix
-    // 
-    VECTORMATH_FORCE_INLINE Matrix4 & operator -=( const Matrix4 & mat );
-
-    // Perform compound assignment and multiplication by a scalar
-    // 
-    VECTORMATH_FORCE_INLINE Matrix4 & operator *=( float scalar );
-
-    // Perform compound assignment and multiplication by a scalar (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Matrix4 & operator *=( const floatInVec &scalar );
-
-    // Perform compound assignment and multiplication by a 4x4 matrix
-    // 
-    VECTORMATH_FORCE_INLINE Matrix4 & operator *=( const Matrix4 & mat );
-
-    // Perform compound assignment and multiplication by a 3x4 transformation matrix
-    // 
-    VECTORMATH_FORCE_INLINE Matrix4 & operator *=( const Transform3 & tfrm );
-
-    // Construct an identity 4x4 matrix
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix4 identity( );
-
-    // Construct a 4x4 matrix to rotate around the x axis
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix4 rotationX( float radians );
-
-    // Construct a 4x4 matrix to rotate around the y axis
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix4 rotationY( float radians );
-
-    // Construct a 4x4 matrix to rotate around the z axis
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix4 rotationZ( float radians );
-
-    // Construct a 4x4 matrix to rotate around the x axis (scalar data contained in vector data type)
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix4 rotationX( const floatInVec &radians );
-
-    // Construct a 4x4 matrix to rotate around the y axis (scalar data contained in vector data type)
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix4 rotationY( const floatInVec &radians );
-
-    // Construct a 4x4 matrix to rotate around the z axis (scalar data contained in vector data type)
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix4 rotationZ( const floatInVec &radians );
-
-    // Construct a 4x4 matrix to rotate around the x, y, and z axes
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix4 rotationZYX( const Vector3 &radiansXYZ );
-
-    // Construct a 4x4 matrix to rotate around a unit-length 3-D vector
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix4 rotation( float radians, const Vector3 &unitVec );
-
-    // Construct a 4x4 matrix to rotate around a unit-length 3-D vector (scalar data contained in vector data type)
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix4 rotation( const floatInVec &radians, const Vector3 &unitVec );
-
-    // Construct a rotation matrix from a unit-length quaternion
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix4 rotation( const Quat &unitQuat );
-
-    // Construct a 4x4 matrix to perform scaling
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix4 scale( const Vector3 &scaleVec );
-
-    // Construct a 4x4 matrix to perform translation
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix4 translation( const Vector3 &translateVec );
-
-    // Construct viewing matrix based on eye, position looked at, and up direction
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix4 lookAt( const Point3 &eyePos, const Point3 &lookAtPos, const Vector3 &upVec );
-
-    // Construct a perspective projection matrix
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix4 perspective( float fovyRadians, float aspect, float zNear, float zFar );
-
-    // Construct a perspective projection matrix based on frustum
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix4 frustum( float left, float right, float bottom, float top, float zNear, float zFar );
-
-    // Construct an orthographic projection matrix
-    // 
-    static VECTORMATH_FORCE_INLINE const Matrix4 orthographic( float left, float right, float bottom, float top, float zNear, float zFar );
-
-};
-// Multiply a 4x4 matrix by a scalar
-// 
-VECTORMATH_FORCE_INLINE const Matrix4 operator *( float scalar, const Matrix4 & mat );
-
-// Multiply a 4x4 matrix by a scalar (scalar data contained in vector data type)
-// 
-VECTORMATH_FORCE_INLINE const Matrix4 operator *( const floatInVec &scalar, const Matrix4 & mat );
-
-// Append (post-multiply) a scale transformation to a 4x4 matrix
-// NOTE: 
-// Faster than creating and multiplying a scale transformation matrix.
-// 
-VECTORMATH_FORCE_INLINE const Matrix4 appendScale( const Matrix4 & mat, const Vector3 &scaleVec );
-
-// Prepend (pre-multiply) a scale transformation to a 4x4 matrix
-// NOTE: 
-// Faster than creating and multiplying a scale transformation matrix.
-// 
-VECTORMATH_FORCE_INLINE const Matrix4 prependScale( const Vector3 &scaleVec, const Matrix4 & mat );
-
-// Multiply two 4x4 matrices per element
-// 
-VECTORMATH_FORCE_INLINE const Matrix4 mulPerElem( const Matrix4 & mat0, const Matrix4 & mat1 );
-
-// Compute the absolute value of a 4x4 matrix per element
-// 
-VECTORMATH_FORCE_INLINE const Matrix4 absPerElem( const Matrix4 & mat );
-
-// Transpose of a 4x4 matrix
-// 
-VECTORMATH_FORCE_INLINE const Matrix4 transpose( const Matrix4 & mat );
-
-// Compute the inverse of a 4x4 matrix
-// NOTE: 
-// Result is unpredictable when the determinant of mat is equal to or near 0.
-// 
-VECTORMATH_FORCE_INLINE const Matrix4 inverse( const Matrix4 & mat );
-
-// Compute the inverse of a 4x4 matrix, which is expected to be an affine matrix
-// NOTE: 
-// This can be used to achieve better performance than a general inverse when the specified 4x4 matrix meets the given restrictions.  The result is unpredictable when the determinant of mat is equal to or near 0.
-// 
-VECTORMATH_FORCE_INLINE const Matrix4 affineInverse( const Matrix4 & mat );
-
-// Compute the inverse of a 4x4 matrix, which is expected to be an affine matrix with an orthogonal upper-left 3x3 submatrix
-// NOTE: 
-// This can be used to achieve better performance than a general inverse when the specified 4x4 matrix meets the given restrictions.
-// 
-VECTORMATH_FORCE_INLINE const Matrix4 orthoInverse( const Matrix4 & mat );
-
-// Determinant of a 4x4 matrix
-// 
-VECTORMATH_FORCE_INLINE const floatInVec determinant( const Matrix4 & mat );
-
-// Conditionally select between two 4x4 matrices
-// NOTE: 
-// This function uses a conditional select instruction to avoid a branch.
-// However, the transfer of select1 to a VMX register may use more processing time than a branch.
-// Use the boolInVec version for better performance.
-// 
-VECTORMATH_FORCE_INLINE const Matrix4 select( const Matrix4 & mat0, const Matrix4 & mat1, bool select1 );
-
-// Conditionally select between two 4x4 matrices (scalar data contained in vector data type)
-// NOTE: 
-// This function uses a conditional select instruction to avoid a branch.
-// 
-VECTORMATH_FORCE_INLINE const Matrix4 select( const Matrix4 & mat0, const Matrix4 & mat1, const boolInVec &select1 );
-
-#ifdef _VECTORMATH_DEBUG
-
-// Print a 4x4 matrix
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-// 
-VECTORMATH_FORCE_INLINE void print( const Matrix4 & mat );
-
-// Print a 4x4 matrix and an associated string identifier
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-// 
-VECTORMATH_FORCE_INLINE void print( const Matrix4 & mat, const char * name );
-
-#endif
-
-// A 3x4 transformation matrix in array-of-structures format
-//
-class Transform3
-{
-    Vector3 mCol0;
-    Vector3 mCol1;
-    Vector3 mCol2;
-    Vector3 mCol3;
-
-public:
-    // Default constructor; does no initialization
-    // 
-    VECTORMATH_FORCE_INLINE Transform3( ) { };
-
-    // Copy a 3x4 transformation matrix
-    // 
-    VECTORMATH_FORCE_INLINE Transform3( const Transform3 & tfrm );
-
-    // Construct a 3x4 transformation matrix containing the specified columns
-    // 
-    VECTORMATH_FORCE_INLINE Transform3( const Vector3 &col0, const Vector3 &col1, const Vector3 &col2, const Vector3 &col3 );
-
-    // Construct a 3x4 transformation matrix from a 3x3 matrix and a 3-D vector
-    // 
-    VECTORMATH_FORCE_INLINE Transform3( const Matrix3 & tfrm, const Vector3 &translateVec );
-
-    // Construct a 3x4 transformation matrix from a unit-length quaternion and a 3-D vector
-    // 
-    VECTORMATH_FORCE_INLINE Transform3( const Quat &unitQuat, const Vector3 &translateVec );
-
-    // Set all elements of a 3x4 transformation matrix to the same scalar value
-    // 
-    explicit VECTORMATH_FORCE_INLINE Transform3( float scalar );
-
-    // Set all elements of a 3x4 transformation matrix to the same scalar value (scalar data contained in vector data type)
-    // 
-    explicit VECTORMATH_FORCE_INLINE Transform3( const floatInVec &scalar );
-
-    // Assign one 3x4 transformation matrix to another
-    // 
-    VECTORMATH_FORCE_INLINE Transform3 & operator =( const Transform3 & tfrm );
-
-    // Set the upper-left 3x3 submatrix
-    // 
-    VECTORMATH_FORCE_INLINE Transform3 & setUpper3x3( const Matrix3 & mat3 );
-
-    // Get the upper-left 3x3 submatrix of a 3x4 transformation matrix
-    // 
-    VECTORMATH_FORCE_INLINE const Matrix3 getUpper3x3( ) const;
-
-    // Set translation component
-    // 
-    VECTORMATH_FORCE_INLINE Transform3 & setTranslation( const Vector3 &translateVec );
-
-    // Get the translation component of a 3x4 transformation matrix
-    // 
-    VECTORMATH_FORCE_INLINE const Vector3 getTranslation( ) const;
-
-    // Set column 0 of a 3x4 transformation matrix
-    // 
-    VECTORMATH_FORCE_INLINE Transform3 & setCol0( const Vector3 &col0 );
-
-    // Set column 1 of a 3x4 transformation matrix
-    // 
-    VECTORMATH_FORCE_INLINE Transform3 & setCol1( const Vector3 &col1 );
-
-    // Set column 2 of a 3x4 transformation matrix
-    // 
-    VECTORMATH_FORCE_INLINE Transform3 & setCol2( const Vector3 &col2 );
-
-    // Set column 3 of a 3x4 transformation matrix
-    // 
-    VECTORMATH_FORCE_INLINE Transform3 & setCol3( const Vector3 &col3 );
-
-    // Get column 0 of a 3x4 transformation matrix
-    // 
-    VECTORMATH_FORCE_INLINE const Vector3 getCol0( ) const;
-
-    // Get column 1 of a 3x4 transformation matrix
-    // 
-    VECTORMATH_FORCE_INLINE const Vector3 getCol1( ) const;
-
-    // Get column 2 of a 3x4 transformation matrix
-    // 
-    VECTORMATH_FORCE_INLINE const Vector3 getCol2( ) const;
-
-    // Get column 3 of a 3x4 transformation matrix
-    // 
-    VECTORMATH_FORCE_INLINE const Vector3 getCol3( ) const;
-
-    // Set the column of a 3x4 transformation matrix referred to by the specified index
-    // 
-    VECTORMATH_FORCE_INLINE Transform3 & setCol( int col, const Vector3 &vec );
-
-    // Set the row of a 3x4 transformation matrix referred to by the specified index
-    // 
-    VECTORMATH_FORCE_INLINE Transform3 & setRow( int row, const Vector4 &vec );
-
-    // Get the column of a 3x4 transformation matrix referred to by the specified index
-    // 
-    VECTORMATH_FORCE_INLINE const Vector3 getCol( int col ) const;
-
-    // Get the row of a 3x4 transformation matrix referred to by the specified index
-    // 
-    VECTORMATH_FORCE_INLINE const Vector4 getRow( int row ) const;
-
-    // Subscripting operator to set or get a column
-    // 
-    VECTORMATH_FORCE_INLINE Vector3 & operator []( int col );
-
-    // Subscripting operator to get a column
-    // 
-    VECTORMATH_FORCE_INLINE const Vector3 operator []( int col ) const;
-
-    // Set the element of a 3x4 transformation matrix referred to by column and row indices
-    // 
-    VECTORMATH_FORCE_INLINE Transform3 & setElem( int col, int row, float val );
-
-    // Set the element of a 3x4 transformation matrix referred to by column and row indices (scalar data contained in vector data type)
-    // 
-    VECTORMATH_FORCE_INLINE Transform3 & setElem( int col, int row, const floatInVec &val );
-
-    // Get the element of a 3x4 transformation matrix referred to by column and row indices
-    // 
-    VECTORMATH_FORCE_INLINE const floatInVec getElem( int col, int row ) const;
-
-    // Multiply a 3x4 transformation matrix by a 3-D vector
-    // 
-    VECTORMATH_FORCE_INLINE const Vector3 operator *( const Vector3 &vec ) const;
-
-    // Multiply a 3x4 transformation matrix by a 3-D point
-    // 
-    VECTORMATH_FORCE_INLINE const Point3 operator *( const Point3 &pnt ) const;
-
-    // Multiply two 3x4 transformation matrices
-    // 
-    VECTORMATH_FORCE_INLINE const Transform3 operator *( const Transform3 & tfrm ) const;
-
-    // Perform compound assignment and multiplication by a 3x4 transformation matrix
-    // 
-    VECTORMATH_FORCE_INLINE Transform3 & operator *=( const Transform3 & tfrm );
-
-    // Construct an identity 3x4 transformation matrix
-    // 
-    static VECTORMATH_FORCE_INLINE const Transform3 identity( );
-
-    // Construct a 3x4 transformation matrix to rotate around the x axis
-    // 
-    static VECTORMATH_FORCE_INLINE const Transform3 rotationX( float radians );
-
-    // Construct a 3x4 transformation matrix to rotate around the y axis
-    // 
-    static VECTORMATH_FORCE_INLINE const Transform3 rotationY( float radians );
-
-    // Construct a 3x4 transformation matrix to rotate around the z axis
-    // 
-    static VECTORMATH_FORCE_INLINE const Transform3 rotationZ( float radians );
-
-    // Construct a 3x4 transformation matrix to rotate around the x axis (scalar data contained in vector data type)
-    // 
-    static VECTORMATH_FORCE_INLINE const Transform3 rotationX( const floatInVec &radians );
-
-    // Construct a 3x4 transformation matrix to rotate around the y axis (scalar data contained in vector data type)
-    // 
-    static VECTORMATH_FORCE_INLINE const Transform3 rotationY( const floatInVec &radians );
-
-    // Construct a 3x4 transformation matrix to rotate around the z axis (scalar data contained in vector data type)
-    // 
-    static VECTORMATH_FORCE_INLINE const Transform3 rotationZ( const floatInVec &radians );
-
-    // Construct a 3x4 transformation matrix to rotate around the x, y, and z axes
-    // 
-    static VECTORMATH_FORCE_INLINE const Transform3 rotationZYX( const Vector3 &radiansXYZ );
-
-    // Construct a 3x4 transformation matrix to rotate around a unit-length 3-D vector
-    // 
-    static VECTORMATH_FORCE_INLINE const Transform3 rotation( float radians, const Vector3 &unitVec );
-
-    // Construct a 3x4 transformation matrix to rotate around a unit-length 3-D vector (scalar data contained in vector data type)
-    // 
-    static VECTORMATH_FORCE_INLINE const Transform3 rotation( const floatInVec &radians, const Vector3 &unitVec );
-
-    // Construct a rotation matrix from a unit-length quaternion
-    // 
-    static VECTORMATH_FORCE_INLINE const Transform3 rotation( const Quat &unitQuat );
-
-    // Construct a 3x4 transformation matrix to perform scaling
-    // 
-    static VECTORMATH_FORCE_INLINE const Transform3 scale( const Vector3 &scaleVec );
-
-    // Construct a 3x4 transformation matrix to perform translation
-    // 
-    static VECTORMATH_FORCE_INLINE const Transform3 translation( const Vector3 &translateVec );
-
-};
-// Append (post-multiply) a scale transformation to a 3x4 transformation matrix
-// NOTE: 
-// Faster than creating and multiplying a scale transformation matrix.
-// 
-VECTORMATH_FORCE_INLINE const Transform3 appendScale( const Transform3 & tfrm, const Vector3 &scaleVec );
-
-// Prepend (pre-multiply) a scale transformation to a 3x4 transformation matrix
-// NOTE: 
-// Faster than creating and multiplying a scale transformation matrix.
-// 
-VECTORMATH_FORCE_INLINE const Transform3 prependScale( const Vector3 &scaleVec, const Transform3 & tfrm );
-
-// Multiply two 3x4 transformation matrices per element
-// 
-VECTORMATH_FORCE_INLINE const Transform3 mulPerElem( const Transform3 & tfrm0, const Transform3 & tfrm1 );
-
-// Compute the absolute value of a 3x4 transformation matrix per element
-// 
-VECTORMATH_FORCE_INLINE const Transform3 absPerElem( const Transform3 & tfrm );
-
-// Inverse of a 3x4 transformation matrix
-// NOTE: 
-// Result is unpredictable when the determinant of the left 3x3 submatrix is equal to or near 0.
-// 
-VECTORMATH_FORCE_INLINE const Transform3 inverse( const Transform3 & tfrm );
-
-// Compute the inverse of a 3x4 transformation matrix, expected to have an orthogonal upper-left 3x3 submatrix
-// NOTE: 
-// This can be used to achieve better performance than a general inverse when the specified 3x4 transformation matrix meets the given restrictions.
-// 
-VECTORMATH_FORCE_INLINE const Transform3 orthoInverse( const Transform3 & tfrm );
-
-// Conditionally select between two 3x4 transformation matrices
-// NOTE: 
-// This function uses a conditional select instruction to avoid a branch.
-// However, the transfer of select1 to a VMX register may use more processing time than a branch.
-// Use the boolInVec version for better performance.
-// 
-VECTORMATH_FORCE_INLINE const Transform3 select( const Transform3 & tfrm0, const Transform3 & tfrm1, bool select1 );
-
-// Conditionally select between two 3x4 transformation matrices (scalar data contained in vector data type)
-// NOTE: 
-// This function uses a conditional select instruction to avoid a branch.
-// 
-VECTORMATH_FORCE_INLINE const Transform3 select( const Transform3 & tfrm0, const Transform3 & tfrm1, const boolInVec &select1 );
-
-#ifdef _VECTORMATH_DEBUG
-
-// Print a 3x4 transformation matrix
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-// 
-VECTORMATH_FORCE_INLINE void print( const Transform3 & tfrm );
-
-// Print a 3x4 transformation matrix and an associated string identifier
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-// 
-VECTORMATH_FORCE_INLINE void print( const Transform3 & tfrm, const char * name );
-
-#endif
-
-} // namespace Aos
-} // namespace Vectormath
-
-#include "vec_aos.h"
-#include "quat_aos.h"
-#include "mat_aos.h"
-
-#endif
+/*
+   Copyright (C) 2006, 2007 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+   Redistribution and use in source and binary forms,
+   with or without modification, are permitted provided that the
+   following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the Sony Computer Entertainment Inc nor the names
+      of its contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
+*/
+
+
+#ifndef _VECTORMATH_AOS_CPP_SSE_H
+#define _VECTORMATH_AOS_CPP_SSE_H
+
+#include <math.h>
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <assert.h>
+
+#define Vector3Ref Vector3&
+#define QuatRef	Quat&
+#define Matrix3Ref Matrix3&
+
+#if (defined (_WIN32) && (_MSC_VER) && _MSC_VER >= 1400)
+	#define USE_SSE3_LDDQU
+
+	#define VM_ATTRIBUTE_ALIGNED_CLASS16(a) __declspec(align(16)) a
+	#define VM_ATTRIBUTE_ALIGN16 __declspec(align(16))
+	#define VECTORMATH_FORCE_INLINE __forceinline 
+#else
+	#define VM_ATTRIBUTE_ALIGNED_CLASS16(a) a __attribute__ ((aligned (16)))	
+	#define VM_ATTRIBUTE_ALIGN16 __attribute__ ((aligned (16)))	
+	#define VECTORMATH_FORCE_INLINE inline __attribute__ ((always_inline))
+	#ifdef __SSE3__
+		#define USE_SSE3_LDDQU
+	#endif //__SSE3__
+#endif//_WIN32
+
+
+#ifdef USE_SSE3_LDDQU
+#include <pmmintrin.h>//_mm_lddqu_si128
+#endif //USE_SSE3_LDDQU
+
+
+// TODO: Tidy
+typedef __m128 vec_float4;
+typedef __m128 vec_uint4;
+typedef __m128 vec_int4;
+typedef __m128i vec_uchar16;
+typedef __m128i vec_ushort8;
+
+#define vec_splat(x, e) _mm_shuffle_ps(x, x, _MM_SHUFFLE(e,e,e,e))
+
+#define _mm_ror_ps(vec,i)	\
+	(((i)%4) ? (_mm_shuffle_ps(vec,vec, _MM_SHUFFLE((unsigned char)(i+3)%4,(unsigned char)(i+2)%4,(unsigned char)(i+1)%4,(unsigned char)(i+0)%4))) : (vec))
+#define _mm_rol_ps(vec,i)	\
+	(((i)%4) ? (_mm_shuffle_ps(vec,vec, _MM_SHUFFLE((unsigned char)(7-i)%4,(unsigned char)(6-i)%4,(unsigned char)(5-i)%4,(unsigned char)(4-i)%4))) : (vec))
+
+#define vec_sld(vec,vec2,x) _mm_ror_ps(vec, ((x)/4))
+
+#define _mm_abs_ps(vec)		_mm_andnot_ps(_MASKSIGN_,vec)
+#define _mm_neg_ps(vec)		_mm_xor_ps(_MASKSIGN_,vec)
+
+#define vec_madd(a, b, c) _mm_add_ps(c, _mm_mul_ps(a, b) )
+
+union SSEFloat
+{
+	__m128i vi;
+	__m128 m128;
+	__m128 vf;
+	unsigned int	ui[4];
+	unsigned short s[8];
+	float f[4];
+	SSEFloat(__m128 v) : m128(v) {}
+    SSEFloat(__m128i v) : vi(v) {}
+	SSEFloat() {}//uninitialized
+};
+
+static VECTORMATH_FORCE_INLINE __m128 vec_sel(__m128 a, __m128 b, __m128 mask)
+{
+	return _mm_or_ps(_mm_and_ps(mask, b), _mm_andnot_ps(mask, a));
+}
+static VECTORMATH_FORCE_INLINE __m128 vec_sel(__m128 a, __m128 b, const unsigned int *_mask)
+{
+	return vec_sel(a, b, _mm_load_ps((float *)_mask));
+}
+static VECTORMATH_FORCE_INLINE __m128 vec_sel(__m128 a, __m128 b, unsigned int _mask)
+{
+	return vec_sel(a, b, _mm_set1_ps(*(float *)&_mask));
+}
+
+static VECTORMATH_FORCE_INLINE __m128 toM128(unsigned int x)
+{
+    return _mm_set1_ps( *(float *)&x );
+}
+
+static VECTORMATH_FORCE_INLINE __m128 fabsf4(__m128 x)
+{
+    return _mm_and_ps( x, toM128( 0x7fffffff ) );
+}
+/*
+union SSE64
+{
+	__m128 m128;
+	struct
+	{
+		__m64 m01;
+		__m64 m23;
+	} m64;
+};
+
+static VECTORMATH_FORCE_INLINE __m128 vec_cts(__m128 x, int a)
+{
+	assert(a == 0); // Only 2^0 supported
+	(void)a;
+	SSE64 sse64;
+	sse64.m64.m01 = _mm_cvttps_pi32(x);
+	sse64.m64.m23 = _mm_cvttps_pi32(_mm_ror_ps(x,2));
+	_mm_empty();
+    return sse64.m128;
+}
+
+static VECTORMATH_FORCE_INLINE __m128 vec_ctf(__m128 x, int a)
+{
+	assert(a == 0); // Only 2^0 supported
+	(void)a;
+	SSE64 sse64;
+	sse64.m128 = x;
+	__m128 result =_mm_movelh_ps(
+		_mm_cvt_pi2ps(_mm_setzero_ps(), sse64.m64.m01),
+		_mm_cvt_pi2ps(_mm_setzero_ps(), sse64.m64.m23));
+	_mm_empty();
+	return result;
+}
+*/
+static VECTORMATH_FORCE_INLINE __m128 vec_cts(__m128 x, int a)
+{
+	assert(a == 0); // Only 2^0 supported
+	(void)a;
+	__m128i result = _mm_cvtps_epi32(x);
+    return (__m128 &)result;
+}
+
+static VECTORMATH_FORCE_INLINE __m128 vec_ctf(__m128 x, int a)
+{
+	assert(a == 0); // Only 2^0 supported
+	(void)a;
+	return _mm_cvtepi32_ps((__m128i &)x);
+}
+
+#define vec_nmsub(a,b,c) _mm_sub_ps( c, _mm_mul_ps( a, b ) )
+#define vec_sub(a,b) _mm_sub_ps( a, b )
+#define vec_add(a,b) _mm_add_ps( a, b )
+#define vec_mul(a,b) _mm_mul_ps( a, b )
+#define vec_xor(a,b) _mm_xor_ps( a, b )
+#define vec_and(a,b) _mm_and_ps( a, b )
+#define vec_cmpeq(a,b) _mm_cmpeq_ps( a, b )
+#define vec_cmpgt(a,b) _mm_cmpgt_ps( a, b )
+
+#define vec_mergeh(a,b) _mm_unpacklo_ps( a, b )
+#define vec_mergel(a,b) _mm_unpackhi_ps( a, b )
+
+#define vec_andc(a,b) _mm_andnot_ps( b, a )
+
+#define sqrtf4(x) _mm_sqrt_ps( x )
+#define rsqrtf4(x) _mm_rsqrt_ps( x )
+#define recipf4(x) _mm_rcp_ps( x )
+#define negatef4(x) _mm_sub_ps( _mm_setzero_ps(), x )
+
+static VECTORMATH_FORCE_INLINE __m128 newtonrapson_rsqrt4( const __m128 v )
+{   
+#define _half4 _mm_setr_ps(.5f,.5f,.5f,.5f) 
+#define _three _mm_setr_ps(3.f,3.f,3.f,3.f)
+const __m128 approx = _mm_rsqrt_ps( v );   
+const __m128 muls = _mm_mul_ps(_mm_mul_ps(v, approx), approx);   
+return _mm_mul_ps(_mm_mul_ps(_half4, approx), _mm_sub_ps(_three, muls) );
+}
+
+static VECTORMATH_FORCE_INLINE __m128 acosf4(__m128 x)
+{
+    __m128 xabs = fabsf4(x);
+	__m128 select = _mm_cmplt_ps( x, _mm_setzero_ps() );
+    __m128 t1 = sqrtf4(vec_sub(_mm_set1_ps(1.0f), xabs));
+    
+    /* Instruction counts can be reduced if the polynomial was
+     * computed entirely from nested (dependent) fma's. However, 
+     * to reduce the number of pipeline stalls, the polygon is evaluated 
+     * in two halves (hi amd lo). 
+     */
+    __m128 xabs2 = _mm_mul_ps(xabs,  xabs);
+    __m128 xabs4 = _mm_mul_ps(xabs2, xabs2);
+    __m128 hi = vec_madd(vec_madd(vec_madd(_mm_set1_ps(-0.0012624911f),
+		xabs, _mm_set1_ps(0.0066700901f)),
+			xabs, _mm_set1_ps(-0.0170881256f)),
+				xabs, _mm_set1_ps( 0.0308918810f));
+    __m128 lo = vec_madd(vec_madd(vec_madd(_mm_set1_ps(-0.0501743046f),
+		xabs, _mm_set1_ps(0.0889789874f)),
+			xabs, _mm_set1_ps(-0.2145988016f)),
+				xabs, _mm_set1_ps( 1.5707963050f));
+    
+    __m128 result = vec_madd(hi, xabs4, lo);
+    
+    // Adjust the result if x is negactive.
+    return vec_sel(
+		vec_mul(t1, result),									// Positive
+		vec_nmsub(t1, result, _mm_set1_ps(3.1415926535898f)),	// Negative
+		select);
+}
+
+static VECTORMATH_FORCE_INLINE __m128 sinf4(vec_float4 x)
+{
+
+//
+// Common constants used to evaluate sinf4/cosf4/tanf4
+//
+#define _SINCOS_CC0  -0.0013602249f
+#define _SINCOS_CC1   0.0416566950f
+#define _SINCOS_CC2  -0.4999990225f
+#define _SINCOS_SC0  -0.0001950727f
+#define _SINCOS_SC1   0.0083320758f
+#define _SINCOS_SC2  -0.1666665247f
+
+#define _SINCOS_KC1  1.57079625129f
+#define _SINCOS_KC2  7.54978995489e-8f
+
+    vec_float4 xl,xl2,xl3,res;
+
+    // Range reduction using : xl = angle * TwoOverPi;
+    //  
+    xl = vec_mul(x, _mm_set1_ps(0.63661977236f));
+
+    // Find the quadrant the angle falls in
+    // using:  q = (int) (ceil(abs(xl))*sign(xl))
+    //
+    vec_int4 q = vec_cts(xl,0);
+
+    // Compute an offset based on the quadrant that the angle falls in
+    // 
+    vec_int4 offset = _mm_and_ps(q,toM128(0x3));
+
+    // Remainder in range [-pi/4..pi/4]
+    //
+    vec_float4 qf = vec_ctf(q,0);
+    xl  = vec_nmsub(qf,_mm_set1_ps(_SINCOS_KC2),vec_nmsub(qf,_mm_set1_ps(_SINCOS_KC1),x));
+    
+    // Compute x^2 and x^3
+    //
+    xl2 = vec_mul(xl,xl);
+    xl3 = vec_mul(xl2,xl);
+    
+    // Compute both the sin and cos of the angles
+    // using a polynomial expression:
+    //   cx = 1.0f + xl2 * ((C0 * xl2 + C1) * xl2 + C2), and
+    //   sx = xl + xl3 * ((S0 * xl2 + S1) * xl2 + S2)
+    //
+    
+    vec_float4 cx =
+		vec_madd(
+			vec_madd(
+				vec_madd(_mm_set1_ps(_SINCOS_CC0),xl2,_mm_set1_ps(_SINCOS_CC1)),xl2,_mm_set1_ps(_SINCOS_CC2)),xl2,_mm_set1_ps(1.0f));
+    vec_float4 sx =
+		vec_madd(
+			vec_madd(
+				vec_madd(_mm_set1_ps(_SINCOS_SC0),xl2,_mm_set1_ps(_SINCOS_SC1)),xl2,_mm_set1_ps(_SINCOS_SC2)),xl3,xl);
+
+    // Use the cosine when the offset is odd and the sin
+    // when the offset is even
+    //
+    res = vec_sel(cx,sx,vec_cmpeq(vec_and(offset,
+                                          toM128(0x1)),
+										  _mm_setzero_ps()));
+
+    // Flip the sign of the result when (offset mod 4) = 1 or 2
+    //
+    return vec_sel(
+		vec_xor(toM128(0x80000000U), res),	// Negative
+		res,								// Positive
+		vec_cmpeq(vec_and(offset,toM128(0x2)),_mm_setzero_ps()));
+}
+
+static VECTORMATH_FORCE_INLINE void sincosf4(vec_float4 x, vec_float4* s, vec_float4* c)
+{
+    vec_float4 xl,xl2,xl3;
+    vec_int4   offsetSin, offsetCos;
+
+    // Range reduction using : xl = angle * TwoOverPi;
+    //  
+    xl = vec_mul(x, _mm_set1_ps(0.63661977236f));
+
+    // Find the quadrant the angle falls in
+    // using:  q = (int) (ceil(abs(xl))*sign(xl))
+    //
+    //vec_int4 q = vec_cts(vec_add(xl,vec_sel(_mm_set1_ps(0.5f),xl,(0x80000000))),0);
+    vec_int4 q = vec_cts(xl,0);
+     
+    // Compute the offset based on the quadrant that the angle falls in.
+    // Add 1 to the offset for the cosine. 
+    //
+    offsetSin = vec_and(q,toM128((int)0x3));
+	__m128i temp = _mm_add_epi32(_mm_set1_epi32(1),(__m128i &)offsetSin);
+	offsetCos = (__m128 &)temp;
+
+    // Remainder in range [-pi/4..pi/4]
+    //
+    vec_float4 qf = vec_ctf(q,0);
+    xl  = vec_nmsub(qf,_mm_set1_ps(_SINCOS_KC2),vec_nmsub(qf,_mm_set1_ps(_SINCOS_KC1),x));
+    
+    // Compute x^2 and x^3
+    //
+    xl2 = vec_mul(xl,xl);
+    xl3 = vec_mul(xl2,xl);
+    
+    // Compute both the sin and cos of the angles
+    // using a polynomial expression:
+    //   cx = 1.0f + xl2 * ((C0 * xl2 + C1) * xl2 + C2), and
+    //   sx = xl + xl3 * ((S0 * xl2 + S1) * xl2 + S2)
+    //
+    vec_float4 cx =
+		vec_madd(
+			vec_madd(
+				vec_madd(_mm_set1_ps(_SINCOS_CC0),xl2,_mm_set1_ps(_SINCOS_CC1)),xl2,_mm_set1_ps(_SINCOS_CC2)),xl2,_mm_set1_ps(1.0f));
+    vec_float4 sx =
+		vec_madd(
+			vec_madd(
+				vec_madd(_mm_set1_ps(_SINCOS_SC0),xl2,_mm_set1_ps(_SINCOS_SC1)),xl2,_mm_set1_ps(_SINCOS_SC2)),xl3,xl);
+
+    // Use the cosine when the offset is odd and the sin
+    // when the offset is even
+    //
+    vec_uint4 sinMask = (vec_uint4)vec_cmpeq(vec_and(offsetSin,toM128(0x1)),_mm_setzero_ps());
+    vec_uint4 cosMask = (vec_uint4)vec_cmpeq(vec_and(offsetCos,toM128(0x1)),_mm_setzero_ps());    
+    *s = vec_sel(cx,sx,sinMask);
+    *c = vec_sel(cx,sx,cosMask);
+
+    // Flip the sign of the result when (offset mod 4) = 1 or 2
+    //
+    sinMask = vec_cmpeq(vec_and(offsetSin,toM128(0x2)),_mm_setzero_ps());
+    cosMask = vec_cmpeq(vec_and(offsetCos,toM128(0x2)),_mm_setzero_ps());
+    
+    *s = vec_sel((vec_float4)vec_xor(toM128(0x80000000),(vec_uint4)*s),*s,sinMask);
+    *c = vec_sel((vec_float4)vec_xor(toM128(0x80000000),(vec_uint4)*c),*c,cosMask);    
+}
+
+#include "vecidx_aos.h"
+#include "floatInVec.h"
+#include "boolInVec.h"
+
+#ifdef _VECTORMATH_DEBUG
+#include <stdio.h>
+#endif
+namespace Vectormath {
+
+namespace Aos {
+
+//-----------------------------------------------------------------------------
+// Forward Declarations
+//
+
+class Vector3;
+class Vector4;
+class Point3;
+class Quat;
+class Matrix3;
+class Matrix4;
+class Transform3;
+
+// A 3-D vector in array-of-structures format
+//
+class Vector3
+{
+    __m128 mVec128;
+
+	VECTORMATH_FORCE_INLINE void set128(vec_float4 vec);
+	 
+	 VECTORMATH_FORCE_INLINE  vec_float4& get128Ref();
+
+public:
+    // Default constructor; does no initialization
+    // 
+    VECTORMATH_FORCE_INLINE Vector3( ) { };
+
+	// Default copy constructor
+    // 
+	VECTORMATH_FORCE_INLINE Vector3(const Vector3& vec);
+
+    // Construct a 3-D vector from x, y, and z elements
+    // 
+    VECTORMATH_FORCE_INLINE Vector3( float x, float y, float z );
+
+    // Construct a 3-D vector from x, y, and z elements (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Vector3( const floatInVec &x, const floatInVec &y, const floatInVec &z );
+
+    // Copy elements from a 3-D point into a 3-D vector
+    // 
+    explicit VECTORMATH_FORCE_INLINE Vector3( const Point3 &pnt );
+
+    // Set all elements of a 3-D vector to the same scalar value
+    // 
+    explicit VECTORMATH_FORCE_INLINE Vector3( float scalar );
+
+    // Set all elements of a 3-D vector to the same scalar value (scalar data contained in vector data type)
+    // 
+    explicit VECTORMATH_FORCE_INLINE Vector3( const floatInVec &scalar );
+
+    // Set vector float data in a 3-D vector
+    // 
+    explicit VECTORMATH_FORCE_INLINE Vector3( __m128 vf4 );
+
+    // Get vector float data from a 3-D vector
+    // 
+    VECTORMATH_FORCE_INLINE __m128 get128( ) const;
+
+    // Assign one 3-D vector to another
+    // 
+    VECTORMATH_FORCE_INLINE Vector3 & operator =( const Vector3 &vec );
+
+    // Set the x element of a 3-D vector
+    // 
+    VECTORMATH_FORCE_INLINE Vector3 & setX( float x );
+
+    // Set the y element of a 3-D vector
+    // 
+    VECTORMATH_FORCE_INLINE Vector3 & setY( float y );
+
+    // Set the z element of a 3-D vector
+    // 
+    VECTORMATH_FORCE_INLINE Vector3 & setZ( float z );
+
+    // Set the x element of a 3-D vector (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Vector3 & setX( const floatInVec &x );
+
+    // Set the y element of a 3-D vector (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Vector3 & setY( const floatInVec &y );
+
+    // Set the z element of a 3-D vector (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Vector3 & setZ( const floatInVec &z );
+
+    // Get the x element of a 3-D vector
+    // 
+    VECTORMATH_FORCE_INLINE const floatInVec getX( ) const;
+
+    // Get the y element of a 3-D vector
+    // 
+    VECTORMATH_FORCE_INLINE const floatInVec getY( ) const;
+
+    // Get the z element of a 3-D vector
+    // 
+    VECTORMATH_FORCE_INLINE const floatInVec getZ( ) const;
+
+    // Set an x, y, or z element of a 3-D vector by index
+    // 
+    VECTORMATH_FORCE_INLINE Vector3 & setElem( int idx, float value );
+
+    // Set an x, y, or z element of a 3-D vector by index (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Vector3 & setElem( int idx, const floatInVec &value );
+
+    // Get an x, y, or z element of a 3-D vector by index
+    // 
+    VECTORMATH_FORCE_INLINE const floatInVec getElem( int idx ) const;
+
+    // Subscripting operator to set or get an element
+    // 
+    VECTORMATH_FORCE_INLINE VecIdx operator []( int idx );
+
+    // Subscripting operator to get an element
+    // 
+    VECTORMATH_FORCE_INLINE const floatInVec operator []( int idx ) const;
+
+    // Add two 3-D vectors
+    // 
+    VECTORMATH_FORCE_INLINE const Vector3 operator +( const Vector3 &vec ) const;
+
+    // Subtract a 3-D vector from another 3-D vector
+    // 
+    VECTORMATH_FORCE_INLINE const Vector3 operator -( const Vector3 &vec ) const;
+
+    // Add a 3-D vector to a 3-D point
+    // 
+    VECTORMATH_FORCE_INLINE const Point3 operator +( const Point3 &pnt ) const;
+
+    // Multiply a 3-D vector by a scalar
+    // 
+    VECTORMATH_FORCE_INLINE const Vector3 operator *( float scalar ) const;
+
+    // Divide a 3-D vector by a scalar
+    // 
+    VECTORMATH_FORCE_INLINE const Vector3 operator /( float scalar ) const;
+
+    // Multiply a 3-D vector by a scalar (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE const Vector3 operator *( const floatInVec &scalar ) const;
+
+    // Divide a 3-D vector by a scalar (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE const Vector3 operator /( const floatInVec &scalar ) const;
+
+    // Perform compound assignment and addition with a 3-D vector
+    // 
+    VECTORMATH_FORCE_INLINE Vector3 & operator +=( const Vector3 &vec );
+
+    // Perform compound assignment and subtraction by a 3-D vector
+    // 
+    VECTORMATH_FORCE_INLINE Vector3 & operator -=( const Vector3 &vec );
+
+    // Perform compound assignment and multiplication by a scalar
+    // 
+    VECTORMATH_FORCE_INLINE Vector3 & operator *=( float scalar );
+
+    // Perform compound assignment and division by a scalar
+    // 
+    VECTORMATH_FORCE_INLINE Vector3 & operator /=( float scalar );
+
+    // Perform compound assignment and multiplication by a scalar (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Vector3 & operator *=( const floatInVec &scalar );
+
+    // Perform compound assignment and division by a scalar (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Vector3 & operator /=( const floatInVec &scalar );
+
+    // Negate all elements of a 3-D vector
+    // 
+    VECTORMATH_FORCE_INLINE const Vector3 operator -( ) const;
+
+    // Construct x axis
+    // 
+    static VECTORMATH_FORCE_INLINE const Vector3 xAxis( );
+
+    // Construct y axis
+    // 
+    static VECTORMATH_FORCE_INLINE const Vector3 yAxis( );
+
+    // Construct z axis
+    // 
+    static VECTORMATH_FORCE_INLINE const Vector3 zAxis( );
+
+};
+
+// Multiply a 3-D vector by a scalar
+// 
+VECTORMATH_FORCE_INLINE const Vector3 operator *( float scalar, const Vector3 &vec );
+
+// Multiply a 3-D vector by a scalar (scalar data contained in vector data type)
+// 
+VECTORMATH_FORCE_INLINE const Vector3 operator *( const floatInVec &scalar, const Vector3 &vec );
+
+// Multiply two 3-D vectors per element
+// 
+VECTORMATH_FORCE_INLINE const Vector3 mulPerElem( const Vector3 &vec0, const Vector3 &vec1 );
+
+// Divide two 3-D vectors per element
+// NOTE: 
+// Floating-point behavior matches standard library function divf4.
+// 
+VECTORMATH_FORCE_INLINE const Vector3 divPerElem( const Vector3 &vec0, const Vector3 &vec1 );
+
+// Compute the reciprocal of a 3-D vector per element
+// NOTE: 
+// Floating-point behavior matches standard library function recipf4.
+// 
+VECTORMATH_FORCE_INLINE const Vector3 recipPerElem( const Vector3 &vec );
+
+// Compute the absolute value of a 3-D vector per element
+// 
+VECTORMATH_FORCE_INLINE const Vector3 absPerElem( const Vector3 &vec );
+
+// Copy sign from one 3-D vector to another, per element
+// 
+VECTORMATH_FORCE_INLINE const Vector3 copySignPerElem( const Vector3 &vec0, const Vector3 &vec1 );
+
+// Maximum of two 3-D vectors per element
+// 
+VECTORMATH_FORCE_INLINE const Vector3 maxPerElem( const Vector3 &vec0, const Vector3 &vec1 );
+
+// Minimum of two 3-D vectors per element
+// 
+VECTORMATH_FORCE_INLINE const Vector3 minPerElem( const Vector3 &vec0, const Vector3 &vec1 );
+
+// Maximum element of a 3-D vector
+// 
+VECTORMATH_FORCE_INLINE const floatInVec maxElem( const Vector3 &vec );
+
+// Minimum element of a 3-D vector
+// 
+VECTORMATH_FORCE_INLINE const floatInVec minElem( const Vector3 &vec );
+
+// Compute the sum of all elements of a 3-D vector
+// 
+VECTORMATH_FORCE_INLINE const floatInVec sum( const Vector3 &vec );
+
+// Compute the dot product of two 3-D vectors
+// 
+VECTORMATH_FORCE_INLINE const floatInVec dot( const Vector3 &vec0, const Vector3 &vec1 );
+
+// Compute the square of the length of a 3-D vector
+// 
+VECTORMATH_FORCE_INLINE const floatInVec lengthSqr( const Vector3 &vec );
+
+// Compute the length of a 3-D vector
+// 
+VECTORMATH_FORCE_INLINE const floatInVec length( const Vector3 &vec );
+
+// Normalize a 3-D vector
+// NOTE: 
+// The result is unpredictable when all elements of vec are at or near zero.
+// 
+VECTORMATH_FORCE_INLINE const Vector3 normalize( const Vector3 &vec );
+
+// Compute cross product of two 3-D vectors
+// 
+VECTORMATH_FORCE_INLINE const Vector3 cross( const Vector3 &vec0, const Vector3 &vec1 );
+
+// Outer product of two 3-D vectors
+// 
+VECTORMATH_FORCE_INLINE const Matrix3 outer( const Vector3 &vec0, const Vector3 &vec1 );
+
+// Pre-multiply a row vector by a 3x3 matrix
+// NOTE: 
+// Slower than column post-multiply.
+// 
+VECTORMATH_FORCE_INLINE const Vector3 rowMul( const Vector3 &vec, const Matrix3 & mat );
+
+// Cross-product matrix of a 3-D vector
+// 
+VECTORMATH_FORCE_INLINE const Matrix3 crossMatrix( const Vector3 &vec );
+
+// Create cross-product matrix and multiply
+// NOTE: 
+// Faster than separately creating a cross-product matrix and multiplying.
+// 
+VECTORMATH_FORCE_INLINE const Matrix3 crossMatrixMul( const Vector3 &vec, const Matrix3 & mat );
+
+// Linear interpolation between two 3-D vectors
+// NOTE: 
+// Does not clamp t between 0 and 1.
+// 
+VECTORMATH_FORCE_INLINE const Vector3 lerp( float t, const Vector3 &vec0, const Vector3 &vec1 );
+
+// Linear interpolation between two 3-D vectors (scalar data contained in vector data type)
+// NOTE: 
+// Does not clamp t between 0 and 1.
+// 
+VECTORMATH_FORCE_INLINE const Vector3 lerp( const floatInVec &t, const Vector3 &vec0, const Vector3 &vec1 );
+
+// Spherical linear interpolation between two 3-D vectors
+// NOTE: 
+// The result is unpredictable if the vectors point in opposite directions.
+// Does not clamp t between 0 and 1.
+// 
+VECTORMATH_FORCE_INLINE const Vector3 slerp( float t, const Vector3 &unitVec0, const Vector3 &unitVec1 );
+
+// Spherical linear interpolation between two 3-D vectors (scalar data contained in vector data type)
+// NOTE: 
+// The result is unpredictable if the vectors point in opposite directions.
+// Does not clamp t between 0 and 1.
+// 
+VECTORMATH_FORCE_INLINE const Vector3 slerp( const floatInVec &t, const Vector3 &unitVec0, const Vector3 &unitVec1 );
+
+// Conditionally select between two 3-D vectors
+// NOTE: 
+// This function uses a conditional select instruction to avoid a branch.
+// However, the transfer of select1 to a VMX register may use more processing time than a branch.
+// Use the boolInVec version for better performance.
+// 
+VECTORMATH_FORCE_INLINE const Vector3 select( const Vector3 &vec0, const Vector3 &vec1, bool select1 );
+
+// Conditionally select between two 3-D vectors (scalar data contained in vector data type)
+// NOTE: 
+// This function uses a conditional select instruction to avoid a branch.
+// 
+VECTORMATH_FORCE_INLINE const Vector3 select( const Vector3 &vec0, const Vector3 &vec1, const boolInVec &select1 );
+
+// Store x, y, and z elements of 3-D vector in first three words of a quadword, preserving fourth word
+// 
+VECTORMATH_FORCE_INLINE void storeXYZ( const Vector3 &vec, __m128 * quad );
+
+// Load four three-float 3-D vectors, stored in three quadwords
+// 
+VECTORMATH_FORCE_INLINE void loadXYZArray( Vector3 & vec0, Vector3 & vec1, Vector3 & vec2, Vector3 & vec3, const __m128 * threeQuads );
+
+// Store four 3-D vectors in three quadwords
+// 
+VECTORMATH_FORCE_INLINE void storeXYZArray( const Vector3 &vec0, const Vector3 &vec1, const Vector3 &vec2, const Vector3 &vec3, __m128 * threeQuads );
+
+// Store eight 3-D vectors as half-floats
+// 
+VECTORMATH_FORCE_INLINE void storeHalfFloats( const Vector3 &vec0, const Vector3 &vec1, const Vector3 &vec2, const Vector3 &vec3, const Vector3 &vec4, const Vector3 &vec5, const Vector3 &vec6, const Vector3 &vec7, vec_ushort8 * threeQuads );
+
+#ifdef _VECTORMATH_DEBUG
+
+// Print a 3-D vector
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+VECTORMATH_FORCE_INLINE void print( const Vector3 &vec );
+
+// Print a 3-D vector and an associated string identifier
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+VECTORMATH_FORCE_INLINE void print( const Vector3 &vec, const char * name );
+
+#endif
+
+// A 4-D vector in array-of-structures format
+//
+class Vector4
+{
+    __m128 mVec128;
+
+public:
+    // Default constructor; does no initialization
+    // 
+    VECTORMATH_FORCE_INLINE Vector4( ) { };
+
+    // Construct a 4-D vector from x, y, z, and w elements
+    // 
+    VECTORMATH_FORCE_INLINE Vector4( float x, float y, float z, float w );
+
+    // Construct a 4-D vector from x, y, z, and w elements (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Vector4( const floatInVec &x, const floatInVec &y, const floatInVec &z, const floatInVec &w );
+
+    // Construct a 4-D vector from a 3-D vector and a scalar
+    // 
+    VECTORMATH_FORCE_INLINE Vector4( const Vector3 &xyz, float w );
+
+    // Construct a 4-D vector from a 3-D vector and a scalar (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Vector4( const Vector3 &xyz, const floatInVec &w );
+
+    // Copy x, y, and z from a 3-D vector into a 4-D vector, and set w to 0
+    // 
+    explicit VECTORMATH_FORCE_INLINE Vector4( const Vector3 &vec );
+
+    // Copy x, y, and z from a 3-D point into a 4-D vector, and set w to 1
+    // 
+    explicit VECTORMATH_FORCE_INLINE Vector4( const Point3 &pnt );
+
+    // Copy elements from a quaternion into a 4-D vector
+    // 
+    explicit VECTORMATH_FORCE_INLINE Vector4( const Quat &quat );
+
+    // Set all elements of a 4-D vector to the same scalar value
+    // 
+    explicit VECTORMATH_FORCE_INLINE Vector4( float scalar );
+
+    // Set all elements of a 4-D vector to the same scalar value (scalar data contained in vector data type)
+    // 
+    explicit VECTORMATH_FORCE_INLINE Vector4( const floatInVec &scalar );
+
+    // Set vector float data in a 4-D vector
+    // 
+    explicit VECTORMATH_FORCE_INLINE Vector4( __m128 vf4 );
+
+    // Get vector float data from a 4-D vector
+    // 
+    VECTORMATH_FORCE_INLINE __m128 get128( ) const;
+
+    // Assign one 4-D vector to another
+    // 
+    VECTORMATH_FORCE_INLINE Vector4 & operator =( const Vector4 &vec );
+
+    // Set the x, y, and z elements of a 4-D vector
+    // NOTE: 
+    // This function does not change the w element.
+    // 
+    VECTORMATH_FORCE_INLINE Vector4 & setXYZ( const Vector3 &vec );
+
+    // Get the x, y, and z elements of a 4-D vector
+    // 
+    VECTORMATH_FORCE_INLINE const Vector3 getXYZ( ) const;
+
+    // Set the x element of a 4-D vector
+    // 
+    VECTORMATH_FORCE_INLINE Vector4 & setX( float x );
+
+    // Set the y element of a 4-D vector
+    // 
+    VECTORMATH_FORCE_INLINE Vector4 & setY( float y );
+
+    // Set the z element of a 4-D vector
+    // 
+    VECTORMATH_FORCE_INLINE Vector4 & setZ( float z );
+
+    // Set the w element of a 4-D vector
+    // 
+    VECTORMATH_FORCE_INLINE Vector4 & setW( float w );
+
+    // Set the x element of a 4-D vector (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Vector4 & setX( const floatInVec &x );
+
+    // Set the y element of a 4-D vector (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Vector4 & setY( const floatInVec &y );
+
+    // Set the z element of a 4-D vector (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Vector4 & setZ( const floatInVec &z );
+
+    // Set the w element of a 4-D vector (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Vector4 & setW( const floatInVec &w );
+
+    // Get the x element of a 4-D vector
+    // 
+    VECTORMATH_FORCE_INLINE const floatInVec getX( ) const;
+
+    // Get the y element of a 4-D vector
+    // 
+    VECTORMATH_FORCE_INLINE const floatInVec getY( ) const;
+
+    // Get the z element of a 4-D vector
+    // 
+    VECTORMATH_FORCE_INLINE const floatInVec getZ( ) const;
+
+    // Get the w element of a 4-D vector
+    // 
+    VECTORMATH_FORCE_INLINE const floatInVec getW( ) const;
+
+    // Set an x, y, z, or w element of a 4-D vector by index
+    // 
+    VECTORMATH_FORCE_INLINE Vector4 & setElem( int idx, float value );
+
+    // Set an x, y, z, or w element of a 4-D vector by index (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Vector4 & setElem( int idx, const floatInVec &value );
+
+    // Get an x, y, z, or w element of a 4-D vector by index
+    // 
+    VECTORMATH_FORCE_INLINE const floatInVec getElem( int idx ) const;
+
+    // Subscripting operator to set or get an element
+    // 
+    VECTORMATH_FORCE_INLINE VecIdx operator []( int idx );
+
+    // Subscripting operator to get an element
+    // 
+    VECTORMATH_FORCE_INLINE const floatInVec operator []( int idx ) const;
+
+    // Add two 4-D vectors
+    // 
+    VECTORMATH_FORCE_INLINE const Vector4 operator +( const Vector4 &vec ) const;
+
+    // Subtract a 4-D vector from another 4-D vector
+    // 
+    VECTORMATH_FORCE_INLINE const Vector4 operator -( const Vector4 &vec ) const;
+
+    // Multiply a 4-D vector by a scalar
+    // 
+    VECTORMATH_FORCE_INLINE const Vector4 operator *( float scalar ) const;
+
+    // Divide a 4-D vector by a scalar
+    // 
+    VECTORMATH_FORCE_INLINE const Vector4 operator /( float scalar ) const;
+
+    // Multiply a 4-D vector by a scalar (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE const Vector4 operator *( const floatInVec &scalar ) const;
+
+    // Divide a 4-D vector by a scalar (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE const Vector4 operator /( const floatInVec &scalar ) const;
+
+    // Perform compound assignment and addition with a 4-D vector
+    // 
+    VECTORMATH_FORCE_INLINE Vector4 & operator +=( const Vector4 &vec );
+
+    // Perform compound assignment and subtraction by a 4-D vector
+    // 
+    VECTORMATH_FORCE_INLINE Vector4 & operator -=( const Vector4 &vec );
+
+    // Perform compound assignment and multiplication by a scalar
+    // 
+    VECTORMATH_FORCE_INLINE Vector4 & operator *=( float scalar );
+
+    // Perform compound assignment and division by a scalar
+    // 
+    VECTORMATH_FORCE_INLINE Vector4 & operator /=( float scalar );
+
+    // Perform compound assignment and multiplication by a scalar (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Vector4 & operator *=( const floatInVec &scalar );
+
+    // Perform compound assignment and division by a scalar (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Vector4 & operator /=( const floatInVec &scalar );
+
+    // Negate all elements of a 4-D vector
+    // 
+    VECTORMATH_FORCE_INLINE const Vector4 operator -( ) const;
+
+    // Construct x axis
+    // 
+    static VECTORMATH_FORCE_INLINE const Vector4 xAxis( );
+
+    // Construct y axis
+    // 
+    static VECTORMATH_FORCE_INLINE const Vector4 yAxis( );
+
+    // Construct z axis
+    // 
+    static VECTORMATH_FORCE_INLINE const Vector4 zAxis( );
+
+    // Construct w axis
+    // 
+    static VECTORMATH_FORCE_INLINE const Vector4 wAxis( );
+
+};
+
+// Multiply a 4-D vector by a scalar
+// 
+VECTORMATH_FORCE_INLINE const Vector4 operator *( float scalar, const Vector4 &vec );
+
+// Multiply a 4-D vector by a scalar (scalar data contained in vector data type)
+// 
+VECTORMATH_FORCE_INLINE const Vector4 operator *( const floatInVec &scalar, const Vector4 &vec );
+
+// Multiply two 4-D vectors per element
+// 
+VECTORMATH_FORCE_INLINE const Vector4 mulPerElem( const Vector4 &vec0, const Vector4 &vec1 );
+
+// Divide two 4-D vectors per element
+// NOTE: 
+// Floating-point behavior matches standard library function divf4.
+// 
+VECTORMATH_FORCE_INLINE const Vector4 divPerElem( const Vector4 &vec0, const Vector4 &vec1 );
+
+// Compute the reciprocal of a 4-D vector per element
+// NOTE: 
+// Floating-point behavior matches standard library function recipf4.
+// 
+VECTORMATH_FORCE_INLINE const Vector4 recipPerElem( const Vector4 &vec );
+
+// Compute the absolute value of a 4-D vector per element
+// 
+VECTORMATH_FORCE_INLINE const Vector4 absPerElem( const Vector4 &vec );
+
+// Copy sign from one 4-D vector to another, per element
+// 
+VECTORMATH_FORCE_INLINE const Vector4 copySignPerElem( const Vector4 &vec0, const Vector4 &vec1 );
+
+// Maximum of two 4-D vectors per element
+// 
+VECTORMATH_FORCE_INLINE const Vector4 maxPerElem( const Vector4 &vec0, const Vector4 &vec1 );
+
+// Minimum of two 4-D vectors per element
+// 
+VECTORMATH_FORCE_INLINE const Vector4 minPerElem( const Vector4 &vec0, const Vector4 &vec1 );
+
+// Maximum element of a 4-D vector
+// 
+VECTORMATH_FORCE_INLINE const floatInVec maxElem( const Vector4 &vec );
+
+// Minimum element of a 4-D vector
+// 
+VECTORMATH_FORCE_INLINE const floatInVec minElem( const Vector4 &vec );
+
+// Compute the sum of all elements of a 4-D vector
+// 
+VECTORMATH_FORCE_INLINE const floatInVec sum( const Vector4 &vec );
+
+// Compute the dot product of two 4-D vectors
+// 
+VECTORMATH_FORCE_INLINE const floatInVec dot( const Vector4 &vec0, const Vector4 &vec1 );
+
+// Compute the square of the length of a 4-D vector
+// 
+VECTORMATH_FORCE_INLINE const floatInVec lengthSqr( const Vector4 &vec );
+
+// Compute the length of a 4-D vector
+// 
+VECTORMATH_FORCE_INLINE const floatInVec length( const Vector4 &vec );
+
+// Normalize a 4-D vector
+// NOTE: 
+// The result is unpredictable when all elements of vec are at or near zero.
+// 
+VECTORMATH_FORCE_INLINE const Vector4 normalize( const Vector4 &vec );
+
+// Outer product of two 4-D vectors
+// 
+VECTORMATH_FORCE_INLINE const Matrix4 outer( const Vector4 &vec0, const Vector4 &vec1 );
+
+// Linear interpolation between two 4-D vectors
+// NOTE: 
+// Does not clamp t between 0 and 1.
+// 
+VECTORMATH_FORCE_INLINE const Vector4 lerp( float t, const Vector4 &vec0, const Vector4 &vec1 );
+
+// Linear interpolation between two 4-D vectors (scalar data contained in vector data type)
+// NOTE: 
+// Does not clamp t between 0 and 1.
+// 
+VECTORMATH_FORCE_INLINE const Vector4 lerp( const floatInVec &t, const Vector4 &vec0, const Vector4 &vec1 );
+
+// Spherical linear interpolation between two 4-D vectors
+// NOTE: 
+// The result is unpredictable if the vectors point in opposite directions.
+// Does not clamp t between 0 and 1.
+// 
+VECTORMATH_FORCE_INLINE const Vector4 slerp( float t, const Vector4 &unitVec0, const Vector4 &unitVec1 );
+
+// Spherical linear interpolation between two 4-D vectors (scalar data contained in vector data type)
+// NOTE: 
+// The result is unpredictable if the vectors point in opposite directions.
+// Does not clamp t between 0 and 1.
+// 
+VECTORMATH_FORCE_INLINE const Vector4 slerp( const floatInVec &t, const Vector4 &unitVec0, const Vector4 &unitVec1 );
+
+// Conditionally select between two 4-D vectors
+// NOTE: 
+// This function uses a conditional select instruction to avoid a branch.
+// However, the transfer of select1 to a VMX register may use more processing time than a branch.
+// Use the boolInVec version for better performance.
+// 
+VECTORMATH_FORCE_INLINE const Vector4 select( const Vector4 &vec0, const Vector4 &vec1, bool select1 );
+
+// Conditionally select between two 4-D vectors (scalar data contained in vector data type)
+// NOTE: 
+// This function uses a conditional select instruction to avoid a branch.
+// 
+VECTORMATH_FORCE_INLINE const Vector4 select( const Vector4 &vec0, const Vector4 &vec1, const boolInVec &select1 );
+
+// Store four 4-D vectors as half-floats
+// 
+VECTORMATH_FORCE_INLINE void storeHalfFloats( const Vector4 &vec0, const Vector4 &vec1, const Vector4 &vec2, const Vector4 &vec3, vec_ushort8 * twoQuads );
+
+#ifdef _VECTORMATH_DEBUG
+
+// Print a 4-D vector
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+VECTORMATH_FORCE_INLINE void print( const Vector4 &vec );
+
+// Print a 4-D vector and an associated string identifier
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+VECTORMATH_FORCE_INLINE void print( const Vector4 &vec, const char * name );
+
+#endif
+
+// A 3-D point in array-of-structures format
+//
+class Point3
+{
+    __m128 mVec128;
+
+public:
+    // Default constructor; does no initialization
+    // 
+    VECTORMATH_FORCE_INLINE Point3( ) { };
+
+    // Construct a 3-D point from x, y, and z elements
+    // 
+    VECTORMATH_FORCE_INLINE Point3( float x, float y, float z );
+
+    // Construct a 3-D point from x, y, and z elements (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Point3( const floatInVec &x, const floatInVec &y, const floatInVec &z );
+
+    // Copy elements from a 3-D vector into a 3-D point
+    // 
+    explicit VECTORMATH_FORCE_INLINE Point3( const Vector3 &vec );
+
+    // Set all elements of a 3-D point to the same scalar value
+    // 
+    explicit VECTORMATH_FORCE_INLINE Point3( float scalar );
+
+    // Set all elements of a 3-D point to the same scalar value (scalar data contained in vector data type)
+    // 
+    explicit VECTORMATH_FORCE_INLINE Point3( const floatInVec &scalar );
+
+    // Set vector float data in a 3-D point
+    // 
+    explicit VECTORMATH_FORCE_INLINE Point3( __m128 vf4 );
+
+    // Get vector float data from a 3-D point
+    // 
+    VECTORMATH_FORCE_INLINE __m128 get128( ) const;
+
+    // Assign one 3-D point to another
+    // 
+    VECTORMATH_FORCE_INLINE Point3 & operator =( const Point3 &pnt );
+
+    // Set the x element of a 3-D point
+    // 
+    VECTORMATH_FORCE_INLINE Point3 & setX( float x );
+
+    // Set the y element of a 3-D point
+    // 
+    VECTORMATH_FORCE_INLINE Point3 & setY( float y );
+
+    // Set the z element of a 3-D point
+    // 
+    VECTORMATH_FORCE_INLINE Point3 & setZ( float z );
+
+    // Set the x element of a 3-D point (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Point3 & setX( const floatInVec &x );
+
+    // Set the y element of a 3-D point (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Point3 & setY( const floatInVec &y );
+
+    // Set the z element of a 3-D point (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Point3 & setZ( const floatInVec &z );
+
+    // Get the x element of a 3-D point
+    // 
+    VECTORMATH_FORCE_INLINE const floatInVec getX( ) const;
+
+    // Get the y element of a 3-D point
+    // 
+    VECTORMATH_FORCE_INLINE const floatInVec getY( ) const;
+
+    // Get the z element of a 3-D point
+    // 
+    VECTORMATH_FORCE_INLINE const floatInVec getZ( ) const;
+
+    // Set an x, y, or z element of a 3-D point by index
+    // 
+    VECTORMATH_FORCE_INLINE Point3 & setElem( int idx, float value );
+
+    // Set an x, y, or z element of a 3-D point by index (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Point3 & setElem( int idx, const floatInVec &value );
+
+    // Get an x, y, or z element of a 3-D point by index
+    // 
+    VECTORMATH_FORCE_INLINE const floatInVec getElem( int idx ) const;
+
+    // Subscripting operator to set or get an element
+    // 
+    VECTORMATH_FORCE_INLINE VecIdx operator []( int idx );
+
+    // Subscripting operator to get an element
+    // 
+    VECTORMATH_FORCE_INLINE const floatInVec operator []( int idx ) const;
+
+    // Subtract a 3-D point from another 3-D point
+    // 
+    VECTORMATH_FORCE_INLINE const Vector3 operator -( const Point3 &pnt ) const;
+
+    // Add a 3-D point to a 3-D vector
+    // 
+    VECTORMATH_FORCE_INLINE const Point3 operator +( const Vector3 &vec ) const;
+
+    // Subtract a 3-D vector from a 3-D point
+    // 
+    VECTORMATH_FORCE_INLINE const Point3 operator -( const Vector3 &vec ) const;
+
+    // Perform compound assignment and addition with a 3-D vector
+    // 
+    VECTORMATH_FORCE_INLINE Point3 & operator +=( const Vector3 &vec );
+
+    // Perform compound assignment and subtraction by a 3-D vector
+    // 
+    VECTORMATH_FORCE_INLINE Point3 & operator -=( const Vector3 &vec );
+
+};
+
+// Multiply two 3-D points per element
+// 
+VECTORMATH_FORCE_INLINE const Point3 mulPerElem( const Point3 &pnt0, const Point3 &pnt1 );
+
+// Divide two 3-D points per element
+// NOTE: 
+// Floating-point behavior matches standard library function divf4.
+// 
+VECTORMATH_FORCE_INLINE const Point3 divPerElem( const Point3 &pnt0, const Point3 &pnt1 );
+
+// Compute the reciprocal of a 3-D point per element
+// NOTE: 
+// Floating-point behavior matches standard library function recipf4.
+// 
+VECTORMATH_FORCE_INLINE const Point3 recipPerElem( const Point3 &pnt );
+
+// Compute the absolute value of a 3-D point per element
+// 
+VECTORMATH_FORCE_INLINE const Point3 absPerElem( const Point3 &pnt );
+
+// Copy sign from one 3-D point to another, per element
+// 
+VECTORMATH_FORCE_INLINE const Point3 copySignPerElem( const Point3 &pnt0, const Point3 &pnt1 );
+
+// Maximum of two 3-D points per element
+// 
+VECTORMATH_FORCE_INLINE const Point3 maxPerElem( const Point3 &pnt0, const Point3 &pnt1 );
+
+// Minimum of two 3-D points per element
+// 
+VECTORMATH_FORCE_INLINE const Point3 minPerElem( const Point3 &pnt0, const Point3 &pnt1 );
+
+// Maximum element of a 3-D point
+// 
+VECTORMATH_FORCE_INLINE const floatInVec maxElem( const Point3 &pnt );
+
+// Minimum element of a 3-D point
+// 
+VECTORMATH_FORCE_INLINE const floatInVec minElem( const Point3 &pnt );
+
+// Compute the sum of all elements of a 3-D point
+// 
+VECTORMATH_FORCE_INLINE const floatInVec sum( const Point3 &pnt );
+
+// Apply uniform scale to a 3-D point
+// 
+VECTORMATH_FORCE_INLINE const Point3 scale( const Point3 &pnt, float scaleVal );
+
+// Apply uniform scale to a 3-D point (scalar data contained in vector data type)
+// 
+VECTORMATH_FORCE_INLINE const Point3 scale( const Point3 &pnt, const floatInVec &scaleVal );
+
+// Apply non-uniform scale to a 3-D point
+// 
+VECTORMATH_FORCE_INLINE const Point3 scale( const Point3 &pnt, const Vector3 &scaleVec );
+
+// Scalar projection of a 3-D point on a unit-length 3-D vector
+// 
+VECTORMATH_FORCE_INLINE const floatInVec projection( const Point3 &pnt, const Vector3 &unitVec );
+
+// Compute the square of the distance of a 3-D point from the coordinate-system origin
+// 
+VECTORMATH_FORCE_INLINE const floatInVec distSqrFromOrigin( const Point3 &pnt );
+
+// Compute the distance of a 3-D point from the coordinate-system origin
+// 
+VECTORMATH_FORCE_INLINE const floatInVec distFromOrigin( const Point3 &pnt );
+
+// Compute the square of the distance between two 3-D points
+// 
+VECTORMATH_FORCE_INLINE const floatInVec distSqr( const Point3 &pnt0, const Point3 &pnt1 );
+
+// Compute the distance between two 3-D points
+// 
+VECTORMATH_FORCE_INLINE const floatInVec dist( const Point3 &pnt0, const Point3 &pnt1 );
+
+// Linear interpolation between two 3-D points
+// NOTE: 
+// Does not clamp t between 0 and 1.
+// 
+VECTORMATH_FORCE_INLINE const Point3 lerp( float t, const Point3 &pnt0, const Point3 &pnt1 );
+
+// Linear interpolation between two 3-D points (scalar data contained in vector data type)
+// NOTE: 
+// Does not clamp t between 0 and 1.
+// 
+VECTORMATH_FORCE_INLINE const Point3 lerp( const floatInVec &t, const Point3 &pnt0, const Point3 &pnt1 );
+
+// Conditionally select between two 3-D points
+// NOTE: 
+// This function uses a conditional select instruction to avoid a branch.
+// However, the transfer of select1 to a VMX register may use more processing time than a branch.
+// Use the boolInVec version for better performance.
+// 
+VECTORMATH_FORCE_INLINE const Point3 select( const Point3 &pnt0, const Point3 &pnt1, bool select1 );
+
+// Conditionally select between two 3-D points (scalar data contained in vector data type)
+// NOTE: 
+// This function uses a conditional select instruction to avoid a branch.
+// 
+VECTORMATH_FORCE_INLINE const Point3 select( const Point3 &pnt0, const Point3 &pnt1, const boolInVec &select1 );
+
+// Store x, y, and z elements of 3-D point in first three words of a quadword, preserving fourth word
+// 
+VECTORMATH_FORCE_INLINE void storeXYZ( const Point3 &pnt, __m128 * quad );
+
+// Load four three-float 3-D points, stored in three quadwords
+// 
+VECTORMATH_FORCE_INLINE void loadXYZArray( Point3 & pnt0, Point3 & pnt1, Point3 & pnt2, Point3 & pnt3, const __m128 * threeQuads );
+
+// Store four 3-D points in three quadwords
+// 
+VECTORMATH_FORCE_INLINE void storeXYZArray( const Point3 &pnt0, const Point3 &pnt1, const Point3 &pnt2, const Point3 &pnt3, __m128 * threeQuads );
+
+// Store eight 3-D points as half-floats
+// 
+VECTORMATH_FORCE_INLINE void storeHalfFloats( const Point3 &pnt0, const Point3 &pnt1, const Point3 &pnt2, const Point3 &pnt3, const Point3 &pnt4, const Point3 &pnt5, const Point3 &pnt6, const Point3 &pnt7, vec_ushort8 * threeQuads );
+
+#ifdef _VECTORMATH_DEBUG
+
+// Print a 3-D point
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+VECTORMATH_FORCE_INLINE void print( const Point3 &pnt );
+
+// Print a 3-D point and an associated string identifier
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+VECTORMATH_FORCE_INLINE void print( const Point3 &pnt, const char * name );
+
+#endif
+
+// A quaternion in array-of-structures format
+//
+class Quat
+{
+    __m128 mVec128;
+
+public:
+    // Default constructor; does no initialization
+    // 
+    VECTORMATH_FORCE_INLINE Quat( ) { };
+
+	VECTORMATH_FORCE_INLINE  Quat(const Quat& quat);
+
+    // Construct a quaternion from x, y, z, and w elements
+    // 
+    VECTORMATH_FORCE_INLINE Quat( float x, float y, float z, float w );
+
+    // Construct a quaternion from x, y, z, and w elements (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Quat( const floatInVec &x, const floatInVec &y, const floatInVec &z, const floatInVec &w );
+
+    // Construct a quaternion from a 3-D vector and a scalar
+    // 
+    VECTORMATH_FORCE_INLINE Quat( const Vector3 &xyz, float w );
+
+    // Construct a quaternion from a 3-D vector and a scalar (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Quat( const Vector3 &xyz, const floatInVec &w );
+
+    // Copy elements from a 4-D vector into a quaternion
+    // 
+    explicit VECTORMATH_FORCE_INLINE Quat( const Vector4 &vec );
+
+    // Convert a rotation matrix to a unit-length quaternion
+    // 
+    explicit VECTORMATH_FORCE_INLINE Quat( const Matrix3 & rotMat );
+
+    // Set all elements of a quaternion to the same scalar value
+    // 
+    explicit VECTORMATH_FORCE_INLINE Quat( float scalar );
+
+    // Set all elements of a quaternion to the same scalar value (scalar data contained in vector data type)
+    // 
+    explicit VECTORMATH_FORCE_INLINE Quat( const floatInVec &scalar );
+
+    // Set vector float data in a quaternion
+    // 
+    explicit VECTORMATH_FORCE_INLINE Quat( __m128 vf4 );
+
+    // Get vector float data from a quaternion
+    // 
+    VECTORMATH_FORCE_INLINE __m128 get128( ) const;
+
+	// Set a quaterion from vector float data
+    //
+	VECTORMATH_FORCE_INLINE void set128(vec_float4 vec);
+
+    // Assign one quaternion to another
+    // 
+    VECTORMATH_FORCE_INLINE Quat & operator =( const Quat &quat );
+
+    // Set the x, y, and z elements of a quaternion
+    // NOTE: 
+    // This function does not change the w element.
+    // 
+    VECTORMATH_FORCE_INLINE Quat & setXYZ( const Vector3 &vec );
+
+    // Get the x, y, and z elements of a quaternion
+    // 
+    VECTORMATH_FORCE_INLINE const Vector3 getXYZ( ) const;
+
+    // Set the x element of a quaternion
+    // 
+    VECTORMATH_FORCE_INLINE Quat & setX( float x );
+
+    // Set the y element of a quaternion
+    // 
+    VECTORMATH_FORCE_INLINE Quat & setY( float y );
+
+    // Set the z element of a quaternion
+    // 
+    VECTORMATH_FORCE_INLINE Quat & setZ( float z );
+
+    // Set the w element of a quaternion
+    // 
+    VECTORMATH_FORCE_INLINE Quat & setW( float w );
+
+    // Set the x element of a quaternion (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Quat & setX( const floatInVec &x );
+
+    // Set the y element of a quaternion (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Quat & setY( const floatInVec &y );
+
+    // Set the z element of a quaternion (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Quat & setZ( const floatInVec &z );
+
+    // Set the w element of a quaternion (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Quat & setW( const floatInVec &w );
+
+    // Get the x element of a quaternion
+    // 
+    VECTORMATH_FORCE_INLINE const floatInVec getX( ) const;
+
+    // Get the y element of a quaternion
+    // 
+    VECTORMATH_FORCE_INLINE const floatInVec getY( ) const;
+
+    // Get the z element of a quaternion
+    // 
+    VECTORMATH_FORCE_INLINE const floatInVec getZ( ) const;
+
+    // Get the w element of a quaternion
+    // 
+    VECTORMATH_FORCE_INLINE const floatInVec getW( ) const;
+
+    // Set an x, y, z, or w element of a quaternion by index
+    // 
+    VECTORMATH_FORCE_INLINE Quat & setElem( int idx, float value );
+
+    // Set an x, y, z, or w element of a quaternion by index (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Quat & setElem( int idx, const floatInVec &value );
+
+    // Get an x, y, z, or w element of a quaternion by index
+    // 
+    VECTORMATH_FORCE_INLINE const floatInVec getElem( int idx ) const;
+
+    // Subscripting operator to set or get an element
+    // 
+    VECTORMATH_FORCE_INLINE VecIdx operator []( int idx );
+
+    // Subscripting operator to get an element
+    // 
+    VECTORMATH_FORCE_INLINE const floatInVec operator []( int idx ) const;
+
+    // Add two quaternions
+    // 
+    VECTORMATH_FORCE_INLINE const Quat operator +( const Quat &quat ) const;
+
+    // Subtract a quaternion from another quaternion
+    // 
+    VECTORMATH_FORCE_INLINE const Quat operator -( const Quat &quat ) const;
+
+    // Multiply two quaternions
+    // 
+    VECTORMATH_FORCE_INLINE const Quat operator *( const Quat &quat ) const;
+
+    // Multiply a quaternion by a scalar
+    // 
+    VECTORMATH_FORCE_INLINE const Quat operator *( float scalar ) const;
+
+    // Divide a quaternion by a scalar
+    // 
+    VECTORMATH_FORCE_INLINE const Quat operator /( float scalar ) const;
+
+    // Multiply a quaternion by a scalar (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE const Quat operator *( const floatInVec &scalar ) const;
+
+    // Divide a quaternion by a scalar (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE const Quat operator /( const floatInVec &scalar ) const;
+
+    // Perform compound assignment and addition with a quaternion
+    // 
+    VECTORMATH_FORCE_INLINE Quat & operator +=( const Quat &quat );
+
+    // Perform compound assignment and subtraction by a quaternion
+    // 
+    VECTORMATH_FORCE_INLINE Quat & operator -=( const Quat &quat );
+
+    // Perform compound assignment and multiplication by a quaternion
+    // 
+    VECTORMATH_FORCE_INLINE Quat & operator *=( const Quat &quat );
+
+    // Perform compound assignment and multiplication by a scalar
+    // 
+    VECTORMATH_FORCE_INLINE Quat & operator *=( float scalar );
+
+    // Perform compound assignment and division by a scalar
+    // 
+    VECTORMATH_FORCE_INLINE Quat & operator /=( float scalar );
+
+    // Perform compound assignment and multiplication by a scalar (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Quat & operator *=( const floatInVec &scalar );
+
+    // Perform compound assignment and division by a scalar (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Quat & operator /=( const floatInVec &scalar );
+
+    // Negate all elements of a quaternion
+    // 
+    VECTORMATH_FORCE_INLINE const Quat operator -( ) const;
+
+    // Construct an identity quaternion
+    // 
+    static VECTORMATH_FORCE_INLINE const Quat identity( );
+
+    // Construct a quaternion to rotate between two unit-length 3-D vectors
+    // NOTE: 
+    // The result is unpredictable if unitVec0 and unitVec1 point in opposite directions.
+    // 
+    static VECTORMATH_FORCE_INLINE const Quat rotation( const Vector3 &unitVec0, const Vector3 &unitVec1 );
+
+    // Construct a quaternion to rotate around a unit-length 3-D vector
+    // 
+    static VECTORMATH_FORCE_INLINE const Quat rotation( float radians, const Vector3 &unitVec );
+
+    // Construct a quaternion to rotate around a unit-length 3-D vector (scalar data contained in vector data type)
+    // 
+    static VECTORMATH_FORCE_INLINE const Quat rotation( const floatInVec &radians, const Vector3 &unitVec );
+
+    // Construct a quaternion to rotate around the x axis
+    // 
+    static VECTORMATH_FORCE_INLINE const Quat rotationX( float radians );
+
+    // Construct a quaternion to rotate around the y axis
+    // 
+    static VECTORMATH_FORCE_INLINE const Quat rotationY( float radians );
+
+    // Construct a quaternion to rotate around the z axis
+    // 
+    static VECTORMATH_FORCE_INLINE const Quat rotationZ( float radians );
+
+    // Construct a quaternion to rotate around the x axis (scalar data contained in vector data type)
+    // 
+    static VECTORMATH_FORCE_INLINE const Quat rotationX( const floatInVec &radians );
+
+    // Construct a quaternion to rotate around the y axis (scalar data contained in vector data type)
+    // 
+    static VECTORMATH_FORCE_INLINE const Quat rotationY( const floatInVec &radians );
+
+    // Construct a quaternion to rotate around the z axis (scalar data contained in vector data type)
+    // 
+    static VECTORMATH_FORCE_INLINE const Quat rotationZ( const floatInVec &radians );
+
+};
+
+// Multiply a quaternion by a scalar
+// 
+VECTORMATH_FORCE_INLINE const Quat operator *( float scalar, const Quat &quat );
+
+// Multiply a quaternion by a scalar (scalar data contained in vector data type)
+// 
+VECTORMATH_FORCE_INLINE const Quat operator *( const floatInVec &scalar, const Quat &quat );
+
+// Compute the conjugate of a quaternion
+// 
+VECTORMATH_FORCE_INLINE const Quat conj( const Quat &quat );
+
+// Use a unit-length quaternion to rotate a 3-D vector
+// 
+VECTORMATH_FORCE_INLINE const Vector3 rotate( const Quat &unitQuat, const Vector3 &vec );
+
+// Compute the dot product of two quaternions
+// 
+VECTORMATH_FORCE_INLINE const floatInVec dot( const Quat &quat0, const Quat &quat1 );
+
+// Compute the norm of a quaternion
+// 
+VECTORMATH_FORCE_INLINE const floatInVec norm( const Quat &quat );
+
+// Compute the length of a quaternion
+// 
+VECTORMATH_FORCE_INLINE const floatInVec length( const Quat &quat );
+
+// Normalize a quaternion
+// NOTE: 
+// The result is unpredictable when all elements of quat are at or near zero.
+// 
+VECTORMATH_FORCE_INLINE const Quat normalize( const Quat &quat );
+
+// Linear interpolation between two quaternions
+// NOTE: 
+// Does not clamp t between 0 and 1.
+// 
+VECTORMATH_FORCE_INLINE const Quat lerp( float t, const Quat &quat0, const Quat &quat1 );
+
+// Linear interpolation between two quaternions (scalar data contained in vector data type)
+// NOTE: 
+// Does not clamp t between 0 and 1.
+// 
+VECTORMATH_FORCE_INLINE const Quat lerp( const floatInVec &t, const Quat &quat0, const Quat &quat1 );
+
+// Spherical linear interpolation between two quaternions
+// NOTE: 
+// Interpolates along the shortest path between orientations.
+// Does not clamp t between 0 and 1.
+// 
+VECTORMATH_FORCE_INLINE const Quat slerp( float t, const Quat &unitQuat0, const Quat &unitQuat1 );
+
+// Spherical linear interpolation between two quaternions (scalar data contained in vector data type)
+// NOTE: 
+// Interpolates along the shortest path between orientations.
+// Does not clamp t between 0 and 1.
+// 
+VECTORMATH_FORCE_INLINE const Quat slerp( const floatInVec &t, const Quat &unitQuat0, const Quat &unitQuat1 );
+
+// Spherical quadrangle interpolation
+// 
+VECTORMATH_FORCE_INLINE const Quat squad( float t, const Quat &unitQuat0, const Quat &unitQuat1, const Quat &unitQuat2, const Quat &unitQuat3 );
+
+// Spherical quadrangle interpolation (scalar data contained in vector data type)
+// 
+VECTORMATH_FORCE_INLINE const Quat squad( const floatInVec &t, const Quat &unitQuat0, const Quat &unitQuat1, const Quat &unitQuat2, const Quat &unitQuat3 );
+
+// Conditionally select between two quaternions
+// NOTE: 
+// This function uses a conditional select instruction to avoid a branch.
+// However, the transfer of select1 to a VMX register may use more processing time than a branch.
+// Use the boolInVec version for better performance.
+// 
+VECTORMATH_FORCE_INLINE const Quat select( const Quat &quat0, const Quat &quat1, bool select1 );
+
+// Conditionally select between two quaternions (scalar data contained in vector data type)
+// NOTE: 
+// This function uses a conditional select instruction to avoid a branch.
+// 
+VECTORMATH_FORCE_INLINE const Quat select( const Quat &quat0, const Quat &quat1, const boolInVec &select1 );
+
+#ifdef _VECTORMATH_DEBUG
+
+// Print a quaternion
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+VECTORMATH_FORCE_INLINE void print( const Quat &quat );
+
+// Print a quaternion and an associated string identifier
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+VECTORMATH_FORCE_INLINE void print( const Quat &quat, const char * name );
+
+#endif
+
+// A 3x3 matrix in array-of-structures format
+//
+class Matrix3
+{
+    Vector3 mCol0;
+    Vector3 mCol1;
+    Vector3 mCol2;
+
+public:
+    // Default constructor; does no initialization
+    // 
+    VECTORMATH_FORCE_INLINE Matrix3( ) { };
+
+    // Copy a 3x3 matrix
+    // 
+    VECTORMATH_FORCE_INLINE Matrix3( const Matrix3 & mat );
+
+    // Construct a 3x3 matrix containing the specified columns
+    // 
+    VECTORMATH_FORCE_INLINE Matrix3( const Vector3 &col0, const Vector3 &col1, const Vector3 &col2 );
+
+    // Construct a 3x3 rotation matrix from a unit-length quaternion
+    // 
+    explicit VECTORMATH_FORCE_INLINE Matrix3( const Quat &unitQuat );
+
+    // Set all elements of a 3x3 matrix to the same scalar value
+    // 
+    explicit VECTORMATH_FORCE_INLINE Matrix3( float scalar );
+
+    // Set all elements of a 3x3 matrix to the same scalar value (scalar data contained in vector data type)
+    // 
+    explicit VECTORMATH_FORCE_INLINE Matrix3( const floatInVec &scalar );
+
+    // Assign one 3x3 matrix to another
+    // 
+    VECTORMATH_FORCE_INLINE Matrix3 & operator =( const Matrix3 & mat );
+
+    // Set column 0 of a 3x3 matrix
+    // 
+    VECTORMATH_FORCE_INLINE Matrix3 & setCol0( const Vector3 &col0 );
+
+    // Set column 1 of a 3x3 matrix
+    // 
+    VECTORMATH_FORCE_INLINE Matrix3 & setCol1( const Vector3 &col1 );
+
+    // Set column 2 of a 3x3 matrix
+    // 
+    VECTORMATH_FORCE_INLINE Matrix3 & setCol2( const Vector3 &col2 );
+
+    // Get column 0 of a 3x3 matrix
+    // 
+    VECTORMATH_FORCE_INLINE const Vector3 getCol0( ) const;
+
+    // Get column 1 of a 3x3 matrix
+    // 
+    VECTORMATH_FORCE_INLINE const Vector3 getCol1( ) const;
+
+    // Get column 2 of a 3x3 matrix
+    // 
+    VECTORMATH_FORCE_INLINE const Vector3 getCol2( ) const;
+
+    // Set the column of a 3x3 matrix referred to by the specified index
+    // 
+    VECTORMATH_FORCE_INLINE Matrix3 & setCol( int col, const Vector3 &vec );
+
+    // Set the row of a 3x3 matrix referred to by the specified index
+    // 
+    VECTORMATH_FORCE_INLINE Matrix3 & setRow( int row, const Vector3 &vec );
+
+    // Get the column of a 3x3 matrix referred to by the specified index
+    // 
+    VECTORMATH_FORCE_INLINE const Vector3 getCol( int col ) const;
+
+    // Get the row of a 3x3 matrix referred to by the specified index
+    // 
+    VECTORMATH_FORCE_INLINE const Vector3 getRow( int row ) const;
+
+    // Subscripting operator to set or get a column
+    // 
+    VECTORMATH_FORCE_INLINE Vector3 & operator []( int col );
+
+    // Subscripting operator to get a column
+    // 
+    VECTORMATH_FORCE_INLINE const Vector3 operator []( int col ) const;
+
+    // Set the element of a 3x3 matrix referred to by column and row indices
+    // 
+    VECTORMATH_FORCE_INLINE Matrix3 & setElem( int col, int row, float val );
+
+    // Set the element of a 3x3 matrix referred to by column and row indices (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Matrix3 & setElem( int col, int row, const floatInVec &val );
+
+    // Get the element of a 3x3 matrix referred to by column and row indices
+    // 
+    VECTORMATH_FORCE_INLINE const floatInVec getElem( int col, int row ) const;
+
+    // Add two 3x3 matrices
+    // 
+    VECTORMATH_FORCE_INLINE const Matrix3 operator +( const Matrix3 & mat ) const;
+
+    // Subtract a 3x3 matrix from another 3x3 matrix
+    // 
+    VECTORMATH_FORCE_INLINE const Matrix3 operator -( const Matrix3 & mat ) const;
+
+    // Negate all elements of a 3x3 matrix
+    // 
+    VECTORMATH_FORCE_INLINE const Matrix3 operator -( ) const;
+
+    // Multiply a 3x3 matrix by a scalar
+    // 
+    VECTORMATH_FORCE_INLINE const Matrix3 operator *( float scalar ) const;
+
+    // Multiply a 3x3 matrix by a scalar (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE const Matrix3 operator *( const floatInVec &scalar ) const;
+
+    // Multiply a 3x3 matrix by a 3-D vector
+    // 
+    VECTORMATH_FORCE_INLINE const Vector3 operator *( const Vector3 &vec ) const;
+
+    // Multiply two 3x3 matrices
+    // 
+    VECTORMATH_FORCE_INLINE const Matrix3 operator *( const Matrix3 & mat ) const;
+
+    // Perform compound assignment and addition with a 3x3 matrix
+    // 
+    VECTORMATH_FORCE_INLINE Matrix3 & operator +=( const Matrix3 & mat );
+
+    // Perform compound assignment and subtraction by a 3x3 matrix
+    // 
+    VECTORMATH_FORCE_INLINE Matrix3 & operator -=( const Matrix3 & mat );
+
+    // Perform compound assignment and multiplication by a scalar
+    // 
+    VECTORMATH_FORCE_INLINE Matrix3 & operator *=( float scalar );
+
+    // Perform compound assignment and multiplication by a scalar (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Matrix3 & operator *=( const floatInVec &scalar );
+
+    // Perform compound assignment and multiplication by a 3x3 matrix
+    // 
+    VECTORMATH_FORCE_INLINE Matrix3 & operator *=( const Matrix3 & mat );
+
+    // Construct an identity 3x3 matrix
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix3 identity( );
+
+    // Construct a 3x3 matrix to rotate around the x axis
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix3 rotationX( float radians );
+
+    // Construct a 3x3 matrix to rotate around the y axis
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix3 rotationY( float radians );
+
+    // Construct a 3x3 matrix to rotate around the z axis
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix3 rotationZ( float radians );
+
+    // Construct a 3x3 matrix to rotate around the x axis (scalar data contained in vector data type)
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix3 rotationX( const floatInVec &radians );
+
+    // Construct a 3x3 matrix to rotate around the y axis (scalar data contained in vector data type)
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix3 rotationY( const floatInVec &radians );
+
+    // Construct a 3x3 matrix to rotate around the z axis (scalar data contained in vector data type)
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix3 rotationZ( const floatInVec &radians );
+
+    // Construct a 3x3 matrix to rotate around the x, y, and z axes
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix3 rotationZYX( const Vector3 &radiansXYZ );
+
+    // Construct a 3x3 matrix to rotate around a unit-length 3-D vector
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix3 rotation( float radians, const Vector3 &unitVec );
+
+    // Construct a 3x3 matrix to rotate around a unit-length 3-D vector (scalar data contained in vector data type)
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix3 rotation( const floatInVec &radians, const Vector3 &unitVec );
+
+    // Construct a rotation matrix from a unit-length quaternion
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix3 rotation( const Quat &unitQuat );
+
+    // Construct a 3x3 matrix to perform scaling
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix3 scale( const Vector3 &scaleVec );
+
+};
+// Multiply a 3x3 matrix by a scalar
+// 
+VECTORMATH_FORCE_INLINE const Matrix3 operator *( float scalar, const Matrix3 & mat );
+
+// Multiply a 3x3 matrix by a scalar (scalar data contained in vector data type)
+// 
+VECTORMATH_FORCE_INLINE const Matrix3 operator *( const floatInVec &scalar, const Matrix3 & mat );
+
+// Append (post-multiply) a scale transformation to a 3x3 matrix
+// NOTE: 
+// Faster than creating and multiplying a scale transformation matrix.
+// 
+VECTORMATH_FORCE_INLINE const Matrix3 appendScale( const Matrix3 & mat, const Vector3 &scaleVec );
+
+// Prepend (pre-multiply) a scale transformation to a 3x3 matrix
+// NOTE: 
+// Faster than creating and multiplying a scale transformation matrix.
+// 
+VECTORMATH_FORCE_INLINE const Matrix3 prependScale( const Vector3 &scaleVec, const Matrix3 & mat );
+
+// Multiply two 3x3 matrices per element
+// 
+VECTORMATH_FORCE_INLINE const Matrix3 mulPerElem( const Matrix3 & mat0, const Matrix3 & mat1 );
+
+// Compute the absolute value of a 3x3 matrix per element
+// 
+VECTORMATH_FORCE_INLINE const Matrix3 absPerElem( const Matrix3 & mat );
+
+// Transpose of a 3x3 matrix
+// 
+VECTORMATH_FORCE_INLINE const Matrix3 transpose( const Matrix3 & mat );
+
+// Compute the inverse of a 3x3 matrix
+// NOTE: 
+// Result is unpredictable when the determinant of mat is equal to or near 0.
+// 
+VECTORMATH_FORCE_INLINE const Matrix3 inverse( const Matrix3 & mat );
+
+// Determinant of a 3x3 matrix
+// 
+VECTORMATH_FORCE_INLINE const floatInVec determinant( const Matrix3 & mat );
+
+// Conditionally select between two 3x3 matrices
+// NOTE: 
+// This function uses a conditional select instruction to avoid a branch.
+// However, the transfer of select1 to a VMX register may use more processing time than a branch.
+// Use the boolInVec version for better performance.
+// 
+VECTORMATH_FORCE_INLINE const Matrix3 select( const Matrix3 & mat0, const Matrix3 & mat1, bool select1 );
+
+// Conditionally select between two 3x3 matrices (scalar data contained in vector data type)
+// NOTE: 
+// This function uses a conditional select instruction to avoid a branch.
+// 
+VECTORMATH_FORCE_INLINE const Matrix3 select( const Matrix3 & mat0, const Matrix3 & mat1, const boolInVec &select1 );
+
+#ifdef _VECTORMATH_DEBUG
+
+// Print a 3x3 matrix
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+VECTORMATH_FORCE_INLINE void print( const Matrix3 & mat );
+
+// Print a 3x3 matrix and an associated string identifier
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+VECTORMATH_FORCE_INLINE void print( const Matrix3 & mat, const char * name );
+
+#endif
+
+// A 4x4 matrix in array-of-structures format
+//
+class Matrix4
+{
+    Vector4 mCol0;
+    Vector4 mCol1;
+    Vector4 mCol2;
+    Vector4 mCol3;
+
+public:
+    // Default constructor; does no initialization
+    // 
+    VECTORMATH_FORCE_INLINE Matrix4( ) { };
+
+    // Copy a 4x4 matrix
+    // 
+    VECTORMATH_FORCE_INLINE Matrix4( const Matrix4 & mat );
+
+    // Construct a 4x4 matrix containing the specified columns
+    // 
+    VECTORMATH_FORCE_INLINE Matrix4( const Vector4 &col0, const Vector4 &col1, const Vector4 &col2, const Vector4 &col3 );
+
+    // Construct a 4x4 matrix from a 3x4 transformation matrix
+    // 
+    explicit VECTORMATH_FORCE_INLINE Matrix4( const Transform3 & mat );
+
+    // Construct a 4x4 matrix from a 3x3 matrix and a 3-D vector
+    // 
+    VECTORMATH_FORCE_INLINE Matrix4( const Matrix3 & mat, const Vector3 &translateVec );
+
+    // Construct a 4x4 matrix from a unit-length quaternion and a 3-D vector
+    // 
+    VECTORMATH_FORCE_INLINE Matrix4( const Quat &unitQuat, const Vector3 &translateVec );
+
+    // Set all elements of a 4x4 matrix to the same scalar value
+    // 
+    explicit VECTORMATH_FORCE_INLINE Matrix4( float scalar );
+
+    // Set all elements of a 4x4 matrix to the same scalar value (scalar data contained in vector data type)
+    // 
+    explicit VECTORMATH_FORCE_INLINE Matrix4( const floatInVec &scalar );
+
+    // Assign one 4x4 matrix to another
+    // 
+    VECTORMATH_FORCE_INLINE Matrix4 & operator =( const Matrix4 & mat );
+
+    // Set the upper-left 3x3 submatrix
+    // NOTE: 
+    // This function does not change the bottom row elements.
+    // 
+    VECTORMATH_FORCE_INLINE Matrix4 & setUpper3x3( const Matrix3 & mat3 );
+
+    // Get the upper-left 3x3 submatrix of a 4x4 matrix
+    // 
+    VECTORMATH_FORCE_INLINE const Matrix3 getUpper3x3( ) const;
+
+    // Set translation component
+    // NOTE: 
+    // This function does not change the bottom row elements.
+    // 
+    VECTORMATH_FORCE_INLINE Matrix4 & setTranslation( const Vector3 &translateVec );
+
+    // Get the translation component of a 4x4 matrix
+    // 
+    VECTORMATH_FORCE_INLINE const Vector3 getTranslation( ) const;
+
+    // Set column 0 of a 4x4 matrix
+    // 
+    VECTORMATH_FORCE_INLINE Matrix4 & setCol0( const Vector4 &col0 );
+
+    // Set column 1 of a 4x4 matrix
+    // 
+    VECTORMATH_FORCE_INLINE Matrix4 & setCol1( const Vector4 &col1 );
+
+    // Set column 2 of a 4x4 matrix
+    // 
+    VECTORMATH_FORCE_INLINE Matrix4 & setCol2( const Vector4 &col2 );
+
+    // Set column 3 of a 4x4 matrix
+    // 
+    VECTORMATH_FORCE_INLINE Matrix4 & setCol3( const Vector4 &col3 );
+
+    // Get column 0 of a 4x4 matrix
+    // 
+    VECTORMATH_FORCE_INLINE const Vector4 getCol0( ) const;
+
+    // Get column 1 of a 4x4 matrix
+    // 
+    VECTORMATH_FORCE_INLINE const Vector4 getCol1( ) const;
+
+    // Get column 2 of a 4x4 matrix
+    // 
+    VECTORMATH_FORCE_INLINE const Vector4 getCol2( ) const;
+
+    // Get column 3 of a 4x4 matrix
+    // 
+    VECTORMATH_FORCE_INLINE const Vector4 getCol3( ) const;
+
+    // Set the column of a 4x4 matrix referred to by the specified index
+    // 
+    VECTORMATH_FORCE_INLINE Matrix4 & setCol( int col, const Vector4 &vec );
+
+    // Set the row of a 4x4 matrix referred to by the specified index
+    // 
+    VECTORMATH_FORCE_INLINE Matrix4 & setRow( int row, const Vector4 &vec );
+
+    // Get the column of a 4x4 matrix referred to by the specified index
+    // 
+    VECTORMATH_FORCE_INLINE const Vector4 getCol( int col ) const;
+
+    // Get the row of a 4x4 matrix referred to by the specified index
+    // 
+    VECTORMATH_FORCE_INLINE const Vector4 getRow( int row ) const;
+
+    // Subscripting operator to set or get a column
+    // 
+    VECTORMATH_FORCE_INLINE Vector4 & operator []( int col );
+
+    // Subscripting operator to get a column
+    // 
+    VECTORMATH_FORCE_INLINE const Vector4 operator []( int col ) const;
+
+    // Set the element of a 4x4 matrix referred to by column and row indices
+    // 
+    VECTORMATH_FORCE_INLINE Matrix4 & setElem( int col, int row, float val );
+
+    // Set the element of a 4x4 matrix referred to by column and row indices (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Matrix4 & setElem( int col, int row, const floatInVec &val );
+
+    // Get the element of a 4x4 matrix referred to by column and row indices
+    // 
+    VECTORMATH_FORCE_INLINE const floatInVec getElem( int col, int row ) const;
+
+    // Add two 4x4 matrices
+    // 
+    VECTORMATH_FORCE_INLINE const Matrix4 operator +( const Matrix4 & mat ) const;
+
+    // Subtract a 4x4 matrix from another 4x4 matrix
+    // 
+    VECTORMATH_FORCE_INLINE const Matrix4 operator -( const Matrix4 & mat ) const;
+
+    // Negate all elements of a 4x4 matrix
+    // 
+    VECTORMATH_FORCE_INLINE const Matrix4 operator -( ) const;
+
+    // Multiply a 4x4 matrix by a scalar
+    // 
+    VECTORMATH_FORCE_INLINE const Matrix4 operator *( float scalar ) const;
+
+    // Multiply a 4x4 matrix by a scalar (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE const Matrix4 operator *( const floatInVec &scalar ) const;
+
+    // Multiply a 4x4 matrix by a 4-D vector
+    // 
+    VECTORMATH_FORCE_INLINE const Vector4 operator *( const Vector4 &vec ) const;
+
+    // Multiply a 4x4 matrix by a 3-D vector
+    // 
+    VECTORMATH_FORCE_INLINE const Vector4 operator *( const Vector3 &vec ) const;
+
+    // Multiply a 4x4 matrix by a 3-D point
+    // 
+    VECTORMATH_FORCE_INLINE const Vector4 operator *( const Point3 &pnt ) const;
+
+    // Multiply two 4x4 matrices
+    // 
+    VECTORMATH_FORCE_INLINE const Matrix4 operator *( const Matrix4 & mat ) const;
+
+    // Multiply a 4x4 matrix by a 3x4 transformation matrix
+    // 
+    VECTORMATH_FORCE_INLINE const Matrix4 operator *( const Transform3 & tfrm ) const;
+
+    // Perform compound assignment and addition with a 4x4 matrix
+    // 
+    VECTORMATH_FORCE_INLINE Matrix4 & operator +=( const Matrix4 & mat );
+
+    // Perform compound assignment and subtraction by a 4x4 matrix
+    // 
+    VECTORMATH_FORCE_INLINE Matrix4 & operator -=( const Matrix4 & mat );
+
+    // Perform compound assignment and multiplication by a scalar
+    // 
+    VECTORMATH_FORCE_INLINE Matrix4 & operator *=( float scalar );
+
+    // Perform compound assignment and multiplication by a scalar (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Matrix4 & operator *=( const floatInVec &scalar );
+
+    // Perform compound assignment and multiplication by a 4x4 matrix
+    // 
+    VECTORMATH_FORCE_INLINE Matrix4 & operator *=( const Matrix4 & mat );
+
+    // Perform compound assignment and multiplication by a 3x4 transformation matrix
+    // 
+    VECTORMATH_FORCE_INLINE Matrix4 & operator *=( const Transform3 & tfrm );
+
+    // Construct an identity 4x4 matrix
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix4 identity( );
+
+    // Construct a 4x4 matrix to rotate around the x axis
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix4 rotationX( float radians );
+
+    // Construct a 4x4 matrix to rotate around the y axis
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix4 rotationY( float radians );
+
+    // Construct a 4x4 matrix to rotate around the z axis
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix4 rotationZ( float radians );
+
+    // Construct a 4x4 matrix to rotate around the x axis (scalar data contained in vector data type)
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix4 rotationX( const floatInVec &radians );
+
+    // Construct a 4x4 matrix to rotate around the y axis (scalar data contained in vector data type)
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix4 rotationY( const floatInVec &radians );
+
+    // Construct a 4x4 matrix to rotate around the z axis (scalar data contained in vector data type)
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix4 rotationZ( const floatInVec &radians );
+
+    // Construct a 4x4 matrix to rotate around the x, y, and z axes
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix4 rotationZYX( const Vector3 &radiansXYZ );
+
+    // Construct a 4x4 matrix to rotate around a unit-length 3-D vector
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix4 rotation( float radians, const Vector3 &unitVec );
+
+    // Construct a 4x4 matrix to rotate around a unit-length 3-D vector (scalar data contained in vector data type)
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix4 rotation( const floatInVec &radians, const Vector3 &unitVec );
+
+    // Construct a rotation matrix from a unit-length quaternion
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix4 rotation( const Quat &unitQuat );
+
+    // Construct a 4x4 matrix to perform scaling
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix4 scale( const Vector3 &scaleVec );
+
+    // Construct a 4x4 matrix to perform translation
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix4 translation( const Vector3 &translateVec );
+
+    // Construct viewing matrix based on eye, position looked at, and up direction
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix4 lookAt( const Point3 &eyePos, const Point3 &lookAtPos, const Vector3 &upVec );
+
+    // Construct a perspective projection matrix
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix4 perspective( float fovyRadians, float aspect, float zNear, float zFar );
+
+    // Construct a perspective projection matrix based on frustum
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix4 frustum( float left, float right, float bottom, float top, float zNear, float zFar );
+
+    // Construct an orthographic projection matrix
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix4 orthographic( float left, float right, float bottom, float top, float zNear, float zFar );
+
+};
+// Multiply a 4x4 matrix by a scalar
+// 
+VECTORMATH_FORCE_INLINE const Matrix4 operator *( float scalar, const Matrix4 & mat );
+
+// Multiply a 4x4 matrix by a scalar (scalar data contained in vector data type)
+// 
+VECTORMATH_FORCE_INLINE const Matrix4 operator *( const floatInVec &scalar, const Matrix4 & mat );
+
+// Append (post-multiply) a scale transformation to a 4x4 matrix
+// NOTE: 
+// Faster than creating and multiplying a scale transformation matrix.
+// 
+VECTORMATH_FORCE_INLINE const Matrix4 appendScale( const Matrix4 & mat, const Vector3 &scaleVec );
+
+// Prepend (pre-multiply) a scale transformation to a 4x4 matrix
+// NOTE: 
+// Faster than creating and multiplying a scale transformation matrix.
+// 
+VECTORMATH_FORCE_INLINE const Matrix4 prependScale( const Vector3 &scaleVec, const Matrix4 & mat );
+
+// Multiply two 4x4 matrices per element
+// 
+VECTORMATH_FORCE_INLINE const Matrix4 mulPerElem( const Matrix4 & mat0, const Matrix4 & mat1 );
+
+// Compute the absolute value of a 4x4 matrix per element
+// 
+VECTORMATH_FORCE_INLINE const Matrix4 absPerElem( const Matrix4 & mat );
+
+// Transpose of a 4x4 matrix
+// 
+VECTORMATH_FORCE_INLINE const Matrix4 transpose( const Matrix4 & mat );
+
+// Compute the inverse of a 4x4 matrix
+// NOTE: 
+// Result is unpredictable when the determinant of mat is equal to or near 0.
+// 
+VECTORMATH_FORCE_INLINE const Matrix4 inverse( const Matrix4 & mat );
+
+// Compute the inverse of a 4x4 matrix, which is expected to be an affine matrix
+// NOTE: 
+// This can be used to achieve better performance than a general inverse when the specified 4x4 matrix meets the given restrictions.  The result is unpredictable when the determinant of mat is equal to or near 0.
+// 
+VECTORMATH_FORCE_INLINE const Matrix4 affineInverse( const Matrix4 & mat );
+
+// Compute the inverse of a 4x4 matrix, which is expected to be an affine matrix with an orthogonal upper-left 3x3 submatrix
+// NOTE: 
+// This can be used to achieve better performance than a general inverse when the specified 4x4 matrix meets the given restrictions.
+// 
+VECTORMATH_FORCE_INLINE const Matrix4 orthoInverse( const Matrix4 & mat );
+
+// Determinant of a 4x4 matrix
+// 
+VECTORMATH_FORCE_INLINE const floatInVec determinant( const Matrix4 & mat );
+
+// Conditionally select between two 4x4 matrices
+// NOTE: 
+// This function uses a conditional select instruction to avoid a branch.
+// However, the transfer of select1 to a VMX register may use more processing time than a branch.
+// Use the boolInVec version for better performance.
+// 
+VECTORMATH_FORCE_INLINE const Matrix4 select( const Matrix4 & mat0, const Matrix4 & mat1, bool select1 );
+
+// Conditionally select between two 4x4 matrices (scalar data contained in vector data type)
+// NOTE: 
+// This function uses a conditional select instruction to avoid a branch.
+// 
+VECTORMATH_FORCE_INLINE const Matrix4 select( const Matrix4 & mat0, const Matrix4 & mat1, const boolInVec &select1 );
+
+#ifdef _VECTORMATH_DEBUG
+
+// Print a 4x4 matrix
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+VECTORMATH_FORCE_INLINE void print( const Matrix4 & mat );
+
+// Print a 4x4 matrix and an associated string identifier
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+VECTORMATH_FORCE_INLINE void print( const Matrix4 & mat, const char * name );
+
+#endif
+
+// A 3x4 transformation matrix in array-of-structures format
+//
+class Transform3
+{
+    Vector3 mCol0;
+    Vector3 mCol1;
+    Vector3 mCol2;
+    Vector3 mCol3;
+
+public:
+    // Default constructor; does no initialization
+    // 
+    VECTORMATH_FORCE_INLINE Transform3( ) { };
+
+    // Copy a 3x4 transformation matrix
+    // 
+    VECTORMATH_FORCE_INLINE Transform3( const Transform3 & tfrm );
+
+    // Construct a 3x4 transformation matrix containing the specified columns
+    // 
+    VECTORMATH_FORCE_INLINE Transform3( const Vector3 &col0, const Vector3 &col1, const Vector3 &col2, const Vector3 &col3 );
+
+    // Construct a 3x4 transformation matrix from a 3x3 matrix and a 3-D vector
+    // 
+    VECTORMATH_FORCE_INLINE Transform3( const Matrix3 & tfrm, const Vector3 &translateVec );
+
+    // Construct a 3x4 transformation matrix from a unit-length quaternion and a 3-D vector
+    // 
+    VECTORMATH_FORCE_INLINE Transform3( const Quat &unitQuat, const Vector3 &translateVec );
+
+    // Set all elements of a 3x4 transformation matrix to the same scalar value
+    // 
+    explicit VECTORMATH_FORCE_INLINE Transform3( float scalar );
+
+    // Set all elements of a 3x4 transformation matrix to the same scalar value (scalar data contained in vector data type)
+    // 
+    explicit VECTORMATH_FORCE_INLINE Transform3( const floatInVec &scalar );
+
+    // Assign one 3x4 transformation matrix to another
+    // 
+    VECTORMATH_FORCE_INLINE Transform3 & operator =( const Transform3 & tfrm );
+
+    // Set the upper-left 3x3 submatrix
+    // 
+    VECTORMATH_FORCE_INLINE Transform3 & setUpper3x3( const Matrix3 & mat3 );
+
+    // Get the upper-left 3x3 submatrix of a 3x4 transformation matrix
+    // 
+    VECTORMATH_FORCE_INLINE const Matrix3 getUpper3x3( ) const;
+
+    // Set translation component
+    // 
+    VECTORMATH_FORCE_INLINE Transform3 & setTranslation( const Vector3 &translateVec );
+
+    // Get the translation component of a 3x4 transformation matrix
+    // 
+    VECTORMATH_FORCE_INLINE const Vector3 getTranslation( ) const;
+
+    // Set column 0 of a 3x4 transformation matrix
+    // 
+    VECTORMATH_FORCE_INLINE Transform3 & setCol0( const Vector3 &col0 );
+
+    // Set column 1 of a 3x4 transformation matrix
+    // 
+    VECTORMATH_FORCE_INLINE Transform3 & setCol1( const Vector3 &col1 );
+
+    // Set column 2 of a 3x4 transformation matrix
+    // 
+    VECTORMATH_FORCE_INLINE Transform3 & setCol2( const Vector3 &col2 );
+
+    // Set column 3 of a 3x4 transformation matrix
+    // 
+    VECTORMATH_FORCE_INLINE Transform3 & setCol3( const Vector3 &col3 );
+
+    // Get column 0 of a 3x4 transformation matrix
+    // 
+    VECTORMATH_FORCE_INLINE const Vector3 getCol0( ) const;
+
+    // Get column 1 of a 3x4 transformation matrix
+    // 
+    VECTORMATH_FORCE_INLINE const Vector3 getCol1( ) const;
+
+    // Get column 2 of a 3x4 transformation matrix
+    // 
+    VECTORMATH_FORCE_INLINE const Vector3 getCol2( ) const;
+
+    // Get column 3 of a 3x4 transformation matrix
+    // 
+    VECTORMATH_FORCE_INLINE const Vector3 getCol3( ) const;
+
+    // Set the column of a 3x4 transformation matrix referred to by the specified index
+    // 
+    VECTORMATH_FORCE_INLINE Transform3 & setCol( int col, const Vector3 &vec );
+
+    // Set the row of a 3x4 transformation matrix referred to by the specified index
+    // 
+    VECTORMATH_FORCE_INLINE Transform3 & setRow( int row, const Vector4 &vec );
+
+    // Get the column of a 3x4 transformation matrix referred to by the specified index
+    // 
+    VECTORMATH_FORCE_INLINE const Vector3 getCol( int col ) const;
+
+    // Get the row of a 3x4 transformation matrix referred to by the specified index
+    // 
+    VECTORMATH_FORCE_INLINE const Vector4 getRow( int row ) const;
+
+    // Subscripting operator to set or get a column
+    // 
+    VECTORMATH_FORCE_INLINE Vector3 & operator []( int col );
+
+    // Subscripting operator to get a column
+    // 
+    VECTORMATH_FORCE_INLINE const Vector3 operator []( int col ) const;
+
+    // Set the element of a 3x4 transformation matrix referred to by column and row indices
+    // 
+    VECTORMATH_FORCE_INLINE Transform3 & setElem( int col, int row, float val );
+
+    // Set the element of a 3x4 transformation matrix referred to by column and row indices (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Transform3 & setElem( int col, int row, const floatInVec &val );
+
+    // Get the element of a 3x4 transformation matrix referred to by column and row indices
+    // 
+    VECTORMATH_FORCE_INLINE const floatInVec getElem( int col, int row ) const;
+
+    // Multiply a 3x4 transformation matrix by a 3-D vector
+    // 
+    VECTORMATH_FORCE_INLINE const Vector3 operator *( const Vector3 &vec ) const;
+
+    // Multiply a 3x4 transformation matrix by a 3-D point
+    // 
+    VECTORMATH_FORCE_INLINE const Point3 operator *( const Point3 &pnt ) const;
+
+    // Multiply two 3x4 transformation matrices
+    // 
+    VECTORMATH_FORCE_INLINE const Transform3 operator *( const Transform3 & tfrm ) const;
+
+    // Perform compound assignment and multiplication by a 3x4 transformation matrix
+    // 
+    VECTORMATH_FORCE_INLINE Transform3 & operator *=( const Transform3 & tfrm );
+
+    // Construct an identity 3x4 transformation matrix
+    // 
+    static VECTORMATH_FORCE_INLINE const Transform3 identity( );
+
+    // Construct a 3x4 transformation matrix to rotate around the x axis
+    // 
+    static VECTORMATH_FORCE_INLINE const Transform3 rotationX( float radians );
+
+    // Construct a 3x4 transformation matrix to rotate around the y axis
+    // 
+    static VECTORMATH_FORCE_INLINE const Transform3 rotationY( float radians );
+
+    // Construct a 3x4 transformation matrix to rotate around the z axis
+    // 
+    static VECTORMATH_FORCE_INLINE const Transform3 rotationZ( float radians );
+
+    // Construct a 3x4 transformation matrix to rotate around the x axis (scalar data contained in vector data type)
+    // 
+    static VECTORMATH_FORCE_INLINE const Transform3 rotationX( const floatInVec &radians );
+
+    // Construct a 3x4 transformation matrix to rotate around the y axis (scalar data contained in vector data type)
+    // 
+    static VECTORMATH_FORCE_INLINE const Transform3 rotationY( const floatInVec &radians );
+
+    // Construct a 3x4 transformation matrix to rotate around the z axis (scalar data contained in vector data type)
+    // 
+    static VECTORMATH_FORCE_INLINE const Transform3 rotationZ( const floatInVec &radians );
+
+    // Construct a 3x4 transformation matrix to rotate around the x, y, and z axes
+    // 
+    static VECTORMATH_FORCE_INLINE const Transform3 rotationZYX( const Vector3 &radiansXYZ );
+
+    // Construct a 3x4 transformation matrix to rotate around a unit-length 3-D vector
+    // 
+    static VECTORMATH_FORCE_INLINE const Transform3 rotation( float radians, const Vector3 &unitVec );
+
+    // Construct a 3x4 transformation matrix to rotate around a unit-length 3-D vector (scalar data contained in vector data type)
+    // 
+    static VECTORMATH_FORCE_INLINE const Transform3 rotation( const floatInVec &radians, const Vector3 &unitVec );
+
+    // Construct a rotation matrix from a unit-length quaternion
+    // 
+    static VECTORMATH_FORCE_INLINE const Transform3 rotation( const Quat &unitQuat );
+
+    // Construct a 3x4 transformation matrix to perform scaling
+    // 
+    static VECTORMATH_FORCE_INLINE const Transform3 scale( const Vector3 &scaleVec );
+
+    // Construct a 3x4 transformation matrix to perform translation
+    // 
+    static VECTORMATH_FORCE_INLINE const Transform3 translation( const Vector3 &translateVec );
+
+};
+// Append (post-multiply) a scale transformation to a 3x4 transformation matrix
+// NOTE: 
+// Faster than creating and multiplying a scale transformation matrix.
+// 
+VECTORMATH_FORCE_INLINE const Transform3 appendScale( const Transform3 & tfrm, const Vector3 &scaleVec );
+
+// Prepend (pre-multiply) a scale transformation to a 3x4 transformation matrix
+// NOTE: 
+// Faster than creating and multiplying a scale transformation matrix.
+// 
+VECTORMATH_FORCE_INLINE const Transform3 prependScale( const Vector3 &scaleVec, const Transform3 & tfrm );
+
+// Multiply two 3x4 transformation matrices per element
+// 
+VECTORMATH_FORCE_INLINE const Transform3 mulPerElem( const Transform3 & tfrm0, const Transform3 & tfrm1 );
+
+// Compute the absolute value of a 3x4 transformation matrix per element
+// 
+VECTORMATH_FORCE_INLINE const Transform3 absPerElem( const Transform3 & tfrm );
+
+// Inverse of a 3x4 transformation matrix
+// NOTE: 
+// Result is unpredictable when the determinant of the left 3x3 submatrix is equal to or near 0.
+// 
+VECTORMATH_FORCE_INLINE const Transform3 inverse( const Transform3 & tfrm );
+
+// Compute the inverse of a 3x4 transformation matrix, expected to have an orthogonal upper-left 3x3 submatrix
+// NOTE: 
+// This can be used to achieve better performance than a general inverse when the specified 3x4 transformation matrix meets the given restrictions.
+// 
+VECTORMATH_FORCE_INLINE const Transform3 orthoInverse( const Transform3 & tfrm );
+
+// Conditionally select between two 3x4 transformation matrices
+// NOTE: 
+// This function uses a conditional select instruction to avoid a branch.
+// However, the transfer of select1 to a VMX register may use more processing time than a branch.
+// Use the boolInVec version for better performance.
+// 
+VECTORMATH_FORCE_INLINE const Transform3 select( const Transform3 & tfrm0, const Transform3 & tfrm1, bool select1 );
+
+// Conditionally select between two 3x4 transformation matrices (scalar data contained in vector data type)
+// NOTE: 
+// This function uses a conditional select instruction to avoid a branch.
+// 
+VECTORMATH_FORCE_INLINE const Transform3 select( const Transform3 & tfrm0, const Transform3 & tfrm1, const boolInVec &select1 );
+
+#ifdef _VECTORMATH_DEBUG
+
+// Print a 3x4 transformation matrix
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+VECTORMATH_FORCE_INLINE void print( const Transform3 & tfrm );
+
+// Print a 3x4 transformation matrix and an associated string identifier
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+VECTORMATH_FORCE_INLINE void print( const Transform3 & tfrm, const char * name );
+
+#endif
+
+} // namespace Aos
+} // namespace Vectormath
+
+#include "vec_aos.h"
+#include "quat_aos.h"
+#include "mat_aos.h"
+
+#endif
diff --git a/src/vectormath/vmInclude.h b/src/vectormath/vmInclude.h
index 63a84998c..656514e42 100644
--- a/src/vectormath/vmInclude.h
+++ b/src/vectormath/vmInclude.h
@@ -1,27 +1,31 @@
-
-#ifndef __VM_INCLUDE_H
-#define __VM_INCLUDE_H
-
-#include "LinearMath/btScalar.h"
-
-#if defined (USE_SYSTEM_VECTORMATH) || defined (__CELLOS_LV2__)
-	#include <vectormath_aos.h>
-#else //(USE_SYSTEM_VECTORMATH)
-	#if defined (BT_USE_SSE) && defined (_WIN32)
-		#include "sse/vectormath_aos.h"
-	#else //all other platforms
-		#include "scalar/vectormath_aos.h"
-	#endif //(BT_USE_SSE) && defined (_WIN32)
-#endif //(USE_SYSTEM_VECTORMATH)
-
-
-
-typedef Vectormath::Aos::Vector3    vmVector3;
-typedef Vectormath::Aos::Quat       vmQuat;
-typedef Vectormath::Aos::Matrix3    vmMatrix3;
-typedef Vectormath::Aos::Transform3 vmTransform3;
-typedef Vectormath::Aos::Point3     vmPoint3;
-
-#endif //__VM_INCLUDE_H
-
-
+
+#ifndef __VM_INCLUDE_H
+#define __VM_INCLUDE_H
+
+#include "LinearMath/btScalar.h"
+
+#if defined (USE_SYSTEM_VECTORMATH) || defined (__CELLOS_LV2__)
+	#include <vectormath_aos.h>
+#else //(USE_SYSTEM_VECTORMATH)
+	#if defined (BT_USE_SSE) 
+		#include "sse/vectormath_aos.h"
+	#else //all other platforms
+        #if defined (BT_USE_NEON)
+            #include "neon/vectormath_aos.h"
+        #else
+            #include "scalar/vectormath_aos.h"
+        #endif
+	#endif //(BT_USE_SSE) && defined (_WIN32)
+#endif //(USE_SYSTEM_VECTORMATH)
+
+
+
+typedef Vectormath::Aos::Vector3    vmVector3;
+typedef Vectormath::Aos::Quat       vmQuat;
+typedef Vectormath::Aos::Matrix3    vmMatrix3;
+typedef Vectormath::Aos::Transform3 vmTransform3;
+typedef Vectormath::Aos::Point3     vmPoint3;
+
+#endif //__VM_INCLUDE_H
+
+