From 074b869a9f463b14f309922d532be104d43f18ec Mon Sep 17 00:00:00 2001
From: Erwin Coumans <erwin.coumans@gmail.com>
Date: Tue, 12 May 2015 09:17:27 -0700
Subject: [PATCH] enable premake4 option --ios and Test_LinearMath usage:
 ./premake4_osx --ios xcode4 open xcode4ios/0_Bullet3Solution.xcworkspace

---
 build3/premake4.lua                           |   51 +-
 .../Source/Tests/Test_quat_aos_neon.cpp       |    2 +-
 test/Bullet2/premake4.lua                     |    7 +-
 test/Bullet2/vectormath/neon/boolInVec.h      |  226 ++
 test/Bullet2/vectormath/neon/floatInVec.h     |  344 +++
 test/Bullet2/vectormath/neon/mat_aos.h        | 1631 +++++++++++
 test/Bullet2/vectormath/neon/quat_aos.h       |  413 +++
 test/Bullet2/vectormath/neon/vec_aos.h        | 1427 +++++++++
 test/Bullet2/vectormath/neon/vectormath_aos.h | 1890 ++++++++++++
 test/Bullet2/vectormath/scalar/boolInVec.h    |  225 ++
 test/Bullet2/vectormath/scalar/floatInVec.h   |  343 +++
 test/Bullet2/vectormath/scalar/mat_aos.h      | 1630 +++++++++++
 test/Bullet2/vectormath/scalar/quat_aos.h     |  433 +++
 test/Bullet2/vectormath/scalar/vec_aos.h      | 1426 +++++++++
 .../vectormath/scalar/vectormath_aos.h        | 1872 ++++++++++++
 test/Bullet2/vectormath/sse/boolInVec.h       |  247 ++
 test/Bullet2/vectormath/sse/floatInVec.h      |  340 +++
 test/Bullet2/vectormath/sse/mat_aos.h         | 2190 ++++++++++++++
 test/Bullet2/vectormath/sse/quat_aos.h        |  579 ++++
 test/Bullet2/vectormath/sse/vec_aos.h         | 1455 ++++++++++
 test/Bullet2/vectormath/sse/vecidx_aos.h      |   80 +
 test/Bullet2/vectormath/sse/vectormath_aos.h  | 2547 +++++++++++++++++
 test/Bullet2/vectormath/vmInclude.h           |   31 +
 test/collision/premake4.lua                   |    2 +-
 24 files changed, 19370 insertions(+), 21 deletions(-)
 create mode 100644 test/Bullet2/vectormath/neon/boolInVec.h
 create mode 100644 test/Bullet2/vectormath/neon/floatInVec.h
 create mode 100644 test/Bullet2/vectormath/neon/mat_aos.h
 create mode 100644 test/Bullet2/vectormath/neon/quat_aos.h
 create mode 100644 test/Bullet2/vectormath/neon/vec_aos.h
 create mode 100644 test/Bullet2/vectormath/neon/vectormath_aos.h
 create mode 100644 test/Bullet2/vectormath/scalar/boolInVec.h
 create mode 100644 test/Bullet2/vectormath/scalar/floatInVec.h
 create mode 100644 test/Bullet2/vectormath/scalar/mat_aos.h
 create mode 100644 test/Bullet2/vectormath/scalar/quat_aos.h
 create mode 100644 test/Bullet2/vectormath/scalar/vec_aos.h
 create mode 100644 test/Bullet2/vectormath/scalar/vectormath_aos.h
 create mode 100644 test/Bullet2/vectormath/sse/boolInVec.h
 create mode 100644 test/Bullet2/vectormath/sse/floatInVec.h
 create mode 100644 test/Bullet2/vectormath/sse/mat_aos.h
 create mode 100644 test/Bullet2/vectormath/sse/quat_aos.h
 create mode 100644 test/Bullet2/vectormath/sse/vec_aos.h
 create mode 100644 test/Bullet2/vectormath/sse/vecidx_aos.h
 create mode 100644 test/Bullet2/vectormath/sse/vectormath_aos.h
 create mode 100644 test/Bullet2/vectormath/vmInclude.h

diff --git a/build3/premake4.lua b/build3/premake4.lua
index 53c087357..6b7a33056 100644
--- a/build3/premake4.lua
+++ b/build3/premake4.lua
@@ -18,6 +18,11 @@
         act = _ACTION
     end
 
+	  newoption {
+    		trigger     = "ios",
+    		description = "Enable iOS target (requires xcode4)"
+	}
+
 	newoption
         {
                 trigger = "force_dlopen_opengl",
@@ -95,12 +100,25 @@
 	postfix=""
 
 	if _ACTION == "xcode4" then
+			 if _OPTIONS["ios"] then
+                        postfix = "ios";
+                        xcodebuildsettings
+                        {
+                                'INFOPLIST_FILE = "../../test/Bullet2/Info.plist"',
+                                'CODE_SIGN_IDENTITY = "iPhone Developer"',
+                                "SDKROOT = iphoneos",
+                                'ARCHS = "armv7"',
+                                'TARGETED_DEVICE_FAMILY = "1,2"',
+                                'VALID_ARCHS = "armv7"',
+                        }       
+			else
 			xcodebuildsettings
 			{
         		'ARCHS = "$(ARCHS_STANDARD_32_BIT) $(ARCHS_STANDARD_64_BIT)"',
         		'VALID_ARCHS = "x86_64 i386"',
 			'SDKROOT = "macosx10.9"',
 			}
+			end
 	end
 
 -- comment-out for now, URDF reader needs exceptions
@@ -123,6 +141,8 @@
 
 	language "C++"
 
+if not _OPTIONS["ios"] then
+
 	include "../examples/ExampleBrowser"
 	include "../examples/OpenGLWindow"
 	
@@ -137,26 +157,25 @@
 		include "../test/enet/server"	
 	end
 	
-	if not _OPTIONS["without-gtest"] then
-		include "../test/gtest-1.7.0"
---		include "../test/hello_gtest"
-		include "../test/collision"
-		include "../test/TestBullet3OpenCL"
-		include "../test/GwenOpenGLTest"
-	end
-	
-	
-	include "../src/BulletSoftBody"
-	include "../src/BulletDynamics"
-	include "../src/BulletCollision"
-	include "../src/LinearMath"
-	
 	include "../src/Bullet3Common"
 	include "../src/Bullet3Geometry"
 	include "../src/Bullet3Collision"
 	include "../src/Bullet3Dynamics"
 	include "../src/Bullet3OpenCL"
 	include "../src/Bullet3Serialize/Bullet2FileLoader"
-	
-	
+
+ 	if not _OPTIONS["without-gtest"] then
+                include "../test/gtest-1.7.0"
+--              include "../test/hello_gtest"
+                include "../test/collision"
+                include "../test/TestBullet3OpenCL"
+                include "../test/GwenOpenGLTest"
+        end
+end
+
+	include "../test/Bullet2"	
+ 	include "../src/BulletSoftBody"
+        include "../src/BulletDynamics"
+        include "../src/BulletCollision"
+        include "../src/LinearMath"
 	
diff --git a/test/Bullet2/Source/Tests/Test_quat_aos_neon.cpp b/test/Bullet2/Source/Tests/Test_quat_aos_neon.cpp
index c13e41aeb..0a8f69724 100644
--- a/test/Bullet2/Source/Tests/Test_quat_aos_neon.cpp
+++ b/test/Bullet2/Source/Tests/Test_quat_aos_neon.cpp
@@ -14,7 +14,7 @@
 #include "Utils.h"
 #include "main.h"
 
-#include <vectormath/vmInclude.h>
+#include "../vectormath/vmInclude.h"
 
 
 //typedef Vectormath::Aos::Vector3    vmVector3;
diff --git a/test/Bullet2/premake4.lua b/test/Bullet2/premake4.lua
index bca0cb5dc..7d6a77111 100644
--- a/test/Bullet2/premake4.lua
+++ b/test/Bullet2/premake4.lua
@@ -1,14 +1,15 @@
 
-project "AppUnitTest"
+project "Test_LinearMath"
 
 if _OPTIONS["ios"] then
 	kind "WindowedApp"
 else	
 	kind "ConsoleApp"
 end
-targetdir "bin"
 
-includedirs {"../src","Source", "Source/Tests"}
+targetdir "../../bin"
+
+includedirs {"../../src","Source", "Source/Tests"}
 
 links {
 	"BulletDynamics","BulletCollision", "LinearMath"
diff --git a/test/Bullet2/vectormath/neon/boolInVec.h b/test/Bullet2/vectormath/neon/boolInVec.h
new file mode 100644
index 000000000..ba16838c0
--- /dev/null
+++ b/test/Bullet2/vectormath/neon/boolInVec.h
@@ -0,0 +1,226 @@
+/*
+   Copyright (C) 2009 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+
+*/
+
+#ifndef _BOOLINVEC_H
+#define _BOOLINVEC_H
+
+#include <math.h>
+namespace Vectormath {
+
+class floatInVec;
+
+//--------------------------------------------------------------------------------------------------
+// boolInVec class
+//
+
+class boolInVec
+{
+private:
+    unsigned int mData;
+
+public:
+    // Default constructor; does no initialization
+    //
+    inline boolInVec( ) { };
+
+    // Construct from a value converted from float
+    //
+    inline boolInVec(floatInVec vec);
+
+    // Explicit cast from bool
+    //
+    explicit inline boolInVec(bool scalar);
+
+    // Explicit cast to bool
+    //
+    inline bool getAsBool() const;
+
+#ifndef _VECTORMATH_NO_SCALAR_CAST
+    // Implicit cast to bool
+    //
+    inline operator bool() const;
+#endif
+
+    // Boolean negation operator
+    //
+    inline const boolInVec operator ! () const;
+
+    // Assignment operator
+    //
+    inline boolInVec& operator = (boolInVec vec);
+
+    // Boolean and assignment operator
+    //
+    inline boolInVec& operator &= (boolInVec vec);
+
+    // Boolean exclusive or assignment operator
+    //
+    inline boolInVec& operator ^= (boolInVec vec);
+
+    // Boolean or assignment operator
+    //
+    inline boolInVec& operator |= (boolInVec vec);
+
+};
+
+// Equal operator
+//
+inline const boolInVec operator == (boolInVec vec0, boolInVec vec1);
+
+// Not equal operator
+//
+inline const boolInVec operator != (boolInVec vec0, boolInVec vec1);
+
+// And operator
+//
+inline const boolInVec operator & (boolInVec vec0, boolInVec vec1);
+
+// Exclusive or operator
+//
+inline const boolInVec operator ^ (boolInVec vec0, boolInVec vec1);
+
+// Or operator
+//
+inline const boolInVec operator | (boolInVec vec0, boolInVec vec1);
+
+// Conditionally select between two values
+//
+inline const boolInVec select(boolInVec vec0, boolInVec vec1, boolInVec select_vec1);
+
+
+} // namespace Vectormath
+
+
+//--------------------------------------------------------------------------------------------------
+// boolInVec implementation
+//
+
+#include "floatInVec.h"
+
+namespace Vectormath {
+
+inline
+boolInVec::boolInVec(floatInVec vec)
+{
+    *this = (vec != floatInVec(0.0f));
+}
+
+inline
+boolInVec::boolInVec(bool scalar)
+{
+    mData = -(int)scalar;
+}
+
+inline
+bool
+boolInVec::getAsBool() const
+{
+    return (mData > 0);
+}
+
+#ifndef _VECTORMATH_NO_SCALAR_CAST
+inline
+boolInVec::operator bool() const
+{
+    return getAsBool();
+}
+#endif
+
+inline
+const boolInVec
+boolInVec::operator ! () const
+{
+    return boolInVec(!mData);
+}
+
+inline
+boolInVec&
+boolInVec::operator = (boolInVec vec)
+{
+    mData = vec.mData;
+    return *this;
+}
+
+inline
+boolInVec&
+boolInVec::operator &= (boolInVec vec)
+{
+    *this = *this & vec;
+    return *this;
+}
+
+inline
+boolInVec&
+boolInVec::operator ^= (boolInVec vec)
+{
+    *this = *this ^ vec;
+    return *this;
+}
+
+inline
+boolInVec&
+boolInVec::operator |= (boolInVec vec)
+{
+    *this = *this | vec;
+    return *this;
+}
+
+inline
+const boolInVec
+operator == (boolInVec vec0, boolInVec vec1)
+{
+    return boolInVec(vec0.getAsBool() == vec1.getAsBool());
+}
+
+inline
+const boolInVec
+operator != (boolInVec vec0, boolInVec vec1)
+{
+    return !(vec0 == vec1);
+}
+
+inline
+const boolInVec
+operator & (boolInVec vec0, boolInVec vec1)
+{
+    return boolInVec(vec0.getAsBool() & vec1.getAsBool());
+}
+
+inline
+const boolInVec
+operator | (boolInVec vec0, boolInVec vec1)
+{
+    return boolInVec(vec0.getAsBool() | vec1.getAsBool());
+}
+
+inline
+const boolInVec
+operator ^ (boolInVec vec0, boolInVec vec1)
+{
+    return boolInVec(vec0.getAsBool() ^ vec1.getAsBool());
+}
+
+inline
+const boolInVec
+select(boolInVec vec0, boolInVec vec1, boolInVec select_vec1)
+{
+    return (select_vec1.getAsBool() == 0) ? vec0 : vec1;
+}
+
+} // namespace Vectormath
+
+#endif // boolInVec_h
+
diff --git a/test/Bullet2/vectormath/neon/floatInVec.h b/test/Bullet2/vectormath/neon/floatInVec.h
new file mode 100644
index 000000000..26147d22b
--- /dev/null
+++ b/test/Bullet2/vectormath/neon/floatInVec.h
@@ -0,0 +1,344 @@
+/*
+   Copyright (C) 2009 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+
+*/
+#ifndef _FLOATINVEC_H
+#define _FLOATINVEC_H
+
+#include <math.h>
+namespace Vectormath {
+
+class boolInVec;
+
+//--------------------------------------------------------------------------------------------------
+// floatInVec class
+//
+
+// A class representing a scalar float value contained in a vector register
+// This class does not support fastmath
+class floatInVec
+{
+private:
+    float mData;
+
+public:
+    // Default constructor; does no initialization
+    //
+    inline floatInVec( ) { };
+
+    // Construct from a value converted from bool
+    //
+    inline floatInVec(boolInVec vec);
+
+    // Explicit cast from float
+    //
+    explicit inline floatInVec(float scalar);
+
+    // Explicit cast to float
+    //
+    inline float getAsFloat() const;
+
+#ifndef _VECTORMATH_NO_SCALAR_CAST
+    // Implicit cast to float
+    //
+    inline operator float() const;
+#endif
+
+    // Post increment (add 1.0f)
+    //
+    inline const floatInVec operator ++ (int);
+
+    // Post decrement (subtract 1.0f)
+    //
+    inline const floatInVec operator -- (int);
+
+    // Pre increment (add 1.0f)
+    //
+    inline floatInVec& operator ++ ();
+
+    // Pre decrement (subtract 1.0f)
+    //
+    inline floatInVec& operator -- ();
+
+    // Negation operator
+    //
+    inline const floatInVec operator - () const;
+
+    // Assignment operator
+    //
+    inline floatInVec& operator = (floatInVec vec);
+
+    // Multiplication assignment operator
+    //
+    inline floatInVec& operator *= (floatInVec vec);
+
+    // Division assignment operator
+    //
+    inline floatInVec& operator /= (floatInVec vec);
+
+    // Addition assignment operator
+    //
+    inline floatInVec& operator += (floatInVec vec);
+
+    // Subtraction assignment operator
+    //
+    inline floatInVec& operator -= (floatInVec vec);
+
+};
+
+// Multiplication operator
+//
+inline const floatInVec operator * (floatInVec vec0, floatInVec vec1);
+
+// Division operator
+//
+inline const floatInVec operator / (floatInVec vec0, floatInVec vec1);
+
+// Addition operator
+//
+inline const floatInVec operator + (floatInVec vec0, floatInVec vec1);
+
+// Subtraction operator
+//
+inline const floatInVec operator - (floatInVec vec0, floatInVec vec1);
+
+// Less than operator
+//
+inline const boolInVec operator < (floatInVec vec0, floatInVec vec1);
+
+// Less than or equal operator
+//
+inline const boolInVec operator <= (floatInVec vec0, floatInVec vec1);
+
+// Greater than operator
+//
+inline const boolInVec operator > (floatInVec vec0, floatInVec vec1);
+
+// Greater than or equal operator
+//
+inline const boolInVec operator >= (floatInVec vec0, floatInVec vec1);
+
+// Equal operator
+//
+inline const boolInVec operator == (floatInVec vec0, floatInVec vec1);
+
+// Not equal operator
+//
+inline const boolInVec operator != (floatInVec vec0, floatInVec vec1);
+
+// Conditionally select between two values
+//
+inline const floatInVec select(floatInVec vec0, floatInVec vec1, boolInVec select_vec1);
+
+
+} // namespace Vectormath
+
+
+//--------------------------------------------------------------------------------------------------
+// floatInVec implementation
+//
+
+#include "boolInVec.h"
+
+namespace Vectormath {
+
+inline
+floatInVec::floatInVec(boolInVec vec)
+{
+    mData = float(vec.getAsBool());
+}
+
+inline
+floatInVec::floatInVec(float scalar)
+{
+    mData = scalar;
+}
+
+inline
+float
+floatInVec::getAsFloat() const
+{
+    return mData;
+}
+
+#ifndef _VECTORMATH_NO_SCALAR_CAST
+inline
+floatInVec::operator float() const
+{
+    return getAsFloat();
+}
+#endif
+
+inline
+const floatInVec
+floatInVec::operator ++ (int)
+{
+    float olddata = mData;
+    operator ++();
+    return floatInVec(olddata);
+}
+
+inline
+const floatInVec
+floatInVec::operator -- (int)
+{
+    float olddata = mData;
+    operator --();
+    return floatInVec(olddata);
+}
+
+inline
+floatInVec&
+floatInVec::operator ++ ()
+{
+    *this += floatInVec(1.0f);
+    return *this;
+}
+
+inline
+floatInVec&
+floatInVec::operator -- ()
+{
+    *this -= floatInVec(1.0f);
+    return *this;
+}
+
+inline
+const floatInVec
+floatInVec::operator - () const
+{
+    return floatInVec(-mData);
+}
+
+inline
+floatInVec&
+floatInVec::operator = (floatInVec vec)
+{
+    mData = vec.mData;
+    return *this;
+}
+
+inline
+floatInVec&
+floatInVec::operator *= (floatInVec vec)
+{
+    *this = *this * vec;
+    return *this;
+}
+
+inline
+floatInVec&
+floatInVec::operator /= (floatInVec vec)
+{
+    *this = *this / vec;
+    return *this;
+}
+
+inline
+floatInVec&
+floatInVec::operator += (floatInVec vec)
+{
+    *this = *this + vec;
+    return *this;
+}
+
+inline
+floatInVec&
+floatInVec::operator -= (floatInVec vec)
+{
+    *this = *this - vec;
+    return *this;
+}
+
+inline
+const floatInVec
+operator * (floatInVec vec0, floatInVec vec1)
+{
+    return floatInVec(vec0.getAsFloat() * vec1.getAsFloat());
+}
+
+inline
+const floatInVec
+operator / (floatInVec num, floatInVec den)
+{
+    return floatInVec(num.getAsFloat() / den.getAsFloat());
+}
+
+inline
+const floatInVec
+operator + (floatInVec vec0, floatInVec vec1)
+{
+    return floatInVec(vec0.getAsFloat() + vec1.getAsFloat());
+}
+
+inline
+const floatInVec
+operator - (floatInVec vec0, floatInVec vec1)
+{
+    return floatInVec(vec0.getAsFloat() - vec1.getAsFloat());
+}
+
+inline
+const boolInVec
+operator < (floatInVec vec0, floatInVec vec1)
+{
+    return boolInVec(vec0.getAsFloat() < vec1.getAsFloat());
+}
+
+inline
+const boolInVec
+operator <= (floatInVec vec0, floatInVec vec1)
+{
+    return !(vec0 > vec1);
+}
+
+inline
+const boolInVec
+operator > (floatInVec vec0, floatInVec vec1)
+{
+    return boolInVec(vec0.getAsFloat() > vec1.getAsFloat());
+}
+
+inline
+const boolInVec
+operator >= (floatInVec vec0, floatInVec vec1)
+{
+    return !(vec0 < vec1);
+}
+
+inline
+const boolInVec
+operator == (floatInVec vec0, floatInVec vec1)
+{
+    return boolInVec(vec0.getAsFloat() == vec1.getAsFloat());
+}
+
+inline
+const boolInVec
+operator != (floatInVec vec0, floatInVec vec1)
+{
+    return !(vec0 == vec1);
+}
+
+inline
+const floatInVec
+select(floatInVec vec0, floatInVec vec1, boolInVec select_vec1)
+{
+    return (select_vec1.getAsBool() == 0) ? vec0 : vec1;
+}
+
+} // namespace Vectormath
+
+#endif // floatInVec_h
+
diff --git a/test/Bullet2/vectormath/neon/mat_aos.h b/test/Bullet2/vectormath/neon/mat_aos.h
new file mode 100644
index 000000000..e61f601c3
--- /dev/null
+++ b/test/Bullet2/vectormath/neon/mat_aos.h
@@ -0,0 +1,1631 @@
+/*
+   Copyright (C) 2009 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+
+*/
+
+#ifndef _VECTORMATH_MAT_AOS_CPP_H
+#define _VECTORMATH_MAT_AOS_CPP_H
+
+namespace Vectormath {
+namespace Aos {
+
+//-----------------------------------------------------------------------------
+// Constants
+
+#define _VECTORMATH_PI_OVER_2 1.570796327f
+
+//-----------------------------------------------------------------------------
+// Definitions
+
+inline Matrix3::Matrix3( const Matrix3 & mat )
+{
+    mCol0 = mat.mCol0;
+    mCol1 = mat.mCol1;
+    mCol2 = mat.mCol2;
+}
+
+inline Matrix3::Matrix3( float scalar )
+{
+    mCol0 = Vector3( scalar );
+    mCol1 = Vector3( scalar );
+    mCol2 = Vector3( scalar );
+}
+
+inline Matrix3::Matrix3( const Quat & unitQuat )
+{
+    float qx, qy, qz, qw, qx2, qy2, qz2, qxqx2, qyqy2, qzqz2, qxqy2, qyqz2, qzqw2, qxqz2, qyqw2, qxqw2;
+    qx = unitQuat.getX();
+    qy = unitQuat.getY();
+    qz = unitQuat.getZ();
+    qw = unitQuat.getW();
+    qx2 = ( qx + qx );
+    qy2 = ( qy + qy );
+    qz2 = ( qz + qz );
+    qxqx2 = ( qx * qx2 );
+    qxqy2 = ( qx * qy2 );
+    qxqz2 = ( qx * qz2 );
+    qxqw2 = ( qw * qx2 );
+    qyqy2 = ( qy * qy2 );
+    qyqz2 = ( qy * qz2 );
+    qyqw2 = ( qw * qy2 );
+    qzqz2 = ( qz * qz2 );
+    qzqw2 = ( qw * qz2 );
+    mCol0 = Vector3( ( ( 1.0f - qyqy2 ) - qzqz2 ), ( qxqy2 + qzqw2 ), ( qxqz2 - qyqw2 ) );
+    mCol1 = Vector3( ( qxqy2 - qzqw2 ), ( ( 1.0f - qxqx2 ) - qzqz2 ), ( qyqz2 + qxqw2 ) );
+    mCol2 = Vector3( ( qxqz2 + qyqw2 ), ( qyqz2 - qxqw2 ), ( ( 1.0f - qxqx2 ) - qyqy2 ) );
+}
+
+inline Matrix3::Matrix3( const Vector3 & _col0, const Vector3 & _col1, const Vector3 & _col2 )
+{
+    mCol0 = _col0;
+    mCol1 = _col1;
+    mCol2 = _col2;
+}
+
+inline Matrix3 & Matrix3::setCol0( const Vector3 & _col0 )
+{
+    mCol0 = _col0;
+    return *this;
+}
+
+inline Matrix3 & Matrix3::setCol1( const Vector3 & _col1 )
+{
+    mCol1 = _col1;
+    return *this;
+}
+
+inline Matrix3 & Matrix3::setCol2( const Vector3 & _col2 )
+{
+    mCol2 = _col2;
+    return *this;
+}
+
+inline Matrix3 & Matrix3::setCol( int col, const Vector3 & vec )
+{
+    *(&mCol0 + col) = vec;
+    return *this;
+}
+
+inline Matrix3 & Matrix3::setRow( int row, const Vector3 & vec )
+{
+    mCol0.setElem( row, vec.getElem( 0 ) );
+    mCol1.setElem( row, vec.getElem( 1 ) );
+    mCol2.setElem( row, vec.getElem( 2 ) );
+    return *this;
+}
+
+inline Matrix3 & Matrix3::setElem( int col, int row, float val )
+{
+    Vector3 tmpV3_0;
+    tmpV3_0 = this->getCol( col );
+    tmpV3_0.setElem( row, val );
+    this->setCol( col, tmpV3_0 );
+    return *this;
+}
+
+inline float Matrix3::getElem( int col, int row ) const
+{
+    return this->getCol( col ).getElem( row );
+}
+
+inline const Vector3 Matrix3::getCol0( ) const
+{
+    return mCol0;
+}
+
+inline const Vector3 Matrix3::getCol1( ) const
+{
+    return mCol1;
+}
+
+inline const Vector3 Matrix3::getCol2( ) const
+{
+    return mCol2;
+}
+
+inline const Vector3 Matrix3::getCol( int col ) const
+{
+    return *(&mCol0 + col);
+}
+
+inline const Vector3 Matrix3::getRow( int row ) const
+{
+    return Vector3( mCol0.getElem( row ), mCol1.getElem( row ), mCol2.getElem( row ) );
+}
+
+inline Vector3 & Matrix3::operator []( int col )
+{
+    return *(&mCol0 + col);
+}
+
+inline const Vector3 Matrix3::operator []( int col ) const
+{
+    return *(&mCol0 + col);
+}
+
+inline Matrix3 & Matrix3::operator =( const Matrix3 & mat )
+{
+    mCol0 = mat.mCol0;
+    mCol1 = mat.mCol1;
+    mCol2 = mat.mCol2;
+    return *this;
+}
+
+inline const Matrix3 transpose( const Matrix3 & mat )
+{
+    return Matrix3(
+        Vector3( mat.getCol0().getX(), mat.getCol1().getX(), mat.getCol2().getX() ),
+        Vector3( mat.getCol0().getY(), mat.getCol1().getY(), mat.getCol2().getY() ),
+        Vector3( mat.getCol0().getZ(), mat.getCol1().getZ(), mat.getCol2().getZ() )
+    );
+}
+
+inline const Matrix3 inverse( const Matrix3 & mat )
+{
+    Vector3 tmp0, tmp1, tmp2;
+    float detinv;
+    tmp0 = cross( mat.getCol1(), mat.getCol2() );
+    tmp1 = cross( mat.getCol2(), mat.getCol0() );
+    tmp2 = cross( mat.getCol0(), mat.getCol1() );
+    detinv = ( 1.0f / dot( mat.getCol2(), tmp2 ) );
+    return Matrix3(
+        Vector3( ( tmp0.getX() * detinv ), ( tmp1.getX() * detinv ), ( tmp2.getX() * detinv ) ),
+        Vector3( ( tmp0.getY() * detinv ), ( tmp1.getY() * detinv ), ( tmp2.getY() * detinv ) ),
+        Vector3( ( tmp0.getZ() * detinv ), ( tmp1.getZ() * detinv ), ( tmp2.getZ() * detinv ) )
+    );
+}
+
+inline float determinant( const Matrix3 & mat )
+{
+    return dot( mat.getCol2(), cross( mat.getCol0(), mat.getCol1() ) );
+}
+
+inline const Matrix3 Matrix3::operator +( const Matrix3 & mat ) const
+{
+    return Matrix3(
+        ( mCol0 + mat.mCol0 ),
+        ( mCol1 + mat.mCol1 ),
+        ( mCol2 + mat.mCol2 )
+    );
+}
+
+inline const Matrix3 Matrix3::operator -( const Matrix3 & mat ) const
+{
+    return Matrix3(
+        ( mCol0 - mat.mCol0 ),
+        ( mCol1 - mat.mCol1 ),
+        ( mCol2 - mat.mCol2 )
+    );
+}
+
+inline Matrix3 & Matrix3::operator +=( const Matrix3 & mat )
+{
+    *this = *this + mat;
+    return *this;
+}
+
+inline Matrix3 & Matrix3::operator -=( const Matrix3 & mat )
+{
+    *this = *this - mat;
+    return *this;
+}
+
+inline const Matrix3 Matrix3::operator -( ) const
+{
+    return Matrix3(
+        ( -mCol0 ),
+        ( -mCol1 ),
+        ( -mCol2 )
+    );
+}
+
+inline const Matrix3 absPerElem( const Matrix3 & mat )
+{
+    return Matrix3(
+        absPerElem( mat.getCol0() ),
+        absPerElem( mat.getCol1() ),
+        absPerElem( mat.getCol2() )
+    );
+}
+
+inline const Matrix3 Matrix3::operator *( float scalar ) const
+{
+    return Matrix3(
+        ( mCol0 * scalar ),
+        ( mCol1 * scalar ),
+        ( mCol2 * scalar )
+    );
+}
+
+inline Matrix3 & Matrix3::operator *=( float scalar )
+{
+    *this = *this * scalar;
+    return *this;
+}
+
+inline const Matrix3 operator *( float scalar, const Matrix3 & mat )
+{
+    return mat * scalar;
+}
+
+inline const Vector3 Matrix3::operator *( const Vector3 & vec ) const
+{
+    return Vector3(
+        ( ( ( mCol0.getX() * vec.getX() ) + ( mCol1.getX() * vec.getY() ) ) + ( mCol2.getX() * vec.getZ() ) ),
+        ( ( ( mCol0.getY() * vec.getX() ) + ( mCol1.getY() * vec.getY() ) ) + ( mCol2.getY() * vec.getZ() ) ),
+        ( ( ( mCol0.getZ() * vec.getX() ) + ( mCol1.getZ() * vec.getY() ) ) + ( mCol2.getZ() * vec.getZ() ) )
+    );
+}
+
+inline const Matrix3 Matrix3::operator *( const Matrix3 & mat ) const
+{
+    return Matrix3(
+        ( *this * mat.mCol0 ),
+        ( *this * mat.mCol1 ),
+        ( *this * mat.mCol2 )
+    );
+}
+
+inline Matrix3 & Matrix3::operator *=( const Matrix3 & mat )
+{
+    *this = *this * mat;
+    return *this;
+}
+
+inline const Matrix3 mulPerElem( const Matrix3 & mat0, const Matrix3 & mat1 )
+{
+    return Matrix3(
+        mulPerElem( mat0.getCol0(), mat1.getCol0() ),
+        mulPerElem( mat0.getCol1(), mat1.getCol1() ),
+        mulPerElem( mat0.getCol2(), mat1.getCol2() )
+    );
+}
+
+inline const Matrix3 Matrix3::identity( )
+{
+    return Matrix3(
+        Vector3::xAxis( ),
+        Vector3::yAxis( ),
+        Vector3::zAxis( )
+    );
+}
+
+inline const Matrix3 Matrix3::rotationX( float radians )
+{
+    float s, c;
+    s = sinf( radians );
+    c = cosf( radians );
+    return Matrix3(
+        Vector3::xAxis( ),
+        Vector3( 0.0f, c, s ),
+        Vector3( 0.0f, -s, c )
+    );
+}
+
+inline const Matrix3 Matrix3::rotationY( float radians )
+{
+    float s, c;
+    s = sinf( radians );
+    c = cosf( radians );
+    return Matrix3(
+        Vector3( c, 0.0f, -s ),
+        Vector3::yAxis( ),
+        Vector3( s, 0.0f, c )
+    );
+}
+
+inline const Matrix3 Matrix3::rotationZ( float radians )
+{
+    float s, c;
+    s = sinf( radians );
+    c = cosf( radians );
+    return Matrix3(
+        Vector3( c, s, 0.0f ),
+        Vector3( -s, c, 0.0f ),
+        Vector3::zAxis( )
+    );
+}
+
+inline const Matrix3 Matrix3::rotationZYX( const Vector3 & radiansXYZ )
+{
+    float sX, cX, sY, cY, sZ, cZ, tmp0, tmp1;
+    sX = sinf( radiansXYZ.getX() );
+    cX = cosf( radiansXYZ.getX() );
+    sY = sinf( radiansXYZ.getY() );
+    cY = cosf( radiansXYZ.getY() );
+    sZ = sinf( radiansXYZ.getZ() );
+    cZ = cosf( radiansXYZ.getZ() );
+    tmp0 = ( cZ * sY );
+    tmp1 = ( sZ * sY );
+    return Matrix3(
+        Vector3( ( cZ * cY ), ( sZ * cY ), -sY ),
+        Vector3( ( ( tmp0 * sX ) - ( sZ * cX ) ), ( ( tmp1 * sX ) + ( cZ * cX ) ), ( cY * sX ) ),
+        Vector3( ( ( tmp0 * cX ) + ( sZ * sX ) ), ( ( tmp1 * cX ) - ( cZ * sX ) ), ( cY * cX ) )
+    );
+}
+
+inline const Matrix3 Matrix3::rotation( float radians, const Vector3 & unitVec )
+{
+    float x, y, z, s, c, oneMinusC, xy, yz, zx;
+    s = sinf( radians );
+    c = cosf( radians );
+    x = unitVec.getX();
+    y = unitVec.getY();
+    z = unitVec.getZ();
+    xy = ( x * y );
+    yz = ( y * z );
+    zx = ( z * x );
+    oneMinusC = ( 1.0f - c );
+    return Matrix3(
+        Vector3( ( ( ( x * x ) * oneMinusC ) + c ), ( ( xy * oneMinusC ) + ( z * s ) ), ( ( zx * oneMinusC ) - ( y * s ) ) ),
+        Vector3( ( ( xy * oneMinusC ) - ( z * s ) ), ( ( ( y * y ) * oneMinusC ) + c ), ( ( yz * oneMinusC ) + ( x * s ) ) ),
+        Vector3( ( ( zx * oneMinusC ) + ( y * s ) ), ( ( yz * oneMinusC ) - ( x * s ) ), ( ( ( z * z ) * oneMinusC ) + c ) )
+    );
+}
+
+inline const Matrix3 Matrix3::rotation( const Quat & unitQuat )
+{
+    return Matrix3( unitQuat );
+}
+
+inline const Matrix3 Matrix3::scale( const Vector3 & scaleVec )
+{
+    return Matrix3(
+        Vector3( scaleVec.getX(), 0.0f, 0.0f ),
+        Vector3( 0.0f, scaleVec.getY(), 0.0f ),
+        Vector3( 0.0f, 0.0f, scaleVec.getZ() )
+    );
+}
+
+inline const Matrix3 appendScale( const Matrix3 & mat, const Vector3 & scaleVec )
+{
+    return Matrix3(
+        ( mat.getCol0() * scaleVec.getX( ) ),
+        ( mat.getCol1() * scaleVec.getY( ) ),
+        ( mat.getCol2() * scaleVec.getZ( ) )
+    );
+}
+
+inline const Matrix3 prependScale( const Vector3 & scaleVec, const Matrix3 & mat )
+{
+    return Matrix3(
+        mulPerElem( mat.getCol0(), scaleVec ),
+        mulPerElem( mat.getCol1(), scaleVec ),
+        mulPerElem( mat.getCol2(), scaleVec )
+    );
+}
+
+inline const Matrix3 select( const Matrix3 & mat0, const Matrix3 & mat1, bool select1 )
+{
+    return Matrix3(
+        select( mat0.getCol0(), mat1.getCol0(), select1 ),
+        select( mat0.getCol1(), mat1.getCol1(), select1 ),
+        select( mat0.getCol2(), mat1.getCol2(), select1 )
+    );
+}
+
+#ifdef _VECTORMATH_DEBUG
+
+inline void print( const Matrix3 & mat )
+{
+    print( mat.getRow( 0 ) );
+    print( mat.getRow( 1 ) );
+    print( mat.getRow( 2 ) );
+}
+
+inline void print( const Matrix3 & mat, const char * name )
+{
+    printf("%s:\n", name);
+    print( mat );
+}
+
+#endif
+
+inline Matrix4::Matrix4( const Matrix4 & mat )
+{
+    mCol0 = mat.mCol0;
+    mCol1 = mat.mCol1;
+    mCol2 = mat.mCol2;
+    mCol3 = mat.mCol3;
+}
+
+inline Matrix4::Matrix4( float scalar )
+{
+    mCol0 = Vector4( scalar );
+    mCol1 = Vector4( scalar );
+    mCol2 = Vector4( scalar );
+    mCol3 = Vector4( scalar );
+}
+
+inline Matrix4::Matrix4( const Transform3 & mat )
+{
+    mCol0 = Vector4( mat.getCol0(), 0.0f );
+    mCol1 = Vector4( mat.getCol1(), 0.0f );
+    mCol2 = Vector4( mat.getCol2(), 0.0f );
+    mCol3 = Vector4( mat.getCol3(), 1.0f );
+}
+
+inline Matrix4::Matrix4( const Vector4 & _col0, const Vector4 & _col1, const Vector4 & _col2, const Vector4 & _col3 )
+{
+    mCol0 = _col0;
+    mCol1 = _col1;
+    mCol2 = _col2;
+    mCol3 = _col3;
+}
+
+inline Matrix4::Matrix4( const Matrix3 & mat, const Vector3 & translateVec )
+{
+    mCol0 = Vector4( mat.getCol0(), 0.0f );
+    mCol1 = Vector4( mat.getCol1(), 0.0f );
+    mCol2 = Vector4( mat.getCol2(), 0.0f );
+    mCol3 = Vector4( translateVec, 1.0f );
+}
+
+inline Matrix4::Matrix4( const Quat & unitQuat, const Vector3 & translateVec )
+{
+    Matrix3 mat;
+    mat = Matrix3( unitQuat );
+    mCol0 = Vector4( mat.getCol0(), 0.0f );
+    mCol1 = Vector4( mat.getCol1(), 0.0f );
+    mCol2 = Vector4( mat.getCol2(), 0.0f );
+    mCol3 = Vector4( translateVec, 1.0f );
+}
+
+inline Matrix4 & Matrix4::setCol0( const Vector4 & _col0 )
+{
+    mCol0 = _col0;
+    return *this;
+}
+
+inline Matrix4 & Matrix4::setCol1( const Vector4 & _col1 )
+{
+    mCol1 = _col1;
+    return *this;
+}
+
+inline Matrix4 & Matrix4::setCol2( const Vector4 & _col2 )
+{
+    mCol2 = _col2;
+    return *this;
+}
+
+inline Matrix4 & Matrix4::setCol3( const Vector4 & _col3 )
+{
+    mCol3 = _col3;
+    return *this;
+}
+
+inline Matrix4 & Matrix4::setCol( int col, const Vector4 & vec )
+{
+    *(&mCol0 + col) = vec;
+    return *this;
+}
+
+inline Matrix4 & Matrix4::setRow( int row, const Vector4 & vec )
+{
+    mCol0.setElem( row, vec.getElem( 0 ) );
+    mCol1.setElem( row, vec.getElem( 1 ) );
+    mCol2.setElem( row, vec.getElem( 2 ) );
+    mCol3.setElem( row, vec.getElem( 3 ) );
+    return *this;
+}
+
+inline Matrix4 & Matrix4::setElem( int col, int row, float val )
+{
+    Vector4 tmpV3_0;
+    tmpV3_0 = this->getCol( col );
+    tmpV3_0.setElem( row, val );
+    this->setCol( col, tmpV3_0 );
+    return *this;
+}
+
+inline float Matrix4::getElem( int col, int row ) const
+{
+    return this->getCol( col ).getElem( row );
+}
+
+inline const Vector4 Matrix4::getCol0( ) const
+{
+    return mCol0;
+}
+
+inline const Vector4 Matrix4::getCol1( ) const
+{
+    return mCol1;
+}
+
+inline const Vector4 Matrix4::getCol2( ) const
+{
+    return mCol2;
+}
+
+inline const Vector4 Matrix4::getCol3( ) const
+{
+    return mCol3;
+}
+
+inline const Vector4 Matrix4::getCol( int col ) const
+{
+    return *(&mCol0 + col);
+}
+
+inline const Vector4 Matrix4::getRow( int row ) const
+{
+    return Vector4( mCol0.getElem( row ), mCol1.getElem( row ), mCol2.getElem( row ), mCol3.getElem( row ) );
+}
+
+inline Vector4 & Matrix4::operator []( int col )
+{
+    return *(&mCol0 + col);
+}
+
+inline const Vector4 Matrix4::operator []( int col ) const
+{
+    return *(&mCol0 + col);
+}
+
+inline Matrix4 & Matrix4::operator =( const Matrix4 & mat )
+{
+    mCol0 = mat.mCol0;
+    mCol1 = mat.mCol1;
+    mCol2 = mat.mCol2;
+    mCol3 = mat.mCol3;
+    return *this;
+}
+
+inline const Matrix4 transpose( const Matrix4 & mat )
+{
+    return Matrix4(
+        Vector4( mat.getCol0().getX(), mat.getCol1().getX(), mat.getCol2().getX(), mat.getCol3().getX() ),
+        Vector4( mat.getCol0().getY(), mat.getCol1().getY(), mat.getCol2().getY(), mat.getCol3().getY() ),
+        Vector4( mat.getCol0().getZ(), mat.getCol1().getZ(), mat.getCol2().getZ(), mat.getCol3().getZ() ),
+        Vector4( mat.getCol0().getW(), mat.getCol1().getW(), mat.getCol2().getW(), mat.getCol3().getW() )
+    );
+}
+
+inline const Matrix4 inverse( const Matrix4 & mat )
+{
+    Vector4 res0, res1, res2, res3;
+    float mA, mB, mC, mD, mE, mF, mG, mH, mI, mJ, mK, mL, mM, mN, mO, mP, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, detInv;
+    mA = mat.getCol0().getX();
+    mB = mat.getCol0().getY();
+    mC = mat.getCol0().getZ();
+    mD = mat.getCol0().getW();
+    mE = mat.getCol1().getX();
+    mF = mat.getCol1().getY();
+    mG = mat.getCol1().getZ();
+    mH = mat.getCol1().getW();
+    mI = mat.getCol2().getX();
+    mJ = mat.getCol2().getY();
+    mK = mat.getCol2().getZ();
+    mL = mat.getCol2().getW();
+    mM = mat.getCol3().getX();
+    mN = mat.getCol3().getY();
+    mO = mat.getCol3().getZ();
+    mP = mat.getCol3().getW();
+    tmp0 = ( ( mK * mD ) - ( mC * mL ) );
+    tmp1 = ( ( mO * mH ) - ( mG * mP ) );
+    tmp2 = ( ( mB * mK ) - ( mJ * mC ) );
+    tmp3 = ( ( mF * mO ) - ( mN * mG ) );
+    tmp4 = ( ( mJ * mD ) - ( mB * mL ) );
+    tmp5 = ( ( mN * mH ) - ( mF * mP ) );
+    res0.setX( ( ( ( mJ * tmp1 ) - ( mL * tmp3 ) ) - ( mK * tmp5 ) ) );
+    res0.setY( ( ( ( mN * tmp0 ) - ( mP * tmp2 ) ) - ( mO * tmp4 ) ) );
+    res0.setZ( ( ( ( mD * tmp3 ) + ( mC * tmp5 ) ) - ( mB * tmp1 ) ) );
+    res0.setW( ( ( ( mH * tmp2 ) + ( mG * tmp4 ) ) - ( mF * tmp0 ) ) );
+    detInv = ( 1.0f / ( ( ( ( mA * res0.getX() ) + ( mE * res0.getY() ) ) + ( mI * res0.getZ() ) ) + ( mM * res0.getW() ) ) );
+    res1.setX( ( mI * tmp1 ) );
+    res1.setY( ( mM * tmp0 ) );
+    res1.setZ( ( mA * tmp1 ) );
+    res1.setW( ( mE * tmp0 ) );
+    res3.setX( ( mI * tmp3 ) );
+    res3.setY( ( mM * tmp2 ) );
+    res3.setZ( ( mA * tmp3 ) );
+    res3.setW( ( mE * tmp2 ) );
+    res2.setX( ( mI * tmp5 ) );
+    res2.setY( ( mM * tmp4 ) );
+    res2.setZ( ( mA * tmp5 ) );
+    res2.setW( ( mE * tmp4 ) );
+    tmp0 = ( ( mI * mB ) - ( mA * mJ ) );
+    tmp1 = ( ( mM * mF ) - ( mE * mN ) );
+    tmp2 = ( ( mI * mD ) - ( mA * mL ) );
+    tmp3 = ( ( mM * mH ) - ( mE * mP ) );
+    tmp4 = ( ( mI * mC ) - ( mA * mK ) );
+    tmp5 = ( ( mM * mG ) - ( mE * mO ) );
+    res2.setX( ( ( ( mL * tmp1 ) - ( mJ * tmp3 ) ) + res2.getX() ) );
+    res2.setY( ( ( ( mP * tmp0 ) - ( mN * tmp2 ) ) + res2.getY() ) );
+    res2.setZ( ( ( ( mB * tmp3 ) - ( mD * tmp1 ) ) - res2.getZ() ) );
+    res2.setW( ( ( ( mF * tmp2 ) - ( mH * tmp0 ) ) - res2.getW() ) );
+    res3.setX( ( ( ( mJ * tmp5 ) - ( mK * tmp1 ) ) + res3.getX() ) );
+    res3.setY( ( ( ( mN * tmp4 ) - ( mO * tmp0 ) ) + res3.getY() ) );
+    res3.setZ( ( ( ( mC * tmp1 ) - ( mB * tmp5 ) ) - res3.getZ() ) );
+    res3.setW( ( ( ( mG * tmp0 ) - ( mF * tmp4 ) ) - res3.getW() ) );
+    res1.setX( ( ( ( mK * tmp3 ) - ( mL * tmp5 ) ) - res1.getX() ) );
+    res1.setY( ( ( ( mO * tmp2 ) - ( mP * tmp4 ) ) - res1.getY() ) );
+    res1.setZ( ( ( ( mD * tmp5 ) - ( mC * tmp3 ) ) + res1.getZ() ) );
+    res1.setW( ( ( ( mH * tmp4 ) - ( mG * tmp2 ) ) + res1.getW() ) );
+    return Matrix4(
+        ( res0 * detInv ),
+        ( res1 * detInv ),
+        ( res2 * detInv ),
+        ( res3 * detInv )
+    );
+}
+
+inline const Matrix4 affineInverse( const Matrix4 & mat )
+{
+    Transform3 affineMat;
+    affineMat.setCol0( mat.getCol0().getXYZ( ) );
+    affineMat.setCol1( mat.getCol1().getXYZ( ) );
+    affineMat.setCol2( mat.getCol2().getXYZ( ) );
+    affineMat.setCol3( mat.getCol3().getXYZ( ) );
+    return Matrix4( inverse( affineMat ) );
+}
+
+inline const Matrix4 orthoInverse( const Matrix4 & mat )
+{
+    Transform3 affineMat;
+    affineMat.setCol0( mat.getCol0().getXYZ( ) );
+    affineMat.setCol1( mat.getCol1().getXYZ( ) );
+    affineMat.setCol2( mat.getCol2().getXYZ( ) );
+    affineMat.setCol3( mat.getCol3().getXYZ( ) );
+    return Matrix4( orthoInverse( affineMat ) );
+}
+
+inline float determinant( const Matrix4 & mat )
+{
+    float dx, dy, dz, dw, mA, mB, mC, mD, mE, mF, mG, mH, mI, mJ, mK, mL, mM, mN, mO, mP, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+    mA = mat.getCol0().getX();
+    mB = mat.getCol0().getY();
+    mC = mat.getCol0().getZ();
+    mD = mat.getCol0().getW();
+    mE = mat.getCol1().getX();
+    mF = mat.getCol1().getY();
+    mG = mat.getCol1().getZ();
+    mH = mat.getCol1().getW();
+    mI = mat.getCol2().getX();
+    mJ = mat.getCol2().getY();
+    mK = mat.getCol2().getZ();
+    mL = mat.getCol2().getW();
+    mM = mat.getCol3().getX();
+    mN = mat.getCol3().getY();
+    mO = mat.getCol3().getZ();
+    mP = mat.getCol3().getW();
+    tmp0 = ( ( mK * mD ) - ( mC * mL ) );
+    tmp1 = ( ( mO * mH ) - ( mG * mP ) );
+    tmp2 = ( ( mB * mK ) - ( mJ * mC ) );
+    tmp3 = ( ( mF * mO ) - ( mN * mG ) );
+    tmp4 = ( ( mJ * mD ) - ( mB * mL ) );
+    tmp5 = ( ( mN * mH ) - ( mF * mP ) );
+    dx = ( ( ( mJ * tmp1 ) - ( mL * tmp3 ) ) - ( mK * tmp5 ) );
+    dy = ( ( ( mN * tmp0 ) - ( mP * tmp2 ) ) - ( mO * tmp4 ) );
+    dz = ( ( ( mD * tmp3 ) + ( mC * tmp5 ) ) - ( mB * tmp1 ) );
+    dw = ( ( ( mH * tmp2 ) + ( mG * tmp4 ) ) - ( mF * tmp0 ) );
+    return ( ( ( ( mA * dx ) + ( mE * dy ) ) + ( mI * dz ) ) + ( mM * dw ) );
+}
+
+inline const Matrix4 Matrix4::operator +( const Matrix4 & mat ) const
+{
+    return Matrix4(
+        ( mCol0 + mat.mCol0 ),
+        ( mCol1 + mat.mCol1 ),
+        ( mCol2 + mat.mCol2 ),
+        ( mCol3 + mat.mCol3 )
+    );
+}
+
+inline const Matrix4 Matrix4::operator -( const Matrix4 & mat ) const
+{
+    return Matrix4(
+        ( mCol0 - mat.mCol0 ),
+        ( mCol1 - mat.mCol1 ),
+        ( mCol2 - mat.mCol2 ),
+        ( mCol3 - mat.mCol3 )
+    );
+}
+
+inline Matrix4 & Matrix4::operator +=( const Matrix4 & mat )
+{
+    *this = *this + mat;
+    return *this;
+}
+
+inline Matrix4 & Matrix4::operator -=( const Matrix4 & mat )
+{
+    *this = *this - mat;
+    return *this;
+}
+
+inline const Matrix4 Matrix4::operator -( ) const
+{
+    return Matrix4(
+        ( -mCol0 ),
+        ( -mCol1 ),
+        ( -mCol2 ),
+        ( -mCol3 )
+    );
+}
+
+inline const Matrix4 absPerElem( const Matrix4 & mat )
+{
+    return Matrix4(
+        absPerElem( mat.getCol0() ),
+        absPerElem( mat.getCol1() ),
+        absPerElem( mat.getCol2() ),
+        absPerElem( mat.getCol3() )
+    );
+}
+
+inline const Matrix4 Matrix4::operator *( float scalar ) const
+{
+    return Matrix4(
+        ( mCol0 * scalar ),
+        ( mCol1 * scalar ),
+        ( mCol2 * scalar ),
+        ( mCol3 * scalar )
+    );
+}
+
+inline Matrix4 & Matrix4::operator *=( float scalar )
+{
+    *this = *this * scalar;
+    return *this;
+}
+
+inline const Matrix4 operator *( float scalar, const Matrix4 & mat )
+{
+    return mat * scalar;
+}
+
+inline const Vector4 Matrix4::operator *( const Vector4 & vec ) const
+{
+    return Vector4(
+        ( ( ( ( mCol0.getX() * vec.getX() ) + ( mCol1.getX() * vec.getY() ) ) + ( mCol2.getX() * vec.getZ() ) ) + ( mCol3.getX() * vec.getW() ) ),
+        ( ( ( ( mCol0.getY() * vec.getX() ) + ( mCol1.getY() * vec.getY() ) ) + ( mCol2.getY() * vec.getZ() ) ) + ( mCol3.getY() * vec.getW() ) ),
+        ( ( ( ( mCol0.getZ() * vec.getX() ) + ( mCol1.getZ() * vec.getY() ) ) + ( mCol2.getZ() * vec.getZ() ) ) + ( mCol3.getZ() * vec.getW() ) ),
+        ( ( ( ( mCol0.getW() * vec.getX() ) + ( mCol1.getW() * vec.getY() ) ) + ( mCol2.getW() * vec.getZ() ) ) + ( mCol3.getW() * vec.getW() ) )
+    );
+}
+
+inline const Vector4 Matrix4::operator *( const Vector3 & vec ) const
+{
+    return Vector4(
+        ( ( ( mCol0.getX() * vec.getX() ) + ( mCol1.getX() * vec.getY() ) ) + ( mCol2.getX() * vec.getZ() ) ),
+        ( ( ( mCol0.getY() * vec.getX() ) + ( mCol1.getY() * vec.getY() ) ) + ( mCol2.getY() * vec.getZ() ) ),
+        ( ( ( mCol0.getZ() * vec.getX() ) + ( mCol1.getZ() * vec.getY() ) ) + ( mCol2.getZ() * vec.getZ() ) ),
+        ( ( ( mCol0.getW() * vec.getX() ) + ( mCol1.getW() * vec.getY() ) ) + ( mCol2.getW() * vec.getZ() ) )
+    );
+}
+
+inline const Vector4 Matrix4::operator *( const Point3 & pnt ) const
+{
+    return Vector4(
+        ( ( ( ( mCol0.getX() * pnt.getX() ) + ( mCol1.getX() * pnt.getY() ) ) + ( mCol2.getX() * pnt.getZ() ) ) + mCol3.getX() ),
+        ( ( ( ( mCol0.getY() * pnt.getX() ) + ( mCol1.getY() * pnt.getY() ) ) + ( mCol2.getY() * pnt.getZ() ) ) + mCol3.getY() ),
+        ( ( ( ( mCol0.getZ() * pnt.getX() ) + ( mCol1.getZ() * pnt.getY() ) ) + ( mCol2.getZ() * pnt.getZ() ) ) + mCol3.getZ() ),
+        ( ( ( ( mCol0.getW() * pnt.getX() ) + ( mCol1.getW() * pnt.getY() ) ) + ( mCol2.getW() * pnt.getZ() ) ) + mCol3.getW() )
+    );
+}
+
+inline const Matrix4 Matrix4::operator *( const Matrix4 & mat ) const
+{
+    return Matrix4(
+        ( *this * mat.mCol0 ),
+        ( *this * mat.mCol1 ),
+        ( *this * mat.mCol2 ),
+        ( *this * mat.mCol3 )
+    );
+}
+
+inline Matrix4 & Matrix4::operator *=( const Matrix4 & mat )
+{
+    *this = *this * mat;
+    return *this;
+}
+
+inline const Matrix4 Matrix4::operator *( const Transform3 & tfrm ) const
+{
+    return Matrix4(
+        ( *this * tfrm.getCol0() ),
+        ( *this * tfrm.getCol1() ),
+        ( *this * tfrm.getCol2() ),
+        ( *this * Point3( tfrm.getCol3() ) )
+    );
+}
+
+inline Matrix4 & Matrix4::operator *=( const Transform3 & tfrm )
+{
+    *this = *this * tfrm;
+    return *this;
+}
+
+inline const Matrix4 mulPerElem( const Matrix4 & mat0, const Matrix4 & mat1 )
+{
+    return Matrix4(
+        mulPerElem( mat0.getCol0(), mat1.getCol0() ),
+        mulPerElem( mat0.getCol1(), mat1.getCol1() ),
+        mulPerElem( mat0.getCol2(), mat1.getCol2() ),
+        mulPerElem( mat0.getCol3(), mat1.getCol3() )
+    );
+}
+
+inline const Matrix4 Matrix4::identity( )
+{
+    return Matrix4(
+        Vector4::xAxis( ),
+        Vector4::yAxis( ),
+        Vector4::zAxis( ),
+        Vector4::wAxis( )
+    );
+}
+
+inline Matrix4 & Matrix4::setUpper3x3( const Matrix3 & mat3 )
+{
+    mCol0.setXYZ( mat3.getCol0() );
+    mCol1.setXYZ( mat3.getCol1() );
+    mCol2.setXYZ( mat3.getCol2() );
+    return *this;
+}
+
+inline const Matrix3 Matrix4::getUpper3x3( ) const
+{
+    return Matrix3(
+        mCol0.getXYZ( ),
+        mCol1.getXYZ( ),
+        mCol2.getXYZ( )
+    );
+}
+
+inline Matrix4 & Matrix4::setTranslation( const Vector3 & translateVec )
+{
+    mCol3.setXYZ( translateVec );
+    return *this;
+}
+
+inline const Vector3 Matrix4::getTranslation( ) const
+{
+    return mCol3.getXYZ( );
+}
+
+inline const Matrix4 Matrix4::rotationX( float radians )
+{
+    float s, c;
+    s = sinf( radians );
+    c = cosf( radians );
+    return Matrix4(
+        Vector4::xAxis( ),
+        Vector4( 0.0f, c, s, 0.0f ),
+        Vector4( 0.0f, -s, c, 0.0f ),
+        Vector4::wAxis( )
+    );
+}
+
+inline const Matrix4 Matrix4::rotationY( float radians )
+{
+    float s, c;
+    s = sinf( radians );
+    c = cosf( radians );
+    return Matrix4(
+        Vector4( c, 0.0f, -s, 0.0f ),
+        Vector4::yAxis( ),
+        Vector4( s, 0.0f, c, 0.0f ),
+        Vector4::wAxis( )
+    );
+}
+
+inline const Matrix4 Matrix4::rotationZ( float radians )
+{
+    float s, c;
+    s = sinf( radians );
+    c = cosf( radians );
+    return Matrix4(
+        Vector4( c, s, 0.0f, 0.0f ),
+        Vector4( -s, c, 0.0f, 0.0f ),
+        Vector4::zAxis( ),
+        Vector4::wAxis( )
+    );
+}
+
+inline const Matrix4 Matrix4::rotationZYX( const Vector3 & radiansXYZ )
+{
+    float sX, cX, sY, cY, sZ, cZ, tmp0, tmp1;
+    sX = sinf( radiansXYZ.getX() );
+    cX = cosf( radiansXYZ.getX() );
+    sY = sinf( radiansXYZ.getY() );
+    cY = cosf( radiansXYZ.getY() );
+    sZ = sinf( radiansXYZ.getZ() );
+    cZ = cosf( radiansXYZ.getZ() );
+    tmp0 = ( cZ * sY );
+    tmp1 = ( sZ * sY );
+    return Matrix4(
+        Vector4( ( cZ * cY ), ( sZ * cY ), -sY, 0.0f ),
+        Vector4( ( ( tmp0 * sX ) - ( sZ * cX ) ), ( ( tmp1 * sX ) + ( cZ * cX ) ), ( cY * sX ), 0.0f ),
+        Vector4( ( ( tmp0 * cX ) + ( sZ * sX ) ), ( ( tmp1 * cX ) - ( cZ * sX ) ), ( cY * cX ), 0.0f ),
+        Vector4::wAxis( )
+    );
+}
+
+inline const Matrix4 Matrix4::rotation( float radians, const Vector3 & unitVec )
+{
+    float x, y, z, s, c, oneMinusC, xy, yz, zx;
+    s = sinf( radians );
+    c = cosf( radians );
+    x = unitVec.getX();
+    y = unitVec.getY();
+    z = unitVec.getZ();
+    xy = ( x * y );
+    yz = ( y * z );
+    zx = ( z * x );
+    oneMinusC = ( 1.0f - c );
+    return Matrix4(
+        Vector4( ( ( ( x * x ) * oneMinusC ) + c ), ( ( xy * oneMinusC ) + ( z * s ) ), ( ( zx * oneMinusC ) - ( y * s ) ), 0.0f ),
+        Vector4( ( ( xy * oneMinusC ) - ( z * s ) ), ( ( ( y * y ) * oneMinusC ) + c ), ( ( yz * oneMinusC ) + ( x * s ) ), 0.0f ),
+        Vector4( ( ( zx * oneMinusC ) + ( y * s ) ), ( ( yz * oneMinusC ) - ( x * s ) ), ( ( ( z * z ) * oneMinusC ) + c ), 0.0f ),
+        Vector4::wAxis( )
+    );
+}
+
+inline const Matrix4 Matrix4::rotation( const Quat & unitQuat )
+{
+    return Matrix4( Transform3::rotation( unitQuat ) );
+}
+
+inline const Matrix4 Matrix4::scale( const Vector3 & scaleVec )
+{
+    return Matrix4(
+        Vector4( scaleVec.getX(), 0.0f, 0.0f, 0.0f ),
+        Vector4( 0.0f, scaleVec.getY(), 0.0f, 0.0f ),
+        Vector4( 0.0f, 0.0f, scaleVec.getZ(), 0.0f ),
+        Vector4::wAxis( )
+    );
+}
+
+inline const Matrix4 appendScale( const Matrix4 & mat, const Vector3 & scaleVec )
+{
+    return Matrix4(
+        ( mat.getCol0() * scaleVec.getX( ) ),
+        ( mat.getCol1() * scaleVec.getY( ) ),
+        ( mat.getCol2() * scaleVec.getZ( ) ),
+        mat.getCol3()
+    );
+}
+
+inline const Matrix4 prependScale( const Vector3 & scaleVec, const Matrix4 & mat )
+{
+    Vector4 scale4;
+    scale4 = Vector4( scaleVec, 1.0f );
+    return Matrix4(
+        mulPerElem( mat.getCol0(), scale4 ),
+        mulPerElem( mat.getCol1(), scale4 ),
+        mulPerElem( mat.getCol2(), scale4 ),
+        mulPerElem( mat.getCol3(), scale4 )
+    );
+}
+
+inline const Matrix4 Matrix4::translation( const Vector3 & translateVec )
+{
+    return Matrix4(
+        Vector4::xAxis( ),
+        Vector4::yAxis( ),
+        Vector4::zAxis( ),
+        Vector4( translateVec, 1.0f )
+    );
+}
+
+inline const Matrix4 Matrix4::lookAt( const Point3 & eyePos, const Point3 & lookAtPos, const Vector3 & upVec )
+{
+    Matrix4 m4EyeFrame;
+    Vector3 v3X, v3Y, v3Z;
+    v3Y = normalize( upVec );
+    v3Z = normalize( ( eyePos - lookAtPos ) );
+    v3X = normalize( cross( v3Y, v3Z ) );
+    v3Y = cross( v3Z, v3X );
+    m4EyeFrame = Matrix4( Vector4( v3X ), Vector4( v3Y ), Vector4( v3Z ), Vector4( eyePos ) );
+    return orthoInverse( m4EyeFrame );
+}
+
+inline const Matrix4 Matrix4::perspective( float fovyRadians, float aspect, float zNear, float zFar )
+{
+    float f, rangeInv;
+    f = tanf( ( (float)( _VECTORMATH_PI_OVER_2 ) - ( 0.5f * fovyRadians ) ) );
+    rangeInv = ( 1.0f / ( zNear - zFar ) );
+    return Matrix4(
+        Vector4( ( f / aspect ), 0.0f, 0.0f, 0.0f ),
+        Vector4( 0.0f, f, 0.0f, 0.0f ),
+        Vector4( 0.0f, 0.0f, ( ( zNear + zFar ) * rangeInv ), -1.0f ),
+        Vector4( 0.0f, 0.0f, ( ( ( zNear * zFar ) * rangeInv ) * 2.0f ), 0.0f )
+    );
+}
+
+inline const Matrix4 Matrix4::frustum( float left, float right, float bottom, float top, float zNear, float zFar )
+{
+    float sum_rl, sum_tb, sum_nf, inv_rl, inv_tb, inv_nf, n2;
+    sum_rl = ( right + left );
+    sum_tb = ( top + bottom );
+    sum_nf = ( zNear + zFar );
+    inv_rl = ( 1.0f / ( right - left ) );
+    inv_tb = ( 1.0f / ( top - bottom ) );
+    inv_nf = ( 1.0f / ( zNear - zFar ) );
+    n2 = ( zNear + zNear );
+    return Matrix4(
+        Vector4( ( n2 * inv_rl ), 0.0f, 0.0f, 0.0f ),
+        Vector4( 0.0f, ( n2 * inv_tb ), 0.0f, 0.0f ),
+        Vector4( ( sum_rl * inv_rl ), ( sum_tb * inv_tb ), ( sum_nf * inv_nf ), -1.0f ),
+        Vector4( 0.0f, 0.0f, ( ( n2 * inv_nf ) * zFar ), 0.0f )
+    );
+}
+
+inline const Matrix4 Matrix4::orthographic( float left, float right, float bottom, float top, float zNear, float zFar )
+{
+    float sum_rl, sum_tb, sum_nf, inv_rl, inv_tb, inv_nf;
+    sum_rl = ( right + left );
+    sum_tb = ( top + bottom );
+    sum_nf = ( zNear + zFar );
+    inv_rl = ( 1.0f / ( right - left ) );
+    inv_tb = ( 1.0f / ( top - bottom ) );
+    inv_nf = ( 1.0f / ( zNear - zFar ) );
+    return Matrix4(
+        Vector4( ( inv_rl + inv_rl ), 0.0f, 0.0f, 0.0f ),
+        Vector4( 0.0f, ( inv_tb + inv_tb ), 0.0f, 0.0f ),
+        Vector4( 0.0f, 0.0f, ( inv_nf + inv_nf ), 0.0f ),
+        Vector4( ( -sum_rl * inv_rl ), ( -sum_tb * inv_tb ), ( sum_nf * inv_nf ), 1.0f )
+    );
+}
+
+inline const Matrix4 select( const Matrix4 & mat0, const Matrix4 & mat1, bool select1 )
+{
+    return Matrix4(
+        select( mat0.getCol0(), mat1.getCol0(), select1 ),
+        select( mat0.getCol1(), mat1.getCol1(), select1 ),
+        select( mat0.getCol2(), mat1.getCol2(), select1 ),
+        select( mat0.getCol3(), mat1.getCol3(), select1 )
+    );
+}
+
+#ifdef _VECTORMATH_DEBUG
+
+inline void print( const Matrix4 & mat )
+{
+    print( mat.getRow( 0 ) );
+    print( mat.getRow( 1 ) );
+    print( mat.getRow( 2 ) );
+    print( mat.getRow( 3 ) );
+}
+
+inline void print( const Matrix4 & mat, const char * name )
+{
+    printf("%s:\n", name);
+    print( mat );
+}
+
+#endif
+
+inline Transform3::Transform3( const Transform3 & tfrm )
+{
+    mCol0 = tfrm.mCol0;
+    mCol1 = tfrm.mCol1;
+    mCol2 = tfrm.mCol2;
+    mCol3 = tfrm.mCol3;
+}
+
+inline Transform3::Transform3( float scalar )
+{
+    mCol0 = Vector3( scalar );
+    mCol1 = Vector3( scalar );
+    mCol2 = Vector3( scalar );
+    mCol3 = Vector3( scalar );
+}
+
+inline Transform3::Transform3( const Vector3 & _col0, const Vector3 & _col1, const Vector3 & _col2, const Vector3 & _col3 )
+{
+    mCol0 = _col0;
+    mCol1 = _col1;
+    mCol2 = _col2;
+    mCol3 = _col3;
+}
+
+inline Transform3::Transform3( const Matrix3 & tfrm, const Vector3 & translateVec )
+{
+    this->setUpper3x3( tfrm );
+    this->setTranslation( translateVec );
+}
+
+inline Transform3::Transform3( const Quat & unitQuat, const Vector3 & translateVec )
+{
+    this->setUpper3x3( Matrix3( unitQuat ) );
+    this->setTranslation( translateVec );
+}
+
+inline Transform3 & Transform3::setCol0( const Vector3 & _col0 )
+{
+    mCol0 = _col0;
+    return *this;
+}
+
+inline Transform3 & Transform3::setCol1( const Vector3 & _col1 )
+{
+    mCol1 = _col1;
+    return *this;
+}
+
+inline Transform3 & Transform3::setCol2( const Vector3 & _col2 )
+{
+    mCol2 = _col2;
+    return *this;
+}
+
+inline Transform3 & Transform3::setCol3( const Vector3 & _col3 )
+{
+    mCol3 = _col3;
+    return *this;
+}
+
+inline Transform3 & Transform3::setCol( int col, const Vector3 & vec )
+{
+    *(&mCol0 + col) = vec;
+    return *this;
+}
+
+inline Transform3 & Transform3::setRow( int row, const Vector4 & vec )
+{
+    mCol0.setElem( row, vec.getElem( 0 ) );
+    mCol1.setElem( row, vec.getElem( 1 ) );
+    mCol2.setElem( row, vec.getElem( 2 ) );
+    mCol3.setElem( row, vec.getElem( 3 ) );
+    return *this;
+}
+
+inline Transform3 & Transform3::setElem( int col, int row, float val )
+{
+    Vector3 tmpV3_0;
+    tmpV3_0 = this->getCol( col );
+    tmpV3_0.setElem( row, val );
+    this->setCol( col, tmpV3_0 );
+    return *this;
+}
+
+inline float Transform3::getElem( int col, int row ) const
+{
+    return this->getCol( col ).getElem( row );
+}
+
+inline const Vector3 Transform3::getCol0( ) const
+{
+    return mCol0;
+}
+
+inline const Vector3 Transform3::getCol1( ) const
+{
+    return mCol1;
+}
+
+inline const Vector3 Transform3::getCol2( ) const
+{
+    return mCol2;
+}
+
+inline const Vector3 Transform3::getCol3( ) const
+{
+    return mCol3;
+}
+
+inline const Vector3 Transform3::getCol( int col ) const
+{
+    return *(&mCol0 + col);
+}
+
+inline const Vector4 Transform3::getRow( int row ) const
+{
+    return Vector4( mCol0.getElem( row ), mCol1.getElem( row ), mCol2.getElem( row ), mCol3.getElem( row ) );
+}
+
+inline Vector3 & Transform3::operator []( int col )
+{
+    return *(&mCol0 + col);
+}
+
+inline const Vector3 Transform3::operator []( int col ) const
+{
+    return *(&mCol0 + col);
+}
+
+inline Transform3 & Transform3::operator =( const Transform3 & tfrm )
+{
+    mCol0 = tfrm.mCol0;
+    mCol1 = tfrm.mCol1;
+    mCol2 = tfrm.mCol2;
+    mCol3 = tfrm.mCol3;
+    return *this;
+}
+
+inline const Transform3 inverse( const Transform3 & tfrm )
+{
+    Vector3 tmp0, tmp1, tmp2, inv0, inv1, inv2;
+    float detinv;
+    tmp0 = cross( tfrm.getCol1(), tfrm.getCol2() );
+    tmp1 = cross( tfrm.getCol2(), tfrm.getCol0() );
+    tmp2 = cross( tfrm.getCol0(), tfrm.getCol1() );
+    detinv = ( 1.0f / dot( tfrm.getCol2(), tmp2 ) );
+    inv0 = Vector3( ( tmp0.getX() * detinv ), ( tmp1.getX() * detinv ), ( tmp2.getX() * detinv ) );
+    inv1 = Vector3( ( tmp0.getY() * detinv ), ( tmp1.getY() * detinv ), ( tmp2.getY() * detinv ) );
+    inv2 = Vector3( ( tmp0.getZ() * detinv ), ( tmp1.getZ() * detinv ), ( tmp2.getZ() * detinv ) );
+    return Transform3(
+        inv0,
+        inv1,
+        inv2,
+        Vector3( ( -( ( inv0 * tfrm.getCol3().getX() ) + ( ( inv1 * tfrm.getCol3().getY() ) + ( inv2 * tfrm.getCol3().getZ() ) ) ) ) )
+    );
+}
+
+inline const Transform3 orthoInverse( const Transform3 & tfrm )
+{
+    Vector3 inv0, inv1, inv2;
+    inv0 = Vector3( tfrm.getCol0().getX(), tfrm.getCol1().getX(), tfrm.getCol2().getX() );
+    inv1 = Vector3( tfrm.getCol0().getY(), tfrm.getCol1().getY(), tfrm.getCol2().getY() );
+    inv2 = Vector3( tfrm.getCol0().getZ(), tfrm.getCol1().getZ(), tfrm.getCol2().getZ() );
+    return Transform3(
+        inv0,
+        inv1,
+        inv2,
+        Vector3( ( -( ( inv0 * tfrm.getCol3().getX() ) + ( ( inv1 * tfrm.getCol3().getY() ) + ( inv2 * tfrm.getCol3().getZ() ) ) ) ) )
+    );
+}
+
+inline const Transform3 absPerElem( const Transform3 & tfrm )
+{
+    return Transform3(
+        absPerElem( tfrm.getCol0() ),
+        absPerElem( tfrm.getCol1() ),
+        absPerElem( tfrm.getCol2() ),
+        absPerElem( tfrm.getCol3() )
+    );
+}
+
+inline const Vector3 Transform3::operator *( const Vector3 & vec ) const
+{
+    return Vector3(
+        ( ( ( mCol0.getX() * vec.getX() ) + ( mCol1.getX() * vec.getY() ) ) + ( mCol2.getX() * vec.getZ() ) ),
+        ( ( ( mCol0.getY() * vec.getX() ) + ( mCol1.getY() * vec.getY() ) ) + ( mCol2.getY() * vec.getZ() ) ),
+        ( ( ( mCol0.getZ() * vec.getX() ) + ( mCol1.getZ() * vec.getY() ) ) + ( mCol2.getZ() * vec.getZ() ) )
+    );
+}
+
+inline const Point3 Transform3::operator *( const Point3 & pnt ) const
+{
+    return Point3(
+        ( ( ( ( mCol0.getX() * pnt.getX() ) + ( mCol1.getX() * pnt.getY() ) ) + ( mCol2.getX() * pnt.getZ() ) ) + mCol3.getX() ),
+        ( ( ( ( mCol0.getY() * pnt.getX() ) + ( mCol1.getY() * pnt.getY() ) ) + ( mCol2.getY() * pnt.getZ() ) ) + mCol3.getY() ),
+        ( ( ( ( mCol0.getZ() * pnt.getX() ) + ( mCol1.getZ() * pnt.getY() ) ) + ( mCol2.getZ() * pnt.getZ() ) ) + mCol3.getZ() )
+    );
+}
+
+inline const Transform3 Transform3::operator *( const Transform3 & tfrm ) const
+{
+    return Transform3(
+        ( *this * tfrm.mCol0 ),
+        ( *this * tfrm.mCol1 ),
+        ( *this * tfrm.mCol2 ),
+        Vector3( ( *this * Point3( tfrm.mCol3 ) ) )
+    );
+}
+
+inline Transform3 & Transform3::operator *=( const Transform3 & tfrm )
+{
+    *this = *this * tfrm;
+    return *this;
+}
+
+inline const Transform3 mulPerElem( const Transform3 & tfrm0, const Transform3 & tfrm1 )
+{
+    return Transform3(
+        mulPerElem( tfrm0.getCol0(), tfrm1.getCol0() ),
+        mulPerElem( tfrm0.getCol1(), tfrm1.getCol1() ),
+        mulPerElem( tfrm0.getCol2(), tfrm1.getCol2() ),
+        mulPerElem( tfrm0.getCol3(), tfrm1.getCol3() )
+    );
+}
+
+inline const Transform3 Transform3::identity( )
+{
+    return Transform3(
+        Vector3::xAxis( ),
+        Vector3::yAxis( ),
+        Vector3::zAxis( ),
+        Vector3( 0.0f )
+    );
+}
+
+inline Transform3 & Transform3::setUpper3x3( const Matrix3 & tfrm )
+{
+    mCol0 = tfrm.getCol0();
+    mCol1 = tfrm.getCol1();
+    mCol2 = tfrm.getCol2();
+    return *this;
+}
+
+inline const Matrix3 Transform3::getUpper3x3( ) const
+{
+    return Matrix3( mCol0, mCol1, mCol2 );
+}
+
+inline Transform3 & Transform3::setTranslation( const Vector3 & translateVec )
+{
+    mCol3 = translateVec;
+    return *this;
+}
+
+inline const Vector3 Transform3::getTranslation( ) const
+{
+    return mCol3;
+}
+
+inline const Transform3 Transform3::rotationX( float radians )
+{
+    float s, c;
+    s = sinf( radians );
+    c = cosf( radians );
+    return Transform3(
+        Vector3::xAxis( ),
+        Vector3( 0.0f, c, s ),
+        Vector3( 0.0f, -s, c ),
+        Vector3( 0.0f )
+    );
+}
+
+inline const Transform3 Transform3::rotationY( float radians )
+{
+    float s, c;
+    s = sinf( radians );
+    c = cosf( radians );
+    return Transform3(
+        Vector3( c, 0.0f, -s ),
+        Vector3::yAxis( ),
+        Vector3( s, 0.0f, c ),
+        Vector3( 0.0f )
+    );
+}
+
+inline const Transform3 Transform3::rotationZ( float radians )
+{
+    float s, c;
+    s = sinf( radians );
+    c = cosf( radians );
+    return Transform3(
+        Vector3( c, s, 0.0f ),
+        Vector3( -s, c, 0.0f ),
+        Vector3::zAxis( ),
+        Vector3( 0.0f )
+    );
+}
+
+inline const Transform3 Transform3::rotationZYX( const Vector3 & radiansXYZ )
+{
+    float sX, cX, sY, cY, sZ, cZ, tmp0, tmp1;
+    sX = sinf( radiansXYZ.getX() );
+    cX = cosf( radiansXYZ.getX() );
+    sY = sinf( radiansXYZ.getY() );
+    cY = cosf( radiansXYZ.getY() );
+    sZ = sinf( radiansXYZ.getZ() );
+    cZ = cosf( radiansXYZ.getZ() );
+    tmp0 = ( cZ * sY );
+    tmp1 = ( sZ * sY );
+    return Transform3(
+        Vector3( ( cZ * cY ), ( sZ * cY ), -sY ),
+        Vector3( ( ( tmp0 * sX ) - ( sZ * cX ) ), ( ( tmp1 * sX ) + ( cZ * cX ) ), ( cY * sX ) ),
+        Vector3( ( ( tmp0 * cX ) + ( sZ * sX ) ), ( ( tmp1 * cX ) - ( cZ * sX ) ), ( cY * cX ) ),
+        Vector3( 0.0f )
+    );
+}
+
+inline const Transform3 Transform3::rotation( float radians, const Vector3 & unitVec )
+{
+    return Transform3( Matrix3::rotation( radians, unitVec ), Vector3( 0.0f ) );
+}
+
+inline const Transform3 Transform3::rotation( const Quat & unitQuat )
+{
+    return Transform3( Matrix3( unitQuat ), Vector3( 0.0f ) );
+}
+
+inline const Transform3 Transform3::scale( const Vector3 & scaleVec )
+{
+    return Transform3(
+        Vector3( scaleVec.getX(), 0.0f, 0.0f ),
+        Vector3( 0.0f, scaleVec.getY(), 0.0f ),
+        Vector3( 0.0f, 0.0f, scaleVec.getZ() ),
+        Vector3( 0.0f )
+    );
+}
+
+inline const Transform3 appendScale( const Transform3 & tfrm, const Vector3 & scaleVec )
+{
+    return Transform3(
+        ( tfrm.getCol0() * scaleVec.getX( ) ),
+        ( tfrm.getCol1() * scaleVec.getY( ) ),
+        ( tfrm.getCol2() * scaleVec.getZ( ) ),
+        tfrm.getCol3()
+    );
+}
+
+inline const Transform3 prependScale( const Vector3 & scaleVec, const Transform3 & tfrm )
+{
+    return Transform3(
+        mulPerElem( tfrm.getCol0(), scaleVec ),
+        mulPerElem( tfrm.getCol1(), scaleVec ),
+        mulPerElem( tfrm.getCol2(), scaleVec ),
+        mulPerElem( tfrm.getCol3(), scaleVec )
+    );
+}
+
+inline const Transform3 Transform3::translation( const Vector3 & translateVec )
+{
+    return Transform3(
+        Vector3::xAxis( ),
+        Vector3::yAxis( ),
+        Vector3::zAxis( ),
+        translateVec
+    );
+}
+
+inline const Transform3 select( const Transform3 & tfrm0, const Transform3 & tfrm1, bool select1 )
+{
+    return Transform3(
+        select( tfrm0.getCol0(), tfrm1.getCol0(), select1 ),
+        select( tfrm0.getCol1(), tfrm1.getCol1(), select1 ),
+        select( tfrm0.getCol2(), tfrm1.getCol2(), select1 ),
+        select( tfrm0.getCol3(), tfrm1.getCol3(), select1 )
+    );
+}
+
+#ifdef _VECTORMATH_DEBUG
+
+inline void print( const Transform3 & tfrm )
+{
+    print( tfrm.getRow( 0 ) );
+    print( tfrm.getRow( 1 ) );
+    print( tfrm.getRow( 2 ) );
+}
+
+inline void print( const Transform3 & tfrm, const char * name )
+{
+    printf("%s:\n", name);
+    print( tfrm );
+}
+
+#endif
+
+inline Quat::Quat( const Matrix3 & tfrm )
+{
+    float trace, radicand, scale, xx, yx, zx, xy, yy, zy, xz, yz, zz, tmpx, tmpy, tmpz, tmpw, qx, qy, qz, qw;
+    int negTrace, ZgtX, ZgtY, YgtX;
+    int largestXorY, largestYorZ, largestZorX;
+
+    xx = tfrm.getCol0().getX();
+    yx = tfrm.getCol0().getY();
+    zx = tfrm.getCol0().getZ();
+    xy = tfrm.getCol1().getX();
+    yy = tfrm.getCol1().getY();
+    zy = tfrm.getCol1().getZ();
+    xz = tfrm.getCol2().getX();
+    yz = tfrm.getCol2().getY();
+    zz = tfrm.getCol2().getZ();
+
+    trace = ( ( xx + yy ) + zz );
+
+    negTrace = ( trace < 0.0f );
+    ZgtX = zz > xx;
+    ZgtY = zz > yy;
+    YgtX = yy > xx;
+    largestXorY = ( !ZgtX || !ZgtY ) && negTrace;
+    largestYorZ = ( YgtX || ZgtX ) && negTrace;
+    largestZorX = ( ZgtY || !YgtX ) && negTrace;
+    
+    if ( largestXorY )
+    {
+        zz = -zz;
+        xy = -xy;
+    }
+    if ( largestYorZ )
+    {
+        xx = -xx;
+        yz = -yz;
+    }
+    if ( largestZorX )
+    {
+        yy = -yy;
+        zx = -zx;
+    }
+
+    radicand = ( ( ( xx + yy ) + zz ) + 1.0f );
+    scale = ( 0.5f * ( 1.0f / sqrtf( radicand ) ) );
+
+    tmpx = ( ( zy - yz ) * scale );
+    tmpy = ( ( xz - zx ) * scale );
+    tmpz = ( ( yx - xy ) * scale );
+    tmpw = ( radicand * scale );
+    qx = tmpx;
+    qy = tmpy;
+    qz = tmpz;
+    qw = tmpw;
+
+    if ( largestXorY )
+    {
+        qx = tmpw;
+        qy = tmpz;
+        qz = tmpy;
+        qw = tmpx;
+    }
+    if ( largestYorZ )
+    {
+        tmpx = qx;
+        tmpz = qz;
+        qx = qy;
+        qy = tmpx;
+        qz = qw;
+        qw = tmpz;
+    }
+
+    mXYZW[0] = qx;
+    mXYZW[1] = qy;
+    mXYZW[2] = qz;
+    mXYZW[3] = qw;
+}
+
+inline const Matrix3 outer( const Vector3 & tfrm0, const Vector3 & tfrm1 )
+{
+    return Matrix3(
+        ( tfrm0 * tfrm1.getX( ) ),
+        ( tfrm0 * tfrm1.getY( ) ),
+        ( tfrm0 * tfrm1.getZ( ) )
+    );
+}
+
+inline const Matrix4 outer( const Vector4 & tfrm0, const Vector4 & tfrm1 )
+{
+    return Matrix4(
+        ( tfrm0 * tfrm1.getX( ) ),
+        ( tfrm0 * tfrm1.getY( ) ),
+        ( tfrm0 * tfrm1.getZ( ) ),
+        ( tfrm0 * tfrm1.getW( ) )
+    );
+}
+
+inline const Vector3 rowMul( const Vector3 & vec, const Matrix3 & mat )
+{
+    return Vector3(
+        ( ( ( vec.getX() * mat.getCol0().getX() ) + ( vec.getY() * mat.getCol0().getY() ) ) + ( vec.getZ() * mat.getCol0().getZ() ) ),
+        ( ( ( vec.getX() * mat.getCol1().getX() ) + ( vec.getY() * mat.getCol1().getY() ) ) + ( vec.getZ() * mat.getCol1().getZ() ) ),
+        ( ( ( vec.getX() * mat.getCol2().getX() ) + ( vec.getY() * mat.getCol2().getY() ) ) + ( vec.getZ() * mat.getCol2().getZ() ) )
+    );
+}
+
+inline const Matrix3 crossMatrix( const Vector3 & vec )
+{
+    return Matrix3(
+        Vector3( 0.0f, vec.getZ(), -vec.getY() ),
+        Vector3( -vec.getZ(), 0.0f, vec.getX() ),
+        Vector3( vec.getY(), -vec.getX(), 0.0f )
+    );
+}
+
+inline const Matrix3 crossMatrixMul( const Vector3 & vec, const Matrix3 & mat )
+{
+    return Matrix3( cross( vec, mat.getCol0() ), cross( vec, mat.getCol1() ), cross( vec, mat.getCol2() ) );
+}
+
+} // namespace Aos
+} // namespace Vectormath
+
+#endif
+
diff --git a/test/Bullet2/vectormath/neon/quat_aos.h b/test/Bullet2/vectormath/neon/quat_aos.h
new file mode 100644
index 000000000..d06184603
--- /dev/null
+++ b/test/Bullet2/vectormath/neon/quat_aos.h
@@ -0,0 +1,413 @@
+/*
+   Copyright (C) 2009 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+
+*/
+
+#ifndef _VECTORMATH_QUAT_AOS_CPP_H
+#define _VECTORMATH_QUAT_AOS_CPP_H
+
+//-----------------------------------------------------------------------------
+// Definitions
+
+#ifndef _VECTORMATH_INTERNAL_FUNCTIONS
+#define _VECTORMATH_INTERNAL_FUNCTIONS
+
+#endif
+
+namespace Vectormath {
+namespace Aos {
+
+    inline Quat::Quat( const Quat & quat )
+    {        
+        vXYZW = quat.vXYZW;
+    }
+    
+    inline Quat::Quat( float _x, float _y, float _z, float _w )
+    {        
+        mXYZW[0] = _x;
+        mXYZW[1] = _y;
+        mXYZW[2] = _z;
+        mXYZW[3] = _w;
+    }
+    
+    inline Quat::Quat( float32x4_t fXYZW )  
+    {        
+        vXYZW = fXYZW;
+    }
+    
+    inline Quat::Quat( const Vector3 & xyz, float _w )
+    {        
+        this->setXYZ( xyz );
+        this->setW( _w );
+    }
+    
+    inline Quat::Quat( const Vector4 & vec )
+    {        
+        mXYZW[0] = vec.getX();
+        mXYZW[1] = vec.getY();
+        mXYZW[2] = vec.getZ();
+        mXYZW[3] = vec.getW();
+    }
+    
+    inline Quat::Quat( float scalar )  
+    {        
+        vXYZW = vdupq_n_f32(scalar);
+    }
+    
+    inline const Quat Quat::identity( )
+    {        
+        return Quat( 0.0f, 0.0f, 0.0f, 1.0f );
+    }
+    
+    inline const Quat lerp( float t, const Quat & quat0, const Quat & quat1 )
+    {        
+        return ( quat0 + ( ( quat1 - quat0 ) * t ) );
+    }
+    
+    inline const Quat slerp( float t, const Quat & unitQuat0, const Quat & unitQuat1 )
+    {
+        Quat start;
+        float recipSinAngle, scale0, scale1, cosAngle, angle;
+        cosAngle = dot( unitQuat0, unitQuat1 );
+        if ( cosAngle < 0.0f ) {
+            cosAngle = -cosAngle;
+            start = ( -unitQuat0 );
+        } else {
+            start = unitQuat0;
+        }
+        if ( cosAngle < _VECTORMATH_SLERP_TOL ) {
+            angle = acosf( cosAngle );
+            recipSinAngle = ( 1.0f / sinf( angle ) );
+            scale0 = ( sinf( ( ( 1.0f - t ) * angle ) ) * recipSinAngle );
+            scale1 = ( sinf( ( t * angle ) ) * recipSinAngle );
+        } else {
+            scale0 = ( 1.0f - t );
+            scale1 = t;
+        }
+        return ( ( start * scale0 ) + ( unitQuat1 * scale1 ) );
+    }
+    
+    inline const Quat squad( float t, const Quat & unitQuat0, const Quat & unitQuat1, const Quat & unitQuat2, const Quat & unitQuat3 )
+    {        
+        Quat tmp0, tmp1;
+        tmp0 = slerp( t, unitQuat0, unitQuat3 );
+        tmp1 = slerp( t, unitQuat1, unitQuat2 );
+        return slerp( ( ( 2.0f * t ) * ( 1.0f - t ) ), tmp0, tmp1 );
+    }
+    
+    inline void loadXYZW( Quat & quat, const float * fptr )
+    {        
+        quat = Quat( fptr[0], fptr[1], fptr[2], fptr[3] );
+    }
+    
+    inline void storeXYZW( const Quat & quat, float * fptr )
+    {        
+        vst1q_f32(fptr, quat.getvXYZW());
+    }
+    
+    inline Quat & Quat::operator =( const Quat & quat )
+    {        
+        vXYZW = quat.getvXYZW();
+        return *this;
+    }
+    
+    inline Quat & Quat::setXYZ( const Vector3 & vec )
+    {        
+        mXYZW[0] = vec.getX();
+        mXYZW[1] = vec.getY();
+        mXYZW[2] = vec.getZ();
+        return *this;
+    }
+    
+    inline const Vector3 Quat::getXYZ( ) const
+    {        
+        return Vector3( mXYZW[0], mXYZW[1], mXYZW[2] );
+    }
+    
+    inline float32x4_t Quat::getvXYZW( ) const
+    {        
+        return vXYZW;
+    }
+    
+    inline Quat & Quat::setX( float _x )
+    {        
+        mXYZW[0] = _x;
+        return *this;
+    }
+    
+    inline float Quat::getX( ) const
+    {        
+        return mXYZW[0];
+    }
+    
+    inline Quat & Quat::setY( float _y )
+    {        
+        mXYZW[1] = _y;
+        return *this;
+    }
+    
+    inline float Quat::getY( ) const
+    {        
+        return mXYZW[1];
+    }
+    
+    inline Quat & Quat::setZ( float _z )
+    {        
+        mXYZW[2] = _z;
+        return *this;
+    }
+    
+    inline float Quat::getZ( ) const
+    {        
+        return mXYZW[2];
+    }
+    
+    inline Quat & Quat::setW( float _w )
+    {        
+        mXYZW[3] = _w;
+        return *this;
+    }
+    
+    inline float Quat::getW( ) const
+    {        
+        return mXYZW[3];
+    }
+    
+    inline Quat & Quat::setElem( int idx, float value )
+    {        
+        *(&mXYZW[0] + idx) = value;
+        return *this;
+    }
+    
+    inline float Quat::getElem( int idx ) const
+    {        
+        return *(&mXYZW[0] + idx);
+    }
+    
+    inline float & Quat::operator []( int idx )
+    {        
+        return *(&mXYZW[0] + idx);
+    }
+    
+    inline float Quat::operator []( int idx ) const
+    {        
+        return *(&mXYZW[0] + idx);
+    }
+    
+    inline const Quat Quat::operator +( const Quat & quat ) const
+    {        
+        return Quat( vaddq_f32(vXYZW, quat.vXYZW) );
+    }
+    
+    inline const Quat Quat::operator -( const Quat & quat ) const
+    {        
+        return Quat( vsubq_f32(vXYZW, quat.vXYZW) );
+    }
+    
+    inline const Quat Quat::operator *( float scalar ) const
+    {        
+        float32x4_t v_scalar = vdupq_n_f32(scalar);
+        return Quat( vmulq_f32(vXYZW, v_scalar) );
+    }
+    
+    inline Quat & Quat::operator +=( const Quat & quat )
+    {        
+        *this = *this + quat;
+        return *this;
+    }
+    
+    inline Quat & Quat::operator -=( const Quat & quat )
+    {
+        *this = *this - quat;
+        return *this;
+    }
+    
+    inline Quat & Quat::operator *=( float scalar )
+    {        
+        *this = *this * scalar;
+        return *this;
+    }
+    
+    inline const Quat Quat::operator /( float scalar ) const
+    {        
+        return Quat(
+                    ( mXYZW[0] / scalar ),
+                    ( mXYZW[1] / scalar ),
+                    ( mXYZW[2] / scalar ),
+                    ( mXYZW[3] / scalar )
+                    );
+    }
+    
+    inline Quat & Quat::operator /=( float scalar )
+    {        
+        *this = *this / scalar;
+        return *this;
+    }
+    
+    inline const Quat Quat::operator -( ) const
+    {        
+        return Quat( vnegq_f32(vXYZW) );
+    }
+    
+    inline const Quat operator *( float scalar, const Quat & quat )
+    {        
+        return quat * scalar;
+    }
+    
+    inline float dot( const Quat & quat0, const Quat & quat1 )
+    {        
+        float result;
+        result = ( quat0.getX() * quat1.getX() );
+        result = ( result + ( quat0.getY() * quat1.getY() ) );
+        result = ( result + ( quat0.getZ() * quat1.getZ() ) );
+        result = ( result + ( quat0.getW() * quat1.getW() ) );
+        return result;
+    }
+    
+    inline float norm( const Quat & quat )
+    {        
+        float result;
+        result = ( quat.getX() * quat.getX() );
+        result = ( result + ( quat.getY() * quat.getY() ) );
+        result = ( result + ( quat.getZ() * quat.getZ() ) );
+        result = ( result + ( quat.getW() * quat.getW() ) );
+        return result;
+    }
+    
+    inline float length( const Quat & quat )
+    {        
+        return ::sqrtf( norm( quat ) );
+    }
+    
+    inline const Quat normalize( const Quat & quat )
+    {        
+        float lenSqr, lenInv;
+        lenSqr = norm( quat );
+        lenInv = ( 1.0f / sqrtf( lenSqr ) );
+        return Quat(
+                    ( quat.getX() * lenInv ),
+                    ( quat.getY() * lenInv ),
+                    ( quat.getZ() * lenInv ),
+                    ( quat.getW() * lenInv )
+                    );
+    }
+    
+    inline const Quat Quat::rotation( const Vector3 & unitVec0, const Vector3 & unitVec1 )
+    {        
+        float cosHalfAngleX2, recipCosHalfAngleX2;
+        cosHalfAngleX2 = sqrtf( ( 2.0f * ( 1.0f + dot( unitVec0, unitVec1 ) ) ) );
+        recipCosHalfAngleX2 = ( 1.0f / cosHalfAngleX2 );
+        return Quat( ( cross( unitVec0, unitVec1 ) * recipCosHalfAngleX2 ), ( cosHalfAngleX2 * 0.5f ) );
+    }
+    
+    inline const Quat Quat::rotation( float radians, const Vector3 & unitVec )
+    {        
+        float s, c, angle;
+        angle = ( radians * 0.5f );
+        s = sinf( angle );
+        c = cosf( angle );
+        return Quat( ( unitVec * s ), c );
+    }
+    
+    inline const Quat Quat::rotationX( float radians )
+    {        
+        float s, c, angle;
+        angle = ( radians * 0.5f );
+        s = sinf( angle );
+        c = cosf( angle );
+        return Quat( s, 0.0f, 0.0f, c );
+    }
+    
+    inline const Quat Quat::rotationY( float radians )
+    {        
+        float s, c, angle;
+        angle = ( radians * 0.5f );
+        s = sinf( angle );
+        c = cosf( angle );
+        return Quat( 0.0f, s, 0.0f, c );
+    }
+    
+    inline const Quat Quat::rotationZ( float radians )
+    {        
+        float s, c, angle;
+        angle = ( radians * 0.5f );
+        s = sinf( angle );
+        c = cosf( angle );
+        return Quat( 0.0f, 0.0f, s, c );
+    }
+    
+    inline const Quat Quat::operator *( const Quat & quat ) const
+    {        
+        return Quat(
+                    ( ( ( ( mXYZW[3] * quat.mXYZW[0] ) + ( mXYZW[0] * quat.mXYZW[3] ) ) + ( mXYZW[1] * quat.mXYZW[2] ) ) - ( mXYZW[2] * quat.mXYZW[1] ) ),
+                    ( ( ( ( mXYZW[3] * quat.mXYZW[1] ) + ( mXYZW[1] * quat.mXYZW[3] ) ) + ( mXYZW[2] * quat.mXYZW[0] ) ) - ( mXYZW[0] * quat.mXYZW[2] ) ),
+                    ( ( ( ( mXYZW[3] * quat.mXYZW[2] ) + ( mXYZW[2] * quat.mXYZW[3] ) ) + ( mXYZW[0] * quat.mXYZW[1] ) ) - ( mXYZW[1] * quat.mXYZW[0] ) ),
+                    ( ( ( ( mXYZW[3] * quat.mXYZW[3] ) - ( mXYZW[0] * quat.mXYZW[0] ) ) - ( mXYZW[1] * quat.mXYZW[1] ) ) - ( mXYZW[2] * quat.mXYZW[2] ) )
+                    );
+    }
+    
+    inline Quat & Quat::operator *=( const Quat & quat )
+    {        
+        *this = *this * quat;
+        return *this;
+    }
+    
+    inline const Vector3 rotate( const Quat & quat, const Vector3 & vec )
+    {
+        float tmpX, tmpY, tmpZ, tmpW;
+        tmpX = ( ( ( quat.getW() * vec.getX() ) + ( quat.getY() * vec.getZ() ) ) - ( quat.getZ() * vec.getY() ) );
+        tmpY = ( ( ( quat.getW() * vec.getY() ) + ( quat.getZ() * vec.getX() ) ) - ( quat.getX() * vec.getZ() ) );
+        tmpZ = ( ( ( quat.getW() * vec.getZ() ) + ( quat.getX() * vec.getY() ) ) - ( quat.getY() * vec.getX() ) );
+        tmpW = ( ( ( quat.getX() * vec.getX() ) + ( quat.getY() * vec.getY() ) ) + ( quat.getZ() * vec.getZ() ) );
+        return Vector3(
+                       ( ( ( ( tmpW * quat.getX() ) + ( tmpX * quat.getW() ) ) - ( tmpY * quat.getZ() ) ) + ( tmpZ * quat.getY() ) ),
+                       ( ( ( ( tmpW * quat.getY() ) + ( tmpY * quat.getW() ) ) - ( tmpZ * quat.getX() ) ) + ( tmpX * quat.getZ() ) ),
+                       ( ( ( ( tmpW * quat.getZ() ) + ( tmpZ * quat.getW() ) ) - ( tmpX * quat.getY() ) ) + ( tmpY * quat.getX() ) )
+                       );
+    }
+    
+    inline const Quat conj( const Quat & quat )
+    {        
+        return Quat( -quat.getX(), -quat.getY(), -quat.getZ(), quat.getW() );
+    }
+    
+    inline const Quat select( const Quat & quat0, const Quat & quat1, bool select1 )
+    {
+        return Quat(
+                    ( select1 )? quat1.getX() : quat0.getX(),
+                    ( select1 )? quat1.getY() : quat0.getY(),
+                    ( select1 )? quat1.getZ() : quat0.getZ(),
+                    ( select1 )? quat1.getW() : quat0.getW()
+                    );
+    }
+
+#ifdef _VECTORMATH_DEBUG
+
+inline void print( const Quat & quat )
+{
+    printf( "( %f %f %f %f )\n", quat.getX(), quat.getY(), quat.getZ(), quat.getW() );
+}
+
+inline void print( const Quat & quat, const char * name )
+{
+    printf( "%s: ( %f %f %f %f )\n", name, quat.getX(), quat.getY(), quat.getZ(), quat.getW() );
+}
+
+#endif
+
+} // namespace Aos
+} // namespace Vectormath
+
+#endif
+
diff --git a/test/Bullet2/vectormath/neon/vec_aos.h b/test/Bullet2/vectormath/neon/vec_aos.h
new file mode 100644
index 000000000..7bcf8dbec
--- /dev/null
+++ b/test/Bullet2/vectormath/neon/vec_aos.h
@@ -0,0 +1,1427 @@
+/*
+   Copyright (C) 2009 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+
+*/
+
+#ifndef _VECTORMATH_VEC_AOS_CPP_H
+#define _VECTORMATH_VEC_AOS_CPP_H
+
+//-----------------------------------------------------------------------------
+// Constants
+
+#define _VECTORMATH_SLERP_TOL 0.999f
+
+//-----------------------------------------------------------------------------
+// Definitions
+
+#ifndef _VECTORMATH_INTERNAL_FUNCTIONS
+#define _VECTORMATH_INTERNAL_FUNCTIONS
+
+#endif
+
+namespace Vectormath {
+namespace Aos {
+
+inline Vector3::Vector3( const Vector3 & vec )
+{
+    mX = vec.mX;
+    mY = vec.mY;
+    mZ = vec.mZ;
+}
+
+inline Vector3::Vector3( float _x, float _y, float _z )
+{
+    mX = _x;
+    mY = _y;
+    mZ = _z;
+}
+
+inline Vector3::Vector3( const Point3 & pnt )
+{
+    mX = pnt.getX();
+    mY = pnt.getY();
+    mZ = pnt.getZ();
+}
+
+inline Vector3::Vector3( float scalar )
+{
+    mX = scalar;
+    mY = scalar;
+    mZ = scalar;
+}
+
+inline const Vector3 Vector3::xAxis( )
+{
+    return Vector3( 1.0f, 0.0f, 0.0f );
+}
+
+inline const Vector3 Vector3::yAxis( )
+{
+    return Vector3( 0.0f, 1.0f, 0.0f );
+}
+
+inline const Vector3 Vector3::zAxis( )
+{
+    return Vector3( 0.0f, 0.0f, 1.0f );
+}
+
+inline const Vector3 lerp( float t, const Vector3 & vec0, const Vector3 & vec1 )
+{
+    return ( vec0 + ( ( vec1 - vec0 ) * t ) );
+}
+
+inline const Vector3 slerp( float t, const Vector3 & unitVec0, const Vector3 & unitVec1 )
+{
+    float recipSinAngle, scale0, scale1, cosAngle, angle;
+    cosAngle = dot( unitVec0, unitVec1 );
+    if ( cosAngle < _VECTORMATH_SLERP_TOL ) {
+        angle = acosf( cosAngle );
+        recipSinAngle = ( 1.0f / sinf( angle ) );
+        scale0 = ( sinf( ( ( 1.0f - t ) * angle ) ) * recipSinAngle );
+        scale1 = ( sinf( ( t * angle ) ) * recipSinAngle );
+    } else {
+        scale0 = ( 1.0f - t );
+        scale1 = t;
+    }
+    return ( ( unitVec0 * scale0 ) + ( unitVec1 * scale1 ) );
+}
+
+inline void loadXYZ( Vector3 & vec, const float * fptr )
+{
+    vec = Vector3( fptr[0], fptr[1], fptr[2] );
+}
+
+inline void storeXYZ( const Vector3 & vec, float * fptr )
+{
+    fptr[0] = vec.getX();
+    fptr[1] = vec.getY();
+    fptr[2] = vec.getZ();
+}
+
+inline void loadHalfFloats( Vector3 & vec, const unsigned short * hfptr )
+{
+    union Data32 {
+        unsigned int u32;
+        float f32;
+    };
+
+    for (int i = 0; i < 3; i++) {
+        unsigned short fp16 = hfptr[i];
+        unsigned int sign = fp16 >> 15;
+        unsigned int exponent = (fp16 >> 10) & ((1 << 5) - 1);
+        unsigned int mantissa = fp16 & ((1 << 10) - 1);
+
+        if (exponent == 0) {
+            // zero
+            mantissa = 0;
+
+        } else if (exponent == 31) {
+            // infinity or nan -> infinity
+            exponent = 255;
+	    mantissa = 0;
+
+        } else {
+            exponent += 127 - 15;
+            mantissa <<= 13;
+        }
+
+        Data32 d;
+        d.u32 = (sign << 31) | (exponent << 23) | mantissa;
+        vec[i] = d.f32;
+    }
+}
+
+inline void storeHalfFloats( const Vector3 & vec, unsigned short * hfptr )
+{
+    union Data32 {
+        unsigned int u32;
+        float f32;
+    };
+
+    for (int i = 0; i < 3; i++) {
+        Data32 d;
+        d.f32 = vec[i];
+
+        unsigned int sign = d.u32 >> 31;
+        unsigned int exponent = (d.u32 >> 23) & ((1 << 8) - 1);
+        unsigned int mantissa = d.u32 & ((1 << 23) - 1);;
+
+        if (exponent == 0) {
+            // zero or denorm -> zero
+            mantissa = 0;
+
+        } else if (exponent == 255 && mantissa != 0) {
+            // nan -> infinity
+            exponent = 31;
+            mantissa = 0;
+
+        } else if (exponent >= 127 - 15 + 31) {
+            // overflow or infinity -> infinity
+            exponent = 31;
+            mantissa = 0;
+
+        } else if (exponent <= 127 - 15) {
+            // underflow -> zero
+            exponent = 0;
+            mantissa = 0;
+
+        } else {
+            exponent -= 127 - 15;
+            mantissa >>= 13;
+        }
+
+        hfptr[i] = (unsigned short)((sign << 15) | (exponent << 10) | mantissa);
+    }
+}
+
+inline Vector3 & Vector3::operator =( const Vector3 & vec )
+{
+    mX = vec.mX;
+    mY = vec.mY;
+    mZ = vec.mZ;
+    return *this;
+}
+
+inline Vector3 & Vector3::setX( float _x )
+{
+    mX = _x;
+    return *this;
+}
+
+inline float Vector3::getX( ) const
+{
+    return mX;
+}
+
+inline Vector3 & Vector3::setY( float _y )
+{
+    mY = _y;
+    return *this;
+}
+
+inline float Vector3::getY( ) const
+{
+    return mY;
+}
+
+inline Vector3 & Vector3::setZ( float _z )
+{
+    mZ = _z;
+    return *this;
+}
+
+inline float Vector3::getZ( ) const
+{
+    return mZ;
+}
+
+inline Vector3 & Vector3::setElem( int idx, float value )
+{
+    *(&mX + idx) = value;
+    return *this;
+}
+
+inline float Vector3::getElem( int idx ) const
+{
+    return *(&mX + idx);
+}
+
+inline float & Vector3::operator []( int idx )
+{
+    return *(&mX + idx);
+}
+
+inline float Vector3::operator []( int idx ) const
+{
+    return *(&mX + idx);
+}
+
+inline const Vector3 Vector3::operator +( const Vector3 & vec ) const
+{
+    return Vector3(
+        ( mX + vec.mX ),
+        ( mY + vec.mY ),
+        ( mZ + vec.mZ )
+    );
+}
+
+inline const Vector3 Vector3::operator -( const Vector3 & vec ) const
+{
+    return Vector3(
+        ( mX - vec.mX ),
+        ( mY - vec.mY ),
+        ( mZ - vec.mZ )
+    );
+}
+
+inline const Point3 Vector3::operator +( const Point3 & pnt ) const
+{
+    return Point3(
+        ( mX + pnt.getX() ),
+        ( mY + pnt.getY() ),
+        ( mZ + pnt.getZ() )
+    );
+}
+
+inline const Vector3 Vector3::operator *( float scalar ) const
+{
+    return Vector3(
+        ( mX * scalar ),
+        ( mY * scalar ),
+        ( mZ * scalar )
+    );
+}
+
+inline Vector3 & Vector3::operator +=( const Vector3 & vec )
+{
+    *this = *this + vec;
+    return *this;
+}
+
+inline Vector3 & Vector3::operator -=( const Vector3 & vec )
+{
+    *this = *this - vec;
+    return *this;
+}
+
+inline Vector3 & Vector3::operator *=( float scalar )
+{
+    *this = *this * scalar;
+    return *this;
+}
+
+inline const Vector3 Vector3::operator /( float scalar ) const
+{
+    return Vector3(
+        ( mX / scalar ),
+        ( mY / scalar ),
+        ( mZ / scalar )
+    );
+}
+
+inline Vector3 & Vector3::operator /=( float scalar )
+{
+    *this = *this / scalar;
+    return *this;
+}
+
+inline const Vector3 Vector3::operator -( ) const
+{
+    return Vector3(
+        -mX,
+        -mY,
+        -mZ
+    );
+}
+
+inline const Vector3 operator *( float scalar, const Vector3 & vec )
+{
+    return vec * scalar;
+}
+
+inline const Vector3 mulPerElem( const Vector3 & vec0, const Vector3 & vec1 )
+{
+    return Vector3(
+        ( vec0.getX() * vec1.getX() ),
+        ( vec0.getY() * vec1.getY() ),
+        ( vec0.getZ() * vec1.getZ() )
+    );
+}
+
+inline const Vector3 divPerElem( const Vector3 & vec0, const Vector3 & vec1 )
+{
+    return Vector3(
+        ( vec0.getX() / vec1.getX() ),
+        ( vec0.getY() / vec1.getY() ),
+        ( vec0.getZ() / vec1.getZ() )
+    );
+}
+
+inline const Vector3 recipPerElem( const Vector3 & vec )
+{
+    return Vector3(
+        ( 1.0f / vec.getX() ),
+        ( 1.0f / vec.getY() ),
+        ( 1.0f / vec.getZ() )
+    );
+}
+
+inline const Vector3 sqrtPerElem( const Vector3 & vec )
+{
+    return Vector3(
+        sqrtf( vec.getX() ),
+        sqrtf( vec.getY() ),
+        sqrtf( vec.getZ() )
+    );
+}
+
+inline const Vector3 rsqrtPerElem( const Vector3 & vec )
+{
+    return Vector3(
+        ( 1.0f / sqrtf( vec.getX() ) ),
+        ( 1.0f / sqrtf( vec.getY() ) ),
+        ( 1.0f / sqrtf( vec.getZ() ) )
+    );
+}
+
+inline const Vector3 absPerElem( const Vector3 & vec )
+{
+    return Vector3(
+        fabsf( vec.getX() ),
+        fabsf( vec.getY() ),
+        fabsf( vec.getZ() )
+    );
+}
+
+inline const Vector3 copySignPerElem( const Vector3 & vec0, const Vector3 & vec1 )
+{
+    return Vector3(
+        ( vec1.getX() < 0.0f )? -fabsf( vec0.getX() ) : fabsf( vec0.getX() ),
+        ( vec1.getY() < 0.0f )? -fabsf( vec0.getY() ) : fabsf( vec0.getY() ),
+        ( vec1.getZ() < 0.0f )? -fabsf( vec0.getZ() ) : fabsf( vec0.getZ() )
+    );
+}
+
+inline const Vector3 maxPerElem( const Vector3 & vec0, const Vector3 & vec1 )
+{
+    return Vector3(
+        (vec0.getX() > vec1.getX())? vec0.getX() : vec1.getX(),
+        (vec0.getY() > vec1.getY())? vec0.getY() : vec1.getY(),
+        (vec0.getZ() > vec1.getZ())? vec0.getZ() : vec1.getZ()
+    );
+}
+
+inline float maxElem( const Vector3 & vec )
+{
+    float result;
+    result = (vec.getX() > vec.getY())? vec.getX() : vec.getY();
+    result = (vec.getZ() > result)? vec.getZ() : result;
+    return result;
+}
+
+inline const Vector3 minPerElem( const Vector3 & vec0, const Vector3 & vec1 )
+{
+    return Vector3(
+        (vec0.getX() < vec1.getX())? vec0.getX() : vec1.getX(),
+        (vec0.getY() < vec1.getY())? vec0.getY() : vec1.getY(),
+        (vec0.getZ() < vec1.getZ())? vec0.getZ() : vec1.getZ()
+    );
+}
+
+inline float minElem( const Vector3 & vec )
+{
+    float result;
+    result = (vec.getX() < vec.getY())? vec.getX() : vec.getY();
+    result = (vec.getZ() < result)? vec.getZ() : result;
+    return result;
+}
+
+inline float sum( const Vector3 & vec )
+{
+    float result;
+    result = ( vec.getX() + vec.getY() );
+    result = ( result + vec.getZ() );
+    return result;
+}
+
+inline float dot( const Vector3 & vec0, const Vector3 & vec1 )
+{
+    float result;
+    result = ( vec0.getX() * vec1.getX() );
+    result = ( result + ( vec0.getY() * vec1.getY() ) );
+    result = ( result + ( vec0.getZ() * vec1.getZ() ) );
+    return result;
+}
+
+inline float lengthSqr( const Vector3 & vec )
+{
+    float result;
+    result = ( vec.getX() * vec.getX() );
+    result = ( result + ( vec.getY() * vec.getY() ) );
+    result = ( result + ( vec.getZ() * vec.getZ() ) );
+    return result;
+}
+
+inline float length( const Vector3 & vec )
+{
+    return ::sqrtf( lengthSqr( vec ) );
+}
+
+inline const Vector3 normalize( const Vector3 & vec )
+{
+    float lenSqr, lenInv;
+    lenSqr = lengthSqr( vec );
+    lenInv = ( 1.0f / sqrtf( lenSqr ) );
+    return Vector3(
+        ( vec.getX() * lenInv ),
+        ( vec.getY() * lenInv ),
+        ( vec.getZ() * lenInv )
+    );
+}
+
+inline const Vector3 cross( const Vector3 & vec0, const Vector3 & vec1 )
+{
+    return Vector3(
+        ( ( vec0.getY() * vec1.getZ() ) - ( vec0.getZ() * vec1.getY() ) ),
+        ( ( vec0.getZ() * vec1.getX() ) - ( vec0.getX() * vec1.getZ() ) ),
+        ( ( vec0.getX() * vec1.getY() ) - ( vec0.getY() * vec1.getX() ) )
+    );
+}
+
+inline const Vector3 select( const Vector3 & vec0, const Vector3 & vec1, bool select1 )
+{
+    return Vector3(
+        ( select1 )? vec1.getX() : vec0.getX(),
+        ( select1 )? vec1.getY() : vec0.getY(),
+        ( select1 )? vec1.getZ() : vec0.getZ()
+    );
+}
+
+#ifdef _VECTORMATH_DEBUG
+
+inline void print( const Vector3 & vec )
+{
+    printf( "( %f %f %f )\n", vec.getX(), vec.getY(), vec.getZ() );
+}
+
+inline void print( const Vector3 & vec, const char * name )
+{
+    printf( "%s: ( %f %f %f )\n", name, vec.getX(), vec.getY(), vec.getZ() );
+}
+
+#endif
+
+inline Vector4::Vector4( const Vector4 & vec )
+{
+    mX = vec.mX;
+    mY = vec.mY;
+    mZ = vec.mZ;
+    mW = vec.mW;
+}
+
+inline Vector4::Vector4( float _x, float _y, float _z, float _w )
+{
+    mX = _x;
+    mY = _y;
+    mZ = _z;
+    mW = _w;
+}
+
+inline Vector4::Vector4( const Vector3 & xyz, float _w )
+{
+    this->setXYZ( xyz );
+    this->setW( _w );
+}
+
+inline Vector4::Vector4( const Vector3 & vec )
+{
+    mX = vec.getX();
+    mY = vec.getY();
+    mZ = vec.getZ();
+    mW = 0.0f;
+}
+
+inline Vector4::Vector4( const Point3 & pnt )
+{
+    mX = pnt.getX();
+    mY = pnt.getY();
+    mZ = pnt.getZ();
+    mW = 1.0f;
+}
+
+inline Vector4::Vector4( const Quat & quat )
+{
+    mX = quat.getX();
+    mY = quat.getY();
+    mZ = quat.getZ();
+    mW = quat.getW();
+}
+
+inline Vector4::Vector4( float scalar )
+{
+    mX = scalar;
+    mY = scalar;
+    mZ = scalar;
+    mW = scalar;
+}
+
+inline const Vector4 Vector4::xAxis( )
+{
+    return Vector4( 1.0f, 0.0f, 0.0f, 0.0f );
+}
+
+inline const Vector4 Vector4::yAxis( )
+{
+    return Vector4( 0.0f, 1.0f, 0.0f, 0.0f );
+}
+
+inline const Vector4 Vector4::zAxis( )
+{
+    return Vector4( 0.0f, 0.0f, 1.0f, 0.0f );
+}
+
+inline const Vector4 Vector4::wAxis( )
+{
+    return Vector4( 0.0f, 0.0f, 0.0f, 1.0f );
+}
+
+inline const Vector4 lerp( float t, const Vector4 & vec0, const Vector4 & vec1 )
+{
+    return ( vec0 + ( ( vec1 - vec0 ) * t ) );
+}
+
+inline const Vector4 slerp( float t, const Vector4 & unitVec0, const Vector4 & unitVec1 )
+{
+    float recipSinAngle, scale0, scale1, cosAngle, angle;
+    cosAngle = dot( unitVec0, unitVec1 );
+    if ( cosAngle < _VECTORMATH_SLERP_TOL ) {
+        angle = acosf( cosAngle );
+        recipSinAngle = ( 1.0f / sinf( angle ) );
+        scale0 = ( sinf( ( ( 1.0f - t ) * angle ) ) * recipSinAngle );
+        scale1 = ( sinf( ( t * angle ) ) * recipSinAngle );
+    } else {
+        scale0 = ( 1.0f - t );
+        scale1 = t;
+    }
+    return ( ( unitVec0 * scale0 ) + ( unitVec1 * scale1 ) );
+}
+
+inline void loadXYZW( Vector4 & vec, const float * fptr )
+{
+    vec = Vector4( fptr[0], fptr[1], fptr[2], fptr[3] );
+}
+
+inline void storeXYZW( const Vector4 & vec, float * fptr )
+{
+    fptr[0] = vec.getX();
+    fptr[1] = vec.getY();
+    fptr[2] = vec.getZ();
+    fptr[3] = vec.getW();
+}
+
+inline void loadHalfFloats( Vector4 & vec, const unsigned short * hfptr )
+{
+    union Data32 {
+        unsigned int u32;
+        float f32;
+    };
+
+    for (int i = 0; i < 4; i++) {
+        unsigned short fp16 = hfptr[i];
+        unsigned int sign = fp16 >> 15;
+        unsigned int exponent = (fp16 >> 10) & ((1 << 5) - 1);
+        unsigned int mantissa = fp16 & ((1 << 10) - 1);
+
+        if (exponent == 0) {
+            // zero
+            mantissa = 0;
+
+        } else if (exponent == 31) {
+            // infinity or nan -> infinity
+            exponent = 255;
+	    mantissa = 0;
+
+        } else {
+            exponent += 127 - 15;
+            mantissa <<= 13;
+        }
+
+        Data32 d;
+        d.u32 = (sign << 31) | (exponent << 23) | mantissa;
+        vec[i] = d.f32;
+    }
+}
+
+inline void storeHalfFloats( const Vector4 & vec, unsigned short * hfptr )
+{
+    union Data32 {
+        unsigned int u32;
+        float f32;
+    };
+
+    for (int i = 0; i < 4; i++) {
+        Data32 d;
+        d.f32 = vec[i];
+
+        unsigned int sign = d.u32 >> 31;
+        unsigned int exponent = (d.u32 >> 23) & ((1 << 8) - 1);
+        unsigned int mantissa = d.u32 & ((1 << 23) - 1);;
+
+        if (exponent == 0) {
+            // zero or denorm -> zero
+            mantissa = 0;
+
+        } else if (exponent == 255 && mantissa != 0) {
+            // nan -> infinity
+            exponent = 31;
+            mantissa = 0;
+
+        } else if (exponent >= 127 - 15 + 31) {
+            // overflow or infinity -> infinity
+            exponent = 31;
+            mantissa = 0;
+
+        } else if (exponent <= 127 - 15) {
+            // underflow -> zero
+            exponent = 0;
+            mantissa = 0;
+
+        } else {
+            exponent -= 127 - 15;
+            mantissa >>= 13;
+        }
+
+        hfptr[i] = (unsigned short)((sign << 15) | (exponent << 10) | mantissa);
+    }
+}
+
+inline Vector4 & Vector4::operator =( const Vector4 & vec )
+{
+    mX = vec.mX;
+    mY = vec.mY;
+    mZ = vec.mZ;
+    mW = vec.mW;
+    return *this;
+}
+
+inline Vector4 & Vector4::setXYZ( const Vector3 & vec )
+{
+    mX = vec.getX();
+    mY = vec.getY();
+    mZ = vec.getZ();
+    return *this;
+}
+
+inline const Vector3 Vector4::getXYZ( ) const
+{
+    return Vector3( mX, mY, mZ );
+}
+
+inline Vector4 & Vector4::setX( float _x )
+{
+    mX = _x;
+    return *this;
+}
+
+inline float Vector4::getX( ) const
+{
+    return mX;
+}
+
+inline Vector4 & Vector4::setY( float _y )
+{
+    mY = _y;
+    return *this;
+}
+
+inline float Vector4::getY( ) const
+{
+    return mY;
+}
+
+inline Vector4 & Vector4::setZ( float _z )
+{
+    mZ = _z;
+    return *this;
+}
+
+inline float Vector4::getZ( ) const
+{
+    return mZ;
+}
+
+inline Vector4 & Vector4::setW( float _w )
+{
+    mW = _w;
+    return *this;
+}
+
+inline float Vector4::getW( ) const
+{
+    return mW;
+}
+
+inline Vector4 & Vector4::setElem( int idx, float value )
+{
+    *(&mX + idx) = value;
+    return *this;
+}
+
+inline float Vector4::getElem( int idx ) const
+{
+    return *(&mX + idx);
+}
+
+inline float & Vector4::operator []( int idx )
+{
+    return *(&mX + idx);
+}
+
+inline float Vector4::operator []( int idx ) const
+{
+    return *(&mX + idx);
+}
+
+inline const Vector4 Vector4::operator +( const Vector4 & vec ) const
+{
+    return Vector4(
+        ( mX + vec.mX ),
+        ( mY + vec.mY ),
+        ( mZ + vec.mZ ),
+        ( mW + vec.mW )
+    );
+}
+
+inline const Vector4 Vector4::operator -( const Vector4 & vec ) const
+{
+    return Vector4(
+        ( mX - vec.mX ),
+        ( mY - vec.mY ),
+        ( mZ - vec.mZ ),
+        ( mW - vec.mW )
+    );
+}
+
+inline const Vector4 Vector4::operator *( float scalar ) const
+{
+    return Vector4(
+        ( mX * scalar ),
+        ( mY * scalar ),
+        ( mZ * scalar ),
+        ( mW * scalar )
+    );
+}
+
+inline Vector4 & Vector4::operator +=( const Vector4 & vec )
+{
+    *this = *this + vec;
+    return *this;
+}
+
+inline Vector4 & Vector4::operator -=( const Vector4 & vec )
+{
+    *this = *this - vec;
+    return *this;
+}
+
+inline Vector4 & Vector4::operator *=( float scalar )
+{
+    *this = *this * scalar;
+    return *this;
+}
+
+inline const Vector4 Vector4::operator /( float scalar ) const
+{
+    return Vector4(
+        ( mX / scalar ),
+        ( mY / scalar ),
+        ( mZ / scalar ),
+        ( mW / scalar )
+    );
+}
+
+inline Vector4 & Vector4::operator /=( float scalar )
+{
+    *this = *this / scalar;
+    return *this;
+}
+
+inline const Vector4 Vector4::operator -( ) const
+{
+    return Vector4(
+        -mX,
+        -mY,
+        -mZ,
+        -mW
+    );
+}
+
+inline const Vector4 operator *( float scalar, const Vector4 & vec )
+{
+    return vec * scalar;
+}
+
+inline const Vector4 mulPerElem( const Vector4 & vec0, const Vector4 & vec1 )
+{
+    return Vector4(
+        ( vec0.getX() * vec1.getX() ),
+        ( vec0.getY() * vec1.getY() ),
+        ( vec0.getZ() * vec1.getZ() ),
+        ( vec0.getW() * vec1.getW() )
+    );
+}
+
+inline const Vector4 divPerElem( const Vector4 & vec0, const Vector4 & vec1 )
+{
+    return Vector4(
+        ( vec0.getX() / vec1.getX() ),
+        ( vec0.getY() / vec1.getY() ),
+        ( vec0.getZ() / vec1.getZ() ),
+        ( vec0.getW() / vec1.getW() )
+    );
+}
+
+inline const Vector4 recipPerElem( const Vector4 & vec )
+{
+    return Vector4(
+        ( 1.0f / vec.getX() ),
+        ( 1.0f / vec.getY() ),
+        ( 1.0f / vec.getZ() ),
+        ( 1.0f / vec.getW() )
+    );
+}
+
+inline const Vector4 sqrtPerElem( const Vector4 & vec )
+{
+    return Vector4(
+        sqrtf( vec.getX() ),
+        sqrtf( vec.getY() ),
+        sqrtf( vec.getZ() ),
+        sqrtf( vec.getW() )
+    );
+}
+
+inline const Vector4 rsqrtPerElem( const Vector4 & vec )
+{
+    return Vector4(
+        ( 1.0f / sqrtf( vec.getX() ) ),
+        ( 1.0f / sqrtf( vec.getY() ) ),
+        ( 1.0f / sqrtf( vec.getZ() ) ),
+        ( 1.0f / sqrtf( vec.getW() ) )
+    );
+}
+
+inline const Vector4 absPerElem( const Vector4 & vec )
+{
+    return Vector4(
+        fabsf( vec.getX() ),
+        fabsf( vec.getY() ),
+        fabsf( vec.getZ() ),
+        fabsf( vec.getW() )
+    );
+}
+
+inline const Vector4 copySignPerElem( const Vector4 & vec0, const Vector4 & vec1 )
+{
+    return Vector4(
+        ( vec1.getX() < 0.0f )? -fabsf( vec0.getX() ) : fabsf( vec0.getX() ),
+        ( vec1.getY() < 0.0f )? -fabsf( vec0.getY() ) : fabsf( vec0.getY() ),
+        ( vec1.getZ() < 0.0f )? -fabsf( vec0.getZ() ) : fabsf( vec0.getZ() ),
+        ( vec1.getW() < 0.0f )? -fabsf( vec0.getW() ) : fabsf( vec0.getW() )
+    );
+}
+
+inline const Vector4 maxPerElem( const Vector4 & vec0, const Vector4 & vec1 )
+{
+    return Vector4(
+        (vec0.getX() > vec1.getX())? vec0.getX() : vec1.getX(),
+        (vec0.getY() > vec1.getY())? vec0.getY() : vec1.getY(),
+        (vec0.getZ() > vec1.getZ())? vec0.getZ() : vec1.getZ(),
+        (vec0.getW() > vec1.getW())? vec0.getW() : vec1.getW()
+    );
+}
+
+inline float maxElem( const Vector4 & vec )
+{
+    float result;
+    result = (vec.getX() > vec.getY())? vec.getX() : vec.getY();
+    result = (vec.getZ() > result)? vec.getZ() : result;
+    result = (vec.getW() > result)? vec.getW() : result;
+    return result;
+}
+
+inline const Vector4 minPerElem( const Vector4 & vec0, const Vector4 & vec1 )
+{
+    return Vector4(
+        (vec0.getX() < vec1.getX())? vec0.getX() : vec1.getX(),
+        (vec0.getY() < vec1.getY())? vec0.getY() : vec1.getY(),
+        (vec0.getZ() < vec1.getZ())? vec0.getZ() : vec1.getZ(),
+        (vec0.getW() < vec1.getW())? vec0.getW() : vec1.getW()
+    );
+}
+
+inline float minElem( const Vector4 & vec )
+{
+    float result;
+    result = (vec.getX() < vec.getY())? vec.getX() : vec.getY();
+    result = (vec.getZ() < result)? vec.getZ() : result;
+    result = (vec.getW() < result)? vec.getW() : result;
+    return result;
+}
+
+inline float sum( const Vector4 & vec )
+{
+    float result;
+    result = ( vec.getX() + vec.getY() );
+    result = ( result + vec.getZ() );
+    result = ( result + vec.getW() );
+    return result;
+}
+
+inline float dot( const Vector4 & vec0, const Vector4 & vec1 )
+{
+    float result;
+    result = ( vec0.getX() * vec1.getX() );
+    result = ( result + ( vec0.getY() * vec1.getY() ) );
+    result = ( result + ( vec0.getZ() * vec1.getZ() ) );
+    result = ( result + ( vec0.getW() * vec1.getW() ) );
+    return result;
+}
+
+inline float lengthSqr( const Vector4 & vec )
+{
+    float result;
+    result = ( vec.getX() * vec.getX() );
+    result = ( result + ( vec.getY() * vec.getY() ) );
+    result = ( result + ( vec.getZ() * vec.getZ() ) );
+    result = ( result + ( vec.getW() * vec.getW() ) );
+    return result;
+}
+
+inline float length( const Vector4 & vec )
+{
+    return ::sqrtf( lengthSqr( vec ) );
+}
+
+inline const Vector4 normalize( const Vector4 & vec )
+{
+    float lenSqr, lenInv;
+    lenSqr = lengthSqr( vec );
+    lenInv = ( 1.0f / sqrtf( lenSqr ) );
+    return Vector4(
+        ( vec.getX() * lenInv ),
+        ( vec.getY() * lenInv ),
+        ( vec.getZ() * lenInv ),
+        ( vec.getW() * lenInv )
+    );
+}
+
+inline const Vector4 select( const Vector4 & vec0, const Vector4 & vec1, bool select1 )
+{
+    return Vector4(
+        ( select1 )? vec1.getX() : vec0.getX(),
+        ( select1 )? vec1.getY() : vec0.getY(),
+        ( select1 )? vec1.getZ() : vec0.getZ(),
+        ( select1 )? vec1.getW() : vec0.getW()
+    );
+}
+
+#ifdef _VECTORMATH_DEBUG
+
+inline void print( const Vector4 & vec )
+{
+    printf( "( %f %f %f %f )\n", vec.getX(), vec.getY(), vec.getZ(), vec.getW() );
+}
+
+inline void print( const Vector4 & vec, const char * name )
+{
+    printf( "%s: ( %f %f %f %f )\n", name, vec.getX(), vec.getY(), vec.getZ(), vec.getW() );
+}
+
+#endif
+
+inline Point3::Point3( const Point3 & pnt )
+{
+    mX = pnt.mX;
+    mY = pnt.mY;
+    mZ = pnt.mZ;
+}
+
+inline Point3::Point3( float _x, float _y, float _z )
+{
+    mX = _x;
+    mY = _y;
+    mZ = _z;
+}
+
+inline Point3::Point3( const Vector3 & vec )
+{
+    mX = vec.getX();
+    mY = vec.getY();
+    mZ = vec.getZ();
+}
+
+inline Point3::Point3( float scalar )
+{
+    mX = scalar;
+    mY = scalar;
+    mZ = scalar;
+}
+
+inline const Point3 lerp( float t, const Point3 & pnt0, const Point3 & pnt1 )
+{
+    return ( pnt0 + ( ( pnt1 - pnt0 ) * t ) );
+}
+
+inline void loadXYZ( Point3 & pnt, const float * fptr )
+{
+    pnt = Point3( fptr[0], fptr[1], fptr[2] );
+}
+
+inline void storeXYZ( const Point3 & pnt, float * fptr )
+{
+    fptr[0] = pnt.getX();
+    fptr[1] = pnt.getY();
+    fptr[2] = pnt.getZ();
+}
+
+inline void loadHalfFloats( Point3 & vec, const unsigned short * hfptr )
+{
+    union Data32 {
+        unsigned int u32;
+        float f32;
+    };
+
+    for (int i = 0; i < 3; i++) {
+        unsigned short fp16 = hfptr[i];
+        unsigned int sign = fp16 >> 15;
+        unsigned int exponent = (fp16 >> 10) & ((1 << 5) - 1);
+        unsigned int mantissa = fp16 & ((1 << 10) - 1);
+
+        if (exponent == 0) {
+            // zero
+            mantissa = 0;
+
+        } else if (exponent == 31) {
+            // infinity or nan -> infinity
+            exponent = 255;
+	    mantissa = 0;
+
+        } else {
+            exponent += 127 - 15;
+            mantissa <<= 13;
+        }
+
+        Data32 d;
+        d.u32 = (sign << 31) | (exponent << 23) | mantissa;
+        vec[i] = d.f32;
+    }
+}
+
+inline void storeHalfFloats( const Point3 & vec, unsigned short * hfptr )
+{
+    union Data32 {
+        unsigned int u32;
+        float f32;
+    };
+
+    for (int i = 0; i < 3; i++) {
+        Data32 d;
+        d.f32 = vec[i];
+
+        unsigned int sign = d.u32 >> 31;
+        unsigned int exponent = (d.u32 >> 23) & ((1 << 8) - 1);
+        unsigned int mantissa = d.u32 & ((1 << 23) - 1);;
+
+        if (exponent == 0) {
+            // zero or denorm -> zero
+            mantissa = 0;
+
+        } else if (exponent == 255 && mantissa != 0) {
+            // nan -> infinity
+            exponent = 31;
+            mantissa = 0;
+
+        } else if (exponent >= 127 - 15 + 31) {
+            // overflow or infinity -> infinity
+            exponent = 31;
+            mantissa = 0;
+
+        } else if (exponent <= 127 - 15) {
+            // underflow -> zero
+            exponent = 0;
+            mantissa = 0;
+
+        } else {
+            exponent -= 127 - 15;
+            mantissa >>= 13;
+        }
+
+        hfptr[i] = (unsigned short)((sign << 15) | (exponent << 10) | mantissa);
+    }
+}
+
+inline Point3 & Point3::operator =( const Point3 & pnt )
+{
+    mX = pnt.mX;
+    mY = pnt.mY;
+    mZ = pnt.mZ;
+    return *this;
+}
+
+inline Point3 & Point3::setX( float _x )
+{
+    mX = _x;
+    return *this;
+}
+
+inline float Point3::getX( ) const
+{
+    return mX;
+}
+
+inline Point3 & Point3::setY( float _y )
+{
+    mY = _y;
+    return *this;
+}
+
+inline float Point3::getY( ) const
+{
+    return mY;
+}
+
+inline Point3 & Point3::setZ( float _z )
+{
+    mZ = _z;
+    return *this;
+}
+
+inline float Point3::getZ( ) const
+{
+    return mZ;
+}
+
+inline Point3 & Point3::setElem( int idx, float value )
+{
+    *(&mX + idx) = value;
+    return *this;
+}
+
+inline float Point3::getElem( int idx ) const
+{
+    return *(&mX + idx);
+}
+
+inline float & Point3::operator []( int idx )
+{
+    return *(&mX + idx);
+}
+
+inline float Point3::operator []( int idx ) const
+{
+    return *(&mX + idx);
+}
+
+inline const Vector3 Point3::operator -( const Point3 & pnt ) const
+{
+    return Vector3(
+        ( mX - pnt.mX ),
+        ( mY - pnt.mY ),
+        ( mZ - pnt.mZ )
+    );
+}
+
+inline const Point3 Point3::operator +( const Vector3 & vec ) const
+{
+    return Point3(
+        ( mX + vec.getX() ),
+        ( mY + vec.getY() ),
+        ( mZ + vec.getZ() )
+    );
+}
+
+inline const Point3 Point3::operator -( const Vector3 & vec ) const
+{
+    return Point3(
+        ( mX - vec.getX() ),
+        ( mY - vec.getY() ),
+        ( mZ - vec.getZ() )
+    );
+}
+
+inline Point3 & Point3::operator +=( const Vector3 & vec )
+{
+    *this = *this + vec;
+    return *this;
+}
+
+inline Point3 & Point3::operator -=( const Vector3 & vec )
+{
+    *this = *this - vec;
+    return *this;
+}
+
+inline const Point3 mulPerElem( const Point3 & pnt0, const Point3 & pnt1 )
+{
+    return Point3(
+        ( pnt0.getX() * pnt1.getX() ),
+        ( pnt0.getY() * pnt1.getY() ),
+        ( pnt0.getZ() * pnt1.getZ() )
+    );
+}
+
+inline const Point3 divPerElem( const Point3 & pnt0, const Point3 & pnt1 )
+{
+    return Point3(
+        ( pnt0.getX() / pnt1.getX() ),
+        ( pnt0.getY() / pnt1.getY() ),
+        ( pnt0.getZ() / pnt1.getZ() )
+    );
+}
+
+inline const Point3 recipPerElem( const Point3 & pnt )
+{
+    return Point3(
+        ( 1.0f / pnt.getX() ),
+        ( 1.0f / pnt.getY() ),
+        ( 1.0f / pnt.getZ() )
+    );
+}
+
+inline const Point3 sqrtPerElem( const Point3 & pnt )
+{
+    return Point3(
+        sqrtf( pnt.getX() ),
+        sqrtf( pnt.getY() ),
+        sqrtf( pnt.getZ() )
+    );
+}
+
+inline const Point3 rsqrtPerElem( const Point3 & pnt )
+{
+    return Point3(
+        ( 1.0f / sqrtf( pnt.getX() ) ),
+        ( 1.0f / sqrtf( pnt.getY() ) ),
+        ( 1.0f / sqrtf( pnt.getZ() ) )
+    );
+}
+
+inline const Point3 absPerElem( const Point3 & pnt )
+{
+    return Point3(
+        fabsf( pnt.getX() ),
+        fabsf( pnt.getY() ),
+        fabsf( pnt.getZ() )
+    );
+}
+
+inline const Point3 copySignPerElem( const Point3 & pnt0, const Point3 & pnt1 )
+{
+    return Point3(
+        ( pnt1.getX() < 0.0f )? -fabsf( pnt0.getX() ) : fabsf( pnt0.getX() ),
+        ( pnt1.getY() < 0.0f )? -fabsf( pnt0.getY() ) : fabsf( pnt0.getY() ),
+        ( pnt1.getZ() < 0.0f )? -fabsf( pnt0.getZ() ) : fabsf( pnt0.getZ() )
+    );
+}
+
+inline const Point3 maxPerElem( const Point3 & pnt0, const Point3 & pnt1 )
+{
+    return Point3(
+        (pnt0.getX() > pnt1.getX())? pnt0.getX() : pnt1.getX(),
+        (pnt0.getY() > pnt1.getY())? pnt0.getY() : pnt1.getY(),
+        (pnt0.getZ() > pnt1.getZ())? pnt0.getZ() : pnt1.getZ()
+    );
+}
+
+inline float maxElem( const Point3 & pnt )
+{
+    float result;
+    result = (pnt.getX() > pnt.getY())? pnt.getX() : pnt.getY();
+    result = (pnt.getZ() > result)? pnt.getZ() : result;
+    return result;
+}
+
+inline const Point3 minPerElem( const Point3 & pnt0, const Point3 & pnt1 )
+{
+    return Point3(
+        (pnt0.getX() < pnt1.getX())? pnt0.getX() : pnt1.getX(),
+        (pnt0.getY() < pnt1.getY())? pnt0.getY() : pnt1.getY(),
+        (pnt0.getZ() < pnt1.getZ())? pnt0.getZ() : pnt1.getZ()
+    );
+}
+
+inline float minElem( const Point3 & pnt )
+{
+    float result;
+    result = (pnt.getX() < pnt.getY())? pnt.getX() : pnt.getY();
+    result = (pnt.getZ() < result)? pnt.getZ() : result;
+    return result;
+}
+
+inline float sum( const Point3 & pnt )
+{
+    float result;
+    result = ( pnt.getX() + pnt.getY() );
+    result = ( result + pnt.getZ() );
+    return result;
+}
+
+inline const Point3 scale( const Point3 & pnt, float scaleVal )
+{
+    return mulPerElem( pnt, Point3( scaleVal ) );
+}
+
+inline const Point3 scale( const Point3 & pnt, const Vector3 & scaleVec )
+{
+    return mulPerElem( pnt, Point3( scaleVec ) );
+}
+
+inline float projection( const Point3 & pnt, const Vector3 & unitVec )
+{
+    float result;
+    result = ( pnt.getX() * unitVec.getX() );
+    result = ( result + ( pnt.getY() * unitVec.getY() ) );
+    result = ( result + ( pnt.getZ() * unitVec.getZ() ) );
+    return result;
+}
+
+inline float distSqrFromOrigin( const Point3 & pnt )
+{
+    return lengthSqr( Vector3( pnt ) );
+}
+
+inline float distFromOrigin( const Point3 & pnt )
+{
+    return length( Vector3( pnt ) );
+}
+
+inline float distSqr( const Point3 & pnt0, const Point3 & pnt1 )
+{
+    return lengthSqr( ( pnt1 - pnt0 ) );
+}
+
+inline float dist( const Point3 & pnt0, const Point3 & pnt1 )
+{
+    return length( ( pnt1 - pnt0 ) );
+}
+
+inline const Point3 select( const Point3 & pnt0, const Point3 & pnt1, bool select1 )
+{
+    return Point3(
+        ( select1 )? pnt1.getX() : pnt0.getX(),
+        ( select1 )? pnt1.getY() : pnt0.getY(),
+        ( select1 )? pnt1.getZ() : pnt0.getZ()
+    );
+}
+
+#ifdef _VECTORMATH_DEBUG
+
+inline void print( const Point3 & pnt )
+{
+    printf( "( %f %f %f )\n", pnt.getX(), pnt.getY(), pnt.getZ() );
+}
+
+inline void print( const Point3 & pnt, const char * name )
+{
+    printf( "%s: ( %f %f %f )\n", name, pnt.getX(), pnt.getY(), pnt.getZ() );
+}
+
+#endif
+
+} // namespace Aos
+} // namespace Vectormath
+
+#endif
+
diff --git a/test/Bullet2/vectormath/neon/vectormath_aos.h b/test/Bullet2/vectormath/neon/vectormath_aos.h
new file mode 100644
index 000000000..97bdc278a
--- /dev/null
+++ b/test/Bullet2/vectormath/neon/vectormath_aos.h
@@ -0,0 +1,1890 @@
+/*
+   Copyright (C) 2009 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+ 
+This source version has been altered.
+
+*/
+
+#ifndef _VECTORMATH_AOS_CPP_H
+#define _VECTORMATH_AOS_CPP_H
+
+#include <math.h>
+
+#ifdef _VECTORMATH_DEBUG
+#include <stdio.h>
+#endif
+
+namespace Vectormath {
+
+namespace Aos {
+
+//-----------------------------------------------------------------------------
+// Forward Declarations
+//
+
+class Vector3;
+class Vector4;
+class Point3;
+class Quat;
+class Matrix3;
+class Matrix4;
+class Transform3;
+
+// A 3-D vector in array-of-structures format
+//
+class Vector3
+{
+    float mX;
+    float mY;
+    float mZ;
+#ifndef __GNUC__
+    float d;
+#endif
+
+public:
+    // Default constructor; does no initialization
+    // 
+    inline Vector3( ) { };
+
+    // Copy a 3-D vector
+    // 
+    inline Vector3( const Vector3 & vec );
+
+    // Construct a 3-D vector from x, y, and z elements
+    // 
+    inline Vector3( float x, float y, float z );
+
+    // Copy elements from a 3-D point into a 3-D vector
+    // 
+    explicit inline Vector3( const Point3 & pnt );
+
+    // Set all elements of a 3-D vector to the same scalar value
+    // 
+    explicit inline Vector3( float scalar );
+
+    // Assign one 3-D vector to another
+    // 
+    inline Vector3 & operator =( const Vector3 & vec );
+
+    // Set the x element of a 3-D vector
+    // 
+    inline Vector3 & setX( float x );
+
+    // Set the y element of a 3-D vector
+    // 
+    inline Vector3 & setY( float y );
+
+    // Set the z element of a 3-D vector
+    // 
+    inline Vector3 & setZ( float z );
+
+    // Get the x element of a 3-D vector
+    // 
+    inline float getX( ) const;
+
+    // Get the y element of a 3-D vector
+    // 
+    inline float getY( ) const;
+
+    // Get the z element of a 3-D vector
+    // 
+    inline float getZ( ) const;
+
+    // Set an x, y, or z element of a 3-D vector by index
+    // 
+    inline Vector3 & setElem( int idx, float value );
+
+    // Get an x, y, or z element of a 3-D vector by index
+    // 
+    inline float getElem( int idx ) const;
+
+    // Subscripting operator to set or get an element
+    // 
+    inline float & operator []( int idx );
+
+    // Subscripting operator to get an element
+    // 
+    inline float operator []( int idx ) const;
+
+    // Add two 3-D vectors
+    // 
+    inline const Vector3 operator +( const Vector3 & vec ) const;
+
+    // Subtract a 3-D vector from another 3-D vector
+    // 
+    inline const Vector3 operator -( const Vector3 & vec ) const;
+
+    // Add a 3-D vector to a 3-D point
+    // 
+    inline const Point3 operator +( const Point3 & pnt ) const;
+
+    // Multiply a 3-D vector by a scalar
+    // 
+    inline const Vector3 operator *( float scalar ) const;
+
+    // Divide a 3-D vector by a scalar
+    // 
+    inline const Vector3 operator /( float scalar ) const;
+
+    // Perform compound assignment and addition with a 3-D vector
+    // 
+    inline Vector3 & operator +=( const Vector3 & vec );
+
+    // Perform compound assignment and subtraction by a 3-D vector
+    // 
+    inline Vector3 & operator -=( const Vector3 & vec );
+
+    // Perform compound assignment and multiplication by a scalar
+    // 
+    inline Vector3 & operator *=( float scalar );
+
+    // Perform compound assignment and division by a scalar
+    // 
+    inline Vector3 & operator /=( float scalar );
+
+    // Negate all elements of a 3-D vector
+    // 
+    inline const Vector3 operator -( ) const;
+
+    // Construct x axis
+    // 
+    static inline const Vector3 xAxis( );
+
+    // Construct y axis
+    // 
+    static inline const Vector3 yAxis( );
+
+    // Construct z axis
+    // 
+    static inline const Vector3 zAxis( );
+
+}
+#ifdef __GNUC__
+__attribute__ ((aligned(16)))
+#endif
+;
+
+// Multiply a 3-D vector by a scalar
+// 
+inline const Vector3 operator *( float scalar, const Vector3 & vec );
+
+// Multiply two 3-D vectors per element
+// 
+inline const Vector3 mulPerElem( const Vector3 & vec0, const Vector3 & vec1 );
+
+// Divide two 3-D vectors per element
+// NOTE: 
+// Floating-point behavior matches standard library function divf4.
+// 
+inline const Vector3 divPerElem( const Vector3 & vec0, const Vector3 & vec1 );
+
+// Compute the reciprocal of a 3-D vector per element
+// NOTE: 
+// Floating-point behavior matches standard library function recipf4.
+// 
+inline const Vector3 recipPerElem( const Vector3 & vec );
+
+// Compute the square root of a 3-D vector per element
+// NOTE: 
+// Floating-point behavior matches standard library function sqrtf4.
+// 
+inline const Vector3 sqrtPerElem( const Vector3 & vec );
+
+// Compute the reciprocal square root of a 3-D vector per element
+// NOTE: 
+// Floating-point behavior matches standard library function rsqrtf4.
+// 
+inline const Vector3 rsqrtPerElem( const Vector3 & vec );
+
+// Compute the absolute value of a 3-D vector per element
+// 
+inline const Vector3 absPerElem( const Vector3 & vec );
+
+// Copy sign from one 3-D vector to another, per element
+// 
+inline const Vector3 copySignPerElem( const Vector3 & vec0, const Vector3 & vec1 );
+
+// Maximum of two 3-D vectors per element
+// 
+inline const Vector3 maxPerElem( const Vector3 & vec0, const Vector3 & vec1 );
+
+// Minimum of two 3-D vectors per element
+// 
+inline const Vector3 minPerElem( const Vector3 & vec0, const Vector3 & vec1 );
+
+// Maximum element of a 3-D vector
+// 
+inline float maxElem( const Vector3 & vec );
+
+// Minimum element of a 3-D vector
+// 
+inline float minElem( const Vector3 & vec );
+
+// Compute the sum of all elements of a 3-D vector
+// 
+inline float sum( const Vector3 & vec );
+
+// Compute the dot product of two 3-D vectors
+// 
+inline float dot( const Vector3 & vec0, const Vector3 & vec1 );
+
+// Compute the square of the length of a 3-D vector
+// 
+inline float lengthSqr( const Vector3 & vec );
+
+// Compute the length of a 3-D vector
+// 
+inline float length( const Vector3 & vec );
+
+// Normalize a 3-D vector
+// NOTE: 
+// The result is unpredictable when all elements of vec are at or near zero.
+// 
+inline const Vector3 normalize( const Vector3 & vec );
+
+// Compute cross product of two 3-D vectors
+// 
+inline const Vector3 cross( const Vector3 & vec0, const Vector3 & vec1 );
+
+// Outer product of two 3-D vectors
+// 
+inline const Matrix3 outer( const Vector3 & vec0, const Vector3 & vec1 );
+
+// Pre-multiply a row vector by a 3x3 matrix
+// 
+inline const Vector3 rowMul( const Vector3 & vec, const Matrix3 & mat );
+
+// Cross-product matrix of a 3-D vector
+// 
+inline const Matrix3 crossMatrix( const Vector3 & vec );
+
+// Create cross-product matrix and multiply
+// NOTE: 
+// Faster than separately creating a cross-product matrix and multiplying.
+// 
+inline const Matrix3 crossMatrixMul( const Vector3 & vec, const Matrix3 & mat );
+
+// Linear interpolation between two 3-D vectors
+// NOTE: 
+// Does not clamp t between 0 and 1.
+// 
+inline const Vector3 lerp( float t, const Vector3 & vec0, const Vector3 & vec1 );
+
+// Spherical linear interpolation between two 3-D vectors
+// NOTE: 
+// The result is unpredictable if the vectors point in opposite directions.
+// Does not clamp t between 0 and 1.
+// 
+inline const Vector3 slerp( float t, const Vector3 & unitVec0, const Vector3 & unitVec1 );
+
+// Conditionally select between two 3-D vectors
+// 
+inline const Vector3 select( const Vector3 & vec0, const Vector3 & vec1, bool select1 );
+
+// Load x, y, and z elements from the first three words of a float array.
+// 
+// 
+inline void loadXYZ( Vector3 & vec, const float * fptr );
+
+// Store x, y, and z elements of a 3-D vector in the first three words of a float array.
+// Memory area of previous 16 bytes and next 32 bytes from fptr might be accessed
+// 
+inline void storeXYZ( const Vector3 & vec, float * fptr );
+
+// Load three-half-floats as a 3-D vector
+// NOTE: 
+// This transformation does not support either denormalized numbers or NaNs.
+// 
+inline void loadHalfFloats( Vector3 & vec, const unsigned short * hfptr );
+
+// Store a 3-D vector as half-floats. Memory area of previous 16 bytes and next 32 bytes from <code><i>hfptr</i></code> might be accessed.
+// NOTE: 
+// This transformation does not support either denormalized numbers or NaNs. Memory area of previous 16 bytes and next 32 bytes from hfptr might be accessed.
+// 
+inline void storeHalfFloats( const Vector3 & vec, unsigned short * hfptr );
+
+#ifdef _VECTORMATH_DEBUG
+
+// Print a 3-D vector
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+inline void print( const Vector3 & vec );
+
+// Print a 3-D vector and an associated string identifier
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+inline void print( const Vector3 & vec, const char * name );
+
+#endif
+
+// A 4-D vector in array-of-structures format
+//
+class Vector4
+{
+    float mX;
+    float mY;
+    float mZ;
+    float mW;
+
+public:
+    // Default constructor; does no initialization
+    // 
+    inline Vector4( ) { };
+
+    // Copy a 4-D vector
+    // 
+    inline Vector4( const Vector4 & vec );
+
+    // Construct a 4-D vector from x, y, z, and w elements
+    // 
+    inline Vector4( float x, float y, float z, float w );
+
+    // Construct a 4-D vector from a 3-D vector and a scalar
+    // 
+    inline Vector4( const Vector3 & xyz, float w );
+
+    // Copy x, y, and z from a 3-D vector into a 4-D vector, and set w to 0
+    // 
+    explicit inline Vector4( const Vector3 & vec );
+
+    // Copy x, y, and z from a 3-D point into a 4-D vector, and set w to 1
+    // 
+    explicit inline Vector4( const Point3 & pnt );
+
+    // Copy elements from a quaternion into a 4-D vector
+    // 
+    explicit inline Vector4( const Quat & quat );
+
+    // Set all elements of a 4-D vector to the same scalar value
+    // 
+    explicit inline Vector4( float scalar );
+
+    // Assign one 4-D vector to another
+    // 
+    inline Vector4 & operator =( const Vector4 & vec );
+
+    // Set the x, y, and z elements of a 4-D vector
+    // NOTE: 
+    // This function does not change the w element.
+    // 
+    inline Vector4 & setXYZ( const Vector3 & vec );
+
+    // Get the x, y, and z elements of a 4-D vector
+    // 
+    inline const Vector3 getXYZ( ) const;
+
+    // Set the x element of a 4-D vector
+    // 
+    inline Vector4 & setX( float x );
+
+    // Set the y element of a 4-D vector
+    // 
+    inline Vector4 & setY( float y );
+
+    // Set the z element of a 4-D vector
+    // 
+    inline Vector4 & setZ( float z );
+
+    // Set the w element of a 4-D vector
+    // 
+    inline Vector4 & setW( float w );
+
+    // Get the x element of a 4-D vector
+    // 
+    inline float getX( ) const;
+
+    // Get the y element of a 4-D vector
+    // 
+    inline float getY( ) const;
+
+    // Get the z element of a 4-D vector
+    // 
+    inline float getZ( ) const;
+
+    // Get the w element of a 4-D vector
+    // 
+    inline float getW( ) const;
+
+    // Set an x, y, z, or w element of a 4-D vector by index
+    // 
+    inline Vector4 & setElem( int idx, float value );
+
+    // Get an x, y, z, or w element of a 4-D vector by index
+    // 
+    inline float getElem( int idx ) const;
+
+    // Subscripting operator to set or get an element
+    // 
+    inline float & operator []( int idx );
+
+    // Subscripting operator to get an element
+    // 
+    inline float operator []( int idx ) const;
+
+    // Add two 4-D vectors
+    // 
+    inline const Vector4 operator +( const Vector4 & vec ) const;
+
+    // Subtract a 4-D vector from another 4-D vector
+    // 
+    inline const Vector4 operator -( const Vector4 & vec ) const;
+
+    // Multiply a 4-D vector by a scalar
+    // 
+    inline const Vector4 operator *( float scalar ) const;
+
+    // Divide a 4-D vector by a scalar
+    // 
+    inline const Vector4 operator /( float scalar ) const;
+
+    // Perform compound assignment and addition with a 4-D vector
+    // 
+    inline Vector4 & operator +=( const Vector4 & vec );
+
+    // Perform compound assignment and subtraction by a 4-D vector
+    // 
+    inline Vector4 & operator -=( const Vector4 & vec );
+
+    // Perform compound assignment and multiplication by a scalar
+    // 
+    inline Vector4 & operator *=( float scalar );
+
+    // Perform compound assignment and division by a scalar
+    // 
+    inline Vector4 & operator /=( float scalar );
+
+    // Negate all elements of a 4-D vector
+    // 
+    inline const Vector4 operator -( ) const;
+
+    // Construct x axis
+    // 
+    static inline const Vector4 xAxis( );
+
+    // Construct y axis
+    // 
+    static inline const Vector4 yAxis( );
+
+    // Construct z axis
+    // 
+    static inline const Vector4 zAxis( );
+
+    // Construct w axis
+    // 
+    static inline const Vector4 wAxis( );
+
+}
+#ifdef __GNUC__
+__attribute__ ((aligned(16)))
+#endif
+;
+
+// Multiply a 4-D vector by a scalar
+// 
+inline const Vector4 operator *( float scalar, const Vector4 & vec );
+
+// Multiply two 4-D vectors per element
+// 
+inline const Vector4 mulPerElem( const Vector4 & vec0, const Vector4 & vec1 );
+
+// Divide two 4-D vectors per element
+// NOTE: 
+// Floating-point behavior matches standard library function divf4.
+// 
+inline const Vector4 divPerElem( const Vector4 & vec0, const Vector4 & vec1 );
+
+// Compute the reciprocal of a 4-D vector per element
+// NOTE: 
+// Floating-point behavior matches standard library function recipf4.
+// 
+inline const Vector4 recipPerElem( const Vector4 & vec );
+
+// Compute the square root of a 4-D vector per element
+// NOTE: 
+// Floating-point behavior matches standard library function sqrtf4.
+// 
+inline const Vector4 sqrtPerElem( const Vector4 & vec );
+
+// Compute the reciprocal square root of a 4-D vector per element
+// NOTE: 
+// Floating-point behavior matches standard library function rsqrtf4.
+// 
+inline const Vector4 rsqrtPerElem( const Vector4 & vec );
+
+// Compute the absolute value of a 4-D vector per element
+// 
+inline const Vector4 absPerElem( const Vector4 & vec );
+
+// Copy sign from one 4-D vector to another, per element
+// 
+inline const Vector4 copySignPerElem( const Vector4 & vec0, const Vector4 & vec1 );
+
+// Maximum of two 4-D vectors per element
+// 
+inline const Vector4 maxPerElem( const Vector4 & vec0, const Vector4 & vec1 );
+
+// Minimum of two 4-D vectors per element
+// 
+inline const Vector4 minPerElem( const Vector4 & vec0, const Vector4 & vec1 );
+
+// Maximum element of a 4-D vector
+// 
+inline float maxElem( const Vector4 & vec );
+
+// Minimum element of a 4-D vector
+// 
+inline float minElem( const Vector4 & vec );
+
+// Compute the sum of all elements of a 4-D vector
+// 
+inline float sum( const Vector4 & vec );
+
+// Compute the dot product of two 4-D vectors
+// 
+inline float dot( const Vector4 & vec0, const Vector4 & vec1 );
+
+// Compute the square of the length of a 4-D vector
+// 
+inline float lengthSqr( const Vector4 & vec );
+
+// Compute the length of a 4-D vector
+// 
+inline float length( const Vector4 & vec );
+
+// Normalize a 4-D vector
+// NOTE: 
+// The result is unpredictable when all elements of vec are at or near zero.
+// 
+inline const Vector4 normalize( const Vector4 & vec );
+
+// Outer product of two 4-D vectors
+// 
+inline const Matrix4 outer( const Vector4 & vec0, const Vector4 & vec1 );
+
+// Linear interpolation between two 4-D vectors
+// NOTE: 
+// Does not clamp t between 0 and 1.
+// 
+inline const Vector4 lerp( float t, const Vector4 & vec0, const Vector4 & vec1 );
+
+// Spherical linear interpolation between two 4-D vectors
+// NOTE: 
+// The result is unpredictable if the vectors point in opposite directions.
+// Does not clamp t between 0 and 1.
+// 
+inline const Vector4 slerp( float t, const Vector4 & unitVec0, const Vector4 & unitVec1 );
+
+// Conditionally select between two 4-D vectors
+// 
+inline const Vector4 select( const Vector4 & vec0, const Vector4 & vec1, bool select1 );
+
+// Load x, y, z, and w elements from the first four words of a float array.
+// 
+// 
+inline void loadXYZW( Vector4 & vec, const float * fptr );
+
+// Store x, y, z, and w elements of a 4-D vector in the first four words of a float array.
+// Memory area of previous 16 bytes and next 32 bytes from fptr might be accessed
+// 
+inline void storeXYZW( const Vector4 & vec, float * fptr );
+
+// Load four-half-floats as a 4-D vector
+// NOTE: 
+// This transformation does not support either denormalized numbers or NaNs.
+// 
+inline void loadHalfFloats( Vector4 & vec, const unsigned short * hfptr );
+
+// Store a 4-D vector as half-floats. Memory area of previous 16 bytes and next 32 bytes from <code><i>hfptr</i></code> might be accessed.
+// NOTE: 
+// This transformation does not support either denormalized numbers or NaNs. Memory area of previous 16 bytes and next 32 bytes from hfptr might be accessed.
+// 
+inline void storeHalfFloats( const Vector4 & vec, unsigned short * hfptr );
+
+#ifdef _VECTORMATH_DEBUG
+
+// Print a 4-D vector
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+inline void print( const Vector4 & vec );
+
+// Print a 4-D vector and an associated string identifier
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+inline void print( const Vector4 & vec, const char * name );
+
+#endif
+
+// A 3-D point in array-of-structures format
+//
+class Point3
+{
+    float mX;
+    float mY;
+    float mZ;
+#ifndef __GNUC__
+    float d;
+#endif
+
+public:
+    // Default constructor; does no initialization
+    // 
+    inline Point3( ) { };
+
+    // Copy a 3-D point
+    // 
+    inline Point3( const Point3 & pnt );
+
+    // Construct a 3-D point from x, y, and z elements
+    // 
+    inline Point3( float x, float y, float z );
+
+    // Copy elements from a 3-D vector into a 3-D point
+    // 
+    explicit inline Point3( const Vector3 & vec );
+
+    // Set all elements of a 3-D point to the same scalar value
+    // 
+    explicit inline Point3( float scalar );
+
+    // Assign one 3-D point to another
+    // 
+    inline Point3 & operator =( const Point3 & pnt );
+
+    // Set the x element of a 3-D point
+    // 
+    inline Point3 & setX( float x );
+
+    // Set the y element of a 3-D point
+    // 
+    inline Point3 & setY( float y );
+
+    // Set the z element of a 3-D point
+    // 
+    inline Point3 & setZ( float z );
+
+    // Get the x element of a 3-D point
+    // 
+    inline float getX( ) const;
+
+    // Get the y element of a 3-D point
+    // 
+    inline float getY( ) const;
+
+    // Get the z element of a 3-D point
+    // 
+    inline float getZ( ) const;
+
+    // Set an x, y, or z element of a 3-D point by index
+    // 
+    inline Point3 & setElem( int idx, float value );
+
+    // Get an x, y, or z element of a 3-D point by index
+    // 
+    inline float getElem( int idx ) const;
+
+    // Subscripting operator to set or get an element
+    // 
+    inline float & operator []( int idx );
+
+    // Subscripting operator to get an element
+    // 
+    inline float operator []( int idx ) const;
+
+    // Subtract a 3-D point from another 3-D point
+    // 
+    inline const Vector3 operator -( const Point3 & pnt ) const;
+
+    // Add a 3-D point to a 3-D vector
+    // 
+    inline const Point3 operator +( const Vector3 & vec ) const;
+
+    // Subtract a 3-D vector from a 3-D point
+    // 
+    inline const Point3 operator -( const Vector3 & vec ) const;
+
+    // Perform compound assignment and addition with a 3-D vector
+    // 
+    inline Point3 & operator +=( const Vector3 & vec );
+
+    // Perform compound assignment and subtraction by a 3-D vector
+    // 
+    inline Point3 & operator -=( const Vector3 & vec );
+
+}
+#ifdef __GNUC__
+__attribute__ ((aligned(16)))
+#endif
+;
+
+// Multiply two 3-D points per element
+// 
+inline const Point3 mulPerElem( const Point3 & pnt0, const Point3 & pnt1 );
+
+// Divide two 3-D points per element
+// NOTE: 
+// Floating-point behavior matches standard library function divf4.
+// 
+inline const Point3 divPerElem( const Point3 & pnt0, const Point3 & pnt1 );
+
+// Compute the reciprocal of a 3-D point per element
+// NOTE: 
+// Floating-point behavior matches standard library function recipf4.
+// 
+inline const Point3 recipPerElem( const Point3 & pnt );
+
+// Compute the square root of a 3-D point per element
+// NOTE: 
+// Floating-point behavior matches standard library function sqrtf4.
+// 
+inline const Point3 sqrtPerElem( const Point3 & pnt );
+
+// Compute the reciprocal square root of a 3-D point per element
+// NOTE: 
+// Floating-point behavior matches standard library function rsqrtf4.
+// 
+inline const Point3 rsqrtPerElem( const Point3 & pnt );
+
+// Compute the absolute value of a 3-D point per element
+// 
+inline const Point3 absPerElem( const Point3 & pnt );
+
+// Copy sign from one 3-D point to another, per element
+// 
+inline const Point3 copySignPerElem( const Point3 & pnt0, const Point3 & pnt1 );
+
+// Maximum of two 3-D points per element
+// 
+inline const Point3 maxPerElem( const Point3 & pnt0, const Point3 & pnt1 );
+
+// Minimum of two 3-D points per element
+// 
+inline const Point3 minPerElem( const Point3 & pnt0, const Point3 & pnt1 );
+
+// Maximum element of a 3-D point
+// 
+inline float maxElem( const Point3 & pnt );
+
+// Minimum element of a 3-D point
+// 
+inline float minElem( const Point3 & pnt );
+
+// Compute the sum of all elements of a 3-D point
+// 
+inline float sum( const Point3 & pnt );
+
+// Apply uniform scale to a 3-D point
+// 
+inline const Point3 scale( const Point3 & pnt, float scaleVal );
+
+// Apply non-uniform scale to a 3-D point
+// 
+inline const Point3 scale( const Point3 & pnt, const Vector3 & scaleVec );
+
+// Scalar projection of a 3-D point on a unit-length 3-D vector
+// 
+inline float projection( const Point3 & pnt, const Vector3 & unitVec );
+
+// Compute the square of the distance of a 3-D point from the coordinate-system origin
+// 
+inline float distSqrFromOrigin( const Point3 & pnt );
+
+// Compute the distance of a 3-D point from the coordinate-system origin
+// 
+inline float distFromOrigin( const Point3 & pnt );
+
+// Compute the square of the distance between two 3-D points
+// 
+inline float distSqr( const Point3 & pnt0, const Point3 & pnt1 );
+
+// Compute the distance between two 3-D points
+// 
+inline float dist( const Point3 & pnt0, const Point3 & pnt1 );
+
+// Linear interpolation between two 3-D points
+// NOTE: 
+// Does not clamp t between 0 and 1.
+// 
+inline const Point3 lerp( float t, const Point3 & pnt0, const Point3 & pnt1 );
+
+// Conditionally select between two 3-D points
+// 
+inline const Point3 select( const Point3 & pnt0, const Point3 & pnt1, bool select1 );
+
+// Load x, y, and z elements from the first three words of a float array.
+// 
+// 
+inline void loadXYZ( Point3 & pnt, const float * fptr );
+
+// Store x, y, and z elements of a 3-D point in the first three words of a float array.
+// Memory area of previous 16 bytes and next 32 bytes from fptr might be accessed
+// 
+inline void storeXYZ( const Point3 & pnt, float * fptr );
+
+// Load three-half-floats as a 3-D point
+// NOTE: 
+// This transformation does not support either denormalized numbers or NaNs.
+// 
+inline void loadHalfFloats( Point3 & pnt, const unsigned short * hfptr );
+
+// Store a 3-D point as half-floats. Memory area of previous 16 bytes and next 32 bytes from <code><i>hfptr</i></code> might be accessed.
+// NOTE: 
+// This transformation does not support either denormalized numbers or NaNs. Memory area of previous 16 bytes and next 32 bytes from hfptr might be accessed.
+// 
+inline void storeHalfFloats( const Point3 & pnt, unsigned short * hfptr );
+
+#ifdef _VECTORMATH_DEBUG
+
+// Print a 3-D point
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+inline void print( const Point3 & pnt );
+
+// Print a 3-D point and an associated string identifier
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+inline void print( const Point3 & pnt, const char * name );
+
+#endif
+
+// A quaternion in array-of-structures format
+//
+class Quat
+{
+#if defined( __APPLE__ ) && defined( BT_USE_NEON )
+    union{
+        float32x4_t vXYZW;
+        float mXYZW[4];
+    };
+#else
+    float mX;
+    float mY;
+    float mZ;
+    float mW;
+#endif
+
+public:
+    // Default constructor; does no initialization
+    // 
+    inline Quat( ) { };
+
+    // Copy a quaternion
+    // 
+    inline Quat( const Quat & quat );
+
+    // Construct a quaternion from x, y, z, and w elements
+    // 
+    inline Quat( float x, float y, float z, float w );
+    
+    // Construct a quaternion from vector of x, y, z, and w elements
+    // 
+    inline Quat( float32x4_t fXYZW );
+
+    // Construct a quaternion from a 3-D vector and a scalar
+    // 
+    inline Quat( const Vector3 & xyz, float w );
+
+    // Copy elements from a 4-D vector into a quaternion
+    // 
+    explicit inline Quat( const Vector4 & vec );
+
+    // Convert a rotation matrix to a unit-length quaternion
+    // 
+    explicit inline Quat( const Matrix3 & rotMat );
+
+    // Set all elements of a quaternion to the same scalar value
+    // 
+    explicit inline Quat( float scalar );
+    
+    // Assign one quaternion to another
+    // 
+    inline Quat & operator =( const Quat & quat );
+
+    // Set the x, y, and z elements of a quaternion
+    // NOTE: 
+    // This function does not change the w element.
+    // 
+    inline Quat & setXYZ( const Vector3 & vec );
+
+    // Get the x, y, and z elements of a quaternion
+    // 
+    inline const Vector3 getXYZ( ) const;
+
+    // Set the x element of a quaternion
+    // 
+    inline Quat & setX( float x );
+
+    // Set the y element of a quaternion
+    // 
+    inline Quat & setY( float y );
+
+    // Set the z element of a quaternion
+    // 
+    inline Quat & setZ( float z );
+
+    // Set the w element of a quaternion
+    // 
+    inline Quat & setW( float w );
+
+#if defined( __APPLE__ ) && defined( BT_USE_NEON )
+    inline float32x4_t getvXYZW( ) const;
+#endif
+    
+    // Get the x element of a quaternion
+    // 
+    inline float getX( ) const;
+
+    // Get the y element of a quaternion
+    // 
+    inline float getY( ) const;
+
+    // Get the z element of a quaternion
+    // 
+    inline float getZ( ) const;
+
+    // Get the w element of a quaternion
+    // 
+    inline float getW( ) const;
+
+    // Set an x, y, z, or w element of a quaternion by index
+    // 
+    inline Quat & setElem( int idx, float value );
+
+    // Get an x, y, z, or w element of a quaternion by index
+    // 
+    inline float getElem( int idx ) const;
+
+    // Subscripting operator to set or get an element
+    // 
+    inline float & operator []( int idx );
+
+    // Subscripting operator to get an element
+    // 
+    inline float operator []( int idx ) const;
+
+    // Add two quaternions
+    // 
+    inline const Quat operator +( const Quat & quat ) const;
+
+    // Subtract a quaternion from another quaternion
+    // 
+    inline const Quat operator -( const Quat & quat ) const;
+
+    // Multiply two quaternions
+    // 
+    inline const Quat operator *( const Quat & quat ) const;
+
+    // Multiply a quaternion by a scalar
+    // 
+    inline const Quat operator *( float scalar ) const;
+
+    // Divide a quaternion by a scalar
+    // 
+    inline const Quat operator /( float scalar ) const;
+
+    // Perform compound assignment and addition with a quaternion
+    // 
+    inline Quat & operator +=( const Quat & quat );
+
+    // Perform compound assignment and subtraction by a quaternion
+    // 
+    inline Quat & operator -=( const Quat & quat );
+
+    // Perform compound assignment and multiplication by a quaternion
+    // 
+    inline Quat & operator *=( const Quat & quat );
+
+    // Perform compound assignment and multiplication by a scalar
+    // 
+    inline Quat & operator *=( float scalar );
+
+    // Perform compound assignment and division by a scalar
+    // 
+    inline Quat & operator /=( float scalar );
+
+    // Negate all elements of a quaternion
+    // 
+    inline const Quat operator -( ) const;
+
+    // Construct an identity quaternion
+    // 
+    static inline const Quat identity( );
+
+    // Construct a quaternion to rotate between two unit-length 3-D vectors
+    // NOTE: 
+    // The result is unpredictable if unitVec0 and unitVec1 point in opposite directions.
+    // 
+    static inline const Quat rotation( const Vector3 & unitVec0, const Vector3 & unitVec1 );
+
+    // Construct a quaternion to rotate around a unit-length 3-D vector
+    // 
+    static inline const Quat rotation( float radians, const Vector3 & unitVec );
+
+    // Construct a quaternion to rotate around the x axis
+    // 
+    static inline const Quat rotationX( float radians );
+
+    // Construct a quaternion to rotate around the y axis
+    // 
+    static inline const Quat rotationY( float radians );
+
+    // Construct a quaternion to rotate around the z axis
+    // 
+    static inline const Quat rotationZ( float radians );
+
+}
+#ifdef __GNUC__
+__attribute__ ((aligned(16)))
+#endif
+;
+
+// Multiply a quaternion by a scalar
+// 
+inline const Quat operator *( float scalar, const Quat & quat );
+
+// Compute the conjugate of a quaternion
+// 
+inline const Quat conj( const Quat & quat );
+
+// Use a unit-length quaternion to rotate a 3-D vector
+// 
+inline const Vector3 rotate( const Quat & unitQuat, const Vector3 & vec );
+
+// Compute the dot product of two quaternions
+// 
+inline float dot( const Quat & quat0, const Quat & quat1 );
+
+// Compute the norm of a quaternion
+// 
+inline float norm( const Quat & quat );
+
+// Compute the length of a quaternion
+// 
+inline float length( const Quat & quat );
+
+// Normalize a quaternion
+// NOTE: 
+// The result is unpredictable when all elements of quat are at or near zero.
+// 
+inline const Quat normalize( const Quat & quat );
+
+// Linear interpolation between two quaternions
+// NOTE: 
+// Does not clamp t between 0 and 1.
+// 
+inline const Quat lerp( float t, const Quat & quat0, const Quat & quat1 );
+
+// Spherical linear interpolation between two quaternions
+// NOTE: 
+// Interpolates along the shortest path between orientations.
+// Does not clamp t between 0 and 1.
+// 
+inline const Quat slerp( float t, const Quat & unitQuat0, const Quat & unitQuat1 );
+
+// Spherical quadrangle interpolation
+// 
+inline const Quat squad( float t, const Quat & unitQuat0, const Quat & unitQuat1, const Quat & unitQuat2, const Quat & unitQuat3 );
+
+// Conditionally select between two quaternions
+// 
+inline const Quat select( const Quat & quat0, const Quat & quat1, bool select1 );
+
+// Load x, y, z, and w elements from the first four words of a float array.
+// 
+// 
+inline void loadXYZW( Quat & quat, const float * fptr );
+
+// Store x, y, z, and w elements of a quaternion in the first four words of a float array.
+// Memory area of previous 16 bytes and next 32 bytes from fptr might be accessed
+// 
+inline void storeXYZW( const Quat & quat, float * fptr );
+
+#ifdef _VECTORMATH_DEBUG
+
+// Print a quaternion
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+inline void print( const Quat & quat );
+
+// Print a quaternion and an associated string identifier
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+inline void print( const Quat & quat, const char * name );
+
+#endif
+
+// A 3x3 matrix in array-of-structures format
+//
+class Matrix3
+{
+    Vector3 mCol0;
+    Vector3 mCol1;
+    Vector3 mCol2;
+
+public:
+    // Default constructor; does no initialization
+    // 
+    inline Matrix3( ) { };
+
+    // Copy a 3x3 matrix
+    // 
+    inline Matrix3( const Matrix3 & mat );
+
+    // Construct a 3x3 matrix containing the specified columns
+    // 
+    inline Matrix3( const Vector3 & col0, const Vector3 & col1, const Vector3 & col2 );
+
+    // Construct a 3x3 rotation matrix from a unit-length quaternion
+    // 
+    explicit inline Matrix3( const Quat & unitQuat );
+
+    // Set all elements of a 3x3 matrix to the same scalar value
+    // 
+    explicit inline Matrix3( float scalar );
+
+    // Assign one 3x3 matrix to another
+    // 
+    inline Matrix3 & operator =( const Matrix3 & mat );
+
+    // Set column 0 of a 3x3 matrix
+    // 
+    inline Matrix3 & setCol0( const Vector3 & col0 );
+
+    // Set column 1 of a 3x3 matrix
+    // 
+    inline Matrix3 & setCol1( const Vector3 & col1 );
+
+    // Set column 2 of a 3x3 matrix
+    // 
+    inline Matrix3 & setCol2( const Vector3 & col2 );
+
+    // Get column 0 of a 3x3 matrix
+    // 
+    inline const Vector3 getCol0( ) const;
+
+    // Get column 1 of a 3x3 matrix
+    // 
+    inline const Vector3 getCol1( ) const;
+
+    // Get column 2 of a 3x3 matrix
+    // 
+    inline const Vector3 getCol2( ) const;
+
+    // Set the column of a 3x3 matrix referred to by the specified index
+    // 
+    inline Matrix3 & setCol( int col, const Vector3 & vec );
+
+    // Set the row of a 3x3 matrix referred to by the specified index
+    // 
+    inline Matrix3 & setRow( int row, const Vector3 & vec );
+
+    // Get the column of a 3x3 matrix referred to by the specified index
+    // 
+    inline const Vector3 getCol( int col ) const;
+
+    // Get the row of a 3x3 matrix referred to by the specified index
+    // 
+    inline const Vector3 getRow( int row ) const;
+
+    // Subscripting operator to set or get a column
+    // 
+    inline Vector3 & operator []( int col );
+
+    // Subscripting operator to get a column
+    // 
+    inline const Vector3 operator []( int col ) const;
+
+    // Set the element of a 3x3 matrix referred to by column and row indices
+    // 
+    inline Matrix3 & setElem( int col, int row, float val );
+
+    // Get the element of a 3x3 matrix referred to by column and row indices
+    // 
+    inline float getElem( int col, int row ) const;
+
+    // Add two 3x3 matrices
+    // 
+    inline const Matrix3 operator +( const Matrix3 & mat ) const;
+
+    // Subtract a 3x3 matrix from another 3x3 matrix
+    // 
+    inline const Matrix3 operator -( const Matrix3 & mat ) const;
+
+    // Negate all elements of a 3x3 matrix
+    // 
+    inline const Matrix3 operator -( ) const;
+
+    // Multiply a 3x3 matrix by a scalar
+    // 
+    inline const Matrix3 operator *( float scalar ) const;
+
+    // Multiply a 3x3 matrix by a 3-D vector
+    // 
+    inline const Vector3 operator *( const Vector3 & vec ) const;
+
+    // Multiply two 3x3 matrices
+    // 
+    inline const Matrix3 operator *( const Matrix3 & mat ) const;
+
+    // Perform compound assignment and addition with a 3x3 matrix
+    // 
+    inline Matrix3 & operator +=( const Matrix3 & mat );
+
+    // Perform compound assignment and subtraction by a 3x3 matrix
+    // 
+    inline Matrix3 & operator -=( const Matrix3 & mat );
+
+    // Perform compound assignment and multiplication by a scalar
+    // 
+    inline Matrix3 & operator *=( float scalar );
+
+    // Perform compound assignment and multiplication by a 3x3 matrix
+    // 
+    inline Matrix3 & operator *=( const Matrix3 & mat );
+
+    // Construct an identity 3x3 matrix
+    // 
+    static inline const Matrix3 identity( );
+
+    // Construct a 3x3 matrix to rotate around the x axis
+    // 
+    static inline const Matrix3 rotationX( float radians );
+
+    // Construct a 3x3 matrix to rotate around the y axis
+    // 
+    static inline const Matrix3 rotationY( float radians );
+
+    // Construct a 3x3 matrix to rotate around the z axis
+    // 
+    static inline const Matrix3 rotationZ( float radians );
+
+    // Construct a 3x3 matrix to rotate around the x, y, and z axes
+    // 
+    static inline const Matrix3 rotationZYX( const Vector3 & radiansXYZ );
+
+    // Construct a 3x3 matrix to rotate around a unit-length 3-D vector
+    // 
+    static inline const Matrix3 rotation( float radians, const Vector3 & unitVec );
+
+    // Construct a rotation matrix from a unit-length quaternion
+    // 
+    static inline const Matrix3 rotation( const Quat & unitQuat );
+
+    // Construct a 3x3 matrix to perform scaling
+    // 
+    static inline const Matrix3 scale( const Vector3 & scaleVec );
+
+};
+// Multiply a 3x3 matrix by a scalar
+// 
+inline const Matrix3 operator *( float scalar, const Matrix3 & mat );
+
+// Append (post-multiply) a scale transformation to a 3x3 matrix
+// NOTE: 
+// Faster than creating and multiplying a scale transformation matrix.
+// 
+inline const Matrix3 appendScale( const Matrix3 & mat, const Vector3 & scaleVec );
+
+// Prepend (pre-multiply) a scale transformation to a 3x3 matrix
+// NOTE: 
+// Faster than creating and multiplying a scale transformation matrix.
+// 
+inline const Matrix3 prependScale( const Vector3 & scaleVec, const Matrix3 & mat );
+
+// Multiply two 3x3 matrices per element
+// 
+inline const Matrix3 mulPerElem( const Matrix3 & mat0, const Matrix3 & mat1 );
+
+// Compute the absolute value of a 3x3 matrix per element
+// 
+inline const Matrix3 absPerElem( const Matrix3 & mat );
+
+// Transpose of a 3x3 matrix
+// 
+inline const Matrix3 transpose( const Matrix3 & mat );
+
+// Compute the inverse of a 3x3 matrix
+// NOTE: 
+// Result is unpredictable when the determinant of mat is equal to or near 0.
+// 
+inline const Matrix3 inverse( const Matrix3 & mat );
+
+// Determinant of a 3x3 matrix
+// 
+inline float determinant( const Matrix3 & mat );
+
+// Conditionally select between two 3x3 matrices
+// 
+inline const Matrix3 select( const Matrix3 & mat0, const Matrix3 & mat1, bool select1 );
+
+#ifdef _VECTORMATH_DEBUG
+
+// Print a 3x3 matrix
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+inline void print( const Matrix3 & mat );
+
+// Print a 3x3 matrix and an associated string identifier
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+inline void print( const Matrix3 & mat, const char * name );
+
+#endif
+
+// A 4x4 matrix in array-of-structures format
+//
+class Matrix4
+{
+    Vector4 mCol0;
+    Vector4 mCol1;
+    Vector4 mCol2;
+    Vector4 mCol3;
+
+public:
+    // Default constructor; does no initialization
+    // 
+    inline Matrix4( ) { };
+
+    // Copy a 4x4 matrix
+    // 
+    inline Matrix4( const Matrix4 & mat );
+
+    // Construct a 4x4 matrix containing the specified columns
+    // 
+    inline Matrix4( const Vector4 & col0, const Vector4 & col1, const Vector4 & col2, const Vector4 & col3 );
+
+    // Construct a 4x4 matrix from a 3x4 transformation matrix
+    // 
+    explicit inline Matrix4( const Transform3 & mat );
+
+    // Construct a 4x4 matrix from a 3x3 matrix and a 3-D vector
+    // 
+    inline Matrix4( const Matrix3 & mat, const Vector3 & translateVec );
+
+    // Construct a 4x4 matrix from a unit-length quaternion and a 3-D vector
+    // 
+    inline Matrix4( const Quat & unitQuat, const Vector3 & translateVec );
+
+    // Set all elements of a 4x4 matrix to the same scalar value
+    // 
+    explicit inline Matrix4( float scalar );
+
+    // Assign one 4x4 matrix to another
+    // 
+    inline Matrix4 & operator =( const Matrix4 & mat );
+
+    // Set the upper-left 3x3 submatrix
+    // NOTE: 
+    // This function does not change the bottom row elements.
+    // 
+    inline Matrix4 & setUpper3x3( const Matrix3 & mat3 );
+
+    // Get the upper-left 3x3 submatrix of a 4x4 matrix
+    // 
+    inline const Matrix3 getUpper3x3( ) const;
+
+    // Set translation component
+    // NOTE: 
+    // This function does not change the bottom row elements.
+    // 
+    inline Matrix4 & setTranslation( const Vector3 & translateVec );
+
+    // Get the translation component of a 4x4 matrix
+    // 
+    inline const Vector3 getTranslation( ) const;
+
+    // Set column 0 of a 4x4 matrix
+    // 
+    inline Matrix4 & setCol0( const Vector4 & col0 );
+
+    // Set column 1 of a 4x4 matrix
+    // 
+    inline Matrix4 & setCol1( const Vector4 & col1 );
+
+    // Set column 2 of a 4x4 matrix
+    // 
+    inline Matrix4 & setCol2( const Vector4 & col2 );
+
+    // Set column 3 of a 4x4 matrix
+    // 
+    inline Matrix4 & setCol3( const Vector4 & col3 );
+
+    // Get column 0 of a 4x4 matrix
+    // 
+    inline const Vector4 getCol0( ) const;
+
+    // Get column 1 of a 4x4 matrix
+    // 
+    inline const Vector4 getCol1( ) const;
+
+    // Get column 2 of a 4x4 matrix
+    // 
+    inline const Vector4 getCol2( ) const;
+
+    // Get column 3 of a 4x4 matrix
+    // 
+    inline const Vector4 getCol3( ) const;
+
+    // Set the column of a 4x4 matrix referred to by the specified index
+    // 
+    inline Matrix4 & setCol( int col, const Vector4 & vec );
+
+    // Set the row of a 4x4 matrix referred to by the specified index
+    // 
+    inline Matrix4 & setRow( int row, const Vector4 & vec );
+
+    // Get the column of a 4x4 matrix referred to by the specified index
+    // 
+    inline const Vector4 getCol( int col ) const;
+
+    // Get the row of a 4x4 matrix referred to by the specified index
+    // 
+    inline const Vector4 getRow( int row ) const;
+
+    // Subscripting operator to set or get a column
+    // 
+    inline Vector4 & operator []( int col );
+
+    // Subscripting operator to get a column
+    // 
+    inline const Vector4 operator []( int col ) const;
+
+    // Set the element of a 4x4 matrix referred to by column and row indices
+    // 
+    inline Matrix4 & setElem( int col, int row, float val );
+
+    // Get the element of a 4x4 matrix referred to by column and row indices
+    // 
+    inline float getElem( int col, int row ) const;
+
+    // Add two 4x4 matrices
+    // 
+    inline const Matrix4 operator +( const Matrix4 & mat ) const;
+
+    // Subtract a 4x4 matrix from another 4x4 matrix
+    // 
+    inline const Matrix4 operator -( const Matrix4 & mat ) const;
+
+    // Negate all elements of a 4x4 matrix
+    // 
+    inline const Matrix4 operator -( ) const;
+
+    // Multiply a 4x4 matrix by a scalar
+    // 
+    inline const Matrix4 operator *( float scalar ) const;
+
+    // Multiply a 4x4 matrix by a 4-D vector
+    // 
+    inline const Vector4 operator *( const Vector4 & vec ) const;
+
+    // Multiply a 4x4 matrix by a 3-D vector
+    // 
+    inline const Vector4 operator *( const Vector3 & vec ) const;
+
+    // Multiply a 4x4 matrix by a 3-D point
+    // 
+    inline const Vector4 operator *( const Point3 & pnt ) const;
+
+    // Multiply two 4x4 matrices
+    // 
+    inline const Matrix4 operator *( const Matrix4 & mat ) const;
+
+    // Multiply a 4x4 matrix by a 3x4 transformation matrix
+    // 
+    inline const Matrix4 operator *( const Transform3 & tfrm ) const;
+
+    // Perform compound assignment and addition with a 4x4 matrix
+    // 
+    inline Matrix4 & operator +=( const Matrix4 & mat );
+
+    // Perform compound assignment and subtraction by a 4x4 matrix
+    // 
+    inline Matrix4 & operator -=( const Matrix4 & mat );
+
+    // Perform compound assignment and multiplication by a scalar
+    // 
+    inline Matrix4 & operator *=( float scalar );
+
+    // Perform compound assignment and multiplication by a 4x4 matrix
+    // 
+    inline Matrix4 & operator *=( const Matrix4 & mat );
+
+    // Perform compound assignment and multiplication by a 3x4 transformation matrix
+    // 
+    inline Matrix4 & operator *=( const Transform3 & tfrm );
+
+    // Construct an identity 4x4 matrix
+    // 
+    static inline const Matrix4 identity( );
+
+    // Construct a 4x4 matrix to rotate around the x axis
+    // 
+    static inline const Matrix4 rotationX( float radians );
+
+    // Construct a 4x4 matrix to rotate around the y axis
+    // 
+    static inline const Matrix4 rotationY( float radians );
+
+    // Construct a 4x4 matrix to rotate around the z axis
+    // 
+    static inline const Matrix4 rotationZ( float radians );
+
+    // Construct a 4x4 matrix to rotate around the x, y, and z axes
+    // 
+    static inline const Matrix4 rotationZYX( const Vector3 & radiansXYZ );
+
+    // Construct a 4x4 matrix to rotate around a unit-length 3-D vector
+    // 
+    static inline const Matrix4 rotation( float radians, const Vector3 & unitVec );
+
+    // Construct a rotation matrix from a unit-length quaternion
+    // 
+    static inline const Matrix4 rotation( const Quat & unitQuat );
+
+    // Construct a 4x4 matrix to perform scaling
+    // 
+    static inline const Matrix4 scale( const Vector3 & scaleVec );
+
+    // Construct a 4x4 matrix to perform translation
+    // 
+    static inline const Matrix4 translation( const Vector3 & translateVec );
+
+    // Construct viewing matrix based on eye position, position looked at, and up direction
+    // 
+    static inline const Matrix4 lookAt( const Point3 & eyePos, const Point3 & lookAtPos, const Vector3 & upVec );
+
+    // Construct a perspective projection matrix
+    // 
+    static inline const Matrix4 perspective( float fovyRadians, float aspect, float zNear, float zFar );
+
+    // Construct a perspective projection matrix based on frustum
+    // 
+    static inline const Matrix4 frustum( float left, float right, float bottom, float top, float zNear, float zFar );
+
+    // Construct an orthographic projection matrix
+    // 
+    static inline const Matrix4 orthographic( float left, float right, float bottom, float top, float zNear, float zFar );
+
+};
+// Multiply a 4x4 matrix by a scalar
+// 
+inline const Matrix4 operator *( float scalar, const Matrix4 & mat );
+
+// Append (post-multiply) a scale transformation to a 4x4 matrix
+// NOTE: 
+// Faster than creating and multiplying a scale transformation matrix.
+// 
+inline const Matrix4 appendScale( const Matrix4 & mat, const Vector3 & scaleVec );
+
+// Prepend (pre-multiply) a scale transformation to a 4x4 matrix
+// NOTE: 
+// Faster than creating and multiplying a scale transformation matrix.
+// 
+inline const Matrix4 prependScale( const Vector3 & scaleVec, const Matrix4 & mat );
+
+// Multiply two 4x4 matrices per element
+// 
+inline const Matrix4 mulPerElem( const Matrix4 & mat0, const Matrix4 & mat1 );
+
+// Compute the absolute value of a 4x4 matrix per element
+// 
+inline const Matrix4 absPerElem( const Matrix4 & mat );
+
+// Transpose of a 4x4 matrix
+// 
+inline const Matrix4 transpose( const Matrix4 & mat );
+
+// Compute the inverse of a 4x4 matrix
+// NOTE: 
+// Result is unpredictable when the determinant of mat is equal to or near 0.
+// 
+inline const Matrix4 inverse( const Matrix4 & mat );
+
+// Compute the inverse of a 4x4 matrix, which is expected to be an affine matrix
+// NOTE: 
+// This can be used to achieve better performance than a general inverse when the specified 4x4 matrix meets the given restrictions.  The result is unpredictable when the determinant of mat is equal to or near 0.
+// 
+inline const Matrix4 affineInverse( const Matrix4 & mat );
+
+// Compute the inverse of a 4x4 matrix, which is expected to be an affine matrix with an orthogonal upper-left 3x3 submatrix
+// NOTE: 
+// This can be used to achieve better performance than a general inverse when the specified 4x4 matrix meets the given restrictions.
+// 
+inline const Matrix4 orthoInverse( const Matrix4 & mat );
+
+// Determinant of a 4x4 matrix
+// 
+inline float determinant( const Matrix4 & mat );
+
+// Conditionally select between two 4x4 matrices
+// 
+inline const Matrix4 select( const Matrix4 & mat0, const Matrix4 & mat1, bool select1 );
+
+#ifdef _VECTORMATH_DEBUG
+
+// Print a 4x4 matrix
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+inline void print( const Matrix4 & mat );
+
+// Print a 4x4 matrix and an associated string identifier
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+inline void print( const Matrix4 & mat, const char * name );
+
+#endif
+
+// A 3x4 transformation matrix in array-of-structures format
+//
+class Transform3
+{
+    Vector3 mCol0;
+    Vector3 mCol1;
+    Vector3 mCol2;
+    Vector3 mCol3;
+
+public:
+    // Default constructor; does no initialization
+    // 
+    inline Transform3( ) { };
+
+    // Copy a 3x4 transformation matrix
+    // 
+    inline Transform3( const Transform3 & tfrm );
+
+    // Construct a 3x4 transformation matrix containing the specified columns
+    // 
+    inline Transform3( const Vector3 & col0, const Vector3 & col1, const Vector3 & col2, const Vector3 & col3 );
+
+    // Construct a 3x4 transformation matrix from a 3x3 matrix and a 3-D vector
+    // 
+    inline Transform3( const Matrix3 & tfrm, const Vector3 & translateVec );
+
+    // Construct a 3x4 transformation matrix from a unit-length quaternion and a 3-D vector
+    // 
+    inline Transform3( const Quat & unitQuat, const Vector3 & translateVec );
+
+    // Set all elements of a 3x4 transformation matrix to the same scalar value
+    // 
+    explicit inline Transform3( float scalar );
+
+    // Assign one 3x4 transformation matrix to another
+    // 
+    inline Transform3 & operator =( const Transform3 & tfrm );
+
+    // Set the upper-left 3x3 submatrix
+    // 
+    inline Transform3 & setUpper3x3( const Matrix3 & mat3 );
+
+    // Get the upper-left 3x3 submatrix of a 3x4 transformation matrix
+    // 
+    inline const Matrix3 getUpper3x3( ) const;
+
+    // Set translation component
+    // 
+    inline Transform3 & setTranslation( const Vector3 & translateVec );
+
+    // Get the translation component of a 3x4 transformation matrix
+    // 
+    inline const Vector3 getTranslation( ) const;
+
+    // Set column 0 of a 3x4 transformation matrix
+    // 
+    inline Transform3 & setCol0( const Vector3 & col0 );
+
+    // Set column 1 of a 3x4 transformation matrix
+    // 
+    inline Transform3 & setCol1( const Vector3 & col1 );
+
+    // Set column 2 of a 3x4 transformation matrix
+    // 
+    inline Transform3 & setCol2( const Vector3 & col2 );
+
+    // Set column 3 of a 3x4 transformation matrix
+    // 
+    inline Transform3 & setCol3( const Vector3 & col3 );
+
+    // Get column 0 of a 3x4 transformation matrix
+    // 
+    inline const Vector3 getCol0( ) const;
+
+    // Get column 1 of a 3x4 transformation matrix
+    // 
+    inline const Vector3 getCol1( ) const;
+
+    // Get column 2 of a 3x4 transformation matrix
+    // 
+    inline const Vector3 getCol2( ) const;
+
+    // Get column 3 of a 3x4 transformation matrix
+    // 
+    inline const Vector3 getCol3( ) const;
+
+    // Set the column of a 3x4 transformation matrix referred to by the specified index
+    // 
+    inline Transform3 & setCol( int col, const Vector3 & vec );
+
+    // Set the row of a 3x4 transformation matrix referred to by the specified index
+    // 
+    inline Transform3 & setRow( int row, const Vector4 & vec );
+
+    // Get the column of a 3x4 transformation matrix referred to by the specified index
+    // 
+    inline const Vector3 getCol( int col ) const;
+
+    // Get the row of a 3x4 transformation matrix referred to by the specified index
+    // 
+    inline const Vector4 getRow( int row ) const;
+
+    // Subscripting operator to set or get a column
+    // 
+    inline Vector3 & operator []( int col );
+
+    // Subscripting operator to get a column
+    // 
+    inline const Vector3 operator []( int col ) const;
+
+    // Set the element of a 3x4 transformation matrix referred to by column and row indices
+    // 
+    inline Transform3 & setElem( int col, int row, float val );
+
+    // Get the element of a 3x4 transformation matrix referred to by column and row indices
+    // 
+    inline float getElem( int col, int row ) const;
+
+    // Multiply a 3x4 transformation matrix by a 3-D vector
+    // 
+    inline const Vector3 operator *( const Vector3 & vec ) const;
+
+    // Multiply a 3x4 transformation matrix by a 3-D point
+    // 
+    inline const Point3 operator *( const Point3 & pnt ) const;
+
+    // Multiply two 3x4 transformation matrices
+    // 
+    inline const Transform3 operator *( const Transform3 & tfrm ) const;
+
+    // Perform compound assignment and multiplication by a 3x4 transformation matrix
+    // 
+    inline Transform3 & operator *=( const Transform3 & tfrm );
+
+    // Construct an identity 3x4 transformation matrix
+    // 
+    static inline const Transform3 identity( );
+
+    // Construct a 3x4 transformation matrix to rotate around the x axis
+    // 
+    static inline const Transform3 rotationX( float radians );
+
+    // Construct a 3x4 transformation matrix to rotate around the y axis
+    // 
+    static inline const Transform3 rotationY( float radians );
+
+    // Construct a 3x4 transformation matrix to rotate around the z axis
+    // 
+    static inline const Transform3 rotationZ( float radians );
+
+    // Construct a 3x4 transformation matrix to rotate around the x, y, and z axes
+    // 
+    static inline const Transform3 rotationZYX( const Vector3 & radiansXYZ );
+
+    // Construct a 3x4 transformation matrix to rotate around a unit-length 3-D vector
+    // 
+    static inline const Transform3 rotation( float radians, const Vector3 & unitVec );
+
+    // Construct a rotation matrix from a unit-length quaternion
+    // 
+    static inline const Transform3 rotation( const Quat & unitQuat );
+
+    // Construct a 3x4 transformation matrix to perform scaling
+    // 
+    static inline const Transform3 scale( const Vector3 & scaleVec );
+
+    // Construct a 3x4 transformation matrix to perform translation
+    // 
+    static inline const Transform3 translation( const Vector3 & translateVec );
+
+};
+// Append (post-multiply) a scale transformation to a 3x4 transformation matrix
+// NOTE: 
+// Faster than creating and multiplying a scale transformation matrix.
+// 
+inline const Transform3 appendScale( const Transform3 & tfrm, const Vector3 & scaleVec );
+
+// Prepend (pre-multiply) a scale transformation to a 3x4 transformation matrix
+// NOTE: 
+// Faster than creating and multiplying a scale transformation matrix.
+// 
+inline const Transform3 prependScale( const Vector3 & scaleVec, const Transform3 & tfrm );
+
+// Multiply two 3x4 transformation matrices per element
+// 
+inline const Transform3 mulPerElem( const Transform3 & tfrm0, const Transform3 & tfrm1 );
+
+// Compute the absolute value of a 3x4 transformation matrix per element
+// 
+inline const Transform3 absPerElem( const Transform3 & tfrm );
+
+// Inverse of a 3x4 transformation matrix
+// NOTE: 
+// Result is unpredictable when the determinant of the left 3x3 submatrix is equal to or near 0.
+// 
+inline const Transform3 inverse( const Transform3 & tfrm );
+
+// Compute the inverse of a 3x4 transformation matrix, expected to have an orthogonal upper-left 3x3 submatrix
+// NOTE: 
+// This can be used to achieve better performance than a general inverse when the specified 3x4 transformation matrix meets the given restrictions.
+// 
+inline const Transform3 orthoInverse( const Transform3 & tfrm );
+
+// Conditionally select between two 3x4 transformation matrices
+// 
+inline const Transform3 select( const Transform3 & tfrm0, const Transform3 & tfrm1, bool select1 );
+
+#ifdef _VECTORMATH_DEBUG
+
+// Print a 3x4 transformation matrix
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+inline void print( const Transform3 & tfrm );
+
+// Print a 3x4 transformation matrix and an associated string identifier
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+inline void print( const Transform3 & tfrm, const char * name );
+
+#endif
+
+} // namespace Aos
+} // namespace Vectormath
+
+#include "vec_aos.h"
+#include "quat_aos.h"
+#include "mat_aos.h"
+
+#endif
+
diff --git a/test/Bullet2/vectormath/scalar/boolInVec.h b/test/Bullet2/vectormath/scalar/boolInVec.h
new file mode 100644
index 000000000..c5eeeebd7
--- /dev/null
+++ b/test/Bullet2/vectormath/scalar/boolInVec.h
@@ -0,0 +1,225 @@
+/*
+   Copyright (C) 2009 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+
+*/
+
+#ifndef _BOOLINVEC_H
+#define _BOOLINVEC_H
+
+#include <math.h>
+namespace Vectormath {
+
+class floatInVec;
+
+//--------------------------------------------------------------------------------------------------
+// boolInVec class
+//
+
+class boolInVec
+{
+private:
+    unsigned int mData;
+
+public:
+    // Default constructor; does no initialization
+    //
+    inline boolInVec( ) { };
+
+    // Construct from a value converted from float
+    //
+    inline boolInVec(floatInVec vec);
+
+    // Explicit cast from bool
+    //
+    explicit inline boolInVec(bool scalar);
+
+    // Explicit cast to bool
+    //
+    inline bool getAsBool() const;
+
+#ifndef _VECTORMATH_NO_SCALAR_CAST
+    // Implicit cast to bool
+    //
+    inline operator bool() const;
+#endif
+
+    // Boolean negation operator
+    //
+    inline const boolInVec operator ! () const;
+
+    // Assignment operator
+    //
+    inline boolInVec& operator = (boolInVec vec);
+
+    // Boolean and assignment operator
+    //
+    inline boolInVec& operator &= (boolInVec vec);
+
+    // Boolean exclusive or assignment operator
+    //
+    inline boolInVec& operator ^= (boolInVec vec);
+
+    // Boolean or assignment operator
+    //
+    inline boolInVec& operator |= (boolInVec vec);
+
+};
+
+// Equal operator
+//
+inline const boolInVec operator == (boolInVec vec0, boolInVec vec1);
+
+// Not equal operator
+//
+inline const boolInVec operator != (boolInVec vec0, boolInVec vec1);
+
+// And operator
+//
+inline const boolInVec operator & (boolInVec vec0, boolInVec vec1);
+
+// Exclusive or operator
+//
+inline const boolInVec operator ^ (boolInVec vec0, boolInVec vec1);
+
+// Or operator
+//
+inline const boolInVec operator | (boolInVec vec0, boolInVec vec1);
+
+// Conditionally select between two values
+//
+inline const boolInVec select(boolInVec vec0, boolInVec vec1, boolInVec select_vec1);
+
+
+} // namespace Vectormath
+
+
+//--------------------------------------------------------------------------------------------------
+// boolInVec implementation
+//
+
+#include "floatInVec.h"
+
+namespace Vectormath {
+
+inline
+boolInVec::boolInVec(floatInVec vec)
+{
+    *this = (vec != floatInVec(0.0f));
+}
+
+inline
+boolInVec::boolInVec(bool scalar)
+{
+    mData = -(int)scalar;
+}
+
+inline
+bool
+boolInVec::getAsBool() const
+{
+    return (mData > 0);
+}
+
+#ifndef _VECTORMATH_NO_SCALAR_CAST
+inline
+boolInVec::operator bool() const
+{
+    return getAsBool();
+}
+#endif
+
+inline
+const boolInVec
+boolInVec::operator ! () const
+{
+    return boolInVec(!mData);
+}
+
+inline
+boolInVec&
+boolInVec::operator = (boolInVec vec)
+{
+    mData = vec.mData;
+    return *this;
+}
+
+inline
+boolInVec&
+boolInVec::operator &= (boolInVec vec)
+{
+    *this = *this & vec;
+    return *this;
+}
+
+inline
+boolInVec&
+boolInVec::operator ^= (boolInVec vec)
+{
+    *this = *this ^ vec;
+    return *this;
+}
+
+inline
+boolInVec&
+boolInVec::operator |= (boolInVec vec)
+{
+    *this = *this | vec;
+    return *this;
+}
+
+inline
+const boolInVec
+operator == (boolInVec vec0, boolInVec vec1)
+{
+    return boolInVec(vec0.getAsBool() == vec1.getAsBool());
+}
+
+inline
+const boolInVec
+operator != (boolInVec vec0, boolInVec vec1)
+{
+    return !(vec0 == vec1);
+}
+
+inline
+const boolInVec
+operator & (boolInVec vec0, boolInVec vec1)
+{
+    return boolInVec(vec0.getAsBool() & vec1.getAsBool());
+}
+
+inline
+const boolInVec
+operator | (boolInVec vec0, boolInVec vec1)
+{
+    return boolInVec(vec0.getAsBool() | vec1.getAsBool());
+}
+
+inline
+const boolInVec
+operator ^ (boolInVec vec0, boolInVec vec1)
+{
+    return boolInVec(vec0.getAsBool() ^ vec1.getAsBool());
+}
+
+inline
+const boolInVec
+select(boolInVec vec0, boolInVec vec1, boolInVec select_vec1)
+{
+    return (select_vec1.getAsBool() == 0) ? vec0 : vec1;
+}
+
+} // namespace Vectormath
+
+#endif // boolInVec_h
diff --git a/test/Bullet2/vectormath/scalar/floatInVec.h b/test/Bullet2/vectormath/scalar/floatInVec.h
new file mode 100644
index 000000000..12d89e43d
--- /dev/null
+++ b/test/Bullet2/vectormath/scalar/floatInVec.h
@@ -0,0 +1,343 @@
+/*
+   Copyright (C) 2009 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+
+*/
+#ifndef _FLOATINVEC_H
+#define _FLOATINVEC_H
+
+#include <math.h>
+namespace Vectormath {
+
+class boolInVec;
+
+//--------------------------------------------------------------------------------------------------
+// floatInVec class
+//
+
+// A class representing a scalar float value contained in a vector register
+// This class does not support fastmath
+class floatInVec
+{
+private:
+    float mData;
+
+public:
+    // Default constructor; does no initialization
+    //
+    inline floatInVec( ) { };
+
+    // Construct from a value converted from bool
+    //
+    inline floatInVec(boolInVec vec);
+
+    // Explicit cast from float
+    //
+    explicit inline floatInVec(float scalar);
+
+    // Explicit cast to float
+    //
+    inline float getAsFloat() const;
+
+#ifndef _VECTORMATH_NO_SCALAR_CAST
+    // Implicit cast to float
+    //
+    inline operator float() const;
+#endif
+
+    // Post increment (add 1.0f)
+    //
+    inline const floatInVec operator ++ (int);
+
+    // Post decrement (subtract 1.0f)
+    //
+    inline const floatInVec operator -- (int);
+
+    // Pre increment (add 1.0f)
+    //
+    inline floatInVec& operator ++ ();
+
+    // Pre decrement (subtract 1.0f)
+    //
+    inline floatInVec& operator -- ();
+
+    // Negation operator
+    //
+    inline const floatInVec operator - () const;
+
+    // Assignment operator
+    //
+    inline floatInVec& operator = (floatInVec vec);
+
+    // Multiplication assignment operator
+    //
+    inline floatInVec& operator *= (floatInVec vec);
+
+    // Division assignment operator
+    //
+    inline floatInVec& operator /= (floatInVec vec);
+
+    // Addition assignment operator
+    //
+    inline floatInVec& operator += (floatInVec vec);
+
+    // Subtraction assignment operator
+    //
+    inline floatInVec& operator -= (floatInVec vec);
+
+};
+
+// Multiplication operator
+//
+inline const floatInVec operator * (floatInVec vec0, floatInVec vec1);
+
+// Division operator
+//
+inline const floatInVec operator / (floatInVec vec0, floatInVec vec1);
+
+// Addition operator
+//
+inline const floatInVec operator + (floatInVec vec0, floatInVec vec1);
+
+// Subtraction operator
+//
+inline const floatInVec operator - (floatInVec vec0, floatInVec vec1);
+
+// Less than operator
+//
+inline const boolInVec operator < (floatInVec vec0, floatInVec vec1);
+
+// Less than or equal operator
+//
+inline const boolInVec operator <= (floatInVec vec0, floatInVec vec1);
+
+// Greater than operator
+//
+inline const boolInVec operator > (floatInVec vec0, floatInVec vec1);
+
+// Greater than or equal operator
+//
+inline const boolInVec operator >= (floatInVec vec0, floatInVec vec1);
+
+// Equal operator
+//
+inline const boolInVec operator == (floatInVec vec0, floatInVec vec1);
+
+// Not equal operator
+//
+inline const boolInVec operator != (floatInVec vec0, floatInVec vec1);
+
+// Conditionally select between two values
+//
+inline const floatInVec select(floatInVec vec0, floatInVec vec1, boolInVec select_vec1);
+
+
+} // namespace Vectormath
+
+
+//--------------------------------------------------------------------------------------------------
+// floatInVec implementation
+//
+
+#include "boolInVec.h"
+
+namespace Vectormath {
+
+inline
+floatInVec::floatInVec(boolInVec vec)
+{
+    mData = float(vec.getAsBool());
+}
+
+inline
+floatInVec::floatInVec(float scalar)
+{
+    mData = scalar;
+}
+
+inline
+float
+floatInVec::getAsFloat() const
+{
+    return mData;
+}
+
+#ifndef _VECTORMATH_NO_SCALAR_CAST
+inline
+floatInVec::operator float() const
+{
+    return getAsFloat();
+}
+#endif
+
+inline
+const floatInVec
+floatInVec::operator ++ (int)
+{
+    float olddata = mData;
+    operator ++();
+    return floatInVec(olddata);
+}
+
+inline
+const floatInVec
+floatInVec::operator -- (int)
+{
+    float olddata = mData;
+    operator --();
+    return floatInVec(olddata);
+}
+
+inline
+floatInVec&
+floatInVec::operator ++ ()
+{
+    *this += floatInVec(1.0f);
+    return *this;
+}
+
+inline
+floatInVec&
+floatInVec::operator -- ()
+{
+    *this -= floatInVec(1.0f);
+    return *this;
+}
+
+inline
+const floatInVec
+floatInVec::operator - () const
+{
+    return floatInVec(-mData);
+}
+
+inline
+floatInVec&
+floatInVec::operator = (floatInVec vec)
+{
+    mData = vec.mData;
+    return *this;
+}
+
+inline
+floatInVec&
+floatInVec::operator *= (floatInVec vec)
+{
+    *this = *this * vec;
+    return *this;
+}
+
+inline
+floatInVec&
+floatInVec::operator /= (floatInVec vec)
+{
+    *this = *this / vec;
+    return *this;
+}
+
+inline
+floatInVec&
+floatInVec::operator += (floatInVec vec)
+{
+    *this = *this + vec;
+    return *this;
+}
+
+inline
+floatInVec&
+floatInVec::operator -= (floatInVec vec)
+{
+    *this = *this - vec;
+    return *this;
+}
+
+inline
+const floatInVec
+operator * (floatInVec vec0, floatInVec vec1)
+{
+    return floatInVec(vec0.getAsFloat() * vec1.getAsFloat());
+}
+
+inline
+const floatInVec
+operator / (floatInVec num, floatInVec den)
+{
+    return floatInVec(num.getAsFloat() / den.getAsFloat());
+}
+
+inline
+const floatInVec
+operator + (floatInVec vec0, floatInVec vec1)
+{
+    return floatInVec(vec0.getAsFloat() + vec1.getAsFloat());
+}
+
+inline
+const floatInVec
+operator - (floatInVec vec0, floatInVec vec1)
+{
+    return floatInVec(vec0.getAsFloat() - vec1.getAsFloat());
+}
+
+inline
+const boolInVec
+operator < (floatInVec vec0, floatInVec vec1)
+{
+    return boolInVec(vec0.getAsFloat() < vec1.getAsFloat());
+}
+
+inline
+const boolInVec
+operator <= (floatInVec vec0, floatInVec vec1)
+{
+    return !(vec0 > vec1);
+}
+
+inline
+const boolInVec
+operator > (floatInVec vec0, floatInVec vec1)
+{
+    return boolInVec(vec0.getAsFloat() > vec1.getAsFloat());
+}
+
+inline
+const boolInVec
+operator >= (floatInVec vec0, floatInVec vec1)
+{
+    return !(vec0 < vec1);
+}
+
+inline
+const boolInVec
+operator == (floatInVec vec0, floatInVec vec1)
+{
+    return boolInVec(vec0.getAsFloat() == vec1.getAsFloat());
+}
+
+inline
+const boolInVec
+operator != (floatInVec vec0, floatInVec vec1)
+{
+    return !(vec0 == vec1);
+}
+
+inline
+const floatInVec
+select(floatInVec vec0, floatInVec vec1, boolInVec select_vec1)
+{
+    return (select_vec1.getAsBool() == 0) ? vec0 : vec1;
+}
+
+} // namespace Vectormath
+
+#endif // floatInVec_h
diff --git a/test/Bullet2/vectormath/scalar/mat_aos.h b/test/Bullet2/vectormath/scalar/mat_aos.h
new file mode 100644
index 000000000..e103243d1
--- /dev/null
+++ b/test/Bullet2/vectormath/scalar/mat_aos.h
@@ -0,0 +1,1630 @@
+/*
+   Copyright (C) 2009 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+
+*/
+
+#ifndef _VECTORMATH_MAT_AOS_CPP_H
+#define _VECTORMATH_MAT_AOS_CPP_H
+
+namespace Vectormath {
+namespace Aos {
+
+//-----------------------------------------------------------------------------
+// Constants
+
+#define _VECTORMATH_PI_OVER_2 1.570796327f
+
+//-----------------------------------------------------------------------------
+// Definitions
+
+inline Matrix3::Matrix3( const Matrix3 & mat )
+{
+    mCol0 = mat.mCol0;
+    mCol1 = mat.mCol1;
+    mCol2 = mat.mCol2;
+}
+
+inline Matrix3::Matrix3( float scalar )
+{
+    mCol0 = Vector3( scalar );
+    mCol1 = Vector3( scalar );
+    mCol2 = Vector3( scalar );
+}
+
+inline Matrix3::Matrix3( const Quat & unitQuat )
+{
+    float qx, qy, qz, qw, qx2, qy2, qz2, qxqx2, qyqy2, qzqz2, qxqy2, qyqz2, qzqw2, qxqz2, qyqw2, qxqw2;
+    qx = unitQuat.getX();
+    qy = unitQuat.getY();
+    qz = unitQuat.getZ();
+    qw = unitQuat.getW();
+    qx2 = ( qx + qx );
+    qy2 = ( qy + qy );
+    qz2 = ( qz + qz );
+    qxqx2 = ( qx * qx2 );
+    qxqy2 = ( qx * qy2 );
+    qxqz2 = ( qx * qz2 );
+    qxqw2 = ( qw * qx2 );
+    qyqy2 = ( qy * qy2 );
+    qyqz2 = ( qy * qz2 );
+    qyqw2 = ( qw * qy2 );
+    qzqz2 = ( qz * qz2 );
+    qzqw2 = ( qw * qz2 );
+    mCol0 = Vector3( ( ( 1.0f - qyqy2 ) - qzqz2 ), ( qxqy2 + qzqw2 ), ( qxqz2 - qyqw2 ) );
+    mCol1 = Vector3( ( qxqy2 - qzqw2 ), ( ( 1.0f - qxqx2 ) - qzqz2 ), ( qyqz2 + qxqw2 ) );
+    mCol2 = Vector3( ( qxqz2 + qyqw2 ), ( qyqz2 - qxqw2 ), ( ( 1.0f - qxqx2 ) - qyqy2 ) );
+}
+
+inline Matrix3::Matrix3( const Vector3 & _col0, const Vector3 & _col1, const Vector3 & _col2 )
+{
+    mCol0 = _col0;
+    mCol1 = _col1;
+    mCol2 = _col2;
+}
+
+inline Matrix3 & Matrix3::setCol0( const Vector3 & _col0 )
+{
+    mCol0 = _col0;
+    return *this;
+}
+
+inline Matrix3 & Matrix3::setCol1( const Vector3 & _col1 )
+{
+    mCol1 = _col1;
+    return *this;
+}
+
+inline Matrix3 & Matrix3::setCol2( const Vector3 & _col2 )
+{
+    mCol2 = _col2;
+    return *this;
+}
+
+inline Matrix3 & Matrix3::setCol( int col, const Vector3 & vec )
+{
+    *(&mCol0 + col) = vec;
+    return *this;
+}
+
+inline Matrix3 & Matrix3::setRow( int row, const Vector3 & vec )
+{
+    mCol0.setElem( row, vec.getElem( 0 ) );
+    mCol1.setElem( row, vec.getElem( 1 ) );
+    mCol2.setElem( row, vec.getElem( 2 ) );
+    return *this;
+}
+
+inline Matrix3 & Matrix3::setElem( int col, int row, float val )
+{
+    Vector3 tmpV3_0;
+    tmpV3_0 = this->getCol( col );
+    tmpV3_0.setElem( row, val );
+    this->setCol( col, tmpV3_0 );
+    return *this;
+}
+
+inline float Matrix3::getElem( int col, int row ) const
+{
+    return this->getCol( col ).getElem( row );
+}
+
+inline const Vector3 Matrix3::getCol0( ) const
+{
+    return mCol0;
+}
+
+inline const Vector3 Matrix3::getCol1( ) const
+{
+    return mCol1;
+}
+
+inline const Vector3 Matrix3::getCol2( ) const
+{
+    return mCol2;
+}
+
+inline const Vector3 Matrix3::getCol( int col ) const
+{
+    return *(&mCol0 + col);
+}
+
+inline const Vector3 Matrix3::getRow( int row ) const
+{
+    return Vector3( mCol0.getElem( row ), mCol1.getElem( row ), mCol2.getElem( row ) );
+}
+
+inline Vector3 & Matrix3::operator []( int col )
+{
+    return *(&mCol0 + col);
+}
+
+inline const Vector3 Matrix3::operator []( int col ) const
+{
+    return *(&mCol0 + col);
+}
+
+inline Matrix3 & Matrix3::operator =( const Matrix3 & mat )
+{
+    mCol0 = mat.mCol0;
+    mCol1 = mat.mCol1;
+    mCol2 = mat.mCol2;
+    return *this;
+}
+
+inline const Matrix3 transpose( const Matrix3 & mat )
+{
+    return Matrix3(
+        Vector3( mat.getCol0().getX(), mat.getCol1().getX(), mat.getCol2().getX() ),
+        Vector3( mat.getCol0().getY(), mat.getCol1().getY(), mat.getCol2().getY() ),
+        Vector3( mat.getCol0().getZ(), mat.getCol1().getZ(), mat.getCol2().getZ() )
+    );
+}
+
+inline const Matrix3 inverse( const Matrix3 & mat )
+{
+    Vector3 tmp0, tmp1, tmp2;
+    float detinv;
+    tmp0 = cross( mat.getCol1(), mat.getCol2() );
+    tmp1 = cross( mat.getCol2(), mat.getCol0() );
+    tmp2 = cross( mat.getCol0(), mat.getCol1() );
+    detinv = ( 1.0f / dot( mat.getCol2(), tmp2 ) );
+    return Matrix3(
+        Vector3( ( tmp0.getX() * detinv ), ( tmp1.getX() * detinv ), ( tmp2.getX() * detinv ) ),
+        Vector3( ( tmp0.getY() * detinv ), ( tmp1.getY() * detinv ), ( tmp2.getY() * detinv ) ),
+        Vector3( ( tmp0.getZ() * detinv ), ( tmp1.getZ() * detinv ), ( tmp2.getZ() * detinv ) )
+    );
+}
+
+inline float determinant( const Matrix3 & mat )
+{
+    return dot( mat.getCol2(), cross( mat.getCol0(), mat.getCol1() ) );
+}
+
+inline const Matrix3 Matrix3::operator +( const Matrix3 & mat ) const
+{
+    return Matrix3(
+        ( mCol0 + mat.mCol0 ),
+        ( mCol1 + mat.mCol1 ),
+        ( mCol2 + mat.mCol2 )
+    );
+}
+
+inline const Matrix3 Matrix3::operator -( const Matrix3 & mat ) const
+{
+    return Matrix3(
+        ( mCol0 - mat.mCol0 ),
+        ( mCol1 - mat.mCol1 ),
+        ( mCol2 - mat.mCol2 )
+    );
+}
+
+inline Matrix3 & Matrix3::operator +=( const Matrix3 & mat )
+{
+    *this = *this + mat;
+    return *this;
+}
+
+inline Matrix3 & Matrix3::operator -=( const Matrix3 & mat )
+{
+    *this = *this - mat;
+    return *this;
+}
+
+inline const Matrix3 Matrix3::operator -( ) const
+{
+    return Matrix3(
+        ( -mCol0 ),
+        ( -mCol1 ),
+        ( -mCol2 )
+    );
+}
+
+inline const Matrix3 absPerElem( const Matrix3 & mat )
+{
+    return Matrix3(
+        absPerElem( mat.getCol0() ),
+        absPerElem( mat.getCol1() ),
+        absPerElem( mat.getCol2() )
+    );
+}
+
+inline const Matrix3 Matrix3::operator *( float scalar ) const
+{
+    return Matrix3(
+        ( mCol0 * scalar ),
+        ( mCol1 * scalar ),
+        ( mCol2 * scalar )
+    );
+}
+
+inline Matrix3 & Matrix3::operator *=( float scalar )
+{
+    *this = *this * scalar;
+    return *this;
+}
+
+inline const Matrix3 operator *( float scalar, const Matrix3 & mat )
+{
+    return mat * scalar;
+}
+
+inline const Vector3 Matrix3::operator *( const Vector3 & vec ) const
+{
+    return Vector3(
+        ( ( ( mCol0.getX() * vec.getX() ) + ( mCol1.getX() * vec.getY() ) ) + ( mCol2.getX() * vec.getZ() ) ),
+        ( ( ( mCol0.getY() * vec.getX() ) + ( mCol1.getY() * vec.getY() ) ) + ( mCol2.getY() * vec.getZ() ) ),
+        ( ( ( mCol0.getZ() * vec.getX() ) + ( mCol1.getZ() * vec.getY() ) ) + ( mCol2.getZ() * vec.getZ() ) )
+    );
+}
+
+inline const Matrix3 Matrix3::operator *( const Matrix3 & mat ) const
+{
+    return Matrix3(
+        ( *this * mat.mCol0 ),
+        ( *this * mat.mCol1 ),
+        ( *this * mat.mCol2 )
+    );
+}
+
+inline Matrix3 & Matrix3::operator *=( const Matrix3 & mat )
+{
+    *this = *this * mat;
+    return *this;
+}
+
+inline const Matrix3 mulPerElem( const Matrix3 & mat0, const Matrix3 & mat1 )
+{
+    return Matrix3(
+        mulPerElem( mat0.getCol0(), mat1.getCol0() ),
+        mulPerElem( mat0.getCol1(), mat1.getCol1() ),
+        mulPerElem( mat0.getCol2(), mat1.getCol2() )
+    );
+}
+
+inline const Matrix3 Matrix3::identity( )
+{
+    return Matrix3(
+        Vector3::xAxis( ),
+        Vector3::yAxis( ),
+        Vector3::zAxis( )
+    );
+}
+
+inline const Matrix3 Matrix3::rotationX( float radians )
+{
+    float s, c;
+    s = sinf( radians );
+    c = cosf( radians );
+    return Matrix3(
+        Vector3::xAxis( ),
+        Vector3( 0.0f, c, s ),
+        Vector3( 0.0f, -s, c )
+    );
+}
+
+inline const Matrix3 Matrix3::rotationY( float radians )
+{
+    float s, c;
+    s = sinf( radians );
+    c = cosf( radians );
+    return Matrix3(
+        Vector3( c, 0.0f, -s ),
+        Vector3::yAxis( ),
+        Vector3( s, 0.0f, c )
+    );
+}
+
+inline const Matrix3 Matrix3::rotationZ( float radians )
+{
+    float s, c;
+    s = sinf( radians );
+    c = cosf( radians );
+    return Matrix3(
+        Vector3( c, s, 0.0f ),
+        Vector3( -s, c, 0.0f ),
+        Vector3::zAxis( )
+    );
+}
+
+inline const Matrix3 Matrix3::rotationZYX( const Vector3 & radiansXYZ )
+{
+    float sX, cX, sY, cY, sZ, cZ, tmp0, tmp1;
+    sX = sinf( radiansXYZ.getX() );
+    cX = cosf( radiansXYZ.getX() );
+    sY = sinf( radiansXYZ.getY() );
+    cY = cosf( radiansXYZ.getY() );
+    sZ = sinf( radiansXYZ.getZ() );
+    cZ = cosf( radiansXYZ.getZ() );
+    tmp0 = ( cZ * sY );
+    tmp1 = ( sZ * sY );
+    return Matrix3(
+        Vector3( ( cZ * cY ), ( sZ * cY ), -sY ),
+        Vector3( ( ( tmp0 * sX ) - ( sZ * cX ) ), ( ( tmp1 * sX ) + ( cZ * cX ) ), ( cY * sX ) ),
+        Vector3( ( ( tmp0 * cX ) + ( sZ * sX ) ), ( ( tmp1 * cX ) - ( cZ * sX ) ), ( cY * cX ) )
+    );
+}
+
+inline const Matrix3 Matrix3::rotation( float radians, const Vector3 & unitVec )
+{
+    float x, y, z, s, c, oneMinusC, xy, yz, zx;
+    s = sinf( radians );
+    c = cosf( radians );
+    x = unitVec.getX();
+    y = unitVec.getY();
+    z = unitVec.getZ();
+    xy = ( x * y );
+    yz = ( y * z );
+    zx = ( z * x );
+    oneMinusC = ( 1.0f - c );
+    return Matrix3(
+        Vector3( ( ( ( x * x ) * oneMinusC ) + c ), ( ( xy * oneMinusC ) + ( z * s ) ), ( ( zx * oneMinusC ) - ( y * s ) ) ),
+        Vector3( ( ( xy * oneMinusC ) - ( z * s ) ), ( ( ( y * y ) * oneMinusC ) + c ), ( ( yz * oneMinusC ) + ( x * s ) ) ),
+        Vector3( ( ( zx * oneMinusC ) + ( y * s ) ), ( ( yz * oneMinusC ) - ( x * s ) ), ( ( ( z * z ) * oneMinusC ) + c ) )
+    );
+}
+
+inline const Matrix3 Matrix3::rotation( const Quat & unitQuat )
+{
+    return Matrix3( unitQuat );
+}
+
+inline const Matrix3 Matrix3::scale( const Vector3 & scaleVec )
+{
+    return Matrix3(
+        Vector3( scaleVec.getX(), 0.0f, 0.0f ),
+        Vector3( 0.0f, scaleVec.getY(), 0.0f ),
+        Vector3( 0.0f, 0.0f, scaleVec.getZ() )
+    );
+}
+
+inline const Matrix3 appendScale( const Matrix3 & mat, const Vector3 & scaleVec )
+{
+    return Matrix3(
+        ( mat.getCol0() * scaleVec.getX( ) ),
+        ( mat.getCol1() * scaleVec.getY( ) ),
+        ( mat.getCol2() * scaleVec.getZ( ) )
+    );
+}
+
+inline const Matrix3 prependScale( const Vector3 & scaleVec, const Matrix3 & mat )
+{
+    return Matrix3(
+        mulPerElem( mat.getCol0(), scaleVec ),
+        mulPerElem( mat.getCol1(), scaleVec ),
+        mulPerElem( mat.getCol2(), scaleVec )
+    );
+}
+
+inline const Matrix3 select( const Matrix3 & mat0, const Matrix3 & mat1, bool select1 )
+{
+    return Matrix3(
+        select( mat0.getCol0(), mat1.getCol0(), select1 ),
+        select( mat0.getCol1(), mat1.getCol1(), select1 ),
+        select( mat0.getCol2(), mat1.getCol2(), select1 )
+    );
+}
+
+#ifdef _VECTORMATH_DEBUG
+
+inline void print( const Matrix3 & mat )
+{
+    print( mat.getRow( 0 ) );
+    print( mat.getRow( 1 ) );
+    print( mat.getRow( 2 ) );
+}
+
+inline void print( const Matrix3 & mat, const char * name )
+{
+    printf("%s:\n", name);
+    print( mat );
+}
+
+#endif
+
+inline Matrix4::Matrix4( const Matrix4 & mat )
+{
+    mCol0 = mat.mCol0;
+    mCol1 = mat.mCol1;
+    mCol2 = mat.mCol2;
+    mCol3 = mat.mCol3;
+}
+
+inline Matrix4::Matrix4( float scalar )
+{
+    mCol0 = Vector4( scalar );
+    mCol1 = Vector4( scalar );
+    mCol2 = Vector4( scalar );
+    mCol3 = Vector4( scalar );
+}
+
+inline Matrix4::Matrix4( const Transform3 & mat )
+{
+    mCol0 = Vector4( mat.getCol0(), 0.0f );
+    mCol1 = Vector4( mat.getCol1(), 0.0f );
+    mCol2 = Vector4( mat.getCol2(), 0.0f );
+    mCol3 = Vector4( mat.getCol3(), 1.0f );
+}
+
+inline Matrix4::Matrix4( const Vector4 & _col0, const Vector4 & _col1, const Vector4 & _col2, const Vector4 & _col3 )
+{
+    mCol0 = _col0;
+    mCol1 = _col1;
+    mCol2 = _col2;
+    mCol3 = _col3;
+}
+
+inline Matrix4::Matrix4( const Matrix3 & mat, const Vector3 & translateVec )
+{
+    mCol0 = Vector4( mat.getCol0(), 0.0f );
+    mCol1 = Vector4( mat.getCol1(), 0.0f );
+    mCol2 = Vector4( mat.getCol2(), 0.0f );
+    mCol3 = Vector4( translateVec, 1.0f );
+}
+
+inline Matrix4::Matrix4( const Quat & unitQuat, const Vector3 & translateVec )
+{
+    Matrix3 mat;
+    mat = Matrix3( unitQuat );
+    mCol0 = Vector4( mat.getCol0(), 0.0f );
+    mCol1 = Vector4( mat.getCol1(), 0.0f );
+    mCol2 = Vector4( mat.getCol2(), 0.0f );
+    mCol3 = Vector4( translateVec, 1.0f );
+}
+
+inline Matrix4 & Matrix4::setCol0( const Vector4 & _col0 )
+{
+    mCol0 = _col0;
+    return *this;
+}
+
+inline Matrix4 & Matrix4::setCol1( const Vector4 & _col1 )
+{
+    mCol1 = _col1;
+    return *this;
+}
+
+inline Matrix4 & Matrix4::setCol2( const Vector4 & _col2 )
+{
+    mCol2 = _col2;
+    return *this;
+}
+
+inline Matrix4 & Matrix4::setCol3( const Vector4 & _col3 )
+{
+    mCol3 = _col3;
+    return *this;
+}
+
+inline Matrix4 & Matrix4::setCol( int col, const Vector4 & vec )
+{
+    *(&mCol0 + col) = vec;
+    return *this;
+}
+
+inline Matrix4 & Matrix4::setRow( int row, const Vector4 & vec )
+{
+    mCol0.setElem( row, vec.getElem( 0 ) );
+    mCol1.setElem( row, vec.getElem( 1 ) );
+    mCol2.setElem( row, vec.getElem( 2 ) );
+    mCol3.setElem( row, vec.getElem( 3 ) );
+    return *this;
+}
+
+inline Matrix4 & Matrix4::setElem( int col, int row, float val )
+{
+    Vector4 tmpV3_0;
+    tmpV3_0 = this->getCol( col );
+    tmpV3_0.setElem( row, val );
+    this->setCol( col, tmpV3_0 );
+    return *this;
+}
+
+inline float Matrix4::getElem( int col, int row ) const
+{
+    return this->getCol( col ).getElem( row );
+}
+
+inline const Vector4 Matrix4::getCol0( ) const
+{
+    return mCol0;
+}
+
+inline const Vector4 Matrix4::getCol1( ) const
+{
+    return mCol1;
+}
+
+inline const Vector4 Matrix4::getCol2( ) const
+{
+    return mCol2;
+}
+
+inline const Vector4 Matrix4::getCol3( ) const
+{
+    return mCol3;
+}
+
+inline const Vector4 Matrix4::getCol( int col ) const
+{
+    return *(&mCol0 + col);
+}
+
+inline const Vector4 Matrix4::getRow( int row ) const
+{
+    return Vector4( mCol0.getElem( row ), mCol1.getElem( row ), mCol2.getElem( row ), mCol3.getElem( row ) );
+}
+
+inline Vector4 & Matrix4::operator []( int col )
+{
+    return *(&mCol0 + col);
+}
+
+inline const Vector4 Matrix4::operator []( int col ) const
+{
+    return *(&mCol0 + col);
+}
+
+inline Matrix4 & Matrix4::operator =( const Matrix4 & mat )
+{
+    mCol0 = mat.mCol0;
+    mCol1 = mat.mCol1;
+    mCol2 = mat.mCol2;
+    mCol3 = mat.mCol3;
+    return *this;
+}
+
+inline const Matrix4 transpose( const Matrix4 & mat )
+{
+    return Matrix4(
+        Vector4( mat.getCol0().getX(), mat.getCol1().getX(), mat.getCol2().getX(), mat.getCol3().getX() ),
+        Vector4( mat.getCol0().getY(), mat.getCol1().getY(), mat.getCol2().getY(), mat.getCol3().getY() ),
+        Vector4( mat.getCol0().getZ(), mat.getCol1().getZ(), mat.getCol2().getZ(), mat.getCol3().getZ() ),
+        Vector4( mat.getCol0().getW(), mat.getCol1().getW(), mat.getCol2().getW(), mat.getCol3().getW() )
+    );
+}
+
+inline const Matrix4 inverse( const Matrix4 & mat )
+{
+    Vector4 res0, res1, res2, res3;
+    float mA, mB, mC, mD, mE, mF, mG, mH, mI, mJ, mK, mL, mM, mN, mO, mP, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, detInv;
+    mA = mat.getCol0().getX();
+    mB = mat.getCol0().getY();
+    mC = mat.getCol0().getZ();
+    mD = mat.getCol0().getW();
+    mE = mat.getCol1().getX();
+    mF = mat.getCol1().getY();
+    mG = mat.getCol1().getZ();
+    mH = mat.getCol1().getW();
+    mI = mat.getCol2().getX();
+    mJ = mat.getCol2().getY();
+    mK = mat.getCol2().getZ();
+    mL = mat.getCol2().getW();
+    mM = mat.getCol3().getX();
+    mN = mat.getCol3().getY();
+    mO = mat.getCol3().getZ();
+    mP = mat.getCol3().getW();
+    tmp0 = ( ( mK * mD ) - ( mC * mL ) );
+    tmp1 = ( ( mO * mH ) - ( mG * mP ) );
+    tmp2 = ( ( mB * mK ) - ( mJ * mC ) );
+    tmp3 = ( ( mF * mO ) - ( mN * mG ) );
+    tmp4 = ( ( mJ * mD ) - ( mB * mL ) );
+    tmp5 = ( ( mN * mH ) - ( mF * mP ) );
+    res0.setX( ( ( ( mJ * tmp1 ) - ( mL * tmp3 ) ) - ( mK * tmp5 ) ) );
+    res0.setY( ( ( ( mN * tmp0 ) - ( mP * tmp2 ) ) - ( mO * tmp4 ) ) );
+    res0.setZ( ( ( ( mD * tmp3 ) + ( mC * tmp5 ) ) - ( mB * tmp1 ) ) );
+    res0.setW( ( ( ( mH * tmp2 ) + ( mG * tmp4 ) ) - ( mF * tmp0 ) ) );
+    detInv = ( 1.0f / ( ( ( ( mA * res0.getX() ) + ( mE * res0.getY() ) ) + ( mI * res0.getZ() ) ) + ( mM * res0.getW() ) ) );
+    res1.setX( ( mI * tmp1 ) );
+    res1.setY( ( mM * tmp0 ) );
+    res1.setZ( ( mA * tmp1 ) );
+    res1.setW( ( mE * tmp0 ) );
+    res3.setX( ( mI * tmp3 ) );
+    res3.setY( ( mM * tmp2 ) );
+    res3.setZ( ( mA * tmp3 ) );
+    res3.setW( ( mE * tmp2 ) );
+    res2.setX( ( mI * tmp5 ) );
+    res2.setY( ( mM * tmp4 ) );
+    res2.setZ( ( mA * tmp5 ) );
+    res2.setW( ( mE * tmp4 ) );
+    tmp0 = ( ( mI * mB ) - ( mA * mJ ) );
+    tmp1 = ( ( mM * mF ) - ( mE * mN ) );
+    tmp2 = ( ( mI * mD ) - ( mA * mL ) );
+    tmp3 = ( ( mM * mH ) - ( mE * mP ) );
+    tmp4 = ( ( mI * mC ) - ( mA * mK ) );
+    tmp5 = ( ( mM * mG ) - ( mE * mO ) );
+    res2.setX( ( ( ( mL * tmp1 ) - ( mJ * tmp3 ) ) + res2.getX() ) );
+    res2.setY( ( ( ( mP * tmp0 ) - ( mN * tmp2 ) ) + res2.getY() ) );
+    res2.setZ( ( ( ( mB * tmp3 ) - ( mD * tmp1 ) ) - res2.getZ() ) );
+    res2.setW( ( ( ( mF * tmp2 ) - ( mH * tmp0 ) ) - res2.getW() ) );
+    res3.setX( ( ( ( mJ * tmp5 ) - ( mK * tmp1 ) ) + res3.getX() ) );
+    res3.setY( ( ( ( mN * tmp4 ) - ( mO * tmp0 ) ) + res3.getY() ) );
+    res3.setZ( ( ( ( mC * tmp1 ) - ( mB * tmp5 ) ) - res3.getZ() ) );
+    res3.setW( ( ( ( mG * tmp0 ) - ( mF * tmp4 ) ) - res3.getW() ) );
+    res1.setX( ( ( ( mK * tmp3 ) - ( mL * tmp5 ) ) - res1.getX() ) );
+    res1.setY( ( ( ( mO * tmp2 ) - ( mP * tmp4 ) ) - res1.getY() ) );
+    res1.setZ( ( ( ( mD * tmp5 ) - ( mC * tmp3 ) ) + res1.getZ() ) );
+    res1.setW( ( ( ( mH * tmp4 ) - ( mG * tmp2 ) ) + res1.getW() ) );
+    return Matrix4(
+        ( res0 * detInv ),
+        ( res1 * detInv ),
+        ( res2 * detInv ),
+        ( res3 * detInv )
+    );
+}
+
+inline const Matrix4 affineInverse( const Matrix4 & mat )
+{
+    Transform3 affineMat;
+    affineMat.setCol0( mat.getCol0().getXYZ( ) );
+    affineMat.setCol1( mat.getCol1().getXYZ( ) );
+    affineMat.setCol2( mat.getCol2().getXYZ( ) );
+    affineMat.setCol3( mat.getCol3().getXYZ( ) );
+    return Matrix4( inverse( affineMat ) );
+}
+
+inline const Matrix4 orthoInverse( const Matrix4 & mat )
+{
+    Transform3 affineMat;
+    affineMat.setCol0( mat.getCol0().getXYZ( ) );
+    affineMat.setCol1( mat.getCol1().getXYZ( ) );
+    affineMat.setCol2( mat.getCol2().getXYZ( ) );
+    affineMat.setCol3( mat.getCol3().getXYZ( ) );
+    return Matrix4( orthoInverse( affineMat ) );
+}
+
+inline float determinant( const Matrix4 & mat )
+{
+    float dx, dy, dz, dw, mA, mB, mC, mD, mE, mF, mG, mH, mI, mJ, mK, mL, mM, mN, mO, mP, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+    mA = mat.getCol0().getX();
+    mB = mat.getCol0().getY();
+    mC = mat.getCol0().getZ();
+    mD = mat.getCol0().getW();
+    mE = mat.getCol1().getX();
+    mF = mat.getCol1().getY();
+    mG = mat.getCol1().getZ();
+    mH = mat.getCol1().getW();
+    mI = mat.getCol2().getX();
+    mJ = mat.getCol2().getY();
+    mK = mat.getCol2().getZ();
+    mL = mat.getCol2().getW();
+    mM = mat.getCol3().getX();
+    mN = mat.getCol3().getY();
+    mO = mat.getCol3().getZ();
+    mP = mat.getCol3().getW();
+    tmp0 = ( ( mK * mD ) - ( mC * mL ) );
+    tmp1 = ( ( mO * mH ) - ( mG * mP ) );
+    tmp2 = ( ( mB * mK ) - ( mJ * mC ) );
+    tmp3 = ( ( mF * mO ) - ( mN * mG ) );
+    tmp4 = ( ( mJ * mD ) - ( mB * mL ) );
+    tmp5 = ( ( mN * mH ) - ( mF * mP ) );
+    dx = ( ( ( mJ * tmp1 ) - ( mL * tmp3 ) ) - ( mK * tmp5 ) );
+    dy = ( ( ( mN * tmp0 ) - ( mP * tmp2 ) ) - ( mO * tmp4 ) );
+    dz = ( ( ( mD * tmp3 ) + ( mC * tmp5 ) ) - ( mB * tmp1 ) );
+    dw = ( ( ( mH * tmp2 ) + ( mG * tmp4 ) ) - ( mF * tmp0 ) );
+    return ( ( ( ( mA * dx ) + ( mE * dy ) ) + ( mI * dz ) ) + ( mM * dw ) );
+}
+
+inline const Matrix4 Matrix4::operator +( const Matrix4 & mat ) const
+{
+    return Matrix4(
+        ( mCol0 + mat.mCol0 ),
+        ( mCol1 + mat.mCol1 ),
+        ( mCol2 + mat.mCol2 ),
+        ( mCol3 + mat.mCol3 )
+    );
+}
+
+inline const Matrix4 Matrix4::operator -( const Matrix4 & mat ) const
+{
+    return Matrix4(
+        ( mCol0 - mat.mCol0 ),
+        ( mCol1 - mat.mCol1 ),
+        ( mCol2 - mat.mCol2 ),
+        ( mCol3 - mat.mCol3 )
+    );
+}
+
+inline Matrix4 & Matrix4::operator +=( const Matrix4 & mat )
+{
+    *this = *this + mat;
+    return *this;
+}
+
+inline Matrix4 & Matrix4::operator -=( const Matrix4 & mat )
+{
+    *this = *this - mat;
+    return *this;
+}
+
+inline const Matrix4 Matrix4::operator -( ) const
+{
+    return Matrix4(
+        ( -mCol0 ),
+        ( -mCol1 ),
+        ( -mCol2 ),
+        ( -mCol3 )
+    );
+}
+
+inline const Matrix4 absPerElem( const Matrix4 & mat )
+{
+    return Matrix4(
+        absPerElem( mat.getCol0() ),
+        absPerElem( mat.getCol1() ),
+        absPerElem( mat.getCol2() ),
+        absPerElem( mat.getCol3() )
+    );
+}
+
+inline const Matrix4 Matrix4::operator *( float scalar ) const
+{
+    return Matrix4(
+        ( mCol0 * scalar ),
+        ( mCol1 * scalar ),
+        ( mCol2 * scalar ),
+        ( mCol3 * scalar )
+    );
+}
+
+inline Matrix4 & Matrix4::operator *=( float scalar )
+{
+    *this = *this * scalar;
+    return *this;
+}
+
+inline const Matrix4 operator *( float scalar, const Matrix4 & mat )
+{
+    return mat * scalar;
+}
+
+inline const Vector4 Matrix4::operator *( const Vector4 & vec ) const
+{
+    return Vector4(
+        ( ( ( ( mCol0.getX() * vec.getX() ) + ( mCol1.getX() * vec.getY() ) ) + ( mCol2.getX() * vec.getZ() ) ) + ( mCol3.getX() * vec.getW() ) ),
+        ( ( ( ( mCol0.getY() * vec.getX() ) + ( mCol1.getY() * vec.getY() ) ) + ( mCol2.getY() * vec.getZ() ) ) + ( mCol3.getY() * vec.getW() ) ),
+        ( ( ( ( mCol0.getZ() * vec.getX() ) + ( mCol1.getZ() * vec.getY() ) ) + ( mCol2.getZ() * vec.getZ() ) ) + ( mCol3.getZ() * vec.getW() ) ),
+        ( ( ( ( mCol0.getW() * vec.getX() ) + ( mCol1.getW() * vec.getY() ) ) + ( mCol2.getW() * vec.getZ() ) ) + ( mCol3.getW() * vec.getW() ) )
+    );
+}
+
+inline const Vector4 Matrix4::operator *( const Vector3 & vec ) const
+{
+    return Vector4(
+        ( ( ( mCol0.getX() * vec.getX() ) + ( mCol1.getX() * vec.getY() ) ) + ( mCol2.getX() * vec.getZ() ) ),
+        ( ( ( mCol0.getY() * vec.getX() ) + ( mCol1.getY() * vec.getY() ) ) + ( mCol2.getY() * vec.getZ() ) ),
+        ( ( ( mCol0.getZ() * vec.getX() ) + ( mCol1.getZ() * vec.getY() ) ) + ( mCol2.getZ() * vec.getZ() ) ),
+        ( ( ( mCol0.getW() * vec.getX() ) + ( mCol1.getW() * vec.getY() ) ) + ( mCol2.getW() * vec.getZ() ) )
+    );
+}
+
+inline const Vector4 Matrix4::operator *( const Point3 & pnt ) const
+{
+    return Vector4(
+        ( ( ( ( mCol0.getX() * pnt.getX() ) + ( mCol1.getX() * pnt.getY() ) ) + ( mCol2.getX() * pnt.getZ() ) ) + mCol3.getX() ),
+        ( ( ( ( mCol0.getY() * pnt.getX() ) + ( mCol1.getY() * pnt.getY() ) ) + ( mCol2.getY() * pnt.getZ() ) ) + mCol3.getY() ),
+        ( ( ( ( mCol0.getZ() * pnt.getX() ) + ( mCol1.getZ() * pnt.getY() ) ) + ( mCol2.getZ() * pnt.getZ() ) ) + mCol3.getZ() ),
+        ( ( ( ( mCol0.getW() * pnt.getX() ) + ( mCol1.getW() * pnt.getY() ) ) + ( mCol2.getW() * pnt.getZ() ) ) + mCol3.getW() )
+    );
+}
+
+inline const Matrix4 Matrix4::operator *( const Matrix4 & mat ) const
+{
+    return Matrix4(
+        ( *this * mat.mCol0 ),
+        ( *this * mat.mCol1 ),
+        ( *this * mat.mCol2 ),
+        ( *this * mat.mCol3 )
+    );
+}
+
+inline Matrix4 & Matrix4::operator *=( const Matrix4 & mat )
+{
+    *this = *this * mat;
+    return *this;
+}
+
+inline const Matrix4 Matrix4::operator *( const Transform3 & tfrm ) const
+{
+    return Matrix4(
+        ( *this * tfrm.getCol0() ),
+        ( *this * tfrm.getCol1() ),
+        ( *this * tfrm.getCol2() ),
+        ( *this * Point3( tfrm.getCol3() ) )
+    );
+}
+
+inline Matrix4 & Matrix4::operator *=( const Transform3 & tfrm )
+{
+    *this = *this * tfrm;
+    return *this;
+}
+
+inline const Matrix4 mulPerElem( const Matrix4 & mat0, const Matrix4 & mat1 )
+{
+    return Matrix4(
+        mulPerElem( mat0.getCol0(), mat1.getCol0() ),
+        mulPerElem( mat0.getCol1(), mat1.getCol1() ),
+        mulPerElem( mat0.getCol2(), mat1.getCol2() ),
+        mulPerElem( mat0.getCol3(), mat1.getCol3() )
+    );
+}
+
+inline const Matrix4 Matrix4::identity( )
+{
+    return Matrix4(
+        Vector4::xAxis( ),
+        Vector4::yAxis( ),
+        Vector4::zAxis( ),
+        Vector4::wAxis( )
+    );
+}
+
+inline Matrix4 & Matrix4::setUpper3x3( const Matrix3 & mat3 )
+{
+    mCol0.setXYZ( mat3.getCol0() );
+    mCol1.setXYZ( mat3.getCol1() );
+    mCol2.setXYZ( mat3.getCol2() );
+    return *this;
+}
+
+inline const Matrix3 Matrix4::getUpper3x3( ) const
+{
+    return Matrix3(
+        mCol0.getXYZ( ),
+        mCol1.getXYZ( ),
+        mCol2.getXYZ( )
+    );
+}
+
+inline Matrix4 & Matrix4::setTranslation( const Vector3 & translateVec )
+{
+    mCol3.setXYZ( translateVec );
+    return *this;
+}
+
+inline const Vector3 Matrix4::getTranslation( ) const
+{
+    return mCol3.getXYZ( );
+}
+
+inline const Matrix4 Matrix4::rotationX( float radians )
+{
+    float s, c;
+    s = sinf( radians );
+    c = cosf( radians );
+    return Matrix4(
+        Vector4::xAxis( ),
+        Vector4( 0.0f, c, s, 0.0f ),
+        Vector4( 0.0f, -s, c, 0.0f ),
+        Vector4::wAxis( )
+    );
+}
+
+inline const Matrix4 Matrix4::rotationY( float radians )
+{
+    float s, c;
+    s = sinf( radians );
+    c = cosf( radians );
+    return Matrix4(
+        Vector4( c, 0.0f, -s, 0.0f ),
+        Vector4::yAxis( ),
+        Vector4( s, 0.0f, c, 0.0f ),
+        Vector4::wAxis( )
+    );
+}
+
+inline const Matrix4 Matrix4::rotationZ( float radians )
+{
+    float s, c;
+    s = sinf( radians );
+    c = cosf( radians );
+    return Matrix4(
+        Vector4( c, s, 0.0f, 0.0f ),
+        Vector4( -s, c, 0.0f, 0.0f ),
+        Vector4::zAxis( ),
+        Vector4::wAxis( )
+    );
+}
+
+inline const Matrix4 Matrix4::rotationZYX( const Vector3 & radiansXYZ )
+{
+    float sX, cX, sY, cY, sZ, cZ, tmp0, tmp1;
+    sX = sinf( radiansXYZ.getX() );
+    cX = cosf( radiansXYZ.getX() );
+    sY = sinf( radiansXYZ.getY() );
+    cY = cosf( radiansXYZ.getY() );
+    sZ = sinf( radiansXYZ.getZ() );
+    cZ = cosf( radiansXYZ.getZ() );
+    tmp0 = ( cZ * sY );
+    tmp1 = ( sZ * sY );
+    return Matrix4(
+        Vector4( ( cZ * cY ), ( sZ * cY ), -sY, 0.0f ),
+        Vector4( ( ( tmp0 * sX ) - ( sZ * cX ) ), ( ( tmp1 * sX ) + ( cZ * cX ) ), ( cY * sX ), 0.0f ),
+        Vector4( ( ( tmp0 * cX ) + ( sZ * sX ) ), ( ( tmp1 * cX ) - ( cZ * sX ) ), ( cY * cX ), 0.0f ),
+        Vector4::wAxis( )
+    );
+}
+
+inline const Matrix4 Matrix4::rotation( float radians, const Vector3 & unitVec )
+{
+    float x, y, z, s, c, oneMinusC, xy, yz, zx;
+    s = sinf( radians );
+    c = cosf( radians );
+    x = unitVec.getX();
+    y = unitVec.getY();
+    z = unitVec.getZ();
+    xy = ( x * y );
+    yz = ( y * z );
+    zx = ( z * x );
+    oneMinusC = ( 1.0f - c );
+    return Matrix4(
+        Vector4( ( ( ( x * x ) * oneMinusC ) + c ), ( ( xy * oneMinusC ) + ( z * s ) ), ( ( zx * oneMinusC ) - ( y * s ) ), 0.0f ),
+        Vector4( ( ( xy * oneMinusC ) - ( z * s ) ), ( ( ( y * y ) * oneMinusC ) + c ), ( ( yz * oneMinusC ) + ( x * s ) ), 0.0f ),
+        Vector4( ( ( zx * oneMinusC ) + ( y * s ) ), ( ( yz * oneMinusC ) - ( x * s ) ), ( ( ( z * z ) * oneMinusC ) + c ), 0.0f ),
+        Vector4::wAxis( )
+    );
+}
+
+inline const Matrix4 Matrix4::rotation( const Quat & unitQuat )
+{
+    return Matrix4( Transform3::rotation( unitQuat ) );
+}
+
+inline const Matrix4 Matrix4::scale( const Vector3 & scaleVec )
+{
+    return Matrix4(
+        Vector4( scaleVec.getX(), 0.0f, 0.0f, 0.0f ),
+        Vector4( 0.0f, scaleVec.getY(), 0.0f, 0.0f ),
+        Vector4( 0.0f, 0.0f, scaleVec.getZ(), 0.0f ),
+        Vector4::wAxis( )
+    );
+}
+
+inline const Matrix4 appendScale( const Matrix4 & mat, const Vector3 & scaleVec )
+{
+    return Matrix4(
+        ( mat.getCol0() * scaleVec.getX( ) ),
+        ( mat.getCol1() * scaleVec.getY( ) ),
+        ( mat.getCol2() * scaleVec.getZ( ) ),
+        mat.getCol3()
+    );
+}
+
+inline const Matrix4 prependScale( const Vector3 & scaleVec, const Matrix4 & mat )
+{
+    Vector4 scale4;
+    scale4 = Vector4( scaleVec, 1.0f );
+    return Matrix4(
+        mulPerElem( mat.getCol0(), scale4 ),
+        mulPerElem( mat.getCol1(), scale4 ),
+        mulPerElem( mat.getCol2(), scale4 ),
+        mulPerElem( mat.getCol3(), scale4 )
+    );
+}
+
+inline const Matrix4 Matrix4::translation( const Vector3 & translateVec )
+{
+    return Matrix4(
+        Vector4::xAxis( ),
+        Vector4::yAxis( ),
+        Vector4::zAxis( ),
+        Vector4( translateVec, 1.0f )
+    );
+}
+
+inline const Matrix4 Matrix4::lookAt( const Point3 & eyePos, const Point3 & lookAtPos, const Vector3 & upVec )
+{
+    Matrix4 m4EyeFrame;
+    Vector3 v3X, v3Y, v3Z;
+    v3Y = normalize( upVec );
+    v3Z = normalize( ( eyePos - lookAtPos ) );
+    v3X = normalize( cross( v3Y, v3Z ) );
+    v3Y = cross( v3Z, v3X );
+    m4EyeFrame = Matrix4( Vector4( v3X ), Vector4( v3Y ), Vector4( v3Z ), Vector4( eyePos ) );
+    return orthoInverse( m4EyeFrame );
+}
+
+inline const Matrix4 Matrix4::perspective( float fovyRadians, float aspect, float zNear, float zFar )
+{
+    float f, rangeInv;
+    f = tanf( ( (float)( _VECTORMATH_PI_OVER_2 ) - ( 0.5f * fovyRadians ) ) );
+    rangeInv = ( 1.0f / ( zNear - zFar ) );
+    return Matrix4(
+        Vector4( ( f / aspect ), 0.0f, 0.0f, 0.0f ),
+        Vector4( 0.0f, f, 0.0f, 0.0f ),
+        Vector4( 0.0f, 0.0f, ( ( zNear + zFar ) * rangeInv ), -1.0f ),
+        Vector4( 0.0f, 0.0f, ( ( ( zNear * zFar ) * rangeInv ) * 2.0f ), 0.0f )
+    );
+}
+
+inline const Matrix4 Matrix4::frustum( float left, float right, float bottom, float top, float zNear, float zFar )
+{
+    float sum_rl, sum_tb, sum_nf, inv_rl, inv_tb, inv_nf, n2;
+    sum_rl = ( right + left );
+    sum_tb = ( top + bottom );
+    sum_nf = ( zNear + zFar );
+    inv_rl = ( 1.0f / ( right - left ) );
+    inv_tb = ( 1.0f / ( top - bottom ) );
+    inv_nf = ( 1.0f / ( zNear - zFar ) );
+    n2 = ( zNear + zNear );
+    return Matrix4(
+        Vector4( ( n2 * inv_rl ), 0.0f, 0.0f, 0.0f ),
+        Vector4( 0.0f, ( n2 * inv_tb ), 0.0f, 0.0f ),
+        Vector4( ( sum_rl * inv_rl ), ( sum_tb * inv_tb ), ( sum_nf * inv_nf ), -1.0f ),
+        Vector4( 0.0f, 0.0f, ( ( n2 * inv_nf ) * zFar ), 0.0f )
+    );
+}
+
+inline const Matrix4 Matrix4::orthographic( float left, float right, float bottom, float top, float zNear, float zFar )
+{
+    float sum_rl, sum_tb, sum_nf, inv_rl, inv_tb, inv_nf;
+    sum_rl = ( right + left );
+    sum_tb = ( top + bottom );
+    sum_nf = ( zNear + zFar );
+    inv_rl = ( 1.0f / ( right - left ) );
+    inv_tb = ( 1.0f / ( top - bottom ) );
+    inv_nf = ( 1.0f / ( zNear - zFar ) );
+    return Matrix4(
+        Vector4( ( inv_rl + inv_rl ), 0.0f, 0.0f, 0.0f ),
+        Vector4( 0.0f, ( inv_tb + inv_tb ), 0.0f, 0.0f ),
+        Vector4( 0.0f, 0.0f, ( inv_nf + inv_nf ), 0.0f ),
+        Vector4( ( -sum_rl * inv_rl ), ( -sum_tb * inv_tb ), ( sum_nf * inv_nf ), 1.0f )
+    );
+}
+
+inline const Matrix4 select( const Matrix4 & mat0, const Matrix4 & mat1, bool select1 )
+{
+    return Matrix4(
+        select( mat0.getCol0(), mat1.getCol0(), select1 ),
+        select( mat0.getCol1(), mat1.getCol1(), select1 ),
+        select( mat0.getCol2(), mat1.getCol2(), select1 ),
+        select( mat0.getCol3(), mat1.getCol3(), select1 )
+    );
+}
+
+#ifdef _VECTORMATH_DEBUG
+
+inline void print( const Matrix4 & mat )
+{
+    print( mat.getRow( 0 ) );
+    print( mat.getRow( 1 ) );
+    print( mat.getRow( 2 ) );
+    print( mat.getRow( 3 ) );
+}
+
+inline void print( const Matrix4 & mat, const char * name )
+{
+    printf("%s:\n", name);
+    print( mat );
+}
+
+#endif
+
+inline Transform3::Transform3( const Transform3 & tfrm )
+{
+    mCol0 = tfrm.mCol0;
+    mCol1 = tfrm.mCol1;
+    mCol2 = tfrm.mCol2;
+    mCol3 = tfrm.mCol3;
+}
+
+inline Transform3::Transform3( float scalar )
+{
+    mCol0 = Vector3( scalar );
+    mCol1 = Vector3( scalar );
+    mCol2 = Vector3( scalar );
+    mCol3 = Vector3( scalar );
+}
+
+inline Transform3::Transform3( const Vector3 & _col0, const Vector3 & _col1, const Vector3 & _col2, const Vector3 & _col3 )
+{
+    mCol0 = _col0;
+    mCol1 = _col1;
+    mCol2 = _col2;
+    mCol3 = _col3;
+}
+
+inline Transform3::Transform3( const Matrix3 & tfrm, const Vector3 & translateVec )
+{
+    this->setUpper3x3( tfrm );
+    this->setTranslation( translateVec );
+}
+
+inline Transform3::Transform3( const Quat & unitQuat, const Vector3 & translateVec )
+{
+    this->setUpper3x3( Matrix3( unitQuat ) );
+    this->setTranslation( translateVec );
+}
+
+inline Transform3 & Transform3::setCol0( const Vector3 & _col0 )
+{
+    mCol0 = _col0;
+    return *this;
+}
+
+inline Transform3 & Transform3::setCol1( const Vector3 & _col1 )
+{
+    mCol1 = _col1;
+    return *this;
+}
+
+inline Transform3 & Transform3::setCol2( const Vector3 & _col2 )
+{
+    mCol2 = _col2;
+    return *this;
+}
+
+inline Transform3 & Transform3::setCol3( const Vector3 & _col3 )
+{
+    mCol3 = _col3;
+    return *this;
+}
+
+inline Transform3 & Transform3::setCol( int col, const Vector3 & vec )
+{
+    *(&mCol0 + col) = vec;
+    return *this;
+}
+
+inline Transform3 & Transform3::setRow( int row, const Vector4 & vec )
+{
+    mCol0.setElem( row, vec.getElem( 0 ) );
+    mCol1.setElem( row, vec.getElem( 1 ) );
+    mCol2.setElem( row, vec.getElem( 2 ) );
+    mCol3.setElem( row, vec.getElem( 3 ) );
+    return *this;
+}
+
+inline Transform3 & Transform3::setElem( int col, int row, float val )
+{
+    Vector3 tmpV3_0;
+    tmpV3_0 = this->getCol( col );
+    tmpV3_0.setElem( row, val );
+    this->setCol( col, tmpV3_0 );
+    return *this;
+}
+
+inline float Transform3::getElem( int col, int row ) const
+{
+    return this->getCol( col ).getElem( row );
+}
+
+inline const Vector3 Transform3::getCol0( ) const
+{
+    return mCol0;
+}
+
+inline const Vector3 Transform3::getCol1( ) const
+{
+    return mCol1;
+}
+
+inline const Vector3 Transform3::getCol2( ) const
+{
+    return mCol2;
+}
+
+inline const Vector3 Transform3::getCol3( ) const
+{
+    return mCol3;
+}
+
+inline const Vector3 Transform3::getCol( int col ) const
+{
+    return *(&mCol0 + col);
+}
+
+inline const Vector4 Transform3::getRow( int row ) const
+{
+    return Vector4( mCol0.getElem( row ), mCol1.getElem( row ), mCol2.getElem( row ), mCol3.getElem( row ) );
+}
+
+inline Vector3 & Transform3::operator []( int col )
+{
+    return *(&mCol0 + col);
+}
+
+inline const Vector3 Transform3::operator []( int col ) const
+{
+    return *(&mCol0 + col);
+}
+
+inline Transform3 & Transform3::operator =( const Transform3 & tfrm )
+{
+    mCol0 = tfrm.mCol0;
+    mCol1 = tfrm.mCol1;
+    mCol2 = tfrm.mCol2;
+    mCol3 = tfrm.mCol3;
+    return *this;
+}
+
+inline const Transform3 inverse( const Transform3 & tfrm )
+{
+    Vector3 tmp0, tmp1, tmp2, inv0, inv1, inv2;
+    float detinv;
+    tmp0 = cross( tfrm.getCol1(), tfrm.getCol2() );
+    tmp1 = cross( tfrm.getCol2(), tfrm.getCol0() );
+    tmp2 = cross( tfrm.getCol0(), tfrm.getCol1() );
+    detinv = ( 1.0f / dot( tfrm.getCol2(), tmp2 ) );
+    inv0 = Vector3( ( tmp0.getX() * detinv ), ( tmp1.getX() * detinv ), ( tmp2.getX() * detinv ) );
+    inv1 = Vector3( ( tmp0.getY() * detinv ), ( tmp1.getY() * detinv ), ( tmp2.getY() * detinv ) );
+    inv2 = Vector3( ( tmp0.getZ() * detinv ), ( tmp1.getZ() * detinv ), ( tmp2.getZ() * detinv ) );
+    return Transform3(
+        inv0,
+        inv1,
+        inv2,
+        Vector3( ( -( ( inv0 * tfrm.getCol3().getX() ) + ( ( inv1 * tfrm.getCol3().getY() ) + ( inv2 * tfrm.getCol3().getZ() ) ) ) ) )
+    );
+}
+
+inline const Transform3 orthoInverse( const Transform3 & tfrm )
+{
+    Vector3 inv0, inv1, inv2;
+    inv0 = Vector3( tfrm.getCol0().getX(), tfrm.getCol1().getX(), tfrm.getCol2().getX() );
+    inv1 = Vector3( tfrm.getCol0().getY(), tfrm.getCol1().getY(), tfrm.getCol2().getY() );
+    inv2 = Vector3( tfrm.getCol0().getZ(), tfrm.getCol1().getZ(), tfrm.getCol2().getZ() );
+    return Transform3(
+        inv0,
+        inv1,
+        inv2,
+        Vector3( ( -( ( inv0 * tfrm.getCol3().getX() ) + ( ( inv1 * tfrm.getCol3().getY() ) + ( inv2 * tfrm.getCol3().getZ() ) ) ) ) )
+    );
+}
+
+inline const Transform3 absPerElem( const Transform3 & tfrm )
+{
+    return Transform3(
+        absPerElem( tfrm.getCol0() ),
+        absPerElem( tfrm.getCol1() ),
+        absPerElem( tfrm.getCol2() ),
+        absPerElem( tfrm.getCol3() )
+    );
+}
+
+inline const Vector3 Transform3::operator *( const Vector3 & vec ) const
+{
+    return Vector3(
+        ( ( ( mCol0.getX() * vec.getX() ) + ( mCol1.getX() * vec.getY() ) ) + ( mCol2.getX() * vec.getZ() ) ),
+        ( ( ( mCol0.getY() * vec.getX() ) + ( mCol1.getY() * vec.getY() ) ) + ( mCol2.getY() * vec.getZ() ) ),
+        ( ( ( mCol0.getZ() * vec.getX() ) + ( mCol1.getZ() * vec.getY() ) ) + ( mCol2.getZ() * vec.getZ() ) )
+    );
+}
+
+inline const Point3 Transform3::operator *( const Point3 & pnt ) const
+{
+    return Point3(
+        ( ( ( ( mCol0.getX() * pnt.getX() ) + ( mCol1.getX() * pnt.getY() ) ) + ( mCol2.getX() * pnt.getZ() ) ) + mCol3.getX() ),
+        ( ( ( ( mCol0.getY() * pnt.getX() ) + ( mCol1.getY() * pnt.getY() ) ) + ( mCol2.getY() * pnt.getZ() ) ) + mCol3.getY() ),
+        ( ( ( ( mCol0.getZ() * pnt.getX() ) + ( mCol1.getZ() * pnt.getY() ) ) + ( mCol2.getZ() * pnt.getZ() ) ) + mCol3.getZ() )
+    );
+}
+
+inline const Transform3 Transform3::operator *( const Transform3 & tfrm ) const
+{
+    return Transform3(
+        ( *this * tfrm.mCol0 ),
+        ( *this * tfrm.mCol1 ),
+        ( *this * tfrm.mCol2 ),
+        Vector3( ( *this * Point3( tfrm.mCol3 ) ) )
+    );
+}
+
+inline Transform3 & Transform3::operator *=( const Transform3 & tfrm )
+{
+    *this = *this * tfrm;
+    return *this;
+}
+
+inline const Transform3 mulPerElem( const Transform3 & tfrm0, const Transform3 & tfrm1 )
+{
+    return Transform3(
+        mulPerElem( tfrm0.getCol0(), tfrm1.getCol0() ),
+        mulPerElem( tfrm0.getCol1(), tfrm1.getCol1() ),
+        mulPerElem( tfrm0.getCol2(), tfrm1.getCol2() ),
+        mulPerElem( tfrm0.getCol3(), tfrm1.getCol3() )
+    );
+}
+
+inline const Transform3 Transform3::identity( )
+{
+    return Transform3(
+        Vector3::xAxis( ),
+        Vector3::yAxis( ),
+        Vector3::zAxis( ),
+        Vector3( 0.0f )
+    );
+}
+
+inline Transform3 & Transform3::setUpper3x3( const Matrix3 & tfrm )
+{
+    mCol0 = tfrm.getCol0();
+    mCol1 = tfrm.getCol1();
+    mCol2 = tfrm.getCol2();
+    return *this;
+}
+
+inline const Matrix3 Transform3::getUpper3x3( ) const
+{
+    return Matrix3( mCol0, mCol1, mCol2 );
+}
+
+inline Transform3 & Transform3::setTranslation( const Vector3 & translateVec )
+{
+    mCol3 = translateVec;
+    return *this;
+}
+
+inline const Vector3 Transform3::getTranslation( ) const
+{
+    return mCol3;
+}
+
+inline const Transform3 Transform3::rotationX( float radians )
+{
+    float s, c;
+    s = sinf( radians );
+    c = cosf( radians );
+    return Transform3(
+        Vector3::xAxis( ),
+        Vector3( 0.0f, c, s ),
+        Vector3( 0.0f, -s, c ),
+        Vector3( 0.0f )
+    );
+}
+
+inline const Transform3 Transform3::rotationY( float radians )
+{
+    float s, c;
+    s = sinf( radians );
+    c = cosf( radians );
+    return Transform3(
+        Vector3( c, 0.0f, -s ),
+        Vector3::yAxis( ),
+        Vector3( s, 0.0f, c ),
+        Vector3( 0.0f )
+    );
+}
+
+inline const Transform3 Transform3::rotationZ( float radians )
+{
+    float s, c;
+    s = sinf( radians );
+    c = cosf( radians );
+    return Transform3(
+        Vector3( c, s, 0.0f ),
+        Vector3( -s, c, 0.0f ),
+        Vector3::zAxis( ),
+        Vector3( 0.0f )
+    );
+}
+
+inline const Transform3 Transform3::rotationZYX( const Vector3 & radiansXYZ )
+{
+    float sX, cX, sY, cY, sZ, cZ, tmp0, tmp1;
+    sX = sinf( radiansXYZ.getX() );
+    cX = cosf( radiansXYZ.getX() );
+    sY = sinf( radiansXYZ.getY() );
+    cY = cosf( radiansXYZ.getY() );
+    sZ = sinf( radiansXYZ.getZ() );
+    cZ = cosf( radiansXYZ.getZ() );
+    tmp0 = ( cZ * sY );
+    tmp1 = ( sZ * sY );
+    return Transform3(
+        Vector3( ( cZ * cY ), ( sZ * cY ), -sY ),
+        Vector3( ( ( tmp0 * sX ) - ( sZ * cX ) ), ( ( tmp1 * sX ) + ( cZ * cX ) ), ( cY * sX ) ),
+        Vector3( ( ( tmp0 * cX ) + ( sZ * sX ) ), ( ( tmp1 * cX ) - ( cZ * sX ) ), ( cY * cX ) ),
+        Vector3( 0.0f )
+    );
+}
+
+inline const Transform3 Transform3::rotation( float radians, const Vector3 & unitVec )
+{
+    return Transform3( Matrix3::rotation( radians, unitVec ), Vector3( 0.0f ) );
+}
+
+inline const Transform3 Transform3::rotation( const Quat & unitQuat )
+{
+    return Transform3( Matrix3( unitQuat ), Vector3( 0.0f ) );
+}
+
+inline const Transform3 Transform3::scale( const Vector3 & scaleVec )
+{
+    return Transform3(
+        Vector3( scaleVec.getX(), 0.0f, 0.0f ),
+        Vector3( 0.0f, scaleVec.getY(), 0.0f ),
+        Vector3( 0.0f, 0.0f, scaleVec.getZ() ),
+        Vector3( 0.0f )
+    );
+}
+
+inline const Transform3 appendScale( const Transform3 & tfrm, const Vector3 & scaleVec )
+{
+    return Transform3(
+        ( tfrm.getCol0() * scaleVec.getX( ) ),
+        ( tfrm.getCol1() * scaleVec.getY( ) ),
+        ( tfrm.getCol2() * scaleVec.getZ( ) ),
+        tfrm.getCol3()
+    );
+}
+
+inline const Transform3 prependScale( const Vector3 & scaleVec, const Transform3 & tfrm )
+{
+    return Transform3(
+        mulPerElem( tfrm.getCol0(), scaleVec ),
+        mulPerElem( tfrm.getCol1(), scaleVec ),
+        mulPerElem( tfrm.getCol2(), scaleVec ),
+        mulPerElem( tfrm.getCol3(), scaleVec )
+    );
+}
+
+inline const Transform3 Transform3::translation( const Vector3 & translateVec )
+{
+    return Transform3(
+        Vector3::xAxis( ),
+        Vector3::yAxis( ),
+        Vector3::zAxis( ),
+        translateVec
+    );
+}
+
+inline const Transform3 select( const Transform3 & tfrm0, const Transform3 & tfrm1, bool select1 )
+{
+    return Transform3(
+        select( tfrm0.getCol0(), tfrm1.getCol0(), select1 ),
+        select( tfrm0.getCol1(), tfrm1.getCol1(), select1 ),
+        select( tfrm0.getCol2(), tfrm1.getCol2(), select1 ),
+        select( tfrm0.getCol3(), tfrm1.getCol3(), select1 )
+    );
+}
+
+#ifdef _VECTORMATH_DEBUG
+
+inline void print( const Transform3 & tfrm )
+{
+    print( tfrm.getRow( 0 ) );
+    print( tfrm.getRow( 1 ) );
+    print( tfrm.getRow( 2 ) );
+}
+
+inline void print( const Transform3 & tfrm, const char * name )
+{
+    printf("%s:\n", name);
+    print( tfrm );
+}
+
+#endif
+
+inline Quat::Quat( const Matrix3 & tfrm )
+{
+    float trace, radicand, scale, xx, yx, zx, xy, yy, zy, xz, yz, zz, tmpx, tmpy, tmpz, tmpw, qx, qy, qz, qw;
+    int negTrace, ZgtX, ZgtY, YgtX;
+    int largestXorY, largestYorZ, largestZorX;
+
+    xx = tfrm.getCol0().getX();
+    yx = tfrm.getCol0().getY();
+    zx = tfrm.getCol0().getZ();
+    xy = tfrm.getCol1().getX();
+    yy = tfrm.getCol1().getY();
+    zy = tfrm.getCol1().getZ();
+    xz = tfrm.getCol2().getX();
+    yz = tfrm.getCol2().getY();
+    zz = tfrm.getCol2().getZ();
+
+    trace = ( ( xx + yy ) + zz );
+
+    negTrace = ( trace < 0.0f );
+    ZgtX = zz > xx;
+    ZgtY = zz > yy;
+    YgtX = yy > xx;
+    largestXorY = ( !ZgtX || !ZgtY ) && negTrace;
+    largestYorZ = ( YgtX || ZgtX ) && negTrace;
+    largestZorX = ( ZgtY || !YgtX ) && negTrace;
+    
+    if ( largestXorY )
+    {
+        zz = -zz;
+        xy = -xy;
+    }
+    if ( largestYorZ )
+    {
+        xx = -xx;
+        yz = -yz;
+    }
+    if ( largestZorX )
+    {
+        yy = -yy;
+        zx = -zx;
+    }
+
+    radicand = ( ( ( xx + yy ) + zz ) + 1.0f );
+    scale = ( 0.5f * ( 1.0f / sqrtf( radicand ) ) );
+
+    tmpx = ( ( zy - yz ) * scale );
+    tmpy = ( ( xz - zx ) * scale );
+    tmpz = ( ( yx - xy ) * scale );
+    tmpw = ( radicand * scale );
+    qx = tmpx;
+    qy = tmpy;
+    qz = tmpz;
+    qw = tmpw;
+
+    if ( largestXorY )
+    {
+        qx = tmpw;
+        qy = tmpz;
+        qz = tmpy;
+        qw = tmpx;
+    }
+    if ( largestYorZ )
+    {
+        tmpx = qx;
+        tmpz = qz;
+        qx = qy;
+        qy = tmpx;
+        qz = qw;
+        qw = tmpz;
+    }
+
+    mX = qx;
+    mY = qy;
+    mZ = qz;
+    mW = qw;
+}
+
+inline const Matrix3 outer( const Vector3 & tfrm0, const Vector3 & tfrm1 )
+{
+    return Matrix3(
+        ( tfrm0 * tfrm1.getX( ) ),
+        ( tfrm0 * tfrm1.getY( ) ),
+        ( tfrm0 * tfrm1.getZ( ) )
+    );
+}
+
+inline const Matrix4 outer( const Vector4 & tfrm0, const Vector4 & tfrm1 )
+{
+    return Matrix4(
+        ( tfrm0 * tfrm1.getX( ) ),
+        ( tfrm0 * tfrm1.getY( ) ),
+        ( tfrm0 * tfrm1.getZ( ) ),
+        ( tfrm0 * tfrm1.getW( ) )
+    );
+}
+
+inline const Vector3 rowMul( const Vector3 & vec, const Matrix3 & mat )
+{
+    return Vector3(
+        ( ( ( vec.getX() * mat.getCol0().getX() ) + ( vec.getY() * mat.getCol0().getY() ) ) + ( vec.getZ() * mat.getCol0().getZ() ) ),
+        ( ( ( vec.getX() * mat.getCol1().getX() ) + ( vec.getY() * mat.getCol1().getY() ) ) + ( vec.getZ() * mat.getCol1().getZ() ) ),
+        ( ( ( vec.getX() * mat.getCol2().getX() ) + ( vec.getY() * mat.getCol2().getY() ) ) + ( vec.getZ() * mat.getCol2().getZ() ) )
+    );
+}
+
+inline const Matrix3 crossMatrix( const Vector3 & vec )
+{
+    return Matrix3(
+        Vector3( 0.0f, vec.getZ(), -vec.getY() ),
+        Vector3( -vec.getZ(), 0.0f, vec.getX() ),
+        Vector3( vec.getY(), -vec.getX(), 0.0f )
+    );
+}
+
+inline const Matrix3 crossMatrixMul( const Vector3 & vec, const Matrix3 & mat )
+{
+    return Matrix3( cross( vec, mat.getCol0() ), cross( vec, mat.getCol1() ), cross( vec, mat.getCol2() ) );
+}
+
+} // namespace Aos
+} // namespace Vectormath
+
+#endif
diff --git a/test/Bullet2/vectormath/scalar/quat_aos.h b/test/Bullet2/vectormath/scalar/quat_aos.h
new file mode 100644
index 000000000..764e01708
--- /dev/null
+++ b/test/Bullet2/vectormath/scalar/quat_aos.h
@@ -0,0 +1,433 @@
+/*
+   Copyright (C) 2009 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+
+*/
+
+#ifndef _VECTORMATH_QUAT_AOS_CPP_H
+#define _VECTORMATH_QUAT_AOS_CPP_H
+
+//-----------------------------------------------------------------------------
+// Definitions
+
+#ifndef _VECTORMATH_INTERNAL_FUNCTIONS
+#define _VECTORMATH_INTERNAL_FUNCTIONS
+
+#endif
+
+namespace Vectormath {
+namespace Aos {
+
+inline Quat::Quat( const Quat & quat )
+{
+    mX = quat.mX;
+    mY = quat.mY;
+    mZ = quat.mZ;
+    mW = quat.mW;
+}
+
+inline Quat::Quat( float _x, float _y, float _z, float _w )
+{
+    mX = _x;
+    mY = _y;
+    mZ = _z;
+    mW = _w;
+}
+
+inline Quat::Quat( const Vector3 & xyz, float _w )
+{
+    this->setXYZ( xyz );
+    this->setW( _w );
+}
+
+inline Quat::Quat( const Vector4 & vec )
+{
+    mX = vec.getX();
+    mY = vec.getY();
+    mZ = vec.getZ();
+    mW = vec.getW();
+}
+
+inline Quat::Quat( float scalar )
+{
+    mX = scalar;
+    mY = scalar;
+    mZ = scalar;
+    mW = scalar;
+}
+
+inline const Quat Quat::identity( )
+{
+    return Quat( 0.0f, 0.0f, 0.0f, 1.0f );
+}
+
+inline const Quat lerp( float t, const Quat & quat0, const Quat & quat1 )
+{
+    return ( quat0 + ( ( quat1 - quat0 ) * t ) );
+}
+
+inline const Quat slerp( float t, const Quat & unitQuat0, const Quat & unitQuat1 )
+{
+    Quat start;
+    float recipSinAngle, scale0, scale1, cosAngle, angle;
+    cosAngle = dot( unitQuat0, unitQuat1 );
+    if ( cosAngle < 0.0f ) {
+        cosAngle = -cosAngle;
+        start = ( -unitQuat0 );
+    } else {
+        start = unitQuat0;
+    }
+    if ( cosAngle < _VECTORMATH_SLERP_TOL ) {
+        angle = acosf( cosAngle );
+        recipSinAngle = ( 1.0f / sinf( angle ) );
+        scale0 = ( sinf( ( ( 1.0f - t ) * angle ) ) * recipSinAngle );
+        scale1 = ( sinf( ( t * angle ) ) * recipSinAngle );
+    } else {
+        scale0 = ( 1.0f - t );
+        scale1 = t;
+    }
+    return ( ( start * scale0 ) + ( unitQuat1 * scale1 ) );
+}
+
+inline const Quat squad( float t, const Quat & unitQuat0, const Quat & unitQuat1, const Quat & unitQuat2, const Quat & unitQuat3 )
+{
+    Quat tmp0, tmp1;
+    tmp0 = slerp( t, unitQuat0, unitQuat3 );
+    tmp1 = slerp( t, unitQuat1, unitQuat2 );
+    return slerp( ( ( 2.0f * t ) * ( 1.0f - t ) ), tmp0, tmp1 );
+}
+
+inline void loadXYZW( Quat & quat, const float * fptr )
+{
+    quat = Quat( fptr[0], fptr[1], fptr[2], fptr[3] );
+}
+
+inline void storeXYZW( const Quat & quat, float * fptr )
+{
+    fptr[0] = quat.getX();
+    fptr[1] = quat.getY();
+    fptr[2] = quat.getZ();
+    fptr[3] = quat.getW();
+}
+
+inline Quat & Quat::operator =( const Quat & quat )
+{
+    mX = quat.mX;
+    mY = quat.mY;
+    mZ = quat.mZ;
+    mW = quat.mW;
+    return *this;
+}
+
+inline Quat & Quat::setXYZ( const Vector3 & vec )
+{
+    mX = vec.getX();
+    mY = vec.getY();
+    mZ = vec.getZ();
+    return *this;
+}
+
+inline const Vector3 Quat::getXYZ( ) const
+{
+    return Vector3( mX, mY, mZ );
+}
+
+inline Quat & Quat::setX( float _x )
+{
+    mX = _x;
+    return *this;
+}
+
+inline float Quat::getX( ) const
+{
+    return mX;
+}
+
+inline Quat & Quat::setY( float _y )
+{
+    mY = _y;
+    return *this;
+}
+
+inline float Quat::getY( ) const
+{
+    return mY;
+}
+
+inline Quat & Quat::setZ( float _z )
+{
+    mZ = _z;
+    return *this;
+}
+
+inline float Quat::getZ( ) const
+{
+    return mZ;
+}
+
+inline Quat & Quat::setW( float _w )
+{
+    mW = _w;
+    return *this;
+}
+
+inline float Quat::getW( ) const
+{
+    return mW;
+}
+
+inline Quat & Quat::setElem( int idx, float value )
+{
+    *(&mX + idx) = value;
+    return *this;
+}
+
+inline float Quat::getElem( int idx ) const
+{
+    return *(&mX + idx);
+}
+
+inline float & Quat::operator []( int idx )
+{
+    return *(&mX + idx);
+}
+
+inline float Quat::operator []( int idx ) const
+{
+    return *(&mX + idx);
+}
+
+inline const Quat Quat::operator +( const Quat & quat ) const
+{
+    return Quat(
+        ( mX + quat.mX ),
+        ( mY + quat.mY ),
+        ( mZ + quat.mZ ),
+        ( mW + quat.mW )
+    );
+}
+
+inline const Quat Quat::operator -( const Quat & quat ) const
+{
+    return Quat(
+        ( mX - quat.mX ),
+        ( mY - quat.mY ),
+        ( mZ - quat.mZ ),
+        ( mW - quat.mW )
+    );
+}
+
+inline const Quat Quat::operator *( float scalar ) const
+{
+    return Quat(
+        ( mX * scalar ),
+        ( mY * scalar ),
+        ( mZ * scalar ),
+        ( mW * scalar )
+    );
+}
+
+inline Quat & Quat::operator +=( const Quat & quat )
+{
+    *this = *this + quat;
+    return *this;
+}
+
+inline Quat & Quat::operator -=( const Quat & quat )
+{
+    *this = *this - quat;
+    return *this;
+}
+
+inline Quat & Quat::operator *=( float scalar )
+{
+    *this = *this * scalar;
+    return *this;
+}
+
+inline const Quat Quat::operator /( float scalar ) const
+{
+    return Quat(
+        ( mX / scalar ),
+        ( mY / scalar ),
+        ( mZ / scalar ),
+        ( mW / scalar )
+    );
+}
+
+inline Quat & Quat::operator /=( float scalar )
+{
+    *this = *this / scalar;
+    return *this;
+}
+
+inline const Quat Quat::operator -( ) const
+{
+    return Quat(
+        -mX,
+        -mY,
+        -mZ,
+        -mW
+    );
+}
+
+inline const Quat operator *( float scalar, const Quat & quat )
+{
+    return quat * scalar;
+}
+
+inline float dot( const Quat & quat0, const Quat & quat1 )
+{
+    float result;
+    result = ( quat0.getX() * quat1.getX() );
+    result = ( result + ( quat0.getY() * quat1.getY() ) );
+    result = ( result + ( quat0.getZ() * quat1.getZ() ) );
+    result = ( result + ( quat0.getW() * quat1.getW() ) );
+    return result;
+}
+
+inline float norm( const Quat & quat )
+{
+    float result;
+    result = ( quat.getX() * quat.getX() );
+    result = ( result + ( quat.getY() * quat.getY() ) );
+    result = ( result + ( quat.getZ() * quat.getZ() ) );
+    result = ( result + ( quat.getW() * quat.getW() ) );
+    return result;
+}
+
+inline float length( const Quat & quat )
+{
+    return ::sqrtf( norm( quat ) );
+}
+
+inline const Quat normalize( const Quat & quat )
+{
+    float lenSqr, lenInv;
+    lenSqr = norm( quat );
+    lenInv = ( 1.0f / sqrtf( lenSqr ) );
+    return Quat(
+        ( quat.getX() * lenInv ),
+        ( quat.getY() * lenInv ),
+        ( quat.getZ() * lenInv ),
+        ( quat.getW() * lenInv )
+    );
+}
+
+inline const Quat Quat::rotation( const Vector3 & unitVec0, const Vector3 & unitVec1 )
+{
+    float cosHalfAngleX2, recipCosHalfAngleX2;
+    cosHalfAngleX2 = sqrtf( ( 2.0f * ( 1.0f + dot( unitVec0, unitVec1 ) ) ) );
+    recipCosHalfAngleX2 = ( 1.0f / cosHalfAngleX2 );
+    return Quat( ( cross( unitVec0, unitVec1 ) * recipCosHalfAngleX2 ), ( cosHalfAngleX2 * 0.5f ) );
+}
+
+inline const Quat Quat::rotation( float radians, const Vector3 & unitVec )
+{
+    float s, c, angle;
+    angle = ( radians * 0.5f );
+    s = sinf( angle );
+    c = cosf( angle );
+    return Quat( ( unitVec * s ), c );
+}
+
+inline const Quat Quat::rotationX( float radians )
+{
+    float s, c, angle;
+    angle = ( radians * 0.5f );
+    s = sinf( angle );
+    c = cosf( angle );
+    return Quat( s, 0.0f, 0.0f, c );
+}
+
+inline const Quat Quat::rotationY( float radians )
+{
+    float s, c, angle;
+    angle = ( radians * 0.5f );
+    s = sinf( angle );
+    c = cosf( angle );
+    return Quat( 0.0f, s, 0.0f, c );
+}
+
+inline const Quat Quat::rotationZ( float radians )
+{
+    float s, c, angle;
+    angle = ( radians * 0.5f );
+    s = sinf( angle );
+    c = cosf( angle );
+    return Quat( 0.0f, 0.0f, s, c );
+}
+
+inline const Quat Quat::operator *( const Quat & quat ) const
+{
+    return Quat(
+        ( ( ( ( mW * quat.mX ) + ( mX * quat.mW ) ) + ( mY * quat.mZ ) ) - ( mZ * quat.mY ) ),
+        ( ( ( ( mW * quat.mY ) + ( mY * quat.mW ) ) + ( mZ * quat.mX ) ) - ( mX * quat.mZ ) ),
+        ( ( ( ( mW * quat.mZ ) + ( mZ * quat.mW ) ) + ( mX * quat.mY ) ) - ( mY * quat.mX ) ),
+        ( ( ( ( mW * quat.mW ) - ( mX * quat.mX ) ) - ( mY * quat.mY ) ) - ( mZ * quat.mZ ) )
+    );
+}
+
+inline Quat & Quat::operator *=( const Quat & quat )
+{
+    *this = *this * quat;
+    return *this;
+}
+
+inline const Vector3 rotate( const Quat & quat, const Vector3 & vec )
+{
+    float tmpX, tmpY, tmpZ, tmpW;
+    tmpX = ( ( ( quat.getW() * vec.getX() ) + ( quat.getY() * vec.getZ() ) ) - ( quat.getZ() * vec.getY() ) );
+    tmpY = ( ( ( quat.getW() * vec.getY() ) + ( quat.getZ() * vec.getX() ) ) - ( quat.getX() * vec.getZ() ) );
+    tmpZ = ( ( ( quat.getW() * vec.getZ() ) + ( quat.getX() * vec.getY() ) ) - ( quat.getY() * vec.getX() ) );
+    tmpW = ( ( ( quat.getX() * vec.getX() ) + ( quat.getY() * vec.getY() ) ) + ( quat.getZ() * vec.getZ() ) );
+    return Vector3(
+        ( ( ( ( tmpW * quat.getX() ) + ( tmpX * quat.getW() ) ) - ( tmpY * quat.getZ() ) ) + ( tmpZ * quat.getY() ) ),
+        ( ( ( ( tmpW * quat.getY() ) + ( tmpY * quat.getW() ) ) - ( tmpZ * quat.getX() ) ) + ( tmpX * quat.getZ() ) ),
+        ( ( ( ( tmpW * quat.getZ() ) + ( tmpZ * quat.getW() ) ) - ( tmpX * quat.getY() ) ) + ( tmpY * quat.getX() ) )
+    );
+}
+
+inline const Quat conj( const Quat & quat )
+{
+    return Quat( -quat.getX(), -quat.getY(), -quat.getZ(), quat.getW() );
+}
+
+inline const Quat select( const Quat & quat0, const Quat & quat1, bool select1 )
+{
+    return Quat(
+        ( select1 )? quat1.getX() : quat0.getX(),
+        ( select1 )? quat1.getY() : quat0.getY(),
+        ( select1 )? quat1.getZ() : quat0.getZ(),
+        ( select1 )? quat1.getW() : quat0.getW()
+    );
+}
+
+#ifdef _VECTORMATH_DEBUG
+
+inline void print( const Quat & quat )
+{
+    printf( "( %f %f %f %f )\n", quat.getX(), quat.getY(), quat.getZ(), quat.getW() );
+}
+
+inline void print( const Quat & quat, const char * name )
+{
+    printf( "%s: ( %f %f %f %f )\n", name, quat.getX(), quat.getY(), quat.getZ(), quat.getW() );
+}
+
+#endif
+
+} // namespace Aos
+} // namespace Vectormath
+
+#endif
diff --git a/test/Bullet2/vectormath/scalar/vec_aos.h b/test/Bullet2/vectormath/scalar/vec_aos.h
new file mode 100644
index 000000000..46d4d6b3e
--- /dev/null
+++ b/test/Bullet2/vectormath/scalar/vec_aos.h
@@ -0,0 +1,1426 @@
+/*
+   Copyright (C) 2009 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+
+*/
+
+#ifndef _VECTORMATH_VEC_AOS_CPP_H
+#define _VECTORMATH_VEC_AOS_CPP_H
+
+//-----------------------------------------------------------------------------
+// Constants
+
+#define _VECTORMATH_SLERP_TOL 0.999f
+
+//-----------------------------------------------------------------------------
+// Definitions
+
+#ifndef _VECTORMATH_INTERNAL_FUNCTIONS
+#define _VECTORMATH_INTERNAL_FUNCTIONS
+
+#endif
+
+namespace Vectormath {
+namespace Aos {
+
+inline Vector3::Vector3( const Vector3 & vec )
+{
+    mX = vec.mX;
+    mY = vec.mY;
+    mZ = vec.mZ;
+}
+
+inline Vector3::Vector3( float _x, float _y, float _z )
+{
+    mX = _x;
+    mY = _y;
+    mZ = _z;
+}
+
+inline Vector3::Vector3( const Point3 & pnt )
+{
+    mX = pnt.getX();
+    mY = pnt.getY();
+    mZ = pnt.getZ();
+}
+
+inline Vector3::Vector3( float scalar )
+{
+    mX = scalar;
+    mY = scalar;
+    mZ = scalar;
+}
+
+inline const Vector3 Vector3::xAxis( )
+{
+    return Vector3( 1.0f, 0.0f, 0.0f );
+}
+
+inline const Vector3 Vector3::yAxis( )
+{
+    return Vector3( 0.0f, 1.0f, 0.0f );
+}
+
+inline const Vector3 Vector3::zAxis( )
+{
+    return Vector3( 0.0f, 0.0f, 1.0f );
+}
+
+inline const Vector3 lerp( float t, const Vector3 & vec0, const Vector3 & vec1 )
+{
+    return ( vec0 + ( ( vec1 - vec0 ) * t ) );
+}
+
+inline const Vector3 slerp( float t, const Vector3 & unitVec0, const Vector3 & unitVec1 )
+{
+    float recipSinAngle, scale0, scale1, cosAngle, angle;
+    cosAngle = dot( unitVec0, unitVec1 );
+    if ( cosAngle < _VECTORMATH_SLERP_TOL ) {
+        angle = acosf( cosAngle );
+        recipSinAngle = ( 1.0f / sinf( angle ) );
+        scale0 = ( sinf( ( ( 1.0f - t ) * angle ) ) * recipSinAngle );
+        scale1 = ( sinf( ( t * angle ) ) * recipSinAngle );
+    } else {
+        scale0 = ( 1.0f - t );
+        scale1 = t;
+    }
+    return ( ( unitVec0 * scale0 ) + ( unitVec1 * scale1 ) );
+}
+
+inline void loadXYZ( Vector3 & vec, const float * fptr )
+{
+    vec = Vector3( fptr[0], fptr[1], fptr[2] );
+}
+
+inline void storeXYZ( const Vector3 & vec, float * fptr )
+{
+    fptr[0] = vec.getX();
+    fptr[1] = vec.getY();
+    fptr[2] = vec.getZ();
+}
+
+inline void loadHalfFloats( Vector3 & vec, const unsigned short * hfptr )
+{
+    union Data32 {
+        unsigned int u32;
+        float f32;
+    };
+
+    for (int i = 0; i < 3; i++) {
+        unsigned short fp16 = hfptr[i];
+        unsigned int sign = fp16 >> 15;
+        unsigned int exponent = (fp16 >> 10) & ((1 << 5) - 1);
+        unsigned int mantissa = fp16 & ((1 << 10) - 1);
+
+        if (exponent == 0) {
+            // zero
+            mantissa = 0;
+
+        } else if (exponent == 31) {
+            // infinity or nan -> infinity
+            exponent = 255;
+	    mantissa = 0;
+
+        } else {
+            exponent += 127 - 15;
+            mantissa <<= 13;
+        }
+
+        Data32 d;
+        d.u32 = (sign << 31) | (exponent << 23) | mantissa;
+        vec[i] = d.f32;
+    }
+}
+
+inline void storeHalfFloats( const Vector3 & vec, unsigned short * hfptr )
+{
+    union Data32 {
+        unsigned int u32;
+        float f32;
+    };
+
+    for (int i = 0; i < 3; i++) {
+        Data32 d;
+        d.f32 = vec[i];
+
+        unsigned int sign = d.u32 >> 31;
+        unsigned int exponent = (d.u32 >> 23) & ((1 << 8) - 1);
+        unsigned int mantissa = d.u32 & ((1 << 23) - 1);;
+
+        if (exponent == 0) {
+            // zero or denorm -> zero
+            mantissa = 0;
+
+        } else if (exponent == 255 && mantissa != 0) {
+            // nan -> infinity
+            exponent = 31;
+            mantissa = 0;
+
+        } else if (exponent >= 127 - 15 + 31) {
+            // overflow or infinity -> infinity
+            exponent = 31;
+            mantissa = 0;
+
+        } else if (exponent <= 127 - 15) {
+            // underflow -> zero
+            exponent = 0;
+            mantissa = 0;
+
+        } else {
+            exponent -= 127 - 15;
+            mantissa >>= 13;
+        }
+
+        hfptr[i] = (unsigned short)((sign << 15) | (exponent << 10) | mantissa);
+    }
+}
+
+inline Vector3 & Vector3::operator =( const Vector3 & vec )
+{
+    mX = vec.mX;
+    mY = vec.mY;
+    mZ = vec.mZ;
+    return *this;
+}
+
+inline Vector3 & Vector3::setX( float _x )
+{
+    mX = _x;
+    return *this;
+}
+
+inline float Vector3::getX( ) const
+{
+    return mX;
+}
+
+inline Vector3 & Vector3::setY( float _y )
+{
+    mY = _y;
+    return *this;
+}
+
+inline float Vector3::getY( ) const
+{
+    return mY;
+}
+
+inline Vector3 & Vector3::setZ( float _z )
+{
+    mZ = _z;
+    return *this;
+}
+
+inline float Vector3::getZ( ) const
+{
+    return mZ;
+}
+
+inline Vector3 & Vector3::setElem( int idx, float value )
+{
+    *(&mX + idx) = value;
+    return *this;
+}
+
+inline float Vector3::getElem( int idx ) const
+{
+    return *(&mX + idx);
+}
+
+inline float & Vector3::operator []( int idx )
+{
+    return *(&mX + idx);
+}
+
+inline float Vector3::operator []( int idx ) const
+{
+    return *(&mX + idx);
+}
+
+inline const Vector3 Vector3::operator +( const Vector3 & vec ) const
+{
+    return Vector3(
+        ( mX + vec.mX ),
+        ( mY + vec.mY ),
+        ( mZ + vec.mZ )
+    );
+}
+
+inline const Vector3 Vector3::operator -( const Vector3 & vec ) const
+{
+    return Vector3(
+        ( mX - vec.mX ),
+        ( mY - vec.mY ),
+        ( mZ - vec.mZ )
+    );
+}
+
+inline const Point3 Vector3::operator +( const Point3 & pnt ) const
+{
+    return Point3(
+        ( mX + pnt.getX() ),
+        ( mY + pnt.getY() ),
+        ( mZ + pnt.getZ() )
+    );
+}
+
+inline const Vector3 Vector3::operator *( float scalar ) const
+{
+    return Vector3(
+        ( mX * scalar ),
+        ( mY * scalar ),
+        ( mZ * scalar )
+    );
+}
+
+inline Vector3 & Vector3::operator +=( const Vector3 & vec )
+{
+    *this = *this + vec;
+    return *this;
+}
+
+inline Vector3 & Vector3::operator -=( const Vector3 & vec )
+{
+    *this = *this - vec;
+    return *this;
+}
+
+inline Vector3 & Vector3::operator *=( float scalar )
+{
+    *this = *this * scalar;
+    return *this;
+}
+
+inline const Vector3 Vector3::operator /( float scalar ) const
+{
+    return Vector3(
+        ( mX / scalar ),
+        ( mY / scalar ),
+        ( mZ / scalar )
+    );
+}
+
+inline Vector3 & Vector3::operator /=( float scalar )
+{
+    *this = *this / scalar;
+    return *this;
+}
+
+inline const Vector3 Vector3::operator -( ) const
+{
+    return Vector3(
+        -mX,
+        -mY,
+        -mZ
+    );
+}
+
+inline const Vector3 operator *( float scalar, const Vector3 & vec )
+{
+    return vec * scalar;
+}
+
+inline const Vector3 mulPerElem( const Vector3 & vec0, const Vector3 & vec1 )
+{
+    return Vector3(
+        ( vec0.getX() * vec1.getX() ),
+        ( vec0.getY() * vec1.getY() ),
+        ( vec0.getZ() * vec1.getZ() )
+    );
+}
+
+inline const Vector3 divPerElem( const Vector3 & vec0, const Vector3 & vec1 )
+{
+    return Vector3(
+        ( vec0.getX() / vec1.getX() ),
+        ( vec0.getY() / vec1.getY() ),
+        ( vec0.getZ() / vec1.getZ() )
+    );
+}
+
+inline const Vector3 recipPerElem( const Vector3 & vec )
+{
+    return Vector3(
+        ( 1.0f / vec.getX() ),
+        ( 1.0f / vec.getY() ),
+        ( 1.0f / vec.getZ() )
+    );
+}
+
+inline const Vector3 sqrtPerElem( const Vector3 & vec )
+{
+    return Vector3(
+        sqrtf( vec.getX() ),
+        sqrtf( vec.getY() ),
+        sqrtf( vec.getZ() )
+    );
+}
+
+inline const Vector3 rsqrtPerElem( const Vector3 & vec )
+{
+    return Vector3(
+        ( 1.0f / sqrtf( vec.getX() ) ),
+        ( 1.0f / sqrtf( vec.getY() ) ),
+        ( 1.0f / sqrtf( vec.getZ() ) )
+    );
+}
+
+inline const Vector3 absPerElem( const Vector3 & vec )
+{
+    return Vector3(
+        fabsf( vec.getX() ),
+        fabsf( vec.getY() ),
+        fabsf( vec.getZ() )
+    );
+}
+
+inline const Vector3 copySignPerElem( const Vector3 & vec0, const Vector3 & vec1 )
+{
+    return Vector3(
+        ( vec1.getX() < 0.0f )? -fabsf( vec0.getX() ) : fabsf( vec0.getX() ),
+        ( vec1.getY() < 0.0f )? -fabsf( vec0.getY() ) : fabsf( vec0.getY() ),
+        ( vec1.getZ() < 0.0f )? -fabsf( vec0.getZ() ) : fabsf( vec0.getZ() )
+    );
+}
+
+inline const Vector3 maxPerElem( const Vector3 & vec0, const Vector3 & vec1 )
+{
+    return Vector3(
+        (vec0.getX() > vec1.getX())? vec0.getX() : vec1.getX(),
+        (vec0.getY() > vec1.getY())? vec0.getY() : vec1.getY(),
+        (vec0.getZ() > vec1.getZ())? vec0.getZ() : vec1.getZ()
+    );
+}
+
+inline float maxElem( const Vector3 & vec )
+{
+    float result;
+    result = (vec.getX() > vec.getY())? vec.getX() : vec.getY();
+    result = (vec.getZ() > result)? vec.getZ() : result;
+    return result;
+}
+
+inline const Vector3 minPerElem( const Vector3 & vec0, const Vector3 & vec1 )
+{
+    return Vector3(
+        (vec0.getX() < vec1.getX())? vec0.getX() : vec1.getX(),
+        (vec0.getY() < vec1.getY())? vec0.getY() : vec1.getY(),
+        (vec0.getZ() < vec1.getZ())? vec0.getZ() : vec1.getZ()
+    );
+}
+
+inline float minElem( const Vector3 & vec )
+{
+    float result;
+    result = (vec.getX() < vec.getY())? vec.getX() : vec.getY();
+    result = (vec.getZ() < result)? vec.getZ() : result;
+    return result;
+}
+
+inline float sum( const Vector3 & vec )
+{
+    float result;
+    result = ( vec.getX() + vec.getY() );
+    result = ( result + vec.getZ() );
+    return result;
+}
+
+inline float dot( const Vector3 & vec0, const Vector3 & vec1 )
+{
+    float result;
+    result = ( vec0.getX() * vec1.getX() );
+    result = ( result + ( vec0.getY() * vec1.getY() ) );
+    result = ( result + ( vec0.getZ() * vec1.getZ() ) );
+    return result;
+}
+
+inline float lengthSqr( const Vector3 & vec )
+{
+    float result;
+    result = ( vec.getX() * vec.getX() );
+    result = ( result + ( vec.getY() * vec.getY() ) );
+    result = ( result + ( vec.getZ() * vec.getZ() ) );
+    return result;
+}
+
+inline float length( const Vector3 & vec )
+{
+    return ::sqrtf( lengthSqr( vec ) );
+}
+
+inline const Vector3 normalize( const Vector3 & vec )
+{
+    float lenSqr, lenInv;
+    lenSqr = lengthSqr( vec );
+    lenInv = ( 1.0f / sqrtf( lenSqr ) );
+    return Vector3(
+        ( vec.getX() * lenInv ),
+        ( vec.getY() * lenInv ),
+        ( vec.getZ() * lenInv )
+    );
+}
+
+inline const Vector3 cross( const Vector3 & vec0, const Vector3 & vec1 )
+{
+    return Vector3(
+        ( ( vec0.getY() * vec1.getZ() ) - ( vec0.getZ() * vec1.getY() ) ),
+        ( ( vec0.getZ() * vec1.getX() ) - ( vec0.getX() * vec1.getZ() ) ),
+        ( ( vec0.getX() * vec1.getY() ) - ( vec0.getY() * vec1.getX() ) )
+    );
+}
+
+inline const Vector3 select( const Vector3 & vec0, const Vector3 & vec1, bool select1 )
+{
+    return Vector3(
+        ( select1 )? vec1.getX() : vec0.getX(),
+        ( select1 )? vec1.getY() : vec0.getY(),
+        ( select1 )? vec1.getZ() : vec0.getZ()
+    );
+}
+
+#ifdef _VECTORMATH_DEBUG
+
+inline void print( const Vector3 & vec )
+{
+    printf( "( %f %f %f )\n", vec.getX(), vec.getY(), vec.getZ() );
+}
+
+inline void print( const Vector3 & vec, const char * name )
+{
+    printf( "%s: ( %f %f %f )\n", name, vec.getX(), vec.getY(), vec.getZ() );
+}
+
+#endif
+
+inline Vector4::Vector4( const Vector4 & vec )
+{
+    mX = vec.mX;
+    mY = vec.mY;
+    mZ = vec.mZ;
+    mW = vec.mW;
+}
+
+inline Vector4::Vector4( float _x, float _y, float _z, float _w )
+{
+    mX = _x;
+    mY = _y;
+    mZ = _z;
+    mW = _w;
+}
+
+inline Vector4::Vector4( const Vector3 & xyz, float _w )
+{
+    this->setXYZ( xyz );
+    this->setW( _w );
+}
+
+inline Vector4::Vector4( const Vector3 & vec )
+{
+    mX = vec.getX();
+    mY = vec.getY();
+    mZ = vec.getZ();
+    mW = 0.0f;
+}
+
+inline Vector4::Vector4( const Point3 & pnt )
+{
+    mX = pnt.getX();
+    mY = pnt.getY();
+    mZ = pnt.getZ();
+    mW = 1.0f;
+}
+
+inline Vector4::Vector4( const Quat & quat )
+{
+    mX = quat.getX();
+    mY = quat.getY();
+    mZ = quat.getZ();
+    mW = quat.getW();
+}
+
+inline Vector4::Vector4( float scalar )
+{
+    mX = scalar;
+    mY = scalar;
+    mZ = scalar;
+    mW = scalar;
+}
+
+inline const Vector4 Vector4::xAxis( )
+{
+    return Vector4( 1.0f, 0.0f, 0.0f, 0.0f );
+}
+
+inline const Vector4 Vector4::yAxis( )
+{
+    return Vector4( 0.0f, 1.0f, 0.0f, 0.0f );
+}
+
+inline const Vector4 Vector4::zAxis( )
+{
+    return Vector4( 0.0f, 0.0f, 1.0f, 0.0f );
+}
+
+inline const Vector4 Vector4::wAxis( )
+{
+    return Vector4( 0.0f, 0.0f, 0.0f, 1.0f );
+}
+
+inline const Vector4 lerp( float t, const Vector4 & vec0, const Vector4 & vec1 )
+{
+    return ( vec0 + ( ( vec1 - vec0 ) * t ) );
+}
+
+inline const Vector4 slerp( float t, const Vector4 & unitVec0, const Vector4 & unitVec1 )
+{
+    float recipSinAngle, scale0, scale1, cosAngle, angle;
+    cosAngle = dot( unitVec0, unitVec1 );
+    if ( cosAngle < _VECTORMATH_SLERP_TOL ) {
+        angle = acosf( cosAngle );
+        recipSinAngle = ( 1.0f / sinf( angle ) );
+        scale0 = ( sinf( ( ( 1.0f - t ) * angle ) ) * recipSinAngle );
+        scale1 = ( sinf( ( t * angle ) ) * recipSinAngle );
+    } else {
+        scale0 = ( 1.0f - t );
+        scale1 = t;
+    }
+    return ( ( unitVec0 * scale0 ) + ( unitVec1 * scale1 ) );
+}
+
+inline void loadXYZW( Vector4 & vec, const float * fptr )
+{
+    vec = Vector4( fptr[0], fptr[1], fptr[2], fptr[3] );
+}
+
+inline void storeXYZW( const Vector4 & vec, float * fptr )
+{
+    fptr[0] = vec.getX();
+    fptr[1] = vec.getY();
+    fptr[2] = vec.getZ();
+    fptr[3] = vec.getW();
+}
+
+inline void loadHalfFloats( Vector4 & vec, const unsigned short * hfptr )
+{
+    union Data32 {
+        unsigned int u32;
+        float f32;
+    };
+
+    for (int i = 0; i < 4; i++) {
+        unsigned short fp16 = hfptr[i];
+        unsigned int sign = fp16 >> 15;
+        unsigned int exponent = (fp16 >> 10) & ((1 << 5) - 1);
+        unsigned int mantissa = fp16 & ((1 << 10) - 1);
+
+        if (exponent == 0) {
+            // zero
+            mantissa = 0;
+
+        } else if (exponent == 31) {
+            // infinity or nan -> infinity
+            exponent = 255;
+	    mantissa = 0;
+
+        } else {
+            exponent += 127 - 15;
+            mantissa <<= 13;
+        }
+
+        Data32 d;
+        d.u32 = (sign << 31) | (exponent << 23) | mantissa;
+        vec[i] = d.f32;
+    }
+}
+
+inline void storeHalfFloats( const Vector4 & vec, unsigned short * hfptr )
+{
+    union Data32 {
+        unsigned int u32;
+        float f32;
+    };
+
+    for (int i = 0; i < 4; i++) {
+        Data32 d;
+        d.f32 = vec[i];
+
+        unsigned int sign = d.u32 >> 31;
+        unsigned int exponent = (d.u32 >> 23) & ((1 << 8) - 1);
+        unsigned int mantissa = d.u32 & ((1 << 23) - 1);;
+
+        if (exponent == 0) {
+            // zero or denorm -> zero
+            mantissa = 0;
+
+        } else if (exponent == 255 && mantissa != 0) {
+            // nan -> infinity
+            exponent = 31;
+            mantissa = 0;
+
+        } else if (exponent >= 127 - 15 + 31) {
+            // overflow or infinity -> infinity
+            exponent = 31;
+            mantissa = 0;
+
+        } else if (exponent <= 127 - 15) {
+            // underflow -> zero
+            exponent = 0;
+            mantissa = 0;
+
+        } else {
+            exponent -= 127 - 15;
+            mantissa >>= 13;
+        }
+
+        hfptr[i] = (unsigned short)((sign << 15) | (exponent << 10) | mantissa);
+    }
+}
+
+inline Vector4 & Vector4::operator =( const Vector4 & vec )
+{
+    mX = vec.mX;
+    mY = vec.mY;
+    mZ = vec.mZ;
+    mW = vec.mW;
+    return *this;
+}
+
+inline Vector4 & Vector4::setXYZ( const Vector3 & vec )
+{
+    mX = vec.getX();
+    mY = vec.getY();
+    mZ = vec.getZ();
+    return *this;
+}
+
+inline const Vector3 Vector4::getXYZ( ) const
+{
+    return Vector3( mX, mY, mZ );
+}
+
+inline Vector4 & Vector4::setX( float _x )
+{
+    mX = _x;
+    return *this;
+}
+
+inline float Vector4::getX( ) const
+{
+    return mX;
+}
+
+inline Vector4 & Vector4::setY( float _y )
+{
+    mY = _y;
+    return *this;
+}
+
+inline float Vector4::getY( ) const
+{
+    return mY;
+}
+
+inline Vector4 & Vector4::setZ( float _z )
+{
+    mZ = _z;
+    return *this;
+}
+
+inline float Vector4::getZ( ) const
+{
+    return mZ;
+}
+
+inline Vector4 & Vector4::setW( float _w )
+{
+    mW = _w;
+    return *this;
+}
+
+inline float Vector4::getW( ) const
+{
+    return mW;
+}
+
+inline Vector4 & Vector4::setElem( int idx, float value )
+{
+    *(&mX + idx) = value;
+    return *this;
+}
+
+inline float Vector4::getElem( int idx ) const
+{
+    return *(&mX + idx);
+}
+
+inline float & Vector4::operator []( int idx )
+{
+    return *(&mX + idx);
+}
+
+inline float Vector4::operator []( int idx ) const
+{
+    return *(&mX + idx);
+}
+
+inline const Vector4 Vector4::operator +( const Vector4 & vec ) const
+{
+    return Vector4(
+        ( mX + vec.mX ),
+        ( mY + vec.mY ),
+        ( mZ + vec.mZ ),
+        ( mW + vec.mW )
+    );
+}
+
+inline const Vector4 Vector4::operator -( const Vector4 & vec ) const
+{
+    return Vector4(
+        ( mX - vec.mX ),
+        ( mY - vec.mY ),
+        ( mZ - vec.mZ ),
+        ( mW - vec.mW )
+    );
+}
+
+inline const Vector4 Vector4::operator *( float scalar ) const
+{
+    return Vector4(
+        ( mX * scalar ),
+        ( mY * scalar ),
+        ( mZ * scalar ),
+        ( mW * scalar )
+    );
+}
+
+inline Vector4 & Vector4::operator +=( const Vector4 & vec )
+{
+    *this = *this + vec;
+    return *this;
+}
+
+inline Vector4 & Vector4::operator -=( const Vector4 & vec )
+{
+    *this = *this - vec;
+    return *this;
+}
+
+inline Vector4 & Vector4::operator *=( float scalar )
+{
+    *this = *this * scalar;
+    return *this;
+}
+
+inline const Vector4 Vector4::operator /( float scalar ) const
+{
+    return Vector4(
+        ( mX / scalar ),
+        ( mY / scalar ),
+        ( mZ / scalar ),
+        ( mW / scalar )
+    );
+}
+
+inline Vector4 & Vector4::operator /=( float scalar )
+{
+    *this = *this / scalar;
+    return *this;
+}
+
+inline const Vector4 Vector4::operator -( ) const
+{
+    return Vector4(
+        -mX,
+        -mY,
+        -mZ,
+        -mW
+    );
+}
+
+inline const Vector4 operator *( float scalar, const Vector4 & vec )
+{
+    return vec * scalar;
+}
+
+inline const Vector4 mulPerElem( const Vector4 & vec0, const Vector4 & vec1 )
+{
+    return Vector4(
+        ( vec0.getX() * vec1.getX() ),
+        ( vec0.getY() * vec1.getY() ),
+        ( vec0.getZ() * vec1.getZ() ),
+        ( vec0.getW() * vec1.getW() )
+    );
+}
+
+inline const Vector4 divPerElem( const Vector4 & vec0, const Vector4 & vec1 )
+{
+    return Vector4(
+        ( vec0.getX() / vec1.getX() ),
+        ( vec0.getY() / vec1.getY() ),
+        ( vec0.getZ() / vec1.getZ() ),
+        ( vec0.getW() / vec1.getW() )
+    );
+}
+
+inline const Vector4 recipPerElem( const Vector4 & vec )
+{
+    return Vector4(
+        ( 1.0f / vec.getX() ),
+        ( 1.0f / vec.getY() ),
+        ( 1.0f / vec.getZ() ),
+        ( 1.0f / vec.getW() )
+    );
+}
+
+inline const Vector4 sqrtPerElem( const Vector4 & vec )
+{
+    return Vector4(
+        sqrtf( vec.getX() ),
+        sqrtf( vec.getY() ),
+        sqrtf( vec.getZ() ),
+        sqrtf( vec.getW() )
+    );
+}
+
+inline const Vector4 rsqrtPerElem( const Vector4 & vec )
+{
+    return Vector4(
+        ( 1.0f / sqrtf( vec.getX() ) ),
+        ( 1.0f / sqrtf( vec.getY() ) ),
+        ( 1.0f / sqrtf( vec.getZ() ) ),
+        ( 1.0f / sqrtf( vec.getW() ) )
+    );
+}
+
+inline const Vector4 absPerElem( const Vector4 & vec )
+{
+    return Vector4(
+        fabsf( vec.getX() ),
+        fabsf( vec.getY() ),
+        fabsf( vec.getZ() ),
+        fabsf( vec.getW() )
+    );
+}
+
+inline const Vector4 copySignPerElem( const Vector4 & vec0, const Vector4 & vec1 )
+{
+    return Vector4(
+        ( vec1.getX() < 0.0f )? -fabsf( vec0.getX() ) : fabsf( vec0.getX() ),
+        ( vec1.getY() < 0.0f )? -fabsf( vec0.getY() ) : fabsf( vec0.getY() ),
+        ( vec1.getZ() < 0.0f )? -fabsf( vec0.getZ() ) : fabsf( vec0.getZ() ),
+        ( vec1.getW() < 0.0f )? -fabsf( vec0.getW() ) : fabsf( vec0.getW() )
+    );
+}
+
+inline const Vector4 maxPerElem( const Vector4 & vec0, const Vector4 & vec1 )
+{
+    return Vector4(
+        (vec0.getX() > vec1.getX())? vec0.getX() : vec1.getX(),
+        (vec0.getY() > vec1.getY())? vec0.getY() : vec1.getY(),
+        (vec0.getZ() > vec1.getZ())? vec0.getZ() : vec1.getZ(),
+        (vec0.getW() > vec1.getW())? vec0.getW() : vec1.getW()
+    );
+}
+
+inline float maxElem( const Vector4 & vec )
+{
+    float result;
+    result = (vec.getX() > vec.getY())? vec.getX() : vec.getY();
+    result = (vec.getZ() > result)? vec.getZ() : result;
+    result = (vec.getW() > result)? vec.getW() : result;
+    return result;
+}
+
+inline const Vector4 minPerElem( const Vector4 & vec0, const Vector4 & vec1 )
+{
+    return Vector4(
+        (vec0.getX() < vec1.getX())? vec0.getX() : vec1.getX(),
+        (vec0.getY() < vec1.getY())? vec0.getY() : vec1.getY(),
+        (vec0.getZ() < vec1.getZ())? vec0.getZ() : vec1.getZ(),
+        (vec0.getW() < vec1.getW())? vec0.getW() : vec1.getW()
+    );
+}
+
+inline float minElem( const Vector4 & vec )
+{
+    float result;
+    result = (vec.getX() < vec.getY())? vec.getX() : vec.getY();
+    result = (vec.getZ() < result)? vec.getZ() : result;
+    result = (vec.getW() < result)? vec.getW() : result;
+    return result;
+}
+
+inline float sum( const Vector4 & vec )
+{
+    float result;
+    result = ( vec.getX() + vec.getY() );
+    result = ( result + vec.getZ() );
+    result = ( result + vec.getW() );
+    return result;
+}
+
+inline float dot( const Vector4 & vec0, const Vector4 & vec1 )
+{
+    float result;
+    result = ( vec0.getX() * vec1.getX() );
+    result = ( result + ( vec0.getY() * vec1.getY() ) );
+    result = ( result + ( vec0.getZ() * vec1.getZ() ) );
+    result = ( result + ( vec0.getW() * vec1.getW() ) );
+    return result;
+}
+
+inline float lengthSqr( const Vector4 & vec )
+{
+    float result;
+    result = ( vec.getX() * vec.getX() );
+    result = ( result + ( vec.getY() * vec.getY() ) );
+    result = ( result + ( vec.getZ() * vec.getZ() ) );
+    result = ( result + ( vec.getW() * vec.getW() ) );
+    return result;
+}
+
+inline float length( const Vector4 & vec )
+{
+    return ::sqrtf( lengthSqr( vec ) );
+}
+
+inline const Vector4 normalize( const Vector4 & vec )
+{
+    float lenSqr, lenInv;
+    lenSqr = lengthSqr( vec );
+    lenInv = ( 1.0f / sqrtf( lenSqr ) );
+    return Vector4(
+        ( vec.getX() * lenInv ),
+        ( vec.getY() * lenInv ),
+        ( vec.getZ() * lenInv ),
+        ( vec.getW() * lenInv )
+    );
+}
+
+inline const Vector4 select( const Vector4 & vec0, const Vector4 & vec1, bool select1 )
+{
+    return Vector4(
+        ( select1 )? vec1.getX() : vec0.getX(),
+        ( select1 )? vec1.getY() : vec0.getY(),
+        ( select1 )? vec1.getZ() : vec0.getZ(),
+        ( select1 )? vec1.getW() : vec0.getW()
+    );
+}
+
+#ifdef _VECTORMATH_DEBUG
+
+inline void print( const Vector4 & vec )
+{
+    printf( "( %f %f %f %f )\n", vec.getX(), vec.getY(), vec.getZ(), vec.getW() );
+}
+
+inline void print( const Vector4 & vec, const char * name )
+{
+    printf( "%s: ( %f %f %f %f )\n", name, vec.getX(), vec.getY(), vec.getZ(), vec.getW() );
+}
+
+#endif
+
+inline Point3::Point3( const Point3 & pnt )
+{
+    mX = pnt.mX;
+    mY = pnt.mY;
+    mZ = pnt.mZ;
+}
+
+inline Point3::Point3( float _x, float _y, float _z )
+{
+    mX = _x;
+    mY = _y;
+    mZ = _z;
+}
+
+inline Point3::Point3( const Vector3 & vec )
+{
+    mX = vec.getX();
+    mY = vec.getY();
+    mZ = vec.getZ();
+}
+
+inline Point3::Point3( float scalar )
+{
+    mX = scalar;
+    mY = scalar;
+    mZ = scalar;
+}
+
+inline const Point3 lerp( float t, const Point3 & pnt0, const Point3 & pnt1 )
+{
+    return ( pnt0 + ( ( pnt1 - pnt0 ) * t ) );
+}
+
+inline void loadXYZ( Point3 & pnt, const float * fptr )
+{
+    pnt = Point3( fptr[0], fptr[1], fptr[2] );
+}
+
+inline void storeXYZ( const Point3 & pnt, float * fptr )
+{
+    fptr[0] = pnt.getX();
+    fptr[1] = pnt.getY();
+    fptr[2] = pnt.getZ();
+}
+
+inline void loadHalfFloats( Point3 & vec, const unsigned short * hfptr )
+{
+    union Data32 {
+        unsigned int u32;
+        float f32;
+    };
+
+    for (int i = 0; i < 3; i++) {
+        unsigned short fp16 = hfptr[i];
+        unsigned int sign = fp16 >> 15;
+        unsigned int exponent = (fp16 >> 10) & ((1 << 5) - 1);
+        unsigned int mantissa = fp16 & ((1 << 10) - 1);
+
+        if (exponent == 0) {
+            // zero
+            mantissa = 0;
+
+        } else if (exponent == 31) {
+            // infinity or nan -> infinity
+            exponent = 255;
+	    mantissa = 0;
+
+        } else {
+            exponent += 127 - 15;
+            mantissa <<= 13;
+        }
+
+        Data32 d;
+        d.u32 = (sign << 31) | (exponent << 23) | mantissa;
+        vec[i] = d.f32;
+    }
+}
+
+inline void storeHalfFloats( const Point3 & vec, unsigned short * hfptr )
+{
+    union Data32 {
+        unsigned int u32;
+        float f32;
+    };
+
+    for (int i = 0; i < 3; i++) {
+        Data32 d;
+        d.f32 = vec[i];
+
+        unsigned int sign = d.u32 >> 31;
+        unsigned int exponent = (d.u32 >> 23) & ((1 << 8) - 1);
+        unsigned int mantissa = d.u32 & ((1 << 23) - 1);;
+
+        if (exponent == 0) {
+            // zero or denorm -> zero
+            mantissa = 0;
+
+        } else if (exponent == 255 && mantissa != 0) {
+            // nan -> infinity
+            exponent = 31;
+            mantissa = 0;
+
+        } else if (exponent >= 127 - 15 + 31) {
+            // overflow or infinity -> infinity
+            exponent = 31;
+            mantissa = 0;
+
+        } else if (exponent <= 127 - 15) {
+            // underflow -> zero
+            exponent = 0;
+            mantissa = 0;
+
+        } else {
+            exponent -= 127 - 15;
+            mantissa >>= 13;
+        }
+
+        hfptr[i] = (unsigned short)((sign << 15) | (exponent << 10) | mantissa);
+    }
+}
+
+inline Point3 & Point3::operator =( const Point3 & pnt )
+{
+    mX = pnt.mX;
+    mY = pnt.mY;
+    mZ = pnt.mZ;
+    return *this;
+}
+
+inline Point3 & Point3::setX( float _x )
+{
+    mX = _x;
+    return *this;
+}
+
+inline float Point3::getX( ) const
+{
+    return mX;
+}
+
+inline Point3 & Point3::setY( float _y )
+{
+    mY = _y;
+    return *this;
+}
+
+inline float Point3::getY( ) const
+{
+    return mY;
+}
+
+inline Point3 & Point3::setZ( float _z )
+{
+    mZ = _z;
+    return *this;
+}
+
+inline float Point3::getZ( ) const
+{
+    return mZ;
+}
+
+inline Point3 & Point3::setElem( int idx, float value )
+{
+    *(&mX + idx) = value;
+    return *this;
+}
+
+inline float Point3::getElem( int idx ) const
+{
+    return *(&mX + idx);
+}
+
+inline float & Point3::operator []( int idx )
+{
+    return *(&mX + idx);
+}
+
+inline float Point3::operator []( int idx ) const
+{
+    return *(&mX + idx);
+}
+
+inline const Vector3 Point3::operator -( const Point3 & pnt ) const
+{
+    return Vector3(
+        ( mX - pnt.mX ),
+        ( mY - pnt.mY ),
+        ( mZ - pnt.mZ )
+    );
+}
+
+inline const Point3 Point3::operator +( const Vector3 & vec ) const
+{
+    return Point3(
+        ( mX + vec.getX() ),
+        ( mY + vec.getY() ),
+        ( mZ + vec.getZ() )
+    );
+}
+
+inline const Point3 Point3::operator -( const Vector3 & vec ) const
+{
+    return Point3(
+        ( mX - vec.getX() ),
+        ( mY - vec.getY() ),
+        ( mZ - vec.getZ() )
+    );
+}
+
+inline Point3 & Point3::operator +=( const Vector3 & vec )
+{
+    *this = *this + vec;
+    return *this;
+}
+
+inline Point3 & Point3::operator -=( const Vector3 & vec )
+{
+    *this = *this - vec;
+    return *this;
+}
+
+inline const Point3 mulPerElem( const Point3 & pnt0, const Point3 & pnt1 )
+{
+    return Point3(
+        ( pnt0.getX() * pnt1.getX() ),
+        ( pnt0.getY() * pnt1.getY() ),
+        ( pnt0.getZ() * pnt1.getZ() )
+    );
+}
+
+inline const Point3 divPerElem( const Point3 & pnt0, const Point3 & pnt1 )
+{
+    return Point3(
+        ( pnt0.getX() / pnt1.getX() ),
+        ( pnt0.getY() / pnt1.getY() ),
+        ( pnt0.getZ() / pnt1.getZ() )
+    );
+}
+
+inline const Point3 recipPerElem( const Point3 & pnt )
+{
+    return Point3(
+        ( 1.0f / pnt.getX() ),
+        ( 1.0f / pnt.getY() ),
+        ( 1.0f / pnt.getZ() )
+    );
+}
+
+inline const Point3 sqrtPerElem( const Point3 & pnt )
+{
+    return Point3(
+        sqrtf( pnt.getX() ),
+        sqrtf( pnt.getY() ),
+        sqrtf( pnt.getZ() )
+    );
+}
+
+inline const Point3 rsqrtPerElem( const Point3 & pnt )
+{
+    return Point3(
+        ( 1.0f / sqrtf( pnt.getX() ) ),
+        ( 1.0f / sqrtf( pnt.getY() ) ),
+        ( 1.0f / sqrtf( pnt.getZ() ) )
+    );
+}
+
+inline const Point3 absPerElem( const Point3 & pnt )
+{
+    return Point3(
+        fabsf( pnt.getX() ),
+        fabsf( pnt.getY() ),
+        fabsf( pnt.getZ() )
+    );
+}
+
+inline const Point3 copySignPerElem( const Point3 & pnt0, const Point3 & pnt1 )
+{
+    return Point3(
+        ( pnt1.getX() < 0.0f )? -fabsf( pnt0.getX() ) : fabsf( pnt0.getX() ),
+        ( pnt1.getY() < 0.0f )? -fabsf( pnt0.getY() ) : fabsf( pnt0.getY() ),
+        ( pnt1.getZ() < 0.0f )? -fabsf( pnt0.getZ() ) : fabsf( pnt0.getZ() )
+    );
+}
+
+inline const Point3 maxPerElem( const Point3 & pnt0, const Point3 & pnt1 )
+{
+    return Point3(
+        (pnt0.getX() > pnt1.getX())? pnt0.getX() : pnt1.getX(),
+        (pnt0.getY() > pnt1.getY())? pnt0.getY() : pnt1.getY(),
+        (pnt0.getZ() > pnt1.getZ())? pnt0.getZ() : pnt1.getZ()
+    );
+}
+
+inline float maxElem( const Point3 & pnt )
+{
+    float result;
+    result = (pnt.getX() > pnt.getY())? pnt.getX() : pnt.getY();
+    result = (pnt.getZ() > result)? pnt.getZ() : result;
+    return result;
+}
+
+inline const Point3 minPerElem( const Point3 & pnt0, const Point3 & pnt1 )
+{
+    return Point3(
+        (pnt0.getX() < pnt1.getX())? pnt0.getX() : pnt1.getX(),
+        (pnt0.getY() < pnt1.getY())? pnt0.getY() : pnt1.getY(),
+        (pnt0.getZ() < pnt1.getZ())? pnt0.getZ() : pnt1.getZ()
+    );
+}
+
+inline float minElem( const Point3 & pnt )
+{
+    float result;
+    result = (pnt.getX() < pnt.getY())? pnt.getX() : pnt.getY();
+    result = (pnt.getZ() < result)? pnt.getZ() : result;
+    return result;
+}
+
+inline float sum( const Point3 & pnt )
+{
+    float result;
+    result = ( pnt.getX() + pnt.getY() );
+    result = ( result + pnt.getZ() );
+    return result;
+}
+
+inline const Point3 scale( const Point3 & pnt, float scaleVal )
+{
+    return mulPerElem( pnt, Point3( scaleVal ) );
+}
+
+inline const Point3 scale( const Point3 & pnt, const Vector3 & scaleVec )
+{
+    return mulPerElem( pnt, Point3( scaleVec ) );
+}
+
+inline float projection( const Point3 & pnt, const Vector3 & unitVec )
+{
+    float result;
+    result = ( pnt.getX() * unitVec.getX() );
+    result = ( result + ( pnt.getY() * unitVec.getY() ) );
+    result = ( result + ( pnt.getZ() * unitVec.getZ() ) );
+    return result;
+}
+
+inline float distSqrFromOrigin( const Point3 & pnt )
+{
+    return lengthSqr( Vector3( pnt ) );
+}
+
+inline float distFromOrigin( const Point3 & pnt )
+{
+    return length( Vector3( pnt ) );
+}
+
+inline float distSqr( const Point3 & pnt0, const Point3 & pnt1 )
+{
+    return lengthSqr( ( pnt1 - pnt0 ) );
+}
+
+inline float dist( const Point3 & pnt0, const Point3 & pnt1 )
+{
+    return length( ( pnt1 - pnt0 ) );
+}
+
+inline const Point3 select( const Point3 & pnt0, const Point3 & pnt1, bool select1 )
+{
+    return Point3(
+        ( select1 )? pnt1.getX() : pnt0.getX(),
+        ( select1 )? pnt1.getY() : pnt0.getY(),
+        ( select1 )? pnt1.getZ() : pnt0.getZ()
+    );
+}
+
+#ifdef _VECTORMATH_DEBUG
+
+inline void print( const Point3 & pnt )
+{
+    printf( "( %f %f %f )\n", pnt.getX(), pnt.getY(), pnt.getZ() );
+}
+
+inline void print( const Point3 & pnt, const char * name )
+{
+    printf( "%s: ( %f %f %f )\n", name, pnt.getX(), pnt.getY(), pnt.getZ() );
+}
+
+#endif
+
+} // namespace Aos
+} // namespace Vectormath
+
+#endif
diff --git a/test/Bullet2/vectormath/scalar/vectormath_aos.h b/test/Bullet2/vectormath/scalar/vectormath_aos.h
new file mode 100644
index 000000000..d00456dfe
--- /dev/null
+++ b/test/Bullet2/vectormath/scalar/vectormath_aos.h
@@ -0,0 +1,1872 @@
+/*
+   Copyright (C) 2009 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+
+*/
+
+#ifndef _VECTORMATH_AOS_CPP_H
+#define _VECTORMATH_AOS_CPP_H
+
+#include <math.h>
+
+#ifdef _VECTORMATH_DEBUG
+#include <stdio.h>
+#endif
+
+namespace Vectormath {
+
+namespace Aos {
+
+//-----------------------------------------------------------------------------
+// Forward Declarations
+//
+
+class Vector3;
+class Vector4;
+class Point3;
+class Quat;
+class Matrix3;
+class Matrix4;
+class Transform3;
+
+// A 3-D vector in array-of-structures format
+//
+class Vector3
+{
+    float mX;
+    float mY;
+    float mZ;
+#ifndef __GNUC__
+    float d;
+#endif
+
+public:
+    // Default constructor; does no initialization
+    // 
+    inline Vector3( ) { };
+
+    // Copy a 3-D vector
+    // 
+    inline Vector3( const Vector3 & vec );
+
+    // Construct a 3-D vector from x, y, and z elements
+    // 
+    inline Vector3( float x, float y, float z );
+
+    // Copy elements from a 3-D point into a 3-D vector
+    // 
+    explicit inline Vector3( const Point3 & pnt );
+
+    // Set all elements of a 3-D vector to the same scalar value
+    // 
+    explicit inline Vector3( float scalar );
+
+    // Assign one 3-D vector to another
+    // 
+    inline Vector3 & operator =( const Vector3 & vec );
+
+    // Set the x element of a 3-D vector
+    // 
+    inline Vector3 & setX( float x );
+
+    // Set the y element of a 3-D vector
+    // 
+    inline Vector3 & setY( float y );
+
+    // Set the z element of a 3-D vector
+    // 
+    inline Vector3 & setZ( float z );
+
+    // Get the x element of a 3-D vector
+    // 
+    inline float getX( ) const;
+
+    // Get the y element of a 3-D vector
+    // 
+    inline float getY( ) const;
+
+    // Get the z element of a 3-D vector
+    // 
+    inline float getZ( ) const;
+
+    // Set an x, y, or z element of a 3-D vector by index
+    // 
+    inline Vector3 & setElem( int idx, float value );
+
+    // Get an x, y, or z element of a 3-D vector by index
+    // 
+    inline float getElem( int idx ) const;
+
+    // Subscripting operator to set or get an element
+    // 
+    inline float & operator []( int idx );
+
+    // Subscripting operator to get an element
+    // 
+    inline float operator []( int idx ) const;
+
+    // Add two 3-D vectors
+    // 
+    inline const Vector3 operator +( const Vector3 & vec ) const;
+
+    // Subtract a 3-D vector from another 3-D vector
+    // 
+    inline const Vector3 operator -( const Vector3 & vec ) const;
+
+    // Add a 3-D vector to a 3-D point
+    // 
+    inline const Point3 operator +( const Point3 & pnt ) const;
+
+    // Multiply a 3-D vector by a scalar
+    // 
+    inline const Vector3 operator *( float scalar ) const;
+
+    // Divide a 3-D vector by a scalar
+    // 
+    inline const Vector3 operator /( float scalar ) const;
+
+    // Perform compound assignment and addition with a 3-D vector
+    // 
+    inline Vector3 & operator +=( const Vector3 & vec );
+
+    // Perform compound assignment and subtraction by a 3-D vector
+    // 
+    inline Vector3 & operator -=( const Vector3 & vec );
+
+    // Perform compound assignment and multiplication by a scalar
+    // 
+    inline Vector3 & operator *=( float scalar );
+
+    // Perform compound assignment and division by a scalar
+    // 
+    inline Vector3 & operator /=( float scalar );
+
+    // Negate all elements of a 3-D vector
+    // 
+    inline const Vector3 operator -( ) const;
+
+    // Construct x axis
+    // 
+    static inline const Vector3 xAxis( );
+
+    // Construct y axis
+    // 
+    static inline const Vector3 yAxis( );
+
+    // Construct z axis
+    // 
+    static inline const Vector3 zAxis( );
+
+}
+#ifdef __GNUC__
+__attribute__ ((aligned(16)))
+#endif
+;
+
+// Multiply a 3-D vector by a scalar
+// 
+inline const Vector3 operator *( float scalar, const Vector3 & vec );
+
+// Multiply two 3-D vectors per element
+// 
+inline const Vector3 mulPerElem( const Vector3 & vec0, const Vector3 & vec1 );
+
+// Divide two 3-D vectors per element
+// NOTE: 
+// Floating-point behavior matches standard library function divf4.
+// 
+inline const Vector3 divPerElem( const Vector3 & vec0, const Vector3 & vec1 );
+
+// Compute the reciprocal of a 3-D vector per element
+// NOTE: 
+// Floating-point behavior matches standard library function recipf4.
+// 
+inline const Vector3 recipPerElem( const Vector3 & vec );
+
+// Compute the square root of a 3-D vector per element
+// NOTE: 
+// Floating-point behavior matches standard library function sqrtf4.
+// 
+inline const Vector3 sqrtPerElem( const Vector3 & vec );
+
+// Compute the reciprocal square root of a 3-D vector per element
+// NOTE: 
+// Floating-point behavior matches standard library function rsqrtf4.
+// 
+inline const Vector3 rsqrtPerElem( const Vector3 & vec );
+
+// Compute the absolute value of a 3-D vector per element
+// 
+inline const Vector3 absPerElem( const Vector3 & vec );
+
+// Copy sign from one 3-D vector to another, per element
+// 
+inline const Vector3 copySignPerElem( const Vector3 & vec0, const Vector3 & vec1 );
+
+// Maximum of two 3-D vectors per element
+// 
+inline const Vector3 maxPerElem( const Vector3 & vec0, const Vector3 & vec1 );
+
+// Minimum of two 3-D vectors per element
+// 
+inline const Vector3 minPerElem( const Vector3 & vec0, const Vector3 & vec1 );
+
+// Maximum element of a 3-D vector
+// 
+inline float maxElem( const Vector3 & vec );
+
+// Minimum element of a 3-D vector
+// 
+inline float minElem( const Vector3 & vec );
+
+// Compute the sum of all elements of a 3-D vector
+// 
+inline float sum( const Vector3 & vec );
+
+// Compute the dot product of two 3-D vectors
+// 
+inline float dot( const Vector3 & vec0, const Vector3 & vec1 );
+
+// Compute the square of the length of a 3-D vector
+// 
+inline float lengthSqr( const Vector3 & vec );
+
+// Compute the length of a 3-D vector
+// 
+inline float length( const Vector3 & vec );
+
+// Normalize a 3-D vector
+// NOTE: 
+// The result is unpredictable when all elements of vec are at or near zero.
+// 
+inline const Vector3 normalize( const Vector3 & vec );
+
+// Compute cross product of two 3-D vectors
+// 
+inline const Vector3 cross( const Vector3 & vec0, const Vector3 & vec1 );
+
+// Outer product of two 3-D vectors
+// 
+inline const Matrix3 outer( const Vector3 & vec0, const Vector3 & vec1 );
+
+// Pre-multiply a row vector by a 3x3 matrix
+// 
+inline const Vector3 rowMul( const Vector3 & vec, const Matrix3 & mat );
+
+// Cross-product matrix of a 3-D vector
+// 
+inline const Matrix3 crossMatrix( const Vector3 & vec );
+
+// Create cross-product matrix and multiply
+// NOTE: 
+// Faster than separately creating a cross-product matrix and multiplying.
+// 
+inline const Matrix3 crossMatrixMul( const Vector3 & vec, const Matrix3 & mat );
+
+// Linear interpolation between two 3-D vectors
+// NOTE: 
+// Does not clamp t between 0 and 1.
+// 
+inline const Vector3 lerp( float t, const Vector3 & vec0, const Vector3 & vec1 );
+
+// Spherical linear interpolation between two 3-D vectors
+// NOTE: 
+// The result is unpredictable if the vectors point in opposite directions.
+// Does not clamp t between 0 and 1.
+// 
+inline const Vector3 slerp( float t, const Vector3 & unitVec0, const Vector3 & unitVec1 );
+
+// Conditionally select between two 3-D vectors
+// 
+inline const Vector3 select( const Vector3 & vec0, const Vector3 & vec1, bool select1 );
+
+// Load x, y, and z elements from the first three words of a float array.
+// 
+// 
+inline void loadXYZ( Vector3 & vec, const float * fptr );
+
+// Store x, y, and z elements of a 3-D vector in the first three words of a float array.
+// Memory area of previous 16 bytes and next 32 bytes from fptr might be accessed
+// 
+inline void storeXYZ( const Vector3 & vec, float * fptr );
+
+// Load three-half-floats as a 3-D vector
+// NOTE: 
+// This transformation does not support either denormalized numbers or NaNs.
+// 
+inline void loadHalfFloats( Vector3 & vec, const unsigned short * hfptr );
+
+// Store a 3-D vector as half-floats. Memory area of previous 16 bytes and next 32 bytes from <code><i>hfptr</i></code> might be accessed.
+// NOTE: 
+// This transformation does not support either denormalized numbers or NaNs. Memory area of previous 16 bytes and next 32 bytes from hfptr might be accessed.
+// 
+inline void storeHalfFloats( const Vector3 & vec, unsigned short * hfptr );
+
+#ifdef _VECTORMATH_DEBUG
+
+// Print a 3-D vector
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+inline void print( const Vector3 & vec );
+
+// Print a 3-D vector and an associated string identifier
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+inline void print( const Vector3 & vec, const char * name );
+
+#endif
+
+// A 4-D vector in array-of-structures format
+//
+class Vector4
+{
+    float mX;
+    float mY;
+    float mZ;
+    float mW;
+
+public:
+    // Default constructor; does no initialization
+    // 
+    inline Vector4( ) { };
+
+    // Copy a 4-D vector
+    // 
+    inline Vector4( const Vector4 & vec );
+
+    // Construct a 4-D vector from x, y, z, and w elements
+    // 
+    inline Vector4( float x, float y, float z, float w );
+
+    // Construct a 4-D vector from a 3-D vector and a scalar
+    // 
+    inline Vector4( const Vector3 & xyz, float w );
+
+    // Copy x, y, and z from a 3-D vector into a 4-D vector, and set w to 0
+    // 
+    explicit inline Vector4( const Vector3 & vec );
+
+    // Copy x, y, and z from a 3-D point into a 4-D vector, and set w to 1
+    // 
+    explicit inline Vector4( const Point3 & pnt );
+
+    // Copy elements from a quaternion into a 4-D vector
+    // 
+    explicit inline Vector4( const Quat & quat );
+
+    // Set all elements of a 4-D vector to the same scalar value
+    // 
+    explicit inline Vector4( float scalar );
+
+    // Assign one 4-D vector to another
+    // 
+    inline Vector4 & operator =( const Vector4 & vec );
+
+    // Set the x, y, and z elements of a 4-D vector
+    // NOTE: 
+    // This function does not change the w element.
+    // 
+    inline Vector4 & setXYZ( const Vector3 & vec );
+
+    // Get the x, y, and z elements of a 4-D vector
+    // 
+    inline const Vector3 getXYZ( ) const;
+
+    // Set the x element of a 4-D vector
+    // 
+    inline Vector4 & setX( float x );
+
+    // Set the y element of a 4-D vector
+    // 
+    inline Vector4 & setY( float y );
+
+    // Set the z element of a 4-D vector
+    // 
+    inline Vector4 & setZ( float z );
+
+    // Set the w element of a 4-D vector
+    // 
+    inline Vector4 & setW( float w );
+
+    // Get the x element of a 4-D vector
+    // 
+    inline float getX( ) const;
+
+    // Get the y element of a 4-D vector
+    // 
+    inline float getY( ) const;
+
+    // Get the z element of a 4-D vector
+    // 
+    inline float getZ( ) const;
+
+    // Get the w element of a 4-D vector
+    // 
+    inline float getW( ) const;
+
+    // Set an x, y, z, or w element of a 4-D vector by index
+    // 
+    inline Vector4 & setElem( int idx, float value );
+
+    // Get an x, y, z, or w element of a 4-D vector by index
+    // 
+    inline float getElem( int idx ) const;
+
+    // Subscripting operator to set or get an element
+    // 
+    inline float & operator []( int idx );
+
+    // Subscripting operator to get an element
+    // 
+    inline float operator []( int idx ) const;
+
+    // Add two 4-D vectors
+    // 
+    inline const Vector4 operator +( const Vector4 & vec ) const;
+
+    // Subtract a 4-D vector from another 4-D vector
+    // 
+    inline const Vector4 operator -( const Vector4 & vec ) const;
+
+    // Multiply a 4-D vector by a scalar
+    // 
+    inline const Vector4 operator *( float scalar ) const;
+
+    // Divide a 4-D vector by a scalar
+    // 
+    inline const Vector4 operator /( float scalar ) const;
+
+    // Perform compound assignment and addition with a 4-D vector
+    // 
+    inline Vector4 & operator +=( const Vector4 & vec );
+
+    // Perform compound assignment and subtraction by a 4-D vector
+    // 
+    inline Vector4 & operator -=( const Vector4 & vec );
+
+    // Perform compound assignment and multiplication by a scalar
+    // 
+    inline Vector4 & operator *=( float scalar );
+
+    // Perform compound assignment and division by a scalar
+    // 
+    inline Vector4 & operator /=( float scalar );
+
+    // Negate all elements of a 4-D vector
+    // 
+    inline const Vector4 operator -( ) const;
+
+    // Construct x axis
+    // 
+    static inline const Vector4 xAxis( );
+
+    // Construct y axis
+    // 
+    static inline const Vector4 yAxis( );
+
+    // Construct z axis
+    // 
+    static inline const Vector4 zAxis( );
+
+    // Construct w axis
+    // 
+    static inline const Vector4 wAxis( );
+
+}
+#ifdef __GNUC__
+__attribute__ ((aligned(16)))
+#endif
+;
+
+// Multiply a 4-D vector by a scalar
+// 
+inline const Vector4 operator *( float scalar, const Vector4 & vec );
+
+// Multiply two 4-D vectors per element
+// 
+inline const Vector4 mulPerElem( const Vector4 & vec0, const Vector4 & vec1 );
+
+// Divide two 4-D vectors per element
+// NOTE: 
+// Floating-point behavior matches standard library function divf4.
+// 
+inline const Vector4 divPerElem( const Vector4 & vec0, const Vector4 & vec1 );
+
+// Compute the reciprocal of a 4-D vector per element
+// NOTE: 
+// Floating-point behavior matches standard library function recipf4.
+// 
+inline const Vector4 recipPerElem( const Vector4 & vec );
+
+// Compute the square root of a 4-D vector per element
+// NOTE: 
+// Floating-point behavior matches standard library function sqrtf4.
+// 
+inline const Vector4 sqrtPerElem( const Vector4 & vec );
+
+// Compute the reciprocal square root of a 4-D vector per element
+// NOTE: 
+// Floating-point behavior matches standard library function rsqrtf4.
+// 
+inline const Vector4 rsqrtPerElem( const Vector4 & vec );
+
+// Compute the absolute value of a 4-D vector per element
+// 
+inline const Vector4 absPerElem( const Vector4 & vec );
+
+// Copy sign from one 4-D vector to another, per element
+// 
+inline const Vector4 copySignPerElem( const Vector4 & vec0, const Vector4 & vec1 );
+
+// Maximum of two 4-D vectors per element
+// 
+inline const Vector4 maxPerElem( const Vector4 & vec0, const Vector4 & vec1 );
+
+// Minimum of two 4-D vectors per element
+// 
+inline const Vector4 minPerElem( const Vector4 & vec0, const Vector4 & vec1 );
+
+// Maximum element of a 4-D vector
+// 
+inline float maxElem( const Vector4 & vec );
+
+// Minimum element of a 4-D vector
+// 
+inline float minElem( const Vector4 & vec );
+
+// Compute the sum of all elements of a 4-D vector
+// 
+inline float sum( const Vector4 & vec );
+
+// Compute the dot product of two 4-D vectors
+// 
+inline float dot( const Vector4 & vec0, const Vector4 & vec1 );
+
+// Compute the square of the length of a 4-D vector
+// 
+inline float lengthSqr( const Vector4 & vec );
+
+// Compute the length of a 4-D vector
+// 
+inline float length( const Vector4 & vec );
+
+// Normalize a 4-D vector
+// NOTE: 
+// The result is unpredictable when all elements of vec are at or near zero.
+// 
+inline const Vector4 normalize( const Vector4 & vec );
+
+// Outer product of two 4-D vectors
+// 
+inline const Matrix4 outer( const Vector4 & vec0, const Vector4 & vec1 );
+
+// Linear interpolation between two 4-D vectors
+// NOTE: 
+// Does not clamp t between 0 and 1.
+// 
+inline const Vector4 lerp( float t, const Vector4 & vec0, const Vector4 & vec1 );
+
+// Spherical linear interpolation between two 4-D vectors
+// NOTE: 
+// The result is unpredictable if the vectors point in opposite directions.
+// Does not clamp t between 0 and 1.
+// 
+inline const Vector4 slerp( float t, const Vector4 & unitVec0, const Vector4 & unitVec1 );
+
+// Conditionally select between two 4-D vectors
+// 
+inline const Vector4 select( const Vector4 & vec0, const Vector4 & vec1, bool select1 );
+
+// Load x, y, z, and w elements from the first four words of a float array.
+// 
+// 
+inline void loadXYZW( Vector4 & vec, const float * fptr );
+
+// Store x, y, z, and w elements of a 4-D vector in the first four words of a float array.
+// Memory area of previous 16 bytes and next 32 bytes from fptr might be accessed
+// 
+inline void storeXYZW( const Vector4 & vec, float * fptr );
+
+// Load four-half-floats as a 4-D vector
+// NOTE: 
+// This transformation does not support either denormalized numbers or NaNs.
+// 
+inline void loadHalfFloats( Vector4 & vec, const unsigned short * hfptr );
+
+// Store a 4-D vector as half-floats. Memory area of previous 16 bytes and next 32 bytes from <code><i>hfptr</i></code> might be accessed.
+// NOTE: 
+// This transformation does not support either denormalized numbers or NaNs. Memory area of previous 16 bytes and next 32 bytes from hfptr might be accessed.
+// 
+inline void storeHalfFloats( const Vector4 & vec, unsigned short * hfptr );
+
+#ifdef _VECTORMATH_DEBUG
+
+// Print a 4-D vector
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+inline void print( const Vector4 & vec );
+
+// Print a 4-D vector and an associated string identifier
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+inline void print( const Vector4 & vec, const char * name );
+
+#endif
+
+// A 3-D point in array-of-structures format
+//
+class Point3
+{
+    float mX;
+    float mY;
+    float mZ;
+#ifndef __GNUC__
+    float d;
+#endif
+
+public:
+    // Default constructor; does no initialization
+    // 
+    inline Point3( ) { };
+
+    // Copy a 3-D point
+    // 
+    inline Point3( const Point3 & pnt );
+
+    // Construct a 3-D point from x, y, and z elements
+    // 
+    inline Point3( float x, float y, float z );
+
+    // Copy elements from a 3-D vector into a 3-D point
+    // 
+    explicit inline Point3( const Vector3 & vec );
+
+    // Set all elements of a 3-D point to the same scalar value
+    // 
+    explicit inline Point3( float scalar );
+
+    // Assign one 3-D point to another
+    // 
+    inline Point3 & operator =( const Point3 & pnt );
+
+    // Set the x element of a 3-D point
+    // 
+    inline Point3 & setX( float x );
+
+    // Set the y element of a 3-D point
+    // 
+    inline Point3 & setY( float y );
+
+    // Set the z element of a 3-D point
+    // 
+    inline Point3 & setZ( float z );
+
+    // Get the x element of a 3-D point
+    // 
+    inline float getX( ) const;
+
+    // Get the y element of a 3-D point
+    // 
+    inline float getY( ) const;
+
+    // Get the z element of a 3-D point
+    // 
+    inline float getZ( ) const;
+
+    // Set an x, y, or z element of a 3-D point by index
+    // 
+    inline Point3 & setElem( int idx, float value );
+
+    // Get an x, y, or z element of a 3-D point by index
+    // 
+    inline float getElem( int idx ) const;
+
+    // Subscripting operator to set or get an element
+    // 
+    inline float & operator []( int idx );
+
+    // Subscripting operator to get an element
+    // 
+    inline float operator []( int idx ) const;
+
+    // Subtract a 3-D point from another 3-D point
+    // 
+    inline const Vector3 operator -( const Point3 & pnt ) const;
+
+    // Add a 3-D point to a 3-D vector
+    // 
+    inline const Point3 operator +( const Vector3 & vec ) const;
+
+    // Subtract a 3-D vector from a 3-D point
+    // 
+    inline const Point3 operator -( const Vector3 & vec ) const;
+
+    // Perform compound assignment and addition with a 3-D vector
+    // 
+    inline Point3 & operator +=( const Vector3 & vec );
+
+    // Perform compound assignment and subtraction by a 3-D vector
+    // 
+    inline Point3 & operator -=( const Vector3 & vec );
+
+}
+#ifdef __GNUC__
+__attribute__ ((aligned(16)))
+#endif
+;
+
+// Multiply two 3-D points per element
+// 
+inline const Point3 mulPerElem( const Point3 & pnt0, const Point3 & pnt1 );
+
+// Divide two 3-D points per element
+// NOTE: 
+// Floating-point behavior matches standard library function divf4.
+// 
+inline const Point3 divPerElem( const Point3 & pnt0, const Point3 & pnt1 );
+
+// Compute the reciprocal of a 3-D point per element
+// NOTE: 
+// Floating-point behavior matches standard library function recipf4.
+// 
+inline const Point3 recipPerElem( const Point3 & pnt );
+
+// Compute the square root of a 3-D point per element
+// NOTE: 
+// Floating-point behavior matches standard library function sqrtf4.
+// 
+inline const Point3 sqrtPerElem( const Point3 & pnt );
+
+// Compute the reciprocal square root of a 3-D point per element
+// NOTE: 
+// Floating-point behavior matches standard library function rsqrtf4.
+// 
+inline const Point3 rsqrtPerElem( const Point3 & pnt );
+
+// Compute the absolute value of a 3-D point per element
+// 
+inline const Point3 absPerElem( const Point3 & pnt );
+
+// Copy sign from one 3-D point to another, per element
+// 
+inline const Point3 copySignPerElem( const Point3 & pnt0, const Point3 & pnt1 );
+
+// Maximum of two 3-D points per element
+// 
+inline const Point3 maxPerElem( const Point3 & pnt0, const Point3 & pnt1 );
+
+// Minimum of two 3-D points per element
+// 
+inline const Point3 minPerElem( const Point3 & pnt0, const Point3 & pnt1 );
+
+// Maximum element of a 3-D point
+// 
+inline float maxElem( const Point3 & pnt );
+
+// Minimum element of a 3-D point
+// 
+inline float minElem( const Point3 & pnt );
+
+// Compute the sum of all elements of a 3-D point
+// 
+inline float sum( const Point3 & pnt );
+
+// Apply uniform scale to a 3-D point
+// 
+inline const Point3 scale( const Point3 & pnt, float scaleVal );
+
+// Apply non-uniform scale to a 3-D point
+// 
+inline const Point3 scale( const Point3 & pnt, const Vector3 & scaleVec );
+
+// Scalar projection of a 3-D point on a unit-length 3-D vector
+// 
+inline float projection( const Point3 & pnt, const Vector3 & unitVec );
+
+// Compute the square of the distance of a 3-D point from the coordinate-system origin
+// 
+inline float distSqrFromOrigin( const Point3 & pnt );
+
+// Compute the distance of a 3-D point from the coordinate-system origin
+// 
+inline float distFromOrigin( const Point3 & pnt );
+
+// Compute the square of the distance between two 3-D points
+// 
+inline float distSqr( const Point3 & pnt0, const Point3 & pnt1 );
+
+// Compute the distance between two 3-D points
+// 
+inline float dist( const Point3 & pnt0, const Point3 & pnt1 );
+
+// Linear interpolation between two 3-D points
+// NOTE: 
+// Does not clamp t between 0 and 1.
+// 
+inline const Point3 lerp( float t, const Point3 & pnt0, const Point3 & pnt1 );
+
+// Conditionally select between two 3-D points
+// 
+inline const Point3 select( const Point3 & pnt0, const Point3 & pnt1, bool select1 );
+
+// Load x, y, and z elements from the first three words of a float array.
+// 
+// 
+inline void loadXYZ( Point3 & pnt, const float * fptr );
+
+// Store x, y, and z elements of a 3-D point in the first three words of a float array.
+// Memory area of previous 16 bytes and next 32 bytes from fptr might be accessed
+// 
+inline void storeXYZ( const Point3 & pnt, float * fptr );
+
+// Load three-half-floats as a 3-D point
+// NOTE: 
+// This transformation does not support either denormalized numbers or NaNs.
+// 
+inline void loadHalfFloats( Point3 & pnt, const unsigned short * hfptr );
+
+// Store a 3-D point as half-floats. Memory area of previous 16 bytes and next 32 bytes from <code><i>hfptr</i></code> might be accessed.
+// NOTE: 
+// This transformation does not support either denormalized numbers or NaNs. Memory area of previous 16 bytes and next 32 bytes from hfptr might be accessed.
+// 
+inline void storeHalfFloats( const Point3 & pnt, unsigned short * hfptr );
+
+#ifdef _VECTORMATH_DEBUG
+
+// Print a 3-D point
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+inline void print( const Point3 & pnt );
+
+// Print a 3-D point and an associated string identifier
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+inline void print( const Point3 & pnt, const char * name );
+
+#endif
+
+// A quaternion in array-of-structures format
+//
+class Quat
+{
+    float mX;
+    float mY;
+    float mZ;
+    float mW;
+
+public:
+    // Default constructor; does no initialization
+    // 
+    inline Quat( ) { };
+
+    // Copy a quaternion
+    // 
+    inline Quat( const Quat & quat );
+
+    // Construct a quaternion from x, y, z, and w elements
+    // 
+    inline Quat( float x, float y, float z, float w );
+
+    // Construct a quaternion from a 3-D vector and a scalar
+    // 
+    inline Quat( const Vector3 & xyz, float w );
+
+    // Copy elements from a 4-D vector into a quaternion
+    // 
+    explicit inline Quat( const Vector4 & vec );
+
+    // Convert a rotation matrix to a unit-length quaternion
+    // 
+    explicit inline Quat( const Matrix3 & rotMat );
+
+    // Set all elements of a quaternion to the same scalar value
+    // 
+    explicit inline Quat( float scalar );
+
+    // Assign one quaternion to another
+    // 
+    inline Quat & operator =( const Quat & quat );
+
+    // Set the x, y, and z elements of a quaternion
+    // NOTE: 
+    // This function does not change the w element.
+    // 
+    inline Quat & setXYZ( const Vector3 & vec );
+
+    // Get the x, y, and z elements of a quaternion
+    // 
+    inline const Vector3 getXYZ( ) const;
+
+    // Set the x element of a quaternion
+    // 
+    inline Quat & setX( float x );
+
+    // Set the y element of a quaternion
+    // 
+    inline Quat & setY( float y );
+
+    // Set the z element of a quaternion
+    // 
+    inline Quat & setZ( float z );
+
+    // Set the w element of a quaternion
+    // 
+    inline Quat & setW( float w );
+
+    // Get the x element of a quaternion
+    // 
+    inline float getX( ) const;
+
+    // Get the y element of a quaternion
+    // 
+    inline float getY( ) const;
+
+    // Get the z element of a quaternion
+    // 
+    inline float getZ( ) const;
+
+    // Get the w element of a quaternion
+    // 
+    inline float getW( ) const;
+
+    // Set an x, y, z, or w element of a quaternion by index
+    // 
+    inline Quat & setElem( int idx, float value );
+
+    // Get an x, y, z, or w element of a quaternion by index
+    // 
+    inline float getElem( int idx ) const;
+
+    // Subscripting operator to set or get an element
+    // 
+    inline float & operator []( int idx );
+
+    // Subscripting operator to get an element
+    // 
+    inline float operator []( int idx ) const;
+
+    // Add two quaternions
+    // 
+    inline const Quat operator +( const Quat & quat ) const;
+
+    // Subtract a quaternion from another quaternion
+    // 
+    inline const Quat operator -( const Quat & quat ) const;
+
+    // Multiply two quaternions
+    // 
+    inline const Quat operator *( const Quat & quat ) const;
+
+    // Multiply a quaternion by a scalar
+    // 
+    inline const Quat operator *( float scalar ) const;
+
+    // Divide a quaternion by a scalar
+    // 
+    inline const Quat operator /( float scalar ) const;
+
+    // Perform compound assignment and addition with a quaternion
+    // 
+    inline Quat & operator +=( const Quat & quat );
+
+    // Perform compound assignment and subtraction by a quaternion
+    // 
+    inline Quat & operator -=( const Quat & quat );
+
+    // Perform compound assignment and multiplication by a quaternion
+    // 
+    inline Quat & operator *=( const Quat & quat );
+
+    // Perform compound assignment and multiplication by a scalar
+    // 
+    inline Quat & operator *=( float scalar );
+
+    // Perform compound assignment and division by a scalar
+    // 
+    inline Quat & operator /=( float scalar );
+
+    // Negate all elements of a quaternion
+    // 
+    inline const Quat operator -( ) const;
+
+    // Construct an identity quaternion
+    // 
+    static inline const Quat identity( );
+
+    // Construct a quaternion to rotate between two unit-length 3-D vectors
+    // NOTE: 
+    // The result is unpredictable if unitVec0 and unitVec1 point in opposite directions.
+    // 
+    static inline const Quat rotation( const Vector3 & unitVec0, const Vector3 & unitVec1 );
+
+    // Construct a quaternion to rotate around a unit-length 3-D vector
+    // 
+    static inline const Quat rotation( float radians, const Vector3 & unitVec );
+
+    // Construct a quaternion to rotate around the x axis
+    // 
+    static inline const Quat rotationX( float radians );
+
+    // Construct a quaternion to rotate around the y axis
+    // 
+    static inline const Quat rotationY( float radians );
+
+    // Construct a quaternion to rotate around the z axis
+    // 
+    static inline const Quat rotationZ( float radians );
+
+}
+#ifdef __GNUC__
+__attribute__ ((aligned(16)))
+#endif
+;
+
+// Multiply a quaternion by a scalar
+// 
+inline const Quat operator *( float scalar, const Quat & quat );
+
+// Compute the conjugate of a quaternion
+// 
+inline const Quat conj( const Quat & quat );
+
+// Use a unit-length quaternion to rotate a 3-D vector
+// 
+inline const Vector3 rotate( const Quat & unitQuat, const Vector3 & vec );
+
+// Compute the dot product of two quaternions
+// 
+inline float dot( const Quat & quat0, const Quat & quat1 );
+
+// Compute the norm of a quaternion
+// 
+inline float norm( const Quat & quat );
+
+// Compute the length of a quaternion
+// 
+inline float length( const Quat & quat );
+
+// Normalize a quaternion
+// NOTE: 
+// The result is unpredictable when all elements of quat are at or near zero.
+// 
+inline const Quat normalize( const Quat & quat );
+
+// Linear interpolation between two quaternions
+// NOTE: 
+// Does not clamp t between 0 and 1.
+// 
+inline const Quat lerp( float t, const Quat & quat0, const Quat & quat1 );
+
+// Spherical linear interpolation between two quaternions
+// NOTE: 
+// Interpolates along the shortest path between orientations.
+// Does not clamp t between 0 and 1.
+// 
+inline const Quat slerp( float t, const Quat & unitQuat0, const Quat & unitQuat1 );
+
+// Spherical quadrangle interpolation
+// 
+inline const Quat squad( float t, const Quat & unitQuat0, const Quat & unitQuat1, const Quat & unitQuat2, const Quat & unitQuat3 );
+
+// Conditionally select between two quaternions
+// 
+inline const Quat select( const Quat & quat0, const Quat & quat1, bool select1 );
+
+// Load x, y, z, and w elements from the first four words of a float array.
+// 
+// 
+inline void loadXYZW( Quat & quat, const float * fptr );
+
+// Store x, y, z, and w elements of a quaternion in the first four words of a float array.
+// Memory area of previous 16 bytes and next 32 bytes from fptr might be accessed
+// 
+inline void storeXYZW( const Quat & quat, float * fptr );
+
+#ifdef _VECTORMATH_DEBUG
+
+// Print a quaternion
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+inline void print( const Quat & quat );
+
+// Print a quaternion and an associated string identifier
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+inline void print( const Quat & quat, const char * name );
+
+#endif
+
+// A 3x3 matrix in array-of-structures format
+//
+class Matrix3
+{
+    Vector3 mCol0;
+    Vector3 mCol1;
+    Vector3 mCol2;
+
+public:
+    // Default constructor; does no initialization
+    // 
+    inline Matrix3( ) { };
+
+    // Copy a 3x3 matrix
+    // 
+    inline Matrix3( const Matrix3 & mat );
+
+    // Construct a 3x3 matrix containing the specified columns
+    // 
+    inline Matrix3( const Vector3 & col0, const Vector3 & col1, const Vector3 & col2 );
+
+    // Construct a 3x3 rotation matrix from a unit-length quaternion
+    // 
+    explicit inline Matrix3( const Quat & unitQuat );
+
+    // Set all elements of a 3x3 matrix to the same scalar value
+    // 
+    explicit inline Matrix3( float scalar );
+
+    // Assign one 3x3 matrix to another
+    // 
+    inline Matrix3 & operator =( const Matrix3 & mat );
+
+    // Set column 0 of a 3x3 matrix
+    // 
+    inline Matrix3 & setCol0( const Vector3 & col0 );
+
+    // Set column 1 of a 3x3 matrix
+    // 
+    inline Matrix3 & setCol1( const Vector3 & col1 );
+
+    // Set column 2 of a 3x3 matrix
+    // 
+    inline Matrix3 & setCol2( const Vector3 & col2 );
+
+    // Get column 0 of a 3x3 matrix
+    // 
+    inline const Vector3 getCol0( ) const;
+
+    // Get column 1 of a 3x3 matrix
+    // 
+    inline const Vector3 getCol1( ) const;
+
+    // Get column 2 of a 3x3 matrix
+    // 
+    inline const Vector3 getCol2( ) const;
+
+    // Set the column of a 3x3 matrix referred to by the specified index
+    // 
+    inline Matrix3 & setCol( int col, const Vector3 & vec );
+
+    // Set the row of a 3x3 matrix referred to by the specified index
+    // 
+    inline Matrix3 & setRow( int row, const Vector3 & vec );
+
+    // Get the column of a 3x3 matrix referred to by the specified index
+    // 
+    inline const Vector3 getCol( int col ) const;
+
+    // Get the row of a 3x3 matrix referred to by the specified index
+    // 
+    inline const Vector3 getRow( int row ) const;
+
+    // Subscripting operator to set or get a column
+    // 
+    inline Vector3 & operator []( int col );
+
+    // Subscripting operator to get a column
+    // 
+    inline const Vector3 operator []( int col ) const;
+
+    // Set the element of a 3x3 matrix referred to by column and row indices
+    // 
+    inline Matrix3 & setElem( int col, int row, float val );
+
+    // Get the element of a 3x3 matrix referred to by column and row indices
+    // 
+    inline float getElem( int col, int row ) const;
+
+    // Add two 3x3 matrices
+    // 
+    inline const Matrix3 operator +( const Matrix3 & mat ) const;
+
+    // Subtract a 3x3 matrix from another 3x3 matrix
+    // 
+    inline const Matrix3 operator -( const Matrix3 & mat ) const;
+
+    // Negate all elements of a 3x3 matrix
+    // 
+    inline const Matrix3 operator -( ) const;
+
+    // Multiply a 3x3 matrix by a scalar
+    // 
+    inline const Matrix3 operator *( float scalar ) const;
+
+    // Multiply a 3x3 matrix by a 3-D vector
+    // 
+    inline const Vector3 operator *( const Vector3 & vec ) const;
+
+    // Multiply two 3x3 matrices
+    // 
+    inline const Matrix3 operator *( const Matrix3 & mat ) const;
+
+    // Perform compound assignment and addition with a 3x3 matrix
+    // 
+    inline Matrix3 & operator +=( const Matrix3 & mat );
+
+    // Perform compound assignment and subtraction by a 3x3 matrix
+    // 
+    inline Matrix3 & operator -=( const Matrix3 & mat );
+
+    // Perform compound assignment and multiplication by a scalar
+    // 
+    inline Matrix3 & operator *=( float scalar );
+
+    // Perform compound assignment and multiplication by a 3x3 matrix
+    // 
+    inline Matrix3 & operator *=( const Matrix3 & mat );
+
+    // Construct an identity 3x3 matrix
+    // 
+    static inline const Matrix3 identity( );
+
+    // Construct a 3x3 matrix to rotate around the x axis
+    // 
+    static inline const Matrix3 rotationX( float radians );
+
+    // Construct a 3x3 matrix to rotate around the y axis
+    // 
+    static inline const Matrix3 rotationY( float radians );
+
+    // Construct a 3x3 matrix to rotate around the z axis
+    // 
+    static inline const Matrix3 rotationZ( float radians );
+
+    // Construct a 3x3 matrix to rotate around the x, y, and z axes
+    // 
+    static inline const Matrix3 rotationZYX( const Vector3 & radiansXYZ );
+
+    // Construct a 3x3 matrix to rotate around a unit-length 3-D vector
+    // 
+    static inline const Matrix3 rotation( float radians, const Vector3 & unitVec );
+
+    // Construct a rotation matrix from a unit-length quaternion
+    // 
+    static inline const Matrix3 rotation( const Quat & unitQuat );
+
+    // Construct a 3x3 matrix to perform scaling
+    // 
+    static inline const Matrix3 scale( const Vector3 & scaleVec );
+
+};
+// Multiply a 3x3 matrix by a scalar
+// 
+inline const Matrix3 operator *( float scalar, const Matrix3 & mat );
+
+// Append (post-multiply) a scale transformation to a 3x3 matrix
+// NOTE: 
+// Faster than creating and multiplying a scale transformation matrix.
+// 
+inline const Matrix3 appendScale( const Matrix3 & mat, const Vector3 & scaleVec );
+
+// Prepend (pre-multiply) a scale transformation to a 3x3 matrix
+// NOTE: 
+// Faster than creating and multiplying a scale transformation matrix.
+// 
+inline const Matrix3 prependScale( const Vector3 & scaleVec, const Matrix3 & mat );
+
+// Multiply two 3x3 matrices per element
+// 
+inline const Matrix3 mulPerElem( const Matrix3 & mat0, const Matrix3 & mat1 );
+
+// Compute the absolute value of a 3x3 matrix per element
+// 
+inline const Matrix3 absPerElem( const Matrix3 & mat );
+
+// Transpose of a 3x3 matrix
+// 
+inline const Matrix3 transpose( const Matrix3 & mat );
+
+// Compute the inverse of a 3x3 matrix
+// NOTE: 
+// Result is unpredictable when the determinant of mat is equal to or near 0.
+// 
+inline const Matrix3 inverse( const Matrix3 & mat );
+
+// Determinant of a 3x3 matrix
+// 
+inline float determinant( const Matrix3 & mat );
+
+// Conditionally select between two 3x3 matrices
+// 
+inline const Matrix3 select( const Matrix3 & mat0, const Matrix3 & mat1, bool select1 );
+
+#ifdef _VECTORMATH_DEBUG
+
+// Print a 3x3 matrix
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+inline void print( const Matrix3 & mat );
+
+// Print a 3x3 matrix and an associated string identifier
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+inline void print( const Matrix3 & mat, const char * name );
+
+#endif
+
+// A 4x4 matrix in array-of-structures format
+//
+class Matrix4
+{
+    Vector4 mCol0;
+    Vector4 mCol1;
+    Vector4 mCol2;
+    Vector4 mCol3;
+
+public:
+    // Default constructor; does no initialization
+    // 
+    inline Matrix4( ) { };
+
+    // Copy a 4x4 matrix
+    // 
+    inline Matrix4( const Matrix4 & mat );
+
+    // Construct a 4x4 matrix containing the specified columns
+    // 
+    inline Matrix4( const Vector4 & col0, const Vector4 & col1, const Vector4 & col2, const Vector4 & col3 );
+
+    // Construct a 4x4 matrix from a 3x4 transformation matrix
+    // 
+    explicit inline Matrix4( const Transform3 & mat );
+
+    // Construct a 4x4 matrix from a 3x3 matrix and a 3-D vector
+    // 
+    inline Matrix4( const Matrix3 & mat, const Vector3 & translateVec );
+
+    // Construct a 4x4 matrix from a unit-length quaternion and a 3-D vector
+    // 
+    inline Matrix4( const Quat & unitQuat, const Vector3 & translateVec );
+
+    // Set all elements of a 4x4 matrix to the same scalar value
+    // 
+    explicit inline Matrix4( float scalar );
+
+    // Assign one 4x4 matrix to another
+    // 
+    inline Matrix4 & operator =( const Matrix4 & mat );
+
+    // Set the upper-left 3x3 submatrix
+    // NOTE: 
+    // This function does not change the bottom row elements.
+    // 
+    inline Matrix4 & setUpper3x3( const Matrix3 & mat3 );
+
+    // Get the upper-left 3x3 submatrix of a 4x4 matrix
+    // 
+    inline const Matrix3 getUpper3x3( ) const;
+
+    // Set translation component
+    // NOTE: 
+    // This function does not change the bottom row elements.
+    // 
+    inline Matrix4 & setTranslation( const Vector3 & translateVec );
+
+    // Get the translation component of a 4x4 matrix
+    // 
+    inline const Vector3 getTranslation( ) const;
+
+    // Set column 0 of a 4x4 matrix
+    // 
+    inline Matrix4 & setCol0( const Vector4 & col0 );
+
+    // Set column 1 of a 4x4 matrix
+    // 
+    inline Matrix4 & setCol1( const Vector4 & col1 );
+
+    // Set column 2 of a 4x4 matrix
+    // 
+    inline Matrix4 & setCol2( const Vector4 & col2 );
+
+    // Set column 3 of a 4x4 matrix
+    // 
+    inline Matrix4 & setCol3( const Vector4 & col3 );
+
+    // Get column 0 of a 4x4 matrix
+    // 
+    inline const Vector4 getCol0( ) const;
+
+    // Get column 1 of a 4x4 matrix
+    // 
+    inline const Vector4 getCol1( ) const;
+
+    // Get column 2 of a 4x4 matrix
+    // 
+    inline const Vector4 getCol2( ) const;
+
+    // Get column 3 of a 4x4 matrix
+    // 
+    inline const Vector4 getCol3( ) const;
+
+    // Set the column of a 4x4 matrix referred to by the specified index
+    // 
+    inline Matrix4 & setCol( int col, const Vector4 & vec );
+
+    // Set the row of a 4x4 matrix referred to by the specified index
+    // 
+    inline Matrix4 & setRow( int row, const Vector4 & vec );
+
+    // Get the column of a 4x4 matrix referred to by the specified index
+    // 
+    inline const Vector4 getCol( int col ) const;
+
+    // Get the row of a 4x4 matrix referred to by the specified index
+    // 
+    inline const Vector4 getRow( int row ) const;
+
+    // Subscripting operator to set or get a column
+    // 
+    inline Vector4 & operator []( int col );
+
+    // Subscripting operator to get a column
+    // 
+    inline const Vector4 operator []( int col ) const;
+
+    // Set the element of a 4x4 matrix referred to by column and row indices
+    // 
+    inline Matrix4 & setElem( int col, int row, float val );
+
+    // Get the element of a 4x4 matrix referred to by column and row indices
+    // 
+    inline float getElem( int col, int row ) const;
+
+    // Add two 4x4 matrices
+    // 
+    inline const Matrix4 operator +( const Matrix4 & mat ) const;
+
+    // Subtract a 4x4 matrix from another 4x4 matrix
+    // 
+    inline const Matrix4 operator -( const Matrix4 & mat ) const;
+
+    // Negate all elements of a 4x4 matrix
+    // 
+    inline const Matrix4 operator -( ) const;
+
+    // Multiply a 4x4 matrix by a scalar
+    // 
+    inline const Matrix4 operator *( float scalar ) const;
+
+    // Multiply a 4x4 matrix by a 4-D vector
+    // 
+    inline const Vector4 operator *( const Vector4 & vec ) const;
+
+    // Multiply a 4x4 matrix by a 3-D vector
+    // 
+    inline const Vector4 operator *( const Vector3 & vec ) const;
+
+    // Multiply a 4x4 matrix by a 3-D point
+    // 
+    inline const Vector4 operator *( const Point3 & pnt ) const;
+
+    // Multiply two 4x4 matrices
+    // 
+    inline const Matrix4 operator *( const Matrix4 & mat ) const;
+
+    // Multiply a 4x4 matrix by a 3x4 transformation matrix
+    // 
+    inline const Matrix4 operator *( const Transform3 & tfrm ) const;
+
+    // Perform compound assignment and addition with a 4x4 matrix
+    // 
+    inline Matrix4 & operator +=( const Matrix4 & mat );
+
+    // Perform compound assignment and subtraction by a 4x4 matrix
+    // 
+    inline Matrix4 & operator -=( const Matrix4 & mat );
+
+    // Perform compound assignment and multiplication by a scalar
+    // 
+    inline Matrix4 & operator *=( float scalar );
+
+    // Perform compound assignment and multiplication by a 4x4 matrix
+    // 
+    inline Matrix4 & operator *=( const Matrix4 & mat );
+
+    // Perform compound assignment and multiplication by a 3x4 transformation matrix
+    // 
+    inline Matrix4 & operator *=( const Transform3 & tfrm );
+
+    // Construct an identity 4x4 matrix
+    // 
+    static inline const Matrix4 identity( );
+
+    // Construct a 4x4 matrix to rotate around the x axis
+    // 
+    static inline const Matrix4 rotationX( float radians );
+
+    // Construct a 4x4 matrix to rotate around the y axis
+    // 
+    static inline const Matrix4 rotationY( float radians );
+
+    // Construct a 4x4 matrix to rotate around the z axis
+    // 
+    static inline const Matrix4 rotationZ( float radians );
+
+    // Construct a 4x4 matrix to rotate around the x, y, and z axes
+    // 
+    static inline const Matrix4 rotationZYX( const Vector3 & radiansXYZ );
+
+    // Construct a 4x4 matrix to rotate around a unit-length 3-D vector
+    // 
+    static inline const Matrix4 rotation( float radians, const Vector3 & unitVec );
+
+    // Construct a rotation matrix from a unit-length quaternion
+    // 
+    static inline const Matrix4 rotation( const Quat & unitQuat );
+
+    // Construct a 4x4 matrix to perform scaling
+    // 
+    static inline const Matrix4 scale( const Vector3 & scaleVec );
+
+    // Construct a 4x4 matrix to perform translation
+    // 
+    static inline const Matrix4 translation( const Vector3 & translateVec );
+
+    // Construct viewing matrix based on eye position, position looked at, and up direction
+    // 
+    static inline const Matrix4 lookAt( const Point3 & eyePos, const Point3 & lookAtPos, const Vector3 & upVec );
+
+    // Construct a perspective projection matrix
+    // 
+    static inline const Matrix4 perspective( float fovyRadians, float aspect, float zNear, float zFar );
+
+    // Construct a perspective projection matrix based on frustum
+    // 
+    static inline const Matrix4 frustum( float left, float right, float bottom, float top, float zNear, float zFar );
+
+    // Construct an orthographic projection matrix
+    // 
+    static inline const Matrix4 orthographic( float left, float right, float bottom, float top, float zNear, float zFar );
+
+};
+// Multiply a 4x4 matrix by a scalar
+// 
+inline const Matrix4 operator *( float scalar, const Matrix4 & mat );
+
+// Append (post-multiply) a scale transformation to a 4x4 matrix
+// NOTE: 
+// Faster than creating and multiplying a scale transformation matrix.
+// 
+inline const Matrix4 appendScale( const Matrix4 & mat, const Vector3 & scaleVec );
+
+// Prepend (pre-multiply) a scale transformation to a 4x4 matrix
+// NOTE: 
+// Faster than creating and multiplying a scale transformation matrix.
+// 
+inline const Matrix4 prependScale( const Vector3 & scaleVec, const Matrix4 & mat );
+
+// Multiply two 4x4 matrices per element
+// 
+inline const Matrix4 mulPerElem( const Matrix4 & mat0, const Matrix4 & mat1 );
+
+// Compute the absolute value of a 4x4 matrix per element
+// 
+inline const Matrix4 absPerElem( const Matrix4 & mat );
+
+// Transpose of a 4x4 matrix
+// 
+inline const Matrix4 transpose( const Matrix4 & mat );
+
+// Compute the inverse of a 4x4 matrix
+// NOTE: 
+// Result is unpredictable when the determinant of mat is equal to or near 0.
+// 
+inline const Matrix4 inverse( const Matrix4 & mat );
+
+// Compute the inverse of a 4x4 matrix, which is expected to be an affine matrix
+// NOTE: 
+// This can be used to achieve better performance than a general inverse when the specified 4x4 matrix meets the given restrictions.  The result is unpredictable when the determinant of mat is equal to or near 0.
+// 
+inline const Matrix4 affineInverse( const Matrix4 & mat );
+
+// Compute the inverse of a 4x4 matrix, which is expected to be an affine matrix with an orthogonal upper-left 3x3 submatrix
+// NOTE: 
+// This can be used to achieve better performance than a general inverse when the specified 4x4 matrix meets the given restrictions.
+// 
+inline const Matrix4 orthoInverse( const Matrix4 & mat );
+
+// Determinant of a 4x4 matrix
+// 
+inline float determinant( const Matrix4 & mat );
+
+// Conditionally select between two 4x4 matrices
+// 
+inline const Matrix4 select( const Matrix4 & mat0, const Matrix4 & mat1, bool select1 );
+
+#ifdef _VECTORMATH_DEBUG
+
+// Print a 4x4 matrix
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+inline void print( const Matrix4 & mat );
+
+// Print a 4x4 matrix and an associated string identifier
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+inline void print( const Matrix4 & mat, const char * name );
+
+#endif
+
+// A 3x4 transformation matrix in array-of-structures format
+//
+class Transform3
+{
+    Vector3 mCol0;
+    Vector3 mCol1;
+    Vector3 mCol2;
+    Vector3 mCol3;
+
+public:
+    // Default constructor; does no initialization
+    // 
+    inline Transform3( ) { };
+
+    // Copy a 3x4 transformation matrix
+    // 
+    inline Transform3( const Transform3 & tfrm );
+
+    // Construct a 3x4 transformation matrix containing the specified columns
+    // 
+    inline Transform3( const Vector3 & col0, const Vector3 & col1, const Vector3 & col2, const Vector3 & col3 );
+
+    // Construct a 3x4 transformation matrix from a 3x3 matrix and a 3-D vector
+    // 
+    inline Transform3( const Matrix3 & tfrm, const Vector3 & translateVec );
+
+    // Construct a 3x4 transformation matrix from a unit-length quaternion and a 3-D vector
+    // 
+    inline Transform3( const Quat & unitQuat, const Vector3 & translateVec );
+
+    // Set all elements of a 3x4 transformation matrix to the same scalar value
+    // 
+    explicit inline Transform3( float scalar );
+
+    // Assign one 3x4 transformation matrix to another
+    // 
+    inline Transform3 & operator =( const Transform3 & tfrm );
+
+    // Set the upper-left 3x3 submatrix
+    // 
+    inline Transform3 & setUpper3x3( const Matrix3 & mat3 );
+
+    // Get the upper-left 3x3 submatrix of a 3x4 transformation matrix
+    // 
+    inline const Matrix3 getUpper3x3( ) const;
+
+    // Set translation component
+    // 
+    inline Transform3 & setTranslation( const Vector3 & translateVec );
+
+    // Get the translation component of a 3x4 transformation matrix
+    // 
+    inline const Vector3 getTranslation( ) const;
+
+    // Set column 0 of a 3x4 transformation matrix
+    // 
+    inline Transform3 & setCol0( const Vector3 & col0 );
+
+    // Set column 1 of a 3x4 transformation matrix
+    // 
+    inline Transform3 & setCol1( const Vector3 & col1 );
+
+    // Set column 2 of a 3x4 transformation matrix
+    // 
+    inline Transform3 & setCol2( const Vector3 & col2 );
+
+    // Set column 3 of a 3x4 transformation matrix
+    // 
+    inline Transform3 & setCol3( const Vector3 & col3 );
+
+    // Get column 0 of a 3x4 transformation matrix
+    // 
+    inline const Vector3 getCol0( ) const;
+
+    // Get column 1 of a 3x4 transformation matrix
+    // 
+    inline const Vector3 getCol1( ) const;
+
+    // Get column 2 of a 3x4 transformation matrix
+    // 
+    inline const Vector3 getCol2( ) const;
+
+    // Get column 3 of a 3x4 transformation matrix
+    // 
+    inline const Vector3 getCol3( ) const;
+
+    // Set the column of a 3x4 transformation matrix referred to by the specified index
+    // 
+    inline Transform3 & setCol( int col, const Vector3 & vec );
+
+    // Set the row of a 3x4 transformation matrix referred to by the specified index
+    // 
+    inline Transform3 & setRow( int row, const Vector4 & vec );
+
+    // Get the column of a 3x4 transformation matrix referred to by the specified index
+    // 
+    inline const Vector3 getCol( int col ) const;
+
+    // Get the row of a 3x4 transformation matrix referred to by the specified index
+    // 
+    inline const Vector4 getRow( int row ) const;
+
+    // Subscripting operator to set or get a column
+    // 
+    inline Vector3 & operator []( int col );
+
+    // Subscripting operator to get a column
+    // 
+    inline const Vector3 operator []( int col ) const;
+
+    // Set the element of a 3x4 transformation matrix referred to by column and row indices
+    // 
+    inline Transform3 & setElem( int col, int row, float val );
+
+    // Get the element of a 3x4 transformation matrix referred to by column and row indices
+    // 
+    inline float getElem( int col, int row ) const;
+
+    // Multiply a 3x4 transformation matrix by a 3-D vector
+    // 
+    inline const Vector3 operator *( const Vector3 & vec ) const;
+
+    // Multiply a 3x4 transformation matrix by a 3-D point
+    // 
+    inline const Point3 operator *( const Point3 & pnt ) const;
+
+    // Multiply two 3x4 transformation matrices
+    // 
+    inline const Transform3 operator *( const Transform3 & tfrm ) const;
+
+    // Perform compound assignment and multiplication by a 3x4 transformation matrix
+    // 
+    inline Transform3 & operator *=( const Transform3 & tfrm );
+
+    // Construct an identity 3x4 transformation matrix
+    // 
+    static inline const Transform3 identity( );
+
+    // Construct a 3x4 transformation matrix to rotate around the x axis
+    // 
+    static inline const Transform3 rotationX( float radians );
+
+    // Construct a 3x4 transformation matrix to rotate around the y axis
+    // 
+    static inline const Transform3 rotationY( float radians );
+
+    // Construct a 3x4 transformation matrix to rotate around the z axis
+    // 
+    static inline const Transform3 rotationZ( float radians );
+
+    // Construct a 3x4 transformation matrix to rotate around the x, y, and z axes
+    // 
+    static inline const Transform3 rotationZYX( const Vector3 & radiansXYZ );
+
+    // Construct a 3x4 transformation matrix to rotate around a unit-length 3-D vector
+    // 
+    static inline const Transform3 rotation( float radians, const Vector3 & unitVec );
+
+    // Construct a rotation matrix from a unit-length quaternion
+    // 
+    static inline const Transform3 rotation( const Quat & unitQuat );
+
+    // Construct a 3x4 transformation matrix to perform scaling
+    // 
+    static inline const Transform3 scale( const Vector3 & scaleVec );
+
+    // Construct a 3x4 transformation matrix to perform translation
+    // 
+    static inline const Transform3 translation( const Vector3 & translateVec );
+
+};
+// Append (post-multiply) a scale transformation to a 3x4 transformation matrix
+// NOTE: 
+// Faster than creating and multiplying a scale transformation matrix.
+// 
+inline const Transform3 appendScale( const Transform3 & tfrm, const Vector3 & scaleVec );
+
+// Prepend (pre-multiply) a scale transformation to a 3x4 transformation matrix
+// NOTE: 
+// Faster than creating and multiplying a scale transformation matrix.
+// 
+inline const Transform3 prependScale( const Vector3 & scaleVec, const Transform3 & tfrm );
+
+// Multiply two 3x4 transformation matrices per element
+// 
+inline const Transform3 mulPerElem( const Transform3 & tfrm0, const Transform3 & tfrm1 );
+
+// Compute the absolute value of a 3x4 transformation matrix per element
+// 
+inline const Transform3 absPerElem( const Transform3 & tfrm );
+
+// Inverse of a 3x4 transformation matrix
+// NOTE: 
+// Result is unpredictable when the determinant of the left 3x3 submatrix is equal to or near 0.
+// 
+inline const Transform3 inverse( const Transform3 & tfrm );
+
+// Compute the inverse of a 3x4 transformation matrix, expected to have an orthogonal upper-left 3x3 submatrix
+// NOTE: 
+// This can be used to achieve better performance than a general inverse when the specified 3x4 transformation matrix meets the given restrictions.
+// 
+inline const Transform3 orthoInverse( const Transform3 & tfrm );
+
+// Conditionally select between two 3x4 transformation matrices
+// 
+inline const Transform3 select( const Transform3 & tfrm0, const Transform3 & tfrm1, bool select1 );
+
+#ifdef _VECTORMATH_DEBUG
+
+// Print a 3x4 transformation matrix
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+inline void print( const Transform3 & tfrm );
+
+// Print a 3x4 transformation matrix and an associated string identifier
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+inline void print( const Transform3 & tfrm, const char * name );
+
+#endif
+
+} // namespace Aos
+} // namespace Vectormath
+
+#include "vec_aos.h"
+#include "quat_aos.h"
+#include "mat_aos.h"
+
+#endif
diff --git a/test/Bullet2/vectormath/sse/boolInVec.h b/test/Bullet2/vectormath/sse/boolInVec.h
new file mode 100644
index 000000000..d21d25cbb
--- /dev/null
+++ b/test/Bullet2/vectormath/sse/boolInVec.h
@@ -0,0 +1,247 @@
+/*
+   Copyright (C) 2006, 2007 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+   Redistribution and use in source and binary forms,
+   with or without modification, are permitted provided that the
+   following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the Sony Computer Entertainment Inc nor the names
+      of its contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _BOOLINVEC_H
+#define _BOOLINVEC_H
+
+#include <math.h>
+
+namespace Vectormath {
+
+class floatInVec;
+
+//--------------------------------------------------------------------------------------------------
+// boolInVec class
+//
+
+class boolInVec
+{
+    private:
+        __m128 mData;
+
+        inline boolInVec(__m128 vec);
+    public:
+        inline boolInVec() {}
+
+        // matches standard type conversions
+        //
+        inline boolInVec(const floatInVec &vec);
+
+        // explicit cast from bool
+        //
+        explicit inline boolInVec(bool scalar);
+
+#ifdef _VECTORMATH_NO_SCALAR_CAST
+        // explicit cast to bool
+        // 
+        inline bool getAsBool() const;
+#else
+        // implicit cast to bool
+        // 
+        inline operator bool() const;
+#endif
+        
+        // get vector data
+        // bool value is splatted across all word slots of vector as 0 (false) or -1 (true)
+        //
+        inline __m128 get128() const;
+
+        // operators
+        //
+        inline const boolInVec operator ! () const;
+        inline boolInVec& operator = (const boolInVec &vec);
+        inline boolInVec& operator &= (const boolInVec &vec);
+        inline boolInVec& operator ^= (const boolInVec &vec);
+        inline boolInVec& operator |= (const boolInVec &vec);
+
+        // friend functions
+        //
+        friend inline const boolInVec operator == (const boolInVec &vec0, const boolInVec &vec1);
+        friend inline const boolInVec operator != (const boolInVec &vec0, const boolInVec &vec1);
+        friend inline const boolInVec operator < (const floatInVec &vec0, const floatInVec &vec1);
+        friend inline const boolInVec operator <= (const floatInVec &vec0, const floatInVec &vec1);
+        friend inline const boolInVec operator > (const floatInVec &vec0, const floatInVec &vec1);
+        friend inline const boolInVec operator >= (const floatInVec &vec0, const floatInVec &vec1);
+        friend inline const boolInVec operator == (const floatInVec &vec0, const floatInVec &vec1);
+        friend inline const boolInVec operator != (const floatInVec &vec0, const floatInVec &vec1);
+        friend inline const boolInVec operator & (const boolInVec &vec0, const boolInVec &vec1);
+        friend inline const boolInVec operator ^ (const boolInVec &vec0, const boolInVec &vec1);
+        friend inline const boolInVec operator | (const boolInVec &vec0, const boolInVec &vec1);
+        friend inline const boolInVec select(const boolInVec &vec0, const boolInVec &vec1, const boolInVec &select_vec1);
+};
+
+//--------------------------------------------------------------------------------------------------
+// boolInVec functions
+//
+
+// operators
+//
+inline const boolInVec operator == (const boolInVec &vec0, const boolInVec &vec1);
+inline const boolInVec operator != (const boolInVec &vec0, const boolInVec &vec1);
+inline const boolInVec operator & (const boolInVec &vec0, const boolInVec &vec1);
+inline const boolInVec operator ^ (const boolInVec &vec0, const boolInVec &vec1);
+inline const boolInVec operator | (const boolInVec &vec0, const boolInVec &vec1);
+
+// select between vec0 and vec1 using boolInVec.
+// false selects vec0, true selects vec1
+//
+inline const boolInVec select(const boolInVec &vec0, const boolInVec &vec1, const boolInVec &select_vec1);
+
+} // namespace Vectormath
+
+//--------------------------------------------------------------------------------------------------
+// boolInVec implementation
+//
+
+#include "floatInVec.h"
+
+namespace Vectormath {
+
+inline
+boolInVec::boolInVec(__m128 vec)
+{
+    mData = vec;
+}
+
+inline
+boolInVec::boolInVec(const floatInVec &vec)
+{
+    *this = (vec != floatInVec(0.0f));
+}
+
+inline
+boolInVec::boolInVec(bool scalar)
+{
+    unsigned int mask = -(int)scalar;
+	mData = _mm_set1_ps(*(float *)&mask); // TODO: Union
+}
+
+#ifdef _VECTORMATH_NO_SCALAR_CAST
+inline
+bool
+boolInVec::getAsBool() const
+#else
+inline
+boolInVec::operator bool() const
+#endif
+{
+	return *(bool *)&mData;
+}
+
+inline
+__m128
+boolInVec::get128() const
+{
+    return mData;
+}
+
+inline
+const boolInVec
+boolInVec::operator ! () const
+{
+    return boolInVec(_mm_andnot_ps(mData, _mm_cmpneq_ps(_mm_setzero_ps(),_mm_setzero_ps())));
+}
+
+inline
+boolInVec&
+boolInVec::operator = (const boolInVec &vec)
+{
+    mData = vec.mData;
+    return *this;
+}
+
+inline
+boolInVec&
+boolInVec::operator &= (const boolInVec &vec)
+{
+    *this = *this & vec;
+    return *this;
+}
+
+inline
+boolInVec&
+boolInVec::operator ^= (const boolInVec &vec)
+{
+    *this = *this ^ vec;
+    return *this;
+}
+
+inline
+boolInVec&
+boolInVec::operator |= (const boolInVec &vec)
+{
+    *this = *this | vec;
+    return *this;
+}
+
+inline
+const boolInVec
+operator == (const boolInVec &vec0, const boolInVec &vec1)
+{
+	return boolInVec(_mm_cmpeq_ps(vec0.get128(), vec1.get128()));
+}
+
+inline
+const boolInVec
+operator != (const boolInVec &vec0, const boolInVec &vec1)
+{
+	return boolInVec(_mm_cmpneq_ps(vec0.get128(), vec1.get128()));
+}
+    
+inline
+const boolInVec
+operator & (const boolInVec &vec0, const boolInVec &vec1)
+{
+	return boolInVec(_mm_and_ps(vec0.get128(), vec1.get128()));
+}
+
+inline
+const boolInVec
+operator | (const boolInVec &vec0, const boolInVec &vec1)
+{
+	return boolInVec(_mm_or_ps(vec0.get128(), vec1.get128()));
+}
+
+inline
+const boolInVec
+operator ^ (const boolInVec &vec0, const boolInVec &vec1)
+{
+	return boolInVec(_mm_xor_ps(vec0.get128(), vec1.get128()));
+}
+
+inline
+const boolInVec
+select(const boolInVec &vec0, const boolInVec &vec1, const boolInVec &select_vec1)
+{
+	return boolInVec(vec_sel(vec0.get128(), vec1.get128(), select_vec1.get128()));
+}
+ 
+} // namespace Vectormath
+
+#endif // boolInVec_h
diff --git a/test/Bullet2/vectormath/sse/floatInVec.h b/test/Bullet2/vectormath/sse/floatInVec.h
new file mode 100644
index 000000000..e8ac5959e
--- /dev/null
+++ b/test/Bullet2/vectormath/sse/floatInVec.h
@@ -0,0 +1,340 @@
+/*
+   Copyright (C) 2006, 2007 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+   Redistribution and use in source and binary forms,
+   with or without modification, are permitted provided that the
+   following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the Sony Computer Entertainment Inc nor the names
+      of its contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _FLOATINVEC_H
+#define _FLOATINVEC_H
+
+#include <math.h>
+#include <xmmintrin.h>
+
+namespace Vectormath {
+
+class boolInVec;
+
+//--------------------------------------------------------------------------------------------------
+// floatInVec class
+//
+
+class floatInVec
+{
+    private:
+        __m128 mData;
+
+    public:
+        inline floatInVec(__m128 vec);
+
+        inline floatInVec() {}
+
+        // matches standard type conversions
+        //
+        inline floatInVec(const boolInVec &vec);
+
+        // construct from a slot of __m128
+        //
+        inline floatInVec(__m128 vec, int slot);
+        
+        // explicit cast from float
+        //
+        explicit inline floatInVec(float scalar);
+
+#ifdef _VECTORMATH_NO_SCALAR_CAST
+        // explicit cast to float
+        // 
+        inline float getAsFloat() const;
+#else
+        // implicit cast to float
+        //
+        inline operator float() const;
+#endif
+
+        // get vector data
+        // float value is splatted across all word slots of vector
+        //
+        inline __m128 get128() const;
+
+        // operators
+        // 
+        inline const floatInVec operator ++ (int);
+        inline const floatInVec operator -- (int);
+        inline floatInVec& operator ++ ();
+        inline floatInVec& operator -- ();
+        inline const floatInVec operator - () const;
+        inline floatInVec& operator = (const floatInVec &vec);
+        inline floatInVec& operator *= (const floatInVec &vec);
+        inline floatInVec& operator /= (const floatInVec &vec);
+        inline floatInVec& operator += (const floatInVec &vec);
+        inline floatInVec& operator -= (const floatInVec &vec);
+
+        // friend functions
+        //
+        friend inline const floatInVec operator * (const floatInVec &vec0, const floatInVec &vec1);
+        friend inline const floatInVec operator / (const floatInVec &vec0, const floatInVec &vec1);
+        friend inline const floatInVec operator + (const floatInVec &vec0, const floatInVec &vec1);
+        friend inline const floatInVec operator - (const floatInVec &vec0, const floatInVec &vec1);
+        friend inline const floatInVec select(const floatInVec &vec0, const floatInVec &vec1, boolInVec select_vec1);
+};
+
+//--------------------------------------------------------------------------------------------------
+// floatInVec functions
+//
+
+// operators
+// 
+inline const floatInVec operator * (const floatInVec &vec0, const floatInVec &vec1);
+inline const floatInVec operator / (const floatInVec &vec0, const floatInVec &vec1);
+inline const floatInVec operator + (const floatInVec &vec0, const floatInVec &vec1);
+inline const floatInVec operator - (const floatInVec &vec0, const floatInVec &vec1);
+inline const boolInVec operator < (const floatInVec &vec0, const floatInVec &vec1);
+inline const boolInVec operator <= (const floatInVec &vec0, const floatInVec &vec1);
+inline const boolInVec operator > (const floatInVec &vec0, const floatInVec &vec1);
+inline const boolInVec operator >= (const floatInVec &vec0, const floatInVec &vec1);
+inline const boolInVec operator == (const floatInVec &vec0, const floatInVec &vec1);
+inline const boolInVec operator != (const floatInVec &vec0, const floatInVec &vec1);
+
+// select between vec0 and vec1 using boolInVec.
+// false selects vec0, true selects vec1
+//
+inline const floatInVec select(const floatInVec &vec0, const floatInVec &vec1, const boolInVec &select_vec1);
+
+} // namespace Vectormath
+
+//--------------------------------------------------------------------------------------------------
+// floatInVec implementation
+//
+
+#include "boolInVec.h"
+
+namespace Vectormath {
+
+inline
+floatInVec::floatInVec(__m128 vec)
+{
+    mData = vec;
+}
+
+inline
+floatInVec::floatInVec(const boolInVec &vec)
+{
+	mData = vec_sel(_mm_setzero_ps(), _mm_set1_ps(1.0f), vec.get128());
+}
+
+inline
+floatInVec::floatInVec(__m128 vec, int slot)
+{
+	SSEFloat v;
+	v.m128 = vec;
+	mData = _mm_set1_ps(v.f[slot]);
+}
+
+inline
+floatInVec::floatInVec(float scalar)
+{
+	mData = _mm_set1_ps(scalar);
+}
+
+#ifdef _VECTORMATH_NO_SCALAR_CAST
+inline
+float
+floatInVec::getAsFloat() const
+#else
+inline
+floatInVec::operator float() const
+#endif
+{
+    return *((float *)&mData);
+}
+
+inline
+__m128
+floatInVec::get128() const
+{
+    return mData;
+}
+
+inline
+const floatInVec
+floatInVec::operator ++ (int)
+{
+    __m128 olddata = mData;
+    operator ++();
+    return floatInVec(olddata);
+}
+
+inline
+const floatInVec
+floatInVec::operator -- (int)
+{
+    __m128 olddata = mData;
+    operator --();
+    return floatInVec(olddata);
+}
+
+inline
+floatInVec&
+floatInVec::operator ++ ()
+{
+    *this += floatInVec(_mm_set1_ps(1.0f));
+    return *this;
+}
+
+inline
+floatInVec&
+floatInVec::operator -- ()
+{
+    *this -= floatInVec(_mm_set1_ps(1.0f));
+    return *this;
+}
+
+inline
+const floatInVec
+floatInVec::operator - () const
+{
+    return floatInVec(_mm_sub_ps(_mm_setzero_ps(), mData));
+}
+
+inline
+floatInVec&
+floatInVec::operator = (const floatInVec &vec)
+{
+    mData = vec.mData;
+    return *this;
+}
+
+inline
+floatInVec&
+floatInVec::operator *= (const floatInVec &vec)
+{
+    *this = *this * vec;
+    return *this;
+}
+
+inline
+floatInVec&
+floatInVec::operator /= (const floatInVec &vec)
+{
+    *this = *this / vec;
+    return *this;
+}
+
+inline
+floatInVec&
+floatInVec::operator += (const floatInVec &vec)
+{
+    *this = *this + vec;
+    return *this;
+}
+
+inline
+floatInVec&
+floatInVec::operator -= (const floatInVec &vec)
+{
+    *this = *this - vec;
+    return *this;
+}
+
+inline
+const floatInVec
+operator * (const floatInVec &vec0, const floatInVec &vec1)
+{
+    return floatInVec(_mm_mul_ps(vec0.get128(), vec1.get128()));
+}
+
+inline
+const floatInVec
+operator / (const floatInVec &num, const floatInVec &den)
+{
+    return floatInVec(_mm_div_ps(num.get128(), den.get128()));
+}
+
+inline
+const floatInVec
+operator + (const floatInVec &vec0, const floatInVec &vec1)
+{
+    return floatInVec(_mm_add_ps(vec0.get128(), vec1.get128()));
+}
+
+inline
+const floatInVec
+operator - (const floatInVec &vec0, const floatInVec &vec1)
+{
+    return floatInVec(_mm_sub_ps(vec0.get128(), vec1.get128()));
+}
+
+inline
+const boolInVec
+operator < (const floatInVec &vec0, const floatInVec &vec1)
+{
+    return boolInVec(_mm_cmpgt_ps(vec1.get128(), vec0.get128()));
+}
+
+inline
+const boolInVec
+operator <= (const floatInVec &vec0, const floatInVec &vec1)
+{
+    return boolInVec(_mm_cmpge_ps(vec1.get128(), vec0.get128()));
+}
+
+inline
+const boolInVec
+operator > (const floatInVec &vec0, const floatInVec &vec1)
+{
+    return boolInVec(_mm_cmpgt_ps(vec0.get128(), vec1.get128()));
+}
+
+inline
+const boolInVec
+operator >= (const floatInVec &vec0, const floatInVec &vec1)
+{
+    return boolInVec(_mm_cmpge_ps(vec0.get128(), vec1.get128()));
+}
+
+inline
+const boolInVec
+operator == (const floatInVec &vec0, const floatInVec &vec1)
+{
+    return boolInVec(_mm_cmpeq_ps(vec0.get128(), vec1.get128()));
+}
+
+inline
+const boolInVec
+operator != (const floatInVec &vec0, const floatInVec &vec1)
+{
+    return boolInVec(_mm_cmpneq_ps(vec0.get128(), vec1.get128()));
+}
+    
+inline
+const floatInVec
+select(const floatInVec &vec0, const floatInVec &vec1, const boolInVec &select_vec1)
+{
+    return floatInVec(vec_sel(vec0.get128(), vec1.get128(), select_vec1.get128()));
+}
+
+} // namespace Vectormath
+
+#endif // floatInVec_h
diff --git a/test/Bullet2/vectormath/sse/mat_aos.h b/test/Bullet2/vectormath/sse/mat_aos.h
new file mode 100644
index 000000000..a2c66cc5f
--- /dev/null
+++ b/test/Bullet2/vectormath/sse/mat_aos.h
@@ -0,0 +1,2190 @@
+/*
+   Copyright (C) 2006, 2007 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+   Redistribution and use in source and binary forms,
+   with or without modification, are permitted provided that the
+   following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the Sony Computer Entertainment Inc nor the names
+      of its contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
+*/
+
+
+#ifndef _VECTORMATH_MAT_AOS_CPP_H
+#define _VECTORMATH_MAT_AOS_CPP_H
+
+namespace Vectormath {
+namespace Aos {
+
+//-----------------------------------------------------------------------------
+// Constants
+// for shuffles, words are labeled [x,y,z,w] [a,b,c,d]
+
+#define _VECTORMATH_PERM_ZBWX ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Z, _VECTORMATH_PERM_B, _VECTORMATH_PERM_W, _VECTORMATH_PERM_X })
+#define _VECTORMATH_PERM_XCYX ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_X, _VECTORMATH_PERM_C, _VECTORMATH_PERM_Y, _VECTORMATH_PERM_X })
+#define _VECTORMATH_PERM_XYAB ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_X, _VECTORMATH_PERM_Y, _VECTORMATH_PERM_A, _VECTORMATH_PERM_B })
+#define _VECTORMATH_PERM_ZWCD ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Z, _VECTORMATH_PERM_W, _VECTORMATH_PERM_C, _VECTORMATH_PERM_D })
+#define _VECTORMATH_PERM_XZBX ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_X, _VECTORMATH_PERM_Z, _VECTORMATH_PERM_B, _VECTORMATH_PERM_X })     
+#define _VECTORMATH_PERM_CXXX ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_C, _VECTORMATH_PERM_X, _VECTORMATH_PERM_X, _VECTORMATH_PERM_X })
+#define _VECTORMATH_PERM_YAXX ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Y, _VECTORMATH_PERM_A, _VECTORMATH_PERM_X, _VECTORMATH_PERM_X })
+#define _VECTORMATH_PERM_XAZC ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_X, _VECTORMATH_PERM_A, _VECTORMATH_PERM_Z, _VECTORMATH_PERM_C })
+#define _VECTORMATH_PERM_YXWZ ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Y, _VECTORMATH_PERM_X, _VECTORMATH_PERM_W, _VECTORMATH_PERM_Z })
+#define _VECTORMATH_PERM_YBWD ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Y, _VECTORMATH_PERM_B, _VECTORMATH_PERM_W, _VECTORMATH_PERM_D })
+#define _VECTORMATH_PERM_XYCX ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_X, _VECTORMATH_PERM_Y, _VECTORMATH_PERM_C, _VECTORMATH_PERM_X })
+#define _VECTORMATH_PERM_YCXY ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Y, _VECTORMATH_PERM_C, _VECTORMATH_PERM_X, _VECTORMATH_PERM_Y })
+#define _VECTORMATH_PERM_CXYC ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_C, _VECTORMATH_PERM_X, _VECTORMATH_PERM_Y, _VECTORMATH_PERM_C })
+#define _VECTORMATH_PERM_ZAYX ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Z, _VECTORMATH_PERM_A, _VECTORMATH_PERM_Y, _VECTORMATH_PERM_X })
+#define _VECTORMATH_PERM_BZXX ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_B, _VECTORMATH_PERM_Z, _VECTORMATH_PERM_X, _VECTORMATH_PERM_X })
+#define _VECTORMATH_PERM_XZYA ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_X, _VECTORMATH_PERM_Z, _VECTORMATH_PERM_Y, _VECTORMATH_PERM_A })
+#define _VECTORMATH_PERM_ZXXB ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Z, _VECTORMATH_PERM_X, _VECTORMATH_PERM_X, _VECTORMATH_PERM_B })
+#define _VECTORMATH_PERM_YXXC ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Y, _VECTORMATH_PERM_X, _VECTORMATH_PERM_X, _VECTORMATH_PERM_C })
+#define _VECTORMATH_PERM_BBYX ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_B, _VECTORMATH_PERM_B, _VECTORMATH_PERM_Y, _VECTORMATH_PERM_X })
+#define _VECTORMATH_PI_OVER_2 1.570796327f
+
+//-----------------------------------------------------------------------------
+// Definitions
+
+VECTORMATH_FORCE_INLINE Matrix3::Matrix3( const Matrix3 & mat )
+{
+    mCol0 = mat.mCol0;
+    mCol1 = mat.mCol1;
+    mCol2 = mat.mCol2;
+}
+
+VECTORMATH_FORCE_INLINE Matrix3::Matrix3( float scalar )
+{
+    mCol0 = Vector3( scalar );
+    mCol1 = Vector3( scalar );
+    mCol2 = Vector3( scalar );
+}
+
+VECTORMATH_FORCE_INLINE Matrix3::Matrix3( const floatInVec &scalar )
+{
+    mCol0 = Vector3( scalar );
+    mCol1 = Vector3( scalar );
+    mCol2 = Vector3( scalar );
+}
+
+VECTORMATH_FORCE_INLINE Matrix3::Matrix3( const Quat &unitQuat )
+{
+    __m128 xyzw_2, wwww, yzxw, zxyw, yzxw_2, zxyw_2;
+    __m128 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+	VM_ATTRIBUTE_ALIGN16 unsigned int sx[4] = {0xffffffff, 0, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int sz[4] = {0, 0, 0xffffffff, 0};
+	__m128 select_x = _mm_load_ps((float *)sx);
+	__m128 select_z = _mm_load_ps((float *)sz);
+
+    xyzw_2 = _mm_add_ps( unitQuat.get128(), unitQuat.get128() );
+    wwww = _mm_shuffle_ps( unitQuat.get128(), unitQuat.get128(), _MM_SHUFFLE(3,3,3,3) );
+	yzxw = _mm_shuffle_ps( unitQuat.get128(), unitQuat.get128(), _MM_SHUFFLE(3,0,2,1) );
+	zxyw = _mm_shuffle_ps( unitQuat.get128(), unitQuat.get128(), _MM_SHUFFLE(3,1,0,2) );
+    yzxw_2 = _mm_shuffle_ps( xyzw_2, xyzw_2, _MM_SHUFFLE(3,0,2,1) );
+    zxyw_2 = _mm_shuffle_ps( xyzw_2, xyzw_2, _MM_SHUFFLE(3,1,0,2) );
+
+    tmp0 = _mm_mul_ps( yzxw_2, wwww );									// tmp0 = 2yw, 2zw, 2xw, 2w2
+	tmp1 = _mm_sub_ps( _mm_set1_ps(1.0f), _mm_mul_ps(yzxw, yzxw_2) );	// tmp1 = 1 - 2y2, 1 - 2z2, 1 - 2x2, 1 - 2w2
+    tmp2 = _mm_mul_ps( yzxw, xyzw_2 );									// tmp2 = 2xy, 2yz, 2xz, 2w2
+    tmp0 = _mm_add_ps( _mm_mul_ps(zxyw, xyzw_2), tmp0 );				// tmp0 = 2yw + 2zx, 2zw + 2xy, 2xw + 2yz, 2w2 + 2w2
+    tmp1 = _mm_sub_ps( tmp1, _mm_mul_ps(zxyw, zxyw_2) );				// tmp1 = 1 - 2y2 - 2z2, 1 - 2z2 - 2x2, 1 - 2x2 - 2y2, 1 - 2w2 - 2w2
+    tmp2 = _mm_sub_ps( tmp2, _mm_mul_ps(zxyw_2, wwww) );				// tmp2 = 2xy - 2zw, 2yz - 2xw, 2xz - 2yw, 2w2 -2w2
+
+    tmp3 = vec_sel( tmp0, tmp1, select_x );
+    tmp4 = vec_sel( tmp1, tmp2, select_x );
+    tmp5 = vec_sel( tmp2, tmp0, select_x );
+    mCol0 = Vector3( vec_sel( tmp3, tmp2, select_z ) );
+    mCol1 = Vector3( vec_sel( tmp4, tmp0, select_z ) );
+    mCol2 = Vector3( vec_sel( tmp5, tmp1, select_z ) );
+}
+
+VECTORMATH_FORCE_INLINE Matrix3::Matrix3( const Vector3 &_col0, const Vector3 &_col1, const Vector3 &_col2 )
+{
+    mCol0 = _col0;
+    mCol1 = _col1;
+    mCol2 = _col2;
+}
+
+VECTORMATH_FORCE_INLINE Matrix3 & Matrix3::setCol0( const Vector3 &_col0 )
+{
+    mCol0 = _col0;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Matrix3 & Matrix3::setCol1( const Vector3 &_col1 )
+{
+    mCol1 = _col1;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Matrix3 & Matrix3::setCol2( const Vector3 &_col2 )
+{
+    mCol2 = _col2;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Matrix3 & Matrix3::setCol( int col, const Vector3 &vec )
+{
+    *(&mCol0 + col) = vec;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Matrix3 & Matrix3::setRow( int row, const Vector3 &vec )
+{
+    mCol0.setElem( row, vec.getElem( 0 ) );
+    mCol1.setElem( row, vec.getElem( 1 ) );
+    mCol2.setElem( row, vec.getElem( 2 ) );
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Matrix3 & Matrix3::setElem( int col, int row, float val )
+{
+    (*this)[col].setElem(row, val);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Matrix3 & Matrix3::setElem( int col, int row, const floatInVec &val )
+{
+    Vector3 tmpV3_0;
+    tmpV3_0 = this->getCol( col );
+    tmpV3_0.setElem( row, val );
+    this->setCol( col, tmpV3_0 );
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec Matrix3::getElem( int col, int row ) const
+{
+    return this->getCol( col ).getElem( row );
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Matrix3::getCol0( ) const
+{
+    return mCol0;
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Matrix3::getCol1( ) const
+{
+    return mCol1;
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Matrix3::getCol2( ) const
+{
+    return mCol2;
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Matrix3::getCol( int col ) const
+{
+    return *(&mCol0 + col);
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Matrix3::getRow( int row ) const
+{
+    return Vector3( mCol0.getElem( row ), mCol1.getElem( row ), mCol2.getElem( row ) );
+}
+
+VECTORMATH_FORCE_INLINE Vector3 & Matrix3::operator []( int col )
+{
+    return *(&mCol0 + col);
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Matrix3::operator []( int col ) const
+{
+    return *(&mCol0 + col);
+}
+
+VECTORMATH_FORCE_INLINE Matrix3 & Matrix3::operator =( const Matrix3 & mat )
+{
+    mCol0 = mat.mCol0;
+    mCol1 = mat.mCol1;
+    mCol2 = mat.mCol2;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 transpose( const Matrix3 & mat )
+{
+    __m128 tmp0, tmp1, res0, res1, res2;
+    tmp0 = vec_mergeh( mat.getCol0().get128(), mat.getCol2().get128() );
+    tmp1 = vec_mergel( mat.getCol0().get128(), mat.getCol2().get128() );
+    res0 = vec_mergeh( tmp0, mat.getCol1().get128() );
+    //res1 = vec_perm( tmp0, mat.getCol1().get128(), _VECTORMATH_PERM_ZBWX );
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+	res1 = _mm_shuffle_ps( tmp0, tmp0, _MM_SHUFFLE(0,3,2,2));
+	res1 = vec_sel(res1, mat.getCol1().get128(), select_y);
+    //res2 = vec_perm( tmp1, mat.getCol1().get128(), _VECTORMATH_PERM_XCYX );
+	res2 = _mm_shuffle_ps( tmp1, tmp1, _MM_SHUFFLE(0,1,1,0));
+	res2 = vec_sel(res2, vec_splat(mat.getCol1().get128(), 2), select_y);
+    return Matrix3(
+        Vector3( res0 ),
+        Vector3( res1 ),
+        Vector3( res2 )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 inverse( const Matrix3 & mat )
+{
+    __m128 tmp0, tmp1, tmp2, tmp3, tmp4, dot, invdet, inv0, inv1, inv2;
+    tmp2 = _vmathVfCross( mat.getCol0().get128(), mat.getCol1().get128() );
+    tmp0 = _vmathVfCross( mat.getCol1().get128(), mat.getCol2().get128() );
+    tmp1 = _vmathVfCross( mat.getCol2().get128(), mat.getCol0().get128() );
+    dot = _vmathVfDot3( tmp2, mat.getCol2().get128() );
+    dot = vec_splat( dot, 0 );
+    invdet = recipf4( dot );
+    tmp3 = vec_mergeh( tmp0, tmp2 );
+    tmp4 = vec_mergel( tmp0, tmp2 );
+    inv0 = vec_mergeh( tmp3, tmp1 );
+    //inv1 = vec_perm( tmp3, tmp1, _VECTORMATH_PERM_ZBWX );
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+	inv1 = _mm_shuffle_ps( tmp3, tmp3, _MM_SHUFFLE(0,3,2,2));
+	inv1 = vec_sel(inv1, tmp1, select_y);
+    //inv2 = vec_perm( tmp4, tmp1, _VECTORMATH_PERM_XCYX );
+	inv2 = _mm_shuffle_ps( tmp4, tmp4, _MM_SHUFFLE(0,1,1,0));
+	inv2 = vec_sel(inv2, vec_splat(tmp1, 2), select_y);
+    inv0 = vec_mul( inv0, invdet );
+    inv1 = vec_mul( inv1, invdet );
+	inv2 = vec_mul( inv2, invdet );
+    return Matrix3(
+        Vector3( inv0 ),
+        Vector3( inv1 ),
+        Vector3( inv2 )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec determinant( const Matrix3 & mat )
+{
+    return dot( mat.getCol2(), cross( mat.getCol0(), mat.getCol1() ) );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::operator +( const Matrix3 & mat ) const
+{
+    return Matrix3(
+        ( mCol0 + mat.mCol0 ),
+        ( mCol1 + mat.mCol1 ),
+        ( mCol2 + mat.mCol2 )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::operator -( const Matrix3 & mat ) const
+{
+    return Matrix3(
+        ( mCol0 - mat.mCol0 ),
+        ( mCol1 - mat.mCol1 ),
+        ( mCol2 - mat.mCol2 )
+    );
+}
+
+VECTORMATH_FORCE_INLINE Matrix3 & Matrix3::operator +=( const Matrix3 & mat )
+{
+    *this = *this + mat;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Matrix3 & Matrix3::operator -=( const Matrix3 & mat )
+{
+    *this = *this - mat;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::operator -( ) const
+{
+    return Matrix3(
+        ( -mCol0 ),
+        ( -mCol1 ),
+        ( -mCol2 )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 absPerElem( const Matrix3 & mat )
+{
+    return Matrix3(
+        absPerElem( mat.getCol0() ),
+        absPerElem( mat.getCol1() ),
+        absPerElem( mat.getCol2() )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::operator *( float scalar ) const
+{
+    return *this * floatInVec(scalar);
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::operator *( const floatInVec &scalar ) const
+{
+    return Matrix3(
+        ( mCol0 * scalar ),
+        ( mCol1 * scalar ),
+        ( mCol2 * scalar )
+    );
+}
+
+VECTORMATH_FORCE_INLINE Matrix3 & Matrix3::operator *=( float scalar )
+{
+    return *this *= floatInVec(scalar);
+}
+
+VECTORMATH_FORCE_INLINE Matrix3 & Matrix3::operator *=( const floatInVec &scalar )
+{
+    *this = *this * scalar;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 operator *( float scalar, const Matrix3 & mat )
+{
+    return floatInVec(scalar) * mat;
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 operator *( const floatInVec &scalar, const Matrix3 & mat )
+{
+    return mat * scalar;
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Matrix3::operator *( const Vector3 &vec ) const
+{
+    __m128 res;
+    __m128 xxxx, yyyy, zzzz;
+    xxxx = vec_splat( vec.get128(), 0 );
+    yyyy = vec_splat( vec.get128(), 1 );
+    zzzz = vec_splat( vec.get128(), 2 );
+    res = vec_mul( mCol0.get128(), xxxx );
+    res = vec_madd( mCol1.get128(), yyyy, res );
+    res = vec_madd( mCol2.get128(), zzzz, res );
+    return Vector3( res );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::operator *( const Matrix3 & mat ) const
+{
+    return Matrix3(
+        ( *this * mat.mCol0 ),
+        ( *this * mat.mCol1 ),
+        ( *this * mat.mCol2 )
+    );
+}
+
+VECTORMATH_FORCE_INLINE Matrix3 & Matrix3::operator *=( const Matrix3 & mat )
+{
+    *this = *this * mat;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 mulPerElem( const Matrix3 & mat0, const Matrix3 & mat1 )
+{
+    return Matrix3(
+        mulPerElem( mat0.getCol0(), mat1.getCol0() ),
+        mulPerElem( mat0.getCol1(), mat1.getCol1() ),
+        mulPerElem( mat0.getCol2(), mat1.getCol2() )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::identity( )
+{
+    return Matrix3(
+        Vector3::xAxis( ),
+        Vector3::yAxis( ),
+        Vector3::zAxis( )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::rotationX( float radians )
+{
+    return rotationX( floatInVec(radians) );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::rotationX( const floatInVec &radians )
+{
+    __m128 s, c, res1, res2;
+    __m128 zero;
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
+    zero = _mm_setzero_ps();
+    sincosf4( radians.get128(), &s, &c );
+    res1 = vec_sel( zero, c, select_y );
+    res1 = vec_sel( res1, s, select_z );
+    res2 = vec_sel( zero, negatef4(s), select_y );
+    res2 = vec_sel( res2, c, select_z );
+    return Matrix3(
+        Vector3::xAxis( ),
+        Vector3( res1 ),
+        Vector3( res2 )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::rotationY( float radians )
+{
+    return rotationY( floatInVec(radians) );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::rotationY( const floatInVec &radians )
+{
+    __m128 s, c, res0, res2;
+    __m128 zero;
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
+    zero = _mm_setzero_ps();
+    sincosf4( radians.get128(), &s, &c );
+    res0 = vec_sel( zero, c, select_x );
+    res0 = vec_sel( res0, negatef4(s), select_z );
+    res2 = vec_sel( zero, s, select_x );
+    res2 = vec_sel( res2, c, select_z );
+    return Matrix3(
+        Vector3( res0 ),
+        Vector3::yAxis( ),
+        Vector3( res2 )
+	);
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::rotationZ( float radians )
+{
+    return rotationZ( floatInVec(radians) );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::rotationZ( const floatInVec &radians )
+{
+    __m128 s, c, res0, res1;
+    __m128 zero;
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+    zero = _mm_setzero_ps();
+    sincosf4( radians.get128(), &s, &c );
+    res0 = vec_sel( zero, c, select_x );
+    res0 = vec_sel( res0, s, select_y );
+    res1 = vec_sel( zero, negatef4(s), select_x );
+    res1 = vec_sel( res1, c, select_y );
+    return Matrix3(
+        Vector3( res0 ),
+        Vector3( res1 ),
+        Vector3::zAxis( )
+	);
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::rotationZYX( const Vector3 &radiansXYZ )
+{
+    __m128 angles, s, negS, c, X0, X1, Y0, Y1, Z0, Z1, tmp;
+    angles = Vector4( radiansXYZ, 0.0f ).get128();
+    sincosf4( angles, &s, &c );
+    negS = negatef4( s );
+    Z0 = vec_mergel( c, s );
+    Z1 = vec_mergel( negS, c );
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_xyz[4] = {0xffffffff, 0xffffffff, 0xffffffff, 0};
+    Z1 = vec_and( Z1, _mm_load_ps( (float *)select_xyz ) );
+	Y0 = _mm_shuffle_ps( c, negS, _MM_SHUFFLE(0,1,1,1) );
+	Y1 = _mm_shuffle_ps( s, c, _MM_SHUFFLE(0,1,1,1) );
+    X0 = vec_splat( s, 0 );
+    X1 = vec_splat( c, 0 );
+    tmp = vec_mul( Z0, Y1 );
+    return Matrix3(
+        Vector3( vec_mul( Z0, Y0 ) ),
+        Vector3( vec_madd( Z1, X1, vec_mul( tmp, X0 ) ) ),
+        Vector3( vec_nmsub( Z1, X0, vec_mul( tmp, X1 ) ) )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::rotation( float radians, const Vector3 &unitVec )
+{
+    return rotation( floatInVec(radians), unitVec );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::rotation( const floatInVec &radians, const Vector3 &unitVec )
+{
+    __m128 axis, s, c, oneMinusC, axisS, negAxisS, xxxx, yyyy, zzzz, tmp0, tmp1, tmp2;
+    axis = unitVec.get128();
+    sincosf4( radians.get128(), &s, &c );
+    xxxx = vec_splat( axis, 0 );
+    yyyy = vec_splat( axis, 1 );
+    zzzz = vec_splat( axis, 2 );
+    oneMinusC = vec_sub( _mm_set1_ps(1.0f), c );
+    axisS = vec_mul( axis, s );
+    negAxisS = negatef4( axisS );
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
+    //tmp0 = vec_perm( axisS, negAxisS, _VECTORMATH_PERM_XZBX );
+	tmp0 = _mm_shuffle_ps( axisS, axisS, _MM_SHUFFLE(0,0,2,0) );
+	tmp0 = vec_sel(tmp0, vec_splat(negAxisS, 1), select_z);
+    //tmp1 = vec_perm( axisS, negAxisS, _VECTORMATH_PERM_CXXX );
+	tmp1 = vec_sel( vec_splat(axisS, 0), vec_splat(negAxisS, 2), select_x );
+    //tmp2 = vec_perm( axisS, negAxisS, _VECTORMATH_PERM_YAXX );
+	tmp2 = _mm_shuffle_ps( axisS, axisS, _MM_SHUFFLE(0,0,0,1) );
+	tmp2 = vec_sel(tmp2, vec_splat(negAxisS, 0), select_y);
+    tmp0 = vec_sel( tmp0, c, select_x );
+    tmp1 = vec_sel( tmp1, c, select_y );
+    tmp2 = vec_sel( tmp2, c, select_z );
+    return Matrix3(
+        Vector3( vec_madd( vec_mul( axis, xxxx ), oneMinusC, tmp0 ) ),
+        Vector3( vec_madd( vec_mul( axis, yyyy ), oneMinusC, tmp1 ) ),
+        Vector3( vec_madd( vec_mul( axis, zzzz ), oneMinusC, tmp2 ) )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::rotation( const Quat &unitQuat )
+{
+    return Matrix3( unitQuat );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 Matrix3::scale( const Vector3 &scaleVec )
+{
+    __m128 zero = _mm_setzero_ps();
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
+    return Matrix3(
+        Vector3( vec_sel( zero, scaleVec.get128(), select_x ) ),
+        Vector3( vec_sel( zero, scaleVec.get128(), select_y ) ),
+        Vector3( vec_sel( zero, scaleVec.get128(), select_z ) )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 appendScale( const Matrix3 & mat, const Vector3 &scaleVec )
+{
+    return Matrix3(
+        ( mat.getCol0() * scaleVec.getX( ) ),
+        ( mat.getCol1() * scaleVec.getY( ) ),
+        ( mat.getCol2() * scaleVec.getZ( ) )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 prependScale( const Vector3 &scaleVec, const Matrix3 & mat )
+{
+    return Matrix3(
+        mulPerElem( mat.getCol0(), scaleVec ),
+        mulPerElem( mat.getCol1(), scaleVec ),
+        mulPerElem( mat.getCol2(), scaleVec )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 select( const Matrix3 & mat0, const Matrix3 & mat1, bool select1 )
+{
+    return Matrix3(
+        select( mat0.getCol0(), mat1.getCol0(), select1 ),
+        select( mat0.getCol1(), mat1.getCol1(), select1 ),
+        select( mat0.getCol2(), mat1.getCol2(), select1 )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 select( const Matrix3 & mat0, const Matrix3 & mat1, const boolInVec &select1 )
+{
+    return Matrix3(
+        select( mat0.getCol0(), mat1.getCol0(), select1 ),
+        select( mat0.getCol1(), mat1.getCol1(), select1 ),
+        select( mat0.getCol2(), mat1.getCol2(), select1 )
+    );
+}
+
+#ifdef _VECTORMATH_DEBUG
+
+VECTORMATH_FORCE_INLINE void print( const Matrix3 & mat )
+{
+    print( mat.getRow( 0 ) );
+    print( mat.getRow( 1 ) );
+    print( mat.getRow( 2 ) );
+}
+
+VECTORMATH_FORCE_INLINE void print( const Matrix3 & mat, const char * name )
+{
+    printf("%s:\n", name);
+    print( mat );
+}
+
+#endif
+
+VECTORMATH_FORCE_INLINE Matrix4::Matrix4( const Matrix4 & mat )
+{
+    mCol0 = mat.mCol0;
+    mCol1 = mat.mCol1;
+    mCol2 = mat.mCol2;
+    mCol3 = mat.mCol3;
+}
+
+VECTORMATH_FORCE_INLINE Matrix4::Matrix4( float scalar )
+{
+    mCol0 = Vector4( scalar );
+    mCol1 = Vector4( scalar );
+    mCol2 = Vector4( scalar );
+    mCol3 = Vector4( scalar );
+}
+
+VECTORMATH_FORCE_INLINE Matrix4::Matrix4( const floatInVec &scalar )
+{
+    mCol0 = Vector4( scalar );
+    mCol1 = Vector4( scalar );
+    mCol2 = Vector4( scalar );
+    mCol3 = Vector4( scalar );
+}
+
+VECTORMATH_FORCE_INLINE Matrix4::Matrix4( const Transform3 & mat )
+{
+    mCol0 = Vector4( mat.getCol0(), 0.0f );
+    mCol1 = Vector4( mat.getCol1(), 0.0f );
+    mCol2 = Vector4( mat.getCol2(), 0.0f );
+    mCol3 = Vector4( mat.getCol3(), 1.0f );
+}
+
+VECTORMATH_FORCE_INLINE Matrix4::Matrix4( const Vector4 &_col0, const Vector4 &_col1, const Vector4 &_col2, const Vector4 &_col3 )
+{
+    mCol0 = _col0;
+    mCol1 = _col1;
+    mCol2 = _col2;
+    mCol3 = _col3;
+}
+
+VECTORMATH_FORCE_INLINE Matrix4::Matrix4( const Matrix3 & mat, const Vector3 &translateVec )
+{
+    mCol0 = Vector4( mat.getCol0(), 0.0f );
+    mCol1 = Vector4( mat.getCol1(), 0.0f );
+    mCol2 = Vector4( mat.getCol2(), 0.0f );
+    mCol3 = Vector4( translateVec, 1.0f );
+}
+
+VECTORMATH_FORCE_INLINE Matrix4::Matrix4( const Quat &unitQuat, const Vector3 &translateVec )
+{
+    Matrix3 mat;
+    mat = Matrix3( unitQuat );
+    mCol0 = Vector4( mat.getCol0(), 0.0f );
+    mCol1 = Vector4( mat.getCol1(), 0.0f );
+    mCol2 = Vector4( mat.getCol2(), 0.0f );
+    mCol3 = Vector4( translateVec, 1.0f );
+}
+
+VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::setCol0( const Vector4 &_col0 )
+{
+    mCol0 = _col0;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::setCol1( const Vector4 &_col1 )
+{
+    mCol1 = _col1;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::setCol2( const Vector4 &_col2 )
+{
+    mCol2 = _col2;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::setCol3( const Vector4 &_col3 )
+{
+    mCol3 = _col3;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::setCol( int col, const Vector4 &vec )
+{
+    *(&mCol0 + col) = vec;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::setRow( int row, const Vector4 &vec )
+{
+    mCol0.setElem( row, vec.getElem( 0 ) );
+    mCol1.setElem( row, vec.getElem( 1 ) );
+    mCol2.setElem( row, vec.getElem( 2 ) );
+    mCol3.setElem( row, vec.getElem( 3 ) );
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::setElem( int col, int row, float val )
+{
+    (*this)[col].setElem(row, val);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::setElem( int col, int row, const floatInVec &val )
+{
+    Vector4 tmpV3_0;
+    tmpV3_0 = this->getCol( col );
+    tmpV3_0.setElem( row, val );
+    this->setCol( col, tmpV3_0 );
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec Matrix4::getElem( int col, int row ) const
+{
+    return this->getCol( col ).getElem( row );
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 Matrix4::getCol0( ) const
+{
+    return mCol0;
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 Matrix4::getCol1( ) const
+{
+    return mCol1;
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 Matrix4::getCol2( ) const
+{
+    return mCol2;
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 Matrix4::getCol3( ) const
+{
+    return mCol3;
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 Matrix4::getCol( int col ) const
+{
+    return *(&mCol0 + col);
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 Matrix4::getRow( int row ) const
+{
+    return Vector4( mCol0.getElem( row ), mCol1.getElem( row ), mCol2.getElem( row ), mCol3.getElem( row ) );
+}
+
+VECTORMATH_FORCE_INLINE Vector4 & Matrix4::operator []( int col )
+{
+    return *(&mCol0 + col);
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 Matrix4::operator []( int col ) const
+{
+    return *(&mCol0 + col);
+}
+
+VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::operator =( const Matrix4 & mat )
+{
+    mCol0 = mat.mCol0;
+    mCol1 = mat.mCol1;
+    mCol2 = mat.mCol2;
+    mCol3 = mat.mCol3;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 transpose( const Matrix4 & mat )
+{
+    __m128 tmp0, tmp1, tmp2, tmp3, res0, res1, res2, res3;
+    tmp0 = vec_mergeh( mat.getCol0().get128(), mat.getCol2().get128() );
+    tmp1 = vec_mergeh( mat.getCol1().get128(), mat.getCol3().get128() );
+    tmp2 = vec_mergel( mat.getCol0().get128(), mat.getCol2().get128() );
+    tmp3 = vec_mergel( mat.getCol1().get128(), mat.getCol3().get128() );
+    res0 = vec_mergeh( tmp0, tmp1 );
+    res1 = vec_mergel( tmp0, tmp1 );
+    res2 = vec_mergeh( tmp2, tmp3 );
+    res3 = vec_mergel( tmp2, tmp3 );
+    return Matrix4(
+        Vector4( res0 ),
+        Vector4( res1 ),
+        Vector4( res2 ),
+        Vector4( res3 )
+    );
+}
+
+// TODO: Tidy
+static VM_ATTRIBUTE_ALIGN16 const unsigned int _vmathPNPN[4] = {0x00000000, 0x80000000, 0x00000000, 0x80000000};
+static VM_ATTRIBUTE_ALIGN16 const unsigned int _vmathNPNP[4] = {0x80000000, 0x00000000, 0x80000000, 0x00000000};
+static VM_ATTRIBUTE_ALIGN16 const float _vmathZERONE[4] = {1.0f, 0.0f, 0.0f, 1.0f};
+
+VECTORMATH_FORCE_INLINE const Matrix4 inverse( const Matrix4 & mat )
+{
+	__m128 Va,Vb,Vc;
+	__m128 r1,r2,r3,tt,tt2;
+	__m128 sum,Det,RDet;
+	__m128 trns0,trns1,trns2,trns3;
+
+	__m128 _L1 = mat.getCol0().get128();
+	__m128 _L2 = mat.getCol1().get128();
+	__m128 _L3 = mat.getCol2().get128();
+	__m128 _L4 = mat.getCol3().get128();
+	// Calculating the minterms for the first line.
+
+	// _mm_ror_ps is just a macro using _mm_shuffle_ps().
+	tt = _L4; tt2 = _mm_ror_ps(_L3,1); 
+	Vc = _mm_mul_ps(tt2,_mm_ror_ps(tt,0));					// V3'dot V4
+	Va = _mm_mul_ps(tt2,_mm_ror_ps(tt,2));					// V3'dot V4"
+	Vb = _mm_mul_ps(tt2,_mm_ror_ps(tt,3));					// V3' dot V4^
+
+	r1 = _mm_sub_ps(_mm_ror_ps(Va,1),_mm_ror_ps(Vc,2));		// V3" dot V4^ - V3^ dot V4"
+	r2 = _mm_sub_ps(_mm_ror_ps(Vb,2),_mm_ror_ps(Vb,0));		// V3^ dot V4' - V3' dot V4^
+	r3 = _mm_sub_ps(_mm_ror_ps(Va,0),_mm_ror_ps(Vc,1));		// V3' dot V4" - V3" dot V4'
+
+	tt = _L2;
+	Va = _mm_ror_ps(tt,1);		sum = _mm_mul_ps(Va,r1);
+	Vb = _mm_ror_ps(tt,2);		sum = _mm_add_ps(sum,_mm_mul_ps(Vb,r2));
+	Vc = _mm_ror_ps(tt,3);		sum = _mm_add_ps(sum,_mm_mul_ps(Vc,r3));
+
+	// Calculating the determinant.
+	Det = _mm_mul_ps(sum,_L1);
+	Det = _mm_add_ps(Det,_mm_movehl_ps(Det,Det));
+
+	const __m128 Sign_PNPN = _mm_load_ps((float *)_vmathPNPN);
+	const __m128 Sign_NPNP = _mm_load_ps((float *)_vmathNPNP);
+
+	__m128 mtL1 = _mm_xor_ps(sum,Sign_PNPN);
+
+	// Calculating the minterms of the second line (using previous results).
+	tt = _mm_ror_ps(_L1,1);		sum = _mm_mul_ps(tt,r1);
+	tt = _mm_ror_ps(tt,1);		sum = _mm_add_ps(sum,_mm_mul_ps(tt,r2));
+	tt = _mm_ror_ps(tt,1);		sum = _mm_add_ps(sum,_mm_mul_ps(tt,r3));
+	__m128 mtL2 = _mm_xor_ps(sum,Sign_NPNP);
+
+	// Testing the determinant.
+	Det = _mm_sub_ss(Det,_mm_shuffle_ps(Det,Det,1));
+
+	// Calculating the minterms of the third line.
+	tt = _mm_ror_ps(_L1,1);
+	Va = _mm_mul_ps(tt,Vb);									// V1' dot V2"
+	Vb = _mm_mul_ps(tt,Vc);									// V1' dot V2^
+	Vc = _mm_mul_ps(tt,_L2);								// V1' dot V2
+
+	r1 = _mm_sub_ps(_mm_ror_ps(Va,1),_mm_ror_ps(Vc,2));		// V1" dot V2^ - V1^ dot V2"
+	r2 = _mm_sub_ps(_mm_ror_ps(Vb,2),_mm_ror_ps(Vb,0));		// V1^ dot V2' - V1' dot V2^
+	r3 = _mm_sub_ps(_mm_ror_ps(Va,0),_mm_ror_ps(Vc,1));		// V1' dot V2" - V1" dot V2'
+
+	tt = _mm_ror_ps(_L4,1);		sum = _mm_mul_ps(tt,r1);
+	tt = _mm_ror_ps(tt,1);		sum = _mm_add_ps(sum,_mm_mul_ps(tt,r2));
+	tt = _mm_ror_ps(tt,1);		sum = _mm_add_ps(sum,_mm_mul_ps(tt,r3));
+	__m128 mtL3 = _mm_xor_ps(sum,Sign_PNPN);
+
+	// Dividing is FASTER than rcp_nr! (Because rcp_nr causes many register-memory RWs).
+	RDet = _mm_div_ss(_mm_load_ss((float *)&_vmathZERONE), Det); // TODO: just 1.0f?
+	RDet = _mm_shuffle_ps(RDet,RDet,0x00);
+
+	// Devide the first 12 minterms with the determinant.
+	mtL1 = _mm_mul_ps(mtL1, RDet);
+	mtL2 = _mm_mul_ps(mtL2, RDet);
+	mtL3 = _mm_mul_ps(mtL3, RDet);
+
+	// Calculate the minterms of the forth line and devide by the determinant.
+	tt = _mm_ror_ps(_L3,1);		sum = _mm_mul_ps(tt,r1);
+	tt = _mm_ror_ps(tt,1);		sum = _mm_add_ps(sum,_mm_mul_ps(tt,r2));
+	tt = _mm_ror_ps(tt,1);		sum = _mm_add_ps(sum,_mm_mul_ps(tt,r3));
+	__m128 mtL4 = _mm_xor_ps(sum,Sign_NPNP);
+	mtL4 = _mm_mul_ps(mtL4, RDet);
+
+	// Now we just have to transpose the minterms matrix.
+	trns0 = _mm_unpacklo_ps(mtL1,mtL2);
+	trns1 = _mm_unpacklo_ps(mtL3,mtL4);
+	trns2 = _mm_unpackhi_ps(mtL1,mtL2);
+	trns3 = _mm_unpackhi_ps(mtL3,mtL4);
+	_L1 = _mm_movelh_ps(trns0,trns1);
+	_L2 = _mm_movehl_ps(trns1,trns0);
+	_L3 = _mm_movelh_ps(trns2,trns3);
+	_L4 = _mm_movehl_ps(trns3,trns2);
+
+    return Matrix4(
+        Vector4( _L1 ),
+        Vector4( _L2 ),
+        Vector4( _L3 ),
+        Vector4( _L4 )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 affineInverse( const Matrix4 & mat )
+{
+    Transform3 affineMat;
+    affineMat.setCol0( mat.getCol0().getXYZ( ) );
+    affineMat.setCol1( mat.getCol1().getXYZ( ) );
+    affineMat.setCol2( mat.getCol2().getXYZ( ) );
+    affineMat.setCol3( mat.getCol3().getXYZ( ) );
+    return Matrix4( inverse( affineMat ) );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 orthoInverse( const Matrix4 & mat )
+{
+    Transform3 affineMat;
+    affineMat.setCol0( mat.getCol0().getXYZ( ) );
+    affineMat.setCol1( mat.getCol1().getXYZ( ) );
+    affineMat.setCol2( mat.getCol2().getXYZ( ) );
+    affineMat.setCol3( mat.getCol3().getXYZ( ) );
+    return Matrix4( orthoInverse( affineMat ) );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec determinant( const Matrix4 & mat )
+{
+	__m128 Va,Vb,Vc;
+	__m128 r1,r2,r3,tt,tt2;
+	__m128 sum,Det;
+
+	__m128 _L1 = mat.getCol0().get128();
+	__m128 _L2 = mat.getCol1().get128();
+	__m128 _L3 = mat.getCol2().get128();
+	__m128 _L4 = mat.getCol3().get128();
+	// Calculating the minterms for the first line.
+
+	// _mm_ror_ps is just a macro using _mm_shuffle_ps().
+	tt = _L4; tt2 = _mm_ror_ps(_L3,1); 
+	Vc = _mm_mul_ps(tt2,_mm_ror_ps(tt,0));					// V3' dot V4
+	Va = _mm_mul_ps(tt2,_mm_ror_ps(tt,2));					// V3' dot V4"
+	Vb = _mm_mul_ps(tt2,_mm_ror_ps(tt,3));					// V3' dot V4^
+
+	r1 = _mm_sub_ps(_mm_ror_ps(Va,1),_mm_ror_ps(Vc,2));		// V3" dot V4^ - V3^ dot V4"
+	r2 = _mm_sub_ps(_mm_ror_ps(Vb,2),_mm_ror_ps(Vb,0));		// V3^ dot V4' - V3' dot V4^
+	r3 = _mm_sub_ps(_mm_ror_ps(Va,0),_mm_ror_ps(Vc,1));		// V3' dot V4" - V3" dot V4'
+
+	tt = _L2;
+	Va = _mm_ror_ps(tt,1);		sum = _mm_mul_ps(Va,r1);
+	Vb = _mm_ror_ps(tt,2);		sum = _mm_add_ps(sum,_mm_mul_ps(Vb,r2));
+	Vc = _mm_ror_ps(tt,3);		sum = _mm_add_ps(sum,_mm_mul_ps(Vc,r3));
+
+	// Calculating the determinant.
+	Det = _mm_mul_ps(sum,_L1);
+	Det = _mm_add_ps(Det,_mm_movehl_ps(Det,Det));
+
+	// Calculating the minterms of the second line (using previous results).
+	tt = _mm_ror_ps(_L1,1);		sum = _mm_mul_ps(tt,r1);
+	tt = _mm_ror_ps(tt,1);		sum = _mm_add_ps(sum,_mm_mul_ps(tt,r2));
+	tt = _mm_ror_ps(tt,1);		sum = _mm_add_ps(sum,_mm_mul_ps(tt,r3));
+
+	// Testing the determinant.
+	Det = _mm_sub_ss(Det,_mm_shuffle_ps(Det,Det,1));
+	return floatInVec(Det, 0);
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::operator +( const Matrix4 & mat ) const
+{
+    return Matrix4(
+        ( mCol0 + mat.mCol0 ),
+        ( mCol1 + mat.mCol1 ),
+        ( mCol2 + mat.mCol2 ),
+        ( mCol3 + mat.mCol3 )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::operator -( const Matrix4 & mat ) const
+{
+    return Matrix4(
+        ( mCol0 - mat.mCol0 ),
+        ( mCol1 - mat.mCol1 ),
+        ( mCol2 - mat.mCol2 ),
+        ( mCol3 - mat.mCol3 )
+    );
+}
+
+VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::operator +=( const Matrix4 & mat )
+{
+    *this = *this + mat;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::operator -=( const Matrix4 & mat )
+{
+    *this = *this - mat;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::operator -( ) const
+{
+    return Matrix4(
+        ( -mCol0 ),
+        ( -mCol1 ),
+        ( -mCol2 ),
+        ( -mCol3 )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 absPerElem( const Matrix4 & mat )
+{
+    return Matrix4(
+        absPerElem( mat.getCol0() ),
+        absPerElem( mat.getCol1() ),
+        absPerElem( mat.getCol2() ),
+        absPerElem( mat.getCol3() )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::operator *( float scalar ) const
+{
+    return *this * floatInVec(scalar);
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::operator *( const floatInVec &scalar ) const
+{
+    return Matrix4(
+        ( mCol0 * scalar ),
+        ( mCol1 * scalar ),
+        ( mCol2 * scalar ),
+        ( mCol3 * scalar )
+    );
+}
+
+VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::operator *=( float scalar )
+{
+    return *this *= floatInVec(scalar);
+}
+
+VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::operator *=( const floatInVec &scalar )
+{
+    *this = *this * scalar;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 operator *( float scalar, const Matrix4 & mat )
+{
+    return floatInVec(scalar) * mat;
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 operator *( const floatInVec &scalar, const Matrix4 & mat )
+{
+    return mat * scalar;
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 Matrix4::operator *( const Vector4 &vec ) const
+{
+    return Vector4(
+		_mm_add_ps(
+			_mm_add_ps(_mm_mul_ps(mCol0.get128(), _mm_shuffle_ps(vec.get128(), vec.get128(), _MM_SHUFFLE(0,0,0,0))), _mm_mul_ps(mCol1.get128(), _mm_shuffle_ps(vec.get128(), vec.get128(), _MM_SHUFFLE(1,1,1,1)))),
+			_mm_add_ps(_mm_mul_ps(mCol2.get128(), _mm_shuffle_ps(vec.get128(), vec.get128(), _MM_SHUFFLE(2,2,2,2))), _mm_mul_ps(mCol3.get128(), _mm_shuffle_ps(vec.get128(), vec.get128(), _MM_SHUFFLE(3,3,3,3)))))
+		);
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 Matrix4::operator *( const Vector3 &vec ) const
+{
+    return Vector4(
+		_mm_add_ps(
+			_mm_add_ps(_mm_mul_ps(mCol0.get128(), _mm_shuffle_ps(vec.get128(), vec.get128(), _MM_SHUFFLE(0,0,0,0))), _mm_mul_ps(mCol1.get128(), _mm_shuffle_ps(vec.get128(), vec.get128(), _MM_SHUFFLE(1,1,1,1)))),
+			_mm_mul_ps(mCol2.get128(), _mm_shuffle_ps(vec.get128(), vec.get128(), _MM_SHUFFLE(2,2,2,2))))
+		);
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 Matrix4::operator *( const Point3 &pnt ) const
+{
+    return Vector4(
+		_mm_add_ps(
+			_mm_add_ps(_mm_mul_ps(mCol0.get128(), _mm_shuffle_ps(pnt.get128(), pnt.get128(), _MM_SHUFFLE(0,0,0,0))), _mm_mul_ps(mCol1.get128(), _mm_shuffle_ps(pnt.get128(), pnt.get128(), _MM_SHUFFLE(1,1,1,1)))),
+			_mm_add_ps(_mm_mul_ps(mCol2.get128(), _mm_shuffle_ps(pnt.get128(), pnt.get128(), _MM_SHUFFLE(2,2,2,2))), mCol3.get128()))
+		);
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::operator *( const Matrix4 & mat ) const
+{
+    return Matrix4(
+        ( *this * mat.mCol0 ),
+        ( *this * mat.mCol1 ),
+        ( *this * mat.mCol2 ),
+        ( *this * mat.mCol3 )
+    );
+}
+
+VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::operator *=( const Matrix4 & mat )
+{
+    *this = *this * mat;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::operator *( const Transform3 & tfrm ) const
+{
+    return Matrix4(
+        ( *this * tfrm.getCol0() ),
+        ( *this * tfrm.getCol1() ),
+        ( *this * tfrm.getCol2() ),
+        ( *this * Point3( tfrm.getCol3() ) )
+    );
+}
+
+VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::operator *=( const Transform3 & tfrm )
+{
+    *this = *this * tfrm;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 mulPerElem( const Matrix4 & mat0, const Matrix4 & mat1 )
+{
+    return Matrix4(
+        mulPerElem( mat0.getCol0(), mat1.getCol0() ),
+        mulPerElem( mat0.getCol1(), mat1.getCol1() ),
+        mulPerElem( mat0.getCol2(), mat1.getCol2() ),
+        mulPerElem( mat0.getCol3(), mat1.getCol3() )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::identity( )
+{
+    return Matrix4(
+        Vector4::xAxis( ),
+        Vector4::yAxis( ),
+        Vector4::zAxis( ),
+        Vector4::wAxis( )
+    );
+}
+
+VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::setUpper3x3( const Matrix3 & mat3 )
+{
+    mCol0.setXYZ( mat3.getCol0() );
+    mCol1.setXYZ( mat3.getCol1() );
+    mCol2.setXYZ( mat3.getCol2() );
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 Matrix4::getUpper3x3( ) const
+{
+    return Matrix3(
+        mCol0.getXYZ( ),
+        mCol1.getXYZ( ),
+        mCol2.getXYZ( )
+    );
+}
+
+VECTORMATH_FORCE_INLINE Matrix4 & Matrix4::setTranslation( const Vector3 &translateVec )
+{
+    mCol3.setXYZ( translateVec );
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Matrix4::getTranslation( ) const
+{
+    return mCol3.getXYZ( );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::rotationX( float radians )
+{
+    return rotationX( floatInVec(radians) );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::rotationX( const floatInVec &radians )
+{
+    __m128 s, c, res1, res2;
+    __m128 zero;
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
+    zero = _mm_setzero_ps();
+    sincosf4( radians.get128(), &s, &c );
+    res1 = vec_sel( zero, c, select_y );
+    res1 = vec_sel( res1, s, select_z );
+    res2 = vec_sel( zero, negatef4(s), select_y );
+    res2 = vec_sel( res2, c, select_z );
+    return Matrix4(
+        Vector4::xAxis( ),
+        Vector4( res1 ),
+        Vector4( res2 ),
+        Vector4::wAxis( )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::rotationY( float radians )
+{
+    return rotationY( floatInVec(radians) );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::rotationY( const floatInVec &radians )
+{
+    __m128 s, c, res0, res2;
+    __m128 zero;
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
+    zero = _mm_setzero_ps();
+    sincosf4( radians.get128(), &s, &c );
+    res0 = vec_sel( zero, c, select_x );
+    res0 = vec_sel( res0, negatef4(s), select_z );
+    res2 = vec_sel( zero, s, select_x );
+    res2 = vec_sel( res2, c, select_z );
+    return Matrix4(
+        Vector4( res0 ),
+        Vector4::yAxis( ),
+        Vector4( res2 ),
+        Vector4::wAxis( )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::rotationZ( float radians )
+{
+    return rotationZ( floatInVec(radians) );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::rotationZ( const floatInVec &radians )
+{
+    __m128 s, c, res0, res1;
+    __m128 zero;
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+    zero = _mm_setzero_ps();
+    sincosf4( radians.get128(), &s, &c );
+    res0 = vec_sel( zero, c, select_x );
+    res0 = vec_sel( res0, s, select_y );
+    res1 = vec_sel( zero, negatef4(s), select_x );
+    res1 = vec_sel( res1, c, select_y );
+    return Matrix4(
+        Vector4( res0 ),
+        Vector4( res1 ),
+        Vector4::zAxis( ),
+        Vector4::wAxis( )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::rotationZYX( const Vector3 &radiansXYZ )
+{
+    __m128 angles, s, negS, c, X0, X1, Y0, Y1, Z0, Z1, tmp;
+    angles = Vector4( radiansXYZ, 0.0f ).get128();
+    sincosf4( angles, &s, &c );
+    negS = negatef4( s );
+    Z0 = vec_mergel( c, s );
+    Z1 = vec_mergel( negS, c );
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_xyz[4] = {0xffffffff, 0xffffffff, 0xffffffff, 0};
+    Z1 = vec_and( Z1, _mm_load_ps( (float *)select_xyz ) );
+	Y0 = _mm_shuffle_ps( c, negS, _MM_SHUFFLE(0,1,1,1) );
+	Y1 = _mm_shuffle_ps( s, c, _MM_SHUFFLE(0,1,1,1) );
+    X0 = vec_splat( s, 0 );
+    X1 = vec_splat( c, 0 );
+    tmp = vec_mul( Z0, Y1 );
+    return Matrix4(
+        Vector4( vec_mul( Z0, Y0 ) ),
+        Vector4( vec_madd( Z1, X1, vec_mul( tmp, X0 ) ) ),
+        Vector4( vec_nmsub( Z1, X0, vec_mul( tmp, X1 ) ) ),
+        Vector4::wAxis( )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::rotation( float radians, const Vector3 &unitVec )
+{
+    return rotation( floatInVec(radians), unitVec );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::rotation( const floatInVec &radians, const Vector3 &unitVec )
+{
+    __m128 axis, s, c, oneMinusC, axisS, negAxisS, xxxx, yyyy, zzzz, tmp0, tmp1, tmp2;
+    axis = unitVec.get128();
+    sincosf4( radians.get128(), &s, &c );
+    xxxx = vec_splat( axis, 0 );
+    yyyy = vec_splat( axis, 1 );
+    zzzz = vec_splat( axis, 2 );
+    oneMinusC = vec_sub( _mm_set1_ps(1.0f), c );
+    axisS = vec_mul( axis, s );
+    negAxisS = negatef4( axisS );
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
+    //tmp0 = vec_perm( axisS, negAxisS, _VECTORMATH_PERM_XZBX );
+	tmp0 = _mm_shuffle_ps( axisS, axisS, _MM_SHUFFLE(0,0,2,0) );
+	tmp0 = vec_sel(tmp0, vec_splat(negAxisS, 1), select_z);
+    //tmp1 = vec_perm( axisS, negAxisS, _VECTORMATH_PERM_CXXX );
+	tmp1 = vec_sel( vec_splat(axisS, 0), vec_splat(negAxisS, 2), select_x );
+    //tmp2 = vec_perm( axisS, negAxisS, _VECTORMATH_PERM_YAXX );
+	tmp2 = _mm_shuffle_ps( axisS, axisS, _MM_SHUFFLE(0,0,0,1) );
+	tmp2 = vec_sel(tmp2, vec_splat(negAxisS, 0), select_y);
+    tmp0 = vec_sel( tmp0, c, select_x );
+    tmp1 = vec_sel( tmp1, c, select_y );
+    tmp2 = vec_sel( tmp2, c, select_z );
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_xyz[4] = {0xffffffff, 0xffffffff, 0xffffffff, 0};
+    axis = vec_and( axis, _mm_load_ps( (float *)select_xyz ) );
+    tmp0 = vec_and( tmp0, _mm_load_ps( (float *)select_xyz ) );
+    tmp1 = vec_and( tmp1, _mm_load_ps( (float *)select_xyz ) );
+    tmp2 = vec_and( tmp2, _mm_load_ps( (float *)select_xyz ) );
+    return Matrix4(
+        Vector4( vec_madd( vec_mul( axis, xxxx ), oneMinusC, tmp0 ) ),
+        Vector4( vec_madd( vec_mul( axis, yyyy ), oneMinusC, tmp1 ) ),
+        Vector4( vec_madd( vec_mul( axis, zzzz ), oneMinusC, tmp2 ) ),
+        Vector4::wAxis( )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::rotation( const Quat &unitQuat )
+{
+    return Matrix4( Transform3::rotation( unitQuat ) );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::scale( const Vector3 &scaleVec )
+{
+    __m128 zero = _mm_setzero_ps();
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
+    return Matrix4(
+        Vector4( vec_sel( zero, scaleVec.get128(), select_x ) ),
+        Vector4( vec_sel( zero, scaleVec.get128(), select_y ) ),
+        Vector4( vec_sel( zero, scaleVec.get128(), select_z ) ),
+        Vector4::wAxis( )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 appendScale( const Matrix4 & mat, const Vector3 &scaleVec )
+{
+    return Matrix4(
+        ( mat.getCol0() * scaleVec.getX( ) ),
+        ( mat.getCol1() * scaleVec.getY( ) ),
+        ( mat.getCol2() * scaleVec.getZ( ) ),
+        mat.getCol3()
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 prependScale( const Vector3 &scaleVec, const Matrix4 & mat )
+{
+    Vector4 scale4;
+    scale4 = Vector4( scaleVec, 1.0f );
+    return Matrix4(
+        mulPerElem( mat.getCol0(), scale4 ),
+        mulPerElem( mat.getCol1(), scale4 ),
+        mulPerElem( mat.getCol2(), scale4 ),
+        mulPerElem( mat.getCol3(), scale4 )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::translation( const Vector3 &translateVec )
+{
+    return Matrix4(
+        Vector4::xAxis( ),
+        Vector4::yAxis( ),
+        Vector4::zAxis( ),
+        Vector4( translateVec, 1.0f )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::lookAt( const Point3 &eyePos, const Point3 &lookAtPos, const Vector3 &upVec )
+{
+    Matrix4 m4EyeFrame;
+    Vector3 v3X, v3Y, v3Z;
+    v3Y = normalize( upVec );
+    v3Z = normalize( ( eyePos - lookAtPos ) );
+    v3X = normalize( cross( v3Y, v3Z ) );
+    v3Y = cross( v3Z, v3X );
+    m4EyeFrame = Matrix4( Vector4( v3X ), Vector4( v3Y ), Vector4( v3Z ), Vector4( eyePos ) );
+    return orthoInverse( m4EyeFrame );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::perspective( float fovyRadians, float aspect, float zNear, float zFar )
+{
+    float f, rangeInv;
+    __m128 zero, col0, col1, col2, col3;
+    union { __m128 v; float s[4]; } tmp;
+    f = tanf( _VECTORMATH_PI_OVER_2 - fovyRadians * 0.5f );
+    rangeInv = 1.0f / ( zNear - zFar );
+    zero = _mm_setzero_ps();
+    tmp.v = zero;
+    tmp.s[0] = f / aspect;
+    col0 = tmp.v;
+    tmp.v = zero;
+    tmp.s[1] = f;
+    col1 = tmp.v;
+    tmp.v = zero;
+    tmp.s[2] = ( zNear + zFar ) * rangeInv;
+    tmp.s[3] = -1.0f;
+    col2 = tmp.v;
+    tmp.v = zero;
+    tmp.s[2] = zNear * zFar * rangeInv * 2.0f;
+    col3 = tmp.v;
+    return Matrix4(
+        Vector4( col0 ),
+        Vector4( col1 ),
+        Vector4( col2 ),
+        Vector4( col3 )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::frustum( float left, float right, float bottom, float top, float zNear, float zFar )
+{
+    /* function implementation based on code from STIDC SDK:           */
+    /* --------------------------------------------------------------  */
+    /* PLEASE DO NOT MODIFY THIS SECTION                               */
+    /* This prolog section is automatically generated.                 */
+    /*                                                                 */
+    /* (C)Copyright                                                    */
+    /* Sony Computer Entertainment, Inc.,                              */
+    /* Toshiba Corporation,                                            */
+    /* International Business Machines Corporation,                    */
+    /* 2001,2002.                                                      */
+    /* S/T/I Confidential Information                                  */
+    /* --------------------------------------------------------------  */
+    __m128 lbf, rtn;
+    __m128 diff, sum, inv_diff;
+    __m128 diagonal, column, near2;
+    __m128 zero = _mm_setzero_ps();
+    union { __m128 v; float s[4]; } l, f, r, n, b, t; // TODO: Union?
+    l.s[0] = left;
+    f.s[0] = zFar;
+    r.s[0] = right;
+    n.s[0] = zNear;
+    b.s[0] = bottom;
+    t.s[0] = top;
+    lbf = vec_mergeh( l.v, f.v );
+    rtn = vec_mergeh( r.v, n.v );
+    lbf = vec_mergeh( lbf, b.v );
+    rtn = vec_mergeh( rtn, t.v );
+    diff = vec_sub( rtn, lbf );
+    sum  = vec_add( rtn, lbf );
+    inv_diff = recipf4( diff );
+    near2 = vec_splat( n.v, 0 );
+    near2 = vec_add( near2, near2 );
+    diagonal = vec_mul( near2, inv_diff );
+    column = vec_mul( sum, inv_diff );
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_w[4] = {0, 0, 0, 0xffffffff};
+    return Matrix4(
+        Vector4( vec_sel( zero, diagonal, select_x ) ),
+        Vector4( vec_sel( zero, diagonal, select_y ) ),
+        Vector4( vec_sel( column, _mm_set1_ps(-1.0f), select_w ) ),
+        Vector4( vec_sel( zero, vec_mul( diagonal, vec_splat( f.v, 0 ) ), select_z ) )
+	);
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 Matrix4::orthographic( float left, float right, float bottom, float top, float zNear, float zFar )
+{
+    /* function implementation based on code from STIDC SDK:           */
+    /* --------------------------------------------------------------  */
+    /* PLEASE DO NOT MODIFY THIS SECTION                               */
+    /* This prolog section is automatically generated.                 */
+    /*                                                                 */
+    /* (C)Copyright                                                    */
+    /* Sony Computer Entertainment, Inc.,                              */
+    /* Toshiba Corporation,                                            */
+    /* International Business Machines Corporation,                    */
+    /* 2001,2002.                                                      */
+    /* S/T/I Confidential Information                                  */
+    /* --------------------------------------------------------------  */
+    __m128 lbf, rtn;
+    __m128 diff, sum, inv_diff, neg_inv_diff;
+    __m128 diagonal, column;
+    __m128 zero = _mm_setzero_ps();
+    union { __m128 v; float s[4]; } l, f, r, n, b, t;
+    l.s[0] = left;
+    f.s[0] = zFar;
+    r.s[0] = right;
+    n.s[0] = zNear;
+    b.s[0] = bottom;
+    t.s[0] = top;
+    lbf = vec_mergeh( l.v, f.v );
+    rtn = vec_mergeh( r.v, n.v );
+    lbf = vec_mergeh( lbf, b.v );
+    rtn = vec_mergeh( rtn, t.v );
+    diff = vec_sub( rtn, lbf );
+    sum  = vec_add( rtn, lbf );
+    inv_diff = recipf4( diff );
+    neg_inv_diff = negatef4( inv_diff );
+    diagonal = vec_add( inv_diff, inv_diff );
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_w[4] = {0, 0, 0, 0xffffffff};
+    column = vec_mul( sum, vec_sel( neg_inv_diff, inv_diff, select_z ) ); // TODO: no madds with zero
+    return Matrix4(
+        Vector4( vec_sel( zero, diagonal, select_x ) ),
+        Vector4( vec_sel( zero, diagonal, select_y ) ),
+        Vector4( vec_sel( zero, diagonal, select_z ) ),
+        Vector4( vec_sel( column, _mm_set1_ps(1.0f), select_w ) )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 select( const Matrix4 & mat0, const Matrix4 & mat1, bool select1 )
+{
+    return Matrix4(
+        select( mat0.getCol0(), mat1.getCol0(), select1 ),
+        select( mat0.getCol1(), mat1.getCol1(), select1 ),
+        select( mat0.getCol2(), mat1.getCol2(), select1 ),
+        select( mat0.getCol3(), mat1.getCol3(), select1 )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 select( const Matrix4 & mat0, const Matrix4 & mat1, const boolInVec &select1 )
+{
+    return Matrix4(
+        select( mat0.getCol0(), mat1.getCol0(), select1 ),
+        select( mat0.getCol1(), mat1.getCol1(), select1 ),
+        select( mat0.getCol2(), mat1.getCol2(), select1 ),
+        select( mat0.getCol3(), mat1.getCol3(), select1 )
+    );
+}
+
+#ifdef _VECTORMATH_DEBUG
+
+VECTORMATH_FORCE_INLINE void print( const Matrix4 & mat )
+{
+    print( mat.getRow( 0 ) );
+    print( mat.getRow( 1 ) );
+    print( mat.getRow( 2 ) );
+    print( mat.getRow( 3 ) );
+}
+
+VECTORMATH_FORCE_INLINE void print( const Matrix4 & mat, const char * name )
+{
+    printf("%s:\n", name);
+    print( mat );
+}
+
+#endif
+
+VECTORMATH_FORCE_INLINE Transform3::Transform3( const Transform3 & tfrm )
+{
+    mCol0 = tfrm.mCol0;
+    mCol1 = tfrm.mCol1;
+    mCol2 = tfrm.mCol2;
+    mCol3 = tfrm.mCol3;
+}
+
+VECTORMATH_FORCE_INLINE Transform3::Transform3( float scalar )
+{
+    mCol0 = Vector3( scalar );
+    mCol1 = Vector3( scalar );
+    mCol2 = Vector3( scalar );
+    mCol3 = Vector3( scalar );
+}
+
+VECTORMATH_FORCE_INLINE Transform3::Transform3( const floatInVec &scalar )
+{
+    mCol0 = Vector3( scalar );
+    mCol1 = Vector3( scalar );
+    mCol2 = Vector3( scalar );
+    mCol3 = Vector3( scalar );
+}
+
+VECTORMATH_FORCE_INLINE Transform3::Transform3( const Vector3 &_col0, const Vector3 &_col1, const Vector3 &_col2, const Vector3 &_col3 )
+{
+    mCol0 = _col0;
+    mCol1 = _col1;
+    mCol2 = _col2;
+    mCol3 = _col3;
+}
+
+VECTORMATH_FORCE_INLINE Transform3::Transform3( const Matrix3 & tfrm, const Vector3 &translateVec )
+{
+    this->setUpper3x3( tfrm );
+    this->setTranslation( translateVec );
+}
+
+VECTORMATH_FORCE_INLINE Transform3::Transform3( const Quat &unitQuat, const Vector3 &translateVec )
+{
+    this->setUpper3x3( Matrix3( unitQuat ) );
+    this->setTranslation( translateVec );
+}
+
+VECTORMATH_FORCE_INLINE Transform3 & Transform3::setCol0( const Vector3 &_col0 )
+{
+    mCol0 = _col0;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Transform3 & Transform3::setCol1( const Vector3 &_col1 )
+{
+    mCol1 = _col1;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Transform3 & Transform3::setCol2( const Vector3 &_col2 )
+{
+    mCol2 = _col2;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Transform3 & Transform3::setCol3( const Vector3 &_col3 )
+{
+    mCol3 = _col3;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Transform3 & Transform3::setCol( int col, const Vector3 &vec )
+{
+    *(&mCol0 + col) = vec;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Transform3 & Transform3::setRow( int row, const Vector4 &vec )
+{
+    mCol0.setElem( row, vec.getElem( 0 ) );
+    mCol1.setElem( row, vec.getElem( 1 ) );
+    mCol2.setElem( row, vec.getElem( 2 ) );
+    mCol3.setElem( row, vec.getElem( 3 ) );
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Transform3 & Transform3::setElem( int col, int row, float val )
+{
+    (*this)[col].setElem(row, val);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Transform3 & Transform3::setElem( int col, int row, const floatInVec &val )
+{
+    Vector3 tmpV3_0;
+    tmpV3_0 = this->getCol( col );
+    tmpV3_0.setElem( row, val );
+    this->setCol( col, tmpV3_0 );
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec Transform3::getElem( int col, int row ) const
+{
+    return this->getCol( col ).getElem( row );
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Transform3::getCol0( ) const
+{
+    return mCol0;
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Transform3::getCol1( ) const
+{
+    return mCol1;
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Transform3::getCol2( ) const
+{
+    return mCol2;
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Transform3::getCol3( ) const
+{
+    return mCol3;
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Transform3::getCol( int col ) const
+{
+    return *(&mCol0 + col);
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 Transform3::getRow( int row ) const
+{
+    return Vector4( mCol0.getElem( row ), mCol1.getElem( row ), mCol2.getElem( row ), mCol3.getElem( row ) );
+}
+
+VECTORMATH_FORCE_INLINE Vector3 & Transform3::operator []( int col )
+{
+    return *(&mCol0 + col);
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Transform3::operator []( int col ) const
+{
+    return *(&mCol0 + col);
+}
+
+VECTORMATH_FORCE_INLINE Transform3 & Transform3::operator =( const Transform3 & tfrm )
+{
+    mCol0 = tfrm.mCol0;
+    mCol1 = tfrm.mCol1;
+    mCol2 = tfrm.mCol2;
+    mCol3 = tfrm.mCol3;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const Transform3 inverse( const Transform3 & tfrm )
+{
+    __m128 inv0, inv1, inv2, inv3;
+    __m128 tmp0, tmp1, tmp2, tmp3, tmp4, dot, invdet;
+    __m128 xxxx, yyyy, zzzz;
+    tmp2 = _vmathVfCross( tfrm.getCol0().get128(), tfrm.getCol1().get128() );
+    tmp0 = _vmathVfCross( tfrm.getCol1().get128(), tfrm.getCol2().get128() );
+    tmp1 = _vmathVfCross( tfrm.getCol2().get128(), tfrm.getCol0().get128() );
+    inv3 = negatef4( tfrm.getCol3().get128() );
+    dot = _vmathVfDot3( tmp2, tfrm.getCol2().get128() );
+    dot = vec_splat( dot, 0 );
+    invdet = recipf4( dot );
+    tmp3 = vec_mergeh( tmp0, tmp2 );
+    tmp4 = vec_mergel( tmp0, tmp2 );
+    inv0 = vec_mergeh( tmp3, tmp1 );
+    xxxx = vec_splat( inv3, 0 );
+    //inv1 = vec_perm( tmp3, tmp1, _VECTORMATH_PERM_ZBWX );
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+	inv1 = _mm_shuffle_ps( tmp3, tmp3, _MM_SHUFFLE(0,3,2,2));
+	inv1 = vec_sel(inv1, tmp1, select_y);
+    //inv2 = vec_perm( tmp4, tmp1, _VECTORMATH_PERM_XCYX );
+	inv2 = _mm_shuffle_ps( tmp4, tmp4, _MM_SHUFFLE(0,1,1,0));
+	inv2 = vec_sel(inv2, vec_splat(tmp1, 2), select_y);
+    yyyy = vec_splat( inv3, 1 );
+    zzzz = vec_splat( inv3, 2 );
+    inv3 = vec_mul( inv0, xxxx );
+    inv3 = vec_madd( inv1, yyyy, inv3 );
+    inv3 = vec_madd( inv2, zzzz, inv3 );
+    inv0 = vec_mul( inv0, invdet );
+    inv1 = vec_mul( inv1, invdet );
+    inv2 = vec_mul( inv2, invdet );
+    inv3 = vec_mul( inv3, invdet );
+    return Transform3(
+        Vector3( inv0 ),
+        Vector3( inv1 ),
+        Vector3( inv2 ),
+        Vector3( inv3 )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Transform3 orthoInverse( const Transform3 & tfrm )
+{
+    __m128 inv0, inv1, inv2, inv3;
+    __m128 tmp0, tmp1;
+    __m128 xxxx, yyyy, zzzz;
+    tmp0 = vec_mergeh( tfrm.getCol0().get128(), tfrm.getCol2().get128() );
+    tmp1 = vec_mergel( tfrm.getCol0().get128(), tfrm.getCol2().get128() );
+    inv3 = negatef4( tfrm.getCol3().get128() );
+    inv0 = vec_mergeh( tmp0, tfrm.getCol1().get128() );
+    xxxx = vec_splat( inv3, 0 );
+    //inv1 = vec_perm( tmp0, tfrm.getCol1().get128(), _VECTORMATH_PERM_ZBWX );
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+	inv1 = _mm_shuffle_ps( tmp0, tmp0, _MM_SHUFFLE(0,3,2,2));
+	inv1 = vec_sel(inv1, tfrm.getCol1().get128(), select_y);
+    //inv2 = vec_perm( tmp1, tfrm.getCol1().get128(), _VECTORMATH_PERM_XCYX );
+	inv2 = _mm_shuffle_ps( tmp1, tmp1, _MM_SHUFFLE(0,1,1,0));
+	inv2 = vec_sel(inv2, vec_splat(tfrm.getCol1().get128(), 2), select_y);
+    yyyy = vec_splat( inv3, 1 );
+    zzzz = vec_splat( inv3, 2 );
+    inv3 = vec_mul( inv0, xxxx );
+    inv3 = vec_madd( inv1, yyyy, inv3 );
+    inv3 = vec_madd( inv2, zzzz, inv3 );
+    return Transform3(
+        Vector3( inv0 ),
+        Vector3( inv1 ),
+        Vector3( inv2 ),
+        Vector3( inv3 )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Transform3 absPerElem( const Transform3 & tfrm )
+{
+    return Transform3(
+        absPerElem( tfrm.getCol0() ),
+        absPerElem( tfrm.getCol1() ),
+        absPerElem( tfrm.getCol2() ),
+        absPerElem( tfrm.getCol3() )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Transform3::operator *( const Vector3 &vec ) const
+{
+    __m128 res;
+    __m128 xxxx, yyyy, zzzz;
+    xxxx = vec_splat( vec.get128(), 0 );
+    yyyy = vec_splat( vec.get128(), 1 );
+    zzzz = vec_splat( vec.get128(), 2 );
+    res = vec_mul( mCol0.get128(), xxxx );
+    res = vec_madd( mCol1.get128(), yyyy, res );
+    res = vec_madd( mCol2.get128(), zzzz, res );
+    return Vector3( res );
+}
+
+VECTORMATH_FORCE_INLINE const Point3 Transform3::operator *( const Point3 &pnt ) const
+{
+    __m128 tmp0, tmp1, res;
+    __m128 xxxx, yyyy, zzzz;
+    xxxx = vec_splat( pnt.get128(), 0 );
+    yyyy = vec_splat( pnt.get128(), 1 );
+    zzzz = vec_splat( pnt.get128(), 2 );
+    tmp0 = vec_mul( mCol0.get128(), xxxx );
+    tmp1 = vec_mul( mCol1.get128(), yyyy );
+    tmp0 = vec_madd( mCol2.get128(), zzzz, tmp0 );
+    tmp1 = vec_add( mCol3.get128(), tmp1 );
+    res = vec_add( tmp0, tmp1 );
+    return Point3( res );
+}
+
+VECTORMATH_FORCE_INLINE const Transform3 Transform3::operator *( const Transform3 & tfrm ) const
+{
+    return Transform3(
+        ( *this * tfrm.mCol0 ),
+        ( *this * tfrm.mCol1 ),
+        ( *this * tfrm.mCol2 ),
+        Vector3( ( *this * Point3( tfrm.mCol3 ) ) )
+    );
+}
+
+VECTORMATH_FORCE_INLINE Transform3 & Transform3::operator *=( const Transform3 & tfrm )
+{
+    *this = *this * tfrm;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const Transform3 mulPerElem( const Transform3 & tfrm0, const Transform3 & tfrm1 )
+{
+    return Transform3(
+        mulPerElem( tfrm0.getCol0(), tfrm1.getCol0() ),
+        mulPerElem( tfrm0.getCol1(), tfrm1.getCol1() ),
+        mulPerElem( tfrm0.getCol2(), tfrm1.getCol2() ),
+        mulPerElem( tfrm0.getCol3(), tfrm1.getCol3() )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Transform3 Transform3::identity( )
+{
+    return Transform3(
+        Vector3::xAxis( ),
+        Vector3::yAxis( ),
+        Vector3::zAxis( ),
+        Vector3( 0.0f )
+    );
+}
+
+VECTORMATH_FORCE_INLINE Transform3 & Transform3::setUpper3x3( const Matrix3 & tfrm )
+{
+    mCol0 = tfrm.getCol0();
+    mCol1 = tfrm.getCol1();
+    mCol2 = tfrm.getCol2();
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 Transform3::getUpper3x3( ) const
+{
+    return Matrix3( mCol0, mCol1, mCol2 );
+}
+
+VECTORMATH_FORCE_INLINE Transform3 & Transform3::setTranslation( const Vector3 &translateVec )
+{
+    mCol3 = translateVec;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Transform3::getTranslation( ) const
+{
+    return mCol3;
+}
+
+VECTORMATH_FORCE_INLINE const Transform3 Transform3::rotationX( float radians )
+{
+    return rotationX( floatInVec(radians) );
+}
+
+VECTORMATH_FORCE_INLINE const Transform3 Transform3::rotationX( const floatInVec &radians )
+{
+    __m128 s, c, res1, res2;
+    __m128 zero;
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
+    zero = _mm_setzero_ps();
+    sincosf4( radians.get128(), &s, &c );
+    res1 = vec_sel( zero, c, select_y );
+    res1 = vec_sel( res1, s, select_z );
+    res2 = vec_sel( zero, negatef4(s), select_y );
+    res2 = vec_sel( res2, c, select_z );
+    return Transform3(
+        Vector3::xAxis( ),
+        Vector3( res1 ),
+        Vector3( res2 ),
+        Vector3( _mm_setzero_ps() )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Transform3 Transform3::rotationY( float radians )
+{
+    return rotationY( floatInVec(radians) );
+}
+
+VECTORMATH_FORCE_INLINE const Transform3 Transform3::rotationY( const floatInVec &radians )
+{
+    __m128 s, c, res0, res2;
+    __m128 zero;
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
+    zero = _mm_setzero_ps();
+    sincosf4( radians.get128(), &s, &c );
+    res0 = vec_sel( zero, c, select_x );
+    res0 = vec_sel( res0, negatef4(s), select_z );
+    res2 = vec_sel( zero, s, select_x );
+    res2 = vec_sel( res2, c, select_z );
+    return Transform3(
+        Vector3( res0 ),
+        Vector3::yAxis( ),
+        Vector3( res2 ),
+        Vector3( 0.0f )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Transform3 Transform3::rotationZ( float radians )
+{
+    return rotationZ( floatInVec(radians) );
+}
+
+VECTORMATH_FORCE_INLINE const Transform3 Transform3::rotationZ( const floatInVec &radians )
+{
+    __m128 s, c, res0, res1;
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+    __m128 zero = _mm_setzero_ps();
+    sincosf4( radians.get128(), &s, &c );
+    res0 = vec_sel( zero, c, select_x );
+    res0 = vec_sel( res0, s, select_y );
+    res1 = vec_sel( zero, negatef4(s), select_x );
+    res1 = vec_sel( res1, c, select_y );
+    return Transform3(
+        Vector3( res0 ),
+        Vector3( res1 ),
+        Vector3::zAxis( ),
+        Vector3( 0.0f )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Transform3 Transform3::rotationZYX( const Vector3 &radiansXYZ )
+{
+    __m128 angles, s, negS, c, X0, X1, Y0, Y1, Z0, Z1, tmp;
+    angles = Vector4( radiansXYZ, 0.0f ).get128();
+    sincosf4( angles, &s, &c );
+    negS = negatef4( s );
+    Z0 = vec_mergel( c, s );
+    Z1 = vec_mergel( negS, c );
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_xyz[4] = {0xffffffff, 0xffffffff, 0xffffffff, 0};
+    Z1 = vec_and( Z1, _mm_load_ps( (float *)select_xyz ) );
+	Y0 = _mm_shuffle_ps( c, negS, _MM_SHUFFLE(0,1,1,1) );
+	Y1 = _mm_shuffle_ps( s, c, _MM_SHUFFLE(0,1,1,1) );
+    X0 = vec_splat( s, 0 );
+    X1 = vec_splat( c, 0 );
+    tmp = vec_mul( Z0, Y1 );
+    return Transform3(
+        Vector3( vec_mul( Z0, Y0 ) ),
+        Vector3( vec_madd( Z1, X1, vec_mul( tmp, X0 ) ) ),
+        Vector3( vec_nmsub( Z1, X0, vec_mul( tmp, X1 ) ) ),
+        Vector3( 0.0f )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Transform3 Transform3::rotation( float radians, const Vector3 &unitVec )
+{
+    return rotation( floatInVec(radians), unitVec );
+}
+
+VECTORMATH_FORCE_INLINE const Transform3 Transform3::rotation( const floatInVec &radians, const Vector3 &unitVec )
+{
+    return Transform3( Matrix3::rotation( radians, unitVec ), Vector3( 0.0f ) );
+}
+
+VECTORMATH_FORCE_INLINE const Transform3 Transform3::rotation( const Quat &unitQuat )
+{
+    return Transform3( Matrix3( unitQuat ), Vector3( 0.0f ) );
+}
+
+VECTORMATH_FORCE_INLINE const Transform3 Transform3::scale( const Vector3 &scaleVec )
+{
+    __m128 zero = _mm_setzero_ps();
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
+    return Transform3(
+        Vector3( vec_sel( zero, scaleVec.get128(), select_x ) ),
+        Vector3( vec_sel( zero, scaleVec.get128(), select_y ) ),
+        Vector3( vec_sel( zero, scaleVec.get128(), select_z ) ),
+        Vector3( 0.0f )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Transform3 appendScale( const Transform3 & tfrm, const Vector3 &scaleVec )
+{
+    return Transform3(
+        ( tfrm.getCol0() * scaleVec.getX( ) ),
+        ( tfrm.getCol1() * scaleVec.getY( ) ),
+        ( tfrm.getCol2() * scaleVec.getZ( ) ),
+        tfrm.getCol3()
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Transform3 prependScale( const Vector3 &scaleVec, const Transform3 & tfrm )
+{
+    return Transform3(
+        mulPerElem( tfrm.getCol0(), scaleVec ),
+        mulPerElem( tfrm.getCol1(), scaleVec ),
+        mulPerElem( tfrm.getCol2(), scaleVec ),
+        mulPerElem( tfrm.getCol3(), scaleVec )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Transform3 Transform3::translation( const Vector3 &translateVec )
+{
+    return Transform3(
+        Vector3::xAxis( ),
+        Vector3::yAxis( ),
+        Vector3::zAxis( ),
+        translateVec
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Transform3 select( const Transform3 & tfrm0, const Transform3 & tfrm1, bool select1 )
+{
+    return Transform3(
+        select( tfrm0.getCol0(), tfrm1.getCol0(), select1 ),
+        select( tfrm0.getCol1(), tfrm1.getCol1(), select1 ),
+        select( tfrm0.getCol2(), tfrm1.getCol2(), select1 ),
+        select( tfrm0.getCol3(), tfrm1.getCol3(), select1 )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Transform3 select( const Transform3 & tfrm0, const Transform3 & tfrm1, const boolInVec &select1 )
+{
+    return Transform3(
+        select( tfrm0.getCol0(), tfrm1.getCol0(), select1 ),
+        select( tfrm0.getCol1(), tfrm1.getCol1(), select1 ),
+        select( tfrm0.getCol2(), tfrm1.getCol2(), select1 ),
+        select( tfrm0.getCol3(), tfrm1.getCol3(), select1 )
+    );
+}
+
+#ifdef _VECTORMATH_DEBUG
+
+VECTORMATH_FORCE_INLINE void print( const Transform3 & tfrm )
+{
+    print( tfrm.getRow( 0 ) );
+    print( tfrm.getRow( 1 ) );
+    print( tfrm.getRow( 2 ) );
+}
+
+VECTORMATH_FORCE_INLINE void print( const Transform3 & tfrm, const char * name )
+{
+    printf("%s:\n", name);
+    print( tfrm );
+}
+
+#endif
+
+VECTORMATH_FORCE_INLINE Quat::Quat( const Matrix3 & tfrm )
+{
+    __m128 res;
+    __m128 col0, col1, col2;
+    __m128 xx_yy, xx_yy_zz_xx, yy_zz_xx_yy, zz_xx_yy_zz, diagSum, diagDiff;
+    __m128 zy_xz_yx, yz_zx_xy, sum, diff;
+    __m128 radicand, invSqrt, scale;
+    __m128 res0, res1, res2, res3;
+    __m128 xx, yy, zz;
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_w[4] = {0, 0, 0, 0xffffffff};
+
+    col0 = tfrm.getCol0().get128();
+    col1 = tfrm.getCol1().get128();
+    col2 = tfrm.getCol2().get128();
+
+    /* four cases: */
+    /* trace > 0 */
+    /* else */
+    /*    xx largest diagonal element */
+    /*    yy largest diagonal element */
+    /*    zz largest diagonal element */
+
+    /* compute quaternion for each case */
+
+    xx_yy = vec_sel( col0, col1, select_y );
+    //xx_yy_zz_xx = vec_perm( xx_yy, col2, _VECTORMATH_PERM_XYCX );
+    //yy_zz_xx_yy = vec_perm( xx_yy, col2, _VECTORMATH_PERM_YCXY );
+    //zz_xx_yy_zz = vec_perm( xx_yy, col2, _VECTORMATH_PERM_CXYC );
+    xx_yy_zz_xx = _mm_shuffle_ps( xx_yy, xx_yy, _MM_SHUFFLE(0,0,1,0) );
+    xx_yy_zz_xx = vec_sel( xx_yy_zz_xx, col2, select_z ); // TODO: Ck
+    yy_zz_xx_yy = _mm_shuffle_ps( xx_yy_zz_xx, xx_yy_zz_xx, _MM_SHUFFLE(1,0,2,1) );
+    zz_xx_yy_zz = _mm_shuffle_ps( xx_yy_zz_xx, xx_yy_zz_xx, _MM_SHUFFLE(2,1,0,2) );
+
+    diagSum = vec_add( vec_add( xx_yy_zz_xx, yy_zz_xx_yy ), zz_xx_yy_zz );
+    diagDiff = vec_sub( vec_sub( xx_yy_zz_xx, yy_zz_xx_yy ), zz_xx_yy_zz );
+    radicand = vec_add( vec_sel( diagDiff, diagSum, select_w ), _mm_set1_ps(1.0f) );
+ //   invSqrt = rsqrtf4( radicand );
+	invSqrt = newtonrapson_rsqrt4( radicand );
+
+	
+
+    zy_xz_yx = vec_sel( col0, col1, select_z );									// zy_xz_yx = 00 01 12 03
+    //zy_xz_yx = vec_perm( zy_xz_yx, col2, _VECTORMATH_PERM_ZAYX );
+	zy_xz_yx = _mm_shuffle_ps( zy_xz_yx, zy_xz_yx, _MM_SHUFFLE(0,1,2,2) );		// zy_xz_yx = 12 12 01 00
+    zy_xz_yx = vec_sel( zy_xz_yx, vec_splat(col2, 0), select_y );				// zy_xz_yx = 12 20 01 00
+    yz_zx_xy = vec_sel( col0, col1, select_x );									// yz_zx_xy = 10 01 02 03
+    //yz_zx_xy = vec_perm( yz_zx_xy, col2, _VECTORMATH_PERM_BZXX );
+	yz_zx_xy = _mm_shuffle_ps( yz_zx_xy, yz_zx_xy, _MM_SHUFFLE(0,0,2,0) );		// yz_zx_xy = 10 02 10 10
+	yz_zx_xy = vec_sel( yz_zx_xy, vec_splat(col2, 1), select_x );				// yz_zx_xy = 21 02 10 10
+
+    sum = vec_add( zy_xz_yx, yz_zx_xy );
+    diff = vec_sub( zy_xz_yx, yz_zx_xy );
+
+    scale = vec_mul( invSqrt, _mm_set1_ps(0.5f) );
+
+    //res0 = vec_perm( sum, diff, _VECTORMATH_PERM_XZYA );
+	res0 = _mm_shuffle_ps( sum, sum, _MM_SHUFFLE(0,1,2,0) );
+	res0 = vec_sel( res0, vec_splat(diff, 0), select_w );  // TODO: Ck
+    //res1 = vec_perm( sum, diff, _VECTORMATH_PERM_ZXXB );
+	res1 = _mm_shuffle_ps( sum, sum, _MM_SHUFFLE(0,0,0,2) );
+	res1 = vec_sel( res1, vec_splat(diff, 1), select_w );  // TODO: Ck
+    //res2 = vec_perm( sum, diff, _VECTORMATH_PERM_YXXC );
+	res2 = _mm_shuffle_ps( sum, sum, _MM_SHUFFLE(0,0,0,1) );
+	res2 = vec_sel( res2, vec_splat(diff, 2), select_w );  // TODO: Ck
+    res3 = diff;
+    res0 = vec_sel( res0, radicand, select_x );
+    res1 = vec_sel( res1, radicand, select_y );
+    res2 = vec_sel( res2, radicand, select_z );
+    res3 = vec_sel( res3, radicand, select_w );
+    res0 = vec_mul( res0, vec_splat( scale, 0 ) );
+    res1 = vec_mul( res1, vec_splat( scale, 1 ) );
+    res2 = vec_mul( res2, vec_splat( scale, 2 ) );
+    res3 = vec_mul( res3, vec_splat( scale, 3 ) );
+
+    /* determine case and select answer */
+
+    xx = vec_splat( col0, 0 );
+    yy = vec_splat( col1, 1 );
+    zz = vec_splat( col2, 2 );
+    res = vec_sel( res0, res1, vec_cmpgt( yy, xx ) );
+    res = vec_sel( res, res2, vec_and( vec_cmpgt( zz, xx ), vec_cmpgt( zz, yy ) ) );
+    res = vec_sel( res, res3, vec_cmpgt( vec_splat( diagSum, 0 ), _mm_setzero_ps() ) );
+    mVec128 = res;
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 outer( const Vector3 &tfrm0, const Vector3 &tfrm1 )
+{
+    return Matrix3(
+        ( tfrm0 * tfrm1.getX( ) ),
+        ( tfrm0 * tfrm1.getY( ) ),
+        ( tfrm0 * tfrm1.getZ( ) )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix4 outer( const Vector4 &tfrm0, const Vector4 &tfrm1 )
+{
+    return Matrix4(
+        ( tfrm0 * tfrm1.getX( ) ),
+        ( tfrm0 * tfrm1.getY( ) ),
+        ( tfrm0 * tfrm1.getZ( ) ),
+        ( tfrm0 * tfrm1.getW( ) )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 rowMul( const Vector3 &vec, const Matrix3 & mat )
+{
+    __m128 tmp0, tmp1, mcol0, mcol1, mcol2, res;
+    __m128 xxxx, yyyy, zzzz;
+    tmp0 = vec_mergeh( mat.getCol0().get128(), mat.getCol2().get128() );
+    tmp1 = vec_mergel( mat.getCol0().get128(), mat.getCol2().get128() );
+    xxxx = vec_splat( vec.get128(), 0 );
+    mcol0 = vec_mergeh( tmp0, mat.getCol1().get128() );
+    //mcol1 = vec_perm( tmp0, mat.getCol1().get128(), _VECTORMATH_PERM_ZBWX );
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+	mcol1 = _mm_shuffle_ps( tmp0, tmp0, _MM_SHUFFLE(0,3,2,2));
+	mcol1 = vec_sel(mcol1, mat.getCol1().get128(), select_y);
+    //mcol2 = vec_perm( tmp1, mat.getCol1().get128(), _VECTORMATH_PERM_XCYX );
+	mcol2 = _mm_shuffle_ps( tmp1, tmp1, _MM_SHUFFLE(0,1,1,0));
+	mcol2 = vec_sel(mcol2, vec_splat(mat.getCol1().get128(), 2), select_y);
+    yyyy = vec_splat( vec.get128(), 1 );
+    res = vec_mul( mcol0, xxxx );
+    zzzz = vec_splat( vec.get128(), 2 );
+    res = vec_madd( mcol1, yyyy, res );
+    res = vec_madd( mcol2, zzzz, res );
+    return Vector3( res );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 crossMatrix( const Vector3 &vec )
+{
+    __m128 neg, res0, res1, res2;
+    neg = negatef4( vec.get128() );
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
+    //res0 = vec_perm( vec.get128(), neg, _VECTORMATH_PERM_XZBX );
+	res0 = _mm_shuffle_ps( vec.get128(), vec.get128(), _MM_SHUFFLE(0,2,2,0) );
+	res0 = vec_sel(res0, vec_splat(neg, 1), select_z);
+    //res1 = vec_perm( vec.get128(), neg, _VECTORMATH_PERM_CXXX );
+	res1 = vec_sel(vec_splat(vec.get128(), 0), vec_splat(neg, 2), select_x);
+    //res2 = vec_perm( vec.get128(), neg, _VECTORMATH_PERM_YAXX );
+	res2 = _mm_shuffle_ps( vec.get128(), vec.get128(), _MM_SHUFFLE(0,0,1,1) );
+	res2 = vec_sel(res2, vec_splat(neg, 0), select_y);
+	VM_ATTRIBUTE_ALIGN16 unsigned int filter_x[4] = {0, 0xffffffff, 0xffffffff, 0xffffffff};
+	VM_ATTRIBUTE_ALIGN16 unsigned int filter_y[4] = {0xffffffff, 0, 0xffffffff, 0xffffffff};
+	VM_ATTRIBUTE_ALIGN16 unsigned int filter_z[4] = {0xffffffff, 0xffffffff, 0, 0xffffffff};
+    res0 = vec_and( res0, _mm_load_ps((float *)filter_x ) );
+    res1 = vec_and( res1, _mm_load_ps((float *)filter_y ) );
+    res2 = vec_and( res2, _mm_load_ps((float *)filter_z ) ); // TODO: Use selects?
+    return Matrix3(
+        Vector3( res0 ),
+        Vector3( res1 ),
+        Vector3( res2 )
+    );
+}
+
+VECTORMATH_FORCE_INLINE const Matrix3 crossMatrixMul( const Vector3 &vec, const Matrix3 & mat )
+{
+    return Matrix3( cross( vec, mat.getCol0() ), cross( vec, mat.getCol1() ), cross( vec, mat.getCol2() ) );
+}
+
+} // namespace Aos
+} // namespace Vectormath
+
+#endif
diff --git a/test/Bullet2/vectormath/sse/quat_aos.h b/test/Bullet2/vectormath/sse/quat_aos.h
new file mode 100644
index 000000000..7eac59fe5
--- /dev/null
+++ b/test/Bullet2/vectormath/sse/quat_aos.h
@@ -0,0 +1,579 @@
+/*
+   Copyright (C) 2006, 2007 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+   Redistribution and use in source and binary forms,
+   with or without modification, are permitted provided that the
+   following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the Sony Computer Entertainment Inc nor the names
+      of its contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
+*/
+
+
+#ifndef _VECTORMATH_QUAT_AOS_CPP_H
+#define _VECTORMATH_QUAT_AOS_CPP_H
+
+//-----------------------------------------------------------------------------
+// Definitions
+
+#ifndef _VECTORMATH_INTERNAL_FUNCTIONS
+#define _VECTORMATH_INTERNAL_FUNCTIONS
+
+#endif
+
+namespace Vectormath {
+namespace Aos {
+
+VECTORMATH_FORCE_INLINE void Quat::set128(vec_float4 vec)
+{
+    mVec128 = vec;
+}
+
+VECTORMATH_FORCE_INLINE Quat::Quat( const floatInVec &_x, const floatInVec &_y, const floatInVec &_z, const floatInVec &_w )
+{
+	mVec128 = _mm_unpacklo_ps(
+		_mm_unpacklo_ps( _x.get128(), _z.get128() ),
+		_mm_unpacklo_ps( _y.get128(), _w.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE Quat::Quat( const Vector3 &xyz, float _w )
+{
+    mVec128 = xyz.get128();
+    _vmathVfSetElement(mVec128, _w, 3);
+}
+
+
+
+VECTORMATH_FORCE_INLINE  Quat::Quat(const Quat& quat)
+{
+	mVec128 = quat.get128();
+}
+
+VECTORMATH_FORCE_INLINE Quat::Quat( float _x, float _y, float _z, float _w )
+{
+	mVec128 = _mm_setr_ps(_x, _y, _z, _w);
+}
+
+
+
+
+
+VECTORMATH_FORCE_INLINE Quat::Quat( const Vector3 &xyz, const floatInVec &_w )
+{
+    mVec128 = xyz.get128();
+    mVec128 = _vmathVfInsert(mVec128, _w.get128(), 3);
+}
+
+VECTORMATH_FORCE_INLINE Quat::Quat( const Vector4 &vec )
+{
+    mVec128 = vec.get128();
+}
+
+VECTORMATH_FORCE_INLINE Quat::Quat( float scalar )
+{
+    mVec128 = floatInVec(scalar).get128();
+}
+
+VECTORMATH_FORCE_INLINE Quat::Quat( const floatInVec &scalar )
+{
+    mVec128 = scalar.get128();
+}
+
+VECTORMATH_FORCE_INLINE Quat::Quat( __m128 vf4 )
+{
+    mVec128 = vf4;
+}
+
+VECTORMATH_FORCE_INLINE const Quat Quat::identity( )
+{
+    return Quat( _VECTORMATH_UNIT_0001 );
+}
+
+VECTORMATH_FORCE_INLINE const Quat lerp( float t, const Quat &quat0, const Quat &quat1 )
+{
+    return lerp( floatInVec(t), quat0, quat1 );
+}
+
+VECTORMATH_FORCE_INLINE const Quat lerp( const floatInVec &t, const Quat &quat0, const Quat &quat1 )
+{
+    return ( quat0 + ( ( quat1 - quat0 ) * t ) );
+}
+
+VECTORMATH_FORCE_INLINE const Quat slerp( float t, const Quat &unitQuat0, const Quat &unitQuat1 )
+{
+    return slerp( floatInVec(t), unitQuat0, unitQuat1 );
+}
+
+VECTORMATH_FORCE_INLINE const Quat slerp( const floatInVec &t, const Quat &unitQuat0, const Quat &unitQuat1 )
+{
+    Quat start;
+    vec_float4 scales, scale0, scale1, cosAngle, angle, tttt, oneMinusT, angles, sines;
+    __m128 selectMask;
+    cosAngle = _vmathVfDot4( unitQuat0.get128(), unitQuat1.get128() );
+    selectMask = (__m128)vec_cmpgt( _mm_setzero_ps(), cosAngle );
+    cosAngle = vec_sel( cosAngle, negatef4( cosAngle ), selectMask );
+    start = Quat( vec_sel( unitQuat0.get128(), negatef4( unitQuat0.get128() ), selectMask ) );
+    selectMask = (__m128)vec_cmpgt( _mm_set1_ps(_VECTORMATH_SLERP_TOL), cosAngle );
+    angle = acosf4( cosAngle );
+    tttt = t.get128();
+    oneMinusT = vec_sub( _mm_set1_ps(1.0f), tttt );
+    angles = vec_mergeh( _mm_set1_ps(1.0f), tttt );
+    angles = vec_mergeh( angles, oneMinusT );
+    angles = vec_madd( angles, angle, _mm_setzero_ps() );
+    sines = sinf4( angles );
+    scales = _mm_div_ps( sines, vec_splat( sines, 0 ) );
+    scale0 = vec_sel( oneMinusT, vec_splat( scales, 1 ), selectMask );
+    scale1 = vec_sel( tttt, vec_splat( scales, 2 ), selectMask );
+    return Quat( vec_madd( start.get128(), scale0, vec_mul( unitQuat1.get128(), scale1 ) ) );
+}
+
+VECTORMATH_FORCE_INLINE const Quat squad( float t, const Quat &unitQuat0, const Quat &unitQuat1, const Quat &unitQuat2, const Quat &unitQuat3 )
+{
+    return squad( floatInVec(t), unitQuat0, unitQuat1, unitQuat2, unitQuat3 );
+}
+
+VECTORMATH_FORCE_INLINE const Quat squad( const floatInVec &t, const Quat &unitQuat0, const Quat &unitQuat1, const Quat &unitQuat2, const Quat &unitQuat3 )
+{
+    return slerp( ( ( floatInVec(2.0f) * t ) * ( floatInVec(1.0f) - t ) ), slerp( t, unitQuat0, unitQuat3 ), slerp( t, unitQuat1, unitQuat2 ) );
+}
+
+VECTORMATH_FORCE_INLINE __m128 Quat::get128( ) const
+{
+    return mVec128;
+}
+
+VECTORMATH_FORCE_INLINE Quat & Quat::operator =( const Quat &quat )
+{
+    mVec128 = quat.mVec128;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Quat & Quat::setXYZ( const Vector3 &vec )
+{
+	VM_ATTRIBUTE_ALIGN16 unsigned int sw[4] = {0, 0, 0, 0xffffffff};
+	mVec128 = vec_sel( vec.get128(), mVec128, sw );
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Quat::getXYZ( ) const
+{
+    return Vector3( mVec128 );
+}
+
+VECTORMATH_FORCE_INLINE Quat & Quat::setX( float _x )
+{
+    _vmathVfSetElement(mVec128, _x, 0);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Quat & Quat::setX( const floatInVec &_x )
+{
+    mVec128 = _vmathVfInsert(mVec128, _x.get128(), 0);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec Quat::getX( ) const
+{
+    return floatInVec( mVec128, 0 );
+}
+
+VECTORMATH_FORCE_INLINE Quat & Quat::setY( float _y )
+{
+    _vmathVfSetElement(mVec128, _y, 1);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Quat & Quat::setY( const floatInVec &_y )
+{
+    mVec128 = _vmathVfInsert(mVec128, _y.get128(), 1);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec Quat::getY( ) const
+{
+    return floatInVec( mVec128, 1 );
+}
+
+VECTORMATH_FORCE_INLINE Quat & Quat::setZ( float _z )
+{
+    _vmathVfSetElement(mVec128, _z, 2);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Quat & Quat::setZ( const floatInVec &_z )
+{
+    mVec128 = _vmathVfInsert(mVec128, _z.get128(), 2);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec Quat::getZ( ) const
+{
+    return floatInVec( mVec128, 2 );
+}
+
+VECTORMATH_FORCE_INLINE Quat & Quat::setW( float _w )
+{
+    _vmathVfSetElement(mVec128, _w, 3);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Quat & Quat::setW( const floatInVec &_w )
+{
+    mVec128 = _vmathVfInsert(mVec128, _w.get128(), 3);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec Quat::getW( ) const
+{
+    return floatInVec( mVec128, 3 );
+}
+
+VECTORMATH_FORCE_INLINE Quat & Quat::setElem( int idx, float value )
+{
+    _vmathVfSetElement(mVec128, value, idx);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Quat & Quat::setElem( int idx, const floatInVec &value )
+{
+    mVec128 = _vmathVfInsert(mVec128, value.get128(), idx);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec Quat::getElem( int idx ) const
+{
+    return floatInVec( mVec128, idx );
+}
+
+VECTORMATH_FORCE_INLINE VecIdx Quat::operator []( int idx )
+{
+    return VecIdx( mVec128, idx );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec Quat::operator []( int idx ) const
+{
+    return floatInVec( mVec128, idx );
+}
+
+VECTORMATH_FORCE_INLINE const Quat Quat::operator +( const Quat &quat ) const
+{
+    return Quat( _mm_add_ps( mVec128, quat.mVec128 ) );
+}
+
+
+VECTORMATH_FORCE_INLINE const Quat Quat::operator -( const Quat &quat ) const
+{
+    return Quat( _mm_sub_ps( mVec128, quat.mVec128 ) );
+}
+
+VECTORMATH_FORCE_INLINE const Quat Quat::operator *( float scalar ) const
+{
+    return *this * floatInVec(scalar);
+}
+
+VECTORMATH_FORCE_INLINE const Quat Quat::operator *( const floatInVec &scalar ) const
+{
+    return Quat( _mm_mul_ps( mVec128, scalar.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE Quat & Quat::operator +=( const Quat &quat )
+{
+    *this = *this + quat;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Quat & Quat::operator -=( const Quat &quat )
+{
+    *this = *this - quat;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Quat & Quat::operator *=( float scalar )
+{
+    *this = *this * scalar;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Quat & Quat::operator *=( const floatInVec &scalar )
+{
+    *this = *this * scalar;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const Quat Quat::operator /( float scalar ) const
+{
+    return *this / floatInVec(scalar);
+}
+
+VECTORMATH_FORCE_INLINE const Quat Quat::operator /( const floatInVec &scalar ) const
+{
+    return Quat( _mm_div_ps( mVec128, scalar.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE Quat & Quat::operator /=( float scalar )
+{
+    *this = *this / scalar;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Quat & Quat::operator /=( const floatInVec &scalar )
+{
+    *this = *this / scalar;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const Quat Quat::operator -( ) const
+{
+	return Quat(_mm_sub_ps( _mm_setzero_ps(), mVec128 ) );
+}
+
+VECTORMATH_FORCE_INLINE const Quat operator *( float scalar, const Quat &quat )
+{
+    return floatInVec(scalar) * quat;
+}
+
+VECTORMATH_FORCE_INLINE const Quat operator *( const floatInVec &scalar, const Quat &quat )
+{
+    return quat * scalar;
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec dot( const Quat &quat0, const Quat &quat1 )
+{
+    return floatInVec( _vmathVfDot4( quat0.get128(), quat1.get128() ), 0 );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec norm( const Quat &quat )
+{
+    return floatInVec(  _vmathVfDot4( quat.get128(), quat.get128() ), 0 );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec length( const Quat &quat )
+{
+    return floatInVec(  _mm_sqrt_ps(_vmathVfDot4( quat.get128(), quat.get128() )), 0 );
+}
+
+VECTORMATH_FORCE_INLINE const Quat normalize( const Quat &quat )
+{
+	vec_float4 dot =_vmathVfDot4( quat.get128(), quat.get128());
+    return Quat( _mm_mul_ps( quat.get128(), newtonrapson_rsqrt4( dot ) ) );
+}
+
+
+VECTORMATH_FORCE_INLINE const Quat Quat::rotation( const Vector3 &unitVec0, const Vector3 &unitVec1 )
+{
+    Vector3 crossVec;
+    __m128 cosAngle, cosAngleX2Plus2, recipCosHalfAngleX2, cosHalfAngleX2, res;
+    cosAngle = _vmathVfDot3( unitVec0.get128(), unitVec1.get128() );
+    cosAngleX2Plus2 = vec_madd( cosAngle, _mm_set1_ps(2.0f), _mm_set1_ps(2.0f) );
+    recipCosHalfAngleX2 = _mm_rsqrt_ps( cosAngleX2Plus2 );
+    cosHalfAngleX2 = vec_mul( recipCosHalfAngleX2, cosAngleX2Plus2 );
+    crossVec = cross( unitVec0, unitVec1 );
+    res = vec_mul( crossVec.get128(), recipCosHalfAngleX2 );
+	VM_ATTRIBUTE_ALIGN16 unsigned int sw[4] = {0, 0, 0, 0xffffffff};
+    res = vec_sel( res, vec_mul( cosHalfAngleX2, _mm_set1_ps(0.5f) ), sw );
+    return Quat( res );
+}
+
+VECTORMATH_FORCE_INLINE const Quat Quat::rotation( float radians, const Vector3 &unitVec )
+{
+    return rotation( floatInVec(radians), unitVec );
+}
+
+VECTORMATH_FORCE_INLINE const Quat Quat::rotation( const floatInVec &radians, const Vector3 &unitVec )
+{
+    __m128 s, c, angle, res;
+    angle = vec_mul( radians.get128(), _mm_set1_ps(0.5f) );
+    sincosf4( angle, &s, &c );
+	VM_ATTRIBUTE_ALIGN16 unsigned int sw[4] = {0, 0, 0, 0xffffffff};
+    res = vec_sel( vec_mul( unitVec.get128(), s ), c, sw );
+    return Quat( res );
+}
+
+VECTORMATH_FORCE_INLINE const Quat Quat::rotationX( float radians )
+{
+    return rotationX( floatInVec(radians) );
+}
+
+VECTORMATH_FORCE_INLINE const Quat Quat::rotationX( const floatInVec &radians )
+{
+    __m128 s, c, angle, res;
+    angle = vec_mul( radians.get128(), _mm_set1_ps(0.5f) );
+    sincosf4( angle, &s, &c );
+	VM_ATTRIBUTE_ALIGN16 unsigned int xsw[4] = {0xffffffff, 0, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int wsw[4] = {0, 0, 0, 0xffffffff};
+    res = vec_sel( _mm_setzero_ps(), s, xsw );
+    res = vec_sel( res, c, wsw );
+    return Quat( res );
+}
+
+VECTORMATH_FORCE_INLINE const Quat Quat::rotationY( float radians )
+{
+    return rotationY( floatInVec(radians) );
+}
+
+VECTORMATH_FORCE_INLINE const Quat Quat::rotationY( const floatInVec &radians )
+{
+    __m128 s, c, angle, res;
+    angle = vec_mul( radians.get128(), _mm_set1_ps(0.5f) );
+    sincosf4( angle, &s, &c );
+	VM_ATTRIBUTE_ALIGN16 unsigned int ysw[4] = {0, 0xffffffff, 0, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int wsw[4] = {0, 0, 0, 0xffffffff};
+    res = vec_sel( _mm_setzero_ps(), s, ysw );
+    res = vec_sel( res, c, wsw );
+    return Quat( res );
+}
+
+VECTORMATH_FORCE_INLINE const Quat Quat::rotationZ( float radians )
+{
+    return rotationZ( floatInVec(radians) );
+}
+
+VECTORMATH_FORCE_INLINE const Quat Quat::rotationZ( const floatInVec &radians )
+{
+    __m128 s, c, angle, res;
+    angle = vec_mul( radians.get128(), _mm_set1_ps(0.5f) );
+    sincosf4( angle, &s, &c );
+	VM_ATTRIBUTE_ALIGN16 unsigned int zsw[4] = {0, 0, 0xffffffff, 0};
+	VM_ATTRIBUTE_ALIGN16 unsigned int wsw[4] = {0, 0, 0, 0xffffffff};
+    res = vec_sel( _mm_setzero_ps(), s, zsw );
+    res = vec_sel( res, c, wsw );
+    return Quat( res );
+}
+
+VECTORMATH_FORCE_INLINE const Quat Quat::operator *( const Quat &quat ) const
+{
+    __m128 ldata, rdata, qv, tmp0, tmp1, tmp2, tmp3;
+    __m128 product, l_wxyz, r_wxyz, xy, qw;
+    ldata = mVec128;
+    rdata = quat.mVec128;
+    tmp0 = _mm_shuffle_ps( ldata, ldata, _MM_SHUFFLE(3,0,2,1) );
+    tmp1 = _mm_shuffle_ps( rdata, rdata, _MM_SHUFFLE(3,1,0,2) );
+    tmp2 = _mm_shuffle_ps( ldata, ldata, _MM_SHUFFLE(3,1,0,2) );
+    tmp3 = _mm_shuffle_ps( rdata, rdata, _MM_SHUFFLE(3,0,2,1) );
+    qv = vec_mul( vec_splat( ldata, 3 ), rdata );
+    qv = vec_madd( vec_splat( rdata, 3 ), ldata, qv );
+    qv = vec_madd( tmp0, tmp1, qv );
+    qv = vec_nmsub( tmp2, tmp3, qv );
+    product = vec_mul( ldata, rdata );
+    l_wxyz = vec_sld( ldata, ldata, 12 );
+    r_wxyz = vec_sld( rdata, rdata, 12 );
+    qw = vec_nmsub( l_wxyz, r_wxyz, product );
+    xy = vec_madd( l_wxyz, r_wxyz, product );
+    qw = vec_sub( qw, vec_sld( xy, xy, 8 ) );
+	VM_ATTRIBUTE_ALIGN16 unsigned int sw[4] = {0, 0, 0, 0xffffffff};
+    return Quat( vec_sel( qv, qw, sw ) );
+}
+
+VECTORMATH_FORCE_INLINE Quat & Quat::operator *=( const Quat &quat )
+{
+    *this = *this * quat;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 rotate( const Quat &quat, const Vector3 &vec )
+{    __m128 qdata, vdata, product, tmp0, tmp1, tmp2, tmp3, wwww, qv, qw, res;
+    qdata = quat.get128();
+    vdata = vec.get128();
+    tmp0 = _mm_shuffle_ps( qdata, qdata, _MM_SHUFFLE(3,0,2,1) );
+    tmp1 = _mm_shuffle_ps( vdata, vdata, _MM_SHUFFLE(3,1,0,2) );
+    tmp2 = _mm_shuffle_ps( qdata, qdata, _MM_SHUFFLE(3,1,0,2) );
+    tmp3 = _mm_shuffle_ps( vdata, vdata, _MM_SHUFFLE(3,0,2,1) );
+    wwww = vec_splat( qdata, 3 );
+    qv = vec_mul( wwww, vdata );
+    qv = vec_madd( tmp0, tmp1, qv );
+    qv = vec_nmsub( tmp2, tmp3, qv );
+    product = vec_mul( qdata, vdata );
+    qw = vec_madd( vec_sld( qdata, qdata, 4 ), vec_sld( vdata, vdata, 4 ), product );
+    qw = vec_add( vec_sld( product, product, 8 ), qw );
+    tmp1 = _mm_shuffle_ps( qv, qv, _MM_SHUFFLE(3,1,0,2) );
+    tmp3 = _mm_shuffle_ps( qv, qv, _MM_SHUFFLE(3,0,2,1) );
+    res = vec_mul( vec_splat( qw, 0 ), qdata );
+    res = vec_madd( wwww, qv, res );
+    res = vec_madd( tmp0, tmp1, res );
+    res = vec_nmsub( tmp2, tmp3, res );
+    return Vector3( res );
+}
+
+VECTORMATH_FORCE_INLINE const Quat conj( const Quat &quat )
+{
+	VM_ATTRIBUTE_ALIGN16 unsigned int sw[4] = {0x80000000,0x80000000,0x80000000,0};
+    return Quat( vec_xor( quat.get128(), _mm_load_ps((float *)sw) ) );
+}
+
+VECTORMATH_FORCE_INLINE const Quat select( const Quat &quat0, const Quat &quat1, bool select1 )
+{
+    return select( quat0, quat1, boolInVec(select1) );
+}
+
+//VECTORMATH_FORCE_INLINE const Quat select( const Quat &quat0, const Quat &quat1, const boolInVec &select1 )
+//{
+//    return Quat( vec_sel( quat0.get128(), quat1.get128(), select1.get128() ) );
+//}
+
+VECTORMATH_FORCE_INLINE void loadXYZW(Quat& quat, const float* fptr)
+{
+#ifdef USE_SSE3_LDDQU
+	quat = Quat(	SSEFloat(_mm_lddqu_si128((const __m128i*)((float*)(fptr)))).m128		);
+#else
+	SSEFloat fl;
+	fl.f[0] = fptr[0];
+	fl.f[1] = fptr[1];
+	fl.f[2] = fptr[2];
+	fl.f[3] = fptr[3];
+    quat = Quat(	fl.m128);
+#endif
+    
+
+}
+
+VECTORMATH_FORCE_INLINE void storeXYZW(const Quat& quat, float* fptr)
+{
+	fptr[0] = quat.getX();
+	fptr[1] = quat.getY();
+	fptr[2] = quat.getZ();
+	fptr[3] = quat.getW();
+//    _mm_storeu_ps((float*)quat.get128(),fptr);
+}
+
+
+
+#ifdef _VECTORMATH_DEBUG
+
+VECTORMATH_FORCE_INLINE void print( const Quat &quat )
+{
+    union { __m128 v; float s[4]; } tmp;
+    tmp.v = quat.get128();
+    printf( "( %f %f %f %f )\n", tmp.s[0], tmp.s[1], tmp.s[2], tmp.s[3] );
+}
+
+VECTORMATH_FORCE_INLINE void print( const Quat &quat, const char * name )
+{
+    union { __m128 v; float s[4]; } tmp;
+    tmp.v = quat.get128();
+    printf( "%s: ( %f %f %f %f )\n", name, tmp.s[0], tmp.s[1], tmp.s[2], tmp.s[3] );
+}
+
+#endif
+
+} // namespace Aos
+} // namespace Vectormath
+
+#endif
diff --git a/test/Bullet2/vectormath/sse/vec_aos.h b/test/Bullet2/vectormath/sse/vec_aos.h
new file mode 100644
index 000000000..35aeeaf16
--- /dev/null
+++ b/test/Bullet2/vectormath/sse/vec_aos.h
@@ -0,0 +1,1455 @@
+/*
+   Copyright (C) 2006, 2007 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+   Redistribution and use in source and binary forms,
+   with or without modification, are permitted provided that the
+   following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the Sony Computer Entertainment Inc nor the names
+      of its contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _VECTORMATH_VEC_AOS_CPP_H
+#define _VECTORMATH_VEC_AOS_CPP_H
+
+//-----------------------------------------------------------------------------
+// Constants
+// for permutes words are labeled [x,y,z,w] [a,b,c,d]
+
+#define _VECTORMATH_PERM_X 0x00010203
+#define _VECTORMATH_PERM_Y 0x04050607
+#define _VECTORMATH_PERM_Z 0x08090a0b
+#define _VECTORMATH_PERM_W 0x0c0d0e0f
+#define _VECTORMATH_PERM_A 0x10111213
+#define _VECTORMATH_PERM_B 0x14151617
+#define _VECTORMATH_PERM_C 0x18191a1b
+#define _VECTORMATH_PERM_D 0x1c1d1e1f
+#define _VECTORMATH_PERM_XYZA (vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_X, _VECTORMATH_PERM_Y, _VECTORMATH_PERM_Z, _VECTORMATH_PERM_A }
+#define _VECTORMATH_PERM_ZXYW (vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Z, _VECTORMATH_PERM_X, _VECTORMATH_PERM_Y, _VECTORMATH_PERM_W }
+#define _VECTORMATH_PERM_YZXW (vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Y, _VECTORMATH_PERM_Z, _VECTORMATH_PERM_X, _VECTORMATH_PERM_W }
+#define _VECTORMATH_PERM_YZAB (vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Y, _VECTORMATH_PERM_Z, _VECTORMATH_PERM_A, _VECTORMATH_PERM_B }
+#define _VECTORMATH_PERM_ZABC (vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Z, _VECTORMATH_PERM_A, _VECTORMATH_PERM_B, _VECTORMATH_PERM_C }
+#define _VECTORMATH_PERM_XYAW (vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_X, _VECTORMATH_PERM_Y, _VECTORMATH_PERM_A, _VECTORMATH_PERM_W }
+#define _VECTORMATH_PERM_XAZW (vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_X, _VECTORMATH_PERM_A, _VECTORMATH_PERM_Z, _VECTORMATH_PERM_W }
+#define _VECTORMATH_MASK_0xF000 (vec_uint4){ 0xffffffff, 0, 0, 0 }
+#define _VECTORMATH_MASK_0x0F00 (vec_uint4){ 0, 0xffffffff, 0, 0 }
+#define _VECTORMATH_MASK_0x00F0 (vec_uint4){ 0, 0, 0xffffffff, 0 }
+#define _VECTORMATH_MASK_0x000F (vec_uint4){ 0, 0, 0, 0xffffffff }
+#define _VECTORMATH_UNIT_1000 _mm_setr_ps(1.0f,0.0f,0.0f,0.0f) // (__m128){ 1.0f, 0.0f, 0.0f, 0.0f }
+#define _VECTORMATH_UNIT_0100 _mm_setr_ps(0.0f,1.0f,0.0f,0.0f) // (__m128){ 0.0f, 1.0f, 0.0f, 0.0f }
+#define _VECTORMATH_UNIT_0010 _mm_setr_ps(0.0f,0.0f,1.0f,0.0f) // (__m128){ 0.0f, 0.0f, 1.0f, 0.0f }
+#define _VECTORMATH_UNIT_0001 _mm_setr_ps(0.0f,0.0f,0.0f,1.0f) // (__m128){ 0.0f, 0.0f, 0.0f, 1.0f }
+#define _VECTORMATH_SLERP_TOL 0.999f
+//_VECTORMATH_SLERP_TOLF
+
+//-----------------------------------------------------------------------------
+// Definitions
+
+#ifndef _VECTORMATH_INTERNAL_FUNCTIONS
+#define _VECTORMATH_INTERNAL_FUNCTIONS
+
+#define     _vmath_shufps(a, b, immx, immy, immz, immw) _mm_shuffle_ps(a, b, _MM_SHUFFLE(immw, immz, immy, immx))
+static VECTORMATH_FORCE_INLINE __m128 _vmathVfDot3( __m128 vec0, __m128 vec1 )
+{
+	__m128 result = _mm_mul_ps( vec0, vec1);
+    return _mm_add_ps( vec_splat( result, 0 ), _mm_add_ps( vec_splat( result, 1 ), vec_splat( result, 2 ) ) );
+}
+
+static VECTORMATH_FORCE_INLINE __m128 _vmathVfDot4( __m128 vec0, __m128 vec1 )
+{
+    __m128 result = _mm_mul_ps(vec0, vec1);
+	return _mm_add_ps(_mm_shuffle_ps(result, result, _MM_SHUFFLE(0,0,0,0)),
+			_mm_add_ps(_mm_shuffle_ps(result, result, _MM_SHUFFLE(1,1,1,1)),
+			_mm_add_ps(_mm_shuffle_ps(result, result, _MM_SHUFFLE(2,2,2,2)), _mm_shuffle_ps(result, result, _MM_SHUFFLE(3,3,3,3)))));
+}
+
+static VECTORMATH_FORCE_INLINE __m128 _vmathVfCross( __m128 vec0, __m128 vec1 )
+{
+    __m128 tmp0, tmp1, tmp2, tmp3, result;
+    tmp0 = _mm_shuffle_ps( vec0, vec0, _MM_SHUFFLE(3,0,2,1) );
+    tmp1 = _mm_shuffle_ps( vec1, vec1, _MM_SHUFFLE(3,1,0,2) );
+    tmp2 = _mm_shuffle_ps( vec0, vec0, _MM_SHUFFLE(3,1,0,2) );
+    tmp3 = _mm_shuffle_ps( vec1, vec1, _MM_SHUFFLE(3,0,2,1) );
+    result = vec_mul( tmp0, tmp1 );
+    result = vec_nmsub( tmp2, tmp3, result );
+    return result;
+}
+/*
+static VECTORMATH_FORCE_INLINE vec_uint4 _vmathVfToHalfFloatsUnpacked(__m128 v)
+{
+#if 0
+    vec_int4 bexp;
+    vec_uint4 mant, sign, hfloat;
+    vec_uint4 notZero, isInf;
+    const vec_uint4 hfloatInf = (vec_uint4)(0x00007c00u);
+    const vec_uint4 mergeMant = (vec_uint4)(0x000003ffu);
+    const vec_uint4 mergeSign = (vec_uint4)(0x00008000u);
+
+    sign = vec_sr((vec_uint4)v, (vec_uint4)16);
+    mant = vec_sr((vec_uint4)v, (vec_uint4)13);
+    bexp = vec_and(vec_sr((vec_int4)v, (vec_uint4)23), (vec_int4)0xff);
+
+    notZero = (vec_uint4)vec_cmpgt(bexp, (vec_int4)112);
+    isInf = (vec_uint4)vec_cmpgt(bexp, (vec_int4)142);
+
+    bexp = _mm_add_ps(bexp, (vec_int4)-112);
+    bexp = vec_sl(bexp, (vec_uint4)10);
+
+    hfloat = vec_sel((vec_uint4)bexp, mant, mergeMant);
+    hfloat = vec_sel((vec_uint4)(0), hfloat, notZero);
+    hfloat = vec_sel(hfloat, hfloatInf, isInf);
+    hfloat = vec_sel(hfloat, sign, mergeSign);
+
+    return hfloat;
+#else
+	assert(0);
+	return _mm_setzero_ps();
+#endif
+}
+
+static VECTORMATH_FORCE_INLINE vec_ushort8 _vmath2VfToHalfFloats(__m128 u, __m128 v)
+{
+#if 0
+    vec_uint4 hfloat_u, hfloat_v;
+    const vec_uchar16 pack = (vec_uchar16){2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31};
+    hfloat_u = _vmathVfToHalfFloatsUnpacked(u);
+    hfloat_v = _vmathVfToHalfFloatsUnpacked(v);
+    return (vec_ushort8)vec_perm(hfloat_u, hfloat_v, pack);
+#else
+	assert(0);
+	return _mm_setzero_si128();
+#endif
+}
+*/
+
+static VECTORMATH_FORCE_INLINE __m128 _vmathVfInsert(__m128 dst, __m128 src, int slot)
+{
+	SSEFloat s;
+	s.m128 = src;
+	SSEFloat d;
+	d.m128 = dst;
+	d.f[slot] = s.f[slot];
+	return d.m128;
+}
+
+#define _vmathVfSetElement(vec, scalar, slot) ((float *)&(vec))[slot] = scalar
+
+static VECTORMATH_FORCE_INLINE __m128 _vmathVfSplatScalar(float scalar)
+{
+	return _mm_set1_ps(scalar);
+}
+
+#endif
+
+namespace Vectormath {
+namespace Aos {
+
+	
+#ifdef _VECTORMATH_NO_SCALAR_CAST
+VECTORMATH_FORCE_INLINE VecIdx::operator floatInVec() const
+{
+    return floatInVec(ref, i);
+}
+
+VECTORMATH_FORCE_INLINE float VecIdx::getAsFloat() const
+#else
+VECTORMATH_FORCE_INLINE VecIdx::operator float() const
+#endif
+{
+    return ((float *)&ref)[i];
+}
+
+VECTORMATH_FORCE_INLINE float VecIdx::operator =( float scalar )
+{
+    _vmathVfSetElement(ref, scalar, i);
+    return scalar;
+}
+
+VECTORMATH_FORCE_INLINE floatInVec VecIdx::operator =( const floatInVec &scalar )
+{
+    ref = _vmathVfInsert(ref, scalar.get128(), i);
+    return scalar;
+}
+
+VECTORMATH_FORCE_INLINE floatInVec VecIdx::operator =( const VecIdx& scalar )
+{
+    return *this = floatInVec(scalar.ref, scalar.i);
+}
+
+VECTORMATH_FORCE_INLINE floatInVec VecIdx::operator *=( float scalar )
+{
+    return *this *= floatInVec(scalar);
+}
+
+VECTORMATH_FORCE_INLINE floatInVec VecIdx::operator *=( const floatInVec &scalar )
+{
+    return *this = floatInVec(ref, i) * scalar;
+}
+
+VECTORMATH_FORCE_INLINE floatInVec VecIdx::operator /=( float scalar )
+{
+    return *this /= floatInVec(scalar);
+}
+
+inline floatInVec VecIdx::operator /=( const floatInVec &scalar )
+{
+    return *this = floatInVec(ref, i) / scalar;
+}
+
+VECTORMATH_FORCE_INLINE floatInVec VecIdx::operator +=( float scalar )
+{
+    return *this += floatInVec(scalar);
+}
+
+VECTORMATH_FORCE_INLINE floatInVec VecIdx::operator +=( const floatInVec &scalar )
+{
+    return *this = floatInVec(ref, i) + scalar;
+}
+
+VECTORMATH_FORCE_INLINE floatInVec VecIdx::operator -=( float scalar )
+{
+    return *this -= floatInVec(scalar);
+}
+
+VECTORMATH_FORCE_INLINE floatInVec VecIdx::operator -=( const floatInVec &scalar )
+{
+    return *this = floatInVec(ref, i) - scalar;
+}
+
+VECTORMATH_FORCE_INLINE Vector3::Vector3(const Vector3& vec)
+{
+    set128(vec.get128());
+}
+
+VECTORMATH_FORCE_INLINE void Vector3::set128(vec_float4 vec)
+{
+    mVec128 = vec;
+}
+
+
+VECTORMATH_FORCE_INLINE Vector3::Vector3( float _x, float _y, float _z )
+{
+    mVec128 = _mm_setr_ps(_x, _y, _z, 0.0f);
+}
+
+VECTORMATH_FORCE_INLINE Vector3::Vector3( const floatInVec &_x, const floatInVec &_y, const floatInVec &_z )
+{
+	__m128 xz = _mm_unpacklo_ps( _x.get128(), _z.get128() );
+	mVec128 = _mm_unpacklo_ps( xz, _y.get128() );
+}
+
+VECTORMATH_FORCE_INLINE Vector3::Vector3( const Point3 &pnt )
+{
+    mVec128 = pnt.get128();
+}
+
+VECTORMATH_FORCE_INLINE Vector3::Vector3( float scalar )
+{
+    mVec128 = floatInVec(scalar).get128();
+}
+
+VECTORMATH_FORCE_INLINE Vector3::Vector3( const floatInVec &scalar )
+{
+    mVec128 = scalar.get128();
+}
+
+VECTORMATH_FORCE_INLINE Vector3::Vector3( __m128 vf4 )
+{
+    mVec128 = vf4;
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Vector3::xAxis( )
+{
+    return Vector3( _VECTORMATH_UNIT_1000 );
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Vector3::yAxis( )
+{
+    return Vector3( _VECTORMATH_UNIT_0100 );
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Vector3::zAxis( )
+{
+    return Vector3( _VECTORMATH_UNIT_0010 );
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 lerp( float t, const Vector3 &vec0, const Vector3 &vec1 )
+{
+    return lerp( floatInVec(t), vec0, vec1 );
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 lerp( const floatInVec &t, const Vector3 &vec0, const Vector3 &vec1 )
+{
+    return ( vec0 + ( ( vec1 - vec0 ) * t ) );
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 slerp( float t, const Vector3 &unitVec0, const Vector3 &unitVec1 )
+{
+    return slerp( floatInVec(t), unitVec0, unitVec1 );
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 slerp( const floatInVec &t, const Vector3 &unitVec0, const Vector3 &unitVec1 )
+{
+    __m128 scales, scale0, scale1, cosAngle, angle, tttt, oneMinusT, angles, sines;
+    cosAngle = _vmathVfDot3( unitVec0.get128(), unitVec1.get128() );
+    __m128 selectMask = _mm_cmpgt_ps( _mm_set1_ps(_VECTORMATH_SLERP_TOL), cosAngle );
+    angle = acosf4( cosAngle );
+    tttt = t.get128();
+    oneMinusT = _mm_sub_ps( _mm_set1_ps(1.0f), tttt );
+    angles = _mm_unpacklo_ps( _mm_set1_ps(1.0f), tttt ); // angles = 1, t, 1, t
+    angles = _mm_unpacklo_ps( angles, oneMinusT );		// angles = 1, 1-t, t, 1-t
+    angles = _mm_mul_ps( angles, angle );
+    sines = sinf4( angles );
+    scales = _mm_div_ps( sines, vec_splat( sines, 0 ) );
+    scale0 = vec_sel( oneMinusT, vec_splat( scales, 1 ), selectMask );
+    scale1 = vec_sel( tttt, vec_splat( scales, 2 ), selectMask );
+    return Vector3( vec_madd( unitVec0.get128(), scale0, _mm_mul_ps( unitVec1.get128(), scale1 ) ) );
+}
+
+VECTORMATH_FORCE_INLINE __m128 Vector3::get128( ) const
+{
+    return mVec128;
+}
+
+VECTORMATH_FORCE_INLINE void loadXYZ(Point3& vec, const float* fptr)
+{
+#ifdef USE_SSE3_LDDQU
+	vec = Point3(	SSEFloat(_mm_lddqu_si128((const __m128i*)((float*)(fptr)))).m128 );
+#else
+	SSEFloat fl;
+	fl.f[0] = fptr[0];
+	fl.f[1] = fptr[1];
+	fl.f[2] = fptr[2];
+	fl.f[3] = fptr[3];
+    vec = Point3(	fl.m128);
+#endif //USE_SSE3_LDDQU
+	
+}
+
+
+
+VECTORMATH_FORCE_INLINE void loadXYZ(Vector3& vec, const float* fptr)
+{
+#ifdef USE_SSE3_LDDQU
+	vec = Vector3(	SSEFloat(_mm_lddqu_si128((const __m128i*)((float*)(fptr)))).m128 );
+#else
+	SSEFloat fl;
+	fl.f[0] = fptr[0];
+	fl.f[1] = fptr[1];
+	fl.f[2] = fptr[2];
+	fl.f[3] = fptr[3];
+    vec = Vector3(	fl.m128);
+#endif //USE_SSE3_LDDQU
+	
+}
+
+VECTORMATH_FORCE_INLINE void storeXYZ( const Vector3 &vec, __m128 * quad )
+{
+	__m128 dstVec = *quad;
+	VM_ATTRIBUTE_ALIGN16  unsigned int sw[4] = {0, 0, 0, 0xffffffff}; // TODO: Centralize
+	dstVec = vec_sel(vec.get128(), dstVec, sw);
+	*quad = dstVec;
+}
+
+VECTORMATH_FORCE_INLINE void storeXYZ(const Point3& vec, float* fptr)
+{
+	fptr[0] = vec.getX();
+	fptr[1] = vec.getY();
+	fptr[2] = vec.getZ();
+}
+
+VECTORMATH_FORCE_INLINE void storeXYZ(const Vector3& vec, float* fptr)
+{
+	fptr[0] = vec.getX();
+	fptr[1] = vec.getY();
+	fptr[2] = vec.getZ();
+}
+
+
+VECTORMATH_FORCE_INLINE void loadXYZArray( Vector3 & vec0, Vector3 & vec1, Vector3 & vec2, Vector3 & vec3, const __m128 * threeQuads )
+{
+	const float *quads = (float *)threeQuads;
+    vec0 = Vector3(  _mm_load_ps(quads) );
+    vec1 = Vector3( _mm_loadu_ps(quads + 3) );
+    vec2 = Vector3( _mm_loadu_ps(quads + 6) );
+    vec3 = Vector3( _mm_loadu_ps(quads + 9) );
+}
+
+VECTORMATH_FORCE_INLINE void storeXYZArray( const Vector3 &vec0, const Vector3 &vec1, const Vector3 &vec2, const Vector3 &vec3, __m128 * threeQuads )
+{
+	__m128 xxxx = _mm_shuffle_ps( vec1.get128(), vec1.get128(), _MM_SHUFFLE(0, 0, 0, 0) );
+	__m128 zzzz = _mm_shuffle_ps( vec2.get128(), vec2.get128(), _MM_SHUFFLE(2, 2, 2, 2) );
+	VM_ATTRIBUTE_ALIGN16 unsigned int xsw[4] = {0, 0, 0, 0xffffffff};
+	VM_ATTRIBUTE_ALIGN16 unsigned int zsw[4] = {0xffffffff, 0, 0, 0};
+	threeQuads[0] = vec_sel( vec0.get128(), xxxx, xsw );
+    threeQuads[1] = _mm_shuffle_ps( vec1.get128(), vec2.get128(), _MM_SHUFFLE(1, 0, 2, 1) );
+    threeQuads[2] = vec_sel( _mm_shuffle_ps( vec3.get128(), vec3.get128(), _MM_SHUFFLE(2, 1, 0, 3) ), zzzz, zsw );
+}
+/*
+VECTORMATH_FORCE_INLINE void storeHalfFloats( const Vector3 &vec0, const Vector3 &vec1, const Vector3 &vec2, const Vector3 &vec3, const Vector3 &vec4, const Vector3 &vec5, const Vector3 &vec6, const Vector3 &vec7, vec_ushort8 * threeQuads )
+{
+	assert(0);
+#if 0
+    __m128 xyz0[3];
+    __m128 xyz1[3];
+    storeXYZArray( vec0, vec1, vec2, vec3, xyz0 );
+    storeXYZArray( vec4, vec5, vec6, vec7, xyz1 );
+    threeQuads[0] = _vmath2VfToHalfFloats(xyz0[0], xyz0[1]);
+    threeQuads[1] = _vmath2VfToHalfFloats(xyz0[2], xyz1[0]);
+    threeQuads[2] = _vmath2VfToHalfFloats(xyz1[1], xyz1[2]);
+#endif
+}
+*/
+VECTORMATH_FORCE_INLINE Vector3 & Vector3::operator =( const Vector3 &vec )
+{
+    mVec128 = vec.mVec128;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Vector3 & Vector3::setX( float _x )
+{
+    _vmathVfSetElement(mVec128, _x, 0);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Vector3 & Vector3::setX( const floatInVec &_x )
+{
+    mVec128 = _vmathVfInsert(mVec128, _x.get128(), 0);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec Vector3::getX( ) const
+{
+    return floatInVec( mVec128, 0 );
+}
+
+VECTORMATH_FORCE_INLINE Vector3 & Vector3::setY( float _y )
+{
+    _vmathVfSetElement(mVec128, _y, 1);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Vector3 & Vector3::setY( const floatInVec &_y )
+{
+    mVec128 = _vmathVfInsert(mVec128, _y.get128(), 1);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec Vector3::getY( ) const
+{
+    return floatInVec( mVec128, 1 );
+}
+
+VECTORMATH_FORCE_INLINE Vector3 & Vector3::setZ( float _z )
+{
+    _vmathVfSetElement(mVec128, _z, 2);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Vector3 & Vector3::setZ( const floatInVec &_z )
+{
+    mVec128 = _vmathVfInsert(mVec128, _z.get128(), 2);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec Vector3::getZ( ) const
+{
+    return floatInVec( mVec128, 2 );
+}
+
+VECTORMATH_FORCE_INLINE Vector3 & Vector3::setElem( int idx, float value )
+{
+    _vmathVfSetElement(mVec128, value, idx);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Vector3 & Vector3::setElem( int idx, const floatInVec &value )
+{
+    mVec128 = _vmathVfInsert(mVec128, value.get128(), idx);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec Vector3::getElem( int idx ) const
+{
+    return floatInVec( mVec128, idx );
+}
+
+VECTORMATH_FORCE_INLINE VecIdx Vector3::operator []( int idx )
+{
+    return VecIdx( mVec128, idx );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec Vector3::operator []( int idx ) const
+{
+    return floatInVec( mVec128, idx );
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Vector3::operator +( const Vector3 &vec ) const
+{
+    return Vector3( _mm_add_ps( mVec128, vec.mVec128 ) );
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Vector3::operator -( const Vector3 &vec ) const
+{
+    return Vector3( _mm_sub_ps( mVec128, vec.mVec128 ) );
+}
+
+VECTORMATH_FORCE_INLINE const Point3 Vector3::operator +( const Point3 &pnt ) const
+{
+    return Point3( _mm_add_ps( mVec128, pnt.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Vector3::operator *( float scalar ) const
+{
+    return *this * floatInVec(scalar);
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Vector3::operator *( const floatInVec &scalar ) const
+{
+    return Vector3( _mm_mul_ps( mVec128, scalar.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE Vector3 & Vector3::operator +=( const Vector3 &vec )
+{
+    *this = *this + vec;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Vector3 & Vector3::operator -=( const Vector3 &vec )
+{
+    *this = *this - vec;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Vector3 & Vector3::operator *=( float scalar )
+{
+    *this = *this * scalar;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Vector3 & Vector3::operator *=( const floatInVec &scalar )
+{
+    *this = *this * scalar;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Vector3::operator /( float scalar ) const
+{
+    return *this / floatInVec(scalar);
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Vector3::operator /( const floatInVec &scalar ) const
+{
+    return Vector3( _mm_div_ps( mVec128, scalar.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE Vector3 & Vector3::operator /=( float scalar )
+{
+    *this = *this / scalar;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Vector3 & Vector3::operator /=( const floatInVec &scalar )
+{
+    *this = *this / scalar;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Vector3::operator -( ) const
+{
+	//return Vector3(_mm_sub_ps( _mm_setzero_ps(), mVec128 ) );
+
+	VM_ATTRIBUTE_ALIGN16 static const int array[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
+	__m128 NEG_MASK = SSEFloat(*(const vec_float4*)array).vf;
+	return Vector3(_mm_xor_ps(get128(),NEG_MASK));
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 operator *( float scalar, const Vector3 &vec )
+{
+    return floatInVec(scalar) * vec;
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 operator *( const floatInVec &scalar, const Vector3 &vec )
+{
+    return vec * scalar;
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 mulPerElem( const Vector3 &vec0, const Vector3 &vec1 )
+{
+    return Vector3( _mm_mul_ps( vec0.get128(), vec1.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 divPerElem( const Vector3 &vec0, const Vector3 &vec1 )
+{
+    return Vector3( _mm_div_ps( vec0.get128(), vec1.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 recipPerElem( const Vector3 &vec )
+{
+    return Vector3( _mm_rcp_ps( vec.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 absPerElem( const Vector3 &vec )
+{
+    return Vector3( fabsf4( vec.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 copySignPerElem( const Vector3 &vec0, const Vector3 &vec1 )
+{
+	__m128 vmask = toM128(0x7fffffff);
+	return Vector3( _mm_or_ps(
+		_mm_and_ps   ( vmask, vec0.get128() ),			// Value
+		_mm_andnot_ps( vmask, vec1.get128() ) ) );		// Signs
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 maxPerElem( const Vector3 &vec0, const Vector3 &vec1 )
+{
+    return Vector3( _mm_max_ps( vec0.get128(), vec1.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec maxElem( const Vector3 &vec )
+{
+    return floatInVec( _mm_max_ps( _mm_max_ps( vec_splat( vec.get128(), 0 ), vec_splat( vec.get128(), 1 ) ), vec_splat( vec.get128(), 2 ) ) );
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 minPerElem( const Vector3 &vec0, const Vector3 &vec1 )
+{
+    return Vector3( _mm_min_ps( vec0.get128(), vec1.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec minElem( const Vector3 &vec )
+{
+    return floatInVec( _mm_min_ps( _mm_min_ps( vec_splat( vec.get128(), 0 ), vec_splat( vec.get128(), 1 ) ), vec_splat( vec.get128(), 2 ) ) );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec sum( const Vector3 &vec )
+{
+    return floatInVec( _mm_add_ps( _mm_add_ps( vec_splat( vec.get128(), 0 ), vec_splat( vec.get128(), 1 ) ), vec_splat( vec.get128(), 2 ) ) );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec dot( const Vector3 &vec0, const Vector3 &vec1 )
+{
+    return floatInVec( _vmathVfDot3( vec0.get128(), vec1.get128() ), 0 );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec lengthSqr( const Vector3 &vec )
+{
+    return floatInVec(  _vmathVfDot3( vec.get128(), vec.get128() ), 0 );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec length( const Vector3 &vec )
+{
+    return floatInVec(  _mm_sqrt_ps(_vmathVfDot3( vec.get128(), vec.get128() )), 0 );
+}
+
+
+VECTORMATH_FORCE_INLINE const Vector3 normalizeApprox( const Vector3 &vec )
+{
+    return Vector3( _mm_mul_ps( vec.get128(), _mm_rsqrt_ps( _vmathVfDot3( vec.get128(), vec.get128() ) ) ) );
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 normalize( const Vector3 &vec )
+{
+	return Vector3( _mm_mul_ps( vec.get128(), newtonrapson_rsqrt4( _vmathVfDot3( vec.get128(), vec.get128() ) ) ) );
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 cross( const Vector3 &vec0, const Vector3 &vec1 )
+{
+    return Vector3( _vmathVfCross( vec0.get128(), vec1.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 select( const Vector3 &vec0, const Vector3 &vec1, bool select1 )
+{
+    return select( vec0, vec1, boolInVec(select1) );
+}
+
+
+VECTORMATH_FORCE_INLINE  const Vector4 select(const Vector4& vec0, const Vector4& vec1, const boolInVec& select1)
+{
+    return Vector4(vec_sel(vec0.get128(), vec1.get128(), select1.get128()));
+}
+
+#ifdef _VECTORMATH_DEBUG
+
+VECTORMATH_FORCE_INLINE void print( const Vector3 &vec )
+{
+    union { __m128 v; float s[4]; } tmp;
+    tmp.v = vec.get128();
+    printf( "( %f %f %f )\n", tmp.s[0], tmp.s[1], tmp.s[2] );
+}
+
+VECTORMATH_FORCE_INLINE void print( const Vector3 &vec, const char * name )
+{
+    union { __m128 v; float s[4]; } tmp;
+    tmp.v = vec.get128();
+    printf( "%s: ( %f %f %f )\n", name, tmp.s[0], tmp.s[1], tmp.s[2] );
+}
+
+#endif
+
+VECTORMATH_FORCE_INLINE Vector4::Vector4( float _x, float _y, float _z, float _w )
+{
+    mVec128 = _mm_setr_ps(_x, _y, _z, _w); 
+ }
+
+VECTORMATH_FORCE_INLINE Vector4::Vector4( const floatInVec &_x, const floatInVec &_y, const floatInVec &_z, const floatInVec &_w )
+{
+	mVec128 = _mm_unpacklo_ps(
+		_mm_unpacklo_ps( _x.get128(), _z.get128() ),
+		_mm_unpacklo_ps( _y.get128(), _w.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE Vector4::Vector4( const Vector3 &xyz, float _w )
+{
+    mVec128 = xyz.get128();
+    _vmathVfSetElement(mVec128, _w, 3);
+}
+
+VECTORMATH_FORCE_INLINE Vector4::Vector4( const Vector3 &xyz, const floatInVec &_w )
+{
+    mVec128 = xyz.get128();
+    mVec128 = _vmathVfInsert(mVec128, _w.get128(), 3);
+}
+
+VECTORMATH_FORCE_INLINE Vector4::Vector4( const Vector3 &vec )
+{
+    mVec128 = vec.get128();
+    mVec128 = _vmathVfInsert(mVec128, _mm_setzero_ps(), 3);
+}
+
+VECTORMATH_FORCE_INLINE Vector4::Vector4( const Point3 &pnt )
+{
+    mVec128 = pnt.get128();
+    mVec128 = _vmathVfInsert(mVec128, _mm_set1_ps(1.0f), 3);
+}
+
+VECTORMATH_FORCE_INLINE Vector4::Vector4( const Quat &quat )
+{
+    mVec128 = quat.get128();
+}
+
+VECTORMATH_FORCE_INLINE Vector4::Vector4( float scalar )
+{
+    mVec128 = floatInVec(scalar).get128();
+}
+
+VECTORMATH_FORCE_INLINE Vector4::Vector4( const floatInVec &scalar )
+{
+    mVec128 = scalar.get128();
+}
+
+VECTORMATH_FORCE_INLINE Vector4::Vector4( __m128 vf4 )
+{
+    mVec128 = vf4;
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 Vector4::xAxis( )
+{
+    return Vector4( _VECTORMATH_UNIT_1000 );
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 Vector4::yAxis( )
+{
+    return Vector4( _VECTORMATH_UNIT_0100 );
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 Vector4::zAxis( )
+{
+    return Vector4( _VECTORMATH_UNIT_0010 );
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 Vector4::wAxis( )
+{
+    return Vector4( _VECTORMATH_UNIT_0001 );
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 lerp( float t, const Vector4 &vec0, const Vector4 &vec1 )
+{
+    return lerp( floatInVec(t), vec0, vec1 );
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 lerp( const floatInVec &t, const Vector4 &vec0, const Vector4 &vec1 )
+{
+    return ( vec0 + ( ( vec1 - vec0 ) * t ) );
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 slerp( float t, const Vector4 &unitVec0, const Vector4 &unitVec1 )
+{
+    return slerp( floatInVec(t), unitVec0, unitVec1 );
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 slerp( const floatInVec &t, const Vector4 &unitVec0, const Vector4 &unitVec1 )
+{
+    __m128 scales, scale0, scale1, cosAngle, angle, tttt, oneMinusT, angles, sines;
+    cosAngle = _vmathVfDot4( unitVec0.get128(), unitVec1.get128() );
+    __m128 selectMask = _mm_cmpgt_ps( _mm_set1_ps(_VECTORMATH_SLERP_TOL), cosAngle );
+    angle = acosf4( cosAngle );
+    tttt = t.get128();
+    oneMinusT = _mm_sub_ps( _mm_set1_ps(1.0f), tttt );
+    angles = _mm_unpacklo_ps( _mm_set1_ps(1.0f), tttt ); // angles = 1, t, 1, t
+    angles = _mm_unpacklo_ps( angles, oneMinusT );		// angles = 1, 1-t, t, 1-t
+    angles = _mm_mul_ps( angles, angle );
+    sines = sinf4( angles );
+    scales = _mm_div_ps( sines, vec_splat( sines, 0 ) );
+    scale0 = vec_sel( oneMinusT, vec_splat( scales, 1 ), selectMask );
+    scale1 = vec_sel( tttt, vec_splat( scales, 2 ), selectMask );
+    return Vector4( vec_madd( unitVec0.get128(), scale0, _mm_mul_ps( unitVec1.get128(), scale1 ) ) );
+}
+
+VECTORMATH_FORCE_INLINE __m128 Vector4::get128( ) const
+{
+    return mVec128;
+}
+/*
+VECTORMATH_FORCE_INLINE void storeHalfFloats( const Vector4 &vec0, const Vector4 &vec1, const Vector4 &vec2, const Vector4 &vec3, vec_ushort8 * twoQuads )
+{
+    twoQuads[0] = _vmath2VfToHalfFloats(vec0.get128(), vec1.get128());
+    twoQuads[1] = _vmath2VfToHalfFloats(vec2.get128(), vec3.get128());
+}
+*/
+VECTORMATH_FORCE_INLINE Vector4 & Vector4::operator =( const Vector4 &vec )
+{
+    mVec128 = vec.mVec128;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Vector4 & Vector4::setXYZ( const Vector3 &vec )
+{
+	VM_ATTRIBUTE_ALIGN16 unsigned int sw[4] = {0, 0, 0, 0xffffffff};
+	mVec128 = vec_sel( vec.get128(), mVec128, sw );
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Vector4::getXYZ( ) const
+{
+    return Vector3( mVec128 );
+}
+
+VECTORMATH_FORCE_INLINE Vector4 & Vector4::setX( float _x )
+{
+    _vmathVfSetElement(mVec128, _x, 0);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Vector4 & Vector4::setX( const floatInVec &_x )
+{
+    mVec128 = _vmathVfInsert(mVec128, _x.get128(), 0);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec Vector4::getX( ) const
+{
+    return floatInVec( mVec128, 0 );
+}
+
+VECTORMATH_FORCE_INLINE Vector4 & Vector4::setY( float _y )
+{
+    _vmathVfSetElement(mVec128, _y, 1);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Vector4 & Vector4::setY( const floatInVec &_y )
+{
+    mVec128 = _vmathVfInsert(mVec128, _y.get128(), 1);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec Vector4::getY( ) const
+{
+    return floatInVec( mVec128, 1 );
+}
+
+VECTORMATH_FORCE_INLINE Vector4 & Vector4::setZ( float _z )
+{
+    _vmathVfSetElement(mVec128, _z, 2);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Vector4 & Vector4::setZ( const floatInVec &_z )
+{
+    mVec128 = _vmathVfInsert(mVec128, _z.get128(), 2);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec Vector4::getZ( ) const
+{
+    return floatInVec( mVec128, 2 );
+}
+
+VECTORMATH_FORCE_INLINE Vector4 & Vector4::setW( float _w )
+{
+    _vmathVfSetElement(mVec128, _w, 3);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Vector4 & Vector4::setW( const floatInVec &_w )
+{
+    mVec128 = _vmathVfInsert(mVec128, _w.get128(), 3);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec Vector4::getW( ) const
+{
+    return floatInVec( mVec128, 3 );
+}
+
+VECTORMATH_FORCE_INLINE Vector4 & Vector4::setElem( int idx, float value )
+{
+    _vmathVfSetElement(mVec128, value, idx);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Vector4 & Vector4::setElem( int idx, const floatInVec &value )
+{
+    mVec128 = _vmathVfInsert(mVec128, value.get128(), idx);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec Vector4::getElem( int idx ) const
+{
+    return floatInVec( mVec128, idx );
+}
+
+VECTORMATH_FORCE_INLINE VecIdx Vector4::operator []( int idx )
+{
+    return VecIdx( mVec128, idx );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec Vector4::operator []( int idx ) const
+{
+    return floatInVec( mVec128, idx );
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 Vector4::operator +( const Vector4 &vec ) const
+{
+    return Vector4( _mm_add_ps( mVec128, vec.mVec128 ) );
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 Vector4::operator -( const Vector4 &vec ) const
+{
+    return Vector4( _mm_sub_ps( mVec128, vec.mVec128 ) );
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 Vector4::operator *( float scalar ) const
+{
+    return *this * floatInVec(scalar);
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 Vector4::operator *( const floatInVec &scalar ) const
+{
+    return Vector4( _mm_mul_ps( mVec128, scalar.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE Vector4 & Vector4::operator +=( const Vector4 &vec )
+{
+    *this = *this + vec;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Vector4 & Vector4::operator -=( const Vector4 &vec )
+{
+    *this = *this - vec;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Vector4 & Vector4::operator *=( float scalar )
+{
+    *this = *this * scalar;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Vector4 & Vector4::operator *=( const floatInVec &scalar )
+{
+    *this = *this * scalar;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 Vector4::operator /( float scalar ) const
+{
+    return *this / floatInVec(scalar);
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 Vector4::operator /( const floatInVec &scalar ) const
+{
+    return Vector4( _mm_div_ps( mVec128, scalar.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE Vector4 & Vector4::operator /=( float scalar )
+{
+    *this = *this / scalar;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Vector4 & Vector4::operator /=( const floatInVec &scalar )
+{
+    *this = *this / scalar;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 Vector4::operator -( ) const
+{
+	return Vector4(_mm_sub_ps( _mm_setzero_ps(), mVec128 ) );
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 operator *( float scalar, const Vector4 &vec )
+{
+    return floatInVec(scalar) * vec;
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 operator *( const floatInVec &scalar, const Vector4 &vec )
+{
+    return vec * scalar;
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 mulPerElem( const Vector4 &vec0, const Vector4 &vec1 )
+{
+    return Vector4( _mm_mul_ps( vec0.get128(), vec1.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 divPerElem( const Vector4 &vec0, const Vector4 &vec1 )
+{
+    return Vector4( _mm_div_ps( vec0.get128(), vec1.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 recipPerElem( const Vector4 &vec )
+{
+    return Vector4( _mm_rcp_ps( vec.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 absPerElem( const Vector4 &vec )
+{
+    return Vector4( fabsf4( vec.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 copySignPerElem( const Vector4 &vec0, const Vector4 &vec1 )
+{
+	__m128 vmask = toM128(0x7fffffff);
+	return Vector4( _mm_or_ps(
+		_mm_and_ps   ( vmask, vec0.get128() ),			// Value
+		_mm_andnot_ps( vmask, vec1.get128() ) ) );		// Signs
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 maxPerElem( const Vector4 &vec0, const Vector4 &vec1 )
+{
+    return Vector4( _mm_max_ps( vec0.get128(), vec1.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec maxElem( const Vector4 &vec )
+{
+    return floatInVec( _mm_max_ps(
+		_mm_max_ps( vec_splat( vec.get128(), 0 ), vec_splat( vec.get128(), 1 ) ),
+		_mm_max_ps( vec_splat( vec.get128(), 2 ), vec_splat( vec.get128(), 3 ) ) ) );
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 minPerElem( const Vector4 &vec0, const Vector4 &vec1 )
+{
+    return Vector4( _mm_min_ps( vec0.get128(), vec1.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec minElem( const Vector4 &vec )
+{
+    return floatInVec( _mm_min_ps(
+		_mm_min_ps( vec_splat( vec.get128(), 0 ), vec_splat( vec.get128(), 1 ) ),
+		_mm_min_ps( vec_splat( vec.get128(), 2 ), vec_splat( vec.get128(), 3 ) ) ) );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec sum( const Vector4 &vec )
+{
+    return floatInVec( _mm_add_ps(
+		_mm_add_ps( vec_splat( vec.get128(), 0 ), vec_splat( vec.get128(), 1 ) ),
+		_mm_add_ps( vec_splat( vec.get128(), 2 ), vec_splat( vec.get128(), 3 ) ) ) );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec dot( const Vector4 &vec0, const Vector4 &vec1 )
+{
+    return floatInVec( _vmathVfDot4( vec0.get128(), vec1.get128() ), 0 );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec lengthSqr( const Vector4 &vec )
+{
+    return floatInVec(  _vmathVfDot4( vec.get128(), vec.get128() ), 0 );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec length( const Vector4 &vec )
+{
+    return floatInVec(  _mm_sqrt_ps(_vmathVfDot4( vec.get128(), vec.get128() )), 0 );
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 normalizeApprox( const Vector4 &vec )
+{
+    return Vector4( _mm_mul_ps( vec.get128(), _mm_rsqrt_ps( _vmathVfDot4( vec.get128(), vec.get128() ) ) ) );
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 normalize( const Vector4 &vec )
+{
+    return Vector4( _mm_mul_ps( vec.get128(), newtonrapson_rsqrt4( _vmathVfDot4( vec.get128(), vec.get128() ) ) ) );
+}
+
+VECTORMATH_FORCE_INLINE const Vector4 select( const Vector4 &vec0, const Vector4 &vec1, bool select1 )
+{
+    return select( vec0, vec1, boolInVec(select1) );
+}
+
+
+#ifdef _VECTORMATH_DEBUG
+
+VECTORMATH_FORCE_INLINE void print( const Vector4 &vec )
+{
+    union { __m128 v; float s[4]; } tmp;
+    tmp.v = vec.get128();
+    printf( "( %f %f %f %f )\n", tmp.s[0], tmp.s[1], tmp.s[2], tmp.s[3] );
+}
+
+VECTORMATH_FORCE_INLINE void print( const Vector4 &vec, const char * name )
+{
+    union { __m128 v; float s[4]; } tmp;
+    tmp.v = vec.get128();
+    printf( "%s: ( %f %f %f %f )\n", name, tmp.s[0], tmp.s[1], tmp.s[2], tmp.s[3] );
+}
+
+#endif
+
+VECTORMATH_FORCE_INLINE Point3::Point3( float _x, float _y, float _z )
+{
+    mVec128 = _mm_setr_ps(_x, _y, _z, 0.0f);
+}
+
+VECTORMATH_FORCE_INLINE Point3::Point3( const floatInVec &_x, const floatInVec &_y, const floatInVec &_z )
+{
+	mVec128 = _mm_unpacklo_ps( _mm_unpacklo_ps( _x.get128(), _z.get128() ), _y.get128() );
+}
+
+VECTORMATH_FORCE_INLINE Point3::Point3( const Vector3 &vec )
+{
+    mVec128 = vec.get128();
+}
+
+VECTORMATH_FORCE_INLINE Point3::Point3( float scalar )
+{
+    mVec128 = floatInVec(scalar).get128();
+}
+
+VECTORMATH_FORCE_INLINE Point3::Point3( const floatInVec &scalar )
+{
+    mVec128 = scalar.get128();
+}
+
+VECTORMATH_FORCE_INLINE Point3::Point3( __m128 vf4 )
+{
+    mVec128 = vf4;
+}
+
+VECTORMATH_FORCE_INLINE const Point3 lerp( float t, const Point3 &pnt0, const Point3 &pnt1 )
+{
+    return lerp( floatInVec(t), pnt0, pnt1 );
+}
+
+VECTORMATH_FORCE_INLINE const Point3 lerp( const floatInVec &t, const Point3 &pnt0, const Point3 &pnt1 )
+{
+    return ( pnt0 + ( ( pnt1 - pnt0 ) * t ) );
+}
+
+VECTORMATH_FORCE_INLINE __m128 Point3::get128( ) const
+{
+    return mVec128;
+}
+
+VECTORMATH_FORCE_INLINE void storeXYZ( const Point3 &pnt, __m128 * quad )
+{
+    __m128 dstVec = *quad;
+	VM_ATTRIBUTE_ALIGN16 unsigned int sw[4] = {0, 0, 0, 0xffffffff}; // TODO: Centralize
+    dstVec = vec_sel(pnt.get128(), dstVec, sw);
+    *quad = dstVec;
+}
+
+VECTORMATH_FORCE_INLINE void loadXYZArray( Point3 & pnt0, Point3 & pnt1, Point3 & pnt2, Point3 & pnt3, const __m128 * threeQuads )
+{
+	const float *quads = (float *)threeQuads;
+    pnt0 = Point3(  _mm_load_ps(quads) );
+    pnt1 = Point3( _mm_loadu_ps(quads + 3) );
+    pnt2 = Point3( _mm_loadu_ps(quads + 6) );
+    pnt3 = Point3( _mm_loadu_ps(quads + 9) );
+}
+
+VECTORMATH_FORCE_INLINE void storeXYZArray( const Point3 &pnt0, const Point3 &pnt1, const Point3 &pnt2, const Point3 &pnt3, __m128 * threeQuads )
+{
+	__m128 xxxx = _mm_shuffle_ps( pnt1.get128(), pnt1.get128(), _MM_SHUFFLE(0, 0, 0, 0) );
+	__m128 zzzz = _mm_shuffle_ps( pnt2.get128(), pnt2.get128(), _MM_SHUFFLE(2, 2, 2, 2) );
+	VM_ATTRIBUTE_ALIGN16 unsigned int xsw[4] = {0, 0, 0, 0xffffffff};
+	VM_ATTRIBUTE_ALIGN16 unsigned int zsw[4] = {0xffffffff, 0, 0, 0};
+	threeQuads[0] = vec_sel( pnt0.get128(), xxxx, xsw );
+    threeQuads[1] = _mm_shuffle_ps( pnt1.get128(), pnt2.get128(), _MM_SHUFFLE(1, 0, 2, 1) );
+    threeQuads[2] = vec_sel( _mm_shuffle_ps( pnt3.get128(), pnt3.get128(), _MM_SHUFFLE(2, 1, 0, 3) ), zzzz, zsw );
+}
+/*
+VECTORMATH_FORCE_INLINE void storeHalfFloats( const Point3 &pnt0, const Point3 &pnt1, const Point3 &pnt2, const Point3 &pnt3, const Point3 &pnt4, const Point3 &pnt5, const Point3 &pnt6, const Point3 &pnt7, vec_ushort8 * threeQuads )
+{
+#if 0
+    __m128 xyz0[3];
+    __m128 xyz1[3];
+    storeXYZArray( pnt0, pnt1, pnt2, pnt3, xyz0 );
+    storeXYZArray( pnt4, pnt5, pnt6, pnt7, xyz1 );
+    threeQuads[0] = _vmath2VfToHalfFloats(xyz0[0], xyz0[1]);
+    threeQuads[1] = _vmath2VfToHalfFloats(xyz0[2], xyz1[0]);
+    threeQuads[2] = _vmath2VfToHalfFloats(xyz1[1], xyz1[2]);
+#else
+	assert(0);
+#endif
+}
+*/
+VECTORMATH_FORCE_INLINE Point3 & Point3::operator =( const Point3 &pnt )
+{
+    mVec128 = pnt.mVec128;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Point3 & Point3::setX( float _x )
+{
+    _vmathVfSetElement(mVec128, _x, 0);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Point3 & Point3::setX( const floatInVec &_x )
+{
+    mVec128 = _vmathVfInsert(mVec128, _x.get128(), 0);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec Point3::getX( ) const
+{
+    return floatInVec( mVec128, 0 );
+}
+
+VECTORMATH_FORCE_INLINE Point3 & Point3::setY( float _y )
+{
+    _vmathVfSetElement(mVec128, _y, 1);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Point3 & Point3::setY( const floatInVec &_y )
+{
+    mVec128 = _vmathVfInsert(mVec128, _y.get128(), 1);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec Point3::getY( ) const
+{
+    return floatInVec( mVec128, 1 );
+}
+
+VECTORMATH_FORCE_INLINE Point3 & Point3::setZ( float _z )
+{
+    _vmathVfSetElement(mVec128, _z, 2);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Point3 & Point3::setZ( const floatInVec &_z )
+{
+    mVec128 = _vmathVfInsert(mVec128, _z.get128(), 2);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec Point3::getZ( ) const
+{
+    return floatInVec( mVec128, 2 );
+}
+
+VECTORMATH_FORCE_INLINE Point3 & Point3::setElem( int idx, float value )
+{
+    _vmathVfSetElement(mVec128, value, idx);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Point3 & Point3::setElem( int idx, const floatInVec &value )
+{
+    mVec128 = _vmathVfInsert(mVec128, value.get128(), idx);
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec Point3::getElem( int idx ) const
+{
+    return floatInVec( mVec128, idx );
+}
+
+VECTORMATH_FORCE_INLINE VecIdx Point3::operator []( int idx )
+{
+    return VecIdx( mVec128, idx );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec Point3::operator []( int idx ) const
+{
+    return floatInVec( mVec128, idx );
+}
+
+VECTORMATH_FORCE_INLINE const Vector3 Point3::operator -( const Point3 &pnt ) const
+{
+    return Vector3( _mm_sub_ps( mVec128, pnt.mVec128 ) );
+}
+
+VECTORMATH_FORCE_INLINE const Point3 Point3::operator +( const Vector3 &vec ) const
+{
+    return Point3( _mm_add_ps( mVec128, vec.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE const Point3 Point3::operator -( const Vector3 &vec ) const
+{
+    return Point3( _mm_sub_ps( mVec128, vec.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE Point3 & Point3::operator +=( const Vector3 &vec )
+{
+    *this = *this + vec;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE Point3 & Point3::operator -=( const Vector3 &vec )
+{
+    *this = *this - vec;
+    return *this;
+}
+
+VECTORMATH_FORCE_INLINE const Point3 mulPerElem( const Point3 &pnt0, const Point3 &pnt1 )
+{
+    return Point3( _mm_mul_ps( pnt0.get128(), pnt1.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE const Point3 divPerElem( const Point3 &pnt0, const Point3 &pnt1 )
+{
+    return Point3( _mm_div_ps( pnt0.get128(), pnt1.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE const Point3 recipPerElem( const Point3 &pnt )
+{
+    return Point3( _mm_rcp_ps( pnt.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE const Point3 absPerElem( const Point3 &pnt )
+{
+    return Point3( fabsf4( pnt.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE const Point3 copySignPerElem( const Point3 &pnt0, const Point3 &pnt1 )
+{
+	__m128 vmask = toM128(0x7fffffff);
+	return Point3( _mm_or_ps(
+		_mm_and_ps   ( vmask, pnt0.get128() ),			// Value
+		_mm_andnot_ps( vmask, pnt1.get128() ) ) );		// Signs
+}
+
+VECTORMATH_FORCE_INLINE const Point3 maxPerElem( const Point3 &pnt0, const Point3 &pnt1 )
+{
+    return Point3( _mm_max_ps( pnt0.get128(), pnt1.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec maxElem( const Point3 &pnt )
+{
+    return floatInVec( _mm_max_ps( _mm_max_ps( vec_splat( pnt.get128(), 0 ), vec_splat( pnt.get128(), 1 ) ), vec_splat( pnt.get128(), 2 ) ) );
+}
+
+VECTORMATH_FORCE_INLINE const Point3 minPerElem( const Point3 &pnt0, const Point3 &pnt1 )
+{
+    return Point3( _mm_min_ps( pnt0.get128(), pnt1.get128() ) );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec minElem( const Point3 &pnt )
+{
+    return floatInVec( _mm_min_ps( _mm_min_ps( vec_splat( pnt.get128(), 0 ), vec_splat( pnt.get128(), 1 ) ), vec_splat( pnt.get128(), 2 ) ) );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec sum( const Point3 &pnt )
+{
+    return floatInVec( _mm_add_ps( _mm_add_ps( vec_splat( pnt.get128(), 0 ), vec_splat( pnt.get128(), 1 ) ), vec_splat( pnt.get128(), 2 ) ) );
+}
+
+VECTORMATH_FORCE_INLINE const Point3 scale( const Point3 &pnt, float scaleVal )
+{
+    return scale( pnt, floatInVec( scaleVal ) );
+}
+
+VECTORMATH_FORCE_INLINE const Point3 scale( const Point3 &pnt, const floatInVec &scaleVal )
+{
+    return mulPerElem( pnt, Point3( scaleVal ) );
+}
+
+VECTORMATH_FORCE_INLINE const Point3 scale( const Point3 &pnt, const Vector3 &scaleVec )
+{
+    return mulPerElem( pnt, Point3( scaleVec ) );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec projection( const Point3 &pnt, const Vector3 &unitVec )
+{
+    return floatInVec( _vmathVfDot3( pnt.get128(), unitVec.get128() ), 0 );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec distSqrFromOrigin( const Point3 &pnt )
+{
+    return lengthSqr( Vector3( pnt ) );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec distFromOrigin( const Point3 &pnt )
+{
+    return length( Vector3( pnt ) );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec distSqr( const Point3 &pnt0, const Point3 &pnt1 )
+{
+    return lengthSqr( ( pnt1 - pnt0 ) );
+}
+
+VECTORMATH_FORCE_INLINE const floatInVec dist( const Point3 &pnt0, const Point3 &pnt1 )
+{
+    return length( ( pnt1 - pnt0 ) );
+}
+
+VECTORMATH_FORCE_INLINE const Point3 select( const Point3 &pnt0, const Point3 &pnt1, bool select1 )
+{
+    return select( pnt0, pnt1, boolInVec(select1) );
+}
+
+VECTORMATH_FORCE_INLINE const Point3 select( const Point3 &pnt0, const Point3 &pnt1, const boolInVec &select1 )
+{
+    return Point3( vec_sel( pnt0.get128(), pnt1.get128(), select1.get128() ) );
+}
+
+
+
+#ifdef _VECTORMATH_DEBUG
+
+VECTORMATH_FORCE_INLINE void print( const Point3 &pnt )
+{
+    union { __m128 v; float s[4]; } tmp;
+    tmp.v = pnt.get128();
+    printf( "( %f %f %f )\n", tmp.s[0], tmp.s[1], tmp.s[2] );
+}
+
+VECTORMATH_FORCE_INLINE void print( const Point3 &pnt, const char * name )
+{
+    union { __m128 v; float s[4]; } tmp;
+    tmp.v = pnt.get128();
+    printf( "%s: ( %f %f %f )\n", name, tmp.s[0], tmp.s[1], tmp.s[2] );
+}
+
+#endif
+
+} // namespace Aos
+} // namespace Vectormath
+
+#endif
diff --git a/test/Bullet2/vectormath/sse/vecidx_aos.h b/test/Bullet2/vectormath/sse/vecidx_aos.h
new file mode 100644
index 000000000..8ba4b1d75
--- /dev/null
+++ b/test/Bullet2/vectormath/sse/vecidx_aos.h
@@ -0,0 +1,80 @@
+/*
+   Copyright (C) 2006, 2007 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+   Redistribution and use in source and binary forms,
+   with or without modification, are permitted provided that the
+   following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the Sony Computer Entertainment Inc nor the names
+      of its contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _VECTORMATH_VECIDX_AOS_H
+#define _VECTORMATH_VECIDX_AOS_H
+
+
+#include "floatInVec.h"
+
+namespace Vectormath {
+namespace Aos {
+
+//-----------------------------------------------------------------------------
+// VecIdx 
+// Used in setting elements of Vector3, Vector4, Point3, or Quat with the 
+// subscripting operator.
+//
+
+VM_ATTRIBUTE_ALIGNED_CLASS16 (class) VecIdx
+{
+private:
+   __m128 &ref;
+   int i;
+public:
+    inline VecIdx( __m128& vec, int idx ): ref(vec) { i = idx; }
+
+    // implicitly casts to float unless _VECTORMATH_NO_SCALAR_CAST defined
+    // in which case, implicitly casts to floatInVec, and one must call
+    // getAsFloat to convert to float.
+    //
+#ifdef _VECTORMATH_NO_SCALAR_CAST
+    inline operator floatInVec() const;
+    inline float getAsFloat() const;
+#else
+    inline operator float() const;
+#endif
+
+    inline float operator =( float scalar );
+    inline floatInVec operator =( const floatInVec &scalar );
+    inline floatInVec operator =( const VecIdx& scalar );
+    inline floatInVec operator *=( float scalar );
+    inline floatInVec operator *=( const floatInVec &scalar );
+    inline floatInVec operator /=( float scalar );
+    inline floatInVec operator /=( const floatInVec &scalar );
+    inline floatInVec operator +=( float scalar );
+    inline floatInVec operator +=( const floatInVec &scalar );
+    inline floatInVec operator -=( float scalar );
+    inline floatInVec operator -=( const floatInVec &scalar );
+};
+
+} // namespace Aos
+} // namespace Vectormath
+
+#endif
diff --git a/test/Bullet2/vectormath/sse/vectormath_aos.h b/test/Bullet2/vectormath/sse/vectormath_aos.h
new file mode 100644
index 000000000..be5ae8c6e
--- /dev/null
+++ b/test/Bullet2/vectormath/sse/vectormath_aos.h
@@ -0,0 +1,2547 @@
+/*
+   Copyright (C) 2006, 2007 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+   Redistribution and use in source and binary forms,
+   with or without modification, are permitted provided that the
+   following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the Sony Computer Entertainment Inc nor the names
+      of its contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
+*/
+
+
+#ifndef _VECTORMATH_AOS_CPP_SSE_H
+#define _VECTORMATH_AOS_CPP_SSE_H
+
+#include <math.h>
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <assert.h>
+
+#define Vector3Ref Vector3&
+#define QuatRef	Quat&
+#define Matrix3Ref Matrix3&
+
+#if (defined (_WIN32) && (_MSC_VER) && _MSC_VER >= 1400)
+	#define USE_SSE3_LDDQU
+
+	#define VM_ATTRIBUTE_ALIGNED_CLASS16(a) __declspec(align(16)) a
+	#define VM_ATTRIBUTE_ALIGN16 __declspec(align(16))
+	#define VECTORMATH_FORCE_INLINE __forceinline 
+#else
+	#define VM_ATTRIBUTE_ALIGNED_CLASS16(a) a __attribute__ ((aligned (16)))	
+	#define VM_ATTRIBUTE_ALIGN16 __attribute__ ((aligned (16)))	
+	#define VECTORMATH_FORCE_INLINE inline __attribute__ ((always_inline))
+	#ifdef __SSE3__
+		#define USE_SSE3_LDDQU
+	#endif //__SSE3__
+#endif//_WIN32
+
+
+#ifdef USE_SSE3_LDDQU
+#include <pmmintrin.h>//_mm_lddqu_si128
+#endif //USE_SSE3_LDDQU
+
+
+// TODO: Tidy
+typedef __m128 vec_float4;
+typedef __m128 vec_uint4;
+typedef __m128 vec_int4;
+typedef __m128i vec_uchar16;
+typedef __m128i vec_ushort8;
+
+#define vec_splat(x, e) _mm_shuffle_ps(x, x, _MM_SHUFFLE(e,e,e,e))
+
+#define _mm_ror_ps(vec,i)	\
+	(((i)%4) ? (_mm_shuffle_ps(vec,vec, _MM_SHUFFLE((unsigned char)(i+3)%4,(unsigned char)(i+2)%4,(unsigned char)(i+1)%4,(unsigned char)(i+0)%4))) : (vec))
+#define _mm_rol_ps(vec,i)	\
+	(((i)%4) ? (_mm_shuffle_ps(vec,vec, _MM_SHUFFLE((unsigned char)(7-i)%4,(unsigned char)(6-i)%4,(unsigned char)(5-i)%4,(unsigned char)(4-i)%4))) : (vec))
+
+#define vec_sld(vec,vec2,x) _mm_ror_ps(vec, ((x)/4))
+
+#define _mm_abs_ps(vec)		_mm_andnot_ps(_MASKSIGN_,vec)
+#define _mm_neg_ps(vec)		_mm_xor_ps(_MASKSIGN_,vec)
+
+#define vec_madd(a, b, c) _mm_add_ps(c, _mm_mul_ps(a, b) )
+
+union SSEFloat
+{
+	__m128i vi;
+	__m128 m128;
+	__m128 vf;
+	unsigned int	ui[4];
+	unsigned short s[8];
+	float f[4];
+	SSEFloat(__m128 v) : m128(v) {}
+    SSEFloat(__m128i v) : vi(v) {}
+	SSEFloat() {}//uninitialized
+};
+
+static VECTORMATH_FORCE_INLINE __m128 vec_sel(__m128 a, __m128 b, __m128 mask)
+{
+	return _mm_or_ps(_mm_and_ps(mask, b), _mm_andnot_ps(mask, a));
+}
+static VECTORMATH_FORCE_INLINE __m128 vec_sel(__m128 a, __m128 b, const unsigned int *_mask)
+{
+	return vec_sel(a, b, _mm_load_ps((float *)_mask));
+}
+static VECTORMATH_FORCE_INLINE __m128 vec_sel(__m128 a, __m128 b, unsigned int _mask)
+{
+	return vec_sel(a, b, _mm_set1_ps(*(float *)&_mask));
+}
+
+static VECTORMATH_FORCE_INLINE __m128 toM128(unsigned int x)
+{
+    return _mm_set1_ps( *(float *)&x );
+}
+
+static VECTORMATH_FORCE_INLINE __m128 fabsf4(__m128 x)
+{
+    return _mm_and_ps( x, toM128( 0x7fffffff ) );
+}
+/*
+union SSE64
+{
+	__m128 m128;
+	struct
+	{
+		__m64 m01;
+		__m64 m23;
+	} m64;
+};
+
+static VECTORMATH_FORCE_INLINE __m128 vec_cts(__m128 x, int a)
+{
+	assert(a == 0); // Only 2^0 supported
+	(void)a;
+	SSE64 sse64;
+	sse64.m64.m01 = _mm_cvttps_pi32(x);
+	sse64.m64.m23 = _mm_cvttps_pi32(_mm_ror_ps(x,2));
+	_mm_empty();
+    return sse64.m128;
+}
+
+static VECTORMATH_FORCE_INLINE __m128 vec_ctf(__m128 x, int a)
+{
+	assert(a == 0); // Only 2^0 supported
+	(void)a;
+	SSE64 sse64;
+	sse64.m128 = x;
+	__m128 result =_mm_movelh_ps(
+		_mm_cvt_pi2ps(_mm_setzero_ps(), sse64.m64.m01),
+		_mm_cvt_pi2ps(_mm_setzero_ps(), sse64.m64.m23));
+	_mm_empty();
+	return result;
+}
+*/
+static VECTORMATH_FORCE_INLINE __m128 vec_cts(__m128 x, int a)
+{
+	assert(a == 0); // Only 2^0 supported
+	(void)a;
+	__m128i result = _mm_cvtps_epi32(x);
+    return (__m128 &)result;
+}
+
+static VECTORMATH_FORCE_INLINE __m128 vec_ctf(__m128 x, int a)
+{
+	assert(a == 0); // Only 2^0 supported
+	(void)a;
+	return _mm_cvtepi32_ps((__m128i &)x);
+}
+
+#define vec_nmsub(a,b,c) _mm_sub_ps( c, _mm_mul_ps( a, b ) )
+#define vec_sub(a,b) _mm_sub_ps( a, b )
+#define vec_add(a,b) _mm_add_ps( a, b )
+#define vec_mul(a,b) _mm_mul_ps( a, b )
+#define vec_xor(a,b) _mm_xor_ps( a, b )
+#define vec_and(a,b) _mm_and_ps( a, b )
+#define vec_cmpeq(a,b) _mm_cmpeq_ps( a, b )
+#define vec_cmpgt(a,b) _mm_cmpgt_ps( a, b )
+
+#define vec_mergeh(a,b) _mm_unpacklo_ps( a, b )
+#define vec_mergel(a,b) _mm_unpackhi_ps( a, b )
+
+#define vec_andc(a,b) _mm_andnot_ps( b, a )
+
+#define sqrtf4(x) _mm_sqrt_ps( x )
+#define rsqrtf4(x) _mm_rsqrt_ps( x )
+#define recipf4(x) _mm_rcp_ps( x )
+#define negatef4(x) _mm_sub_ps( _mm_setzero_ps(), x )
+
+static VECTORMATH_FORCE_INLINE __m128 newtonrapson_rsqrt4( const __m128 v )
+{   
+#define _half4 _mm_setr_ps(.5f,.5f,.5f,.5f) 
+#define _three _mm_setr_ps(3.f,3.f,3.f,3.f)
+const __m128 approx = _mm_rsqrt_ps( v );   
+const __m128 muls = _mm_mul_ps(_mm_mul_ps(v, approx), approx);   
+return _mm_mul_ps(_mm_mul_ps(_half4, approx), _mm_sub_ps(_three, muls) );
+}
+
+static VECTORMATH_FORCE_INLINE __m128 acosf4(__m128 x)
+{
+    __m128 xabs = fabsf4(x);
+	__m128 select = _mm_cmplt_ps( x, _mm_setzero_ps() );
+    __m128 t1 = sqrtf4(vec_sub(_mm_set1_ps(1.0f), xabs));
+    
+    /* Instruction counts can be reduced if the polynomial was
+     * computed entirely from nested (dependent) fma's. However, 
+     * to reduce the number of pipeline stalls, the polygon is evaluated 
+     * in two halves (hi amd lo). 
+     */
+    __m128 xabs2 = _mm_mul_ps(xabs,  xabs);
+    __m128 xabs4 = _mm_mul_ps(xabs2, xabs2);
+    __m128 hi = vec_madd(vec_madd(vec_madd(_mm_set1_ps(-0.0012624911f),
+		xabs, _mm_set1_ps(0.0066700901f)),
+			xabs, _mm_set1_ps(-0.0170881256f)),
+				xabs, _mm_set1_ps( 0.0308918810f));
+    __m128 lo = vec_madd(vec_madd(vec_madd(_mm_set1_ps(-0.0501743046f),
+		xabs, _mm_set1_ps(0.0889789874f)),
+			xabs, _mm_set1_ps(-0.2145988016f)),
+				xabs, _mm_set1_ps( 1.5707963050f));
+    
+    __m128 result = vec_madd(hi, xabs4, lo);
+    
+    // Adjust the result if x is negactive.
+    return vec_sel(
+		vec_mul(t1, result),									// Positive
+		vec_nmsub(t1, result, _mm_set1_ps(3.1415926535898f)),	// Negative
+		select);
+}
+
+static VECTORMATH_FORCE_INLINE __m128 sinf4(vec_float4 x)
+{
+
+//
+// Common constants used to evaluate sinf4/cosf4/tanf4
+//
+#define _SINCOS_CC0  -0.0013602249f
+#define _SINCOS_CC1   0.0416566950f
+#define _SINCOS_CC2  -0.4999990225f
+#define _SINCOS_SC0  -0.0001950727f
+#define _SINCOS_SC1   0.0083320758f
+#define _SINCOS_SC2  -0.1666665247f
+
+#define _SINCOS_KC1  1.57079625129f
+#define _SINCOS_KC2  7.54978995489e-8f
+
+    vec_float4 xl,xl2,xl3,res;
+
+    // Range reduction using : xl = angle * TwoOverPi;
+    //  
+    xl = vec_mul(x, _mm_set1_ps(0.63661977236f));
+
+    // Find the quadrant the angle falls in
+    // using:  q = (int) (ceil(abs(xl))*sign(xl))
+    //
+    vec_int4 q = vec_cts(xl,0);
+
+    // Compute an offset based on the quadrant that the angle falls in
+    // 
+    vec_int4 offset = _mm_and_ps(q,toM128(0x3));
+
+    // Remainder in range [-pi/4..pi/4]
+    //
+    vec_float4 qf = vec_ctf(q,0);
+    xl  = vec_nmsub(qf,_mm_set1_ps(_SINCOS_KC2),vec_nmsub(qf,_mm_set1_ps(_SINCOS_KC1),x));
+    
+    // Compute x^2 and x^3
+    //
+    xl2 = vec_mul(xl,xl);
+    xl3 = vec_mul(xl2,xl);
+    
+    // Compute both the sin and cos of the angles
+    // using a polynomial expression:
+    //   cx = 1.0f + xl2 * ((C0 * xl2 + C1) * xl2 + C2), and
+    //   sx = xl + xl3 * ((S0 * xl2 + S1) * xl2 + S2)
+    //
+    
+    vec_float4 cx =
+		vec_madd(
+			vec_madd(
+				vec_madd(_mm_set1_ps(_SINCOS_CC0),xl2,_mm_set1_ps(_SINCOS_CC1)),xl2,_mm_set1_ps(_SINCOS_CC2)),xl2,_mm_set1_ps(1.0f));
+    vec_float4 sx =
+		vec_madd(
+			vec_madd(
+				vec_madd(_mm_set1_ps(_SINCOS_SC0),xl2,_mm_set1_ps(_SINCOS_SC1)),xl2,_mm_set1_ps(_SINCOS_SC2)),xl3,xl);
+
+    // Use the cosine when the offset is odd and the sin
+    // when the offset is even
+    //
+    res = vec_sel(cx,sx,vec_cmpeq(vec_and(offset,
+                                          toM128(0x1)),
+										  _mm_setzero_ps()));
+
+    // Flip the sign of the result when (offset mod 4) = 1 or 2
+    //
+    return vec_sel(
+		vec_xor(toM128(0x80000000U), res),	// Negative
+		res,								// Positive
+		vec_cmpeq(vec_and(offset,toM128(0x2)),_mm_setzero_ps()));
+}
+
+static VECTORMATH_FORCE_INLINE void sincosf4(vec_float4 x, vec_float4* s, vec_float4* c)
+{
+    vec_float4 xl,xl2,xl3;
+    vec_int4   offsetSin, offsetCos;
+
+    // Range reduction using : xl = angle * TwoOverPi;
+    //  
+    xl = vec_mul(x, _mm_set1_ps(0.63661977236f));
+
+    // Find the quadrant the angle falls in
+    // using:  q = (int) (ceil(abs(xl))*sign(xl))
+    //
+    //vec_int4 q = vec_cts(vec_add(xl,vec_sel(_mm_set1_ps(0.5f),xl,(0x80000000))),0);
+    vec_int4 q = vec_cts(xl,0);
+     
+    // Compute the offset based on the quadrant that the angle falls in.
+    // Add 1 to the offset for the cosine. 
+    //
+    offsetSin = vec_and(q,toM128((int)0x3));
+	__m128i temp = _mm_add_epi32(_mm_set1_epi32(1),(__m128i &)offsetSin);
+	offsetCos = (__m128 &)temp;
+
+    // Remainder in range [-pi/4..pi/4]
+    //
+    vec_float4 qf = vec_ctf(q,0);
+    xl  = vec_nmsub(qf,_mm_set1_ps(_SINCOS_KC2),vec_nmsub(qf,_mm_set1_ps(_SINCOS_KC1),x));
+    
+    // Compute x^2 and x^3
+    //
+    xl2 = vec_mul(xl,xl);
+    xl3 = vec_mul(xl2,xl);
+    
+    // Compute both the sin and cos of the angles
+    // using a polynomial expression:
+    //   cx = 1.0f + xl2 * ((C0 * xl2 + C1) * xl2 + C2), and
+    //   sx = xl + xl3 * ((S0 * xl2 + S1) * xl2 + S2)
+    //
+    vec_float4 cx =
+		vec_madd(
+			vec_madd(
+				vec_madd(_mm_set1_ps(_SINCOS_CC0),xl2,_mm_set1_ps(_SINCOS_CC1)),xl2,_mm_set1_ps(_SINCOS_CC2)),xl2,_mm_set1_ps(1.0f));
+    vec_float4 sx =
+		vec_madd(
+			vec_madd(
+				vec_madd(_mm_set1_ps(_SINCOS_SC0),xl2,_mm_set1_ps(_SINCOS_SC1)),xl2,_mm_set1_ps(_SINCOS_SC2)),xl3,xl);
+
+    // Use the cosine when the offset is odd and the sin
+    // when the offset is even
+    //
+    vec_uint4 sinMask = (vec_uint4)vec_cmpeq(vec_and(offsetSin,toM128(0x1)),_mm_setzero_ps());
+    vec_uint4 cosMask = (vec_uint4)vec_cmpeq(vec_and(offsetCos,toM128(0x1)),_mm_setzero_ps());    
+    *s = vec_sel(cx,sx,sinMask);
+    *c = vec_sel(cx,sx,cosMask);
+
+    // Flip the sign of the result when (offset mod 4) = 1 or 2
+    //
+    sinMask = vec_cmpeq(vec_and(offsetSin,toM128(0x2)),_mm_setzero_ps());
+    cosMask = vec_cmpeq(vec_and(offsetCos,toM128(0x2)),_mm_setzero_ps());
+    
+    *s = vec_sel((vec_float4)vec_xor(toM128(0x80000000),(vec_uint4)*s),*s,sinMask);
+    *c = vec_sel((vec_float4)vec_xor(toM128(0x80000000),(vec_uint4)*c),*c,cosMask);    
+}
+
+#include "vecidx_aos.h"
+#include "floatInVec.h"
+#include "boolInVec.h"
+
+#ifdef _VECTORMATH_DEBUG
+#include <stdio.h>
+#endif
+namespace Vectormath {
+
+namespace Aos {
+
+//-----------------------------------------------------------------------------
+// Forward Declarations
+//
+
+class Vector3;
+class Vector4;
+class Point3;
+class Quat;
+class Matrix3;
+class Matrix4;
+class Transform3;
+
+// A 3-D vector in array-of-structures format
+//
+class Vector3
+{
+    __m128 mVec128;
+
+	VECTORMATH_FORCE_INLINE void set128(vec_float4 vec);
+	 
+	 VECTORMATH_FORCE_INLINE  vec_float4& get128Ref();
+
+public:
+    // Default constructor; does no initialization
+    // 
+    VECTORMATH_FORCE_INLINE Vector3( ) { };
+
+	// Default copy constructor
+    // 
+	VECTORMATH_FORCE_INLINE Vector3(const Vector3& vec);
+
+    // Construct a 3-D vector from x, y, and z elements
+    // 
+    VECTORMATH_FORCE_INLINE Vector3( float x, float y, float z );
+
+    // Construct a 3-D vector from x, y, and z elements (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Vector3( const floatInVec &x, const floatInVec &y, const floatInVec &z );
+
+    // Copy elements from a 3-D point into a 3-D vector
+    // 
+    explicit VECTORMATH_FORCE_INLINE Vector3( const Point3 &pnt );
+
+    // Set all elements of a 3-D vector to the same scalar value
+    // 
+    explicit VECTORMATH_FORCE_INLINE Vector3( float scalar );
+
+    // Set all elements of a 3-D vector to the same scalar value (scalar data contained in vector data type)
+    // 
+    explicit VECTORMATH_FORCE_INLINE Vector3( const floatInVec &scalar );
+
+    // Set vector float data in a 3-D vector
+    // 
+    explicit VECTORMATH_FORCE_INLINE Vector3( __m128 vf4 );
+
+    // Get vector float data from a 3-D vector
+    // 
+    VECTORMATH_FORCE_INLINE __m128 get128( ) const;
+
+    // Assign one 3-D vector to another
+    // 
+    VECTORMATH_FORCE_INLINE Vector3 & operator =( const Vector3 &vec );
+
+    // Set the x element of a 3-D vector
+    // 
+    VECTORMATH_FORCE_INLINE Vector3 & setX( float x );
+
+    // Set the y element of a 3-D vector
+    // 
+    VECTORMATH_FORCE_INLINE Vector3 & setY( float y );
+
+    // Set the z element of a 3-D vector
+    // 
+    VECTORMATH_FORCE_INLINE Vector3 & setZ( float z );
+
+    // Set the x element of a 3-D vector (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Vector3 & setX( const floatInVec &x );
+
+    // Set the y element of a 3-D vector (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Vector3 & setY( const floatInVec &y );
+
+    // Set the z element of a 3-D vector (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Vector3 & setZ( const floatInVec &z );
+
+    // Get the x element of a 3-D vector
+    // 
+    VECTORMATH_FORCE_INLINE const floatInVec getX( ) const;
+
+    // Get the y element of a 3-D vector
+    // 
+    VECTORMATH_FORCE_INLINE const floatInVec getY( ) const;
+
+    // Get the z element of a 3-D vector
+    // 
+    VECTORMATH_FORCE_INLINE const floatInVec getZ( ) const;
+
+    // Set an x, y, or z element of a 3-D vector by index
+    // 
+    VECTORMATH_FORCE_INLINE Vector3 & setElem( int idx, float value );
+
+    // Set an x, y, or z element of a 3-D vector by index (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Vector3 & setElem( int idx, const floatInVec &value );
+
+    // Get an x, y, or z element of a 3-D vector by index
+    // 
+    VECTORMATH_FORCE_INLINE const floatInVec getElem( int idx ) const;
+
+    // Subscripting operator to set or get an element
+    // 
+    VECTORMATH_FORCE_INLINE VecIdx operator []( int idx );
+
+    // Subscripting operator to get an element
+    // 
+    VECTORMATH_FORCE_INLINE const floatInVec operator []( int idx ) const;
+
+    // Add two 3-D vectors
+    // 
+    VECTORMATH_FORCE_INLINE const Vector3 operator +( const Vector3 &vec ) const;
+
+    // Subtract a 3-D vector from another 3-D vector
+    // 
+    VECTORMATH_FORCE_INLINE const Vector3 operator -( const Vector3 &vec ) const;
+
+    // Add a 3-D vector to a 3-D point
+    // 
+    VECTORMATH_FORCE_INLINE const Point3 operator +( const Point3 &pnt ) const;
+
+    // Multiply a 3-D vector by a scalar
+    // 
+    VECTORMATH_FORCE_INLINE const Vector3 operator *( float scalar ) const;
+
+    // Divide a 3-D vector by a scalar
+    // 
+    VECTORMATH_FORCE_INLINE const Vector3 operator /( float scalar ) const;
+
+    // Multiply a 3-D vector by a scalar (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE const Vector3 operator *( const floatInVec &scalar ) const;
+
+    // Divide a 3-D vector by a scalar (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE const Vector3 operator /( const floatInVec &scalar ) const;
+
+    // Perform compound assignment and addition with a 3-D vector
+    // 
+    VECTORMATH_FORCE_INLINE Vector3 & operator +=( const Vector3 &vec );
+
+    // Perform compound assignment and subtraction by a 3-D vector
+    // 
+    VECTORMATH_FORCE_INLINE Vector3 & operator -=( const Vector3 &vec );
+
+    // Perform compound assignment and multiplication by a scalar
+    // 
+    VECTORMATH_FORCE_INLINE Vector3 & operator *=( float scalar );
+
+    // Perform compound assignment and division by a scalar
+    // 
+    VECTORMATH_FORCE_INLINE Vector3 & operator /=( float scalar );
+
+    // Perform compound assignment and multiplication by a scalar (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Vector3 & operator *=( const floatInVec &scalar );
+
+    // Perform compound assignment and division by a scalar (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Vector3 & operator /=( const floatInVec &scalar );
+
+    // Negate all elements of a 3-D vector
+    // 
+    VECTORMATH_FORCE_INLINE const Vector3 operator -( ) const;
+
+    // Construct x axis
+    // 
+    static VECTORMATH_FORCE_INLINE const Vector3 xAxis( );
+
+    // Construct y axis
+    // 
+    static VECTORMATH_FORCE_INLINE const Vector3 yAxis( );
+
+    // Construct z axis
+    // 
+    static VECTORMATH_FORCE_INLINE const Vector3 zAxis( );
+
+};
+
+// Multiply a 3-D vector by a scalar
+// 
+VECTORMATH_FORCE_INLINE const Vector3 operator *( float scalar, const Vector3 &vec );
+
+// Multiply a 3-D vector by a scalar (scalar data contained in vector data type)
+// 
+VECTORMATH_FORCE_INLINE const Vector3 operator *( const floatInVec &scalar, const Vector3 &vec );
+
+// Multiply two 3-D vectors per element
+// 
+VECTORMATH_FORCE_INLINE const Vector3 mulPerElem( const Vector3 &vec0, const Vector3 &vec1 );
+
+// Divide two 3-D vectors per element
+// NOTE: 
+// Floating-point behavior matches standard library function divf4.
+// 
+VECTORMATH_FORCE_INLINE const Vector3 divPerElem( const Vector3 &vec0, const Vector3 &vec1 );
+
+// Compute the reciprocal of a 3-D vector per element
+// NOTE: 
+// Floating-point behavior matches standard library function recipf4.
+// 
+VECTORMATH_FORCE_INLINE const Vector3 recipPerElem( const Vector3 &vec );
+
+// Compute the absolute value of a 3-D vector per element
+// 
+VECTORMATH_FORCE_INLINE const Vector3 absPerElem( const Vector3 &vec );
+
+// Copy sign from one 3-D vector to another, per element
+// 
+VECTORMATH_FORCE_INLINE const Vector3 copySignPerElem( const Vector3 &vec0, const Vector3 &vec1 );
+
+// Maximum of two 3-D vectors per element
+// 
+VECTORMATH_FORCE_INLINE const Vector3 maxPerElem( const Vector3 &vec0, const Vector3 &vec1 );
+
+// Minimum of two 3-D vectors per element
+// 
+VECTORMATH_FORCE_INLINE const Vector3 minPerElem( const Vector3 &vec0, const Vector3 &vec1 );
+
+// Maximum element of a 3-D vector
+// 
+VECTORMATH_FORCE_INLINE const floatInVec maxElem( const Vector3 &vec );
+
+// Minimum element of a 3-D vector
+// 
+VECTORMATH_FORCE_INLINE const floatInVec minElem( const Vector3 &vec );
+
+// Compute the sum of all elements of a 3-D vector
+// 
+VECTORMATH_FORCE_INLINE const floatInVec sum( const Vector3 &vec );
+
+// Compute the dot product of two 3-D vectors
+// 
+VECTORMATH_FORCE_INLINE const floatInVec dot( const Vector3 &vec0, const Vector3 &vec1 );
+
+// Compute the square of the length of a 3-D vector
+// 
+VECTORMATH_FORCE_INLINE const floatInVec lengthSqr( const Vector3 &vec );
+
+// Compute the length of a 3-D vector
+// 
+VECTORMATH_FORCE_INLINE const floatInVec length( const Vector3 &vec );
+
+// Normalize a 3-D vector
+// NOTE: 
+// The result is unpredictable when all elements of vec are at or near zero.
+// 
+VECTORMATH_FORCE_INLINE const Vector3 normalize( const Vector3 &vec );
+
+// Compute cross product of two 3-D vectors
+// 
+VECTORMATH_FORCE_INLINE const Vector3 cross( const Vector3 &vec0, const Vector3 &vec1 );
+
+// Outer product of two 3-D vectors
+// 
+VECTORMATH_FORCE_INLINE const Matrix3 outer( const Vector3 &vec0, const Vector3 &vec1 );
+
+// Pre-multiply a row vector by a 3x3 matrix
+// NOTE: 
+// Slower than column post-multiply.
+// 
+VECTORMATH_FORCE_INLINE const Vector3 rowMul( const Vector3 &vec, const Matrix3 & mat );
+
+// Cross-product matrix of a 3-D vector
+// 
+VECTORMATH_FORCE_INLINE const Matrix3 crossMatrix( const Vector3 &vec );
+
+// Create cross-product matrix and multiply
+// NOTE: 
+// Faster than separately creating a cross-product matrix and multiplying.
+// 
+VECTORMATH_FORCE_INLINE const Matrix3 crossMatrixMul( const Vector3 &vec, const Matrix3 & mat );
+
+// Linear interpolation between two 3-D vectors
+// NOTE: 
+// Does not clamp t between 0 and 1.
+// 
+VECTORMATH_FORCE_INLINE const Vector3 lerp( float t, const Vector3 &vec0, const Vector3 &vec1 );
+
+// Linear interpolation between two 3-D vectors (scalar data contained in vector data type)
+// NOTE: 
+// Does not clamp t between 0 and 1.
+// 
+VECTORMATH_FORCE_INLINE const Vector3 lerp( const floatInVec &t, const Vector3 &vec0, const Vector3 &vec1 );
+
+// Spherical linear interpolation between two 3-D vectors
+// NOTE: 
+// The result is unpredictable if the vectors point in opposite directions.
+// Does not clamp t between 0 and 1.
+// 
+VECTORMATH_FORCE_INLINE const Vector3 slerp( float t, const Vector3 &unitVec0, const Vector3 &unitVec1 );
+
+// Spherical linear interpolation between two 3-D vectors (scalar data contained in vector data type)
+// NOTE: 
+// The result is unpredictable if the vectors point in opposite directions.
+// Does not clamp t between 0 and 1.
+// 
+VECTORMATH_FORCE_INLINE const Vector3 slerp( const floatInVec &t, const Vector3 &unitVec0, const Vector3 &unitVec1 );
+
+// Conditionally select between two 3-D vectors
+// NOTE: 
+// This function uses a conditional select instruction to avoid a branch.
+// However, the transfer of select1 to a VMX register may use more processing time than a branch.
+// Use the boolInVec version for better performance.
+// 
+VECTORMATH_FORCE_INLINE const Vector3 select( const Vector3 &vec0, const Vector3 &vec1, bool select1 );
+
+// Conditionally select between two 3-D vectors (scalar data contained in vector data type)
+// NOTE: 
+// This function uses a conditional select instruction to avoid a branch.
+// 
+VECTORMATH_FORCE_INLINE const Vector3 select( const Vector3 &vec0, const Vector3 &vec1, const boolInVec &select1 );
+
+// Store x, y, and z elements of 3-D vector in first three words of a quadword, preserving fourth word
+// 
+VECTORMATH_FORCE_INLINE void storeXYZ( const Vector3 &vec, __m128 * quad );
+
+// Load four three-float 3-D vectors, stored in three quadwords
+// 
+VECTORMATH_FORCE_INLINE void loadXYZArray( Vector3 & vec0, Vector3 & vec1, Vector3 & vec2, Vector3 & vec3, const __m128 * threeQuads );
+
+// Store four 3-D vectors in three quadwords
+// 
+VECTORMATH_FORCE_INLINE void storeXYZArray( const Vector3 &vec0, const Vector3 &vec1, const Vector3 &vec2, const Vector3 &vec3, __m128 * threeQuads );
+
+// Store eight 3-D vectors as half-floats
+// 
+VECTORMATH_FORCE_INLINE void storeHalfFloats( const Vector3 &vec0, const Vector3 &vec1, const Vector3 &vec2, const Vector3 &vec3, const Vector3 &vec4, const Vector3 &vec5, const Vector3 &vec6, const Vector3 &vec7, vec_ushort8 * threeQuads );
+
+#ifdef _VECTORMATH_DEBUG
+
+// Print a 3-D vector
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+VECTORMATH_FORCE_INLINE void print( const Vector3 &vec );
+
+// Print a 3-D vector and an associated string identifier
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+VECTORMATH_FORCE_INLINE void print( const Vector3 &vec, const char * name );
+
+#endif
+
+// A 4-D vector in array-of-structures format
+//
+class Vector4
+{
+    __m128 mVec128;
+
+public:
+    // Default constructor; does no initialization
+    // 
+    VECTORMATH_FORCE_INLINE Vector4( ) { };
+
+    // Construct a 4-D vector from x, y, z, and w elements
+    // 
+    VECTORMATH_FORCE_INLINE Vector4( float x, float y, float z, float w );
+
+    // Construct a 4-D vector from x, y, z, and w elements (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Vector4( const floatInVec &x, const floatInVec &y, const floatInVec &z, const floatInVec &w );
+
+    // Construct a 4-D vector from a 3-D vector and a scalar
+    // 
+    VECTORMATH_FORCE_INLINE Vector4( const Vector3 &xyz, float w );
+
+    // Construct a 4-D vector from a 3-D vector and a scalar (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Vector4( const Vector3 &xyz, const floatInVec &w );
+
+    // Copy x, y, and z from a 3-D vector into a 4-D vector, and set w to 0
+    // 
+    explicit VECTORMATH_FORCE_INLINE Vector4( const Vector3 &vec );
+
+    // Copy x, y, and z from a 3-D point into a 4-D vector, and set w to 1
+    // 
+    explicit VECTORMATH_FORCE_INLINE Vector4( const Point3 &pnt );
+
+    // Copy elements from a quaternion into a 4-D vector
+    // 
+    explicit VECTORMATH_FORCE_INLINE Vector4( const Quat &quat );
+
+    // Set all elements of a 4-D vector to the same scalar value
+    // 
+    explicit VECTORMATH_FORCE_INLINE Vector4( float scalar );
+
+    // Set all elements of a 4-D vector to the same scalar value (scalar data contained in vector data type)
+    // 
+    explicit VECTORMATH_FORCE_INLINE Vector4( const floatInVec &scalar );
+
+    // Set vector float data in a 4-D vector
+    // 
+    explicit VECTORMATH_FORCE_INLINE Vector4( __m128 vf4 );
+
+    // Get vector float data from a 4-D vector
+    // 
+    VECTORMATH_FORCE_INLINE __m128 get128( ) const;
+
+    // Assign one 4-D vector to another
+    // 
+    VECTORMATH_FORCE_INLINE Vector4 & operator =( const Vector4 &vec );
+
+    // Set the x, y, and z elements of a 4-D vector
+    // NOTE: 
+    // This function does not change the w element.
+    // 
+    VECTORMATH_FORCE_INLINE Vector4 & setXYZ( const Vector3 &vec );
+
+    // Get the x, y, and z elements of a 4-D vector
+    // 
+    VECTORMATH_FORCE_INLINE const Vector3 getXYZ( ) const;
+
+    // Set the x element of a 4-D vector
+    // 
+    VECTORMATH_FORCE_INLINE Vector4 & setX( float x );
+
+    // Set the y element of a 4-D vector
+    // 
+    VECTORMATH_FORCE_INLINE Vector4 & setY( float y );
+
+    // Set the z element of a 4-D vector
+    // 
+    VECTORMATH_FORCE_INLINE Vector4 & setZ( float z );
+
+    // Set the w element of a 4-D vector
+    // 
+    VECTORMATH_FORCE_INLINE Vector4 & setW( float w );
+
+    // Set the x element of a 4-D vector (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Vector4 & setX( const floatInVec &x );
+
+    // Set the y element of a 4-D vector (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Vector4 & setY( const floatInVec &y );
+
+    // Set the z element of a 4-D vector (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Vector4 & setZ( const floatInVec &z );
+
+    // Set the w element of a 4-D vector (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Vector4 & setW( const floatInVec &w );
+
+    // Get the x element of a 4-D vector
+    // 
+    VECTORMATH_FORCE_INLINE const floatInVec getX( ) const;
+
+    // Get the y element of a 4-D vector
+    // 
+    VECTORMATH_FORCE_INLINE const floatInVec getY( ) const;
+
+    // Get the z element of a 4-D vector
+    // 
+    VECTORMATH_FORCE_INLINE const floatInVec getZ( ) const;
+
+    // Get the w element of a 4-D vector
+    // 
+    VECTORMATH_FORCE_INLINE const floatInVec getW( ) const;
+
+    // Set an x, y, z, or w element of a 4-D vector by index
+    // 
+    VECTORMATH_FORCE_INLINE Vector4 & setElem( int idx, float value );
+
+    // Set an x, y, z, or w element of a 4-D vector by index (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Vector4 & setElem( int idx, const floatInVec &value );
+
+    // Get an x, y, z, or w element of a 4-D vector by index
+    // 
+    VECTORMATH_FORCE_INLINE const floatInVec getElem( int idx ) const;
+
+    // Subscripting operator to set or get an element
+    // 
+    VECTORMATH_FORCE_INLINE VecIdx operator []( int idx );
+
+    // Subscripting operator to get an element
+    // 
+    VECTORMATH_FORCE_INLINE const floatInVec operator []( int idx ) const;
+
+    // Add two 4-D vectors
+    // 
+    VECTORMATH_FORCE_INLINE const Vector4 operator +( const Vector4 &vec ) const;
+
+    // Subtract a 4-D vector from another 4-D vector
+    // 
+    VECTORMATH_FORCE_INLINE const Vector4 operator -( const Vector4 &vec ) const;
+
+    // Multiply a 4-D vector by a scalar
+    // 
+    VECTORMATH_FORCE_INLINE const Vector4 operator *( float scalar ) const;
+
+    // Divide a 4-D vector by a scalar
+    // 
+    VECTORMATH_FORCE_INLINE const Vector4 operator /( float scalar ) const;
+
+    // Multiply a 4-D vector by a scalar (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE const Vector4 operator *( const floatInVec &scalar ) const;
+
+    // Divide a 4-D vector by a scalar (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE const Vector4 operator /( const floatInVec &scalar ) const;
+
+    // Perform compound assignment and addition with a 4-D vector
+    // 
+    VECTORMATH_FORCE_INLINE Vector4 & operator +=( const Vector4 &vec );
+
+    // Perform compound assignment and subtraction by a 4-D vector
+    // 
+    VECTORMATH_FORCE_INLINE Vector4 & operator -=( const Vector4 &vec );
+
+    // Perform compound assignment and multiplication by a scalar
+    // 
+    VECTORMATH_FORCE_INLINE Vector4 & operator *=( float scalar );
+
+    // Perform compound assignment and division by a scalar
+    // 
+    VECTORMATH_FORCE_INLINE Vector4 & operator /=( float scalar );
+
+    // Perform compound assignment and multiplication by a scalar (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Vector4 & operator *=( const floatInVec &scalar );
+
+    // Perform compound assignment and division by a scalar (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Vector4 & operator /=( const floatInVec &scalar );
+
+    // Negate all elements of a 4-D vector
+    // 
+    VECTORMATH_FORCE_INLINE const Vector4 operator -( ) const;
+
+    // Construct x axis
+    // 
+    static VECTORMATH_FORCE_INLINE const Vector4 xAxis( );
+
+    // Construct y axis
+    // 
+    static VECTORMATH_FORCE_INLINE const Vector4 yAxis( );
+
+    // Construct z axis
+    // 
+    static VECTORMATH_FORCE_INLINE const Vector4 zAxis( );
+
+    // Construct w axis
+    // 
+    static VECTORMATH_FORCE_INLINE const Vector4 wAxis( );
+
+};
+
+// Multiply a 4-D vector by a scalar
+// 
+VECTORMATH_FORCE_INLINE const Vector4 operator *( float scalar, const Vector4 &vec );
+
+// Multiply a 4-D vector by a scalar (scalar data contained in vector data type)
+// 
+VECTORMATH_FORCE_INLINE const Vector4 operator *( const floatInVec &scalar, const Vector4 &vec );
+
+// Multiply two 4-D vectors per element
+// 
+VECTORMATH_FORCE_INLINE const Vector4 mulPerElem( const Vector4 &vec0, const Vector4 &vec1 );
+
+// Divide two 4-D vectors per element
+// NOTE: 
+// Floating-point behavior matches standard library function divf4.
+// 
+VECTORMATH_FORCE_INLINE const Vector4 divPerElem( const Vector4 &vec0, const Vector4 &vec1 );
+
+// Compute the reciprocal of a 4-D vector per element
+// NOTE: 
+// Floating-point behavior matches standard library function recipf4.
+// 
+VECTORMATH_FORCE_INLINE const Vector4 recipPerElem( const Vector4 &vec );
+
+// Compute the absolute value of a 4-D vector per element
+// 
+VECTORMATH_FORCE_INLINE const Vector4 absPerElem( const Vector4 &vec );
+
+// Copy sign from one 4-D vector to another, per element
+// 
+VECTORMATH_FORCE_INLINE const Vector4 copySignPerElem( const Vector4 &vec0, const Vector4 &vec1 );
+
+// Maximum of two 4-D vectors per element
+// 
+VECTORMATH_FORCE_INLINE const Vector4 maxPerElem( const Vector4 &vec0, const Vector4 &vec1 );
+
+// Minimum of two 4-D vectors per element
+// 
+VECTORMATH_FORCE_INLINE const Vector4 minPerElem( const Vector4 &vec0, const Vector4 &vec1 );
+
+// Maximum element of a 4-D vector
+// 
+VECTORMATH_FORCE_INLINE const floatInVec maxElem( const Vector4 &vec );
+
+// Minimum element of a 4-D vector
+// 
+VECTORMATH_FORCE_INLINE const floatInVec minElem( const Vector4 &vec );
+
+// Compute the sum of all elements of a 4-D vector
+// 
+VECTORMATH_FORCE_INLINE const floatInVec sum( const Vector4 &vec );
+
+// Compute the dot product of two 4-D vectors
+// 
+VECTORMATH_FORCE_INLINE const floatInVec dot( const Vector4 &vec0, const Vector4 &vec1 );
+
+// Compute the square of the length of a 4-D vector
+// 
+VECTORMATH_FORCE_INLINE const floatInVec lengthSqr( const Vector4 &vec );
+
+// Compute the length of a 4-D vector
+// 
+VECTORMATH_FORCE_INLINE const floatInVec length( const Vector4 &vec );
+
+// Normalize a 4-D vector
+// NOTE: 
+// The result is unpredictable when all elements of vec are at or near zero.
+// 
+VECTORMATH_FORCE_INLINE const Vector4 normalize( const Vector4 &vec );
+
+// Outer product of two 4-D vectors
+// 
+VECTORMATH_FORCE_INLINE const Matrix4 outer( const Vector4 &vec0, const Vector4 &vec1 );
+
+// Linear interpolation between two 4-D vectors
+// NOTE: 
+// Does not clamp t between 0 and 1.
+// 
+VECTORMATH_FORCE_INLINE const Vector4 lerp( float t, const Vector4 &vec0, const Vector4 &vec1 );
+
+// Linear interpolation between two 4-D vectors (scalar data contained in vector data type)
+// NOTE: 
+// Does not clamp t between 0 and 1.
+// 
+VECTORMATH_FORCE_INLINE const Vector4 lerp( const floatInVec &t, const Vector4 &vec0, const Vector4 &vec1 );
+
+// Spherical linear interpolation between two 4-D vectors
+// NOTE: 
+// The result is unpredictable if the vectors point in opposite directions.
+// Does not clamp t between 0 and 1.
+// 
+VECTORMATH_FORCE_INLINE const Vector4 slerp( float t, const Vector4 &unitVec0, const Vector4 &unitVec1 );
+
+// Spherical linear interpolation between two 4-D vectors (scalar data contained in vector data type)
+// NOTE: 
+// The result is unpredictable if the vectors point in opposite directions.
+// Does not clamp t between 0 and 1.
+// 
+VECTORMATH_FORCE_INLINE const Vector4 slerp( const floatInVec &t, const Vector4 &unitVec0, const Vector4 &unitVec1 );
+
+// Conditionally select between two 4-D vectors
+// NOTE: 
+// This function uses a conditional select instruction to avoid a branch.
+// However, the transfer of select1 to a VMX register may use more processing time than a branch.
+// Use the boolInVec version for better performance.
+// 
+VECTORMATH_FORCE_INLINE const Vector4 select( const Vector4 &vec0, const Vector4 &vec1, bool select1 );
+
+// Conditionally select between two 4-D vectors (scalar data contained in vector data type)
+// NOTE: 
+// This function uses a conditional select instruction to avoid a branch.
+// 
+VECTORMATH_FORCE_INLINE const Vector4 select( const Vector4 &vec0, const Vector4 &vec1, const boolInVec &select1 );
+
+// Store four 4-D vectors as half-floats
+// 
+VECTORMATH_FORCE_INLINE void storeHalfFloats( const Vector4 &vec0, const Vector4 &vec1, const Vector4 &vec2, const Vector4 &vec3, vec_ushort8 * twoQuads );
+
+#ifdef _VECTORMATH_DEBUG
+
+// Print a 4-D vector
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+VECTORMATH_FORCE_INLINE void print( const Vector4 &vec );
+
+// Print a 4-D vector and an associated string identifier
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+VECTORMATH_FORCE_INLINE void print( const Vector4 &vec, const char * name );
+
+#endif
+
+// A 3-D point in array-of-structures format
+//
+class Point3
+{
+    __m128 mVec128;
+
+public:
+    // Default constructor; does no initialization
+    // 
+    VECTORMATH_FORCE_INLINE Point3( ) { };
+
+    // Construct a 3-D point from x, y, and z elements
+    // 
+    VECTORMATH_FORCE_INLINE Point3( float x, float y, float z );
+
+    // Construct a 3-D point from x, y, and z elements (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Point3( const floatInVec &x, const floatInVec &y, const floatInVec &z );
+
+    // Copy elements from a 3-D vector into a 3-D point
+    // 
+    explicit VECTORMATH_FORCE_INLINE Point3( const Vector3 &vec );
+
+    // Set all elements of a 3-D point to the same scalar value
+    // 
+    explicit VECTORMATH_FORCE_INLINE Point3( float scalar );
+
+    // Set all elements of a 3-D point to the same scalar value (scalar data contained in vector data type)
+    // 
+    explicit VECTORMATH_FORCE_INLINE Point3( const floatInVec &scalar );
+
+    // Set vector float data in a 3-D point
+    // 
+    explicit VECTORMATH_FORCE_INLINE Point3( __m128 vf4 );
+
+    // Get vector float data from a 3-D point
+    // 
+    VECTORMATH_FORCE_INLINE __m128 get128( ) const;
+
+    // Assign one 3-D point to another
+    // 
+    VECTORMATH_FORCE_INLINE Point3 & operator =( const Point3 &pnt );
+
+    // Set the x element of a 3-D point
+    // 
+    VECTORMATH_FORCE_INLINE Point3 & setX( float x );
+
+    // Set the y element of a 3-D point
+    // 
+    VECTORMATH_FORCE_INLINE Point3 & setY( float y );
+
+    // Set the z element of a 3-D point
+    // 
+    VECTORMATH_FORCE_INLINE Point3 & setZ( float z );
+
+    // Set the x element of a 3-D point (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Point3 & setX( const floatInVec &x );
+
+    // Set the y element of a 3-D point (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Point3 & setY( const floatInVec &y );
+
+    // Set the z element of a 3-D point (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Point3 & setZ( const floatInVec &z );
+
+    // Get the x element of a 3-D point
+    // 
+    VECTORMATH_FORCE_INLINE const floatInVec getX( ) const;
+
+    // Get the y element of a 3-D point
+    // 
+    VECTORMATH_FORCE_INLINE const floatInVec getY( ) const;
+
+    // Get the z element of a 3-D point
+    // 
+    VECTORMATH_FORCE_INLINE const floatInVec getZ( ) const;
+
+    // Set an x, y, or z element of a 3-D point by index
+    // 
+    VECTORMATH_FORCE_INLINE Point3 & setElem( int idx, float value );
+
+    // Set an x, y, or z element of a 3-D point by index (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Point3 & setElem( int idx, const floatInVec &value );
+
+    // Get an x, y, or z element of a 3-D point by index
+    // 
+    VECTORMATH_FORCE_INLINE const floatInVec getElem( int idx ) const;
+
+    // Subscripting operator to set or get an element
+    // 
+    VECTORMATH_FORCE_INLINE VecIdx operator []( int idx );
+
+    // Subscripting operator to get an element
+    // 
+    VECTORMATH_FORCE_INLINE const floatInVec operator []( int idx ) const;
+
+    // Subtract a 3-D point from another 3-D point
+    // 
+    VECTORMATH_FORCE_INLINE const Vector3 operator -( const Point3 &pnt ) const;
+
+    // Add a 3-D point to a 3-D vector
+    // 
+    VECTORMATH_FORCE_INLINE const Point3 operator +( const Vector3 &vec ) const;
+
+    // Subtract a 3-D vector from a 3-D point
+    // 
+    VECTORMATH_FORCE_INLINE const Point3 operator -( const Vector3 &vec ) const;
+
+    // Perform compound assignment and addition with a 3-D vector
+    // 
+    VECTORMATH_FORCE_INLINE Point3 & operator +=( const Vector3 &vec );
+
+    // Perform compound assignment and subtraction by a 3-D vector
+    // 
+    VECTORMATH_FORCE_INLINE Point3 & operator -=( const Vector3 &vec );
+
+};
+
+// Multiply two 3-D points per element
+// 
+VECTORMATH_FORCE_INLINE const Point3 mulPerElem( const Point3 &pnt0, const Point3 &pnt1 );
+
+// Divide two 3-D points per element
+// NOTE: 
+// Floating-point behavior matches standard library function divf4.
+// 
+VECTORMATH_FORCE_INLINE const Point3 divPerElem( const Point3 &pnt0, const Point3 &pnt1 );
+
+// Compute the reciprocal of a 3-D point per element
+// NOTE: 
+// Floating-point behavior matches standard library function recipf4.
+// 
+VECTORMATH_FORCE_INLINE const Point3 recipPerElem( const Point3 &pnt );
+
+// Compute the absolute value of a 3-D point per element
+// 
+VECTORMATH_FORCE_INLINE const Point3 absPerElem( const Point3 &pnt );
+
+// Copy sign from one 3-D point to another, per element
+// 
+VECTORMATH_FORCE_INLINE const Point3 copySignPerElem( const Point3 &pnt0, const Point3 &pnt1 );
+
+// Maximum of two 3-D points per element
+// 
+VECTORMATH_FORCE_INLINE const Point3 maxPerElem( const Point3 &pnt0, const Point3 &pnt1 );
+
+// Minimum of two 3-D points per element
+// 
+VECTORMATH_FORCE_INLINE const Point3 minPerElem( const Point3 &pnt0, const Point3 &pnt1 );
+
+// Maximum element of a 3-D point
+// 
+VECTORMATH_FORCE_INLINE const floatInVec maxElem( const Point3 &pnt );
+
+// Minimum element of a 3-D point
+// 
+VECTORMATH_FORCE_INLINE const floatInVec minElem( const Point3 &pnt );
+
+// Compute the sum of all elements of a 3-D point
+// 
+VECTORMATH_FORCE_INLINE const floatInVec sum( const Point3 &pnt );
+
+// Apply uniform scale to a 3-D point
+// 
+VECTORMATH_FORCE_INLINE const Point3 scale( const Point3 &pnt, float scaleVal );
+
+// Apply uniform scale to a 3-D point (scalar data contained in vector data type)
+// 
+VECTORMATH_FORCE_INLINE const Point3 scale( const Point3 &pnt, const floatInVec &scaleVal );
+
+// Apply non-uniform scale to a 3-D point
+// 
+VECTORMATH_FORCE_INLINE const Point3 scale( const Point3 &pnt, const Vector3 &scaleVec );
+
+// Scalar projection of a 3-D point on a unit-length 3-D vector
+// 
+VECTORMATH_FORCE_INLINE const floatInVec projection( const Point3 &pnt, const Vector3 &unitVec );
+
+// Compute the square of the distance of a 3-D point from the coordinate-system origin
+// 
+VECTORMATH_FORCE_INLINE const floatInVec distSqrFromOrigin( const Point3 &pnt );
+
+// Compute the distance of a 3-D point from the coordinate-system origin
+// 
+VECTORMATH_FORCE_INLINE const floatInVec distFromOrigin( const Point3 &pnt );
+
+// Compute the square of the distance between two 3-D points
+// 
+VECTORMATH_FORCE_INLINE const floatInVec distSqr( const Point3 &pnt0, const Point3 &pnt1 );
+
+// Compute the distance between two 3-D points
+// 
+VECTORMATH_FORCE_INLINE const floatInVec dist( const Point3 &pnt0, const Point3 &pnt1 );
+
+// Linear interpolation between two 3-D points
+// NOTE: 
+// Does not clamp t between 0 and 1.
+// 
+VECTORMATH_FORCE_INLINE const Point3 lerp( float t, const Point3 &pnt0, const Point3 &pnt1 );
+
+// Linear interpolation between two 3-D points (scalar data contained in vector data type)
+// NOTE: 
+// Does not clamp t between 0 and 1.
+// 
+VECTORMATH_FORCE_INLINE const Point3 lerp( const floatInVec &t, const Point3 &pnt0, const Point3 &pnt1 );
+
+// Conditionally select between two 3-D points
+// NOTE: 
+// This function uses a conditional select instruction to avoid a branch.
+// However, the transfer of select1 to a VMX register may use more processing time than a branch.
+// Use the boolInVec version for better performance.
+// 
+VECTORMATH_FORCE_INLINE const Point3 select( const Point3 &pnt0, const Point3 &pnt1, bool select1 );
+
+// Conditionally select between two 3-D points (scalar data contained in vector data type)
+// NOTE: 
+// This function uses a conditional select instruction to avoid a branch.
+// 
+VECTORMATH_FORCE_INLINE const Point3 select( const Point3 &pnt0, const Point3 &pnt1, const boolInVec &select1 );
+
+// Store x, y, and z elements of 3-D point in first three words of a quadword, preserving fourth word
+// 
+VECTORMATH_FORCE_INLINE void storeXYZ( const Point3 &pnt, __m128 * quad );
+
+// Load four three-float 3-D points, stored in three quadwords
+// 
+VECTORMATH_FORCE_INLINE void loadXYZArray( Point3 & pnt0, Point3 & pnt1, Point3 & pnt2, Point3 & pnt3, const __m128 * threeQuads );
+
+// Store four 3-D points in three quadwords
+// 
+VECTORMATH_FORCE_INLINE void storeXYZArray( const Point3 &pnt0, const Point3 &pnt1, const Point3 &pnt2, const Point3 &pnt3, __m128 * threeQuads );
+
+// Store eight 3-D points as half-floats
+// 
+VECTORMATH_FORCE_INLINE void storeHalfFloats( const Point3 &pnt0, const Point3 &pnt1, const Point3 &pnt2, const Point3 &pnt3, const Point3 &pnt4, const Point3 &pnt5, const Point3 &pnt6, const Point3 &pnt7, vec_ushort8 * threeQuads );
+
+#ifdef _VECTORMATH_DEBUG
+
+// Print a 3-D point
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+VECTORMATH_FORCE_INLINE void print( const Point3 &pnt );
+
+// Print a 3-D point and an associated string identifier
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+VECTORMATH_FORCE_INLINE void print( const Point3 &pnt, const char * name );
+
+#endif
+
+// A quaternion in array-of-structures format
+//
+class Quat
+{
+    __m128 mVec128;
+
+public:
+    // Default constructor; does no initialization
+    // 
+    VECTORMATH_FORCE_INLINE Quat( ) { };
+
+	VECTORMATH_FORCE_INLINE  Quat(const Quat& quat);
+
+    // Construct a quaternion from x, y, z, and w elements
+    // 
+    VECTORMATH_FORCE_INLINE Quat( float x, float y, float z, float w );
+
+    // Construct a quaternion from x, y, z, and w elements (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Quat( const floatInVec &x, const floatInVec &y, const floatInVec &z, const floatInVec &w );
+
+    // Construct a quaternion from a 3-D vector and a scalar
+    // 
+    VECTORMATH_FORCE_INLINE Quat( const Vector3 &xyz, float w );
+
+    // Construct a quaternion from a 3-D vector and a scalar (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Quat( const Vector3 &xyz, const floatInVec &w );
+
+    // Copy elements from a 4-D vector into a quaternion
+    // 
+    explicit VECTORMATH_FORCE_INLINE Quat( const Vector4 &vec );
+
+    // Convert a rotation matrix to a unit-length quaternion
+    // 
+    explicit VECTORMATH_FORCE_INLINE Quat( const Matrix3 & rotMat );
+
+    // Set all elements of a quaternion to the same scalar value
+    // 
+    explicit VECTORMATH_FORCE_INLINE Quat( float scalar );
+
+    // Set all elements of a quaternion to the same scalar value (scalar data contained in vector data type)
+    // 
+    explicit VECTORMATH_FORCE_INLINE Quat( const floatInVec &scalar );
+
+    // Set vector float data in a quaternion
+    // 
+    explicit VECTORMATH_FORCE_INLINE Quat( __m128 vf4 );
+
+    // Get vector float data from a quaternion
+    // 
+    VECTORMATH_FORCE_INLINE __m128 get128( ) const;
+
+	// Set a quaterion from vector float data
+    //
+	VECTORMATH_FORCE_INLINE void set128(vec_float4 vec);
+
+    // Assign one quaternion to another
+    // 
+    VECTORMATH_FORCE_INLINE Quat & operator =( const Quat &quat );
+
+    // Set the x, y, and z elements of a quaternion
+    // NOTE: 
+    // This function does not change the w element.
+    // 
+    VECTORMATH_FORCE_INLINE Quat & setXYZ( const Vector3 &vec );
+
+    // Get the x, y, and z elements of a quaternion
+    // 
+    VECTORMATH_FORCE_INLINE const Vector3 getXYZ( ) const;
+
+    // Set the x element of a quaternion
+    // 
+    VECTORMATH_FORCE_INLINE Quat & setX( float x );
+
+    // Set the y element of a quaternion
+    // 
+    VECTORMATH_FORCE_INLINE Quat & setY( float y );
+
+    // Set the z element of a quaternion
+    // 
+    VECTORMATH_FORCE_INLINE Quat & setZ( float z );
+
+    // Set the w element of a quaternion
+    // 
+    VECTORMATH_FORCE_INLINE Quat & setW( float w );
+
+    // Set the x element of a quaternion (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Quat & setX( const floatInVec &x );
+
+    // Set the y element of a quaternion (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Quat & setY( const floatInVec &y );
+
+    // Set the z element of a quaternion (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Quat & setZ( const floatInVec &z );
+
+    // Set the w element of a quaternion (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Quat & setW( const floatInVec &w );
+
+    // Get the x element of a quaternion
+    // 
+    VECTORMATH_FORCE_INLINE const floatInVec getX( ) const;
+
+    // Get the y element of a quaternion
+    // 
+    VECTORMATH_FORCE_INLINE const floatInVec getY( ) const;
+
+    // Get the z element of a quaternion
+    // 
+    VECTORMATH_FORCE_INLINE const floatInVec getZ( ) const;
+
+    // Get the w element of a quaternion
+    // 
+    VECTORMATH_FORCE_INLINE const floatInVec getW( ) const;
+
+    // Set an x, y, z, or w element of a quaternion by index
+    // 
+    VECTORMATH_FORCE_INLINE Quat & setElem( int idx, float value );
+
+    // Set an x, y, z, or w element of a quaternion by index (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Quat & setElem( int idx, const floatInVec &value );
+
+    // Get an x, y, z, or w element of a quaternion by index
+    // 
+    VECTORMATH_FORCE_INLINE const floatInVec getElem( int idx ) const;
+
+    // Subscripting operator to set or get an element
+    // 
+    VECTORMATH_FORCE_INLINE VecIdx operator []( int idx );
+
+    // Subscripting operator to get an element
+    // 
+    VECTORMATH_FORCE_INLINE const floatInVec operator []( int idx ) const;
+
+    // Add two quaternions
+    // 
+    VECTORMATH_FORCE_INLINE const Quat operator +( const Quat &quat ) const;
+
+    // Subtract a quaternion from another quaternion
+    // 
+    VECTORMATH_FORCE_INLINE const Quat operator -( const Quat &quat ) const;
+
+    // Multiply two quaternions
+    // 
+    VECTORMATH_FORCE_INLINE const Quat operator *( const Quat &quat ) const;
+
+    // Multiply a quaternion by a scalar
+    // 
+    VECTORMATH_FORCE_INLINE const Quat operator *( float scalar ) const;
+
+    // Divide a quaternion by a scalar
+    // 
+    VECTORMATH_FORCE_INLINE const Quat operator /( float scalar ) const;
+
+    // Multiply a quaternion by a scalar (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE const Quat operator *( const floatInVec &scalar ) const;
+
+    // Divide a quaternion by a scalar (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE const Quat operator /( const floatInVec &scalar ) const;
+
+    // Perform compound assignment and addition with a quaternion
+    // 
+    VECTORMATH_FORCE_INLINE Quat & operator +=( const Quat &quat );
+
+    // Perform compound assignment and subtraction by a quaternion
+    // 
+    VECTORMATH_FORCE_INLINE Quat & operator -=( const Quat &quat );
+
+    // Perform compound assignment and multiplication by a quaternion
+    // 
+    VECTORMATH_FORCE_INLINE Quat & operator *=( const Quat &quat );
+
+    // Perform compound assignment and multiplication by a scalar
+    // 
+    VECTORMATH_FORCE_INLINE Quat & operator *=( float scalar );
+
+    // Perform compound assignment and division by a scalar
+    // 
+    VECTORMATH_FORCE_INLINE Quat & operator /=( float scalar );
+
+    // Perform compound assignment and multiplication by a scalar (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Quat & operator *=( const floatInVec &scalar );
+
+    // Perform compound assignment and division by a scalar (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Quat & operator /=( const floatInVec &scalar );
+
+    // Negate all elements of a quaternion
+    // 
+    VECTORMATH_FORCE_INLINE const Quat operator -( ) const;
+
+    // Construct an identity quaternion
+    // 
+    static VECTORMATH_FORCE_INLINE const Quat identity( );
+
+    // Construct a quaternion to rotate between two unit-length 3-D vectors
+    // NOTE: 
+    // The result is unpredictable if unitVec0 and unitVec1 point in opposite directions.
+    // 
+    static VECTORMATH_FORCE_INLINE const Quat rotation( const Vector3 &unitVec0, const Vector3 &unitVec1 );
+
+    // Construct a quaternion to rotate around a unit-length 3-D vector
+    // 
+    static VECTORMATH_FORCE_INLINE const Quat rotation( float radians, const Vector3 &unitVec );
+
+    // Construct a quaternion to rotate around a unit-length 3-D vector (scalar data contained in vector data type)
+    // 
+    static VECTORMATH_FORCE_INLINE const Quat rotation( const floatInVec &radians, const Vector3 &unitVec );
+
+    // Construct a quaternion to rotate around the x axis
+    // 
+    static VECTORMATH_FORCE_INLINE const Quat rotationX( float radians );
+
+    // Construct a quaternion to rotate around the y axis
+    // 
+    static VECTORMATH_FORCE_INLINE const Quat rotationY( float radians );
+
+    // Construct a quaternion to rotate around the z axis
+    // 
+    static VECTORMATH_FORCE_INLINE const Quat rotationZ( float radians );
+
+    // Construct a quaternion to rotate around the x axis (scalar data contained in vector data type)
+    // 
+    static VECTORMATH_FORCE_INLINE const Quat rotationX( const floatInVec &radians );
+
+    // Construct a quaternion to rotate around the y axis (scalar data contained in vector data type)
+    // 
+    static VECTORMATH_FORCE_INLINE const Quat rotationY( const floatInVec &radians );
+
+    // Construct a quaternion to rotate around the z axis (scalar data contained in vector data type)
+    // 
+    static VECTORMATH_FORCE_INLINE const Quat rotationZ( const floatInVec &radians );
+
+};
+
+// Multiply a quaternion by a scalar
+// 
+VECTORMATH_FORCE_INLINE const Quat operator *( float scalar, const Quat &quat );
+
+// Multiply a quaternion by a scalar (scalar data contained in vector data type)
+// 
+VECTORMATH_FORCE_INLINE const Quat operator *( const floatInVec &scalar, const Quat &quat );
+
+// Compute the conjugate of a quaternion
+// 
+VECTORMATH_FORCE_INLINE const Quat conj( const Quat &quat );
+
+// Use a unit-length quaternion to rotate a 3-D vector
+// 
+VECTORMATH_FORCE_INLINE const Vector3 rotate( const Quat &unitQuat, const Vector3 &vec );
+
+// Compute the dot product of two quaternions
+// 
+VECTORMATH_FORCE_INLINE const floatInVec dot( const Quat &quat0, const Quat &quat1 );
+
+// Compute the norm of a quaternion
+// 
+VECTORMATH_FORCE_INLINE const floatInVec norm( const Quat &quat );
+
+// Compute the length of a quaternion
+// 
+VECTORMATH_FORCE_INLINE const floatInVec length( const Quat &quat );
+
+// Normalize a quaternion
+// NOTE: 
+// The result is unpredictable when all elements of quat are at or near zero.
+// 
+VECTORMATH_FORCE_INLINE const Quat normalize( const Quat &quat );
+
+// Linear interpolation between two quaternions
+// NOTE: 
+// Does not clamp t between 0 and 1.
+// 
+VECTORMATH_FORCE_INLINE const Quat lerp( float t, const Quat &quat0, const Quat &quat1 );
+
+// Linear interpolation between two quaternions (scalar data contained in vector data type)
+// NOTE: 
+// Does not clamp t between 0 and 1.
+// 
+VECTORMATH_FORCE_INLINE const Quat lerp( const floatInVec &t, const Quat &quat0, const Quat &quat1 );
+
+// Spherical linear interpolation between two quaternions
+// NOTE: 
+// Interpolates along the shortest path between orientations.
+// Does not clamp t between 0 and 1.
+// 
+VECTORMATH_FORCE_INLINE const Quat slerp( float t, const Quat &unitQuat0, const Quat &unitQuat1 );
+
+// Spherical linear interpolation between two quaternions (scalar data contained in vector data type)
+// NOTE: 
+// Interpolates along the shortest path between orientations.
+// Does not clamp t between 0 and 1.
+// 
+VECTORMATH_FORCE_INLINE const Quat slerp( const floatInVec &t, const Quat &unitQuat0, const Quat &unitQuat1 );
+
+// Spherical quadrangle interpolation
+// 
+VECTORMATH_FORCE_INLINE const Quat squad( float t, const Quat &unitQuat0, const Quat &unitQuat1, const Quat &unitQuat2, const Quat &unitQuat3 );
+
+// Spherical quadrangle interpolation (scalar data contained in vector data type)
+// 
+VECTORMATH_FORCE_INLINE const Quat squad( const floatInVec &t, const Quat &unitQuat0, const Quat &unitQuat1, const Quat &unitQuat2, const Quat &unitQuat3 );
+
+// Conditionally select between two quaternions
+// NOTE: 
+// This function uses a conditional select instruction to avoid a branch.
+// However, the transfer of select1 to a VMX register may use more processing time than a branch.
+// Use the boolInVec version for better performance.
+// 
+VECTORMATH_FORCE_INLINE const Quat select( const Quat &quat0, const Quat &quat1, bool select1 );
+
+// Conditionally select between two quaternions (scalar data contained in vector data type)
+// NOTE: 
+// This function uses a conditional select instruction to avoid a branch.
+// 
+VECTORMATH_FORCE_INLINE const Quat select( const Quat &quat0, const Quat &quat1, const boolInVec &select1 );
+
+#ifdef _VECTORMATH_DEBUG
+
+// Print a quaternion
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+VECTORMATH_FORCE_INLINE void print( const Quat &quat );
+
+// Print a quaternion and an associated string identifier
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+VECTORMATH_FORCE_INLINE void print( const Quat &quat, const char * name );
+
+#endif
+
+// A 3x3 matrix in array-of-structures format
+//
+class Matrix3
+{
+    Vector3 mCol0;
+    Vector3 mCol1;
+    Vector3 mCol2;
+
+public:
+    // Default constructor; does no initialization
+    // 
+    VECTORMATH_FORCE_INLINE Matrix3( ) { };
+
+    // Copy a 3x3 matrix
+    // 
+    VECTORMATH_FORCE_INLINE Matrix3( const Matrix3 & mat );
+
+    // Construct a 3x3 matrix containing the specified columns
+    // 
+    VECTORMATH_FORCE_INLINE Matrix3( const Vector3 &col0, const Vector3 &col1, const Vector3 &col2 );
+
+    // Construct a 3x3 rotation matrix from a unit-length quaternion
+    // 
+    explicit VECTORMATH_FORCE_INLINE Matrix3( const Quat &unitQuat );
+
+    // Set all elements of a 3x3 matrix to the same scalar value
+    // 
+    explicit VECTORMATH_FORCE_INLINE Matrix3( float scalar );
+
+    // Set all elements of a 3x3 matrix to the same scalar value (scalar data contained in vector data type)
+    // 
+    explicit VECTORMATH_FORCE_INLINE Matrix3( const floatInVec &scalar );
+
+    // Assign one 3x3 matrix to another
+    // 
+    VECTORMATH_FORCE_INLINE Matrix3 & operator =( const Matrix3 & mat );
+
+    // Set column 0 of a 3x3 matrix
+    // 
+    VECTORMATH_FORCE_INLINE Matrix3 & setCol0( const Vector3 &col0 );
+
+    // Set column 1 of a 3x3 matrix
+    // 
+    VECTORMATH_FORCE_INLINE Matrix3 & setCol1( const Vector3 &col1 );
+
+    // Set column 2 of a 3x3 matrix
+    // 
+    VECTORMATH_FORCE_INLINE Matrix3 & setCol2( const Vector3 &col2 );
+
+    // Get column 0 of a 3x3 matrix
+    // 
+    VECTORMATH_FORCE_INLINE const Vector3 getCol0( ) const;
+
+    // Get column 1 of a 3x3 matrix
+    // 
+    VECTORMATH_FORCE_INLINE const Vector3 getCol1( ) const;
+
+    // Get column 2 of a 3x3 matrix
+    // 
+    VECTORMATH_FORCE_INLINE const Vector3 getCol2( ) const;
+
+    // Set the column of a 3x3 matrix referred to by the specified index
+    // 
+    VECTORMATH_FORCE_INLINE Matrix3 & setCol( int col, const Vector3 &vec );
+
+    // Set the row of a 3x3 matrix referred to by the specified index
+    // 
+    VECTORMATH_FORCE_INLINE Matrix3 & setRow( int row, const Vector3 &vec );
+
+    // Get the column of a 3x3 matrix referred to by the specified index
+    // 
+    VECTORMATH_FORCE_INLINE const Vector3 getCol( int col ) const;
+
+    // Get the row of a 3x3 matrix referred to by the specified index
+    // 
+    VECTORMATH_FORCE_INLINE const Vector3 getRow( int row ) const;
+
+    // Subscripting operator to set or get a column
+    // 
+    VECTORMATH_FORCE_INLINE Vector3 & operator []( int col );
+
+    // Subscripting operator to get a column
+    // 
+    VECTORMATH_FORCE_INLINE const Vector3 operator []( int col ) const;
+
+    // Set the element of a 3x3 matrix referred to by column and row indices
+    // 
+    VECTORMATH_FORCE_INLINE Matrix3 & setElem( int col, int row, float val );
+
+    // Set the element of a 3x3 matrix referred to by column and row indices (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Matrix3 & setElem( int col, int row, const floatInVec &val );
+
+    // Get the element of a 3x3 matrix referred to by column and row indices
+    // 
+    VECTORMATH_FORCE_INLINE const floatInVec getElem( int col, int row ) const;
+
+    // Add two 3x3 matrices
+    // 
+    VECTORMATH_FORCE_INLINE const Matrix3 operator +( const Matrix3 & mat ) const;
+
+    // Subtract a 3x3 matrix from another 3x3 matrix
+    // 
+    VECTORMATH_FORCE_INLINE const Matrix3 operator -( const Matrix3 & mat ) const;
+
+    // Negate all elements of a 3x3 matrix
+    // 
+    VECTORMATH_FORCE_INLINE const Matrix3 operator -( ) const;
+
+    // Multiply a 3x3 matrix by a scalar
+    // 
+    VECTORMATH_FORCE_INLINE const Matrix3 operator *( float scalar ) const;
+
+    // Multiply a 3x3 matrix by a scalar (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE const Matrix3 operator *( const floatInVec &scalar ) const;
+
+    // Multiply a 3x3 matrix by a 3-D vector
+    // 
+    VECTORMATH_FORCE_INLINE const Vector3 operator *( const Vector3 &vec ) const;
+
+    // Multiply two 3x3 matrices
+    // 
+    VECTORMATH_FORCE_INLINE const Matrix3 operator *( const Matrix3 & mat ) const;
+
+    // Perform compound assignment and addition with a 3x3 matrix
+    // 
+    VECTORMATH_FORCE_INLINE Matrix3 & operator +=( const Matrix3 & mat );
+
+    // Perform compound assignment and subtraction by a 3x3 matrix
+    // 
+    VECTORMATH_FORCE_INLINE Matrix3 & operator -=( const Matrix3 & mat );
+
+    // Perform compound assignment and multiplication by a scalar
+    // 
+    VECTORMATH_FORCE_INLINE Matrix3 & operator *=( float scalar );
+
+    // Perform compound assignment and multiplication by a scalar (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Matrix3 & operator *=( const floatInVec &scalar );
+
+    // Perform compound assignment and multiplication by a 3x3 matrix
+    // 
+    VECTORMATH_FORCE_INLINE Matrix3 & operator *=( const Matrix3 & mat );
+
+    // Construct an identity 3x3 matrix
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix3 identity( );
+
+    // Construct a 3x3 matrix to rotate around the x axis
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix3 rotationX( float radians );
+
+    // Construct a 3x3 matrix to rotate around the y axis
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix3 rotationY( float radians );
+
+    // Construct a 3x3 matrix to rotate around the z axis
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix3 rotationZ( float radians );
+
+    // Construct a 3x3 matrix to rotate around the x axis (scalar data contained in vector data type)
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix3 rotationX( const floatInVec &radians );
+
+    // Construct a 3x3 matrix to rotate around the y axis (scalar data contained in vector data type)
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix3 rotationY( const floatInVec &radians );
+
+    // Construct a 3x3 matrix to rotate around the z axis (scalar data contained in vector data type)
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix3 rotationZ( const floatInVec &radians );
+
+    // Construct a 3x3 matrix to rotate around the x, y, and z axes
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix3 rotationZYX( const Vector3 &radiansXYZ );
+
+    // Construct a 3x3 matrix to rotate around a unit-length 3-D vector
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix3 rotation( float radians, const Vector3 &unitVec );
+
+    // Construct a 3x3 matrix to rotate around a unit-length 3-D vector (scalar data contained in vector data type)
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix3 rotation( const floatInVec &radians, const Vector3 &unitVec );
+
+    // Construct a rotation matrix from a unit-length quaternion
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix3 rotation( const Quat &unitQuat );
+
+    // Construct a 3x3 matrix to perform scaling
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix3 scale( const Vector3 &scaleVec );
+
+};
+// Multiply a 3x3 matrix by a scalar
+// 
+VECTORMATH_FORCE_INLINE const Matrix3 operator *( float scalar, const Matrix3 & mat );
+
+// Multiply a 3x3 matrix by a scalar (scalar data contained in vector data type)
+// 
+VECTORMATH_FORCE_INLINE const Matrix3 operator *( const floatInVec &scalar, const Matrix3 & mat );
+
+// Append (post-multiply) a scale transformation to a 3x3 matrix
+// NOTE: 
+// Faster than creating and multiplying a scale transformation matrix.
+// 
+VECTORMATH_FORCE_INLINE const Matrix3 appendScale( const Matrix3 & mat, const Vector3 &scaleVec );
+
+// Prepend (pre-multiply) a scale transformation to a 3x3 matrix
+// NOTE: 
+// Faster than creating and multiplying a scale transformation matrix.
+// 
+VECTORMATH_FORCE_INLINE const Matrix3 prependScale( const Vector3 &scaleVec, const Matrix3 & mat );
+
+// Multiply two 3x3 matrices per element
+// 
+VECTORMATH_FORCE_INLINE const Matrix3 mulPerElem( const Matrix3 & mat0, const Matrix3 & mat1 );
+
+// Compute the absolute value of a 3x3 matrix per element
+// 
+VECTORMATH_FORCE_INLINE const Matrix3 absPerElem( const Matrix3 & mat );
+
+// Transpose of a 3x3 matrix
+// 
+VECTORMATH_FORCE_INLINE const Matrix3 transpose( const Matrix3 & mat );
+
+// Compute the inverse of a 3x3 matrix
+// NOTE: 
+// Result is unpredictable when the determinant of mat is equal to or near 0.
+// 
+VECTORMATH_FORCE_INLINE const Matrix3 inverse( const Matrix3 & mat );
+
+// Determinant of a 3x3 matrix
+// 
+VECTORMATH_FORCE_INLINE const floatInVec determinant( const Matrix3 & mat );
+
+// Conditionally select between two 3x3 matrices
+// NOTE: 
+// This function uses a conditional select instruction to avoid a branch.
+// However, the transfer of select1 to a VMX register may use more processing time than a branch.
+// Use the boolInVec version for better performance.
+// 
+VECTORMATH_FORCE_INLINE const Matrix3 select( const Matrix3 & mat0, const Matrix3 & mat1, bool select1 );
+
+// Conditionally select between two 3x3 matrices (scalar data contained in vector data type)
+// NOTE: 
+// This function uses a conditional select instruction to avoid a branch.
+// 
+VECTORMATH_FORCE_INLINE const Matrix3 select( const Matrix3 & mat0, const Matrix3 & mat1, const boolInVec &select1 );
+
+#ifdef _VECTORMATH_DEBUG
+
+// Print a 3x3 matrix
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+VECTORMATH_FORCE_INLINE void print( const Matrix3 & mat );
+
+// Print a 3x3 matrix and an associated string identifier
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+VECTORMATH_FORCE_INLINE void print( const Matrix3 & mat, const char * name );
+
+#endif
+
+// A 4x4 matrix in array-of-structures format
+//
+class Matrix4
+{
+    Vector4 mCol0;
+    Vector4 mCol1;
+    Vector4 mCol2;
+    Vector4 mCol3;
+
+public:
+    // Default constructor; does no initialization
+    // 
+    VECTORMATH_FORCE_INLINE Matrix4( ) { };
+
+    // Copy a 4x4 matrix
+    // 
+    VECTORMATH_FORCE_INLINE Matrix4( const Matrix4 & mat );
+
+    // Construct a 4x4 matrix containing the specified columns
+    // 
+    VECTORMATH_FORCE_INLINE Matrix4( const Vector4 &col0, const Vector4 &col1, const Vector4 &col2, const Vector4 &col3 );
+
+    // Construct a 4x4 matrix from a 3x4 transformation matrix
+    // 
+    explicit VECTORMATH_FORCE_INLINE Matrix4( const Transform3 & mat );
+
+    // Construct a 4x4 matrix from a 3x3 matrix and a 3-D vector
+    // 
+    VECTORMATH_FORCE_INLINE Matrix4( const Matrix3 & mat, const Vector3 &translateVec );
+
+    // Construct a 4x4 matrix from a unit-length quaternion and a 3-D vector
+    // 
+    VECTORMATH_FORCE_INLINE Matrix4( const Quat &unitQuat, const Vector3 &translateVec );
+
+    // Set all elements of a 4x4 matrix to the same scalar value
+    // 
+    explicit VECTORMATH_FORCE_INLINE Matrix4( float scalar );
+
+    // Set all elements of a 4x4 matrix to the same scalar value (scalar data contained in vector data type)
+    // 
+    explicit VECTORMATH_FORCE_INLINE Matrix4( const floatInVec &scalar );
+
+    // Assign one 4x4 matrix to another
+    // 
+    VECTORMATH_FORCE_INLINE Matrix4 & operator =( const Matrix4 & mat );
+
+    // Set the upper-left 3x3 submatrix
+    // NOTE: 
+    // This function does not change the bottom row elements.
+    // 
+    VECTORMATH_FORCE_INLINE Matrix4 & setUpper3x3( const Matrix3 & mat3 );
+
+    // Get the upper-left 3x3 submatrix of a 4x4 matrix
+    // 
+    VECTORMATH_FORCE_INLINE const Matrix3 getUpper3x3( ) const;
+
+    // Set translation component
+    // NOTE: 
+    // This function does not change the bottom row elements.
+    // 
+    VECTORMATH_FORCE_INLINE Matrix4 & setTranslation( const Vector3 &translateVec );
+
+    // Get the translation component of a 4x4 matrix
+    // 
+    VECTORMATH_FORCE_INLINE const Vector3 getTranslation( ) const;
+
+    // Set column 0 of a 4x4 matrix
+    // 
+    VECTORMATH_FORCE_INLINE Matrix4 & setCol0( const Vector4 &col0 );
+
+    // Set column 1 of a 4x4 matrix
+    // 
+    VECTORMATH_FORCE_INLINE Matrix4 & setCol1( const Vector4 &col1 );
+
+    // Set column 2 of a 4x4 matrix
+    // 
+    VECTORMATH_FORCE_INLINE Matrix4 & setCol2( const Vector4 &col2 );
+
+    // Set column 3 of a 4x4 matrix
+    // 
+    VECTORMATH_FORCE_INLINE Matrix4 & setCol3( const Vector4 &col3 );
+
+    // Get column 0 of a 4x4 matrix
+    // 
+    VECTORMATH_FORCE_INLINE const Vector4 getCol0( ) const;
+
+    // Get column 1 of a 4x4 matrix
+    // 
+    VECTORMATH_FORCE_INLINE const Vector4 getCol1( ) const;
+
+    // Get column 2 of a 4x4 matrix
+    // 
+    VECTORMATH_FORCE_INLINE const Vector4 getCol2( ) const;
+
+    // Get column 3 of a 4x4 matrix
+    // 
+    VECTORMATH_FORCE_INLINE const Vector4 getCol3( ) const;
+
+    // Set the column of a 4x4 matrix referred to by the specified index
+    // 
+    VECTORMATH_FORCE_INLINE Matrix4 & setCol( int col, const Vector4 &vec );
+
+    // Set the row of a 4x4 matrix referred to by the specified index
+    // 
+    VECTORMATH_FORCE_INLINE Matrix4 & setRow( int row, const Vector4 &vec );
+
+    // Get the column of a 4x4 matrix referred to by the specified index
+    // 
+    VECTORMATH_FORCE_INLINE const Vector4 getCol( int col ) const;
+
+    // Get the row of a 4x4 matrix referred to by the specified index
+    // 
+    VECTORMATH_FORCE_INLINE const Vector4 getRow( int row ) const;
+
+    // Subscripting operator to set or get a column
+    // 
+    VECTORMATH_FORCE_INLINE Vector4 & operator []( int col );
+
+    // Subscripting operator to get a column
+    // 
+    VECTORMATH_FORCE_INLINE const Vector4 operator []( int col ) const;
+
+    // Set the element of a 4x4 matrix referred to by column and row indices
+    // 
+    VECTORMATH_FORCE_INLINE Matrix4 & setElem( int col, int row, float val );
+
+    // Set the element of a 4x4 matrix referred to by column and row indices (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Matrix4 & setElem( int col, int row, const floatInVec &val );
+
+    // Get the element of a 4x4 matrix referred to by column and row indices
+    // 
+    VECTORMATH_FORCE_INLINE const floatInVec getElem( int col, int row ) const;
+
+    // Add two 4x4 matrices
+    // 
+    VECTORMATH_FORCE_INLINE const Matrix4 operator +( const Matrix4 & mat ) const;
+
+    // Subtract a 4x4 matrix from another 4x4 matrix
+    // 
+    VECTORMATH_FORCE_INLINE const Matrix4 operator -( const Matrix4 & mat ) const;
+
+    // Negate all elements of a 4x4 matrix
+    // 
+    VECTORMATH_FORCE_INLINE const Matrix4 operator -( ) const;
+
+    // Multiply a 4x4 matrix by a scalar
+    // 
+    VECTORMATH_FORCE_INLINE const Matrix4 operator *( float scalar ) const;
+
+    // Multiply a 4x4 matrix by a scalar (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE const Matrix4 operator *( const floatInVec &scalar ) const;
+
+    // Multiply a 4x4 matrix by a 4-D vector
+    // 
+    VECTORMATH_FORCE_INLINE const Vector4 operator *( const Vector4 &vec ) const;
+
+    // Multiply a 4x4 matrix by a 3-D vector
+    // 
+    VECTORMATH_FORCE_INLINE const Vector4 operator *( const Vector3 &vec ) const;
+
+    // Multiply a 4x4 matrix by a 3-D point
+    // 
+    VECTORMATH_FORCE_INLINE const Vector4 operator *( const Point3 &pnt ) const;
+
+    // Multiply two 4x4 matrices
+    // 
+    VECTORMATH_FORCE_INLINE const Matrix4 operator *( const Matrix4 & mat ) const;
+
+    // Multiply a 4x4 matrix by a 3x4 transformation matrix
+    // 
+    VECTORMATH_FORCE_INLINE const Matrix4 operator *( const Transform3 & tfrm ) const;
+
+    // Perform compound assignment and addition with a 4x4 matrix
+    // 
+    VECTORMATH_FORCE_INLINE Matrix4 & operator +=( const Matrix4 & mat );
+
+    // Perform compound assignment and subtraction by a 4x4 matrix
+    // 
+    VECTORMATH_FORCE_INLINE Matrix4 & operator -=( const Matrix4 & mat );
+
+    // Perform compound assignment and multiplication by a scalar
+    // 
+    VECTORMATH_FORCE_INLINE Matrix4 & operator *=( float scalar );
+
+    // Perform compound assignment and multiplication by a scalar (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Matrix4 & operator *=( const floatInVec &scalar );
+
+    // Perform compound assignment and multiplication by a 4x4 matrix
+    // 
+    VECTORMATH_FORCE_INLINE Matrix4 & operator *=( const Matrix4 & mat );
+
+    // Perform compound assignment and multiplication by a 3x4 transformation matrix
+    // 
+    VECTORMATH_FORCE_INLINE Matrix4 & operator *=( const Transform3 & tfrm );
+
+    // Construct an identity 4x4 matrix
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix4 identity( );
+
+    // Construct a 4x4 matrix to rotate around the x axis
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix4 rotationX( float radians );
+
+    // Construct a 4x4 matrix to rotate around the y axis
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix4 rotationY( float radians );
+
+    // Construct a 4x4 matrix to rotate around the z axis
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix4 rotationZ( float radians );
+
+    // Construct a 4x4 matrix to rotate around the x axis (scalar data contained in vector data type)
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix4 rotationX( const floatInVec &radians );
+
+    // Construct a 4x4 matrix to rotate around the y axis (scalar data contained in vector data type)
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix4 rotationY( const floatInVec &radians );
+
+    // Construct a 4x4 matrix to rotate around the z axis (scalar data contained in vector data type)
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix4 rotationZ( const floatInVec &radians );
+
+    // Construct a 4x4 matrix to rotate around the x, y, and z axes
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix4 rotationZYX( const Vector3 &radiansXYZ );
+
+    // Construct a 4x4 matrix to rotate around a unit-length 3-D vector
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix4 rotation( float radians, const Vector3 &unitVec );
+
+    // Construct a 4x4 matrix to rotate around a unit-length 3-D vector (scalar data contained in vector data type)
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix4 rotation( const floatInVec &radians, const Vector3 &unitVec );
+
+    // Construct a rotation matrix from a unit-length quaternion
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix4 rotation( const Quat &unitQuat );
+
+    // Construct a 4x4 matrix to perform scaling
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix4 scale( const Vector3 &scaleVec );
+
+    // Construct a 4x4 matrix to perform translation
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix4 translation( const Vector3 &translateVec );
+
+    // Construct viewing matrix based on eye, position looked at, and up direction
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix4 lookAt( const Point3 &eyePos, const Point3 &lookAtPos, const Vector3 &upVec );
+
+    // Construct a perspective projection matrix
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix4 perspective( float fovyRadians, float aspect, float zNear, float zFar );
+
+    // Construct a perspective projection matrix based on frustum
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix4 frustum( float left, float right, float bottom, float top, float zNear, float zFar );
+
+    // Construct an orthographic projection matrix
+    // 
+    static VECTORMATH_FORCE_INLINE const Matrix4 orthographic( float left, float right, float bottom, float top, float zNear, float zFar );
+
+};
+// Multiply a 4x4 matrix by a scalar
+// 
+VECTORMATH_FORCE_INLINE const Matrix4 operator *( float scalar, const Matrix4 & mat );
+
+// Multiply a 4x4 matrix by a scalar (scalar data contained in vector data type)
+// 
+VECTORMATH_FORCE_INLINE const Matrix4 operator *( const floatInVec &scalar, const Matrix4 & mat );
+
+// Append (post-multiply) a scale transformation to a 4x4 matrix
+// NOTE: 
+// Faster than creating and multiplying a scale transformation matrix.
+// 
+VECTORMATH_FORCE_INLINE const Matrix4 appendScale( const Matrix4 & mat, const Vector3 &scaleVec );
+
+// Prepend (pre-multiply) a scale transformation to a 4x4 matrix
+// NOTE: 
+// Faster than creating and multiplying a scale transformation matrix.
+// 
+VECTORMATH_FORCE_INLINE const Matrix4 prependScale( const Vector3 &scaleVec, const Matrix4 & mat );
+
+// Multiply two 4x4 matrices per element
+// 
+VECTORMATH_FORCE_INLINE const Matrix4 mulPerElem( const Matrix4 & mat0, const Matrix4 & mat1 );
+
+// Compute the absolute value of a 4x4 matrix per element
+// 
+VECTORMATH_FORCE_INLINE const Matrix4 absPerElem( const Matrix4 & mat );
+
+// Transpose of a 4x4 matrix
+// 
+VECTORMATH_FORCE_INLINE const Matrix4 transpose( const Matrix4 & mat );
+
+// Compute the inverse of a 4x4 matrix
+// NOTE: 
+// Result is unpredictable when the determinant of mat is equal to or near 0.
+// 
+VECTORMATH_FORCE_INLINE const Matrix4 inverse( const Matrix4 & mat );
+
+// Compute the inverse of a 4x4 matrix, which is expected to be an affine matrix
+// NOTE: 
+// This can be used to achieve better performance than a general inverse when the specified 4x4 matrix meets the given restrictions.  The result is unpredictable when the determinant of mat is equal to or near 0.
+// 
+VECTORMATH_FORCE_INLINE const Matrix4 affineInverse( const Matrix4 & mat );
+
+// Compute the inverse of a 4x4 matrix, which is expected to be an affine matrix with an orthogonal upper-left 3x3 submatrix
+// NOTE: 
+// This can be used to achieve better performance than a general inverse when the specified 4x4 matrix meets the given restrictions.
+// 
+VECTORMATH_FORCE_INLINE const Matrix4 orthoInverse( const Matrix4 & mat );
+
+// Determinant of a 4x4 matrix
+// 
+VECTORMATH_FORCE_INLINE const floatInVec determinant( const Matrix4 & mat );
+
+// Conditionally select between two 4x4 matrices
+// NOTE: 
+// This function uses a conditional select instruction to avoid a branch.
+// However, the transfer of select1 to a VMX register may use more processing time than a branch.
+// Use the boolInVec version for better performance.
+// 
+VECTORMATH_FORCE_INLINE const Matrix4 select( const Matrix4 & mat0, const Matrix4 & mat1, bool select1 );
+
+// Conditionally select between two 4x4 matrices (scalar data contained in vector data type)
+// NOTE: 
+// This function uses a conditional select instruction to avoid a branch.
+// 
+VECTORMATH_FORCE_INLINE const Matrix4 select( const Matrix4 & mat0, const Matrix4 & mat1, const boolInVec &select1 );
+
+#ifdef _VECTORMATH_DEBUG
+
+// Print a 4x4 matrix
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+VECTORMATH_FORCE_INLINE void print( const Matrix4 & mat );
+
+// Print a 4x4 matrix and an associated string identifier
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+VECTORMATH_FORCE_INLINE void print( const Matrix4 & mat, const char * name );
+
+#endif
+
+// A 3x4 transformation matrix in array-of-structures format
+//
+class Transform3
+{
+    Vector3 mCol0;
+    Vector3 mCol1;
+    Vector3 mCol2;
+    Vector3 mCol3;
+
+public:
+    // Default constructor; does no initialization
+    // 
+    VECTORMATH_FORCE_INLINE Transform3( ) { };
+
+    // Copy a 3x4 transformation matrix
+    // 
+    VECTORMATH_FORCE_INLINE Transform3( const Transform3 & tfrm );
+
+    // Construct a 3x4 transformation matrix containing the specified columns
+    // 
+    VECTORMATH_FORCE_INLINE Transform3( const Vector3 &col0, const Vector3 &col1, const Vector3 &col2, const Vector3 &col3 );
+
+    // Construct a 3x4 transformation matrix from a 3x3 matrix and a 3-D vector
+    // 
+    VECTORMATH_FORCE_INLINE Transform3( const Matrix3 & tfrm, const Vector3 &translateVec );
+
+    // Construct a 3x4 transformation matrix from a unit-length quaternion and a 3-D vector
+    // 
+    VECTORMATH_FORCE_INLINE Transform3( const Quat &unitQuat, const Vector3 &translateVec );
+
+    // Set all elements of a 3x4 transformation matrix to the same scalar value
+    // 
+    explicit VECTORMATH_FORCE_INLINE Transform3( float scalar );
+
+    // Set all elements of a 3x4 transformation matrix to the same scalar value (scalar data contained in vector data type)
+    // 
+    explicit VECTORMATH_FORCE_INLINE Transform3( const floatInVec &scalar );
+
+    // Assign one 3x4 transformation matrix to another
+    // 
+    VECTORMATH_FORCE_INLINE Transform3 & operator =( const Transform3 & tfrm );
+
+    // Set the upper-left 3x3 submatrix
+    // 
+    VECTORMATH_FORCE_INLINE Transform3 & setUpper3x3( const Matrix3 & mat3 );
+
+    // Get the upper-left 3x3 submatrix of a 3x4 transformation matrix
+    // 
+    VECTORMATH_FORCE_INLINE const Matrix3 getUpper3x3( ) const;
+
+    // Set translation component
+    // 
+    VECTORMATH_FORCE_INLINE Transform3 & setTranslation( const Vector3 &translateVec );
+
+    // Get the translation component of a 3x4 transformation matrix
+    // 
+    VECTORMATH_FORCE_INLINE const Vector3 getTranslation( ) const;
+
+    // Set column 0 of a 3x4 transformation matrix
+    // 
+    VECTORMATH_FORCE_INLINE Transform3 & setCol0( const Vector3 &col0 );
+
+    // Set column 1 of a 3x4 transformation matrix
+    // 
+    VECTORMATH_FORCE_INLINE Transform3 & setCol1( const Vector3 &col1 );
+
+    // Set column 2 of a 3x4 transformation matrix
+    // 
+    VECTORMATH_FORCE_INLINE Transform3 & setCol2( const Vector3 &col2 );
+
+    // Set column 3 of a 3x4 transformation matrix
+    // 
+    VECTORMATH_FORCE_INLINE Transform3 & setCol3( const Vector3 &col3 );
+
+    // Get column 0 of a 3x4 transformation matrix
+    // 
+    VECTORMATH_FORCE_INLINE const Vector3 getCol0( ) const;
+
+    // Get column 1 of a 3x4 transformation matrix
+    // 
+    VECTORMATH_FORCE_INLINE const Vector3 getCol1( ) const;
+
+    // Get column 2 of a 3x4 transformation matrix
+    // 
+    VECTORMATH_FORCE_INLINE const Vector3 getCol2( ) const;
+
+    // Get column 3 of a 3x4 transformation matrix
+    // 
+    VECTORMATH_FORCE_INLINE const Vector3 getCol3( ) const;
+
+    // Set the column of a 3x4 transformation matrix referred to by the specified index
+    // 
+    VECTORMATH_FORCE_INLINE Transform3 & setCol( int col, const Vector3 &vec );
+
+    // Set the row of a 3x4 transformation matrix referred to by the specified index
+    // 
+    VECTORMATH_FORCE_INLINE Transform3 & setRow( int row, const Vector4 &vec );
+
+    // Get the column of a 3x4 transformation matrix referred to by the specified index
+    // 
+    VECTORMATH_FORCE_INLINE const Vector3 getCol( int col ) const;
+
+    // Get the row of a 3x4 transformation matrix referred to by the specified index
+    // 
+    VECTORMATH_FORCE_INLINE const Vector4 getRow( int row ) const;
+
+    // Subscripting operator to set or get a column
+    // 
+    VECTORMATH_FORCE_INLINE Vector3 & operator []( int col );
+
+    // Subscripting operator to get a column
+    // 
+    VECTORMATH_FORCE_INLINE const Vector3 operator []( int col ) const;
+
+    // Set the element of a 3x4 transformation matrix referred to by column and row indices
+    // 
+    VECTORMATH_FORCE_INLINE Transform3 & setElem( int col, int row, float val );
+
+    // Set the element of a 3x4 transformation matrix referred to by column and row indices (scalar data contained in vector data type)
+    // 
+    VECTORMATH_FORCE_INLINE Transform3 & setElem( int col, int row, const floatInVec &val );
+
+    // Get the element of a 3x4 transformation matrix referred to by column and row indices
+    // 
+    VECTORMATH_FORCE_INLINE const floatInVec getElem( int col, int row ) const;
+
+    // Multiply a 3x4 transformation matrix by a 3-D vector
+    // 
+    VECTORMATH_FORCE_INLINE const Vector3 operator *( const Vector3 &vec ) const;
+
+    // Multiply a 3x4 transformation matrix by a 3-D point
+    // 
+    VECTORMATH_FORCE_INLINE const Point3 operator *( const Point3 &pnt ) const;
+
+    // Multiply two 3x4 transformation matrices
+    // 
+    VECTORMATH_FORCE_INLINE const Transform3 operator *( const Transform3 & tfrm ) const;
+
+    // Perform compound assignment and multiplication by a 3x4 transformation matrix
+    // 
+    VECTORMATH_FORCE_INLINE Transform3 & operator *=( const Transform3 & tfrm );
+
+    // Construct an identity 3x4 transformation matrix
+    // 
+    static VECTORMATH_FORCE_INLINE const Transform3 identity( );
+
+    // Construct a 3x4 transformation matrix to rotate around the x axis
+    // 
+    static VECTORMATH_FORCE_INLINE const Transform3 rotationX( float radians );
+
+    // Construct a 3x4 transformation matrix to rotate around the y axis
+    // 
+    static VECTORMATH_FORCE_INLINE const Transform3 rotationY( float radians );
+
+    // Construct a 3x4 transformation matrix to rotate around the z axis
+    // 
+    static VECTORMATH_FORCE_INLINE const Transform3 rotationZ( float radians );
+
+    // Construct a 3x4 transformation matrix to rotate around the x axis (scalar data contained in vector data type)
+    // 
+    static VECTORMATH_FORCE_INLINE const Transform3 rotationX( const floatInVec &radians );
+
+    // Construct a 3x4 transformation matrix to rotate around the y axis (scalar data contained in vector data type)
+    // 
+    static VECTORMATH_FORCE_INLINE const Transform3 rotationY( const floatInVec &radians );
+
+    // Construct a 3x4 transformation matrix to rotate around the z axis (scalar data contained in vector data type)
+    // 
+    static VECTORMATH_FORCE_INLINE const Transform3 rotationZ( const floatInVec &radians );
+
+    // Construct a 3x4 transformation matrix to rotate around the x, y, and z axes
+    // 
+    static VECTORMATH_FORCE_INLINE const Transform3 rotationZYX( const Vector3 &radiansXYZ );
+
+    // Construct a 3x4 transformation matrix to rotate around a unit-length 3-D vector
+    // 
+    static VECTORMATH_FORCE_INLINE const Transform3 rotation( float radians, const Vector3 &unitVec );
+
+    // Construct a 3x4 transformation matrix to rotate around a unit-length 3-D vector (scalar data contained in vector data type)
+    // 
+    static VECTORMATH_FORCE_INLINE const Transform3 rotation( const floatInVec &radians, const Vector3 &unitVec );
+
+    // Construct a rotation matrix from a unit-length quaternion
+    // 
+    static VECTORMATH_FORCE_INLINE const Transform3 rotation( const Quat &unitQuat );
+
+    // Construct a 3x4 transformation matrix to perform scaling
+    // 
+    static VECTORMATH_FORCE_INLINE const Transform3 scale( const Vector3 &scaleVec );
+
+    // Construct a 3x4 transformation matrix to perform translation
+    // 
+    static VECTORMATH_FORCE_INLINE const Transform3 translation( const Vector3 &translateVec );
+
+};
+// Append (post-multiply) a scale transformation to a 3x4 transformation matrix
+// NOTE: 
+// Faster than creating and multiplying a scale transformation matrix.
+// 
+VECTORMATH_FORCE_INLINE const Transform3 appendScale( const Transform3 & tfrm, const Vector3 &scaleVec );
+
+// Prepend (pre-multiply) a scale transformation to a 3x4 transformation matrix
+// NOTE: 
+// Faster than creating and multiplying a scale transformation matrix.
+// 
+VECTORMATH_FORCE_INLINE const Transform3 prependScale( const Vector3 &scaleVec, const Transform3 & tfrm );
+
+// Multiply two 3x4 transformation matrices per element
+// 
+VECTORMATH_FORCE_INLINE const Transform3 mulPerElem( const Transform3 & tfrm0, const Transform3 & tfrm1 );
+
+// Compute the absolute value of a 3x4 transformation matrix per element
+// 
+VECTORMATH_FORCE_INLINE const Transform3 absPerElem( const Transform3 & tfrm );
+
+// Inverse of a 3x4 transformation matrix
+// NOTE: 
+// Result is unpredictable when the determinant of the left 3x3 submatrix is equal to or near 0.
+// 
+VECTORMATH_FORCE_INLINE const Transform3 inverse( const Transform3 & tfrm );
+
+// Compute the inverse of a 3x4 transformation matrix, expected to have an orthogonal upper-left 3x3 submatrix
+// NOTE: 
+// This can be used to achieve better performance than a general inverse when the specified 3x4 transformation matrix meets the given restrictions.
+// 
+VECTORMATH_FORCE_INLINE const Transform3 orthoInverse( const Transform3 & tfrm );
+
+// Conditionally select between two 3x4 transformation matrices
+// NOTE: 
+// This function uses a conditional select instruction to avoid a branch.
+// However, the transfer of select1 to a VMX register may use more processing time than a branch.
+// Use the boolInVec version for better performance.
+// 
+VECTORMATH_FORCE_INLINE const Transform3 select( const Transform3 & tfrm0, const Transform3 & tfrm1, bool select1 );
+
+// Conditionally select between two 3x4 transformation matrices (scalar data contained in vector data type)
+// NOTE: 
+// This function uses a conditional select instruction to avoid a branch.
+// 
+VECTORMATH_FORCE_INLINE const Transform3 select( const Transform3 & tfrm0, const Transform3 & tfrm1, const boolInVec &select1 );
+
+#ifdef _VECTORMATH_DEBUG
+
+// Print a 3x4 transformation matrix
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+VECTORMATH_FORCE_INLINE void print( const Transform3 & tfrm );
+
+// Print a 3x4 transformation matrix and an associated string identifier
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+VECTORMATH_FORCE_INLINE void print( const Transform3 & tfrm, const char * name );
+
+#endif
+
+} // namespace Aos
+} // namespace Vectormath
+
+#include "vec_aos.h"
+#include "quat_aos.h"
+#include "mat_aos.h"
+
+#endif
diff --git a/test/Bullet2/vectormath/vmInclude.h b/test/Bullet2/vectormath/vmInclude.h
new file mode 100644
index 000000000..656514e42
--- /dev/null
+++ b/test/Bullet2/vectormath/vmInclude.h
@@ -0,0 +1,31 @@
+
+#ifndef __VM_INCLUDE_H
+#define __VM_INCLUDE_H
+
+#include "LinearMath/btScalar.h"
+
+#if defined (USE_SYSTEM_VECTORMATH) || defined (__CELLOS_LV2__)
+	#include <vectormath_aos.h>
+#else //(USE_SYSTEM_VECTORMATH)
+	#if defined (BT_USE_SSE) 
+		#include "sse/vectormath_aos.h"
+	#else //all other platforms
+        #if defined (BT_USE_NEON)
+            #include "neon/vectormath_aos.h"
+        #else
+            #include "scalar/vectormath_aos.h"
+        #endif
+	#endif //(BT_USE_SSE) && defined (_WIN32)
+#endif //(USE_SYSTEM_VECTORMATH)
+
+
+
+typedef Vectormath::Aos::Vector3    vmVector3;
+typedef Vectormath::Aos::Quat       vmQuat;
+typedef Vectormath::Aos::Matrix3    vmMatrix3;
+typedef Vectormath::Aos::Transform3 vmTransform3;
+typedef Vectormath::Aos::Point3     vmPoint3;
+
+#endif //__VM_INCLUDE_H
+
+
diff --git a/test/collision/premake4.lua b/test/collision/premake4.lua
index f0a56f600..41269a6ab 100644
--- a/test/collision/premake4.lua
+++ b/test/collision/premake4.lua
@@ -1,5 +1,5 @@
 
-	project "test_bullet_collision"
+	project "Test_BulletCollision"
 		
 	kind "ConsoleApp"