diff --git a/Extras/vectormathlibrary/include/vectormath/SSE/cpp/boolInVec.h b/Extras/vectormathlibrary/include/vectormath/SSE/cpp/boolInVec.h
index d21d25cbb..a11f72a54 100644
--- a/Extras/vectormathlibrary/include/vectormath/SSE/cpp/boolInVec.h
+++ b/Extras/vectormathlibrary/include/vectormath/SSE/cpp/boolInVec.h
@@ -1,5 +1,5 @@
 /*
-   Copyright (C) 2006, 2007 Sony Computer Entertainment Inc.
+   Copyright (C) 2006, 2010 Sony Computer Entertainment Inc.
    All rights reserved.
 
    Redistribution and use in source and binary forms,
diff --git a/Extras/vectormathlibrary/include/vectormath/SSE/cpp/floatInVec.h b/Extras/vectormathlibrary/include/vectormath/SSE/cpp/floatInVec.h
index e8ac5959e..4c9682410 100644
--- a/Extras/vectormathlibrary/include/vectormath/SSE/cpp/floatInVec.h
+++ b/Extras/vectormathlibrary/include/vectormath/SSE/cpp/floatInVec.h
@@ -1,5 +1,5 @@
 /*
-   Copyright (C) 2006, 2007 Sony Computer Entertainment Inc.
+   Copyright (C) 2006, 2010 Sony Computer Entertainment Inc.
    All rights reserved.
 
    Redistribution and use in source and binary forms,
diff --git a/Extras/vectormathlibrary/include/vectormath/SSE/cpp/mat_aos.h b/Extras/vectormathlibrary/include/vectormath/SSE/cpp/mat_aos.h
index 120eac502..5b2b71410 100644
--- a/Extras/vectormathlibrary/include/vectormath/SSE/cpp/mat_aos.h
+++ b/Extras/vectormathlibrary/include/vectormath/SSE/cpp/mat_aos.h
@@ -1,5 +1,5 @@
 /*
-   Copyright (C) 2006, 2007 Sony Computer Entertainment Inc.
+   Copyright (C) 2006, 2010 Sony Computer Entertainment Inc.
    All rights reserved.
 
    Redistribution and use in source and binary forms,
@@ -62,28 +62,28 @@ namespace Aos {
 //-----------------------------------------------------------------------------
 // Definitions
 
-inline Matrix3::Matrix3( const Matrix3 & mat )
+__forceinline Matrix3::Matrix3( const Matrix3 & mat )
 {
     mCol0 = mat.mCol0;
     mCol1 = mat.mCol1;
     mCol2 = mat.mCol2;
 }
 
-inline Matrix3::Matrix3( float scalar )
+__forceinline Matrix3::Matrix3( float scalar )
 {
     mCol0 = Vector3( scalar );
     mCol1 = Vector3( scalar );
     mCol2 = Vector3( scalar );
 }
 
-inline Matrix3::Matrix3( const floatInVec &scalar )
+__forceinline Matrix3::Matrix3( const floatInVec &scalar )
 {
     mCol0 = Vector3( scalar );
     mCol1 = Vector3( scalar );
     mCol2 = Vector3( scalar );
 }
 
-inline Matrix3::Matrix3( const Quat &unitQuat )
+__forceinline Matrix3::Matrix3( const Quat &unitQuat )
 {
     __m128 xyzw_2, wwww, yzxw, zxyw, yzxw_2, zxyw_2;
     __m128 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
@@ -114,38 +114,38 @@ inline Matrix3::Matrix3( const Quat &unitQuat )
     mCol2 = Vector3( vec_sel( tmp5, tmp1, select_z ) );
 }
 
-inline Matrix3::Matrix3( const Vector3 &_col0, const Vector3 &_col1, const Vector3 &_col2 )
+__forceinline Matrix3::Matrix3( const Vector3 &_col0, const Vector3 &_col1, const Vector3 &_col2 )
 {
     mCol0 = _col0;
     mCol1 = _col1;
     mCol2 = _col2;
 }
 
-inline Matrix3 & Matrix3::setCol0( const Vector3 &_col0 )
+__forceinline Matrix3 & Matrix3::setCol0( const Vector3 &_col0 )
 {
     mCol0 = _col0;
     return *this;
 }
 
-inline Matrix3 & Matrix3::setCol1( const Vector3 &_col1 )
+__forceinline Matrix3 & Matrix3::setCol1( const Vector3 &_col1 )
 {
     mCol1 = _col1;
     return *this;
 }
 
-inline Matrix3 & Matrix3::setCol2( const Vector3 &_col2 )
+__forceinline Matrix3 & Matrix3::setCol2( const Vector3 &_col2 )
 {
     mCol2 = _col2;
     return *this;
 }
 
-inline Matrix3 & Matrix3::setCol( int col, const Vector3 &vec )
+__forceinline Matrix3 & Matrix3::setCol( int col, const Vector3 &vec )
 {
     *(&mCol0 + col) = vec;
     return *this;
 }
 
-inline Matrix3 & Matrix3::setRow( int row, const Vector3 &vec )
+__forceinline Matrix3 & Matrix3::setRow( int row, const Vector3 &vec )
 {
     mCol0.setElem( row, vec.getElem( 0 ) );
     mCol1.setElem( row, vec.getElem( 1 ) );
@@ -153,13 +153,13 @@ inline Matrix3 & Matrix3::setRow( int row, const Vector3 &vec )
     return *this;
 }
 
-inline Matrix3 & Matrix3::setElem( int col, int row, float val )
+__forceinline Matrix3 & Matrix3::setElem( int col, int row, float val )
 {
     (*this)[col].setElem(row, val);
     return *this;
 }
 
-inline Matrix3 & Matrix3::setElem( int col, int row, const floatInVec &val )
+__forceinline Matrix3 & Matrix3::setElem( int col, int row, const floatInVec &val )
 {
     Vector3 tmpV3_0;
     tmpV3_0 = this->getCol( col );
@@ -168,47 +168,47 @@ inline Matrix3 & Matrix3::setElem( int col, int row, const floatInVec &val )
     return *this;
 }
 
-inline const floatInVec Matrix3::getElem( int col, int row ) const
+__forceinline const floatInVec Matrix3::getElem( int col, int row ) const
 {
     return this->getCol( col ).getElem( row );
 }
 
-inline const Vector3 Matrix3::getCol0( ) const
+__forceinline const Vector3 Matrix3::getCol0( ) const
 {
     return mCol0;
 }
 
-inline const Vector3 Matrix3::getCol1( ) const
+__forceinline const Vector3 Matrix3::getCol1( ) const
 {
     return mCol1;
 }
 
-inline const Vector3 Matrix3::getCol2( ) const
+__forceinline const Vector3 Matrix3::getCol2( ) const
 {
     return mCol2;
 }
 
-inline const Vector3 Matrix3::getCol( int col ) const
+__forceinline const Vector3 Matrix3::getCol( int col ) const
 {
     return *(&mCol0 + col);
 }
 
-inline const Vector3 Matrix3::getRow( int row ) const
+__forceinline const Vector3 Matrix3::getRow( int row ) const
 {
     return Vector3( mCol0.getElem( row ), mCol1.getElem( row ), mCol2.getElem( row ) );
 }
 
-inline Vector3 & Matrix3::operator []( int col )
+__forceinline Vector3 & Matrix3::operator []( int col )
 {
     return *(&mCol0 + col);
 }
 
-inline const Vector3 Matrix3::operator []( int col ) const
+__forceinline const Vector3 Matrix3::operator []( int col ) const
 {
     return *(&mCol0 + col);
 }
 
-inline Matrix3 & Matrix3::operator =( const Matrix3 & mat )
+__forceinline Matrix3 & Matrix3::operator =( const Matrix3 & mat )
 {
     mCol0 = mat.mCol0;
     mCol1 = mat.mCol1;
@@ -216,7 +216,7 @@ inline Matrix3 & Matrix3::operator =( const Matrix3 & mat )
     return *this;
 }
 
-inline const Matrix3 transpose( const Matrix3 & mat )
+__forceinline const Matrix3 transpose( const Matrix3 & mat )
 {
     __m128 tmp0, tmp1, res0, res1, res2;
     tmp0 = vec_mergeh( mat.getCol0().get128(), mat.getCol2().get128() );
@@ -236,7 +236,7 @@ inline const Matrix3 transpose( const Matrix3 & mat )
     );
 }
 
-inline const Matrix3 inverse( const Matrix3 & mat )
+__forceinline const Matrix3 inverse( const Matrix3 & mat )
 {
     __m128 tmp0, tmp1, tmp2, tmp3, tmp4, dot, invdet, inv0, inv1, inv2;
     tmp2 = _vmathVfCross( mat.getCol0().get128(), mat.getCol1().get128() );
@@ -265,12 +265,12 @@ inline const Matrix3 inverse( const Matrix3 & mat )
     );
 }
 
-inline const floatInVec determinant( const Matrix3 & mat )
+__forceinline const floatInVec determinant( const Matrix3 & mat )
 {
     return dot( mat.getCol2(), cross( mat.getCol0(), mat.getCol1() ) );
 }
 
-inline const Matrix3 Matrix3::operator +( const Matrix3 & mat ) const
+__forceinline const Matrix3 Matrix3::operator +( const Matrix3 & mat ) const
 {
     return Matrix3(
         ( mCol0 + mat.mCol0 ),
@@ -279,7 +279,7 @@ inline const Matrix3 Matrix3::operator +( const Matrix3 & mat ) const
     );
 }
 
-inline const Matrix3 Matrix3::operator -( const Matrix3 & mat ) const
+__forceinline const Matrix3 Matrix3::operator -( const Matrix3 & mat ) const
 {
     return Matrix3(
         ( mCol0 - mat.mCol0 ),
@@ -288,19 +288,19 @@ inline const Matrix3 Matrix3::operator -( const Matrix3 & mat ) const
     );
 }
 
-inline Matrix3 & Matrix3::operator +=( const Matrix3 & mat )
+__forceinline Matrix3 & Matrix3::operator +=( const Matrix3 & mat )
 {
     *this = *this + mat;
     return *this;
 }
 
-inline Matrix3 & Matrix3::operator -=( const Matrix3 & mat )
+__forceinline Matrix3 & Matrix3::operator -=( const Matrix3 & mat )
 {
     *this = *this - mat;
     return *this;
 }
 
-inline const Matrix3 Matrix3::operator -( ) const
+__forceinline const Matrix3 Matrix3::operator -( ) const
 {
     return Matrix3(
         ( -mCol0 ),
@@ -309,7 +309,7 @@ inline const Matrix3 Matrix3::operator -( ) const
     );
 }
 
-inline const Matrix3 absPerElem( const Matrix3 & mat )
+__forceinline const Matrix3 absPerElem( const Matrix3 & mat )
 {
     return Matrix3(
         absPerElem( mat.getCol0() ),
@@ -318,12 +318,12 @@ inline const Matrix3 absPerElem( const Matrix3 & mat )
     );
 }
 
-inline const Matrix3 Matrix3::operator *( float scalar ) const
+__forceinline const Matrix3 Matrix3::operator *( float scalar ) const
 {
     return *this * floatInVec(scalar);
 }
 
-inline const Matrix3 Matrix3::operator *( const floatInVec &scalar ) const
+__forceinline const Matrix3 Matrix3::operator *( const floatInVec &scalar ) const
 {
     return Matrix3(
         ( mCol0 * scalar ),
@@ -332,28 +332,28 @@ inline const Matrix3 Matrix3::operator *( const floatInVec &scalar ) const
     );
 }
 
-inline Matrix3 & Matrix3::operator *=( float scalar )
+__forceinline Matrix3 & Matrix3::operator *=( float scalar )
 {
     return *this *= floatInVec(scalar);
 }
 
-inline Matrix3 & Matrix3::operator *=( const floatInVec &scalar )
+__forceinline Matrix3 & Matrix3::operator *=( const floatInVec &scalar )
 {
     *this = *this * scalar;
     return *this;
 }
 
-inline const Matrix3 operator *( float scalar, const Matrix3 & mat )
+__forceinline const Matrix3 operator *( float scalar, const Matrix3 & mat )
 {
     return floatInVec(scalar) * mat;
 }
 
-inline const Matrix3 operator *( const floatInVec &scalar, const Matrix3 & mat )
+__forceinline const Matrix3 operator *( const floatInVec &scalar, const Matrix3 & mat )
 {
     return mat * scalar;
 }
 
-inline const Vector3 Matrix3::operator *( const Vector3 &vec ) const
+__forceinline const Vector3 Matrix3::operator *( const Vector3 &vec ) const
 {
     __m128 res;
     __m128 xxxx, yyyy, zzzz;
@@ -366,7 +366,7 @@ inline const Vector3 Matrix3::operator *( const Vector3 &vec ) const
     return Vector3( res );
 }
 
-inline const Matrix3 Matrix3::operator *( const Matrix3 & mat ) const
+__forceinline const Matrix3 Matrix3::operator *( const Matrix3 & mat ) const
 {
     return Matrix3(
         ( *this * mat.mCol0 ),
@@ -375,13 +375,13 @@ inline const Matrix3 Matrix3::operator *( const Matrix3 & mat ) const
     );
 }
 
-inline Matrix3 & Matrix3::operator *=( const Matrix3 & mat )
+__forceinline Matrix3 & Matrix3::operator *=( const Matrix3 & mat )
 {
     *this = *this * mat;
     return *this;
 }
 
-inline const Matrix3 mulPerElem( const Matrix3 & mat0, const Matrix3 & mat1 )
+__forceinline const Matrix3 mulPerElem( const Matrix3 & mat0, const Matrix3 & mat1 )
 {
     return Matrix3(
         mulPerElem( mat0.getCol0(), mat1.getCol0() ),
@@ -390,7 +390,7 @@ inline const Matrix3 mulPerElem( const Matrix3 & mat0, const Matrix3 & mat1 )
     );
 }
 
-inline const Matrix3 Matrix3::identity( )
+__forceinline const Matrix3 Matrix3::identity( )
 {
     return Matrix3(
         Vector3::xAxis( ),
@@ -399,12 +399,12 @@ inline const Matrix3 Matrix3::identity( )
     );
 }
 
-inline const Matrix3 Matrix3::rotationX( float radians )
+__forceinline const Matrix3 Matrix3::rotationX( float radians )
 {
     return rotationX( floatInVec(radians) );
 }
 
-inline const Matrix3 Matrix3::rotationX( const floatInVec &radians )
+__forceinline const Matrix3 Matrix3::rotationX( const floatInVec &radians )
 {
     __m128 s, c, res1, res2;
     __m128 zero;
@@ -423,12 +423,12 @@ inline const Matrix3 Matrix3::rotationX( const floatInVec &radians )
     );
 }
 
-inline const Matrix3 Matrix3::rotationY( float radians )
+__forceinline const Matrix3 Matrix3::rotationY( float radians )
 {
     return rotationY( floatInVec(radians) );
 }
 
-inline const Matrix3 Matrix3::rotationY( const floatInVec &radians )
+__forceinline const Matrix3 Matrix3::rotationY( const floatInVec &radians )
 {
     __m128 s, c, res0, res2;
     __m128 zero;
@@ -447,12 +447,12 @@ inline const Matrix3 Matrix3::rotationY( const floatInVec &radians )
 	);
 }
 
-inline const Matrix3 Matrix3::rotationZ( float radians )
+__forceinline const Matrix3 Matrix3::rotationZ( float radians )
 {
     return rotationZ( floatInVec(radians) );
 }
 
-inline const Matrix3 Matrix3::rotationZ( const floatInVec &radians )
+__forceinline const Matrix3 Matrix3::rotationZ( const floatInVec &radians )
 {
     __m128 s, c, res0, res1;
     __m128 zero;
@@ -471,7 +471,7 @@ inline const Matrix3 Matrix3::rotationZ( const floatInVec &radians )
 	);
 }
 
-inline const Matrix3 Matrix3::rotationZYX( const Vector3 &radiansXYZ )
+__forceinline const Matrix3 Matrix3::rotationZYX( const Vector3 &radiansXYZ )
 {
     __m128 angles, s, negS, c, X0, X1, Y0, Y1, Z0, Z1, tmp;
     angles = Vector4( radiansXYZ, 0.0f ).get128();
@@ -493,12 +493,12 @@ inline const Matrix3 Matrix3::rotationZYX( const Vector3 &radiansXYZ )
     );
 }
 
-inline const Matrix3 Matrix3::rotation( float radians, const Vector3 &unitVec )
+__forceinline const Matrix3 Matrix3::rotation( float radians, const Vector3 &unitVec )
 {
     return rotation( floatInVec(radians), unitVec );
 }
 
-inline const Matrix3 Matrix3::rotation( const floatInVec &radians, const Vector3 &unitVec )
+__forceinline const Matrix3 Matrix3::rotation( const floatInVec &radians, const Vector3 &unitVec )
 {
     __m128 axis, s, c, oneMinusC, axisS, negAxisS, xxxx, yyyy, zzzz, tmp0, tmp1, tmp2;
     axis = unitVec.get128();
@@ -530,12 +530,12 @@ inline const Matrix3 Matrix3::rotation( const floatInVec &radians, const Vector3
     );
 }
 
-inline const Matrix3 Matrix3::rotation( const Quat &unitQuat )
+__forceinline const Matrix3 Matrix3::rotation( const Quat &unitQuat )
 {
     return Matrix3( unitQuat );
 }
 
-inline const Matrix3 Matrix3::scale( const Vector3 &scaleVec )
+__forceinline const Matrix3 Matrix3::scale( const Vector3 &scaleVec )
 {
     __m128 zero = _mm_setzero_ps();
 	__declspec(align(16)) unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
@@ -548,7 +548,7 @@ inline const Matrix3 Matrix3::scale( const Vector3 &scaleVec )
     );
 }
 
-inline const Matrix3 appendScale( const Matrix3 & mat, const Vector3 &scaleVec )
+__forceinline const Matrix3 appendScale( const Matrix3 & mat, const Vector3 &scaleVec )
 {
     return Matrix3(
         ( mat.getCol0() * scaleVec.getX( ) ),
@@ -557,7 +557,7 @@ inline const Matrix3 appendScale( const Matrix3 & mat, const Vector3 &scaleVec )
     );
 }
 
-inline const Matrix3 prependScale( const Vector3 &scaleVec, const Matrix3 & mat )
+__forceinline const Matrix3 prependScale( const Vector3 &scaleVec, const Matrix3 & mat )
 {
     return Matrix3(
         mulPerElem( mat.getCol0(), scaleVec ),
@@ -566,7 +566,7 @@ inline const Matrix3 prependScale( const Vector3 &scaleVec, const Matrix3 & mat
     );
 }
 
-inline const Matrix3 select( const Matrix3 & mat0, const Matrix3 & mat1, bool select1 )
+__forceinline const Matrix3 select( const Matrix3 & mat0, const Matrix3 & mat1, bool select1 )
 {
     return Matrix3(
         select( mat0.getCol0(), mat1.getCol0(), select1 ),
@@ -575,7 +575,7 @@ inline const Matrix3 select( const Matrix3 & mat0, const Matrix3 & mat1, bool se
     );
 }
 
-inline const Matrix3 select( const Matrix3 & mat0, const Matrix3 & mat1, const boolInVec &select1 )
+__forceinline const Matrix3 select( const Matrix3 & mat0, const Matrix3 & mat1, const boolInVec &select1 )
 {
     return Matrix3(
         select( mat0.getCol0(), mat1.getCol0(), select1 ),
@@ -586,14 +586,14 @@ inline const Matrix3 select( const Matrix3 & mat0, const Matrix3 & mat1, const b
 
 #ifdef _VECTORMATH_DEBUG
 
-inline void print( const Matrix3 & mat )
+__forceinline void print( const Matrix3 & mat )
 {
     print( mat.getRow( 0 ) );
     print( mat.getRow( 1 ) );
     print( mat.getRow( 2 ) );
 }
 
-inline void print( const Matrix3 & mat, const char * name )
+__forceinline void print( const Matrix3 & mat, const char * name )
 {
     printf("%s:\n", name);
     print( mat );
@@ -601,7 +601,7 @@ inline void print( const Matrix3 & mat, const char * name )
 
 #endif
 
-inline Matrix4::Matrix4( const Matrix4 & mat )
+__forceinline Matrix4::Matrix4( const Matrix4 & mat )
 {
     mCol0 = mat.mCol0;
     mCol1 = mat.mCol1;
@@ -609,7 +609,7 @@ inline Matrix4::Matrix4( const Matrix4 & mat )
     mCol3 = mat.mCol3;
 }
 
-inline Matrix4::Matrix4( float scalar )
+__forceinline Matrix4::Matrix4( float scalar )
 {
     mCol0 = Vector4( scalar );
     mCol1 = Vector4( scalar );
@@ -617,7 +617,7 @@ inline Matrix4::Matrix4( float scalar )
     mCol3 = Vector4( scalar );
 }
 
-inline Matrix4::Matrix4( const floatInVec &scalar )
+__forceinline Matrix4::Matrix4( const floatInVec &scalar )
 {
     mCol0 = Vector4( scalar );
     mCol1 = Vector4( scalar );
@@ -625,7 +625,7 @@ inline Matrix4::Matrix4( const floatInVec &scalar )
     mCol3 = Vector4( scalar );
 }
 
-inline Matrix4::Matrix4( const Transform3 & mat )
+__forceinline Matrix4::Matrix4( const Transform3 & mat )
 {
     mCol0 = Vector4( mat.getCol0(), 0.0f );
     mCol1 = Vector4( mat.getCol1(), 0.0f );
@@ -633,7 +633,7 @@ inline Matrix4::Matrix4( const Transform3 & mat )
     mCol3 = Vector4( mat.getCol3(), 1.0f );
 }
 
-inline Matrix4::Matrix4( const Vector4 &_col0, const Vector4 &_col1, const Vector4 &_col2, const Vector4 &_col3 )
+__forceinline Matrix4::Matrix4( const Vector4 &_col0, const Vector4 &_col1, const Vector4 &_col2, const Vector4 &_col3 )
 {
     mCol0 = _col0;
     mCol1 = _col1;
@@ -641,7 +641,7 @@ inline Matrix4::Matrix4( const Vector4 &_col0, const Vector4 &_col1, const Vecto
     mCol3 = _col3;
 }
 
-inline Matrix4::Matrix4( const Matrix3 & mat, const Vector3 &translateVec )
+__forceinline Matrix4::Matrix4( const Matrix3 & mat, const Vector3 &translateVec )
 {
     mCol0 = Vector4( mat.getCol0(), 0.0f );
     mCol1 = Vector4( mat.getCol1(), 0.0f );
@@ -649,7 +649,7 @@ inline Matrix4::Matrix4( const Matrix3 & mat, const Vector3 &translateVec )
     mCol3 = Vector4( translateVec, 1.0f );
 }
 
-inline Matrix4::Matrix4( const Quat &unitQuat, const Vector3 &translateVec )
+__forceinline Matrix4::Matrix4( const Quat &unitQuat, const Vector3 &translateVec )
 {
     Matrix3 mat;
     mat = Matrix3( unitQuat );
@@ -659,37 +659,37 @@ inline Matrix4::Matrix4( const Quat &unitQuat, const Vector3 &translateVec )
     mCol3 = Vector4( translateVec, 1.0f );
 }
 
-inline Matrix4 & Matrix4::setCol0( const Vector4 &_col0 )
+__forceinline Matrix4 & Matrix4::setCol0( const Vector4 &_col0 )
 {
     mCol0 = _col0;
     return *this;
 }
 
-inline Matrix4 & Matrix4::setCol1( const Vector4 &_col1 )
+__forceinline Matrix4 & Matrix4::setCol1( const Vector4 &_col1 )
 {
     mCol1 = _col1;
     return *this;
 }
 
-inline Matrix4 & Matrix4::setCol2( const Vector4 &_col2 )
+__forceinline Matrix4 & Matrix4::setCol2( const Vector4 &_col2 )
 {
     mCol2 = _col2;
     return *this;
 }
 
-inline Matrix4 & Matrix4::setCol3( const Vector4 &_col3 )
+__forceinline Matrix4 & Matrix4::setCol3( const Vector4 &_col3 )
 {
     mCol3 = _col3;
     return *this;
 }
 
-inline Matrix4 & Matrix4::setCol( int col, const Vector4 &vec )
+__forceinline Matrix4 & Matrix4::setCol( int col, const Vector4 &vec )
 {
     *(&mCol0 + col) = vec;
     return *this;
 }
 
-inline Matrix4 & Matrix4::setRow( int row, const Vector4 &vec )
+__forceinline Matrix4 & Matrix4::setRow( int row, const Vector4 &vec )
 {
     mCol0.setElem( row, vec.getElem( 0 ) );
     mCol1.setElem( row, vec.getElem( 1 ) );
@@ -698,13 +698,13 @@ inline Matrix4 & Matrix4::setRow( int row, const Vector4 &vec )
     return *this;
 }
 
-inline Matrix4 & Matrix4::setElem( int col, int row, float val )
+__forceinline Matrix4 & Matrix4::setElem( int col, int row, float val )
 {
     (*this)[col].setElem(row, val);
     return *this;
 }
 
-inline Matrix4 & Matrix4::setElem( int col, int row, const floatInVec &val )
+__forceinline Matrix4 & Matrix4::setElem( int col, int row, const floatInVec &val )
 {
     Vector4 tmpV3_0;
     tmpV3_0 = this->getCol( col );
@@ -713,52 +713,52 @@ inline Matrix4 & Matrix4::setElem( int col, int row, const floatInVec &val )
     return *this;
 }
 
-inline const floatInVec Matrix4::getElem( int col, int row ) const
+__forceinline const floatInVec Matrix4::getElem( int col, int row ) const
 {
     return this->getCol( col ).getElem( row );
 }
 
-inline const Vector4 Matrix4::getCol0( ) const
+__forceinline const Vector4 Matrix4::getCol0( ) const
 {
     return mCol0;
 }
 
-inline const Vector4 Matrix4::getCol1( ) const
+__forceinline const Vector4 Matrix4::getCol1( ) const
 {
     return mCol1;
 }
 
-inline const Vector4 Matrix4::getCol2( ) const
+__forceinline const Vector4 Matrix4::getCol2( ) const
 {
     return mCol2;
 }
 
-inline const Vector4 Matrix4::getCol3( ) const
+__forceinline const Vector4 Matrix4::getCol3( ) const
 {
     return mCol3;
 }
 
-inline const Vector4 Matrix4::getCol( int col ) const
+__forceinline const Vector4 Matrix4::getCol( int col ) const
 {
     return *(&mCol0 + col);
 }
 
-inline const Vector4 Matrix4::getRow( int row ) const
+__forceinline const Vector4 Matrix4::getRow( int row ) const
 {
     return Vector4( mCol0.getElem( row ), mCol1.getElem( row ), mCol2.getElem( row ), mCol3.getElem( row ) );
 }
 
-inline Vector4 & Matrix4::operator []( int col )
+__forceinline Vector4 & Matrix4::operator []( int col )
 {
     return *(&mCol0 + col);
 }
 
-inline const Vector4 Matrix4::operator []( int col ) const
+__forceinline const Vector4 Matrix4::operator []( int col ) const
 {
     return *(&mCol0 + col);
 }
 
-inline Matrix4 & Matrix4::operator =( const Matrix4 & mat )
+__forceinline Matrix4 & Matrix4::operator =( const Matrix4 & mat )
 {
     mCol0 = mat.mCol0;
     mCol1 = mat.mCol1;
@@ -767,7 +767,7 @@ inline Matrix4 & Matrix4::operator =( const Matrix4 & mat )
     return *this;
 }
 
-inline const Matrix4 transpose( const Matrix4 & mat )
+__forceinline const Matrix4 transpose( const Matrix4 & mat )
 {
     __m128 tmp0, tmp1, tmp2, tmp3, res0, res1, res2, res3;
     tmp0 = vec_mergeh( mat.getCol0().get128(), mat.getCol2().get128() );
@@ -791,7 +791,7 @@ static __declspec(align(16)) const unsigned int _vmathPNPN[4] = {0x00000000, 0x8
 static __declspec(align(16)) const unsigned int _vmathNPNP[4] = {0x80000000, 0x00000000, 0x80000000, 0x00000000};
 static __declspec(align(16)) const float _vmathZERONE[4] = {1.0f, 0.0f, 0.0f, 1.0f};
 
-inline const Matrix4 inverse( const Matrix4 & mat )
+__forceinline const Matrix4 inverse( const Matrix4 & mat )
 {
 	__m128 Va,Vb,Vc;
 	__m128 r1,r2,r3,tt,tt2;
@@ -886,7 +886,7 @@ inline const Matrix4 inverse( const Matrix4 & mat )
     );
 }
 
-inline const Matrix4 affineInverse( const Matrix4 & mat )
+__forceinline const Matrix4 affineInverse( const Matrix4 & mat )
 {
     Transform3 affineMat;
     affineMat.setCol0( mat.getCol0().getXYZ( ) );
@@ -896,7 +896,7 @@ inline const Matrix4 affineInverse( const Matrix4 & mat )
     return Matrix4( inverse( affineMat ) );
 }
 
-inline const Matrix4 orthoInverse( const Matrix4 & mat )
+__forceinline const Matrix4 orthoInverse( const Matrix4 & mat )
 {
     Transform3 affineMat;
     affineMat.setCol0( mat.getCol0().getXYZ( ) );
@@ -906,7 +906,7 @@ inline const Matrix4 orthoInverse( const Matrix4 & mat )
     return Matrix4( orthoInverse( affineMat ) );
 }
 
-inline const floatInVec determinant( const Matrix4 & mat )
+__forceinline const floatInVec determinant( const Matrix4 & mat )
 {
 	__m128 Va,Vb,Vc;
 	__m128 r1,r2,r3,tt,tt2;
@@ -947,7 +947,7 @@ inline const floatInVec determinant( const Matrix4 & mat )
 	return floatInVec(Det, 0);
 }
 
-inline const Matrix4 Matrix4::operator +( const Matrix4 & mat ) const
+__forceinline const Matrix4 Matrix4::operator +( const Matrix4 & mat ) const
 {
     return Matrix4(
         ( mCol0 + mat.mCol0 ),
@@ -957,7 +957,7 @@ inline const Matrix4 Matrix4::operator +( const Matrix4 & mat ) const
     );
 }
 
-inline const Matrix4 Matrix4::operator -( const Matrix4 & mat ) const
+__forceinline const Matrix4 Matrix4::operator -( const Matrix4 & mat ) const
 {
     return Matrix4(
         ( mCol0 - mat.mCol0 ),
@@ -967,19 +967,19 @@ inline const Matrix4 Matrix4::operator -( const Matrix4 & mat ) const
     );
 }
 
-inline Matrix4 & Matrix4::operator +=( const Matrix4 & mat )
+__forceinline Matrix4 & Matrix4::operator +=( const Matrix4 & mat )
 {
     *this = *this + mat;
     return *this;
 }
 
-inline Matrix4 & Matrix4::operator -=( const Matrix4 & mat )
+__forceinline Matrix4 & Matrix4::operator -=( const Matrix4 & mat )
 {
     *this = *this - mat;
     return *this;
 }
 
-inline const Matrix4 Matrix4::operator -( ) const
+__forceinline const Matrix4 Matrix4::operator -( ) const
 {
     return Matrix4(
         ( -mCol0 ),
@@ -989,7 +989,7 @@ inline const Matrix4 Matrix4::operator -( ) const
     );
 }
 
-inline const Matrix4 absPerElem( const Matrix4 & mat )
+__forceinline const Matrix4 absPerElem( const Matrix4 & mat )
 {
     return Matrix4(
         absPerElem( mat.getCol0() ),
@@ -999,12 +999,12 @@ inline const Matrix4 absPerElem( const Matrix4 & mat )
     );
 }
 
-inline const Matrix4 Matrix4::operator *( float scalar ) const
+__forceinline const Matrix4 Matrix4::operator *( float scalar ) const
 {
     return *this * floatInVec(scalar);
 }
 
-inline const Matrix4 Matrix4::operator *( const floatInVec &scalar ) const
+__forceinline const Matrix4 Matrix4::operator *( const floatInVec &scalar ) const
 {
     return Matrix4(
         ( mCol0 * scalar ),
@@ -1014,28 +1014,28 @@ inline const Matrix4 Matrix4::operator *( const floatInVec &scalar ) const
     );
 }
 
-inline Matrix4 & Matrix4::operator *=( float scalar )
+__forceinline Matrix4 & Matrix4::operator *=( float scalar )
 {
     return *this *= floatInVec(scalar);
 }
 
-inline Matrix4 & Matrix4::operator *=( const floatInVec &scalar )
+__forceinline Matrix4 & Matrix4::operator *=( const floatInVec &scalar )
 {
     *this = *this * scalar;
     return *this;
 }
 
-inline const Matrix4 operator *( float scalar, const Matrix4 & mat )
+__forceinline const Matrix4 operator *( float scalar, const Matrix4 & mat )
 {
     return floatInVec(scalar) * mat;
 }
 
-inline const Matrix4 operator *( const floatInVec &scalar, const Matrix4 & mat )
+__forceinline const Matrix4 operator *( const floatInVec &scalar, const Matrix4 & mat )
 {
     return mat * scalar;
 }
 
-inline const Vector4 Matrix4::operator *( const Vector4 &vec ) const
+__forceinline const Vector4 Matrix4::operator *( const Vector4 &vec ) const
 {
     return Vector4(
 		_mm_add_ps(
@@ -1044,7 +1044,7 @@ inline const Vector4 Matrix4::operator *( const Vector4 &vec ) const
 		);
 }
 
-inline const Vector4 Matrix4::operator *( const Vector3 &vec ) const
+__forceinline const Vector4 Matrix4::operator *( const Vector3 &vec ) const
 {
     return Vector4(
 		_mm_add_ps(
@@ -1053,7 +1053,7 @@ inline const Vector4 Matrix4::operator *( const Vector3 &vec ) const
 		);
 }
 
-inline const Vector4 Matrix4::operator *( const Point3 &pnt ) const
+__forceinline const Vector4 Matrix4::operator *( const Point3 &pnt ) const
 {
     return Vector4(
 		_mm_add_ps(
@@ -1062,7 +1062,7 @@ inline const Vector4 Matrix4::operator *( const Point3 &pnt ) const
 		);
 }
 
-inline const Matrix4 Matrix4::operator *( const Matrix4 & mat ) const
+__forceinline const Matrix4 Matrix4::operator *( const Matrix4 & mat ) const
 {
     return Matrix4(
         ( *this * mat.mCol0 ),
@@ -1072,13 +1072,13 @@ inline const Matrix4 Matrix4::operator *( const Matrix4 & mat ) const
     );
 }
 
-inline Matrix4 & Matrix4::operator *=( const Matrix4 & mat )
+__forceinline Matrix4 & Matrix4::operator *=( const Matrix4 & mat )
 {
     *this = *this * mat;
     return *this;
 }
 
-inline const Matrix4 Matrix4::operator *( const Transform3 & tfrm ) const
+__forceinline const Matrix4 Matrix4::operator *( const Transform3 & tfrm ) const
 {
     return Matrix4(
         ( *this * tfrm.getCol0() ),
@@ -1088,13 +1088,13 @@ inline const Matrix4 Matrix4::operator *( const Transform3 & tfrm ) const
     );
 }
 
-inline Matrix4 & Matrix4::operator *=( const Transform3 & tfrm )
+__forceinline Matrix4 & Matrix4::operator *=( const Transform3 & tfrm )
 {
     *this = *this * tfrm;
     return *this;
 }
 
-inline const Matrix4 mulPerElem( const Matrix4 & mat0, const Matrix4 & mat1 )
+__forceinline const Matrix4 mulPerElem( const Matrix4 & mat0, const Matrix4 & mat1 )
 {
     return Matrix4(
         mulPerElem( mat0.getCol0(), mat1.getCol0() ),
@@ -1104,7 +1104,7 @@ inline const Matrix4 mulPerElem( const Matrix4 & mat0, const Matrix4 & mat1 )
     );
 }
 
-inline const Matrix4 Matrix4::identity( )
+__forceinline const Matrix4 Matrix4::identity( )
 {
     return Matrix4(
         Vector4::xAxis( ),
@@ -1114,7 +1114,7 @@ inline const Matrix4 Matrix4::identity( )
     );
 }
 
-inline Matrix4 & Matrix4::setUpper3x3( const Matrix3 & mat3 )
+__forceinline Matrix4 & Matrix4::setUpper3x3( const Matrix3 & mat3 )
 {
     mCol0.setXYZ( mat3.getCol0() );
     mCol1.setXYZ( mat3.getCol1() );
@@ -1122,7 +1122,7 @@ inline Matrix4 & Matrix4::setUpper3x3( const Matrix3 & mat3 )
     return *this;
 }
 
-inline const Matrix3 Matrix4::getUpper3x3( ) const
+__forceinline const Matrix3 Matrix4::getUpper3x3( ) const
 {
     return Matrix3(
         mCol0.getXYZ( ),
@@ -1131,23 +1131,23 @@ inline const Matrix3 Matrix4::getUpper3x3( ) const
     );
 }
 
-inline Matrix4 & Matrix4::setTranslation( const Vector3 &translateVec )
+__forceinline Matrix4 & Matrix4::setTranslation( const Vector3 &translateVec )
 {
     mCol3.setXYZ( translateVec );
     return *this;
 }
 
-inline const Vector3 Matrix4::getTranslation( ) const
+__forceinline const Vector3 Matrix4::getTranslation( ) const
 {
     return mCol3.getXYZ( );
 }
 
-inline const Matrix4 Matrix4::rotationX( float radians )
+__forceinline const Matrix4 Matrix4::rotationX( float radians )
 {
     return rotationX( floatInVec(radians) );
 }
 
-inline const Matrix4 Matrix4::rotationX( const floatInVec &radians )
+__forceinline const Matrix4 Matrix4::rotationX( const floatInVec &radians )
 {
     __m128 s, c, res1, res2;
     __m128 zero;
@@ -1167,12 +1167,12 @@ inline const Matrix4 Matrix4::rotationX( const floatInVec &radians )
     );
 }
 
-inline const Matrix4 Matrix4::rotationY( float radians )
+__forceinline const Matrix4 Matrix4::rotationY( float radians )
 {
     return rotationY( floatInVec(radians) );
 }
 
-inline const Matrix4 Matrix4::rotationY( const floatInVec &radians )
+__forceinline const Matrix4 Matrix4::rotationY( const floatInVec &radians )
 {
     __m128 s, c, res0, res2;
     __m128 zero;
@@ -1192,12 +1192,12 @@ inline const Matrix4 Matrix4::rotationY( const floatInVec &radians )
     );
 }
 
-inline const Matrix4 Matrix4::rotationZ( float radians )
+__forceinline const Matrix4 Matrix4::rotationZ( float radians )
 {
     return rotationZ( floatInVec(radians) );
 }
 
-inline const Matrix4 Matrix4::rotationZ( const floatInVec &radians )
+__forceinline const Matrix4 Matrix4::rotationZ( const floatInVec &radians )
 {
     __m128 s, c, res0, res1;
     __m128 zero;
@@ -1217,7 +1217,7 @@ inline const Matrix4 Matrix4::rotationZ( const floatInVec &radians )
     );
 }
 
-inline const Matrix4 Matrix4::rotationZYX( const Vector3 &radiansXYZ )
+__forceinline const Matrix4 Matrix4::rotationZYX( const Vector3 &radiansXYZ )
 {
     __m128 angles, s, negS, c, X0, X1, Y0, Y1, Z0, Z1, tmp;
     angles = Vector4( radiansXYZ, 0.0f ).get128();
@@ -1240,12 +1240,12 @@ inline const Matrix4 Matrix4::rotationZYX( const Vector3 &radiansXYZ )
     );
 }
 
-inline const Matrix4 Matrix4::rotation( float radians, const Vector3 &unitVec )
+__forceinline const Matrix4 Matrix4::rotation( float radians, const Vector3 &unitVec )
 {
     return rotation( floatInVec(radians), unitVec );
 }
 
-inline const Matrix4 Matrix4::rotation( const floatInVec &radians, const Vector3 &unitVec )
+__forceinline const Matrix4 Matrix4::rotation( const floatInVec &radians, const Vector3 &unitVec )
 {
     __m128 axis, s, c, oneMinusC, axisS, negAxisS, xxxx, yyyy, zzzz, tmp0, tmp1, tmp2;
     axis = unitVec.get128();
@@ -1283,12 +1283,12 @@ inline const Matrix4 Matrix4::rotation( const floatInVec &radians, const Vector3
     );
 }
 
-inline const Matrix4 Matrix4::rotation( const Quat &unitQuat )
+__forceinline const Matrix4 Matrix4::rotation( const Quat &unitQuat )
 {
     return Matrix4( Transform3::rotation( unitQuat ) );
 }
 
-inline const Matrix4 Matrix4::scale( const Vector3 &scaleVec )
+__forceinline const Matrix4 Matrix4::scale( const Vector3 &scaleVec )
 {
     __m128 zero = _mm_setzero_ps();
 	__declspec(align(16)) unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
@@ -1302,7 +1302,7 @@ inline const Matrix4 Matrix4::scale( const Vector3 &scaleVec )
     );
 }
 
-inline const Matrix4 appendScale( const Matrix4 & mat, const Vector3 &scaleVec )
+__forceinline const Matrix4 appendScale( const Matrix4 & mat, const Vector3 &scaleVec )
 {
     return Matrix4(
         ( mat.getCol0() * scaleVec.getX( ) ),
@@ -1312,7 +1312,7 @@ inline const Matrix4 appendScale( const Matrix4 & mat, const Vector3 &scaleVec )
     );
 }
 
-inline const Matrix4 prependScale( const Vector3 &scaleVec, const Matrix4 & mat )
+__forceinline const Matrix4 prependScale( const Vector3 &scaleVec, const Matrix4 & mat )
 {
     Vector4 scale4;
     scale4 = Vector4( scaleVec, 1.0f );
@@ -1324,7 +1324,7 @@ inline const Matrix4 prependScale( const Vector3 &scaleVec, const Matrix4 & mat
     );
 }
 
-inline const Matrix4 Matrix4::translation( const Vector3 &translateVec )
+__forceinline const Matrix4 Matrix4::translation( const Vector3 &translateVec )
 {
     return Matrix4(
         Vector4::xAxis( ),
@@ -1334,7 +1334,7 @@ inline const Matrix4 Matrix4::translation( const Vector3 &translateVec )
     );
 }
 
-inline const Matrix4 Matrix4::lookAt( const Point3 &eyePos, const Point3 &lookAtPos, const Vector3 &upVec )
+__forceinline const Matrix4 Matrix4::lookAt( const Point3 &eyePos, const Point3 &lookAtPos, const Vector3 &upVec )
 {
     Matrix4 m4EyeFrame;
     Vector3 v3X, v3Y, v3Z;
@@ -1346,7 +1346,7 @@ inline const Matrix4 Matrix4::lookAt( const Point3 &eyePos, const Point3 &lookAt
     return orthoInverse( m4EyeFrame );
 }
 
-inline const Matrix4 Matrix4::perspective( float fovyRadians, float aspect, float zNear, float zFar )
+__forceinline const Matrix4 Matrix4::perspective( float fovyRadians, float aspect, float zNear, float zFar )
 {
     float f, rangeInv;
     __m128 zero, col0, col1, col2, col3;
@@ -1375,7 +1375,7 @@ inline const Matrix4 Matrix4::perspective( float fovyRadians, float aspect, floa
     );
 }
 
-inline const Matrix4 Matrix4::frustum( float left, float right, float bottom, float top, float zNear, float zFar )
+__forceinline const Matrix4 Matrix4::frustum( float left, float right, float bottom, float top, float zNear, float zFar )
 {
     /* function implementation based on code from STIDC SDK:           */
     /* --------------------------------------------------------------  */
@@ -1423,7 +1423,7 @@ inline const Matrix4 Matrix4::frustum( float left, float right, float bottom, fl
 	);
 }
 
-inline const Matrix4 Matrix4::orthographic( float left, float right, float bottom, float top, float zNear, float zFar )
+__forceinline const Matrix4 Matrix4::orthographic( float left, float right, float bottom, float top, float zNear, float zFar )
 {
     /* function implementation based on code from STIDC SDK:           */
     /* --------------------------------------------------------------  */
@@ -1470,7 +1470,7 @@ inline const Matrix4 Matrix4::orthographic( float left, float right, float botto
     );
 }
 
-inline const Matrix4 select( const Matrix4 & mat0, const Matrix4 & mat1, bool select1 )
+__forceinline const Matrix4 select( const Matrix4 & mat0, const Matrix4 & mat1, bool select1 )
 {
     return Matrix4(
         select( mat0.getCol0(), mat1.getCol0(), select1 ),
@@ -1480,7 +1480,7 @@ inline const Matrix4 select( const Matrix4 & mat0, const Matrix4 & mat1, bool se
     );
 }
 
-inline const Matrix4 select( const Matrix4 & mat0, const Matrix4 & mat1, const boolInVec &select1 )
+__forceinline const Matrix4 select( const Matrix4 & mat0, const Matrix4 & mat1, const boolInVec &select1 )
 {
     return Matrix4(
         select( mat0.getCol0(), mat1.getCol0(), select1 ),
@@ -1492,7 +1492,7 @@ inline const Matrix4 select( const Matrix4 & mat0, const Matrix4 & mat1, const b
 
 #ifdef _VECTORMATH_DEBUG
 
-inline void print( const Matrix4 & mat )
+__forceinline void print( const Matrix4 & mat )
 {
     print( mat.getRow( 0 ) );
     print( mat.getRow( 1 ) );
@@ -1500,7 +1500,7 @@ inline void print( const Matrix4 & mat )
     print( mat.getRow( 3 ) );
 }
 
-inline void print( const Matrix4 & mat, const char * name )
+__forceinline void print( const Matrix4 & mat, const char * name )
 {
     printf("%s:\n", name);
     print( mat );
@@ -1508,7 +1508,7 @@ inline void print( const Matrix4 & mat, const char * name )
 
 #endif
 
-inline Transform3::Transform3( const Transform3 & tfrm )
+__forceinline Transform3::Transform3( const Transform3 & tfrm )
 {
     mCol0 = tfrm.mCol0;
     mCol1 = tfrm.mCol1;
@@ -1516,7 +1516,7 @@ inline Transform3::Transform3( const Transform3 & tfrm )
     mCol3 = tfrm.mCol3;
 }
 
-inline Transform3::Transform3( float scalar )
+__forceinline Transform3::Transform3( float scalar )
 {
     mCol0 = Vector3( scalar );
     mCol1 = Vector3( scalar );
@@ -1524,7 +1524,7 @@ inline Transform3::Transform3( float scalar )
     mCol3 = Vector3( scalar );
 }
 
-inline Transform3::Transform3( const floatInVec &scalar )
+__forceinline Transform3::Transform3( const floatInVec &scalar )
 {
     mCol0 = Vector3( scalar );
     mCol1 = Vector3( scalar );
@@ -1532,7 +1532,7 @@ inline Transform3::Transform3( const floatInVec &scalar )
     mCol3 = Vector3( scalar );
 }
 
-inline Transform3::Transform3( const Vector3 &_col0, const Vector3 &_col1, const Vector3 &_col2, const Vector3 &_col3 )
+__forceinline Transform3::Transform3( const Vector3 &_col0, const Vector3 &_col1, const Vector3 &_col2, const Vector3 &_col3 )
 {
     mCol0 = _col0;
     mCol1 = _col1;
@@ -1540,49 +1540,49 @@ inline Transform3::Transform3( const Vector3 &_col0, const Vector3 &_col1, const
     mCol3 = _col3;
 }
 
-inline Transform3::Transform3( const Matrix3 & tfrm, const Vector3 &translateVec )
+__forceinline Transform3::Transform3( const Matrix3 & tfrm, const Vector3 &translateVec )
 {
     this->setUpper3x3( tfrm );
     this->setTranslation( translateVec );
 }
 
-inline Transform3::Transform3( const Quat &unitQuat, const Vector3 &translateVec )
+__forceinline Transform3::Transform3( const Quat &unitQuat, const Vector3 &translateVec )
 {
     this->setUpper3x3( Matrix3( unitQuat ) );
     this->setTranslation( translateVec );
 }
 
-inline Transform3 & Transform3::setCol0( const Vector3 &_col0 )
+__forceinline Transform3 & Transform3::setCol0( const Vector3 &_col0 )
 {
     mCol0 = _col0;
     return *this;
 }
 
-inline Transform3 & Transform3::setCol1( const Vector3 &_col1 )
+__forceinline Transform3 & Transform3::setCol1( const Vector3 &_col1 )
 {
     mCol1 = _col1;
     return *this;
 }
 
-inline Transform3 & Transform3::setCol2( const Vector3 &_col2 )
+__forceinline Transform3 & Transform3::setCol2( const Vector3 &_col2 )
 {
     mCol2 = _col2;
     return *this;
 }
 
-inline Transform3 & Transform3::setCol3( const Vector3 &_col3 )
+__forceinline Transform3 & Transform3::setCol3( const Vector3 &_col3 )
 {
     mCol3 = _col3;
     return *this;
 }
 
-inline Transform3 & Transform3::setCol( int col, const Vector3 &vec )
+__forceinline Transform3 & Transform3::setCol( int col, const Vector3 &vec )
 {
     *(&mCol0 + col) = vec;
     return *this;
 }
 
-inline Transform3 & Transform3::setRow( int row, const Vector4 &vec )
+__forceinline Transform3 & Transform3::setRow( int row, const Vector4 &vec )
 {
     mCol0.setElem( row, vec.getElem( 0 ) );
     mCol1.setElem( row, vec.getElem( 1 ) );
@@ -1591,13 +1591,13 @@ inline Transform3 & Transform3::setRow( int row, const Vector4 &vec )
     return *this;
 }
 
-inline Transform3 & Transform3::setElem( int col, int row, float val )
+__forceinline Transform3 & Transform3::setElem( int col, int row, float val )
 {
     (*this)[col].setElem(row, val);
     return *this;
 }
 
-inline Transform3 & Transform3::setElem( int col, int row, const floatInVec &val )
+__forceinline Transform3 & Transform3::setElem( int col, int row, const floatInVec &val )
 {
     Vector3 tmpV3_0;
     tmpV3_0 = this->getCol( col );
@@ -1606,52 +1606,52 @@ inline Transform3 & Transform3::setElem( int col, int row, const floatInVec &val
     return *this;
 }
 
-inline const floatInVec Transform3::getElem( int col, int row ) const
+__forceinline const floatInVec Transform3::getElem( int col, int row ) const
 {
     return this->getCol( col ).getElem( row );
 }
 
-inline const Vector3 Transform3::getCol0( ) const
+__forceinline const Vector3 Transform3::getCol0( ) const
 {
     return mCol0;
 }
 
-inline const Vector3 Transform3::getCol1( ) const
+__forceinline const Vector3 Transform3::getCol1( ) const
 {
     return mCol1;
 }
 
-inline const Vector3 Transform3::getCol2( ) const
+__forceinline const Vector3 Transform3::getCol2( ) const
 {
     return mCol2;
 }
 
-inline const Vector3 Transform3::getCol3( ) const
+__forceinline const Vector3 Transform3::getCol3( ) const
 {
     return mCol3;
 }
 
-inline const Vector3 Transform3::getCol( int col ) const
+__forceinline const Vector3 Transform3::getCol( int col ) const
 {
     return *(&mCol0 + col);
 }
 
-inline const Vector4 Transform3::getRow( int row ) const
+__forceinline const Vector4 Transform3::getRow( int row ) const
 {
     return Vector4( mCol0.getElem( row ), mCol1.getElem( row ), mCol2.getElem( row ), mCol3.getElem( row ) );
 }
 
-inline Vector3 & Transform3::operator []( int col )
+__forceinline Vector3 & Transform3::operator []( int col )
 {
     return *(&mCol0 + col);
 }
 
-inline const Vector3 Transform3::operator []( int col ) const
+__forceinline const Vector3 Transform3::operator []( int col ) const
 {
     return *(&mCol0 + col);
 }
 
-inline Transform3 & Transform3::operator =( const Transform3 & tfrm )
+__forceinline Transform3 & Transform3::operator =( const Transform3 & tfrm )
 {
     mCol0 = tfrm.mCol0;
     mCol1 = tfrm.mCol1;
@@ -1660,7 +1660,7 @@ inline Transform3 & Transform3::operator =( const Transform3 & tfrm )
     return *this;
 }
 
-inline const Transform3 inverse( const Transform3 & tfrm )
+__forceinline const Transform3 inverse( const Transform3 & tfrm )
 {
     __m128 inv0, inv1, inv2, inv3;
     __m128 tmp0, tmp1, tmp2, tmp3, tmp4, dot, invdet;
@@ -1700,7 +1700,7 @@ inline const Transform3 inverse( const Transform3 & tfrm )
     );
 }
 
-inline const Transform3 orthoInverse( const Transform3 & tfrm )
+__forceinline const Transform3 orthoInverse( const Transform3 & tfrm )
 {
     __m128 inv0, inv1, inv2, inv3;
     __m128 tmp0, tmp1;
@@ -1730,7 +1730,7 @@ inline const Transform3 orthoInverse( const Transform3 & tfrm )
     );
 }
 
-inline const Transform3 absPerElem( const Transform3 & tfrm )
+__forceinline const Transform3 absPerElem( const Transform3 & tfrm )
 {
     return Transform3(
         absPerElem( tfrm.getCol0() ),
@@ -1740,7 +1740,7 @@ inline const Transform3 absPerElem( const Transform3 & tfrm )
     );
 }
 
-inline const Vector3 Transform3::operator *( const Vector3 &vec ) const
+__forceinline const Vector3 Transform3::operator *( const Vector3 &vec ) const
 {
     __m128 res;
     __m128 xxxx, yyyy, zzzz;
@@ -1753,7 +1753,7 @@ inline const Vector3 Transform3::operator *( const Vector3 &vec ) const
     return Vector3( res );
 }
 
-inline const Point3 Transform3::operator *( const Point3 &pnt ) const
+__forceinline const Point3 Transform3::operator *( const Point3 &pnt ) const
 {
     __m128 tmp0, tmp1, res;
     __m128 xxxx, yyyy, zzzz;
@@ -1768,7 +1768,7 @@ inline const Point3 Transform3::operator *( const Point3 &pnt ) const
     return Point3( res );
 }
 
-inline const Transform3 Transform3::operator *( const Transform3 & tfrm ) const
+__forceinline const Transform3 Transform3::operator *( const Transform3 & tfrm ) const
 {
     return Transform3(
         ( *this * tfrm.mCol0 ),
@@ -1778,13 +1778,13 @@ inline const Transform3 Transform3::operator *( const Transform3 & tfrm ) const
     );
 }
 
-inline Transform3 & Transform3::operator *=( const Transform3 & tfrm )
+__forceinline Transform3 & Transform3::operator *=( const Transform3 & tfrm )
 {
     *this = *this * tfrm;
     return *this;
 }
 
-inline const Transform3 mulPerElem( const Transform3 & tfrm0, const Transform3 & tfrm1 )
+__forceinline const Transform3 mulPerElem( const Transform3 & tfrm0, const Transform3 & tfrm1 )
 {
     return Transform3(
         mulPerElem( tfrm0.getCol0(), tfrm1.getCol0() ),
@@ -1794,7 +1794,7 @@ inline const Transform3 mulPerElem( const Transform3 & tfrm0, const Transform3 &
     );
 }
 
-inline const Transform3 Transform3::identity( )
+__forceinline const Transform3 Transform3::identity( )
 {
     return Transform3(
         Vector3::xAxis( ),
@@ -1804,7 +1804,7 @@ inline const Transform3 Transform3::identity( )
     );
 }
 
-inline Transform3 & Transform3::setUpper3x3( const Matrix3 & tfrm )
+__forceinline Transform3 & Transform3::setUpper3x3( const Matrix3 & tfrm )
 {
     mCol0 = tfrm.getCol0();
     mCol1 = tfrm.getCol1();
@@ -1812,28 +1812,28 @@ inline Transform3 & Transform3::setUpper3x3( const Matrix3 & tfrm )
     return *this;
 }
 
-inline const Matrix3 Transform3::getUpper3x3( ) const
+__forceinline const Matrix3 Transform3::getUpper3x3( ) const
 {
     return Matrix3( mCol0, mCol1, mCol2 );
 }
 
-inline Transform3 & Transform3::setTranslation( const Vector3 &translateVec )
+__forceinline Transform3 & Transform3::setTranslation( const Vector3 &translateVec )
 {
     mCol3 = translateVec;
     return *this;
 }
 
-inline const Vector3 Transform3::getTranslation( ) const
+__forceinline const Vector3 Transform3::getTranslation( ) const
 {
     return mCol3;
 }
 
-inline const Transform3 Transform3::rotationX( float radians )
+__forceinline const Transform3 Transform3::rotationX( float radians )
 {
     return rotationX( floatInVec(radians) );
 }
 
-inline const Transform3 Transform3::rotationX( const floatInVec &radians )
+__forceinline const Transform3 Transform3::rotationX( const floatInVec &radians )
 {
     __m128 s, c, res1, res2;
     __m128 zero;
@@ -1853,12 +1853,12 @@ inline const Transform3 Transform3::rotationX( const floatInVec &radians )
     );
 }
 
-inline const Transform3 Transform3::rotationY( float radians )
+__forceinline const Transform3 Transform3::rotationY( float radians )
 {
     return rotationY( floatInVec(radians) );
 }
 
-inline const Transform3 Transform3::rotationY( const floatInVec &radians )
+__forceinline const Transform3 Transform3::rotationY( const floatInVec &radians )
 {
     __m128 s, c, res0, res2;
     __m128 zero;
@@ -1878,12 +1878,12 @@ inline const Transform3 Transform3::rotationY( const floatInVec &radians )
     );
 }
 
-inline const Transform3 Transform3::rotationZ( float radians )
+__forceinline const Transform3 Transform3::rotationZ( float radians )
 {
     return rotationZ( floatInVec(radians) );
 }
 
-inline const Transform3 Transform3::rotationZ( const floatInVec &radians )
+__forceinline const Transform3 Transform3::rotationZ( const floatInVec &radians )
 {
     __m128 s, c, res0, res1;
 	__declspec(align(16)) unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
@@ -1902,7 +1902,7 @@ inline const Transform3 Transform3::rotationZ( const floatInVec &radians )
     );
 }
 
-inline const Transform3 Transform3::rotationZYX( const Vector3 &radiansXYZ )
+__forceinline const Transform3 Transform3::rotationZYX( const Vector3 &radiansXYZ )
 {
     __m128 angles, s, negS, c, X0, X1, Y0, Y1, Z0, Z1, tmp;
     angles = Vector4( radiansXYZ, 0.0f ).get128();
@@ -1925,22 +1925,22 @@ inline const Transform3 Transform3::rotationZYX( const Vector3 &radiansXYZ )
     );
 }
 
-inline const Transform3 Transform3::rotation( float radians, const Vector3 &unitVec )
+__forceinline const Transform3 Transform3::rotation( float radians, const Vector3 &unitVec )
 {
     return rotation( floatInVec(radians), unitVec );
 }
 
-inline const Transform3 Transform3::rotation( const floatInVec &radians, const Vector3 &unitVec )
+__forceinline const Transform3 Transform3::rotation( const floatInVec &radians, const Vector3 &unitVec )
 {
     return Transform3( Matrix3::rotation( radians, unitVec ), Vector3( 0.0f ) );
 }
 
-inline const Transform3 Transform3::rotation( const Quat &unitQuat )
+__forceinline const Transform3 Transform3::rotation( const Quat &unitQuat )
 {
     return Transform3( Matrix3( unitQuat ), Vector3( 0.0f ) );
 }
 
-inline const Transform3 Transform3::scale( const Vector3 &scaleVec )
+__forceinline const Transform3 Transform3::scale( const Vector3 &scaleVec )
 {
     __m128 zero = _mm_setzero_ps();
 	__declspec(align(16)) unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
@@ -1954,7 +1954,7 @@ inline const Transform3 Transform3::scale( const Vector3 &scaleVec )
     );
 }
 
-inline const Transform3 appendScale( const Transform3 & tfrm, const Vector3 &scaleVec )
+__forceinline const Transform3 appendScale( const Transform3 & tfrm, const Vector3 &scaleVec )
 {
     return Transform3(
         ( tfrm.getCol0() * scaleVec.getX( ) ),
@@ -1964,7 +1964,7 @@ inline const Transform3 appendScale( const Transform3 & tfrm, const Vector3 &sca
     );
 }
 
-inline const Transform3 prependScale( const Vector3 &scaleVec, const Transform3 & tfrm )
+__forceinline const Transform3 prependScale( const Vector3 &scaleVec, const Transform3 & tfrm )
 {
     return Transform3(
         mulPerElem( tfrm.getCol0(), scaleVec ),
@@ -1974,7 +1974,7 @@ inline const Transform3 prependScale( const Vector3 &scaleVec, const Transform3
     );
 }
 
-inline const Transform3 Transform3::translation( const Vector3 &translateVec )
+__forceinline const Transform3 Transform3::translation( const Vector3 &translateVec )
 {
     return Transform3(
         Vector3::xAxis( ),
@@ -1984,7 +1984,7 @@ inline const Transform3 Transform3::translation( const Vector3 &translateVec )
     );
 }
 
-inline const Transform3 select( const Transform3 & tfrm0, const Transform3 & tfrm1, bool select1 )
+__forceinline const Transform3 select( const Transform3 & tfrm0, const Transform3 & tfrm1, bool select1 )
 {
     return Transform3(
         select( tfrm0.getCol0(), tfrm1.getCol0(), select1 ),
@@ -1994,7 +1994,7 @@ inline const Transform3 select( const Transform3 & tfrm0, const Transform3 & tfr
     );
 }
 
-inline const Transform3 select( const Transform3 & tfrm0, const Transform3 & tfrm1, const boolInVec &select1 )
+__forceinline const Transform3 select( const Transform3 & tfrm0, const Transform3 & tfrm1, const boolInVec &select1 )
 {
     return Transform3(
         select( tfrm0.getCol0(), tfrm1.getCol0(), select1 ),
@@ -2006,14 +2006,14 @@ inline const Transform3 select( const Transform3 & tfrm0, const Transform3 & tfr
 
 #ifdef _VECTORMATH_DEBUG
 
-inline void print( const Transform3 & tfrm )
+__forceinline void print( const Transform3 & tfrm )
 {
     print( tfrm.getRow( 0 ) );
     print( tfrm.getRow( 1 ) );
     print( tfrm.getRow( 2 ) );
 }
 
-inline void print( const Transform3 & tfrm, const char * name )
+__forceinline void print( const Transform3 & tfrm, const char * name )
 {
     printf("%s:\n", name);
     print( tfrm );
@@ -2021,7 +2021,7 @@ inline void print( const Transform3 & tfrm, const char * name )
 
 #endif
 
-inline Quat::Quat( const Matrix3 & tfrm )
+__forceinline Quat::Quat( const Matrix3 & tfrm )
 {
     __m128 res;
     __m128 col0, col1, col2;
@@ -2109,7 +2109,7 @@ inline Quat::Quat( const Matrix3 & tfrm )
     mVec128 = res;
 }
 
-inline const Matrix3 outer( const Vector3 &tfrm0, const Vector3 &tfrm1 )
+__forceinline const Matrix3 outer( const Vector3 &tfrm0, const Vector3 &tfrm1 )
 {
     return Matrix3(
         ( tfrm0 * tfrm1.getX( ) ),
@@ -2118,7 +2118,7 @@ inline const Matrix3 outer( const Vector3 &tfrm0, const Vector3 &tfrm1 )
     );
 }
 
-inline const Matrix4 outer( const Vector4 &tfrm0, const Vector4 &tfrm1 )
+__forceinline const Matrix4 outer( const Vector4 &tfrm0, const Vector4 &tfrm1 )
 {
     return Matrix4(
         ( tfrm0 * tfrm1.getX( ) ),
@@ -2128,7 +2128,7 @@ inline const Matrix4 outer( const Vector4 &tfrm0, const Vector4 &tfrm1 )
     );
 }
 
-inline const Vector3 rowMul( const Vector3 &vec, const Matrix3 & mat )
+__forceinline const Vector3 rowMul( const Vector3 &vec, const Matrix3 & mat )
 {
     __m128 tmp0, tmp1, mcol0, mcol1, mcol2, res;
     __m128 xxxx, yyyy, zzzz;
@@ -2151,7 +2151,7 @@ inline const Vector3 rowMul( const Vector3 &vec, const Matrix3 & mat )
     return Vector3( res );
 }
 
-inline const Matrix3 crossMatrix( const Vector3 &vec )
+__forceinline const Matrix3 crossMatrix( const Vector3 &vec )
 {
     __m128 neg, res0, res1, res2;
     neg = negatef4( vec.get128() );
@@ -2179,7 +2179,7 @@ inline const Matrix3 crossMatrix( const Vector3 &vec )
     );
 }
 
-inline const Matrix3 crossMatrixMul( const Vector3 &vec, const Matrix3 & mat )
+__forceinline const Matrix3 crossMatrixMul( const Vector3 &vec, const Matrix3 & mat )
 {
     return Matrix3( cross( vec, mat.getCol0() ), cross( vec, mat.getCol1() ), cross( vec, mat.getCol2() ) );
 }
diff --git a/Extras/vectormathlibrary/include/vectormath/SSE/cpp/quat_aos.h b/Extras/vectormathlibrary/include/vectormath/SSE/cpp/quat_aos.h
index a7cd4e145..081cb3a4d 100644
--- a/Extras/vectormathlibrary/include/vectormath/SSE/cpp/quat_aos.h
+++ b/Extras/vectormathlibrary/include/vectormath/SSE/cpp/quat_aos.h
@@ -1,5 +1,5 @@
 /*
-   Copyright (C) 2006, 2007 Sony Computer Entertainment Inc.
+   Copyright (C) 2006, 2010 Sony Computer Entertainment Inc.
    All rights reserved.
 
    Redistribution and use in source and binary forms,
@@ -42,80 +42,96 @@
 namespace Vectormath {
 namespace Aos {
 
-inline Quat::Quat( float _x, float _y, float _z, float _w )
+__forceinline void Quat::set128(vec_float4 vec)
 {
-    mVec128 = _mm_setr_ps(_x, _y, _z, _w);
+    mVec128 = vec;
 }
 
-inline Quat::Quat( const floatInVec &_x, const floatInVec &_y, const floatInVec &_z, const floatInVec &_w )
+__forceinline Quat::Quat( const floatInVec &_x, const floatInVec &_y, const floatInVec &_z, const floatInVec &_w )
 {
 	mVec128 = _mm_unpacklo_ps(
 		_mm_unpacklo_ps( _x.get128(), _z.get128() ),
 		_mm_unpacklo_ps( _y.get128(), _w.get128() ) );
 }
 
-inline Quat::Quat( const Vector3 &xyz, float _w )
+__forceinline Quat::Quat( const Vector3 &xyz, float _w )
 {
     mVec128 = xyz.get128();
     _vmathVfSetElement(mVec128, _w, 3);
 }
 
-inline Quat::Quat( const Vector3 &xyz, const floatInVec &_w )
+
+
+__forceinline  Quat::Quat(const Quat& quat)
+{
+	mVec128 = quat.get128();
+}
+
+__forceinline Quat::Quat( float _x, float _y, float _z, float _w )
+{
+	mVec128 = _mm_setr_ps(_x, _y, _z, _w);
+}
+
+
+
+
+
+__forceinline Quat::Quat( const Vector3 &xyz, const floatInVec &_w )
 {
     mVec128 = xyz.get128();
     mVec128 = _vmathVfInsert(mVec128, _w.get128(), 3);
 }
 
-inline Quat::Quat( const Vector4 &vec )
+__forceinline Quat::Quat( const Vector4 &vec )
 {
     mVec128 = vec.get128();
 }
 
-inline Quat::Quat( float scalar )
+__forceinline Quat::Quat( float scalar )
 {
     mVec128 = floatInVec(scalar).get128();
 }
 
-inline Quat::Quat( const floatInVec &scalar )
+__forceinline Quat::Quat( const floatInVec &scalar )
 {
     mVec128 = scalar.get128();
 }
 
-inline Quat::Quat( __m128 vf4 )
+__forceinline Quat::Quat( __m128 vf4 )
 {
     mVec128 = vf4;
 }
 
-inline const Quat Quat::identity( )
+__forceinline const Quat Quat::identity( )
 {
     return Quat( _VECTORMATH_UNIT_0001 );
 }
 
-inline const Quat lerp( float t, const Quat &quat0, const Quat &quat1 )
+__forceinline const Quat lerp( float t, const Quat &quat0, const Quat &quat1 )
 {
     return lerp( floatInVec(t), quat0, quat1 );
 }
 
-inline const Quat lerp( const floatInVec &t, const Quat &quat0, const Quat &quat1 )
+__forceinline const Quat lerp( const floatInVec &t, const Quat &quat0, const Quat &quat1 )
 {
     return ( quat0 + ( ( quat1 - quat0 ) * t ) );
 }
 
-inline const Quat slerp( float t, const Quat &unitQuat0, const Quat &unitQuat1 )
+__forceinline const Quat slerp( float t, const Quat &unitQuat0, const Quat &unitQuat1 )
 {
     return slerp( floatInVec(t), unitQuat0, unitQuat1 );
 }
 
-inline const Quat slerp( const floatInVec &t, const Quat &unitQuat0, const Quat &unitQuat1 )
+__forceinline const Quat slerp( const floatInVec &t, const Quat &unitQuat0, const Quat &unitQuat1 )
 {
     Quat start;
     vec_float4 scales, scale0, scale1, cosAngle, angle, tttt, oneMinusT, angles, sines;
-    vec_uint4 selectMask;
+    __m128 selectMask;
     cosAngle = _vmathVfDot4( unitQuat0.get128(), unitQuat1.get128() );
-    selectMask = (vec_uint4)vec_cmpgt( _mm_setzero_ps(), cosAngle );
+    selectMask = (__m128)vec_cmpgt( _mm_setzero_ps(), cosAngle );
     cosAngle = vec_sel( cosAngle, negatef4( cosAngle ), selectMask );
     start = Quat( vec_sel( unitQuat0.get128(), negatef4( unitQuat0.get128() ), selectMask ) );
-    selectMask = (vec_uint4)vec_cmpgt( _mm_set1_ps(_VECTORMATH_SLERP_TOL), cosAngle );
+    selectMask = (__m128)vec_cmpgt( _mm_set1_ps(_VECTORMATH_SLERP_TOL), cosAngle );
     angle = acosf4( cosAngle );
     tttt = t.get128();
     oneMinusT = vec_sub( _mm_set1_ps(1.0f), tttt );
@@ -129,236 +145,239 @@ inline const Quat slerp( const floatInVec &t, const Quat &unitQuat0, const Quat
     return Quat( vec_madd( start.get128(), scale0, vec_mul( unitQuat1.get128(), scale1 ) ) );
 }
 
-inline const Quat squad( float t, const Quat &unitQuat0, const Quat &unitQuat1, const Quat &unitQuat2, const Quat &unitQuat3 )
+__forceinline const Quat squad( float t, const Quat &unitQuat0, const Quat &unitQuat1, const Quat &unitQuat2, const Quat &unitQuat3 )
 {
     return squad( floatInVec(t), unitQuat0, unitQuat1, unitQuat2, unitQuat3 );
 }
 
-inline const Quat squad( const floatInVec &t, const Quat &unitQuat0, const Quat &unitQuat1, const Quat &unitQuat2, const Quat &unitQuat3 )
+__forceinline const Quat squad( const floatInVec &t, const Quat &unitQuat0, const Quat &unitQuat1, const Quat &unitQuat2, const Quat &unitQuat3 )
 {
     return slerp( ( ( floatInVec(2.0f) * t ) * ( floatInVec(1.0f) - t ) ), slerp( t, unitQuat0, unitQuat3 ), slerp( t, unitQuat1, unitQuat2 ) );
 }
 
-inline __m128 Quat::get128( ) const
+__forceinline __m128 Quat::get128( ) const
 {
     return mVec128;
 }
 
-inline Quat & Quat::operator =( const Quat &quat )
+__forceinline Quat & Quat::operator =( const Quat &quat )
 {
     mVec128 = quat.mVec128;
     return *this;
 }
 
-inline Quat & Quat::setXYZ( const Vector3 &vec )
+__forceinline Quat & Quat::setXYZ( const Vector3 &vec )
 {
 	__declspec(align(16)) unsigned int sw[4] = {0, 0, 0, 0xffffffff};
 	mVec128 = vec_sel( vec.get128(), mVec128, sw );
     return *this;
 }
 
-inline const Vector3 Quat::getXYZ( ) const
+__forceinline const Vector3 Quat::getXYZ( ) const
 {
     return Vector3( mVec128 );
 }
 
-inline Quat & Quat::setX( float _x )
+__forceinline Quat & Quat::setX( float _x )
 {
     _vmathVfSetElement(mVec128, _x, 0);
     return *this;
 }
 
-inline Quat & Quat::setX( const floatInVec &_x )
+__forceinline Quat & Quat::setX( const floatInVec &_x )
 {
     mVec128 = _vmathVfInsert(mVec128, _x.get128(), 0);
     return *this;
 }
 
-inline const floatInVec Quat::getX( ) const
+__forceinline const floatInVec Quat::getX( ) const
 {
     return floatInVec( mVec128, 0 );
 }
 
-inline Quat & Quat::setY( float _y )
+__forceinline Quat & Quat::setY( float _y )
 {
     _vmathVfSetElement(mVec128, _y, 1);
     return *this;
 }
 
-inline Quat & Quat::setY( const floatInVec &_y )
+__forceinline Quat & Quat::setY( const floatInVec &_y )
 {
     mVec128 = _vmathVfInsert(mVec128, _y.get128(), 1);
     return *this;
 }
 
-inline const floatInVec Quat::getY( ) const
+__forceinline const floatInVec Quat::getY( ) const
 {
     return floatInVec( mVec128, 1 );
 }
 
-inline Quat & Quat::setZ( float _z )
+__forceinline Quat & Quat::setZ( float _z )
 {
     _vmathVfSetElement(mVec128, _z, 2);
     return *this;
 }
 
-inline Quat & Quat::setZ( const floatInVec &_z )
+__forceinline Quat & Quat::setZ( const floatInVec &_z )
 {
     mVec128 = _vmathVfInsert(mVec128, _z.get128(), 2);
     return *this;
 }
 
-inline const floatInVec Quat::getZ( ) const
+__forceinline const floatInVec Quat::getZ( ) const
 {
     return floatInVec( mVec128, 2 );
 }
 
-inline Quat & Quat::setW( float _w )
+__forceinline Quat & Quat::setW( float _w )
 {
     _vmathVfSetElement(mVec128, _w, 3);
     return *this;
 }
 
-inline Quat & Quat::setW( const floatInVec &_w )
+__forceinline Quat & Quat::setW( const floatInVec &_w )
 {
     mVec128 = _vmathVfInsert(mVec128, _w.get128(), 3);
     return *this;
 }
 
-inline const floatInVec Quat::getW( ) const
+__forceinline const floatInVec Quat::getW( ) const
 {
     return floatInVec( mVec128, 3 );
 }
 
-inline Quat & Quat::setElem( int idx, float value )
+__forceinline Quat & Quat::setElem( int idx, float value )
 {
     _vmathVfSetElement(mVec128, value, idx);
     return *this;
 }
 
-inline Quat & Quat::setElem( int idx, const floatInVec &value )
+__forceinline Quat & Quat::setElem( int idx, const floatInVec &value )
 {
     mVec128 = _vmathVfInsert(mVec128, value.get128(), idx);
     return *this;
 }
 
-inline const floatInVec Quat::getElem( int idx ) const
+__forceinline const floatInVec Quat::getElem( int idx ) const
 {
     return floatInVec( mVec128, idx );
 }
 
-inline VecIdx Quat::operator []( int idx )
+__forceinline VecIdx Quat::operator []( int idx )
 {
     return VecIdx( mVec128, idx );
 }
 
-inline const floatInVec Quat::operator []( int idx ) const
+__forceinline const floatInVec Quat::operator []( int idx ) const
 {
     return floatInVec( mVec128, idx );
 }
 
-inline const Quat Quat::operator +( const Quat &quat ) const
+__forceinline const Quat Quat::operator +( const Quat &quat ) const
 {
     return Quat( _mm_add_ps( mVec128, quat.mVec128 ) );
 }
 
-inline const Quat Quat::operator -( const Quat &quat ) const
+
+__forceinline const Quat Quat::operator -( const Quat &quat ) const
 {
     return Quat( _mm_sub_ps( mVec128, quat.mVec128 ) );
 }
 
-inline const Quat Quat::operator *( float scalar ) const
+__forceinline const Quat Quat::operator *( float scalar ) const
 {
     return *this * floatInVec(scalar);
 }
 
-inline const Quat Quat::operator *( const floatInVec &scalar ) const
+__forceinline const Quat Quat::operator *( const floatInVec &scalar ) const
 {
     return Quat( _mm_mul_ps( mVec128, scalar.get128() ) );
 }
 
-inline Quat & Quat::operator +=( const Quat &quat )
+__forceinline Quat & Quat::operator +=( const Quat &quat )
 {
     *this = *this + quat;
     return *this;
 }
 
-inline Quat & Quat::operator -=( const Quat &quat )
+__forceinline Quat & Quat::operator -=( const Quat &quat )
 {
     *this = *this - quat;
     return *this;
 }
 
-inline Quat & Quat::operator *=( float scalar )
+__forceinline Quat & Quat::operator *=( float scalar )
 {
     *this = *this * scalar;
     return *this;
 }
 
-inline Quat & Quat::operator *=( const floatInVec &scalar )
+__forceinline Quat & Quat::operator *=( const floatInVec &scalar )
 {
     *this = *this * scalar;
     return *this;
 }
 
-inline const Quat Quat::operator /( float scalar ) const
+__forceinline const Quat Quat::operator /( float scalar ) const
 {
     return *this / floatInVec(scalar);
 }
 
-inline const Quat Quat::operator /( const floatInVec &scalar ) const
+__forceinline const Quat Quat::operator /( const floatInVec &scalar ) const
 {
     return Quat( _mm_div_ps( mVec128, scalar.get128() ) );
 }
 
-inline Quat & Quat::operator /=( float scalar )
+__forceinline Quat & Quat::operator /=( float scalar )
 {
     *this = *this / scalar;
     return *this;
 }
 
-inline Quat & Quat::operator /=( const floatInVec &scalar )
+__forceinline Quat & Quat::operator /=( const floatInVec &scalar )
 {
     *this = *this / scalar;
     return *this;
 }
 
-inline const Quat Quat::operator -( ) const
+__forceinline const Quat Quat::operator -( ) const
 {
 	return Quat(_mm_sub_ps( _mm_setzero_ps(), mVec128 ) );
 }
 
-inline const Quat operator *( float scalar, const Quat &quat )
+__forceinline const Quat operator *( float scalar, const Quat &quat )
 {
     return floatInVec(scalar) * quat;
 }
 
-inline const Quat operator *( const floatInVec &scalar, const Quat &quat )
+__forceinline const Quat operator *( const floatInVec &scalar, const Quat &quat )
 {
     return quat * scalar;
 }
 
-inline const floatInVec dot( const Quat &quat0, const Quat &quat1 )
+__forceinline const floatInVec dot( const Quat &quat0, const Quat &quat1 )
 {
     return floatInVec( _vmathVfDot4( quat0.get128(), quat1.get128() ), 0 );
 }
 
-inline const floatInVec norm( const Quat &quat )
+__forceinline const floatInVec norm( const Quat &quat )
 {
     return floatInVec(  _vmathVfDot4( quat.get128(), quat.get128() ), 0 );
 }
 
-inline const floatInVec length( const Quat &quat )
+__forceinline const floatInVec length( const Quat &quat )
 {
     return floatInVec(  _mm_sqrt_ps(_vmathVfDot4( quat.get128(), quat.get128() )), 0 );
 }
 
-inline const Quat normalize( const Quat &quat )
+__forceinline const Quat normalize( const Quat &quat )
 {
-    return Quat( _mm_mul_ps( quat.get128(), _mm_rsqrt_ps( _vmathVfDot4( quat.get128(), quat.get128() ) ) ) );
+	vec_float4 dot =_vmathVfDot4( quat.get128(), quat.get128());
+    return Quat( _mm_mul_ps( quat.get128(), newtonrapson_rsqrt4( dot ) ) );
 }
 
-inline const Quat Quat::rotation( const Vector3 &unitVec0, const Vector3 &unitVec1 )
+
+__forceinline const Quat Quat::rotation( const Vector3 &unitVec0, const Vector3 &unitVec1 )
 {
     Vector3 crossVec;
     __m128 cosAngle, cosAngleX2Plus2, recipCosHalfAngleX2, cosHalfAngleX2, res;
@@ -373,12 +392,12 @@ inline const Quat Quat::rotation( const Vector3 &unitVec0, const Vector3 &unitVe
     return Quat( res );
 }
 
-inline const Quat Quat::rotation( float radians, const Vector3 &unitVec )
+__forceinline const Quat Quat::rotation( float radians, const Vector3 &unitVec )
 {
     return rotation( floatInVec(radians), unitVec );
 }
 
-inline const Quat Quat::rotation( const floatInVec &radians, const Vector3 &unitVec )
+__forceinline const Quat Quat::rotation( const floatInVec &radians, const Vector3 &unitVec )
 {
     __m128 s, c, angle, res;
     angle = vec_mul( radians.get128(), _mm_set1_ps(0.5f) );
@@ -388,12 +407,12 @@ inline const Quat Quat::rotation( const floatInVec &radians, const Vector3 &unit
     return Quat( res );
 }
 
-inline const Quat Quat::rotationX( float radians )
+__forceinline const Quat Quat::rotationX( float radians )
 {
     return rotationX( floatInVec(radians) );
 }
 
-inline const Quat Quat::rotationX( const floatInVec &radians )
+__forceinline const Quat Quat::rotationX( const floatInVec &radians )
 {
     __m128 s, c, angle, res;
     angle = vec_mul( radians.get128(), _mm_set1_ps(0.5f) );
@@ -405,12 +424,12 @@ inline const Quat Quat::rotationX( const floatInVec &radians )
     return Quat( res );
 }
 
-inline const Quat Quat::rotationY( float radians )
+__forceinline const Quat Quat::rotationY( float radians )
 {
     return rotationY( floatInVec(radians) );
 }
 
-inline const Quat Quat::rotationY( const floatInVec &radians )
+__forceinline const Quat Quat::rotationY( const floatInVec &radians )
 {
     __m128 s, c, angle, res;
     angle = vec_mul( radians.get128(), _mm_set1_ps(0.5f) );
@@ -422,12 +441,12 @@ inline const Quat Quat::rotationY( const floatInVec &radians )
     return Quat( res );
 }
 
-inline const Quat Quat::rotationZ( float radians )
+__forceinline const Quat Quat::rotationZ( float radians )
 {
     return rotationZ( floatInVec(radians) );
 }
 
-inline const Quat Quat::rotationZ( const floatInVec &radians )
+__forceinline const Quat Quat::rotationZ( const floatInVec &radians )
 {
     __m128 s, c, angle, res;
     angle = vec_mul( radians.get128(), _mm_set1_ps(0.5f) );
@@ -439,7 +458,7 @@ inline const Quat Quat::rotationZ( const floatInVec &radians )
     return Quat( res );
 }
 
-inline const Quat Quat::operator *( const Quat &quat ) const
+__forceinline const Quat Quat::operator *( const Quat &quat ) const
 {
     __m128 ldata, rdata, qv, tmp0, tmp1, tmp2, tmp3;
     __m128 product, l_wxyz, r_wxyz, xy, qw;
@@ -463,13 +482,13 @@ inline const Quat Quat::operator *( const Quat &quat ) const
     return Quat( vec_sel( qv, qw, sw ) );
 }
 
-inline Quat & Quat::operator *=( const Quat &quat )
+__forceinline Quat & Quat::operator *=( const Quat &quat )
 {
     *this = *this * quat;
     return *this;
 }
 
-inline const Vector3 rotate( const Quat &quat, const Vector3 &vec )
+__forceinline const Vector3 rotate( const Quat &quat, const Vector3 &vec )
 {    __m128 qdata, vdata, product, tmp0, tmp1, tmp2, tmp3, wwww, qv, qw, res;
     qdata = quat.get128();
     vdata = vec.get128();
@@ -493,32 +512,48 @@ inline const Vector3 rotate( const Quat &quat, const Vector3 &vec )
     return Vector3( res );
 }
 
-inline const Quat conj( const Quat &quat )
+__forceinline const Quat conj( const Quat &quat )
 {
 	__declspec(align(16)) unsigned int sw[4] = {0x80000000,0x80000000,0x80000000,0};
     return Quat( vec_xor( quat.get128(), _mm_load_ps((float *)sw) ) );
 }
 
-inline const Quat select( const Quat &quat0, const Quat &quat1, bool select1 )
+__forceinline const Quat select( const Quat &quat0, const Quat &quat1, bool select1 )
 {
     return select( quat0, quat1, boolInVec(select1) );
 }
 
-inline const Quat select( const Quat &quat0, const Quat &quat1, const boolInVec &select1 )
+//__forceinline const Quat select( const Quat &quat0, const Quat &quat1, const boolInVec &select1 )
+//{
+//    return Quat( vec_sel( quat0.get128(), quat1.get128(), select1.get128() ) );
+//}
+
+// Loads four consecutive floats (x, y, z, w) starting at fptr into quat.
+__forceinline void loadXYZW(Quat& quat, const float* fptr)
 {
-    return Quat( vec_sel( quat0.get128(), quat1.get128(), select1.get128() ) );
+#ifdef USE_SSE2_LDDQU
+    // Opt-in path: one unaligned 128-bit integer load, viewed as floats through SSEFloat.
+    const __m128i rawBits = _mm_lddqu_si128( (const __m128i*)fptr );
+    quat = Quat( SSEFloat( rawBits ).m128 );
+#else
+    // Default path: copy the four lanes one float at a time through the SSEFloat union.
+    SSEFloat lanes;
+    for ( int lane = 0; lane < 4; ++lane )
+        lanes.f[lane] = fptr[lane];
+    quat = Quat( lanes.m128 );
+#endif
 }
 
 #ifdef _VECTORMATH_DEBUG
 
-inline void print( const Quat &quat )
+__forceinline void print( const Quat &quat )
 {
     union { __m128 v; float s[4]; } tmp;
     tmp.v = quat.get128();
     printf( "( %f %f %f %f )\n", tmp.s[0], tmp.s[1], tmp.s[2], tmp.s[3] );
 }
 
-inline void print( const Quat &quat, const char * name )
+__forceinline void print( const Quat &quat, const char * name )
 {
     union { __m128 v; float s[4]; } tmp;
     tmp.v = quat.get128();
diff --git a/Extras/vectormathlibrary/include/vectormath/SSE/cpp/vec_aos.h b/Extras/vectormathlibrary/include/vectormath/SSE/cpp/vec_aos.h
index 4fe957928..98f560738 100644
--- a/Extras/vectormathlibrary/include/vectormath/SSE/cpp/vec_aos.h
+++ b/Extras/vectormathlibrary/include/vectormath/SSE/cpp/vec_aos.h
@@ -1,5 +1,5 @@
 /*
-   Copyright (C) 2006, 2007 Sony Computer Entertainment Inc.
+   Copyright (C) 2006, 2010 Sony Computer Entertainment Inc.
    All rights reserved.
 
    Redistribution and use in source and binary forms,
@@ -58,6 +58,7 @@
 #define _VECTORMATH_UNIT_0010 _mm_setr_ps(0.0f,0.0f,1.0f,0.0f) // (__m128){ 0.0f, 0.0f, 1.0f, 0.0f }
 #define _VECTORMATH_UNIT_0001 _mm_setr_ps(0.0f,0.0f,0.0f,1.0f) // (__m128){ 0.0f, 0.0f, 0.0f, 1.0f }
 #define _VECTORMATH_SLERP_TOL 0.999f
+//_VECTORMATH_SLERP_TOLF
 
 //-----------------------------------------------------------------------------
 // Definitions
@@ -65,13 +66,14 @@
 #ifndef _VECTORMATH_INTERNAL_FUNCTIONS
 #define _VECTORMATH_INTERNAL_FUNCTIONS
 
-static inline __m128 _vmathVfDot3( __m128 vec0, __m128 vec1 )
+#define     _vmath_shufps(a, b, immx, immy, immz, immw) _mm_shuffle_ps(a, b, _MM_SHUFFLE(immw, immz, immy, immx))
+static __forceinline __m128 _vmathVfDot3( __m128 vec0, __m128 vec1 )
 {
-    __m128 result = _mm_mul_ps( vec0, vec1);
+	__m128 result = _mm_mul_ps( vec0, vec1);
     return _mm_add_ps( vec_splat( result, 0 ), _mm_add_ps( vec_splat( result, 1 ), vec_splat( result, 2 ) ) );
 }
 
-static inline __m128 _vmathVfDot4( __m128 vec0, __m128 vec1 )
+static __forceinline __m128 _vmathVfDot4( __m128 vec0, __m128 vec1 )
 {
     __m128 result = _mm_mul_ps(vec0, vec1);
 	return _mm_add_ps(_mm_shuffle_ps(result, result, _MM_SHUFFLE(0,0,0,0)),
@@ -79,7 +81,7 @@ static inline __m128 _vmathVfDot4( __m128 vec0, __m128 vec1 )
 			_mm_add_ps(_mm_shuffle_ps(result, result, _MM_SHUFFLE(2,2,2,2)), _mm_shuffle_ps(result, result, _MM_SHUFFLE(3,3,3,3)))));
 }
 
-static inline __m128 _vmathVfCross( __m128 vec0, __m128 vec1 )
+static __forceinline __m128 _vmathVfCross( __m128 vec0, __m128 vec1 )
 {
     __m128 tmp0, tmp1, tmp2, tmp3, result;
     tmp0 = _mm_shuffle_ps( vec0, vec0, _MM_SHUFFLE(3,0,2,1) );
@@ -91,7 +93,7 @@ static inline __m128 _vmathVfCross( __m128 vec0, __m128 vec1 )
     return result;
 }
 /*
-static inline vec_uint4 _vmathVfToHalfFloatsUnpacked(__m128 v)
+static __forceinline vec_uint4 _vmathVfToHalfFloatsUnpacked(__m128 v)
 {
 #if 0
     vec_int4 bexp;
@@ -123,7 +125,7 @@ static inline vec_uint4 _vmathVfToHalfFloatsUnpacked(__m128 v)
 #endif
 }
 
-static inline vec_ushort8 _vmath2VfToHalfFloats(__m128 u, __m128 v)
+static __forceinline vec_ushort8 _vmath2VfToHalfFloats(__m128 u, __m128 v)
 {
 #if 0
     vec_uint4 hfloat_u, hfloat_v;
@@ -138,7 +140,7 @@ static inline vec_ushort8 _vmath2VfToHalfFloats(__m128 u, __m128 v)
 }
 */
 
-static inline __m128 _vmathVfInsert(__m128 dst, __m128 src, int slot)
+static __forceinline __m128 _vmathVfInsert(__m128 dst, __m128 src, int slot)
 {
 	SSEFloat s;
 	s.m128 = src;
@@ -150,7 +152,7 @@ static inline __m128 _vmathVfInsert(__m128 dst, __m128 src, int slot)
 
 #define _vmathVfSetElement(vec, scalar, slot) ((float *)&(vec))[slot] = scalar
 
-static inline __m128 _vmathVfSplatScalar(float scalar)
+static __forceinline __m128 _vmathVfSplatScalar(float scalar)
 {
 	return _mm_set1_ps(scalar);
 }
@@ -160,48 +162,49 @@ static inline __m128 _vmathVfSplatScalar(float scalar)
 namespace Vectormath {
 namespace Aos {
 
+	
 #ifdef _VECTORMATH_NO_SCALAR_CAST
-inline VecIdx::operator floatInVec() const
+__forceinline VecIdx::operator floatInVec() const
 {
     return floatInVec(ref, i);
 }
 
-inline float VecIdx::getAsFloat() const
+__forceinline float VecIdx::getAsFloat() const
 #else
-inline VecIdx::operator float() const
+__forceinline VecIdx::operator float() const
 #endif
 {
     return ((float *)&ref)[i];
 }
 
-inline float VecIdx::operator =( float scalar )
+__forceinline float VecIdx::operator =( float scalar )
 {
     _vmathVfSetElement(ref, scalar, i);
     return scalar;
 }
 
-inline floatInVec VecIdx::operator =( const floatInVec &scalar )
+__forceinline floatInVec VecIdx::operator =( const floatInVec &scalar )
 {
     ref = _vmathVfInsert(ref, scalar.get128(), i);
     return scalar;
 }
 
-inline floatInVec VecIdx::operator =( const VecIdx& scalar )
+__forceinline floatInVec VecIdx::operator =( const VecIdx& scalar )
 {
     return *this = floatInVec(scalar.ref, scalar.i);
 }
 
-inline floatInVec VecIdx::operator *=( float scalar )
+__forceinline floatInVec VecIdx::operator *=( float scalar )
 {
     return *this *= floatInVec(scalar);
 }
 
-inline floatInVec VecIdx::operator *=( const floatInVec &scalar )
+__forceinline floatInVec VecIdx::operator *=( const floatInVec &scalar )
 {
     return *this = floatInVec(ref, i) * scalar;
 }
 
-inline floatInVec VecIdx::operator /=( float scalar )
+__forceinline floatInVec VecIdx::operator /=( float scalar )
 {
     return *this /= floatInVec(scalar);
 }
@@ -211,88 +214,99 @@ inline floatInVec VecIdx::operator /=( const floatInVec &scalar )
     return *this = floatInVec(ref, i) / scalar;
 }
 
-inline floatInVec VecIdx::operator +=( float scalar )
+__forceinline floatInVec VecIdx::operator +=( float scalar )
 {
     return *this += floatInVec(scalar);
 }
 
-inline floatInVec VecIdx::operator +=( const floatInVec &scalar )
+__forceinline floatInVec VecIdx::operator +=( const floatInVec &scalar )
 {
     return *this = floatInVec(ref, i) + scalar;
 }
 
-inline floatInVec VecIdx::operator -=( float scalar )
+__forceinline floatInVec VecIdx::operator -=( float scalar )
 {
     return *this -= floatInVec(scalar);
 }
 
-inline floatInVec VecIdx::operator -=( const floatInVec &scalar )
+__forceinline floatInVec VecIdx::operator -=( const floatInVec &scalar )
 {
     return *this = floatInVec(ref, i) - scalar;
 }
 
-inline Vector3::Vector3( float _x, float _y, float _z )
+// Copy constructor: takes over the source vector's whole 128-bit value.
+__forceinline Vector3::Vector3(const Vector3& vec)
+{
+    mVec128 = vec.mVec128;
+}
+// Replaces the vector's entire 128-bit value (all four lanes) with vec.
+__forceinline void Vector3::set128(vec_float4 vec)
+{
+    mVec128 = vec;
+}
+
+__forceinline Vector3::Vector3( float _x, float _y, float _z )
 {
     mVec128 = _mm_setr_ps(_x, _y, _z, 0.0f);
 }
 
-inline Vector3::Vector3( const floatInVec &_x, const floatInVec &_y, const floatInVec &_z )
+__forceinline Vector3::Vector3( const floatInVec &_x, const floatInVec &_y, const floatInVec &_z )
 {
 	__m128 xz = _mm_unpacklo_ps( _x.get128(), _z.get128() );
 	mVec128 = _mm_unpacklo_ps( xz, _y.get128() );
 }
 
-inline Vector3::Vector3( const Point3 &pnt )
+__forceinline Vector3::Vector3( const Point3 &pnt )
 {
     mVec128 = pnt.get128();
 }
 
-inline Vector3::Vector3( float scalar )
+__forceinline Vector3::Vector3( float scalar )
 {
     mVec128 = floatInVec(scalar).get128();
 }
 
-inline Vector3::Vector3( const floatInVec &scalar )
+__forceinline Vector3::Vector3( const floatInVec &scalar )
 {
     mVec128 = scalar.get128();
 }
 
-inline Vector3::Vector3( __m128 vf4 )
+__forceinline Vector3::Vector3( __m128 vf4 )
 {
     mVec128 = vf4;
 }
 
-inline const Vector3 Vector3::xAxis( )
+__forceinline const Vector3 Vector3::xAxis( )
 {
     return Vector3( _VECTORMATH_UNIT_1000 );
 }
 
-inline const Vector3 Vector3::yAxis( )
+__forceinline const Vector3 Vector3::yAxis( )
 {
     return Vector3( _VECTORMATH_UNIT_0100 );
 }
 
-inline const Vector3 Vector3::zAxis( )
+__forceinline const Vector3 Vector3::zAxis( )
 {
     return Vector3( _VECTORMATH_UNIT_0010 );
 }
 
-inline const Vector3 lerp( float t, const Vector3 &vec0, const Vector3 &vec1 )
+__forceinline const Vector3 lerp( float t, const Vector3 &vec0, const Vector3 &vec1 )
 {
     return lerp( floatInVec(t), vec0, vec1 );
 }
 
-inline const Vector3 lerp( const floatInVec &t, const Vector3 &vec0, const Vector3 &vec1 )
+__forceinline const Vector3 lerp( const floatInVec &t, const Vector3 &vec0, const Vector3 &vec1 )
 {
     return ( vec0 + ( ( vec1 - vec0 ) * t ) );
 }
 
-inline const Vector3 slerp( float t, const Vector3 &unitVec0, const Vector3 &unitVec1 )
+__forceinline const Vector3 slerp( float t, const Vector3 &unitVec0, const Vector3 &unitVec1 )
 {
     return slerp( floatInVec(t), unitVec0, unitVec1 );
 }
 
-inline const Vector3 slerp( const floatInVec &t, const Vector3 &unitVec0, const Vector3 &unitVec1 )
+__forceinline const Vector3 slerp( const floatInVec &t, const Vector3 &unitVec0, const Vector3 &unitVec1 )
 {
     __m128 scales, scale0, scale1, cosAngle, angle, tttt, oneMinusT, angles, sines;
     cosAngle = _vmathVfDot3( unitVec0.get128(), unitVec1.get128() );
@@ -310,12 +324,27 @@ inline const Vector3 slerp( const floatInVec &t, const Vector3 &unitVec0, const
     return Vector3( vec_madd( unitVec0.get128(), scale0, _mm_mul_ps( unitVec1.get128(), scale1 ) ) );
 }
 
-inline __m128 Vector3::get128( ) const
+__forceinline __m128 Vector3::get128( ) const
 {
     return mVec128;
 }
 
-inline void storeXYZ( const Vector3 &vec, __m128 * quad )
+// Loads x, y, z from fptr into vec; the default path zeroes the padding lane, as Vector3(x, y, z) does.
+__forceinline void loadXYZ(Vector3& vec, const float* fptr)
+{
+#ifdef USE_SSE2_LDDQU	// opt-in 16-byte load: this path still reads fptr[3], so four floats must be readable
+	vec = Vector3(	SSEFloat(_mm_lddqu_si128((const __m128i*)((float*)(fptr)))).m128 );
+#else
+	SSEFloat fl;
+	fl.f[0] = fptr[0];
+	fl.f[1] = fptr[1];
+	fl.f[2] = fptr[2];
+	fl.f[3] = 0.0f;	// was fptr[3], one float past the x, y, z triple
+    vec = Vector3(	fl.m128);
+#endif //USE_SSE2_LDDQU
+}
+
+__forceinline void storeXYZ( const Vector3 &vec, __m128 * quad )
 {
     __m128 dstVec = *quad;
 	__declspec(align(16)) unsigned int sw[4] = {0, 0, 0, 0xffffffff}; // TODO: Centralize
@@ -323,7 +352,15 @@ inline void storeXYZ( const Vector3 &vec, __m128 * quad )
     *quad = dstVec;
 }
 
-inline void loadXYZArray( Vector3 & vec0, Vector3 & vec1, Vector3 & vec2, Vector3 & vec3, const __m128 * threeQuads )
+// Writes the x, y and z lanes of vec to fptr[0], fptr[1] and fptr[2]; no fourth float is written.
+__forceinline void storeXYZ(const Vector3& vec, float* fptr)
+{
+	for ( int axis = 0; axis < 3; ++axis )
+		fptr[axis] = vec.getElem( axis );
+}
+
+
+__forceinline void loadXYZArray( Vector3 & vec0, Vector3 & vec1, Vector3 & vec2, Vector3 & vec3, const __m128 * threeQuads )
 {
 	const float *quads = (float *)threeQuads;
     vec0 = Vector3(  _mm_load_ps(quads) );
@@ -332,7 +369,7 @@ inline void loadXYZArray( Vector3 & vec0, Vector3 & vec1, Vector3 & vec2, Vector
     vec3 = Vector3( _mm_loadu_ps(quads + 9) );
 }
 
-inline void storeXYZArray( const Vector3 &vec0, const Vector3 &vec1, const Vector3 &vec2, const Vector3 &vec3, __m128 * threeQuads )
+__forceinline void storeXYZArray( const Vector3 &vec0, const Vector3 &vec1, const Vector3 &vec2, const Vector3 &vec3, __m128 * threeQuads )
 {
 	__m128 xxxx = _mm_shuffle_ps( vec1.get128(), vec1.get128(), _MM_SHUFFLE(0, 0, 0, 0) );
 	__m128 zzzz = _mm_shuffle_ps( vec2.get128(), vec2.get128(), _MM_SHUFFLE(2, 2, 2, 2) );
@@ -343,7 +380,7 @@ inline void storeXYZArray( const Vector3 &vec0, const Vector3 &vec1, const Vecto
     threeQuads[2] = vec_sel( _mm_shuffle_ps( vec3.get128(), vec3.get128(), _MM_SHUFFLE(2, 1, 0, 3) ), zzzz, zsw );
 }
 /*
-inline void storeHalfFloats( const Vector3 &vec0, const Vector3 &vec1, const Vector3 &vec2, const Vector3 &vec3, const Vector3 &vec4, const Vector3 &vec5, const Vector3 &vec6, const Vector3 &vec7, vec_ushort8 * threeQuads )
+__forceinline void storeHalfFloats( const Vector3 &vec0, const Vector3 &vec1, const Vector3 &vec2, const Vector3 &vec3, const Vector3 &vec4, const Vector3 &vec5, const Vector3 &vec6, const Vector3 &vec7, vec_ushort8 * threeQuads )
 {
 	assert(0);
 #if 0
@@ -357,197 +394,201 @@ inline void storeHalfFloats( const Vector3 &vec0, const Vector3 &vec1, const Vec
 #endif
 }
 */
-inline Vector3 & Vector3::operator =( const Vector3 &vec )
+__forceinline Vector3 & Vector3::operator =( const Vector3 &vec )
 {
     mVec128 = vec.mVec128;
     return *this;
 }
 
-inline Vector3 & Vector3::setX( float _x )
+__forceinline Vector3 & Vector3::setX( float _x )
 {
     _vmathVfSetElement(mVec128, _x, 0);
     return *this;
 }
 
-inline Vector3 & Vector3::setX( const floatInVec &_x )
+__forceinline Vector3 & Vector3::setX( const floatInVec &_x )
 {
     mVec128 = _vmathVfInsert(mVec128, _x.get128(), 0);
     return *this;
 }
 
-inline const floatInVec Vector3::getX( ) const
+__forceinline const floatInVec Vector3::getX( ) const
 {
     return floatInVec( mVec128, 0 );
 }
 
-inline Vector3 & Vector3::setY( float _y )
+__forceinline Vector3 & Vector3::setY( float _y )
 {
     _vmathVfSetElement(mVec128, _y, 1);
     return *this;
 }
 
-inline Vector3 & Vector3::setY( const floatInVec &_y )
+__forceinline Vector3 & Vector3::setY( const floatInVec &_y )
 {
     mVec128 = _vmathVfInsert(mVec128, _y.get128(), 1);
     return *this;
 }
 
-inline const floatInVec Vector3::getY( ) const
+__forceinline const floatInVec Vector3::getY( ) const
 {
     return floatInVec( mVec128, 1 );
 }
 
-inline Vector3 & Vector3::setZ( float _z )
+__forceinline Vector3 & Vector3::setZ( float _z )
 {
     _vmathVfSetElement(mVec128, _z, 2);
     return *this;
 }
 
-inline Vector3 & Vector3::setZ( const floatInVec &_z )
+__forceinline Vector3 & Vector3::setZ( const floatInVec &_z )
 {
     mVec128 = _vmathVfInsert(mVec128, _z.get128(), 2);
     return *this;
 }
 
-inline const floatInVec Vector3::getZ( ) const
+__forceinline const floatInVec Vector3::getZ( ) const
 {
     return floatInVec( mVec128, 2 );
 }
 
-inline Vector3 & Vector3::setElem( int idx, float value )
+__forceinline Vector3 & Vector3::setElem( int idx, float value )
 {
     _vmathVfSetElement(mVec128, value, idx);
     return *this;
 }
 
-inline Vector3 & Vector3::setElem( int idx, const floatInVec &value )
+__forceinline Vector3 & Vector3::setElem( int idx, const floatInVec &value )
 {
     mVec128 = _vmathVfInsert(mVec128, value.get128(), idx);
     return *this;
 }
 
-inline const floatInVec Vector3::getElem( int idx ) const
+__forceinline const floatInVec Vector3::getElem( int idx ) const
 {
     return floatInVec( mVec128, idx );
 }
 
-inline VecIdx Vector3::operator []( int idx )
+__forceinline VecIdx Vector3::operator []( int idx )
 {
     return VecIdx( mVec128, idx );
 }
 
-inline const floatInVec Vector3::operator []( int idx ) const
+__forceinline const floatInVec Vector3::operator []( int idx ) const
 {
     return floatInVec( mVec128, idx );
 }
 
-inline const Vector3 Vector3::operator +( const Vector3 &vec ) const
+__forceinline const Vector3 Vector3::operator +( const Vector3 &vec ) const
 {
     return Vector3( _mm_add_ps( mVec128, vec.mVec128 ) );
 }
 
-inline const Vector3 Vector3::operator -( const Vector3 &vec ) const
+__forceinline const Vector3 Vector3::operator -( const Vector3 &vec ) const
 {
     return Vector3( _mm_sub_ps( mVec128, vec.mVec128 ) );
 }
 
-inline const Point3 Vector3::operator +( const Point3 &pnt ) const
+__forceinline const Point3 Vector3::operator +( const Point3 &pnt ) const
 {
     return Point3( _mm_add_ps( mVec128, pnt.get128() ) );
 }
 
-inline const Vector3 Vector3::operator *( float scalar ) const
+__forceinline const Vector3 Vector3::operator *( float scalar ) const
 {
     return *this * floatInVec(scalar);
 }
 
-inline const Vector3 Vector3::operator *( const floatInVec &scalar ) const
+__forceinline const Vector3 Vector3::operator *( const floatInVec &scalar ) const
 {
     return Vector3( _mm_mul_ps( mVec128, scalar.get128() ) );
 }
 
-inline Vector3 & Vector3::operator +=( const Vector3 &vec )
+__forceinline Vector3 & Vector3::operator +=( const Vector3 &vec )
 {
     *this = *this + vec;
     return *this;
 }
 
-inline Vector3 & Vector3::operator -=( const Vector3 &vec )
+__forceinline Vector3 & Vector3::operator -=( const Vector3 &vec )
 {
     *this = *this - vec;
     return *this;
 }
 
-inline Vector3 & Vector3::operator *=( float scalar )
+__forceinline Vector3 & Vector3::operator *=( float scalar )
 {
     *this = *this * scalar;
     return *this;
 }
 
-inline Vector3 & Vector3::operator *=( const floatInVec &scalar )
+__forceinline Vector3 & Vector3::operator *=( const floatInVec &scalar )
 {
     *this = *this * scalar;
     return *this;
 }
 
-inline const Vector3 Vector3::operator /( float scalar ) const
+__forceinline const Vector3 Vector3::operator /( float scalar ) const
 {
     return *this / floatInVec(scalar);
 }
 
-inline const Vector3 Vector3::operator /( const floatInVec &scalar ) const
+__forceinline const Vector3 Vector3::operator /( const floatInVec &scalar ) const
 {
     return Vector3( _mm_div_ps( mVec128, scalar.get128() ) );
 }
 
-inline Vector3 & Vector3::operator /=( float scalar )
+__forceinline Vector3 & Vector3::operator /=( float scalar )
 {
     *this = *this / scalar;
     return *this;
 }
 
-inline Vector3 & Vector3::operator /=( const floatInVec &scalar )
+__forceinline Vector3 & Vector3::operator /=( const floatInVec &scalar )
 {
     *this = *this / scalar;
     return *this;
 }
 
-inline const Vector3 Vector3::operator -( ) const
+__forceinline const Vector3 Vector3::operator -( ) const
 {
-	return Vector3(_mm_sub_ps( _mm_setzero_ps(), mVec128 ) );
+	// Negate by flipping the sign bit of every lane (XOR with 0x80000000) instead of computing 0 - v.
+	// The mask is unsigned: 0x80000000 does not fit in int, so a braced int array is a narrowing error in C++11.
+	__declspec(align(16)) static const unsigned int array[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
+	__m128 NEG_MASK = SSEFloat(*(const vec_float4*)array).vf;
+	return Vector3(_mm_xor_ps(get128(),NEG_MASK));
 }
 
-inline const Vector3 operator *( float scalar, const Vector3 &vec )
+__forceinline const Vector3 operator *( float scalar, const Vector3 &vec )
 {
     return floatInVec(scalar) * vec;
 }
 
-inline const Vector3 operator *( const floatInVec &scalar, const Vector3 &vec )
+__forceinline const Vector3 operator *( const floatInVec &scalar, const Vector3 &vec )
 {
     return vec * scalar;
 }
 
-inline const Vector3 mulPerElem( const Vector3 &vec0, const Vector3 &vec1 )
+__forceinline const Vector3 mulPerElem( const Vector3 &vec0, const Vector3 &vec1 )
 {
     return Vector3( _mm_mul_ps( vec0.get128(), vec1.get128() ) );
 }
 
-inline const Vector3 divPerElem( const Vector3 &vec0, const Vector3 &vec1 )
+__forceinline const Vector3 divPerElem( const Vector3 &vec0, const Vector3 &vec1 )
 {
     return Vector3( _mm_div_ps( vec0.get128(), vec1.get128() ) );
 }
 
-inline const Vector3 recipPerElem( const Vector3 &vec )
+__forceinline const Vector3 recipPerElem( const Vector3 &vec )
 {
     return Vector3( _mm_rcp_ps( vec.get128() ) );
 }
 
-inline const Vector3 absPerElem( const Vector3 &vec )
+__forceinline const Vector3 absPerElem( const Vector3 &vec )
 {
     return Vector3( fabsf4( vec.get128() ) );
 }
 
-inline const Vector3 copySignPerElem( const Vector3 &vec0, const Vector3 &vec1 )
+__forceinline const Vector3 copySignPerElem( const Vector3 &vec0, const Vector3 &vec1 )
 {
 	__m128 vmask = toM128(0x7fffffff);
 	return Vector3( _mm_or_ps(
@@ -555,82 +596,83 @@ inline const Vector3 copySignPerElem( const Vector3 &vec0, const Vector3 &vec1 )
 		_mm_andnot_ps( vmask, vec1.get128() ) ) );		// Signs
 }
 
-inline const Vector3 maxPerElem( const Vector3 &vec0, const Vector3 &vec1 )
+__forceinline const Vector3 maxPerElem( const Vector3 &vec0, const Vector3 &vec1 )
 {
     return Vector3( _mm_max_ps( vec0.get128(), vec1.get128() ) );
 }
 
-inline const floatInVec maxElem( const Vector3 &vec )
+__forceinline const floatInVec maxElem( const Vector3 &vec )
 {
     return floatInVec( _mm_max_ps( _mm_max_ps( vec_splat( vec.get128(), 0 ), vec_splat( vec.get128(), 1 ) ), vec_splat( vec.get128(), 2 ) ) );
 }
 
-inline const Vector3 minPerElem( const Vector3 &vec0, const Vector3 &vec1 )
+__forceinline const Vector3 minPerElem( const Vector3 &vec0, const Vector3 &vec1 )
 {
     return Vector3( _mm_min_ps( vec0.get128(), vec1.get128() ) );
 }
 
-inline const floatInVec minElem( const Vector3 &vec )
+__forceinline const floatInVec minElem( const Vector3 &vec )
 {
     return floatInVec( _mm_min_ps( _mm_min_ps( vec_splat( vec.get128(), 0 ), vec_splat( vec.get128(), 1 ) ), vec_splat( vec.get128(), 2 ) ) );
 }
 
-inline const floatInVec sum( const Vector3 &vec )
+__forceinline const floatInVec sum( const Vector3 &vec )
 {
     return floatInVec( _mm_add_ps( _mm_add_ps( vec_splat( vec.get128(), 0 ), vec_splat( vec.get128(), 1 ) ), vec_splat( vec.get128(), 2 ) ) );
 }
 
-inline const floatInVec dot( const Vector3 &vec0, const Vector3 &vec1 )
+__forceinline const floatInVec dot( const Vector3 &vec0, const Vector3 &vec1 )
 {
     return floatInVec( _vmathVfDot3( vec0.get128(), vec1.get128() ), 0 );
 }
 
-inline const floatInVec lengthSqr( const Vector3 &vec )
+__forceinline const floatInVec lengthSqr( const Vector3 &vec )
 {
     return floatInVec(  _vmathVfDot3( vec.get128(), vec.get128() ), 0 );
 }
 
-inline const floatInVec length( const Vector3 &vec )
+__forceinline const floatInVec length( const Vector3 &vec )
 {
     return floatInVec(  _mm_sqrt_ps(_vmathVfDot3( vec.get128(), vec.get128() )), 0 );
 }
 
 
-inline const Vector3 normalizeApprox( const Vector3 &vec )
+__forceinline const Vector3 normalizeApprox( const Vector3 &vec )
 {
     return Vector3( _mm_mul_ps( vec.get128(), _mm_rsqrt_ps( _vmathVfDot3( vec.get128(), vec.get128() ) ) ) );
 }
 
-inline const Vector3 normalize( const Vector3 &vec )
+__forceinline const Vector3 normalize( const Vector3 &vec )
 {
 	return Vector3( _mm_mul_ps( vec.get128(), newtonrapson_rsqrt4( _vmathVfDot3( vec.get128(), vec.get128() ) ) ) );
 }
 
-inline const Vector3 cross( const Vector3 &vec0, const Vector3 &vec1 )
+__forceinline const Vector3 cross( const Vector3 &vec0, const Vector3 &vec1 )
 {
     return Vector3( _vmathVfCross( vec0.get128(), vec1.get128() ) );
 }
 
-inline const Vector3 select( const Vector3 &vec0, const Vector3 &vec1, bool select1 )
+__forceinline const Vector3 select( const Vector3 &vec0, const Vector3 &vec1, bool select1 )
 {
     return select( vec0, vec1, boolInVec(select1) );
 }
 
-inline const Vector3 select( const Vector3 &vec0, const Vector3 &vec1, const boolInVec &select1 )
+// NOTE(review): replaces the Vector3/boolInVec select that select(Vector3, Vector3, bool) above forwards to - confirm that overload is still defined elsewhere.
+__forceinline  const Vector4 select(const Vector4& vec0, const Vector4& vec1, const boolInVec& select1)
 {
-	return Vector3(vec_sel( vec0.get128(), vec1.get128(), select1.get128() ));
+    return Vector4(vec_sel(vec0.get128(), vec1.get128(), select1.get128()));
 }
 
 #ifdef _VECTORMATH_DEBUG
 
-inline void print( const Vector3 &vec )
+__forceinline void print( const Vector3 &vec )
 {
     union { __m128 v; float s[4]; } tmp;
     tmp.v = vec.get128();
     printf( "( %f %f %f )\n", tmp.s[0], tmp.s[1], tmp.s[2] );
 }
 
-inline void print( const Vector3 &vec, const char * name )
+__forceinline void print( const Vector3 &vec, const char * name )
 {
     union { __m128 v; float s[4]; } tmp;
     tmp.v = vec.get128();
@@ -639,98 +681,98 @@ inline void print( const Vector3 &vec, const char * name )
 
 #endif
 
-inline Vector4::Vector4( float _x, float _y, float _z, float _w )
+__forceinline Vector4::Vector4( float _x, float _y, float _z, float _w )
 {
     mVec128 = _mm_setr_ps(_x, _y, _z, _w); 
  }
 
-inline Vector4::Vector4( const floatInVec &_x, const floatInVec &_y, const floatInVec &_z, const floatInVec &_w )
+__forceinline Vector4::Vector4( const floatInVec &_x, const floatInVec &_y, const floatInVec &_z, const floatInVec &_w )
 {
 	mVec128 = _mm_unpacklo_ps(
 		_mm_unpacklo_ps( _x.get128(), _z.get128() ),
 		_mm_unpacklo_ps( _y.get128(), _w.get128() ) );
 }
 
-inline Vector4::Vector4( const Vector3 &xyz, float _w )
+__forceinline Vector4::Vector4( const Vector3 &xyz, float _w )
 {
     mVec128 = xyz.get128();
     _vmathVfSetElement(mVec128, _w, 3);
 }
 
-inline Vector4::Vector4( const Vector3 &xyz, const floatInVec &_w )
+__forceinline Vector4::Vector4( const Vector3 &xyz, const floatInVec &_w )
 {
     mVec128 = xyz.get128();
     mVec128 = _vmathVfInsert(mVec128, _w.get128(), 3);
 }
 
-inline Vector4::Vector4( const Vector3 &vec )
+__forceinline Vector4::Vector4( const Vector3 &vec )
 {
     mVec128 = vec.get128();
     mVec128 = _vmathVfInsert(mVec128, _mm_setzero_ps(), 3);
 }
 
-inline Vector4::Vector4( const Point3 &pnt )
+__forceinline Vector4::Vector4( const Point3 &pnt )
 {
     mVec128 = pnt.get128();
     mVec128 = _vmathVfInsert(mVec128, _mm_set1_ps(1.0f), 3);
 }
 
-inline Vector4::Vector4( const Quat &quat )
+__forceinline Vector4::Vector4( const Quat &quat )
 {
     mVec128 = quat.get128();
 }
 
-inline Vector4::Vector4( float scalar )
+__forceinline Vector4::Vector4( float scalar )
 {
     mVec128 = floatInVec(scalar).get128();
 }
 
-inline Vector4::Vector4( const floatInVec &scalar )
+__forceinline Vector4::Vector4( const floatInVec &scalar )
 {
     mVec128 = scalar.get128();
 }
 
-inline Vector4::Vector4( __m128 vf4 )
+__forceinline Vector4::Vector4( __m128 vf4 )
 {
     mVec128 = vf4;
 }
 
-inline const Vector4 Vector4::xAxis( )
+__forceinline const Vector4 Vector4::xAxis( )
 {
     return Vector4( _VECTORMATH_UNIT_1000 );
 }
 
-inline const Vector4 Vector4::yAxis( )
+__forceinline const Vector4 Vector4::yAxis( )
 {
     return Vector4( _VECTORMATH_UNIT_0100 );
 }
 
-inline const Vector4 Vector4::zAxis( )
+__forceinline const Vector4 Vector4::zAxis( )
 {
     return Vector4( _VECTORMATH_UNIT_0010 );
 }
 
-inline const Vector4 Vector4::wAxis( )
+__forceinline const Vector4 Vector4::wAxis( )
 {
     return Vector4( _VECTORMATH_UNIT_0001 );
 }
 
-inline const Vector4 lerp( float t, const Vector4 &vec0, const Vector4 &vec1 )
+__forceinline const Vector4 lerp( float t, const Vector4 &vec0, const Vector4 &vec1 )
 {
     return lerp( floatInVec(t), vec0, vec1 );
 }
 
-inline const Vector4 lerp( const floatInVec &t, const Vector4 &vec0, const Vector4 &vec1 )
+__forceinline const Vector4 lerp( const floatInVec &t, const Vector4 &vec0, const Vector4 &vec1 )
 {
     return ( vec0 + ( ( vec1 - vec0 ) * t ) );
 }
 
-inline const Vector4 slerp( float t, const Vector4 &unitVec0, const Vector4 &unitVec1 )
+__forceinline const Vector4 slerp( float t, const Vector4 &unitVec0, const Vector4 &unitVec1 )
 {
     return slerp( floatInVec(t), unitVec0, unitVec1 );
 }
 
-inline const Vector4 slerp( const floatInVec &t, const Vector4 &unitVec0, const Vector4 &unitVec1 )
+__forceinline const Vector4 slerp( const floatInVec &t, const Vector4 &unitVec0, const Vector4 &unitVec1 )
 {
     __m128 scales, scale0, scale1, cosAngle, angle, tttt, oneMinusT, angles, sines;
     cosAngle = _vmathVfDot4( unitVec0.get128(), unitVec1.get128() );
@@ -748,232 +790,232 @@ inline const Vector4 slerp( const floatInVec &t, const Vector4 &unitVec0, const
     return Vector4( vec_madd( unitVec0.get128(), scale0, _mm_mul_ps( unitVec1.get128(), scale1 ) ) );
 }
 
-inline __m128 Vector4::get128( ) const
+__forceinline __m128 Vector4::get128( ) const
 {
     return mVec128;
 }
 /*
-inline void storeHalfFloats( const Vector4 &vec0, const Vector4 &vec1, const Vector4 &vec2, const Vector4 &vec3, vec_ushort8 * twoQuads )
+__forceinline void storeHalfFloats( const Vector4 &vec0, const Vector4 &vec1, const Vector4 &vec2, const Vector4 &vec3, vec_ushort8 * twoQuads )
 {
     twoQuads[0] = _vmath2VfToHalfFloats(vec0.get128(), vec1.get128());
     twoQuads[1] = _vmath2VfToHalfFloats(vec2.get128(), vec3.get128());
 }
 */
-inline Vector4 & Vector4::operator =( const Vector4 &vec )
+__forceinline Vector4 & Vector4::operator =( const Vector4 &vec )
 {
     mVec128 = vec.mVec128;
     return *this;
 }
 
-inline Vector4 & Vector4::setXYZ( const Vector3 &vec )
+__forceinline Vector4 & Vector4::setXYZ( const Vector3 &vec )
 {
 	__declspec(align(16)) unsigned int sw[4] = {0, 0, 0, 0xffffffff};
 	mVec128 = vec_sel( vec.get128(), mVec128, sw );
     return *this;
 }
 
-inline const Vector3 Vector4::getXYZ( ) const
+__forceinline const Vector3 Vector4::getXYZ( ) const
 {
     return Vector3( mVec128 );
 }
 
-inline Vector4 & Vector4::setX( float _x )
+__forceinline Vector4 & Vector4::setX( float _x )
 {
     _vmathVfSetElement(mVec128, _x, 0);
     return *this;
 }
 
-inline Vector4 & Vector4::setX( const floatInVec &_x )
+__forceinline Vector4 & Vector4::setX( const floatInVec &_x )
 {
     mVec128 = _vmathVfInsert(mVec128, _x.get128(), 0);
     return *this;
 }
 
-inline const floatInVec Vector4::getX( ) const
+__forceinline const floatInVec Vector4::getX( ) const
 {
     return floatInVec( mVec128, 0 );
 }
 
-inline Vector4 & Vector4::setY( float _y )
+__forceinline Vector4 & Vector4::setY( float _y )
 {
     _vmathVfSetElement(mVec128, _y, 1);
     return *this;
 }
 
-inline Vector4 & Vector4::setY( const floatInVec &_y )
+__forceinline Vector4 & Vector4::setY( const floatInVec &_y )
 {
     mVec128 = _vmathVfInsert(mVec128, _y.get128(), 1);
     return *this;
 }
 
-inline const floatInVec Vector4::getY( ) const
+__forceinline const floatInVec Vector4::getY( ) const
 {
     return floatInVec( mVec128, 1 );
 }
 
-inline Vector4 & Vector4::setZ( float _z )
+__forceinline Vector4 & Vector4::setZ( float _z )
 {
     _vmathVfSetElement(mVec128, _z, 2);
     return *this;
 }
 
-inline Vector4 & Vector4::setZ( const floatInVec &_z )
+__forceinline Vector4 & Vector4::setZ( const floatInVec &_z )
 {
     mVec128 = _vmathVfInsert(mVec128, _z.get128(), 2);
     return *this;
 }
 
-inline const floatInVec Vector4::getZ( ) const
+__forceinline const floatInVec Vector4::getZ( ) const
 {
     return floatInVec( mVec128, 2 );
 }
 
-inline Vector4 & Vector4::setW( float _w )
+__forceinline Vector4 & Vector4::setW( float _w )
 {
     _vmathVfSetElement(mVec128, _w, 3);
     return *this;
 }
 
-inline Vector4 & Vector4::setW( const floatInVec &_w )
+__forceinline Vector4 & Vector4::setW( const floatInVec &_w )
 {
     mVec128 = _vmathVfInsert(mVec128, _w.get128(), 3);
     return *this;
 }
 
-inline const floatInVec Vector4::getW( ) const
+__forceinline const floatInVec Vector4::getW( ) const
 {
     return floatInVec( mVec128, 3 );
 }
 
-inline Vector4 & Vector4::setElem( int idx, float value )
+__forceinline Vector4 & Vector4::setElem( int idx, float value )
 {
     _vmathVfSetElement(mVec128, value, idx);
     return *this;
 }
 
-inline Vector4 & Vector4::setElem( int idx, const floatInVec &value )
+__forceinline Vector4 & Vector4::setElem( int idx, const floatInVec &value )
 {
     mVec128 = _vmathVfInsert(mVec128, value.get128(), idx);
     return *this;
 }
 
-inline const floatInVec Vector4::getElem( int idx ) const
+__forceinline const floatInVec Vector4::getElem( int idx ) const
 {
     return floatInVec( mVec128, idx );
 }
 
-inline VecIdx Vector4::operator []( int idx )
+__forceinline VecIdx Vector4::operator []( int idx )
 {
     return VecIdx( mVec128, idx );
 }
 
-inline const floatInVec Vector4::operator []( int idx ) const
+__forceinline const floatInVec Vector4::operator []( int idx ) const
 {
     return floatInVec( mVec128, idx );
 }
 
-inline const Vector4 Vector4::operator +( const Vector4 &vec ) const
+__forceinline const Vector4 Vector4::operator +( const Vector4 &vec ) const
 {
     return Vector4( _mm_add_ps( mVec128, vec.mVec128 ) );
 }
 
-inline const Vector4 Vector4::operator -( const Vector4 &vec ) const
+__forceinline const Vector4 Vector4::operator -( const Vector4 &vec ) const
 {
     return Vector4( _mm_sub_ps( mVec128, vec.mVec128 ) );
 }
 
-inline const Vector4 Vector4::operator *( float scalar ) const
+__forceinline const Vector4 Vector4::operator *( float scalar ) const
 {
     return *this * floatInVec(scalar);
 }
 
-inline const Vector4 Vector4::operator *( const floatInVec &scalar ) const
+__forceinline const Vector4 Vector4::operator *( const floatInVec &scalar ) const
 {
     return Vector4( _mm_mul_ps( mVec128, scalar.get128() ) );
 }
 
-inline Vector4 & Vector4::operator +=( const Vector4 &vec )
+__forceinline Vector4 & Vector4::operator +=( const Vector4 &vec )
 {
     *this = *this + vec;
     return *this;
 }
 
-inline Vector4 & Vector4::operator -=( const Vector4 &vec )
+__forceinline Vector4 & Vector4::operator -=( const Vector4 &vec )
 {
     *this = *this - vec;
     return *this;
 }
 
-inline Vector4 & Vector4::operator *=( float scalar )
+__forceinline Vector4 & Vector4::operator *=( float scalar )
 {
     *this = *this * scalar;
     return *this;
 }
 
-inline Vector4 & Vector4::operator *=( const floatInVec &scalar )
+__forceinline Vector4 & Vector4::operator *=( const floatInVec &scalar )
 {
     *this = *this * scalar;
     return *this;
 }
 
-inline const Vector4 Vector4::operator /( float scalar ) const
+__forceinline const Vector4 Vector4::operator /( float scalar ) const
 {
     return *this / floatInVec(scalar);
 }
 
-inline const Vector4 Vector4::operator /( const floatInVec &scalar ) const
+__forceinline const Vector4 Vector4::operator /( const floatInVec &scalar ) const
 {
     return Vector4( _mm_div_ps( mVec128, scalar.get128() ) );
 }
 
-inline Vector4 & Vector4::operator /=( float scalar )
+__forceinline Vector4 & Vector4::operator /=( float scalar )
 {
     *this = *this / scalar;
     return *this;
 }
 
-inline Vector4 & Vector4::operator /=( const floatInVec &scalar )
+__forceinline Vector4 & Vector4::operator /=( const floatInVec &scalar )
 {
     *this = *this / scalar;
     return *this;
 }
 
-inline const Vector4 Vector4::operator -( ) const
+__forceinline const Vector4 Vector4::operator -( ) const
 {
 	return Vector4(_mm_sub_ps( _mm_setzero_ps(), mVec128 ) );
 }
 
-inline const Vector4 operator *( float scalar, const Vector4 &vec )
+__forceinline const Vector4 operator *( float scalar, const Vector4 &vec )
 {
     return floatInVec(scalar) * vec;
 }
 
-inline const Vector4 operator *( const floatInVec &scalar, const Vector4 &vec )
+__forceinline const Vector4 operator *( const floatInVec &scalar, const Vector4 &vec )
 {
     return vec * scalar;
 }
 
-inline const Vector4 mulPerElem( const Vector4 &vec0, const Vector4 &vec1 )
+__forceinline const Vector4 mulPerElem( const Vector4 &vec0, const Vector4 &vec1 )
 {
     return Vector4( _mm_mul_ps( vec0.get128(), vec1.get128() ) );
 }
 
-inline const Vector4 divPerElem( const Vector4 &vec0, const Vector4 &vec1 )
+__forceinline const Vector4 divPerElem( const Vector4 &vec0, const Vector4 &vec1 )
 {
     return Vector4( _mm_div_ps( vec0.get128(), vec1.get128() ) );
 }
 
-inline const Vector4 recipPerElem( const Vector4 &vec )
+__forceinline const Vector4 recipPerElem( const Vector4 &vec )
 {
     return Vector4( _mm_rcp_ps( vec.get128() ) );
 }
 
-inline const Vector4 absPerElem( const Vector4 &vec )
+__forceinline const Vector4 absPerElem( const Vector4 &vec )
 {
     return Vector4( fabsf4( vec.get128() ) );
 }
 
-inline const Vector4 copySignPerElem( const Vector4 &vec0, const Vector4 &vec1 )
+__forceinline const Vector4 copySignPerElem( const Vector4 &vec0, const Vector4 &vec1 )
 {
 	__m128 vmask = toM128(0x7fffffff);
 	return Vector4( _mm_or_ps(
@@ -981,82 +1023,78 @@ inline const Vector4 copySignPerElem( const Vector4 &vec0, const Vector4 &vec1 )
 		_mm_andnot_ps( vmask, vec1.get128() ) ) );		// Signs
 }
 
-inline const Vector4 maxPerElem( const Vector4 &vec0, const Vector4 &vec1 )
+__forceinline const Vector4 maxPerElem( const Vector4 &vec0, const Vector4 &vec1 )
 {
     return Vector4( _mm_max_ps( vec0.get128(), vec1.get128() ) );
 }
 
-inline const floatInVec maxElem( const Vector4 &vec )
+__forceinline const floatInVec maxElem( const Vector4 &vec )
 {
     return floatInVec( _mm_max_ps(
 		_mm_max_ps( vec_splat( vec.get128(), 0 ), vec_splat( vec.get128(), 1 ) ),
 		_mm_max_ps( vec_splat( vec.get128(), 2 ), vec_splat( vec.get128(), 3 ) ) ) );
 }
 
-inline const Vector4 minPerElem( const Vector4 &vec0, const Vector4 &vec1 )
+__forceinline const Vector4 minPerElem( const Vector4 &vec0, const Vector4 &vec1 )
 {
     return Vector4( _mm_min_ps( vec0.get128(), vec1.get128() ) );
 }
 
-inline const floatInVec minElem( const Vector4 &vec )
+__forceinline const floatInVec minElem( const Vector4 &vec )
 {
     return floatInVec( _mm_min_ps(
 		_mm_min_ps( vec_splat( vec.get128(), 0 ), vec_splat( vec.get128(), 1 ) ),
 		_mm_min_ps( vec_splat( vec.get128(), 2 ), vec_splat( vec.get128(), 3 ) ) ) );
 }
 
-inline const floatInVec sum( const Vector4 &vec )
+__forceinline const floatInVec sum( const Vector4 &vec )
 {
     return floatInVec( _mm_add_ps(
 		_mm_add_ps( vec_splat( vec.get128(), 0 ), vec_splat( vec.get128(), 1 ) ),
 		_mm_add_ps( vec_splat( vec.get128(), 2 ), vec_splat( vec.get128(), 3 ) ) ) );
 }
 
-inline const floatInVec dot( const Vector4 &vec0, const Vector4 &vec1 )
+__forceinline const floatInVec dot( const Vector4 &vec0, const Vector4 &vec1 )
 {
     return floatInVec( _vmathVfDot4( vec0.get128(), vec1.get128() ), 0 );
 }
 
-inline const floatInVec lengthSqr( const Vector4 &vec )
+__forceinline const floatInVec lengthSqr( const Vector4 &vec )
 {
     return floatInVec(  _vmathVfDot4( vec.get128(), vec.get128() ), 0 );
 }
 
-inline const floatInVec length( const Vector4 &vec )
+__forceinline const floatInVec length( const Vector4 &vec )
 {
     return floatInVec(  _mm_sqrt_ps(_vmathVfDot4( vec.get128(), vec.get128() )), 0 );
 }
 
-inline const Vector4 normalizeApprox( const Vector4 &vec )
+__forceinline const Vector4 normalizeApprox( const Vector4 &vec )
 {
     return Vector4( _mm_mul_ps( vec.get128(), _mm_rsqrt_ps( _vmathVfDot4( vec.get128(), vec.get128() ) ) ) );
 }
 
-inline const Vector4 normalize( const Vector4 &vec )
+__forceinline const Vector4 normalize( const Vector4 &vec )
 {
     return Vector4( _mm_mul_ps( vec.get128(), newtonrapson_rsqrt4( _vmathVfDot4( vec.get128(), vec.get128() ) ) ) );
 }
 
-inline const Vector4 select( const Vector4 &vec0, const Vector4 &vec1, bool select1 )
+__forceinline const Vector4 select( const Vector4 &vec0, const Vector4 &vec1, bool select1 )
 {
     return select( vec0, vec1, boolInVec(select1) );
 }
 
-inline const Vector4 select( const Vector4 &vec0, const Vector4 &vec1, const boolInVec &select1 )
-{
-    return Vector4( vec_sel( vec0.get128(), vec1.get128(), select1.get128() ) );
-}
 
 #ifdef _VECTORMATH_DEBUG
 
-inline void print( const Vector4 &vec )
+__forceinline void print( const Vector4 &vec )
 {
     union { __m128 v; float s[4]; } tmp;
     tmp.v = vec.get128();
     printf( "( %f %f %f %f )\n", tmp.s[0], tmp.s[1], tmp.s[2], tmp.s[3] );
 }
 
-inline void print( const Vector4 &vec, const char * name )
+__forceinline void print( const Vector4 &vec, const char * name )
 {
     union { __m128 v; float s[4]; } tmp;
     tmp.v = vec.get128();
@@ -1065,52 +1103,52 @@ inline void print( const Vector4 &vec, const char * name )
 
 #endif
 
-inline Point3::Point3( float _x, float _y, float _z )
+__forceinline Point3::Point3( float _x, float _y, float _z )
 {
     mVec128 = _mm_setr_ps(_x, _y, _z, 0.0f);
 }
 
-inline Point3::Point3( const floatInVec &_x, const floatInVec &_y, const floatInVec &_z )
+__forceinline Point3::Point3( const floatInVec &_x, const floatInVec &_y, const floatInVec &_z )
 {
 	mVec128 = _mm_unpacklo_ps( _mm_unpacklo_ps( _x.get128(), _z.get128() ), _y.get128() );
 }
 
-inline Point3::Point3( const Vector3 &vec )
+__forceinline Point3::Point3( const Vector3 &vec )
 {
     mVec128 = vec.get128();
 }
 
-inline Point3::Point3( float scalar )
+__forceinline Point3::Point3( float scalar )
 {
     mVec128 = floatInVec(scalar).get128();
 }
 
-inline Point3::Point3( const floatInVec &scalar )
+__forceinline Point3::Point3( const floatInVec &scalar )
 {
     mVec128 = scalar.get128();
 }
 
-inline Point3::Point3( __m128 vf4 )
+__forceinline Point3::Point3( __m128 vf4 )
 {
     mVec128 = vf4;
 }
 
-inline const Point3 lerp( float t, const Point3 &pnt0, const Point3 &pnt1 )
+__forceinline const Point3 lerp( float t, const Point3 &pnt0, const Point3 &pnt1 )
 {
     return lerp( floatInVec(t), pnt0, pnt1 );
 }
 
-inline const Point3 lerp( const floatInVec &t, const Point3 &pnt0, const Point3 &pnt1 )
+__forceinline const Point3 lerp( const floatInVec &t, const Point3 &pnt0, const Point3 &pnt1 )
 {
     return ( pnt0 + ( ( pnt1 - pnt0 ) * t ) );
 }
 
-inline __m128 Point3::get128( ) const
+__forceinline __m128 Point3::get128( ) const
 {
     return mVec128;
 }
 
-inline void storeXYZ( const Point3 &pnt, __m128 * quad )
+__forceinline void storeXYZ( const Point3 &pnt, __m128 * quad )
 {
     __m128 dstVec = *quad;
 	__declspec(align(16)) unsigned int sw[4] = {0, 0, 0, 0xffffffff}; // TODO: Centralize
@@ -1118,7 +1156,7 @@ inline void storeXYZ( const Point3 &pnt, __m128 * quad )
     *quad = dstVec;
 }
 
-inline void loadXYZArray( Point3 & pnt0, Point3 & pnt1, Point3 & pnt2, Point3 & pnt3, const __m128 * threeQuads )
+__forceinline void loadXYZArray( Point3 & pnt0, Point3 & pnt1, Point3 & pnt2, Point3 & pnt3, const __m128 * threeQuads )
 {
 	const float *quads = (float *)threeQuads;
     pnt0 = Point3(  _mm_load_ps(quads) );
@@ -1127,7 +1165,7 @@ inline void loadXYZArray( Point3 & pnt0, Point3 & pnt1, Point3 & pnt2, Point3 &
     pnt3 = Point3( _mm_loadu_ps(quads + 9) );
 }
 
-inline void storeXYZArray( const Point3 &pnt0, const Point3 &pnt1, const Point3 &pnt2, const Point3 &pnt3, __m128 * threeQuads )
+__forceinline void storeXYZArray( const Point3 &pnt0, const Point3 &pnt1, const Point3 &pnt2, const Point3 &pnt3, __m128 * threeQuads )
 {
 	__m128 xxxx = _mm_shuffle_ps( pnt1.get128(), pnt1.get128(), _MM_SHUFFLE(0, 0, 0, 0) );
 	__m128 zzzz = _mm_shuffle_ps( pnt2.get128(), pnt2.get128(), _MM_SHUFFLE(2, 2, 2, 2) );
@@ -1138,7 +1176,7 @@ inline void storeXYZArray( const Point3 &pnt0, const Point3 &pnt1, const Point3
     threeQuads[2] = vec_sel( _mm_shuffle_ps( pnt3.get128(), pnt3.get128(), _MM_SHUFFLE(2, 1, 0, 3) ), zzzz, zsw );
 }
 /*
-inline void storeHalfFloats( const Point3 &pnt0, const Point3 &pnt1, const Point3 &pnt2, const Point3 &pnt3, const Point3 &pnt4, const Point3 &pnt5, const Point3 &pnt6, const Point3 &pnt7, vec_ushort8 * threeQuads )
+__forceinline void storeHalfFloats( const Point3 &pnt0, const Point3 &pnt1, const Point3 &pnt2, const Point3 &pnt3, const Point3 &pnt4, const Point3 &pnt5, const Point3 &pnt6, const Point3 &pnt7, vec_ushort8 * threeQuads )
 {
 #if 0
     __m128 xyz0[3];
@@ -1153,138 +1191,138 @@ inline void storeHalfFloats( const Point3 &pnt0, const Point3 &pnt1, const Point
 #endif
 }
 */
-inline Point3 & Point3::operator =( const Point3 &pnt )
+__forceinline Point3 & Point3::operator =( const Point3 &pnt )
 {
     mVec128 = pnt.mVec128;
     return *this;
 }
 
-inline Point3 & Point3::setX( float _x )
+__forceinline Point3 & Point3::setX( float _x )
 {
     _vmathVfSetElement(mVec128, _x, 0);
     return *this;
 }
 
-inline Point3 & Point3::setX( const floatInVec &_x )
+__forceinline Point3 & Point3::setX( const floatInVec &_x )
 {
     mVec128 = _vmathVfInsert(mVec128, _x.get128(), 0);
     return *this;
 }
 
-inline const floatInVec Point3::getX( ) const
+__forceinline const floatInVec Point3::getX( ) const
 {
     return floatInVec( mVec128, 0 );
 }
 
-inline Point3 & Point3::setY( float _y )
+__forceinline Point3 & Point3::setY( float _y )
 {
     _vmathVfSetElement(mVec128, _y, 1);
     return *this;
 }
 
-inline Point3 & Point3::setY( const floatInVec &_y )
+__forceinline Point3 & Point3::setY( const floatInVec &_y )
 {
     mVec128 = _vmathVfInsert(mVec128, _y.get128(), 1);
     return *this;
 }
 
-inline const floatInVec Point3::getY( ) const
+__forceinline const floatInVec Point3::getY( ) const
 {
     return floatInVec( mVec128, 1 );
 }
 
-inline Point3 & Point3::setZ( float _z )
+__forceinline Point3 & Point3::setZ( float _z )
 {
     _vmathVfSetElement(mVec128, _z, 2);
     return *this;
 }
 
-inline Point3 & Point3::setZ( const floatInVec &_z )
+__forceinline Point3 & Point3::setZ( const floatInVec &_z )
 {
     mVec128 = _vmathVfInsert(mVec128, _z.get128(), 2);
     return *this;
 }
 
-inline const floatInVec Point3::getZ( ) const
+__forceinline const floatInVec Point3::getZ( ) const
 {
     return floatInVec( mVec128, 2 );
 }
 
-inline Point3 & Point3::setElem( int idx, float value )
+__forceinline Point3 & Point3::setElem( int idx, float value )
 {
     _vmathVfSetElement(mVec128, value, idx);
     return *this;
 }
 
-inline Point3 & Point3::setElem( int idx, const floatInVec &value )
+__forceinline Point3 & Point3::setElem( int idx, const floatInVec &value )
 {
     mVec128 = _vmathVfInsert(mVec128, value.get128(), idx);
     return *this;
 }
 
-inline const floatInVec Point3::getElem( int idx ) const
+__forceinline const floatInVec Point3::getElem( int idx ) const
 {
     return floatInVec( mVec128, idx );
 }
 
-inline VecIdx Point3::operator []( int idx )
+__forceinline VecIdx Point3::operator []( int idx )
 {
     return VecIdx( mVec128, idx );
 }
 
-inline const floatInVec Point3::operator []( int idx ) const
+__forceinline const floatInVec Point3::operator []( int idx ) const
 {
     return floatInVec( mVec128, idx );
 }
 
-inline const Vector3 Point3::operator -( const Point3 &pnt ) const
+__forceinline const Vector3 Point3::operator -( const Point3 &pnt ) const
 {
     return Vector3( _mm_sub_ps( mVec128, pnt.mVec128 ) );
 }
 
-inline const Point3 Point3::operator +( const Vector3 &vec ) const
+__forceinline const Point3 Point3::operator +( const Vector3 &vec ) const
 {
     return Point3( _mm_add_ps( mVec128, vec.get128() ) );
 }
 
-inline const Point3 Point3::operator -( const Vector3 &vec ) const
+__forceinline const Point3 Point3::operator -( const Vector3 &vec ) const
 {
     return Point3( _mm_sub_ps( mVec128, vec.get128() ) );
 }
 
-inline Point3 & Point3::operator +=( const Vector3 &vec )
+__forceinline Point3 & Point3::operator +=( const Vector3 &vec )
 {
     *this = *this + vec;
     return *this;
 }
 
-inline Point3 & Point3::operator -=( const Vector3 &vec )
+__forceinline Point3 & Point3::operator -=( const Vector3 &vec )
 {
     *this = *this - vec;
     return *this;
 }
 
-inline const Point3 mulPerElem( const Point3 &pnt0, const Point3 &pnt1 )
+__forceinline const Point3 mulPerElem( const Point3 &pnt0, const Point3 &pnt1 )
 {
     return Point3( _mm_mul_ps( pnt0.get128(), pnt1.get128() ) );
 }
 
-inline const Point3 divPerElem( const Point3 &pnt0, const Point3 &pnt1 )
+__forceinline const Point3 divPerElem( const Point3 &pnt0, const Point3 &pnt1 )
 {
     return Point3( _mm_div_ps( pnt0.get128(), pnt1.get128() ) );
 }
 
-inline const Point3 recipPerElem( const Point3 &pnt )
+__forceinline const Point3 recipPerElem( const Point3 &pnt )
 {
     return Point3( _mm_rcp_ps( pnt.get128() ) );
 }
 
-inline const Point3 absPerElem( const Point3 &pnt )
+__forceinline const Point3 absPerElem( const Point3 &pnt )
 {
     return Point3( fabsf4( pnt.get128() ) );
 }
 
-inline const Point3 copySignPerElem( const Point3 &pnt0, const Point3 &pnt1 )
+__forceinline const Point3 copySignPerElem( const Point3 &pnt0, const Point3 &pnt1 )
 {
 	__m128 vmask = toM128(0x7fffffff);
 	return Point3( _mm_or_ps(
@@ -1292,91 +1330,93 @@ inline const Point3 copySignPerElem( const Point3 &pnt0, const Point3 &pnt1 )
 		_mm_andnot_ps( vmask, pnt1.get128() ) ) );		// Signs
 }
 
-inline const Point3 maxPerElem( const Point3 &pnt0, const Point3 &pnt1 )
+__forceinline const Point3 maxPerElem( const Point3 &pnt0, const Point3 &pnt1 )
 {
     return Point3( _mm_max_ps( pnt0.get128(), pnt1.get128() ) );
 }
 
-inline const floatInVec maxElem( const Point3 &pnt )
+__forceinline const floatInVec maxElem( const Point3 &pnt )
 {
     return floatInVec( _mm_max_ps( _mm_max_ps( vec_splat( pnt.get128(), 0 ), vec_splat( pnt.get128(), 1 ) ), vec_splat( pnt.get128(), 2 ) ) );
 }
 
-inline const Point3 minPerElem( const Point3 &pnt0, const Point3 &pnt1 )
+__forceinline const Point3 minPerElem( const Point3 &pnt0, const Point3 &pnt1 )
 {
     return Point3( _mm_min_ps( pnt0.get128(), pnt1.get128() ) );
 }
 
-inline const floatInVec minElem( const Point3 &pnt )
+__forceinline const floatInVec minElem( const Point3 &pnt )
 {
     return floatInVec( _mm_min_ps( _mm_min_ps( vec_splat( pnt.get128(), 0 ), vec_splat( pnt.get128(), 1 ) ), vec_splat( pnt.get128(), 2 ) ) );
 }
 
-inline const floatInVec sum( const Point3 &pnt )
+__forceinline const floatInVec sum( const Point3 &pnt )
 {
     return floatInVec( _mm_add_ps( _mm_add_ps( vec_splat( pnt.get128(), 0 ), vec_splat( pnt.get128(), 1 ) ), vec_splat( pnt.get128(), 2 ) ) );
 }
 
-inline const Point3 scale( const Point3 &pnt, float scaleVal )
+__forceinline const Point3 scale( const Point3 &pnt, float scaleVal )
 {
     return scale( pnt, floatInVec( scaleVal ) );
 }
 
-inline const Point3 scale( const Point3 &pnt, const floatInVec &scaleVal )
+__forceinline const Point3 scale( const Point3 &pnt, const floatInVec &scaleVal )
 {
     return mulPerElem( pnt, Point3( scaleVal ) );
 }
 
-inline const Point3 scale( const Point3 &pnt, const Vector3 &scaleVec )
+__forceinline const Point3 scale( const Point3 &pnt, const Vector3 &scaleVec )
 {
     return mulPerElem( pnt, Point3( scaleVec ) );
 }
 
-inline const floatInVec projection( const Point3 &pnt, const Vector3 &unitVec )
+__forceinline const floatInVec projection( const Point3 &pnt, const Vector3 &unitVec )
 {
     return floatInVec( _vmathVfDot3( pnt.get128(), unitVec.get128() ), 0 );
 }
 
-inline const floatInVec distSqrFromOrigin( const Point3 &pnt )
+__forceinline const floatInVec distSqrFromOrigin( const Point3 &pnt )
 {
     return lengthSqr( Vector3( pnt ) );
 }
 
-inline const floatInVec distFromOrigin( const Point3 &pnt )
+__forceinline const floatInVec distFromOrigin( const Point3 &pnt )
 {
     return length( Vector3( pnt ) );
 }
 
-inline const floatInVec distSqr( const Point3 &pnt0, const Point3 &pnt1 )
+__forceinline const floatInVec distSqr( const Point3 &pnt0, const Point3 &pnt1 )
 {
     return lengthSqr( ( pnt1 - pnt0 ) );
 }
 
-inline const floatInVec dist( const Point3 &pnt0, const Point3 &pnt1 )
+__forceinline const floatInVec dist( const Point3 &pnt0, const Point3 &pnt1 )
 {
     return length( ( pnt1 - pnt0 ) );
 }
 
-inline const Point3 select( const Point3 &pnt0, const Point3 &pnt1, bool select1 )
+__forceinline const Point3 select( const Point3 &pnt0, const Point3 &pnt1, bool select1 )
 {
     return select( pnt0, pnt1, boolInVec(select1) );
 }
 
-inline const Point3 select( const Point3 &pnt0, const Point3 &pnt1, const boolInVec &select1 )
+__forceinline const Point3 select( const Point3 &pnt0, const Point3 &pnt1, const boolInVec &select1 )
 {
     return Point3( vec_sel( pnt0.get128(), pnt1.get128(), select1.get128() ) );
 }
 
+
+
 #ifdef _VECTORMATH_DEBUG
 
-inline void print( const Point3 &pnt )
+__forceinline void print( const Point3 &pnt )
 {
     union { __m128 v; float s[4]; } tmp;
     tmp.v = pnt.get128();
     printf( "( %f %f %f )\n", tmp.s[0], tmp.s[1], tmp.s[2] );
 }
 
-inline void print( const Point3 &pnt, const char * name )
+__forceinline void print( const Point3 &pnt, const char * name )
 {
     union { __m128 v; float s[4]; } tmp;
     tmp.v = pnt.get128();
diff --git a/Extras/vectormathlibrary/include/vectormath/SSE/cpp/vecidx_aos.h b/Extras/vectormathlibrary/include/vectormath/SSE/cpp/vecidx_aos.h
index 14dc408d0..703790f96 100644
--- a/Extras/vectormathlibrary/include/vectormath/SSE/cpp/vecidx_aos.h
+++ b/Extras/vectormathlibrary/include/vectormath/SSE/cpp/vecidx_aos.h
@@ -1,5 +1,5 @@
 /*
-   Copyright (C) 2006, 2007 Sony Computer Entertainment Inc.
+   Copyright (C) 2006, 2010 Sony Computer Entertainment Inc.
    All rights reserved.
 
    Redistribution and use in source and binary forms,
diff --git a/Extras/vectormathlibrary/include/vectormath/SSE/cpp/vectormath_aos.h b/Extras/vectormathlibrary/include/vectormath/SSE/cpp/vectormath_aos.h
index ebca2094d..131c754a6 100644
--- a/Extras/vectormathlibrary/include/vectormath/SSE/cpp/vectormath_aos.h
+++ b/Extras/vectormathlibrary/include/vectormath/SSE/cpp/vectormath_aos.h
@@ -1,2500 +1,2527 @@
-/*
-   Copyright (C) 2006, 2007 Sony Computer Entertainment Inc.
-   All rights reserved.
-
-   Redistribution and use in source and binary forms,
-   with or without modification, are permitted provided that the
-   following conditions are met:
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    * Neither the name of the Sony Computer Entertainment Inc nor the names
-      of its contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-   POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#ifndef _VECTORMATH_AOS_CPP_SSE_H
-#define _VECTORMATH_AOS_CPP_SSE_H
-
-#include <math.h>
-#include <xmmintrin.h>
-#include <emmintrin.h>
-#include <assert.h>
-
-// TODO: Tidy
-typedef __m128 vec_float4;
-typedef __m128 vec_uint4;
-typedef __m128 vec_int4;
-typedef __m128i vec_uchar16;
-typedef __m128i vec_ushort8;
-
-#define vec_splat(x, e) _mm_shuffle_ps(x, x, _MM_SHUFFLE(e,e,e,e))
-
-#define _mm_ror_ps(vec,i)	\
-	(((i)%4) ? (_mm_shuffle_ps(vec,vec, _MM_SHUFFLE((unsigned char)(i+3)%4,(unsigned char)(i+2)%4,(unsigned char)(i+1)%4,(unsigned char)(i+0)%4))) : (vec))
-#define _mm_rol_ps(vec,i)	\
-	(((i)%4) ? (_mm_shuffle_ps(vec,vec, _MM_SHUFFLE((unsigned char)(7-i)%4,(unsigned char)(6-i)%4,(unsigned char)(5-i)%4,(unsigned char)(4-i)%4))) : (vec))
-
-#define vec_sld(vec,vec2,x) _mm_ror_ps(vec, ((x)/4))
-
-#define _mm_abs_ps(vec)		_mm_andnot_ps(_MASKSIGN_,vec)
-#define _mm_neg_ps(vec)		_mm_xor_ps(_MASKSIGN_,vec)
-
-#define vec_madd(a, b, c) _mm_add_ps(c, _mm_mul_ps(a, b) )
-
-union SSEFloat
-{
-	__m128 m128;
-	float f[4];
-};
-
-static inline __m128 vec_sel(__m128 a, __m128 b, __m128 mask)
-{
-	return _mm_or_ps(_mm_and_ps(mask, b), _mm_andnot_ps(mask, a));
-}
-static inline __m128 vec_sel(__m128 a, __m128 b, const unsigned int *_mask)
-{
-	return vec_sel(a, b, _mm_load_ps((float *)_mask));
-}
-static inline __m128 vec_sel(__m128 a, __m128 b, unsigned int _mask)
-{
-	return vec_sel(a, b, _mm_set1_ps(*(float *)&_mask));
-}
-
-static inline __m128 toM128(unsigned int x)
-{
-    return _mm_set1_ps( *(float *)&x );
-}
-
-static inline __m128 fabsf4(__m128 x)
-{
-    return _mm_and_ps( x, toM128( 0x7fffffff ) );
-}
-/*
-union SSE64
-{
-	__m128 m128;
-	struct
-	{
-		__m64 m01;
-		__m64 m23;
-	} m64;
-};
-
-static inline __m128 vec_cts(__m128 x, int a)
-{
-	assert(a == 0); // Only 2^0 supported
-	(void)a;
-	SSE64 sse64;
-	sse64.m64.m01 = _mm_cvttps_pi32(x);
-	sse64.m64.m23 = _mm_cvttps_pi32(_mm_ror_ps(x,2));
-	_mm_empty();
-    return sse64.m128;
-}
-
-static inline __m128 vec_ctf(__m128 x, int a)
-{
-	assert(a == 0); // Only 2^0 supported
-	(void)a;
-	SSE64 sse64;
-	sse64.m128 = x;
-	__m128 result =_mm_movelh_ps(
-		_mm_cvt_pi2ps(_mm_setzero_ps(), sse64.m64.m01),
-		_mm_cvt_pi2ps(_mm_setzero_ps(), sse64.m64.m23));
-	_mm_empty();
-	return result;
-}
-*/
-static inline __m128 vec_cts(__m128 x, int a)
-{
-	assert(a == 0); // Only 2^0 supported
-	(void)a;
-	__m128i result = _mm_cvtps_epi32(x);
-    return (__m128 &)result;
-}
-
-static inline __m128 vec_ctf(__m128 x, int a)
-{
-	assert(a == 0); // Only 2^0 supported
-	(void)a;
-	return _mm_cvtepi32_ps((__m128i &)x);
-}
-
-#define vec_nmsub(a,b,c) _mm_sub_ps( c, _mm_mul_ps( a, b ) )
-#define vec_sub(a,b) _mm_sub_ps( a, b )
-#define vec_add(a,b) _mm_add_ps( a, b )
-#define vec_mul(a,b) _mm_mul_ps( a, b )
-#define vec_xor(a,b) _mm_xor_ps( a, b )
-#define vec_and(a,b) _mm_and_ps( a, b )
-#define vec_cmpeq(a,b) _mm_cmpeq_ps( a, b )
-#define vec_cmpgt(a,b) _mm_cmpgt_ps( a, b )
-
-#define vec_mergeh(a,b) _mm_unpacklo_ps( a, b )
-#define vec_mergel(a,b) _mm_unpackhi_ps( a, b )
-
-#define vec_andc(a,b) _mm_andnot_ps( b, a )
-
-#define sqrtf4(x) _mm_sqrt_ps( x )
-#define rsqrtf4(x) _mm_rsqrt_ps( x )
-#define recipf4(x) _mm_rcp_ps( x )
-#define negatef4(x) _mm_sub_ps( _mm_setzero_ps(), x )
-
+/*
+   Copyright (C) 2006, 2010 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+   Redistribution and use in source and binary forms,
+   with or without modification, are permitted provided that the
+   following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the Sony Computer Entertainment Inc nor the names
+      of its contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
+*/
+
+
+#ifndef _VECTORMATH_AOS_CPP_SSE_H
+#define _VECTORMATH_AOS_CPP_SSE_H
+
+#include <math.h>
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <assert.h>
+
+#define USE_SSE2_LDDQU
+#ifdef USE_SSE2_LDDQU
+#include <intrin.h>  //used for _mm_lddqu_si128
+#endif //USE_SSE2_LDDQU
+
+// Vector type aliases. TODO: Tidy
+typedef __m128 vec_float4;	// four packed floats
+typedef __m128 vec_uint4;	// same underlying type as vec_float4; integer bit patterns are carried in a float vector
+typedef __m128 vec_int4;	// same underlying type as vec_float4; integer bit patterns are carried in a float vector
+typedef __m128i vec_uchar16;	// SSE2 integer vector type
+typedef __m128i vec_ushort8;	// SSE2 integer vector type (same underlying type as vec_uchar16)
+
+#define vec_splat(x, e) _mm_shuffle_ps(x, x, _MM_SHUFFLE(e,e,e,e))	// broadcast lane e of x to all four lanes; e must be a compile-time constant
+
+#define _mm_ror_ps(vec,i)	\
+	(((i)%4) ? (_mm_shuffle_ps(vec,vec, _MM_SHUFFLE((unsigned char)((i)+3)%4,(unsigned char)((i)+2)%4,(unsigned char)((i)+1)%4,(unsigned char)((i)+0)%4))) : (vec))	// result lane k = vec lane (k+i)%4; i is parenthesized so expression arguments expand correctly; vec is evaluated more than once
+#define _mm_rol_ps(vec,i)	\
+	(((i)%4) ? (_mm_shuffle_ps(vec,vec, _MM_SHUFFLE((unsigned char)(7-(i))%4,(unsigned char)(6-(i))%4,(unsigned char)(5-(i))%4,(unsigned char)(4-(i))%4))) : (vec))	// result lane k = vec lane (k+4-i)%4; i is parenthesized so expression arguments expand correctly; vec is evaluated more than once
+
+#define vec_sld(vec,vec2,x) _mm_ror_ps(vec, ((x)/4))	// NOTE: vec2 is unused; the result is vec rotated by (x)/4 lanes via _mm_ror_ps
+
+#define _mm_abs_ps(vec)		_mm_andnot_ps(_MASKSIGN_,vec)	// vec & ~_MASKSIGN_; _MASKSIGN_ is not defined in this part of the header - presumably a sign-bit mask, verify before use
+#define _mm_neg_ps(vec)		_mm_xor_ps(_MASKSIGN_,vec)	// vec ^ _MASKSIGN_; same caveat about _MASKSIGN_ as above
+
+#define vec_madd(a, b, c) _mm_add_ps(c, _mm_mul_ps(a, b) )	// a*b + c, computed as a separate multiply followed by an add
+
+union SSEFloat	// overlays one 128-bit SSE value with integer and float array views of the same storage
+{
+	__m128i vi;	// integer-vector view
+	__m128 m128;	// float-vector view
+	__m128 vf;	// second float-vector member (same type as m128)
+	unsigned int	ui[4];	// four unsigned int elements
+	unsigned short s[8];	// eight unsigned short elements
+	float f[4];	// four float elements
+	SSEFloat(__m128 v) : m128(v) {}	// initializes the storage through the m128 member
+    SSEFloat(__m128i v) : vi(v) {}	// initializes the storage through the vi member
+	SSEFloat() {}	// leaves the storage uninitialized
+};
+
+static __forceinline __m128 vec_sel(__m128 a, __m128 b, __m128 mask)	// bitwise select: bits of b where mask is 1, bits of a where mask is 0
+{
+	return _mm_or_ps(_mm_andnot_ps(mask, a), _mm_and_ps(b, mask));	// (~mask & a) | (b & mask)
+}
+static __forceinline __m128 vec_sel(__m128 a, __m128 b, const unsigned int *_mask)	// bitwise select using a 128-bit mask read from memory; _mm_load_ps is an aligned load, so _mask must be 16-byte aligned
+{
+	return vec_sel(a, b, _mm_load_ps(reinterpret_cast<const float *>(_mask)));	// keep the pointee const instead of casting it away
+}
+static __forceinline __m128 vec_sel(__m128 a, __m128 b, unsigned int _mask)	// bitwise select with the same 32-bit mask applied to every lane
+{
+	return vec_sel(a, b, _mm_castsi128_ps(_mm_set1_epi32(static_cast<int>(_mask))));	// integer broadcast + bit-cast: no float-typed read of an unsigned int, mask bits preserved exactly
+}
+
+static __forceinline __m128 toM128(unsigned int x)	// returns a vector whose four lanes each hold the raw 32-bit pattern x
+{
+    return _mm_castsi128_ps( _mm_set1_epi32( static_cast<int>( x ) ) );	// integer broadcast + bit-cast replaces the *(float *)&x pun, which violated strict aliasing
+}
+
+static __forceinline __m128 fabsf4(__m128 x)	// per-lane absolute value: clears the sign bit of each lane
+{
+    return _mm_andnot_ps( toM128( 0x80000000U ), x );	// ~0x80000000 & x, i.e. x & 0x7fffffff
+}
+/*
+union SSE64
+{
+	__m128 m128;
+	struct
+	{
+		__m64 m01;
+		__m64 m23;
+	} m64;
+};
+
+static __forceinline __m128 vec_cts(__m128 x, int a)
+{
+	assert(a == 0); // Only 2^0 supported
+	(void)a;
+	SSE64 sse64;
+	sse64.m64.m01 = _mm_cvttps_pi32(x);
+	sse64.m64.m23 = _mm_cvttps_pi32(_mm_ror_ps(x,2));
+	_mm_empty();
+    return sse64.m128;
+}
+
+static __forceinline __m128 vec_ctf(__m128 x, int a)
+{
+	assert(a == 0); // Only 2^0 supported
+	(void)a;
+	SSE64 sse64;
+	sse64.m128 = x;
+	__m128 result =_mm_movelh_ps(
+		_mm_cvt_pi2ps(_mm_setzero_ps(), sse64.m64.m01),
+		_mm_cvt_pi2ps(_mm_setzero_ps(), sse64.m64.m23));
+	_mm_empty();
+	return result;
+}
+*/
+static __forceinline __m128 vec_cts(__m128 x, int a)	// converts each float lane to a 32-bit int; the int bits are returned inside a __m128
+{
+	assert(a == 0); // Only 2^0 supported
+	(void)a;
+	const __m128i result = _mm_cvtps_epi32(x);	// rounds using the current MXCSR rounding mode (nearest by default); this is not a truncating conversion
+    return _mm_castsi128_ps(result);	// bit-cast intrinsic replaces the (__m128 &) reference cast
+}
+
+static __forceinline __m128 vec_ctf(__m128 x, int a)	// treats each lane of x as a 32-bit int bit pattern and converts it to float
+{
+	assert(a == 0); // Only 2^0 supported
+	(void)a;
+	return _mm_cvtepi32_ps(_mm_castps_si128(x));	// bit-cast intrinsic replaces the (__m128i &) reference cast
+}
+
+#define vec_nmsub(a,b,c) _mm_sub_ps( c, _mm_mul_ps( a, b ) )	// c - a*b
+#define vec_sub(a,b) _mm_sub_ps( a, b )	// a - b per lane
+#define vec_add(a,b) _mm_add_ps( a, b )	// a + b per lane
+#define vec_mul(a,b) _mm_mul_ps( a, b )	// a * b per lane
+#define vec_xor(a,b) _mm_xor_ps( a, b )	// bitwise a ^ b
+#define vec_and(a,b) _mm_and_ps( a, b )	// bitwise a & b
+#define vec_cmpeq(a,b) _mm_cmpeq_ps( a, b )	// per-lane mask: all ones where a == b, else all zeros
+#define vec_cmpgt(a,b) _mm_cmpgt_ps( a, b )	// per-lane mask: all ones where a > b, else all zeros
+
+#define vec_mergeh(a,b) _mm_unpacklo_ps( a, b )	// interleave lanes 0 and 1: a0, b0, a1, b1
+#define vec_mergel(a,b) _mm_unpackhi_ps( a, b )	// interleave lanes 2 and 3: a2, b2, a3, b3
+
+#define vec_andc(a,b) _mm_andnot_ps( b, a )	// a & ~b (operands are swapped because _mm_andnot_ps complements its first argument)
+
+#define sqrtf4(x) _mm_sqrt_ps( x )	// per-lane square root
+#define rsqrtf4(x) _mm_rsqrt_ps( x )	// per-lane approximate reciprocal square root
+#define recipf4(x) _mm_rcp_ps( x )	// per-lane approximate reciprocal
+#define negatef4(x) _mm_sub_ps( _mm_setzero_ps(), x )	// 0 - x per lane
+
 static __forceinline __m128 newtonrapson_rsqrt4( const __m128 v )
 {   
-#define _half4 _mm_setr_ps(.5f,.5f,.5f,.5f) 
+#define _half4 _mm_setr_ps(.5f,.5f,.5f,.5f) 
 #define _three _mm_setr_ps(3.f,3.f,3.f,3.f)
 const __m128 approx = _mm_rsqrt_ps( v );   
 const __m128 muls = _mm_mul_ps(_mm_mul_ps(v, approx), approx);   
 return _mm_mul_ps(_mm_mul_ps(_half4, approx), _mm_sub_ps(_three, muls) );
-}
-
-static inline __m128 acosf4(__m128 x)
-{
-    __m128 xabs = fabsf4(x);
-	__m128 select = _mm_cmplt_ps( x, _mm_setzero_ps() );
-    __m128 t1 = sqrtf4(vec_sub(_mm_set1_ps(1.0f), xabs));
-    
-    /* Instruction counts can be reduced if the polynomial was
-     * computed entirely from nested (dependent) fma's. However, 
-     * to reduce the number of pipeline stalls, the polygon is evaluated 
-     * in two halves (hi amd lo). 
-     */
-    __m128 xabs2 = _mm_mul_ps(xabs,  xabs);
-    __m128 xabs4 = _mm_mul_ps(xabs2, xabs2);
-    __m128 hi = vec_madd(vec_madd(vec_madd(_mm_set1_ps(-0.0012624911f),
-		xabs, _mm_set1_ps(0.0066700901f)),
-			xabs, _mm_set1_ps(-0.0170881256f)),
-				xabs, _mm_set1_ps( 0.0308918810f));
-    __m128 lo = vec_madd(vec_madd(vec_madd(_mm_set1_ps(-0.0501743046f),
-		xabs, _mm_set1_ps(0.0889789874f)),
-			xabs, _mm_set1_ps(-0.2145988016f)),
-				xabs, _mm_set1_ps( 1.5707963050f));
-    
-    __m128 result = vec_madd(hi, xabs4, lo);
-    
-    // Adjust the result if x is negactive.
-    return vec_sel(
-		vec_mul(t1, result),									// Positive
-		vec_nmsub(t1, result, _mm_set1_ps(3.1415926535898f)),	// Negative
-		select);
-}
-
-static inline __m128 sinf4(vec_float4 x)
-{
-
-//
-// Common constants used to evaluate sinf4/cosf4/tanf4
-//
-#define _SINCOS_CC0  -0.0013602249f
-#define _SINCOS_CC1   0.0416566950f
-#define _SINCOS_CC2  -0.4999990225f
-#define _SINCOS_SC0  -0.0001950727f
-#define _SINCOS_SC1   0.0083320758f
-#define _SINCOS_SC2  -0.1666665247f
-
-#define _SINCOS_KC1  1.57079625129f
-#define _SINCOS_KC2  7.54978995489e-8f
-
-    vec_float4 xl,xl2,xl3,res;
-
-    // Range reduction using : xl = angle * TwoOverPi;
-    //  
-    xl = vec_mul(x, _mm_set1_ps(0.63661977236f));
-
-    // Find the quadrant the angle falls in
-    // using:  q = (int) (ceil(abs(xl))*sign(xl))
-    //
-    vec_int4 q = vec_cts(xl,0);
-
-    // Compute an offset based on the quadrant that the angle falls in
-    // 
-    vec_int4 offset = _mm_and_ps(q,toM128(0x3));
-
-    // Remainder in range [-pi/4..pi/4]
-    //
-    vec_float4 qf = vec_ctf(q,0);
-    xl  = vec_nmsub(qf,_mm_set1_ps(_SINCOS_KC2),vec_nmsub(qf,_mm_set1_ps(_SINCOS_KC1),x));
-    
-    // Compute x^2 and x^3
-    //
-    xl2 = vec_mul(xl,xl);
-    xl3 = vec_mul(xl2,xl);
-    
-    // Compute both the sin and cos of the angles
-    // using a polynomial expression:
-    //   cx = 1.0f + xl2 * ((C0 * xl2 + C1) * xl2 + C2), and
-    //   sx = xl + xl3 * ((S0 * xl2 + S1) * xl2 + S2)
-    //
-    
-    vec_float4 cx =
-		vec_madd(
-			vec_madd(
-				vec_madd(_mm_set1_ps(_SINCOS_CC0),xl2,_mm_set1_ps(_SINCOS_CC1)),xl2,_mm_set1_ps(_SINCOS_CC2)),xl2,_mm_set1_ps(1.0f));
-    vec_float4 sx =
-		vec_madd(
-			vec_madd(
-				vec_madd(_mm_set1_ps(_SINCOS_SC0),xl2,_mm_set1_ps(_SINCOS_SC1)),xl2,_mm_set1_ps(_SINCOS_SC2)),xl3,xl);
-
-    // Use the cosine when the offset is odd and the sin
-    // when the offset is even
-    //
-    res = vec_sel(cx,sx,vec_cmpeq(vec_and(offset,
-                                          toM128(0x1)),
-										  _mm_setzero_ps()));
-
-    // Flip the sign of the result when (offset mod 4) = 1 or 2
-    //
-    return vec_sel(
-		vec_xor(toM128(0x80000000U), res),	// Negative
-		res,								// Positive
-		vec_cmpeq(vec_and(offset,toM128(0x2)),_mm_setzero_ps()));
-}
-
-static inline void sincosf4(vec_float4 x, vec_float4* s, vec_float4* c)
-{
-    vec_float4 xl,xl2,xl3;
-    vec_int4   offsetSin, offsetCos;
-
-    // Range reduction using : xl = angle * TwoOverPi;
-    //  
-    xl = vec_mul(x, _mm_set1_ps(0.63661977236f));
-
-    // Find the quadrant the angle falls in
-    // using:  q = (int) (ceil(abs(xl))*sign(xl))
-    //
-    //vec_int4 q = vec_cts(vec_add(xl,vec_sel(_mm_set1_ps(0.5f),xl,(0x80000000))),0);
-    vec_int4 q = vec_cts(xl,0);
-     
-    // Compute the offset based on the quadrant that the angle falls in.
-    // Add 1 to the offset for the cosine. 
-    //
-    offsetSin = vec_and(q,toM128((int)0x3));
-	__m128i temp = _mm_add_epi32(_mm_set1_epi32(1),(__m128i &)offsetSin);
-	offsetCos = (__m128 &)temp;
-
-    // Remainder in range [-pi/4..pi/4]
-    //
-    vec_float4 qf = vec_ctf(q,0);
-    xl  = vec_nmsub(qf,_mm_set1_ps(_SINCOS_KC2),vec_nmsub(qf,_mm_set1_ps(_SINCOS_KC1),x));
-    
-    // Compute x^2 and x^3
-    //
-    xl2 = vec_mul(xl,xl);
-    xl3 = vec_mul(xl2,xl);
-    
-    // Compute both the sin and cos of the angles
-    // using a polynomial expression:
-    //   cx = 1.0f + xl2 * ((C0 * xl2 + C1) * xl2 + C2), and
-    //   sx = xl + xl3 * ((S0 * xl2 + S1) * xl2 + S2)
-    //
-    vec_float4 cx =
-		vec_madd(
-			vec_madd(
-				vec_madd(_mm_set1_ps(_SINCOS_CC0),xl2,_mm_set1_ps(_SINCOS_CC1)),xl2,_mm_set1_ps(_SINCOS_CC2)),xl2,_mm_set1_ps(1.0f));
-    vec_float4 sx =
-		vec_madd(
-			vec_madd(
-				vec_madd(_mm_set1_ps(_SINCOS_SC0),xl2,_mm_set1_ps(_SINCOS_SC1)),xl2,_mm_set1_ps(_SINCOS_SC2)),xl3,xl);
-
-    // Use the cosine when the offset is odd and the sin
-    // when the offset is even
-    //
-    vec_uint4 sinMask = (vec_uint4)vec_cmpeq(vec_and(offsetSin,toM128(0x1)),_mm_setzero_ps());
-    vec_uint4 cosMask = (vec_uint4)vec_cmpeq(vec_and(offsetCos,toM128(0x1)),_mm_setzero_ps());    
-    *s = vec_sel(cx,sx,sinMask);
-    *c = vec_sel(cx,sx,cosMask);
-
-    // Flip the sign of the result when (offset mod 4) = 1 or 2
-    //
-    sinMask = vec_cmpeq(vec_and(offsetSin,toM128(0x2)),_mm_setzero_ps());
-    cosMask = vec_cmpeq(vec_and(offsetCos,toM128(0x2)),_mm_setzero_ps());
-    
-    *s = vec_sel((vec_float4)vec_xor(toM128(0x80000000),(vec_uint4)*s),*s,sinMask);
-    *c = vec_sel((vec_float4)vec_xor(toM128(0x80000000),(vec_uint4)*c),*c,cosMask);    
-}
-
-#include "vecidx_aos.h"
-#include "floatInVec.h"
-#include "boolInVec.h"
-
-#ifdef _VECTORMATH_DEBUG
-#include <stdio.h>
-#endif
-namespace Vectormath {
-
-namespace Aos {
-
-//-----------------------------------------------------------------------------
-// Forward Declarations
-//
-
-class Vector3;
-class Vector4;
-class Point3;
-class Quat;
-class Matrix3;
-class Matrix4;
-class Transform3;
-
-// A 3-D vector in array-of-structures format
-//
-class Vector3
-{
-    __m128 mVec128;
-
-public:
-    // Default constructor; does no initialization
-    // 
-    inline Vector3( ) { };
-
-    // Construct a 3-D vector from x, y, and z elements
-    // 
-    inline Vector3( float x, float y, float z );
-
-    // Construct a 3-D vector from x, y, and z elements (scalar data contained in vector data type)
-    // 
-    inline Vector3( const floatInVec &x, const floatInVec &y, const floatInVec &z );
-
-    // Copy elements from a 3-D point into a 3-D vector
-    // 
-    explicit inline Vector3( const Point3 &pnt );
-
-    // Set all elements of a 3-D vector to the same scalar value
-    // 
-    explicit inline Vector3( float scalar );
-
-    // Set all elements of a 3-D vector to the same scalar value (scalar data contained in vector data type)
-    // 
-    explicit inline Vector3( const floatInVec &scalar );
-
-    // Set vector float data in a 3-D vector
-    // 
-    explicit inline Vector3( __m128 vf4 );
-
-    // Get vector float data from a 3-D vector
-    // 
-    inline __m128 get128( ) const;
-
-    // Assign one 3-D vector to another
-    // 
-    inline Vector3 & operator =( const Vector3 &vec );
-
-    // Set the x element of a 3-D vector
-    // 
-    inline Vector3 & setX( float x );
-
-    // Set the y element of a 3-D vector
-    // 
-    inline Vector3 & setY( float y );
-
-    // Set the z element of a 3-D vector
-    // 
-    inline Vector3 & setZ( float z );
-
-    // Set the x element of a 3-D vector (scalar data contained in vector data type)
-    // 
-    inline Vector3 & setX( const floatInVec &x );
-
-    // Set the y element of a 3-D vector (scalar data contained in vector data type)
-    // 
-    inline Vector3 & setY( const floatInVec &y );
-
-    // Set the z element of a 3-D vector (scalar data contained in vector data type)
-    // 
-    inline Vector3 & setZ( const floatInVec &z );
-
-    // Get the x element of a 3-D vector
-    // 
-    inline const floatInVec getX( ) const;
-
-    // Get the y element of a 3-D vector
-    // 
-    inline const floatInVec getY( ) const;
-
-    // Get the z element of a 3-D vector
-    // 
-    inline const floatInVec getZ( ) const;
-
-    // Set an x, y, or z element of a 3-D vector by index
-    // 
-    inline Vector3 & setElem( int idx, float value );
-
-    // Set an x, y, or z element of a 3-D vector by index (scalar data contained in vector data type)
-    // 
-    inline Vector3 & setElem( int idx, const floatInVec &value );
-
-    // Get an x, y, or z element of a 3-D vector by index
-    // 
-    inline const floatInVec getElem( int idx ) const;
-
-    // Subscripting operator to set or get an element
-    // 
-    inline VecIdx operator []( int idx );
-
-    // Subscripting operator to get an element
-    // 
-    inline const floatInVec operator []( int idx ) const;
-
-    // Add two 3-D vectors
-    // 
-    inline const Vector3 operator +( const Vector3 &vec ) const;
-
-    // Subtract a 3-D vector from another 3-D vector
-    // 
-    inline const Vector3 operator -( const Vector3 &vec ) const;
-
-    // Add a 3-D vector to a 3-D point
-    // 
-    inline const Point3 operator +( const Point3 &pnt ) const;
-
-    // Multiply a 3-D vector by a scalar
-    // 
-    inline const Vector3 operator *( float scalar ) const;
-
-    // Divide a 3-D vector by a scalar
-    // 
-    inline const Vector3 operator /( float scalar ) const;
-
-    // Multiply a 3-D vector by a scalar (scalar data contained in vector data type)
-    // 
-    inline const Vector3 operator *( const floatInVec &scalar ) const;
-
-    // Divide a 3-D vector by a scalar (scalar data contained in vector data type)
-    // 
-    inline const Vector3 operator /( const floatInVec &scalar ) const;
-
-    // Perform compound assignment and addition with a 3-D vector
-    // 
-    inline Vector3 & operator +=( const Vector3 &vec );
-
-    // Perform compound assignment and subtraction by a 3-D vector
-    // 
-    inline Vector3 & operator -=( const Vector3 &vec );
-
-    // Perform compound assignment and multiplication by a scalar
-    // 
-    inline Vector3 & operator *=( float scalar );
-
-    // Perform compound assignment and division by a scalar
-    // 
-    inline Vector3 & operator /=( float scalar );
-
-    // Perform compound assignment and multiplication by a scalar (scalar data contained in vector data type)
-    // 
-    inline Vector3 & operator *=( const floatInVec &scalar );
-
-    // Perform compound assignment and division by a scalar (scalar data contained in vector data type)
-    // 
-    inline Vector3 & operator /=( const floatInVec &scalar );
-
-    // Negate all elements of a 3-D vector
-    // 
-    inline const Vector3 operator -( ) const;
-
-    // Construct x axis
-    // 
-    static inline const Vector3 xAxis( );
-
-    // Construct y axis
-    // 
-    static inline const Vector3 yAxis( );
-
-    // Construct z axis
-    // 
-    static inline const Vector3 zAxis( );
-
-};
-
-// Multiply a 3-D vector by a scalar
-// 
-inline const Vector3 operator *( float scalar, const Vector3 &vec );
-
-// Multiply a 3-D vector by a scalar (scalar data contained in vector data type)
-// 
-inline const Vector3 operator *( const floatInVec &scalar, const Vector3 &vec );
-
-// Multiply two 3-D vectors per element
-// 
-inline const Vector3 mulPerElem( const Vector3 &vec0, const Vector3 &vec1 );
-
-// Divide two 3-D vectors per element
-// NOTE: 
-// Floating-point behavior matches standard library function divf4.
-// 
-inline const Vector3 divPerElem( const Vector3 &vec0, const Vector3 &vec1 );
-
-// Compute the reciprocal of a 3-D vector per element
-// NOTE: 
-// Floating-point behavior matches standard library function recipf4.
-// 
-inline const Vector3 recipPerElem( const Vector3 &vec );
-
-// Compute the absolute value of a 3-D vector per element
-// 
-inline const Vector3 absPerElem( const Vector3 &vec );
-
-// Copy sign from one 3-D vector to another, per element
-// 
-inline const Vector3 copySignPerElem( const Vector3 &vec0, const Vector3 &vec1 );
-
-// Maximum of two 3-D vectors per element
-// 
-inline const Vector3 maxPerElem( const Vector3 &vec0, const Vector3 &vec1 );
-
-// Minimum of two 3-D vectors per element
-// 
-inline const Vector3 minPerElem( const Vector3 &vec0, const Vector3 &vec1 );
-
-// Maximum element of a 3-D vector
-// 
-inline const floatInVec maxElem( const Vector3 &vec );
-
-// Minimum element of a 3-D vector
-// 
-inline const floatInVec minElem( const Vector3 &vec );
-
-// Compute the sum of all elements of a 3-D vector
-// 
-inline const floatInVec sum( const Vector3 &vec );
-
-// Compute the dot product of two 3-D vectors
-// 
-inline const floatInVec dot( const Vector3 &vec0, const Vector3 &vec1 );
-
-// Compute the square of the length of a 3-D vector
-// 
-inline const floatInVec lengthSqr( const Vector3 &vec );
-
-// Compute the length of a 3-D vector
-// 
-inline const floatInVec length( const Vector3 &vec );
-
-// Normalize a 3-D vector
-// NOTE: 
-// The result is unpredictable when all elements of vec are at or near zero.
-// 
-inline const Vector3 normalize( const Vector3 &vec );
-
-// Compute cross product of two 3-D vectors
-// 
-inline const Vector3 cross( const Vector3 &vec0, const Vector3 &vec1 );
-
-// Outer product of two 3-D vectors
-// 
-inline const Matrix3 outer( const Vector3 &vec0, const Vector3 &vec1 );
-
-// Pre-multiply a row vector by a 3x3 matrix
-// NOTE: 
-// Slower than column post-multiply.
-// 
-inline const Vector3 rowMul( const Vector3 &vec, const Matrix3 & mat );
-
-// Cross-product matrix of a 3-D vector
-// 
-inline const Matrix3 crossMatrix( const Vector3 &vec );
-
-// Create cross-product matrix and multiply
-// NOTE: 
-// Faster than separately creating a cross-product matrix and multiplying.
-// 
-inline const Matrix3 crossMatrixMul( const Vector3 &vec, const Matrix3 & mat );
-
-// Linear interpolation between two 3-D vectors
-// NOTE: 
-// Does not clamp t between 0 and 1.
-// 
-inline const Vector3 lerp( float t, const Vector3 &vec0, const Vector3 &vec1 );
-
-// Linear interpolation between two 3-D vectors (scalar data contained in vector data type)
-// NOTE: 
-// Does not clamp t between 0 and 1.
-// 
-inline const Vector3 lerp( const floatInVec &t, const Vector3 &vec0, const Vector3 &vec1 );
-
-// Spherical linear interpolation between two 3-D vectors
-// NOTE: 
-// The result is unpredictable if the vectors point in opposite directions.
-// Does not clamp t between 0 and 1.
-// 
-inline const Vector3 slerp( float t, const Vector3 &unitVec0, const Vector3 &unitVec1 );
-
-// Spherical linear interpolation between two 3-D vectors (scalar data contained in vector data type)
-// NOTE: 
-// The result is unpredictable if the vectors point in opposite directions.
-// Does not clamp t between 0 and 1.
-// 
-inline const Vector3 slerp( const floatInVec &t, const Vector3 &unitVec0, const Vector3 &unitVec1 );
-
-// Conditionally select between two 3-D vectors
-// NOTE: 
-// This function uses a conditional select instruction to avoid a branch.
-// However, the transfer of select1 to a VMX register may use more processing time than a branch.
-// Use the boolInVec version for better performance.
-// 
-inline const Vector3 select( const Vector3 &vec0, const Vector3 &vec1, bool select1 );
-
-// Conditionally select between two 3-D vectors (scalar data contained in vector data type)
-// NOTE: 
-// This function uses a conditional select instruction to avoid a branch.
-// 
-inline const Vector3 select( const Vector3 &vec0, const Vector3 &vec1, const boolInVec &select1 );
-
-// Store x, y, and z elements of 3-D vector in first three words of a quadword, preserving fourth word
-// 
-inline void storeXYZ( const Vector3 &vec, __m128 * quad );
-
-// Load four three-float 3-D vectors, stored in three quadwords
-// 
-inline void loadXYZArray( Vector3 & vec0, Vector3 & vec1, Vector3 & vec2, Vector3 & vec3, const __m128 * threeQuads );
-
-// Store four 3-D vectors in three quadwords
-// 
-inline void storeXYZArray( const Vector3 &vec0, const Vector3 &vec1, const Vector3 &vec2, const Vector3 &vec3, __m128 * threeQuads );
-
-// Store eight 3-D vectors as half-floats
-// 
-inline void storeHalfFloats( const Vector3 &vec0, const Vector3 &vec1, const Vector3 &vec2, const Vector3 &vec3, const Vector3 &vec4, const Vector3 &vec5, const Vector3 &vec6, const Vector3 &vec7, vec_ushort8 * threeQuads );
-
-#ifdef _VECTORMATH_DEBUG
-
-// Print a 3-D vector
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-// 
-inline void print( const Vector3 &vec );
-
-// Print a 3-D vector and an associated string identifier
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-// 
-inline void print( const Vector3 &vec, const char * name );
-
-#endif
-
-// A 4-D vector in array-of-structures format
-//
-class Vector4
-{
-    __m128 mVec128;
-
-public:
-    // Default constructor; does no initialization
-    // 
-    inline Vector4( ) { };
-
-    // Construct a 4-D vector from x, y, z, and w elements
-    // 
-    inline Vector4( float x, float y, float z, float w );
-
-    // Construct a 4-D vector from x, y, z, and w elements (scalar data contained in vector data type)
-    // 
-    inline Vector4( const floatInVec &x, const floatInVec &y, const floatInVec &z, const floatInVec &w );
-
-    // Construct a 4-D vector from a 3-D vector and a scalar
-    // 
-    inline Vector4( const Vector3 &xyz, float w );
-
-    // Construct a 4-D vector from a 3-D vector and a scalar (scalar data contained in vector data type)
-    // 
-    inline Vector4( const Vector3 &xyz, const floatInVec &w );
-
-    // Copy x, y, and z from a 3-D vector into a 4-D vector, and set w to 0
-    // 
-    explicit inline Vector4( const Vector3 &vec );
-
-    // Copy x, y, and z from a 3-D point into a 4-D vector, and set w to 1
-    // 
-    explicit inline Vector4( const Point3 &pnt );
-
-    // Copy elements from a quaternion into a 4-D vector
-    // 
-    explicit inline Vector4( const Quat &quat );
-
-    // Set all elements of a 4-D vector to the same scalar value
-    // 
-    explicit inline Vector4( float scalar );
-
-    // Set all elements of a 4-D vector to the same scalar value (scalar data contained in vector data type)
-    // 
-    explicit inline Vector4( const floatInVec &scalar );
-
-    // Set vector float data in a 4-D vector
-    // 
-    explicit inline Vector4( __m128 vf4 );
-
-    // Get vector float data from a 4-D vector
-    // 
-    inline __m128 get128( ) const;
-
-    // Assign one 4-D vector to another
-    // 
-    inline Vector4 & operator =( const Vector4 &vec );
-
-    // Set the x, y, and z elements of a 4-D vector
-    // NOTE: 
-    // This function does not change the w element.
-    // 
-    inline Vector4 & setXYZ( const Vector3 &vec );
-
-    // Get the x, y, and z elements of a 4-D vector
-    // 
-    inline const Vector3 getXYZ( ) const;
-
-    // Set the x element of a 4-D vector
-    // 
-    inline Vector4 & setX( float x );
-
-    // Set the y element of a 4-D vector
-    // 
-    inline Vector4 & setY( float y );
-
-    // Set the z element of a 4-D vector
-    // 
-    inline Vector4 & setZ( float z );
-
-    // Set the w element of a 4-D vector
-    // 
-    inline Vector4 & setW( float w );
-
-    // Set the x element of a 4-D vector (scalar data contained in vector data type)
-    // 
-    inline Vector4 & setX( const floatInVec &x );
-
-    // Set the y element of a 4-D vector (scalar data contained in vector data type)
-    // 
-    inline Vector4 & setY( const floatInVec &y );
-
-    // Set the z element of a 4-D vector (scalar data contained in vector data type)
-    // 
-    inline Vector4 & setZ( const floatInVec &z );
-
-    // Set the w element of a 4-D vector (scalar data contained in vector data type)
-    // 
-    inline Vector4 & setW( const floatInVec &w );
-
-    // Get the x element of a 4-D vector
-    // 
-    inline const floatInVec getX( ) const;
-
-    // Get the y element of a 4-D vector
-    // 
-    inline const floatInVec getY( ) const;
-
-    // Get the z element of a 4-D vector
-    // 
-    inline const floatInVec getZ( ) const;
-
-    // Get the w element of a 4-D vector
-    // 
-    inline const floatInVec getW( ) const;
-
-    // Set an x, y, z, or w element of a 4-D vector by index
-    // 
-    inline Vector4 & setElem( int idx, float value );
-
-    // Set an x, y, z, or w element of a 4-D vector by index (scalar data contained in vector data type)
-    // 
-    inline Vector4 & setElem( int idx, const floatInVec &value );
-
-    // Get an x, y, z, or w element of a 4-D vector by index
-    // 
-    inline const floatInVec getElem( int idx ) const;
-
-    // Subscripting operator to set or get an element
-    // 
-    inline VecIdx operator []( int idx );
-
-    // Subscripting operator to get an element
-    // 
-    inline const floatInVec operator []( int idx ) const;
-
-    // Add two 4-D vectors
-    // 
-    inline const Vector4 operator +( const Vector4 &vec ) const;
-
-    // Subtract a 4-D vector from another 4-D vector
-    // 
-    inline const Vector4 operator -( const Vector4 &vec ) const;
-
-    // Multiply a 4-D vector by a scalar
-    // 
-    inline const Vector4 operator *( float scalar ) const;
-
-    // Divide a 4-D vector by a scalar
-    // 
-    inline const Vector4 operator /( float scalar ) const;
-
-    // Multiply a 4-D vector by a scalar (scalar data contained in vector data type)
-    // 
-    inline const Vector4 operator *( const floatInVec &scalar ) const;
-
-    // Divide a 4-D vector by a scalar (scalar data contained in vector data type)
-    // 
-    inline const Vector4 operator /( const floatInVec &scalar ) const;
-
-    // Perform compound assignment and addition with a 4-D vector
-    // 
-    inline Vector4 & operator +=( const Vector4 &vec );
-
-    // Perform compound assignment and subtraction by a 4-D vector
-    // 
-    inline Vector4 & operator -=( const Vector4 &vec );
-
-    // Perform compound assignment and multiplication by a scalar
-    // 
-    inline Vector4 & operator *=( float scalar );
-
-    // Perform compound assignment and division by a scalar
-    // 
-    inline Vector4 & operator /=( float scalar );
-
-    // Perform compound assignment and multiplication by a scalar (scalar data contained in vector data type)
-    // 
-    inline Vector4 & operator *=( const floatInVec &scalar );
-
-    // Perform compound assignment and division by a scalar (scalar data contained in vector data type)
-    // 
-    inline Vector4 & operator /=( const floatInVec &scalar );
-
-    // Negate all elements of a 4-D vector
-    // 
-    inline const Vector4 operator -( ) const;
-
-    // Construct x axis
-    // 
-    static inline const Vector4 xAxis( );
-
-    // Construct y axis
-    // 
-    static inline const Vector4 yAxis( );
-
-    // Construct z axis
-    // 
-    static inline const Vector4 zAxis( );
-
-    // Construct w axis
-    // 
-    static inline const Vector4 wAxis( );
-
-};
-
-// Multiply a 4-D vector by a scalar
-// 
-inline const Vector4 operator *( float scalar, const Vector4 &vec );
-
-// Multiply a 4-D vector by a scalar (scalar data contained in vector data type)
-// 
-inline const Vector4 operator *( const floatInVec &scalar, const Vector4 &vec );
-
-// Multiply two 4-D vectors per element
-// 
-inline const Vector4 mulPerElem( const Vector4 &vec0, const Vector4 &vec1 );
-
-// Divide two 4-D vectors per element
-// NOTE: 
-// Floating-point behavior matches standard library function divf4.
-// 
-inline const Vector4 divPerElem( const Vector4 &vec0, const Vector4 &vec1 );
-
-// Compute the reciprocal of a 4-D vector per element
-// NOTE: 
-// Floating-point behavior matches standard library function recipf4.
-// 
-inline const Vector4 recipPerElem( const Vector4 &vec );
-
-// Compute the absolute value of a 4-D vector per element
-// 
-inline const Vector4 absPerElem( const Vector4 &vec );
-
-// Copy sign from one 4-D vector to another, per element
-// 
-inline const Vector4 copySignPerElem( const Vector4 &vec0, const Vector4 &vec1 );
-
-// Maximum of two 4-D vectors per element
-// 
-inline const Vector4 maxPerElem( const Vector4 &vec0, const Vector4 &vec1 );
-
-// Minimum of two 4-D vectors per element
-// 
-inline const Vector4 minPerElem( const Vector4 &vec0, const Vector4 &vec1 );
-
-// Maximum element of a 4-D vector
-// 
-inline const floatInVec maxElem( const Vector4 &vec );
-
-// Minimum element of a 4-D vector
-// 
-inline const floatInVec minElem( const Vector4 &vec );
-
-// Compute the sum of all elements of a 4-D vector
-// 
-inline const floatInVec sum( const Vector4 &vec );
-
-// Compute the dot product of two 4-D vectors
-// 
-inline const floatInVec dot( const Vector4 &vec0, const Vector4 &vec1 );
-
-// Compute the square of the length of a 4-D vector
-// 
-inline const floatInVec lengthSqr( const Vector4 &vec );
-
-// Compute the length of a 4-D vector
-// 
-inline const floatInVec length( const Vector4 &vec );
-
-// Normalize a 4-D vector
-// NOTE: 
-// The result is unpredictable when all elements of vec are at or near zero.
-// 
-inline const Vector4 normalize( const Vector4 &vec );
-
-// Outer product of two 4-D vectors
-// 
-inline const Matrix4 outer( const Vector4 &vec0, const Vector4 &vec1 );
-
-// Linear interpolation between two 4-D vectors
-// NOTE: 
-// Does not clamp t between 0 and 1.
-// 
-inline const Vector4 lerp( float t, const Vector4 &vec0, const Vector4 &vec1 );
-
-// Linear interpolation between two 4-D vectors (scalar data contained in vector data type)
-// NOTE: 
-// Does not clamp t between 0 and 1.
-// 
-inline const Vector4 lerp( const floatInVec &t, const Vector4 &vec0, const Vector4 &vec1 );
-
-// Spherical linear interpolation between two 4-D vectors
-// NOTE: 
-// The result is unpredictable if the vectors point in opposite directions.
-// Does not clamp t between 0 and 1.
-// 
-inline const Vector4 slerp( float t, const Vector4 &unitVec0, const Vector4 &unitVec1 );
-
-// Spherical linear interpolation between two 4-D vectors (scalar data contained in vector data type)
-// NOTE: 
-// The result is unpredictable if the vectors point in opposite directions.
-// Does not clamp t between 0 and 1.
-// 
-inline const Vector4 slerp( const floatInVec &t, const Vector4 &unitVec0, const Vector4 &unitVec1 );
-
-// Conditionally select between two 4-D vectors
-// NOTE: 
-// This function uses a conditional select instruction to avoid a branch.
-// However, the transfer of select1 to a VMX register may use more processing time than a branch.
-// Use the boolInVec version for better performance.
-// 
-inline const Vector4 select( const Vector4 &vec0, const Vector4 &vec1, bool select1 );
-
-// Conditionally select between two 4-D vectors (scalar data contained in vector data type)
-// NOTE: 
-// This function uses a conditional select instruction to avoid a branch.
-// 
-inline const Vector4 select( const Vector4 &vec0, const Vector4 &vec1, const boolInVec &select1 );
-
-// Store four 4-D vectors as half-floats
-// 
-inline void storeHalfFloats( const Vector4 &vec0, const Vector4 &vec1, const Vector4 &vec2, const Vector4 &vec3, vec_ushort8 * twoQuads );
-
-#ifdef _VECTORMATH_DEBUG
-
-// Print a 4-D vector
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-// 
-inline void print( const Vector4 &vec );
-
-// Print a 4-D vector and an associated string identifier
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-// 
-inline void print( const Vector4 &vec, const char * name );
-
-#endif
-
-// A 3-D point in array-of-structures format
-//
-class Point3
-{
-    __m128 mVec128;
-
-public:
-    // Default constructor; does no initialization
-    // 
-    inline Point3( ) { };
-
-    // Construct a 3-D point from x, y, and z elements
-    // 
-    inline Point3( float x, float y, float z );
-
-    // Construct a 3-D point from x, y, and z elements (scalar data contained in vector data type)
-    // 
-    inline Point3( const floatInVec &x, const floatInVec &y, const floatInVec &z );
-
-    // Copy elements from a 3-D vector into a 3-D point
-    // 
-    explicit inline Point3( const Vector3 &vec );
-
-    // Set all elements of a 3-D point to the same scalar value
-    // 
-    explicit inline Point3( float scalar );
-
-    // Set all elements of a 3-D point to the same scalar value (scalar data contained in vector data type)
-    // 
-    explicit inline Point3( const floatInVec &scalar );
-
-    // Set vector float data in a 3-D point
-    // 
-    explicit inline Point3( __m128 vf4 );
-
-    // Get vector float data from a 3-D point
-    // 
-    inline __m128 get128( ) const;
-
-    // Assign one 3-D point to another
-    // 
-    inline Point3 & operator =( const Point3 &pnt );
-
-    // Set the x element of a 3-D point
-    // 
-    inline Point3 & setX( float x );
-
-    // Set the y element of a 3-D point
-    // 
-    inline Point3 & setY( float y );
-
-    // Set the z element of a 3-D point
-    // 
-    inline Point3 & setZ( float z );
-
-    // Set the x element of a 3-D point (scalar data contained in vector data type)
-    // 
-    inline Point3 & setX( const floatInVec &x );
-
-    // Set the y element of a 3-D point (scalar data contained in vector data type)
-    // 
-    inline Point3 & setY( const floatInVec &y );
-
-    // Set the z element of a 3-D point (scalar data contained in vector data type)
-    // 
-    inline Point3 & setZ( const floatInVec &z );
-
-    // Get the x element of a 3-D point
-    // 
-    inline const floatInVec getX( ) const;
-
-    // Get the y element of a 3-D point
-    // 
-    inline const floatInVec getY( ) const;
-
-    // Get the z element of a 3-D point
-    // 
-    inline const floatInVec getZ( ) const;
-
-    // Set an x, y, or z element of a 3-D point by index
-    // 
-    inline Point3 & setElem( int idx, float value );
-
-    // Set an x, y, or z element of a 3-D point by index (scalar data contained in vector data type)
-    // 
-    inline Point3 & setElem( int idx, const floatInVec &value );
-
-    // Get an x, y, or z element of a 3-D point by index
-    // 
-    inline const floatInVec getElem( int idx ) const;
-
-    // Subscripting operator to set or get an element
-    // 
-    inline VecIdx operator []( int idx );
-
-    // Subscripting operator to get an element
-    // 
-    inline const floatInVec operator []( int idx ) const;
-
-    // Subtract a 3-D point from another 3-D point
-    // 
-    inline const Vector3 operator -( const Point3 &pnt ) const;
-
-    // Add a 3-D point to a 3-D vector
-    // 
-    inline const Point3 operator +( const Vector3 &vec ) const;
-
-    // Subtract a 3-D vector from a 3-D point
-    // 
-    inline const Point3 operator -( const Vector3 &vec ) const;
-
-    // Perform compound assignment and addition with a 3-D vector
-    // 
-    inline Point3 & operator +=( const Vector3 &vec );
-
-    // Perform compound assignment and subtraction by a 3-D vector
-    // 
-    inline Point3 & operator -=( const Vector3 &vec );
-
-};
-
-// Multiply two 3-D points per element
-// 
-inline const Point3 mulPerElem( const Point3 &pnt0, const Point3 &pnt1 );
-
-// Divide two 3-D points per element
-// NOTE: 
-// Floating-point behavior matches standard library function divf4.
-// 
-inline const Point3 divPerElem( const Point3 &pnt0, const Point3 &pnt1 );
-
-// Compute the reciprocal of a 3-D point per element
-// NOTE: 
-// Floating-point behavior matches standard library function recipf4.
-// 
-inline const Point3 recipPerElem( const Point3 &pnt );
-
-// Compute the absolute value of a 3-D point per element
-// 
-inline const Point3 absPerElem( const Point3 &pnt );
-
-// Copy sign from one 3-D point to another, per element
-// 
-inline const Point3 copySignPerElem( const Point3 &pnt0, const Point3 &pnt1 );
-
-// Maximum of two 3-D points per element
-// 
-inline const Point3 maxPerElem( const Point3 &pnt0, const Point3 &pnt1 );
-
-// Minimum of two 3-D points per element
-// 
-inline const Point3 minPerElem( const Point3 &pnt0, const Point3 &pnt1 );
-
-// Maximum element of a 3-D point
-// 
-inline const floatInVec maxElem( const Point3 &pnt );
-
-// Minimum element of a 3-D point
-// 
-inline const floatInVec minElem( const Point3 &pnt );
-
-// Compute the sum of all elements of a 3-D point
-// 
-inline const floatInVec sum( const Point3 &pnt );
-
-// Apply uniform scale to a 3-D point
-// 
-inline const Point3 scale( const Point3 &pnt, float scaleVal );
-
-// Apply uniform scale to a 3-D point (scalar data contained in vector data type)
-// 
-inline const Point3 scale( const Point3 &pnt, const floatInVec &scaleVal );
-
-// Apply non-uniform scale to a 3-D point
-// 
-inline const Point3 scale( const Point3 &pnt, const Vector3 &scaleVec );
-
-// Scalar projection of a 3-D point on a unit-length 3-D vector
-// 
-inline const floatInVec projection( const Point3 &pnt, const Vector3 &unitVec );
-
-// Compute the square of the distance of a 3-D point from the coordinate-system origin
-// 
-inline const floatInVec distSqrFromOrigin( const Point3 &pnt );
-
-// Compute the distance of a 3-D point from the coordinate-system origin
-// 
-inline const floatInVec distFromOrigin( const Point3 &pnt );
-
-// Compute the square of the distance between two 3-D points
-// 
-inline const floatInVec distSqr( const Point3 &pnt0, const Point3 &pnt1 );
-
-// Compute the distance between two 3-D points
-// 
-inline const floatInVec dist( const Point3 &pnt0, const Point3 &pnt1 );
-
-// Linear interpolation between two 3-D points
-// NOTE: 
-// Does not clamp t between 0 and 1.
-// 
-inline const Point3 lerp( float t, const Point3 &pnt0, const Point3 &pnt1 );
-
-// Linear interpolation between two 3-D points (scalar data contained in vector data type)
-// NOTE: 
-// Does not clamp t between 0 and 1.
-// 
-inline const Point3 lerp( const floatInVec &t, const Point3 &pnt0, const Point3 &pnt1 );
-
-// Conditionally select between two 3-D points
-// NOTE: 
-// This function uses a conditional select instruction to avoid a branch.
-// However, the transfer of select1 to a VMX register may use more processing time than a branch.
-// Use the boolInVec version for better performance.
-// 
-inline const Point3 select( const Point3 &pnt0, const Point3 &pnt1, bool select1 );
-
-// Conditionally select between two 3-D points (scalar data contained in vector data type)
-// NOTE: 
-// This function uses a conditional select instruction to avoid a branch.
-// 
-inline const Point3 select( const Point3 &pnt0, const Point3 &pnt1, const boolInVec &select1 );
-
-// Store x, y, and z elements of 3-D point in first three words of a quadword, preserving fourth word
-// 
-inline void storeXYZ( const Point3 &pnt, __m128 * quad );
-
-// Load four three-float 3-D points, stored in three quadwords
-// 
-inline void loadXYZArray( Point3 & pnt0, Point3 & pnt1, Point3 & pnt2, Point3 & pnt3, const __m128 * threeQuads );
-
-// Store four 3-D points in three quadwords
-// 
-inline void storeXYZArray( const Point3 &pnt0, const Point3 &pnt1, const Point3 &pnt2, const Point3 &pnt3, __m128 * threeQuads );
-
-// Store eight 3-D points as half-floats
-// 
-inline void storeHalfFloats( const Point3 &pnt0, const Point3 &pnt1, const Point3 &pnt2, const Point3 &pnt3, const Point3 &pnt4, const Point3 &pnt5, const Point3 &pnt6, const Point3 &pnt7, vec_ushort8 * threeQuads );
-
-#ifdef _VECTORMATH_DEBUG
-
-// Print a 3-D point
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-// 
-inline void print( const Point3 &pnt );
-
-// Print a 3-D point and an associated string identifier
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-// 
-inline void print( const Point3 &pnt, const char * name );
-
-#endif
-
-// A quaternion in array-of-structures format
-//
-class Quat
-{
-    __m128 mVec128;
-
-public:
-    // Default constructor; does no initialization
-    // 
-    inline Quat( ) { };
-
-    // Construct a quaternion from x, y, z, and w elements
-    // 
-    inline Quat( float x, float y, float z, float w );
-
-    // Construct a quaternion from x, y, z, and w elements (scalar data contained in vector data type)
-    // 
-    inline Quat( const floatInVec &x, const floatInVec &y, const floatInVec &z, const floatInVec &w );
-
-    // Construct a quaternion from a 3-D vector and a scalar
-    // 
-    inline Quat( const Vector3 &xyz, float w );
-
-    // Construct a quaternion from a 3-D vector and a scalar (scalar data contained in vector data type)
-    // 
-    inline Quat( const Vector3 &xyz, const floatInVec &w );
-
-    // Copy elements from a 4-D vector into a quaternion
-    // 
-    explicit inline Quat( const Vector4 &vec );
-
-    // Convert a rotation matrix to a unit-length quaternion
-    // 
-    explicit inline Quat( const Matrix3 & rotMat );
-
-    // Set all elements of a quaternion to the same scalar value
-    // 
-    explicit inline Quat( float scalar );
-
-    // Set all elements of a quaternion to the same scalar value (scalar data contained in vector data type)
-    // 
-    explicit inline Quat( const floatInVec &scalar );
-
-    // Set vector float data in a quaternion
-    // 
-    explicit inline Quat( __m128 vf4 );
-
-    // Get vector float data from a quaternion
-    // 
-    inline __m128 get128( ) const;
-
-    // Assign one quaternion to another
-    // 
-    inline Quat & operator =( const Quat &quat );
-
-    // Set the x, y, and z elements of a quaternion
-    // NOTE: 
-    // This function does not change the w element.
-    // 
-    inline Quat & setXYZ( const Vector3 &vec );
-
-    // Get the x, y, and z elements of a quaternion
-    // 
-    inline const Vector3 getXYZ( ) const;
-
-    // Set the x element of a quaternion
-    // 
-    inline Quat & setX( float x );
-
-    // Set the y element of a quaternion
-    // 
-    inline Quat & setY( float y );
-
-    // Set the z element of a quaternion
-    // 
-    inline Quat & setZ( float z );
-
-    // Set the w element of a quaternion
-    // 
-    inline Quat & setW( float w );
-
-    // Set the x element of a quaternion (scalar data contained in vector data type)
-    // 
-    inline Quat & setX( const floatInVec &x );
-
-    // Set the y element of a quaternion (scalar data contained in vector data type)
-    // 
-    inline Quat & setY( const floatInVec &y );
-
-    // Set the z element of a quaternion (scalar data contained in vector data type)
-    // 
-    inline Quat & setZ( const floatInVec &z );
-
-    // Set the w element of a quaternion (scalar data contained in vector data type)
-    // 
-    inline Quat & setW( const floatInVec &w );
-
-    // Get the x element of a quaternion
-    // 
-    inline const floatInVec getX( ) const;
-
-    // Get the y element of a quaternion
-    // 
-    inline const floatInVec getY( ) const;
-
-    // Get the z element of a quaternion
-    // 
-    inline const floatInVec getZ( ) const;
-
-    // Get the w element of a quaternion
-    // 
-    inline const floatInVec getW( ) const;
-
-    // Set an x, y, z, or w element of a quaternion by index
-    // 
-    inline Quat & setElem( int idx, float value );
-
-    // Set an x, y, z, or w element of a quaternion by index (scalar data contained in vector data type)
-    // 
-    inline Quat & setElem( int idx, const floatInVec &value );
-
-    // Get an x, y, z, or w element of a quaternion by index
-    // 
-    inline const floatInVec getElem( int idx ) const;
-
-    // Subscripting operator to set or get an element
-    // 
-    inline VecIdx operator []( int idx );
-
-    // Subscripting operator to get an element
-    // 
-    inline const floatInVec operator []( int idx ) const;
-
-    // Add two quaternions
-    // 
-    inline const Quat operator +( const Quat &quat ) const;
-
-    // Subtract a quaternion from another quaternion
-    // 
-    inline const Quat operator -( const Quat &quat ) const;
-
-    // Multiply two quaternions
-    // 
-    inline const Quat operator *( const Quat &quat ) const;
-
-    // Multiply a quaternion by a scalar
-    // 
-    inline const Quat operator *( float scalar ) const;
-
-    // Divide a quaternion by a scalar
-    // 
-    inline const Quat operator /( float scalar ) const;
-
-    // Multiply a quaternion by a scalar (scalar data contained in vector data type)
-    // 
-    inline const Quat operator *( const floatInVec &scalar ) const;
-
-    // Divide a quaternion by a scalar (scalar data contained in vector data type)
-    // 
-    inline const Quat operator /( const floatInVec &scalar ) const;
-
-    // Perform compound assignment and addition with a quaternion
-    // 
-    inline Quat & operator +=( const Quat &quat );
-
-    // Perform compound assignment and subtraction by a quaternion
-    // 
-    inline Quat & operator -=( const Quat &quat );
-
-    // Perform compound assignment and multiplication by a quaternion
-    // 
-    inline Quat & operator *=( const Quat &quat );
-
-    // Perform compound assignment and multiplication by a scalar
-    // 
-    inline Quat & operator *=( float scalar );
-
-    // Perform compound assignment and division by a scalar
-    // 
-    inline Quat & operator /=( float scalar );
-
-    // Perform compound assignment and multiplication by a scalar (scalar data contained in vector data type)
-    // 
-    inline Quat & operator *=( const floatInVec &scalar );
-
-    // Perform compound assignment and division by a scalar (scalar data contained in vector data type)
-    // 
-    inline Quat & operator /=( const floatInVec &scalar );
-
-    // Negate all elements of a quaternion
-    // 
-    inline const Quat operator -( ) const;
-
-    // Construct an identity quaternion
-    // 
-    static inline const Quat identity( );
-
-    // Construct a quaternion to rotate between two unit-length 3-D vectors
-    // NOTE: 
-    // The result is unpredictable if unitVec0 and unitVec1 point in opposite directions.
-    // 
-    static inline const Quat rotation( const Vector3 &unitVec0, const Vector3 &unitVec1 );
-
-    // Construct a quaternion to rotate around a unit-length 3-D vector
-    // 
-    static inline const Quat rotation( float radians, const Vector3 &unitVec );
-
-    // Construct a quaternion to rotate around a unit-length 3-D vector (scalar data contained in vector data type)
-    // 
-    static inline const Quat rotation( const floatInVec &radians, const Vector3 &unitVec );
-
-    // Construct a quaternion to rotate around the x axis
-    // 
-    static inline const Quat rotationX( float radians );
-
-    // Construct a quaternion to rotate around the y axis
-    // 
-    static inline const Quat rotationY( float radians );
-
-    // Construct a quaternion to rotate around the z axis
-    // 
-    static inline const Quat rotationZ( float radians );
-
-    // Construct a quaternion to rotate around the x axis (scalar data contained in vector data type)
-    // 
-    static inline const Quat rotationX( const floatInVec &radians );
-
-    // Construct a quaternion to rotate around the y axis (scalar data contained in vector data type)
-    // 
-    static inline const Quat rotationY( const floatInVec &radians );
-
-    // Construct a quaternion to rotate around the z axis (scalar data contained in vector data type)
-    // 
-    static inline const Quat rotationZ( const floatInVec &radians );
-
-};
-
-// Multiply a quaternion by a scalar
-// 
-inline const Quat operator *( float scalar, const Quat &quat );
-
-// Multiply a quaternion by a scalar (scalar data contained in vector data type)
-// 
-inline const Quat operator *( const floatInVec &scalar, const Quat &quat );
-
-// Compute the conjugate of a quaternion
-// 
-inline const Quat conj( const Quat &quat );
-
-// Use a unit-length quaternion to rotate a 3-D vector
-// 
-inline const Vector3 rotate( const Quat &unitQuat, const Vector3 &vec );
-
-// Compute the dot product of two quaternions
-// 
-inline const floatInVec dot( const Quat &quat0, const Quat &quat1 );
-
-// Compute the norm of a quaternion
-// 
-inline const floatInVec norm( const Quat &quat );
-
-// Compute the length of a quaternion
-// 
-inline const floatInVec length( const Quat &quat );
-
-// Normalize a quaternion
-// NOTE: 
-// The result is unpredictable when all elements of quat are at or near zero.
-// 
-inline const Quat normalize( const Quat &quat );
-
-// Linear interpolation between two quaternions
-// NOTE: 
-// Does not clamp t between 0 and 1.
-// 
-inline const Quat lerp( float t, const Quat &quat0, const Quat &quat1 );
-
-// Linear interpolation between two quaternions (scalar data contained in vector data type)
-// NOTE: 
-// Does not clamp t between 0 and 1.
-// 
-inline const Quat lerp( const floatInVec &t, const Quat &quat0, const Quat &quat1 );
-
-// Spherical linear interpolation between two quaternions
-// NOTE: 
-// Interpolates along the shortest path between orientations.
-// Does not clamp t between 0 and 1.
-// 
-inline const Quat slerp( float t, const Quat &unitQuat0, const Quat &unitQuat1 );
-
-// Spherical linear interpolation between two quaternions (scalar data contained in vector data type)
-// NOTE: 
-// Interpolates along the shortest path between orientations.
-// Does not clamp t between 0 and 1.
-// 
-inline const Quat slerp( const floatInVec &t, const Quat &unitQuat0, const Quat &unitQuat1 );
-
-// Spherical quadrangle interpolation
-// 
-inline const Quat squad( float t, const Quat &unitQuat0, const Quat &unitQuat1, const Quat &unitQuat2, const Quat &unitQuat3 );
-
-// Spherical quadrangle interpolation (scalar data contained in vector data type)
-// 
-inline const Quat squad( const floatInVec &t, const Quat &unitQuat0, const Quat &unitQuat1, const Quat &unitQuat2, const Quat &unitQuat3 );
-
-// Conditionally select between two quaternions
-// NOTE: 
-// This function uses a conditional select instruction to avoid a branch.
-// However, the transfer of select1 to a VMX register may use more processing time than a branch.
-// Use the boolInVec version for better performance.
-// 
-inline const Quat select( const Quat &quat0, const Quat &quat1, bool select1 );
-
-// Conditionally select between two quaternions (scalar data contained in vector data type)
-// NOTE: 
-// This function uses a conditional select instruction to avoid a branch.
-// 
-inline const Quat select( const Quat &quat0, const Quat &quat1, const boolInVec &select1 );
-
-#ifdef _VECTORMATH_DEBUG
-
-// Print a quaternion
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-// 
-inline void print( const Quat &quat );
-
-// Print a quaternion and an associated string identifier
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-// 
-inline void print( const Quat &quat, const char * name );
-
-#endif
-
-// A 3x3 matrix in array-of-structures format
-//
-class Matrix3
-{
-    Vector3 mCol0;
-    Vector3 mCol1;
-    Vector3 mCol2;
-
-public:
-    // Default constructor; does no initialization
-    // 
-    inline Matrix3( ) { };
-
-    // Copy a 3x3 matrix
-    // 
-    inline Matrix3( const Matrix3 & mat );
-
-    // Construct a 3x3 matrix containing the specified columns
-    // 
-    inline Matrix3( const Vector3 &col0, const Vector3 &col1, const Vector3 &col2 );
-
-    // Construct a 3x3 rotation matrix from a unit-length quaternion
-    // 
-    explicit inline Matrix3( const Quat &unitQuat );
-
-    // Set all elements of a 3x3 matrix to the same scalar value
-    // 
-    explicit inline Matrix3( float scalar );
-
-    // Set all elements of a 3x3 matrix to the same scalar value (scalar data contained in vector data type)
-    // 
-    explicit inline Matrix3( const floatInVec &scalar );
-
-    // Assign one 3x3 matrix to another
-    // 
-    inline Matrix3 & operator =( const Matrix3 & mat );
-
-    // Set column 0 of a 3x3 matrix
-    // 
-    inline Matrix3 & setCol0( const Vector3 &col0 );
-
-    // Set column 1 of a 3x3 matrix
-    // 
-    inline Matrix3 & setCol1( const Vector3 &col1 );
-
-    // Set column 2 of a 3x3 matrix
-    // 
-    inline Matrix3 & setCol2( const Vector3 &col2 );
-
-    // Get column 0 of a 3x3 matrix
-    // 
-    inline const Vector3 getCol0( ) const;
-
-    // Get column 1 of a 3x3 matrix
-    // 
-    inline const Vector3 getCol1( ) const;
-
-    // Get column 2 of a 3x3 matrix
-    // 
-    inline const Vector3 getCol2( ) const;
-
-    // Set the column of a 3x3 matrix referred to by the specified index
-    // 
-    inline Matrix3 & setCol( int col, const Vector3 &vec );
-
-    // Set the row of a 3x3 matrix referred to by the specified index
-    // 
-    inline Matrix3 & setRow( int row, const Vector3 &vec );
-
-    // Get the column of a 3x3 matrix referred to by the specified index
-    // 
-    inline const Vector3 getCol( int col ) const;
-
-    // Get the row of a 3x3 matrix referred to by the specified index
-    // 
-    inline const Vector3 getRow( int row ) const;
-
-    // Subscripting operator to set or get a column
-    // 
-    inline Vector3 & operator []( int col );
-
-    // Subscripting operator to get a column
-    // 
-    inline const Vector3 operator []( int col ) const;
-
-    // Set the element of a 3x3 matrix referred to by column and row indices
-    // 
-    inline Matrix3 & setElem( int col, int row, float val );
-
-    // Set the element of a 3x3 matrix referred to by column and row indices (scalar data contained in vector data type)
-    // 
-    inline Matrix3 & setElem( int col, int row, const floatInVec &val );
-
-    // Get the element of a 3x3 matrix referred to by column and row indices
-    // 
-    inline const floatInVec getElem( int col, int row ) const;
-
-    // Add two 3x3 matrices
-    // 
-    inline const Matrix3 operator +( const Matrix3 & mat ) const;
-
-    // Subtract a 3x3 matrix from another 3x3 matrix
-    // 
-    inline const Matrix3 operator -( const Matrix3 & mat ) const;
-
-    // Negate all elements of a 3x3 matrix
-    // 
-    inline const Matrix3 operator -( ) const;
-
-    // Multiply a 3x3 matrix by a scalar
-    // 
-    inline const Matrix3 operator *( float scalar ) const;
-
-    // Multiply a 3x3 matrix by a scalar (scalar data contained in vector data type)
-    // 
-    inline const Matrix3 operator *( const floatInVec &scalar ) const;
-
-    // Multiply a 3x3 matrix by a 3-D vector
-    // 
-    inline const Vector3 operator *( const Vector3 &vec ) const;
-
-    // Multiply two 3x3 matrices
-    // 
-    inline const Matrix3 operator *( const Matrix3 & mat ) const;
-
-    // Perform compound assignment and addition with a 3x3 matrix
-    // 
-    inline Matrix3 & operator +=( const Matrix3 & mat );
-
-    // Perform compound assignment and subtraction by a 3x3 matrix
-    // 
-    inline Matrix3 & operator -=( const Matrix3 & mat );
-
-    // Perform compound assignment and multiplication by a scalar
-    // 
-    inline Matrix3 & operator *=( float scalar );
-
-    // Perform compound assignment and multiplication by a scalar (scalar data contained in vector data type)
-    // 
-    inline Matrix3 & operator *=( const floatInVec &scalar );
-
-    // Perform compound assignment and multiplication by a 3x3 matrix
-    // 
-    inline Matrix3 & operator *=( const Matrix3 & mat );
-
-    // Construct an identity 3x3 matrix
-    // 
-    static inline const Matrix3 identity( );
-
-    // Construct a 3x3 matrix to rotate around the x axis
-    // 
-    static inline const Matrix3 rotationX( float radians );
-
-    // Construct a 3x3 matrix to rotate around the y axis
-    // 
-    static inline const Matrix3 rotationY( float radians );
-
-    // Construct a 3x3 matrix to rotate around the z axis
-    // 
-    static inline const Matrix3 rotationZ( float radians );
-
-    // Construct a 3x3 matrix to rotate around the x axis (scalar data contained in vector data type)
-    // 
-    static inline const Matrix3 rotationX( const floatInVec &radians );
-
-    // Construct a 3x3 matrix to rotate around the y axis (scalar data contained in vector data type)
-    // 
-    static inline const Matrix3 rotationY( const floatInVec &radians );
-
-    // Construct a 3x3 matrix to rotate around the z axis (scalar data contained in vector data type)
-    // 
-    static inline const Matrix3 rotationZ( const floatInVec &radians );
-
-    // Construct a 3x3 matrix to rotate around the x, y, and z axes
-    // 
-    static inline const Matrix3 rotationZYX( const Vector3 &radiansXYZ );
-
-    // Construct a 3x3 matrix to rotate around a unit-length 3-D vector
-    // 
-    static inline const Matrix3 rotation( float radians, const Vector3 &unitVec );
-
-    // Construct a 3x3 matrix to rotate around a unit-length 3-D vector (scalar data contained in vector data type)
-    // 
-    static inline const Matrix3 rotation( const floatInVec &radians, const Vector3 &unitVec );
-
-    // Construct a rotation matrix from a unit-length quaternion
-    // 
-    static inline const Matrix3 rotation( const Quat &unitQuat );
-
-    // Construct a 3x3 matrix to perform scaling
-    // 
-    static inline const Matrix3 scale( const Vector3 &scaleVec );
-
-};
-// Multiply a 3x3 matrix by a scalar
-// 
-inline const Matrix3 operator *( float scalar, const Matrix3 & mat );
-
-// Multiply a 3x3 matrix by a scalar (scalar data contained in vector data type)
-// 
-inline const Matrix3 operator *( const floatInVec &scalar, const Matrix3 & mat );
-
-// Append (post-multiply) a scale transformation to a 3x3 matrix
-// NOTE: 
-// Faster than creating and multiplying a scale transformation matrix.
-// 
-inline const Matrix3 appendScale( const Matrix3 & mat, const Vector3 &scaleVec );
-
-// Prepend (pre-multiply) a scale transformation to a 3x3 matrix
-// NOTE: 
-// Faster than creating and multiplying a scale transformation matrix.
-// 
-inline const Matrix3 prependScale( const Vector3 &scaleVec, const Matrix3 & mat );
-
-// Multiply two 3x3 matrices per element
-// 
-inline const Matrix3 mulPerElem( const Matrix3 & mat0, const Matrix3 & mat1 );
-
-// Compute the absolute value of a 3x3 matrix per element
-// 
-inline const Matrix3 absPerElem( const Matrix3 & mat );
-
-// Transpose of a 3x3 matrix
-// 
-inline const Matrix3 transpose( const Matrix3 & mat );
-
-// Compute the inverse of a 3x3 matrix
-// NOTE: 
-// Result is unpredictable when the determinant of mat is equal to or near 0.
-// 
-inline const Matrix3 inverse( const Matrix3 & mat );
-
-// Determinant of a 3x3 matrix
-// 
-inline const floatInVec determinant( const Matrix3 & mat );
-
-// Conditionally select between two 3x3 matrices
-// NOTE: 
-// This function uses a conditional select instruction to avoid a branch.
-// However, the transfer of select1 to a VMX register may use more processing time than a branch.
-// Use the boolInVec version for better performance.
-// 
-inline const Matrix3 select( const Matrix3 & mat0, const Matrix3 & mat1, bool select1 );
-
-// Conditionally select between two 3x3 matrices (scalar data contained in vector data type)
-// NOTE: 
-// This function uses a conditional select instruction to avoid a branch.
-// 
-inline const Matrix3 select( const Matrix3 & mat0, const Matrix3 & mat1, const boolInVec &select1 );
-
-#ifdef _VECTORMATH_DEBUG
-
-// Print a 3x3 matrix
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-// 
-inline void print( const Matrix3 & mat );
-
-// Print a 3x3 matrix and an associated string identifier
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-// 
-inline void print( const Matrix3 & mat, const char * name );
-
-#endif
-
-// A 4x4 matrix in array-of-structures format
-//
-class Matrix4
-{
-    Vector4 mCol0;
-    Vector4 mCol1;
-    Vector4 mCol2;
-    Vector4 mCol3;
-
-public:
-    // Default constructor; does no initialization
-    // 
-    inline Matrix4( ) { };
-
-    // Copy a 4x4 matrix
-    // 
-    inline Matrix4( const Matrix4 & mat );
-
-    // Construct a 4x4 matrix containing the specified columns
-    // 
-    inline Matrix4( const Vector4 &col0, const Vector4 &col1, const Vector4 &col2, const Vector4 &col3 );
-
-    // Construct a 4x4 matrix from a 3x4 transformation matrix
-    // 
-    explicit inline Matrix4( const Transform3 & mat );
-
-    // Construct a 4x4 matrix from a 3x3 matrix and a 3-D vector
-    // 
-    inline Matrix4( const Matrix3 & mat, const Vector3 &translateVec );
-
-    // Construct a 4x4 matrix from a unit-length quaternion and a 3-D vector
-    // 
-    inline Matrix4( const Quat &unitQuat, const Vector3 &translateVec );
-
-    // Set all elements of a 4x4 matrix to the same scalar value
-    // 
-    explicit inline Matrix4( float scalar );
-
-    // Set all elements of a 4x4 matrix to the same scalar value (scalar data contained in vector data type)
-    // 
-    explicit inline Matrix4( const floatInVec &scalar );
-
-    // Assign one 4x4 matrix to another
-    // 
-    inline Matrix4 & operator =( const Matrix4 & mat );
-
-    // Set the upper-left 3x3 submatrix
-    // NOTE: 
-    // This function does not change the bottom row elements.
-    // 
-    inline Matrix4 & setUpper3x3( const Matrix3 & mat3 );
-
-    // Get the upper-left 3x3 submatrix of a 4x4 matrix
-    // 
-    inline const Matrix3 getUpper3x3( ) const;
-
-    // Set translation component
-    // NOTE: 
-    // This function does not change the bottom row elements.
-    // 
-    inline Matrix4 & setTranslation( const Vector3 &translateVec );
-
-    // Get the translation component of a 4x4 matrix
-    // 
-    inline const Vector3 getTranslation( ) const;
-
-    // Set column 0 of a 4x4 matrix
-    // 
-    inline Matrix4 & setCol0( const Vector4 &col0 );
-
-    // Set column 1 of a 4x4 matrix
-    // 
-    inline Matrix4 & setCol1( const Vector4 &col1 );
-
-    // Set column 2 of a 4x4 matrix
-    // 
-    inline Matrix4 & setCol2( const Vector4 &col2 );
-
-    // Set column 3 of a 4x4 matrix
-    // 
-    inline Matrix4 & setCol3( const Vector4 &col3 );
-
-    // Get column 0 of a 4x4 matrix
-    // 
-    inline const Vector4 getCol0( ) const;
-
-    // Get column 1 of a 4x4 matrix
-    // 
-    inline const Vector4 getCol1( ) const;
-
-    // Get column 2 of a 4x4 matrix
-    // 
-    inline const Vector4 getCol2( ) const;
-
-    // Get column 3 of a 4x4 matrix
-    // 
-    inline const Vector4 getCol3( ) const;
-
-    // Set the column of a 4x4 matrix referred to by the specified index
-    // 
-    inline Matrix4 & setCol( int col, const Vector4 &vec );
-
-    // Set the row of a 4x4 matrix referred to by the specified index
-    // 
-    inline Matrix4 & setRow( int row, const Vector4 &vec );
-
-    // Get the column of a 4x4 matrix referred to by the specified index
-    // 
-    inline const Vector4 getCol( int col ) const;
-
-    // Get the row of a 4x4 matrix referred to by the specified index
-    // 
-    inline const Vector4 getRow( int row ) const;
-
-    // Subscripting operator to set or get a column
-    // 
-    inline Vector4 & operator []( int col );
-
-    // Subscripting operator to get a column
-    // 
-    inline const Vector4 operator []( int col ) const;
-
-    // Set the element of a 4x4 matrix referred to by column and row indices
-    // 
-    inline Matrix4 & setElem( int col, int row, float val );
-
-    // Set the element of a 4x4 matrix referred to by column and row indices (scalar data contained in vector data type)
-    // 
-    inline Matrix4 & setElem( int col, int row, const floatInVec &val );
-
-    // Get the element of a 4x4 matrix referred to by column and row indices
-    // 
-    inline const floatInVec getElem( int col, int row ) const;
-
-    // Add two 4x4 matrices
-    // 
-    inline const Matrix4 operator +( const Matrix4 & mat ) const;
-
-    // Subtract a 4x4 matrix from another 4x4 matrix
-    // 
-    inline const Matrix4 operator -( const Matrix4 & mat ) const;
-
-    // Negate all elements of a 4x4 matrix
-    // 
-    inline const Matrix4 operator -( ) const;
-
-    // Multiply a 4x4 matrix by a scalar
-    // 
-    inline const Matrix4 operator *( float scalar ) const;
-
-    // Multiply a 4x4 matrix by a scalar (scalar data contained in vector data type)
-    // 
-    inline const Matrix4 operator *( const floatInVec &scalar ) const;
-
-    // Multiply a 4x4 matrix by a 4-D vector
-    // 
-    inline const Vector4 operator *( const Vector4 &vec ) const;
-
-    // Multiply a 4x4 matrix by a 3-D vector
-    // 
-    inline const Vector4 operator *( const Vector3 &vec ) const;
-
-    // Multiply a 4x4 matrix by a 3-D point
-    // 
-    inline const Vector4 operator *( const Point3 &pnt ) const;
-
-    // Multiply two 4x4 matrices
-    // 
-    inline const Matrix4 operator *( const Matrix4 & mat ) const;
-
-    // Multiply a 4x4 matrix by a 3x4 transformation matrix
-    // 
-    inline const Matrix4 operator *( const Transform3 & tfrm ) const;
-
-    // Perform compound assignment and addition with a 4x4 matrix
-    // 
-    inline Matrix4 & operator +=( const Matrix4 & mat );
-
-    // Perform compound assignment and subtraction by a 4x4 matrix
-    // 
-    inline Matrix4 & operator -=( const Matrix4 & mat );
-
-    // Perform compound assignment and multiplication by a scalar
-    // 
-    inline Matrix4 & operator *=( float scalar );
-
-    // Perform compound assignment and multiplication by a scalar (scalar data contained in vector data type)
-    // 
-    inline Matrix4 & operator *=( const floatInVec &scalar );
-
-    // Perform compound assignment and multiplication by a 4x4 matrix
-    // 
-    inline Matrix4 & operator *=( const Matrix4 & mat );
-
-    // Perform compound assignment and multiplication by a 3x4 transformation matrix
-    // 
-    inline Matrix4 & operator *=( const Transform3 & tfrm );
-
-    // Construct an identity 4x4 matrix
-    // 
-    static inline const Matrix4 identity( );
-
-    // Construct a 4x4 matrix to rotate around the x axis
-    // 
-    static inline const Matrix4 rotationX( float radians );
-
-    // Construct a 4x4 matrix to rotate around the y axis
-    // 
-    static inline const Matrix4 rotationY( float radians );
-
-    // Construct a 4x4 matrix to rotate around the z axis
-    // 
-    static inline const Matrix4 rotationZ( float radians );
-
-    // Construct a 4x4 matrix to rotate around the x axis (scalar data contained in vector data type)
-    // 
-    static inline const Matrix4 rotationX( const floatInVec &radians );
-
-    // Construct a 4x4 matrix to rotate around the y axis (scalar data contained in vector data type)
-    // 
-    static inline const Matrix4 rotationY( const floatInVec &radians );
-
-    // Construct a 4x4 matrix to rotate around the z axis (scalar data contained in vector data type)
-    // 
-    static inline const Matrix4 rotationZ( const floatInVec &radians );
-
-    // Construct a 4x4 matrix to rotate around the x, y, and z axes
-    // 
-    static inline const Matrix4 rotationZYX( const Vector3 &radiansXYZ );
-
-    // Construct a 4x4 matrix to rotate around a unit-length 3-D vector
-    // 
-    static inline const Matrix4 rotation( float radians, const Vector3 &unitVec );
-
-    // Construct a 4x4 matrix to rotate around a unit-length 3-D vector (scalar data contained in vector data type)
-    // 
-    static inline const Matrix4 rotation( const floatInVec &radians, const Vector3 &unitVec );
-
-    // Construct a rotation matrix from a unit-length quaternion
-    // 
-    static inline const Matrix4 rotation( const Quat &unitQuat );
-
-    // Construct a 4x4 matrix to perform scaling
-    // 
-    static inline const Matrix4 scale( const Vector3 &scaleVec );
-
-    // Construct a 4x4 matrix to perform translation
-    // 
-    static inline const Matrix4 translation( const Vector3 &translateVec );
-
-    // Construct viewing matrix based on eye, position looked at, and up direction
-    // 
-    static inline const Matrix4 lookAt( const Point3 &eyePos, const Point3 &lookAtPos, const Vector3 &upVec );
-
-    // Construct a perspective projection matrix
-    // 
-    static inline const Matrix4 perspective( float fovyRadians, float aspect, float zNear, float zFar );
-
-    // Construct a perspective projection matrix based on frustum
-    // 
-    static inline const Matrix4 frustum( float left, float right, float bottom, float top, float zNear, float zFar );
-
-    // Construct an orthographic projection matrix
-    // 
-    static inline const Matrix4 orthographic( float left, float right, float bottom, float top, float zNear, float zFar );
-
-};
-// Multiply a 4x4 matrix by a scalar
-// 
-inline const Matrix4 operator *( float scalar, const Matrix4 & mat );
-
-// Multiply a 4x4 matrix by a scalar (scalar data contained in vector data type)
-// 
-inline const Matrix4 operator *( const floatInVec &scalar, const Matrix4 & mat );
-
-// Append (post-multiply) a scale transformation to a 4x4 matrix
-// NOTE: 
-// Faster than creating and multiplying a scale transformation matrix.
-// 
-inline const Matrix4 appendScale( const Matrix4 & mat, const Vector3 &scaleVec );
-
-// Prepend (pre-multiply) a scale transformation to a 4x4 matrix
-// NOTE: 
-// Faster than creating and multiplying a scale transformation matrix.
-// 
-inline const Matrix4 prependScale( const Vector3 &scaleVec, const Matrix4 & mat );
-
-// Multiply two 4x4 matrices per element
-// 
-inline const Matrix4 mulPerElem( const Matrix4 & mat0, const Matrix4 & mat1 );
-
-// Compute the absolute value of a 4x4 matrix per element
-// 
-inline const Matrix4 absPerElem( const Matrix4 & mat );
-
-// Transpose of a 4x4 matrix
-// 
-inline const Matrix4 transpose( const Matrix4 & mat );
-
-// Compute the inverse of a 4x4 matrix
-// NOTE: 
-// Result is unpredictable when the determinant of mat is equal to or near 0.
-// 
-inline const Matrix4 inverse( const Matrix4 & mat );
-
-// Compute the inverse of a 4x4 matrix, which is expected to be an affine matrix
-// NOTE: 
-// This can be used to achieve better performance than a general inverse when the specified 4x4 matrix meets the given restrictions.  The result is unpredictable when the determinant of mat is equal to or near 0.
-// 
-inline const Matrix4 affineInverse( const Matrix4 & mat );
-
-// Compute the inverse of a 4x4 matrix, which is expected to be an affine matrix with an orthogonal upper-left 3x3 submatrix
-// NOTE: 
-// This can be used to achieve better performance than a general inverse when the specified 4x4 matrix meets the given restrictions.
-// 
-inline const Matrix4 orthoInverse( const Matrix4 & mat );
-
-// Determinant of a 4x4 matrix
-// 
-inline const floatInVec determinant( const Matrix4 & mat );
-
-// Conditionally select between two 4x4 matrices
-// NOTE: 
-// This function uses a conditional select instruction to avoid a branch.
-// However, the transfer of select1 to a VMX register may use more processing time than a branch.
-// Use the boolInVec version for better performance.
-// 
-inline const Matrix4 select( const Matrix4 & mat0, const Matrix4 & mat1, bool select1 );
-
-// Conditionally select between two 4x4 matrices (scalar data contained in vector data type)
-// NOTE: 
-// This function uses a conditional select instruction to avoid a branch.
-// 
-inline const Matrix4 select( const Matrix4 & mat0, const Matrix4 & mat1, const boolInVec &select1 );
-
-#ifdef _VECTORMATH_DEBUG
-
-// Print a 4x4 matrix
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-// 
-inline void print( const Matrix4 & mat );
-
-// Print a 4x4 matrix and an associated string identifier
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-// 
-inline void print( const Matrix4 & mat, const char * name );
-
-#endif
-
-// A 3x4 transformation matrix in array-of-structures format
-//
-class Transform3
-{
-    Vector3 mCol0;
-    Vector3 mCol1;
-    Vector3 mCol2;
-    Vector3 mCol3;
-
-public:
-    // Default constructor; does no initialization
-    // 
-    inline Transform3( ) { };
-
-    // Copy a 3x4 transformation matrix
-    // 
-    inline Transform3( const Transform3 & tfrm );
-
-    // Construct a 3x4 transformation matrix containing the specified columns
-    // 
-    inline Transform3( const Vector3 &col0, const Vector3 &col1, const Vector3 &col2, const Vector3 &col3 );
-
-    // Construct a 3x4 transformation matrix from a 3x3 matrix and a 3-D vector
-    // 
-    inline Transform3( const Matrix3 & tfrm, const Vector3 &translateVec );
-
-    // Construct a 3x4 transformation matrix from a unit-length quaternion and a 3-D vector
-    // 
-    inline Transform3( const Quat &unitQuat, const Vector3 &translateVec );
-
-    // Set all elements of a 3x4 transformation matrix to the same scalar value
-    // 
-    explicit inline Transform3( float scalar );
-
-    // Set all elements of a 3x4 transformation matrix to the same scalar value (scalar data contained in vector data type)
-    // 
-    explicit inline Transform3( const floatInVec &scalar );
-
-    // Assign one 3x4 transformation matrix to another
-    // 
-    inline Transform3 & operator =( const Transform3 & tfrm );
-
-    // Set the upper-left 3x3 submatrix
-    // 
-    inline Transform3 & setUpper3x3( const Matrix3 & mat3 );
-
-    // Get the upper-left 3x3 submatrix of a 3x4 transformation matrix
-    // 
-    inline const Matrix3 getUpper3x3( ) const;
-
-    // Set translation component
-    // 
-    inline Transform3 & setTranslation( const Vector3 &translateVec );
-
-    // Get the translation component of a 3x4 transformation matrix
-    // 
-    inline const Vector3 getTranslation( ) const;
-
-    // Set column 0 of a 3x4 transformation matrix
-    // 
-    inline Transform3 & setCol0( const Vector3 &col0 );
-
-    // Set column 1 of a 3x4 transformation matrix
-    // 
-    inline Transform3 & setCol1( const Vector3 &col1 );
-
-    // Set column 2 of a 3x4 transformation matrix
-    // 
-    inline Transform3 & setCol2( const Vector3 &col2 );
-
-    // Set column 3 of a 3x4 transformation matrix
-    // 
-    inline Transform3 & setCol3( const Vector3 &col3 );
-
-    // Get column 0 of a 3x4 transformation matrix
-    // 
-    inline const Vector3 getCol0( ) const;
-
-    // Get column 1 of a 3x4 transformation matrix
-    // 
-    inline const Vector3 getCol1( ) const;
-
-    // Get column 2 of a 3x4 transformation matrix
-    // 
-    inline const Vector3 getCol2( ) const;
-
-    // Get column 3 of a 3x4 transformation matrix
-    // 
-    inline const Vector3 getCol3( ) const;
-
-    // Set the column of a 3x4 transformation matrix referred to by the specified index
-    // 
-    inline Transform3 & setCol( int col, const Vector3 &vec );
-
-    // Set the row of a 3x4 transformation matrix referred to by the specified index
-    // 
-    inline Transform3 & setRow( int row, const Vector4 &vec );
-
-    // Get the column of a 3x4 transformation matrix referred to by the specified index
-    // 
-    inline const Vector3 getCol( int col ) const;
-
-    // Get the row of a 3x4 transformation matrix referred to by the specified index
-    // 
-    inline const Vector4 getRow( int row ) const;
-
-    // Subscripting operator to set or get a column
-    // 
-    inline Vector3 & operator []( int col );
-
-    // Subscripting operator to get a column
-    // 
-    inline const Vector3 operator []( int col ) const;
-
-    // Set the element of a 3x4 transformation matrix referred to by column and row indices
-    // 
-    inline Transform3 & setElem( int col, int row, float val );
-
-    // Set the element of a 3x4 transformation matrix referred to by column and row indices (scalar data contained in vector data type)
-    // 
-    inline Transform3 & setElem( int col, int row, const floatInVec &val );
-
-    // Get the element of a 3x4 transformation matrix referred to by column and row indices
-    // 
-    inline const floatInVec getElem( int col, int row ) const;
-
-    // Multiply a 3x4 transformation matrix by a 3-D vector
-    // 
-    inline const Vector3 operator *( const Vector3 &vec ) const;
-
-    // Multiply a 3x4 transformation matrix by a 3-D point
-    // 
-    inline const Point3 operator *( const Point3 &pnt ) const;
-
-    // Multiply two 3x4 transformation matrices
-    // 
-    inline const Transform3 operator *( const Transform3 & tfrm ) const;
-
-    // Perform compound assignment and multiplication by a 3x4 transformation matrix
-    // 
-    inline Transform3 & operator *=( const Transform3 & tfrm );
-
-    // Construct an identity 3x4 transformation matrix
-    // 
-    static inline const Transform3 identity( );
-
-    // Construct a 3x4 transformation matrix to rotate around the x axis
-    // 
-    static inline const Transform3 rotationX( float radians );
-
-    // Construct a 3x4 transformation matrix to rotate around the y axis
-    // 
-    static inline const Transform3 rotationY( float radians );
-
-    // Construct a 3x4 transformation matrix to rotate around the z axis
-    // 
-    static inline const Transform3 rotationZ( float radians );
-
-    // Construct a 3x4 transformation matrix to rotate around the x axis (scalar data contained in vector data type)
-    // 
-    static inline const Transform3 rotationX( const floatInVec &radians );
-
-    // Construct a 3x4 transformation matrix to rotate around the y axis (scalar data contained in vector data type)
-    // 
-    static inline const Transform3 rotationY( const floatInVec &radians );
-
-    // Construct a 3x4 transformation matrix to rotate around the z axis (scalar data contained in vector data type)
-    // 
-    static inline const Transform3 rotationZ( const floatInVec &radians );
-
-    // Construct a 3x4 transformation matrix to rotate around the x, y, and z axes
-    // 
-    static inline const Transform3 rotationZYX( const Vector3 &radiansXYZ );
-
-    // Construct a 3x4 transformation matrix to rotate around a unit-length 3-D vector
-    // 
-    static inline const Transform3 rotation( float radians, const Vector3 &unitVec );
-
-    // Construct a 3x4 transformation matrix to rotate around a unit-length 3-D vector (scalar data contained in vector data type)
-    // 
-    static inline const Transform3 rotation( const floatInVec &radians, const Vector3 &unitVec );
-
-    // Construct a rotation matrix from a unit-length quaternion
-    // 
-    static inline const Transform3 rotation( const Quat &unitQuat );
-
-    // Construct a 3x4 transformation matrix to perform scaling
-    // 
-    static inline const Transform3 scale( const Vector3 &scaleVec );
-
-    // Construct a 3x4 transformation matrix to perform translation
-    // 
-    static inline const Transform3 translation( const Vector3 &translateVec );
-
-};
-// Append (post-multiply) a scale transformation to a 3x4 transformation matrix
-// NOTE: 
-// Faster than creating and multiplying a scale transformation matrix.
-// 
-inline const Transform3 appendScale( const Transform3 & tfrm, const Vector3 &scaleVec );
-
-// Prepend (pre-multiply) a scale transformation to a 3x4 transformation matrix
-// NOTE: 
-// Faster than creating and multiplying a scale transformation matrix.
-// 
-inline const Transform3 prependScale( const Vector3 &scaleVec, const Transform3 & tfrm );
-
-// Multiply two 3x4 transformation matrices per element
-// 
-inline const Transform3 mulPerElem( const Transform3 & tfrm0, const Transform3 & tfrm1 );
-
-// Compute the absolute value of a 3x4 transformation matrix per element
-// 
-inline const Transform3 absPerElem( const Transform3 & tfrm );
-
-// Inverse of a 3x4 transformation matrix
-// NOTE: 
-// Result is unpredictable when the determinant of the left 3x3 submatrix is equal to or near 0.
-// 
-inline const Transform3 inverse( const Transform3 & tfrm );
-
-// Compute the inverse of a 3x4 transformation matrix, expected to have an orthogonal upper-left 3x3 submatrix
-// NOTE: 
-// This can be used to achieve better performance than a general inverse when the specified 3x4 transformation matrix meets the given restrictions.
-// 
-inline const Transform3 orthoInverse( const Transform3 & tfrm );
-
-// Conditionally select between two 3x4 transformation matrices
-// NOTE: 
-// This function uses a conditional select instruction to avoid a branch.
-// However, the transfer of select1 to a VMX register may use more processing time than a branch.
-// Use the boolInVec version for better performance.
-// 
-inline const Transform3 select( const Transform3 & tfrm0, const Transform3 & tfrm1, bool select1 );
-
-// Conditionally select between two 3x4 transformation matrices (scalar data contained in vector data type)
-// NOTE: 
-// This function uses a conditional select instruction to avoid a branch.
-// 
-inline const Transform3 select( const Transform3 & tfrm0, const Transform3 & tfrm1, const boolInVec &select1 );
-
-#ifdef _VECTORMATH_DEBUG
-
-// Print a 3x4 transformation matrix
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-// 
-inline void print( const Transform3 & tfrm );
-
-// Print a 3x4 transformation matrix and an associated string identifier
-// NOTE: 
-// Function is only defined when _VECTORMATH_DEBUG is defined.
-// 
-inline void print( const Transform3 & tfrm, const char * name );
-
-#endif
-
-} // namespace Aos
-} // namespace Vectormath
-
-#include "vec_aos.h"
-#include "quat_aos.h"
-#include "mat_aos.h"
-
-#endif
+}
+
+static __forceinline __m128 acosf4(__m128 x)
+{
+    __m128 xabs = fabsf4(x);
+	__m128 select = _mm_cmplt_ps( x, _mm_setzero_ps() );
+    __m128 t1 = sqrtf4(vec_sub(_mm_set1_ps(1.0f), xabs));
+    // select marks the lanes where x < 0; t1 is presumably sqrt(1 - |x|) per lane (helper semantics assumed from their names - confirm).
+    /* Instruction counts can be reduced if the polynomial was
+     * computed entirely from nested (dependent) fma's. However, 
+     * to reduce the number of pipeline stalls, the polynomial is evaluated 
+     * in two halves (hi and lo). 
+     */
+    __m128 xabs2 = _mm_mul_ps(xabs,  xabs);
+    __m128 xabs4 = _mm_mul_ps(xabs2, xabs2);
+    __m128 hi = vec_madd(vec_madd(vec_madd(_mm_set1_ps(-0.0012624911f),
+		xabs, _mm_set1_ps(0.0066700901f)),
+			xabs, _mm_set1_ps(-0.0170881256f)),
+				xabs, _mm_set1_ps( 0.0308918810f));
+    __m128 lo = vec_madd(vec_madd(vec_madd(_mm_set1_ps(-0.0501743046f),
+		xabs, _mm_set1_ps(0.0889789874f)),
+			xabs, _mm_set1_ps(-0.2145988016f)),
+				xabs, _mm_set1_ps( 1.5707963050f));
+    
+    __m128 result = vec_madd(hi, xabs4, lo);
+    
+    // Adjust the result if x is negative.
+    return vec_sel(
+		vec_mul(t1, result),									// Positive
+		vec_nmsub(t1, result, _mm_set1_ps(3.1415926535898f)),	// Negative
+		select);
+}
+
+static __forceinline __m128 sinf4(vec_float4 x)
+{
+    // Per-element sine: split x into a quadrant index and a small remainder, then pick the sin or cos polynomial and fix the sign.
+//
+// Common constants used to evaluate sinf4/cosf4/tanf4
+//
+#define _SINCOS_CC0  -0.0013602249f
+#define _SINCOS_CC1   0.0416566950f
+#define _SINCOS_CC2  -0.4999990225f
+#define _SINCOS_SC0  -0.0001950727f
+#define _SINCOS_SC1   0.0083320758f
+#define _SINCOS_SC2  -0.1666665247f
+
+#define _SINCOS_KC1  1.57079625129f
+#define _SINCOS_KC2  7.54978995489e-8f
+
+    vec_float4 xl,xl2,xl3,res;
+
+    // Range reduction using : xl = angle * TwoOverPi;
+    //  
+    xl = vec_mul(x, _mm_set1_ps(0.63661977236f));
+
+    // Find the quadrant the angle falls in
+    // using:  q = (int) (ceil(abs(xl))*sign(xl))
+    // NOTE(review): the code calls vec_cts(xl,0) directly - confirm its rounding matches the formula above.
+    vec_int4 q = vec_cts(xl,0);
+
+    // Compute an offset based on the quadrant that the angle falls in
+    // 
+    vec_int4 offset = _mm_and_ps(q,toM128(0x3));
+
+    // Remainder in range [-pi/4..pi/4]
+    // (q times KC1 and KC2 is removed from x in two steps; KC1 + KC2 ~= pi/2. Assumes vec_nmsub(a,b,c) = c - a*b - confirm.)
+    vec_float4 qf = vec_ctf(q,0);
+    xl  = vec_nmsub(qf,_mm_set1_ps(_SINCOS_KC2),vec_nmsub(qf,_mm_set1_ps(_SINCOS_KC1),x));
+    
+    // Compute x^2 and x^3
+    //
+    xl2 = vec_mul(xl,xl);
+    xl3 = vec_mul(xl2,xl);
+    
+    // Compute both the sin and cos of the angles
+    // using a polynomial expression:
+    //   cx = 1.0f + xl2 * ((C0 * xl2 + C1) * xl2 + C2), and
+    //   sx = xl + xl3 * ((S0 * xl2 + S1) * xl2 + S2)
+    //
+    
+    vec_float4 cx =
+		vec_madd(
+			vec_madd(
+				vec_madd(_mm_set1_ps(_SINCOS_CC0),xl2,_mm_set1_ps(_SINCOS_CC1)),xl2,_mm_set1_ps(_SINCOS_CC2)),xl2,_mm_set1_ps(1.0f));
+    vec_float4 sx =
+		vec_madd(
+			vec_madd(
+				vec_madd(_mm_set1_ps(_SINCOS_SC0),xl2,_mm_set1_ps(_SINCOS_SC1)),xl2,_mm_set1_ps(_SINCOS_SC2)),xl3,xl);
+
+    // Use the cosine when the offset is odd and the sin
+    // when the offset is even
+    //
+    res = vec_sel(cx,sx,vec_cmpeq(vec_and(offset,
+                                          toM128(0x1)),
+										  _mm_setzero_ps()));
+
+    // Flip the sign of the result when bit 1 of the offset is set, i.e. (offset mod 4) = 2 or 3
+    //
+    return vec_sel(
+		vec_xor(toM128(0x80000000U), res),	// Negative
+		res,								// Positive
+		vec_cmpeq(vec_and(offset,toM128(0x2)),_mm_setzero_ps()));
+}
+
+static __forceinline void sincosf4(vec_float4 x, vec_float4* s, vec_float4* c)
+{ // Writes the per-element sine of x to *s and the per-element cosine of x to *c.
+    vec_float4 xl,xl2,xl3;
+    vec_int4   offsetSin, offsetCos;
+
+    // Range reduction using : xl = angle * TwoOverPi;
+    //  
+    xl = vec_mul(x, _mm_set1_ps(0.63661977236f));
+
+    // Find the quadrant the angle falls in
+    // using:  q = (int) (ceil(abs(xl))*sign(xl))
+    // NOTE(review): the code calls vec_cts(xl,0) directly - confirm its rounding matches the formula above.
+    //vec_int4 q = vec_cts(vec_add(xl,vec_sel(_mm_set1_ps(0.5f),xl,(0x80000000))),0);
+    vec_int4 q = vec_cts(xl,0);
+     
+    // Compute the offset based on the quadrant that the angle falls in.
+    // Add 1 to the offset for the cosine. 
+    //
+    offsetSin = vec_and(q,toM128((int)0x3));
+	__m128i temp = _mm_add_epi32(_mm_set1_epi32(1),(__m128i &)offsetSin);
+	offsetCos = (__m128 &)temp;
+
+    // Remainder in range [-pi/4..pi/4]
+    // (_SINCOS_KC1/_SINCOS_KC2 are the macros #defined inside sinf4; KC1 + KC2 ~= pi/2.)
+    vec_float4 qf = vec_ctf(q,0);
+    xl  = vec_nmsub(qf,_mm_set1_ps(_SINCOS_KC2),vec_nmsub(qf,_mm_set1_ps(_SINCOS_KC1),x));
+    
+    // Compute x^2 and x^3
+    //
+    xl2 = vec_mul(xl,xl);
+    xl3 = vec_mul(xl2,xl);
+    
+    // Compute both the sin and cos of the angles
+    // using a polynomial expression:
+    //   cx = 1.0f + xl2 * ((C0 * xl2 + C1) * xl2 + C2), and
+    //   sx = xl + xl3 * ((S0 * xl2 + S1) * xl2 + S2)
+    //
+    vec_float4 cx =
+		vec_madd(
+			vec_madd(
+				vec_madd(_mm_set1_ps(_SINCOS_CC0),xl2,_mm_set1_ps(_SINCOS_CC1)),xl2,_mm_set1_ps(_SINCOS_CC2)),xl2,_mm_set1_ps(1.0f));
+    vec_float4 sx =
+		vec_madd(
+			vec_madd(
+				vec_madd(_mm_set1_ps(_SINCOS_SC0),xl2,_mm_set1_ps(_SINCOS_SC1)),xl2,_mm_set1_ps(_SINCOS_SC2)),xl3,xl);
+
+    // Use the cosine when the offset is odd and the sin
+    // when the offset is even
+    //
+    vec_uint4 sinMask = (vec_uint4)vec_cmpeq(vec_and(offsetSin,toM128(0x1)),_mm_setzero_ps());
+    vec_uint4 cosMask = (vec_uint4)vec_cmpeq(vec_and(offsetCos,toM128(0x1)),_mm_setzero_ps());    
+    *s = vec_sel(cx,sx,sinMask);
+    *c = vec_sel(cx,sx,cosMask);
+
+    // Flip the sign of each result when bit 1 of its offset is set, i.e. (offset mod 4) = 2 or 3
+    //
+    sinMask = vec_cmpeq(vec_and(offsetSin,toM128(0x2)),_mm_setzero_ps());
+    cosMask = vec_cmpeq(vec_and(offsetCos,toM128(0x2)),_mm_setzero_ps());
+    
+    *s = vec_sel((vec_float4)vec_xor(toM128(0x80000000),(vec_uint4)*s),*s,sinMask);
+    *c = vec_sel((vec_float4)vec_xor(toM128(0x80000000),(vec_uint4)*c),*c,cosMask);    
+}
+
+#include "vecidx_aos.h"
+#include "floatInVec.h"
+#include "boolInVec.h"
+
+#ifdef _VECTORMATH_DEBUG
+#include <stdio.h>
+#endif
+namespace Vectormath {
+
+namespace Aos {
+
+//-----------------------------------------------------------------------------
+// Forward Declarations
+//
+
+class Vector3;
+class Vector4;
+class Point3;
+class Quat;
+class Matrix3;
+class Matrix4;
+class Transform3;
+
+// A 3-D vector in array-of-structures format
+//
+class Vector3
+{
+    __m128 mVec128;
+    // Private helpers; presumably they write/expose mVec128 (definitions are not in this chunk - confirm).
+	__forceinline void set128(vec_float4 vec);
+	 
+	 __forceinline  vec_float4& get128Ref();
+
+public:
+    // Default constructor; does no initialization
+    // 
+    __forceinline Vector3( ) { };
+
+	// Copy a 3-D vector
+    // 
+	__forceinline Vector3(const Vector3& vec);
+
+    // Construct a 3-D vector from x, y, and z elements
+    // 
+    __forceinline Vector3( float x, float y, float z );
+
+    // Construct a 3-D vector from x, y, and z elements (scalar data contained in vector data type)
+    // 
+    __forceinline Vector3( const floatInVec &x, const floatInVec &y, const floatInVec &z );
+
+    // Copy elements from a 3-D point into a 3-D vector
+    // 
+    explicit __forceinline Vector3( const Point3 &pnt );
+
+    // Set all elements of a 3-D vector to the same scalar value
+    // 
+    explicit __forceinline Vector3( float scalar );
+
+    // Set all elements of a 3-D vector to the same scalar value (scalar data contained in vector data type)
+    // 
+    explicit __forceinline Vector3( const floatInVec &scalar );
+
+    // Set vector float data in a 3-D vector
+    // 
+    explicit __forceinline Vector3( __m128 vf4 );
+
+    // Get vector float data from a 3-D vector
+    // 
+    __forceinline __m128 get128( ) const;
+
+    // Assign one 3-D vector to another
+    // 
+    __forceinline Vector3 & operator =( const Vector3 &vec );
+
+    // Set the x element of a 3-D vector
+    // 
+    __forceinline Vector3 & setX( float x );
+
+    // Set the y element of a 3-D vector
+    // 
+    __forceinline Vector3 & setY( float y );
+
+    // Set the z element of a 3-D vector
+    // 
+    __forceinline Vector3 & setZ( float z );
+
+    // Set the x element of a 3-D vector (scalar data contained in vector data type)
+    // 
+    __forceinline Vector3 & setX( const floatInVec &x );
+
+    // Set the y element of a 3-D vector (scalar data contained in vector data type)
+    // 
+    __forceinline Vector3 & setY( const floatInVec &y );
+
+    // Set the z element of a 3-D vector (scalar data contained in vector data type)
+    // 
+    __forceinline Vector3 & setZ( const floatInVec &z );
+
+    // Get the x element of a 3-D vector
+    // 
+    __forceinline const floatInVec getX( ) const;
+
+    // Get the y element of a 3-D vector
+    // 
+    __forceinline const floatInVec getY( ) const;
+
+    // Get the z element of a 3-D vector
+    // 
+    __forceinline const floatInVec getZ( ) const;
+
+    // Set an x, y, or z element of a 3-D vector by index
+    // 
+    __forceinline Vector3 & setElem( int idx, float value );
+
+    // Set an x, y, or z element of a 3-D vector by index (scalar data contained in vector data type)
+    // 
+    __forceinline Vector3 & setElem( int idx, const floatInVec &value );
+
+    // Get an x, y, or z element of a 3-D vector by index
+    // 
+    __forceinline const floatInVec getElem( int idx ) const;
+
+    // Subscripting operator to set or get an element
+    // 
+    __forceinline VecIdx operator []( int idx );
+
+    // Subscripting operator to get an element
+    // 
+    __forceinline const floatInVec operator []( int idx ) const;
+
+    // Add two 3-D vectors
+    // 
+    __forceinline const Vector3 operator +( const Vector3 &vec ) const;
+
+    // Subtract a 3-D vector from another 3-D vector
+    // 
+    __forceinline const Vector3 operator -( const Vector3 &vec ) const;
+
+    // Add a 3-D vector to a 3-D point
+    // 
+    __forceinline const Point3 operator +( const Point3 &pnt ) const;
+
+    // Multiply a 3-D vector by a scalar
+    // 
+    __forceinline const Vector3 operator *( float scalar ) const;
+
+    // Divide a 3-D vector by a scalar
+    // 
+    __forceinline const Vector3 operator /( float scalar ) const;
+
+    // Multiply a 3-D vector by a scalar (scalar data contained in vector data type)
+    // 
+    __forceinline const Vector3 operator *( const floatInVec &scalar ) const;
+
+    // Divide a 3-D vector by a scalar (scalar data contained in vector data type)
+    // 
+    __forceinline const Vector3 operator /( const floatInVec &scalar ) const;
+
+    // Perform compound assignment and addition with a 3-D vector
+    // 
+    __forceinline Vector3 & operator +=( const Vector3 &vec );
+
+    // Perform compound assignment and subtraction by a 3-D vector
+    // 
+    __forceinline Vector3 & operator -=( const Vector3 &vec );
+
+    // Perform compound assignment and multiplication by a scalar
+    // 
+    __forceinline Vector3 & operator *=( float scalar );
+
+    // Perform compound assignment and division by a scalar
+    // 
+    __forceinline Vector3 & operator /=( float scalar );
+
+    // Perform compound assignment and multiplication by a scalar (scalar data contained in vector data type)
+    // 
+    __forceinline Vector3 & operator *=( const floatInVec &scalar );
+
+    // Perform compound assignment and division by a scalar (scalar data contained in vector data type)
+    // 
+    __forceinline Vector3 & operator /=( const floatInVec &scalar );
+
+    // Negate all elements of a 3-D vector
+    // 
+    __forceinline const Vector3 operator -( ) const;
+
+    // Construct x axis
+    // 
+    static __forceinline const Vector3 xAxis( );
+
+    // Construct y axis
+    // 
+    static __forceinline const Vector3 yAxis( );
+
+    // Construct z axis
+    // 
+    static __forceinline const Vector3 zAxis( );
+
+};
+
+// Multiply a 3-D vector by a scalar
+// 
+__forceinline const Vector3 operator *( float scalar, const Vector3 &vec );
+
+// Multiply a 3-D vector by a scalar (scalar data contained in vector data type)
+// 
+__forceinline const Vector3 operator *( const floatInVec &scalar, const Vector3 &vec );
+
+// Multiply two 3-D vectors per element
+// 
+__forceinline const Vector3 mulPerElem( const Vector3 &vec0, const Vector3 &vec1 );
+
+// Divide two 3-D vectors per element
+// NOTE: 
+// Floating-point behavior matches standard library function divf4.
+// NOTE(review): divf4 is not a C++ standard library function (presumably a SIMD math library routine) - confirm this applies to the SSE build.
+__forceinline const Vector3 divPerElem( const Vector3 &vec0, const Vector3 &vec1 );
+
+// Compute the reciprocal of a 3-D vector per element
+// NOTE: 
+// Floating-point behavior matches standard library function recipf4.
+// NOTE(review): recipf4 is likewise not a C++ standard library function - confirm.
+__forceinline const Vector3 recipPerElem( const Vector3 &vec );
+
+// Compute the absolute value of a 3-D vector per element
+// 
+__forceinline const Vector3 absPerElem( const Vector3 &vec );
+
+// Copy sign from one 3-D vector to another, per element
+// NOTE(review): which argument supplies the sign is not visible from this declaration - confirm against the definition.
+__forceinline const Vector3 copySignPerElem( const Vector3 &vec0, const Vector3 &vec1 );
+
+// Maximum of two 3-D vectors per element
+// 
+__forceinline const Vector3 maxPerElem( const Vector3 &vec0, const Vector3 &vec1 );
+
+// Minimum of two 3-D vectors per element
+// 
+__forceinline const Vector3 minPerElem( const Vector3 &vec0, const Vector3 &vec1 );
+
+// Maximum element of a 3-D vector
+// The result is returned as a floatInVec (scalar data contained in vector data type).
+__forceinline const floatInVec maxElem( const Vector3 &vec );
+
+// Minimum element of a 3-D vector
+// The result is returned as a floatInVec (scalar data contained in vector data type).
+__forceinline const floatInVec minElem( const Vector3 &vec );
+
+// Compute the sum of all elements of a 3-D vector
+// 
+__forceinline const floatInVec sum( const Vector3 &vec );
+
+// Compute the dot product of two 3-D vectors
+// 
+__forceinline const floatInVec dot( const Vector3 &vec0, const Vector3 &vec1 );
+
+// Compute the square of the length of a 3-D vector
+// 
+__forceinline const floatInVec lengthSqr( const Vector3 &vec );
+
+// Compute the length of a 3-D vector
+// 
+__forceinline const floatInVec length( const Vector3 &vec );
+
+// Normalize a 3-D vector
+// NOTE: 
+// The result is unpredictable when all elements of vec are at or near zero.
+// 
+__forceinline const Vector3 normalize( const Vector3 &vec );
+
+// Compute cross product of two 3-D vectors
+// 
+__forceinline const Vector3 cross( const Vector3 &vec0, const Vector3 &vec1 );
+
+// Outer product of two 3-D vectors
+// The result is returned as a Matrix3.
+__forceinline const Matrix3 outer( const Vector3 &vec0, const Vector3 &vec1 );
+
+// Pre-multiply a row vector by a 3x3 matrix
+// NOTE: 
+// Slower than column post-multiply.
+// 
+__forceinline const Vector3 rowMul( const Vector3 &vec, const Matrix3 & mat );
+
+// Cross-product matrix of a 3-D vector
+// 
+__forceinline const Matrix3 crossMatrix( const Vector3 &vec );
+
+// Create cross-product matrix and multiply
+// NOTE: 
+// Faster than separately creating a cross-product matrix and multiplying.
+// 
+__forceinline const Matrix3 crossMatrixMul( const Vector3 &vec, const Matrix3 & mat );
+
+// Linear interpolation between two 3-D vectors
+// NOTE: 
+// Does not clamp t between 0 and 1.
+// NOTE(review): presumably t = 0 yields vec0 and t = 1 yields vec1 - confirm against the definition.
+__forceinline const Vector3 lerp( float t, const Vector3 &vec0, const Vector3 &vec1 );
+
+// Linear interpolation between two 3-D vectors (scalar data contained in vector data type)
+// NOTE: 
+// Does not clamp t between 0 and 1.
+// 
+__forceinline const Vector3 lerp( const floatInVec &t, const Vector3 &vec0, const Vector3 &vec1 );
+
+// Spherical linear interpolation between two 3-D vectors
+// NOTE: 
+// The result is unpredictable if the vectors point in opposite directions.
+// Does not clamp t between 0 and 1.
+// The parameter names (unitVec0, unitVec1) indicate that unit-length inputs are expected.
+__forceinline const Vector3 slerp( float t, const Vector3 &unitVec0, const Vector3 &unitVec1 );
+
+// Spherical linear interpolation between two 3-D vectors (scalar data contained in vector data type)
+// NOTE: 
+// The result is unpredictable if the vectors point in opposite directions.
+// Does not clamp t between 0 and 1.
+// The parameter names (unitVec0, unitVec1) indicate that unit-length inputs are expected.
+__forceinline const Vector3 slerp( const floatInVec &t, const Vector3 &unitVec0, const Vector3 &unitVec1 );
+
+// Conditionally select between two 3-D vectors
+// NOTE: 
+// This function uses a conditional select instruction to avoid a branch.
+// However, the transfer of select1 to a vector (SSE) register may use more processing time than a branch.
+// Use the boolInVec version for better performance.
+// NOTE(review): presumably returns vec1 when select1 is true and vec0 otherwise - confirm against the definition.
+__forceinline const Vector3 select( const Vector3 &vec0, const Vector3 &vec1, bool select1 );
+
+// Conditionally select between two 3-D vectors (scalar data contained in vector data type)
+// NOTE: 
+// This function uses a conditional select instruction to avoid a branch.
+// 
+__forceinline const Vector3 select( const Vector3 &vec0, const Vector3 &vec1, const boolInVec &select1 );
+
+// Store x, y, and z elements of 3-D vector in first three words of a quadword, preserving fourth word
+// quad is the destination; it is written through a non-const __m128 pointer.
+__forceinline void storeXYZ( const Vector3 &vec, __m128 * quad );
+
+// Load four three-float 3-D vectors, stored in three quadwords
+// vec0..vec3 are output parameters (non-const references); threeQuads is only read (pointer to const).
+__forceinline void loadXYZArray( Vector3 & vec0, Vector3 & vec1, Vector3 & vec2, Vector3 & vec3, const __m128 * threeQuads );
+
+// Store four 3-D vectors in three quadwords
+// threeQuads is the destination; vec0..vec3 are only read.
+__forceinline void storeXYZArray( const Vector3 &vec0, const Vector3 &vec1, const Vector3 &vec2, const Vector3 &vec3, __m128 * threeQuads );
+
+// Store eight 3-D vectors as half-floats
+// threeQuads is the destination; NOTE(review): vec_ushort8 is declared outside this part of the file - confirm it is available in the SSE build.
+__forceinline void storeHalfFloats( const Vector3 &vec0, const Vector3 &vec1, const Vector3 &vec2, const Vector3 &vec3, const Vector3 &vec4, const Vector3 &vec5, const Vector3 &vec6, const Vector3 &vec7, vec_ushort8 * threeQuads );
+
+#ifdef _VECTORMATH_DEBUG
+
+// Print a 3-D vector
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+__forceinline void print( const Vector3 &vec );
+
+// Print a 3-D vector and an associated string identifier
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+__forceinline void print( const Vector3 &vec, const char * name );
+
+#endif
+
+// A 4-D vector in array-of-structures format
+// The four elements are held in a single private __m128 member (mVec128).
+class Vector4
+{
+    __m128 mVec128;
+
+public:
+    // Default constructor; does no initialization
+    // 
+    __forceinline Vector4( ) { };
+
+    // Construct a 4-D vector from x, y, z, and w elements
+    // 
+    __forceinline Vector4( float x, float y, float z, float w );
+
+    // Construct a 4-D vector from x, y, z, and w elements (scalar data contained in vector data type)
+    // 
+    __forceinline Vector4( const floatInVec &x, const floatInVec &y, const floatInVec &z, const floatInVec &w );
+
+    // Construct a 4-D vector from a 3-D vector and a scalar
+    // 
+    __forceinline Vector4( const Vector3 &xyz, float w );
+
+    // Construct a 4-D vector from a 3-D vector and a scalar (scalar data contained in vector data type)
+    // 
+    __forceinline Vector4( const Vector3 &xyz, const floatInVec &w );
+
+    // Copy x, y, and z from a 3-D vector into a 4-D vector, and set w to 0
+    // Declared explicit, so a Vector3 is never converted to a Vector4 implicitly.
+    explicit __forceinline Vector4( const Vector3 &vec );
+
+    // Copy x, y, and z from a 3-D point into a 4-D vector, and set w to 1
+    // Declared explicit, so a Point3 is never converted to a Vector4 implicitly.
+    explicit __forceinline Vector4( const Point3 &pnt );
+
+    // Copy elements from a quaternion into a 4-D vector
+    // Declared explicit, so a Quat is never converted to a Vector4 implicitly.
+    explicit __forceinline Vector4( const Quat &quat );
+
+    // Set all elements of a 4-D vector to the same scalar value
+    // Declared explicit, so a float is never converted to a Vector4 implicitly.
+    explicit __forceinline Vector4( float scalar );
+
+    // Set all elements of a 4-D vector to the same scalar value (scalar data contained in vector data type)
+    // 
+    explicit __forceinline Vector4( const floatInVec &scalar );
+
+    // Set vector float data in a 4-D vector
+    // 
+    explicit __forceinline Vector4( __m128 vf4 );
+
+    // Get vector float data from a 4-D vector
+    // Returned by value as an __m128; const member, so the vector is not modified.
+    __forceinline __m128 get128( ) const;
+
+    // Assign one 4-D vector to another
+    // 
+    __forceinline Vector4 & operator =( const Vector4 &vec );
+
+    // Set the x, y, and z elements of a 4-D vector
+    // NOTE: 
+    // This function does not change the w element.
+    // Returns a Vector4 reference, as the other setters do (presumably *this - confirm against the definition).
+    __forceinline Vector4 & setXYZ( const Vector3 &vec );
+
+    // Get the x, y, and z elements of a 4-D vector
+    // The elements are returned by value as a Vector3.
+    __forceinline const Vector3 getXYZ( ) const;
+
+    // Set the x element of a 4-D vector
+    // 
+    __forceinline Vector4 & setX( float x );
+
+    // Set the y element of a 4-D vector
+    // 
+    __forceinline Vector4 & setY( float y );
+
+    // Set the z element of a 4-D vector
+    // 
+    __forceinline Vector4 & setZ( float z );
+
+    // Set the w element of a 4-D vector
+    // 
+    __forceinline Vector4 & setW( float w );
+
+    // Set the x element of a 4-D vector (scalar data contained in vector data type)
+    // 
+    __forceinline Vector4 & setX( const floatInVec &x );
+
+    // Set the y element of a 4-D vector (scalar data contained in vector data type)
+    // 
+    __forceinline Vector4 & setY( const floatInVec &y );
+
+    // Set the z element of a 4-D vector (scalar data contained in vector data type)
+    // 
+    __forceinline Vector4 & setZ( const floatInVec &z );
+
+    // Set the w element of a 4-D vector (scalar data contained in vector data type)
+    // 
+    __forceinline Vector4 & setW( const floatInVec &w );
+
+    // Get the x element of a 4-D vector
+    // The getters return a floatInVec (scalar data contained in vector data type), not a float.
+    __forceinline const floatInVec getX( ) const;
+
+    // Get the y element of a 4-D vector
+    // 
+    __forceinline const floatInVec getY( ) const;
+
+    // Get the z element of a 4-D vector
+    // 
+    __forceinline const floatInVec getZ( ) const;
+
+    // Get the w element of a 4-D vector
+    // 
+    __forceinline const floatInVec getW( ) const;
+
+    // Set an x, y, z, or w element of a 4-D vector by index
+    // NOTE(review): presumably idx 0..3 maps to x, y, z, w; range checking is not visible here - confirm.
+    __forceinline Vector4 & setElem( int idx, float value );
+
+    // Set an x, y, z, or w element of a 4-D vector by index (scalar data contained in vector data type)
+    // 
+    __forceinline Vector4 & setElem( int idx, const floatInVec &value );
+
+    // Get an x, y, z, or w element of a 4-D vector by index
+    // 
+    __forceinline const floatInVec getElem( int idx ) const;
+
+    // Subscripting operator to set or get an element
+    // The non-const overload returns a VecIdx object by value (declared elsewhere), not a float reference.
+    __forceinline VecIdx operator []( int idx );
+
+    // Subscripting operator to get an element
+    // The const overload returns the element by value as a floatInVec.
+    __forceinline const floatInVec operator []( int idx ) const;
+
+    // Add two 4-D vectors
+    // 
+    __forceinline const Vector4 operator +( const Vector4 &vec ) const;
+
+    // Subtract a 4-D vector from another 4-D vector
+    // 
+    __forceinline const Vector4 operator -( const Vector4 &vec ) const;
+
+    // Multiply a 4-D vector by a scalar
+    // 
+    __forceinline const Vector4 operator *( float scalar ) const;
+
+    // Divide a 4-D vector by a scalar
+    // 
+    __forceinline const Vector4 operator /( float scalar ) const;
+
+    // Multiply a 4-D vector by a scalar (scalar data contained in vector data type)
+    // 
+    __forceinline const Vector4 operator *( const floatInVec &scalar ) const;
+
+    // Divide a 4-D vector by a scalar (scalar data contained in vector data type)
+    // 
+    __forceinline const Vector4 operator /( const floatInVec &scalar ) const;
+
+    // Perform compound assignment and addition with a 4-D vector
+    // 
+    __forceinline Vector4 & operator +=( const Vector4 &vec );
+
+    // Perform compound assignment and subtraction by a 4-D vector
+    // 
+    __forceinline Vector4 & operator -=( const Vector4 &vec );
+
+    // Perform compound assignment and multiplication by a scalar
+    // 
+    __forceinline Vector4 & operator *=( float scalar );
+
+    // Perform compound assignment and division by a scalar
+    // 
+    __forceinline Vector4 & operator /=( float scalar );
+
+    // Perform compound assignment and multiplication by a scalar (scalar data contained in vector data type)
+    // 
+    __forceinline Vector4 & operator *=( const floatInVec &scalar );
+
+    // Perform compound assignment and division by a scalar (scalar data contained in vector data type)
+    // 
+    __forceinline Vector4 & operator /=( const floatInVec &scalar );
+
+    // Negate all elements of a 4-D vector
+    // Unary minus; returns a new vector and leaves this one unchanged (const member).
+    __forceinline const Vector4 operator -( ) const;
+
+    // Construct x axis
+    // Static factory; the result is returned by value.
+    static __forceinline const Vector4 xAxis( );
+
+    // Construct y axis
+    // 
+    static __forceinline const Vector4 yAxis( );
+
+    // Construct z axis
+    // 
+    static __forceinline const Vector4 zAxis( );
+
+    // Construct w axis
+    // 
+    static __forceinline const Vector4 wAxis( );
+
+};
+
+// Multiply a 4-D vector by a scalar
+// Non-member overload that takes the scalar as the left-hand operand.
+__forceinline const Vector4 operator *( float scalar, const Vector4 &vec );
+
+// Multiply a 4-D vector by a scalar (scalar data contained in vector data type)
+// Non-member overload that takes the scalar as the left-hand operand.
+__forceinline const Vector4 operator *( const floatInVec &scalar, const Vector4 &vec );
+
+// Multiply two 4-D vectors per element
+// 
+__forceinline const Vector4 mulPerElem( const Vector4 &vec0, const Vector4 &vec1 );
+
+// Divide two 4-D vectors per element
+// NOTE: 
+// Floating-point behavior matches standard library function divf4.
+// NOTE(review): divf4 is not a C++ standard library function (presumably a SIMD math library routine) - confirm this applies to the SSE build.
+__forceinline const Vector4 divPerElem( const Vector4 &vec0, const Vector4 &vec1 );
+
+// Compute the reciprocal of a 4-D vector per element
+// NOTE: 
+// Floating-point behavior matches standard library function recipf4.
+// NOTE(review): recipf4 is likewise not a C++ standard library function - confirm.
+__forceinline const Vector4 recipPerElem( const Vector4 &vec );
+
+// Compute the absolute value of a 4-D vector per element
+// 
+__forceinline const Vector4 absPerElem( const Vector4 &vec );
+
+// Copy sign from one 4-D vector to another, per element
+// NOTE(review): which argument supplies the sign is not visible from this declaration - confirm against the definition.
+__forceinline const Vector4 copySignPerElem( const Vector4 &vec0, const Vector4 &vec1 );
+
+// Maximum of two 4-D vectors per element
+// 
+__forceinline const Vector4 maxPerElem( const Vector4 &vec0, const Vector4 &vec1 );
+
+// Minimum of two 4-D vectors per element
+// 
+__forceinline const Vector4 minPerElem( const Vector4 &vec0, const Vector4 &vec1 );
+
+// Maximum element of a 4-D vector
+// The result is returned as a floatInVec (scalar data contained in vector data type).
+__forceinline const floatInVec maxElem( const Vector4 &vec );
+
+// Minimum element of a 4-D vector
+// The result is returned as a floatInVec (scalar data contained in vector data type).
+__forceinline const floatInVec minElem( const Vector4 &vec );
+
+// Compute the sum of all elements of a 4-D vector
+// 
+__forceinline const floatInVec sum( const Vector4 &vec );
+
+// Compute the dot product of two 4-D vectors
+// 
+__forceinline const floatInVec dot( const Vector4 &vec0, const Vector4 &vec1 );
+
+// Compute the square of the length of a 4-D vector
+// 
+__forceinline const floatInVec lengthSqr( const Vector4 &vec );
+
+// Compute the length of a 4-D vector
+// 
+__forceinline const floatInVec length( const Vector4 &vec );
+
+// Normalize a 4-D vector
+// NOTE: 
+// The result is unpredictable when all elements of vec are at or near zero.
+// 
+__forceinline const Vector4 normalize( const Vector4 &vec );
+
+// Outer product of two 4-D vectors
+// The result is returned as a Matrix4.
+__forceinline const Matrix4 outer( const Vector4 &vec0, const Vector4 &vec1 );
+
+// Linear interpolation between two 4-D vectors
+// NOTE: 
+// Does not clamp t between 0 and 1.
+// NOTE(review): presumably t = 0 yields vec0 and t = 1 yields vec1 - confirm against the definition.
+__forceinline const Vector4 lerp( float t, const Vector4 &vec0, const Vector4 &vec1 );
+
+// Linear interpolation between two 4-D vectors (scalar data contained in vector data type)
+// NOTE: 
+// Does not clamp t between 0 and 1.
+// 
+__forceinline const Vector4 lerp( const floatInVec &t, const Vector4 &vec0, const Vector4 &vec1 );
+
+// Spherical linear interpolation between two 4-D vectors
+// NOTE: 
+// The result is unpredictable if the vectors point in opposite directions.
+// Does not clamp t between 0 and 1.
+// The parameter names (unitVec0, unitVec1) indicate that unit-length inputs are expected.
+__forceinline const Vector4 slerp( float t, const Vector4 &unitVec0, const Vector4 &unitVec1 );
+
+// Spherical linear interpolation between two 4-D vectors (scalar data contained in vector data type)
+// NOTE: 
+// The result is unpredictable if the vectors point in opposite directions.
+// Does not clamp t between 0 and 1.
+// The parameter names (unitVec0, unitVec1) indicate that unit-length inputs are expected.
+__forceinline const Vector4 slerp( const floatInVec &t, const Vector4 &unitVec0, const Vector4 &unitVec1 );
+
+// Conditionally select between two 4-D vectors
+// NOTE: 
+// This function uses a conditional select instruction to avoid a branch.
+// However, the transfer of select1 to a vector (SSE) register may use more processing time than a branch.
+// Use the boolInVec version for better performance.
+// NOTE(review): presumably returns vec1 when select1 is true and vec0 otherwise - confirm against the definition.
+__forceinline const Vector4 select( const Vector4 &vec0, const Vector4 &vec1, bool select1 );
+
+// Conditionally select between two 4-D vectors (scalar data contained in vector data type)
+// NOTE: 
+// This function uses a conditional select instruction to avoid a branch.
+// 
+__forceinline const Vector4 select( const Vector4 &vec0, const Vector4 &vec1, const boolInVec &select1 );
+
+// Store four 4-D vectors as half-floats
+// twoQuads is the destination; NOTE(review): vec_ushort8 is declared outside this part of the file - confirm it is available in the SSE build.
+__forceinline void storeHalfFloats( const Vector4 &vec0, const Vector4 &vec1, const Vector4 &vec2, const Vector4 &vec3, vec_ushort8 * twoQuads );
+
+#ifdef _VECTORMATH_DEBUG
+
+// Print a 4-D vector
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+__forceinline void print( const Vector4 &vec );
+
+// Print a 4-D vector and an associated string identifier
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+__forceinline void print( const Vector4 &vec, const char * name );
+
+#endif
+
+// A 3-D point in array-of-structures format
+// The elements are held in a single private __m128 member (mVec128).
+class Point3
+{
+    __m128 mVec128;
+
+public:
+    // Default constructor; does no initialization
+    // 
+    __forceinline Point3( ) { };
+
+    // Construct a 3-D point from x, y, and z elements
+    // 
+    __forceinline Point3( float x, float y, float z );
+
+    // Construct a 3-D point from x, y, and z elements (scalar data contained in vector data type)
+    // 
+    __forceinline Point3( const floatInVec &x, const floatInVec &y, const floatInVec &z );
+
+    // Copy elements from a 3-D vector into a 3-D point
+    // Declared explicit, so a Vector3 is never converted to a Point3 implicitly.
+    explicit __forceinline Point3( const Vector3 &vec );
+
+    // Set all elements of a 3-D point to the same scalar value
+    // Declared explicit, so a float is never converted to a Point3 implicitly.
+    explicit __forceinline Point3( float scalar );
+
+    // Set all elements of a 3-D point to the same scalar value (scalar data contained in vector data type)
+    // 
+    explicit __forceinline Point3( const floatInVec &scalar );
+
+    // Set vector float data in a 3-D point
+    // 
+    explicit __forceinline Point3( __m128 vf4 );
+
+    // Get vector float data from a 3-D point
+    // Returned by value as an __m128; const member, so the point is not modified.
+    __forceinline __m128 get128( ) const;
+
+    // Assign one 3-D point to another
+    // 
+    __forceinline Point3 & operator =( const Point3 &pnt );
+
+    // Set the x element of a 3-D point
+    // The setters return a Point3 reference (presumably *this - confirm against the definitions).
+    __forceinline Point3 & setX( float x );
+
+    // Set the y element of a 3-D point
+    // 
+    __forceinline Point3 & setY( float y );
+
+    // Set the z element of a 3-D point
+    // 
+    __forceinline Point3 & setZ( float z );
+
+    // Set the x element of a 3-D point (scalar data contained in vector data type)
+    // 
+    __forceinline Point3 & setX( const floatInVec &x );
+
+    // Set the y element of a 3-D point (scalar data contained in vector data type)
+    // 
+    __forceinline Point3 & setY( const floatInVec &y );
+
+    // Set the z element of a 3-D point (scalar data contained in vector data type)
+    // 
+    __forceinline Point3 & setZ( const floatInVec &z );
+
+    // Get the x element of a 3-D point
+    // The getters return a floatInVec (scalar data contained in vector data type), not a float.
+    __forceinline const floatInVec getX( ) const;
+
+    // Get the y element of a 3-D point
+    // 
+    __forceinline const floatInVec getY( ) const;
+
+    // Get the z element of a 3-D point
+    // 
+    __forceinline const floatInVec getZ( ) const;
+
+    // Set an x, y, or z element of a 3-D point by index
+    // NOTE(review): presumably idx 0..2 maps to x, y, z; range checking is not visible here - confirm.
+    __forceinline Point3 & setElem( int idx, float value );
+
+    // Set an x, y, or z element of a 3-D point by index (scalar data contained in vector data type)
+    // 
+    __forceinline Point3 & setElem( int idx, const floatInVec &value );
+
+    // Get an x, y, or z element of a 3-D point by index
+    // 
+    __forceinline const floatInVec getElem( int idx ) const;
+
+    // Subscripting operator to set or get an element
+    // The non-const overload returns a VecIdx object by value (declared elsewhere), not a float reference.
+    __forceinline VecIdx operator []( int idx );
+
+    // Subscripting operator to get an element
+    // The const overload returns the element by value as a floatInVec.
+    __forceinline const floatInVec operator []( int idx ) const;
+
+    // Subtract a 3-D point from another 3-D point
+    // The difference of two points is returned as a Vector3, not a Point3.
+    __forceinline const Vector3 operator -( const Point3 &pnt ) const;
+
+    // Add a 3-D point to a 3-D vector
+    // The result is a Point3.
+    __forceinline const Point3 operator +( const Vector3 &vec ) const;
+
+    // Subtract a 3-D vector from a 3-D point
+    // The result is a Point3.
+    __forceinline const Point3 operator -( const Vector3 &vec ) const;
+
+    // Perform compound assignment and addition with a 3-D vector
+    // 
+    __forceinline Point3 & operator +=( const Vector3 &vec );
+
+    // Perform compound assignment and subtraction by a 3-D vector
+    // 
+    __forceinline Point3 & operator -=( const Vector3 &vec );
+
+};
+
+// Multiply two 3-D points per element
+// 
+__forceinline const Point3 mulPerElem( const Point3 &pnt0, const Point3 &pnt1 );
+
+// Divide two 3-D points per element
+// NOTE: 
+// Floating-point behavior matches standard library function divf4.
+// NOTE(review): divf4 is not a C++ standard library function (presumably a SIMD math library routine) - confirm this applies to the SSE build.
+__forceinline const Point3 divPerElem( const Point3 &pnt0, const Point3 &pnt1 );
+
+// Compute the reciprocal of a 3-D point per element
+// NOTE: 
+// Floating-point behavior matches standard library function recipf4.
+// NOTE(review): recipf4 is likewise not a C++ standard library function - confirm.
+__forceinline const Point3 recipPerElem( const Point3 &pnt );
+
+// Compute the absolute value of a 3-D point per element
+// 
+__forceinline const Point3 absPerElem( const Point3 &pnt );
+
+// Copy sign from one 3-D point to another, per element
+// NOTE(review): which argument supplies the sign is not visible from this declaration - confirm against the definition.
+__forceinline const Point3 copySignPerElem( const Point3 &pnt0, const Point3 &pnt1 );
+
+// Maximum of two 3-D points per element
+// 
+__forceinline const Point3 maxPerElem( const Point3 &pnt0, const Point3 &pnt1 );
+
+// Minimum of two 3-D points per element
+// 
+__forceinline const Point3 minPerElem( const Point3 &pnt0, const Point3 &pnt1 );
+
+// Maximum element of a 3-D point
+// The result is returned as a floatInVec (scalar data contained in vector data type).
+__forceinline const floatInVec maxElem( const Point3 &pnt );
+
+// Minimum element of a 3-D point
+// The result is returned as a floatInVec (scalar data contained in vector data type).
+__forceinline const floatInVec minElem( const Point3 &pnt );
+
+// Compute the sum of all elements of a 3-D point
+// 
+__forceinline const floatInVec sum( const Point3 &pnt );
+
+// Apply uniform scale to a 3-D point
+// The scaled point is returned by value; pnt is only read.
+__forceinline const Point3 scale( const Point3 &pnt, float scaleVal );
+
+// Apply uniform scale to a 3-D point (scalar data contained in vector data type)
+// 
+__forceinline const Point3 scale( const Point3 &pnt, const floatInVec &scaleVal );
+
+// Apply non-uniform scale to a 3-D point
+// The scale factors are supplied as a Vector3 (scaleVec).
+__forceinline const Point3 scale( const Point3 &pnt, const Vector3 &scaleVec );
+
+// Scalar projection of a 3-D point on a unit-length 3-D vector
+// 
+__forceinline const floatInVec projection( const Point3 &pnt, const Vector3 &unitVec );
+
+// Compute the square of the distance of a 3-D point from the coordinate-system origin
+// 
+__forceinline const floatInVec distSqrFromOrigin( const Point3 &pnt );
+
+// Compute the distance of a 3-D point from the coordinate-system origin
+// 
+__forceinline const floatInVec distFromOrigin( const Point3 &pnt );
+
+// Compute the square of the distance between two 3-D points
+// 
+__forceinline const floatInVec distSqr( const Point3 &pnt0, const Point3 &pnt1 );
+
+// Compute the distance between two 3-D points
+// 
+__forceinline const floatInVec dist( const Point3 &pnt0, const Point3 &pnt1 );
+
+// Linear interpolation between two 3-D points
+// NOTE: 
+// Does not clamp t between 0 and 1.
+// NOTE(review): presumably t = 0 yields pnt0 and t = 1 yields pnt1 - confirm against the definition.
+__forceinline const Point3 lerp( float t, const Point3 &pnt0, const Point3 &pnt1 );
+
+// Linear interpolation between two 3-D points (scalar data contained in vector data type)
+// NOTE: 
+// Does not clamp t between 0 and 1.
+// 
+__forceinline const Point3 lerp( const floatInVec &t, const Point3 &pnt0, const Point3 &pnt1 );
+
+// Conditionally select between two 3-D points
+// NOTE: 
+// This function uses a conditional select instruction to avoid a branch.
+// However, the transfer of select1 to a vector (SSE) register may use more processing time than a branch.
+// Use the boolInVec version for better performance.
+// NOTE(review): presumably returns pnt1 when select1 is true and pnt0 otherwise - confirm against the definition.
+__forceinline const Point3 select( const Point3 &pnt0, const Point3 &pnt1, bool select1 );
+
+// Conditionally select between two 3-D points (scalar data contained in vector data type)
+// NOTE: 
+// This function uses a conditional select instruction to avoid a branch.
+// 
+__forceinline const Point3 select( const Point3 &pnt0, const Point3 &pnt1, const boolInVec &select1 );
+
+// Store x, y, and z elements of 3-D point in first three words of a quadword, preserving fourth word
+// quad is the destination; it is written through a non-const __m128 pointer.
+__forceinline void storeXYZ( const Point3 &pnt, __m128 * quad );
+
+// Load four three-float 3-D points, stored in three quadwords
+// pnt0..pnt3 are output parameters (non-const references); threeQuads is only read (pointer to const).
+__forceinline void loadXYZArray( Point3 & pnt0, Point3 & pnt1, Point3 & pnt2, Point3 & pnt3, const __m128 * threeQuads );
+
+// Store four 3-D points in three quadwords
+// threeQuads is the destination; pnt0..pnt3 are only read.
+__forceinline void storeXYZArray( const Point3 &pnt0, const Point3 &pnt1, const Point3 &pnt2, const Point3 &pnt3, __m128 * threeQuads );
+
+// Store eight 3-D points as half-floats
+// threeQuads is the destination; NOTE(review): vec_ushort8 is declared outside this part of the file - confirm it is available in the SSE build.
+__forceinline void storeHalfFloats( const Point3 &pnt0, const Point3 &pnt1, const Point3 &pnt2, const Point3 &pnt3, const Point3 &pnt4, const Point3 &pnt5, const Point3 &pnt6, const Point3 &pnt7, vec_ushort8 * threeQuads );
+
+#ifdef _VECTORMATH_DEBUG
+
+// Print a 3-D point
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+__forceinline void print( const Point3 &pnt );
+
+// Print a 3-D point and an associated string identifier
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+__forceinline void print( const Point3 &pnt, const char * name );
+
+#endif
+
+// A quaternion in array-of-structures format
+//
+class Quat
+{
+    __m128 mVec128;
+
+public:
+    // Default constructor; does no initialization
+    // 
+    __forceinline Quat( ) { };
+
+	__forceinline  Quat::Quat(const Quat& quat);
+
+    // Construct a quaternion from x, y, z, and w elements
+    // 
+    __forceinline Quat( float x, float y, float z, float w );
+
+    // Construct a quaternion from x, y, z, and w elements (scalar data contained in vector data type)
+    // 
+    __forceinline Quat( const floatInVec &x, const floatInVec &y, const floatInVec &z, const floatInVec &w );
+
+    // Construct a quaternion from a 3-D vector and a scalar
+    // 
+    __forceinline Quat( const Vector3 &xyz, float w );
+
+    // Construct a quaternion from a 3-D vector and a scalar (scalar data contained in vector data type)
+    // 
+    __forceinline Quat( const Vector3 &xyz, const floatInVec &w );
+
+    // Copy elements from a 4-D vector into a quaternion
+    // 
+    explicit __forceinline Quat( const Vector4 &vec );
+
+    // Convert a rotation matrix to a unit-length quaternion
+    // 
+    explicit __forceinline Quat( const Matrix3 & rotMat );
+
+    // Set all elements of a quaternion to the same scalar value
+    // 
+    explicit __forceinline Quat( float scalar );
+
+    // Set all elements of a quaternion to the same scalar value (scalar data contained in vector data type)
+    // 
+    explicit __forceinline Quat( const floatInVec &scalar );
+
+    // Set vector float data in a quaternion
+    // 
+    explicit __forceinline Quat( __m128 vf4 );
+
+    // Get vector float data from a quaternion
+    // 
+    __forceinline __m128 get128( ) const;
+
+	// Set a quaterion from vector float data
+    //
+	__forceinline void set128(vec_float4 vec);
+
+    // Assign one quaternion to another
+    // 
+    __forceinline Quat & operator =( const Quat &quat );
+
+    // Set the x, y, and z elements of a quaternion
+    // NOTE: 
+    // This function does not change the w element.
+    // 
+    __forceinline Quat & setXYZ( const Vector3 &vec );
+
+    // Get the x, y, and z elements of a quaternion
+    // 
+    __forceinline const Vector3 getXYZ( ) const;
+
+    // Set the x element of a quaternion
+    // 
+    __forceinline Quat & setX( float x );
+
+    // Set the y element of a quaternion
+    // 
+    __forceinline Quat & setY( float y );
+
+    // Set the z element of a quaternion
+    // 
+    __forceinline Quat & setZ( float z );
+
+    // Set the w element of a quaternion
+    // 
+    __forceinline Quat & setW( float w );
+
+    // Set the x element of a quaternion (scalar data contained in vector data type)
+    // 
+    __forceinline Quat & setX( const floatInVec &x );
+
+    // Set the y element of a quaternion (scalar data contained in vector data type)
+    // 
+    __forceinline Quat & setY( const floatInVec &y );
+
+    // Set the z element of a quaternion (scalar data contained in vector data type)
+    // 
+    __forceinline Quat & setZ( const floatInVec &z );
+
+    // Set the w element of a quaternion (scalar data contained in vector data type)
+    // 
+    __forceinline Quat & setW( const floatInVec &w );
+
+    // Get the x element of a quaternion
+    // 
+    __forceinline const floatInVec getX( ) const;
+
+    // Get the y element of a quaternion
+    // 
+    __forceinline const floatInVec getY( ) const;
+
+    // Get the z element of a quaternion
+    // 
+    __forceinline const floatInVec getZ( ) const;
+
+    // Get the w element of a quaternion
+    // 
+    __forceinline const floatInVec getW( ) const;
+
+    // Set an x, y, z, or w element of a quaternion by index
+    // 
+    __forceinline Quat & setElem( int idx, float value );
+
+    // Set an x, y, z, or w element of a quaternion by index (scalar data contained in vector data type)
+    // 
+    __forceinline Quat & setElem( int idx, const floatInVec &value );
+
+    // Get an x, y, z, or w element of a quaternion by index
+    // 
+    __forceinline const floatInVec getElem( int idx ) const;
+
+    // Subscripting operator to set or get an element
+    // 
+    __forceinline VecIdx operator []( int idx );
+
+    // Subscripting operator to get an element
+    // 
+    __forceinline const floatInVec operator []( int idx ) const;
+
+    // Add two quaternions
+    // 
+    __forceinline const Quat operator +( const Quat &quat ) const;
+
+    // Subtract a quaternion from another quaternion
+    // 
+    __forceinline const Quat operator -( const Quat &quat ) const;
+
+    // Multiply two quaternions
+    // 
+    __forceinline const Quat operator *( const Quat &quat ) const;
+
+    // Multiply a quaternion by a scalar
+    // 
+    __forceinline const Quat operator *( float scalar ) const;
+
+    // Divide a quaternion by a scalar
+    // 
+    __forceinline const Quat operator /( float scalar ) const;
+
+    // Multiply a quaternion by a scalar (scalar data contained in vector data type)
+    // 
+    __forceinline const Quat operator *( const floatInVec &scalar ) const;
+
+    // Divide a quaternion by a scalar (scalar data contained in vector data type)
+    // 
+    __forceinline const Quat operator /( const floatInVec &scalar ) const;
+
+    // Perform compound assignment and addition with a quaternion
+    // 
+    __forceinline Quat & operator +=( const Quat &quat );
+
+    // Perform compound assignment and subtraction by a quaternion
+    // 
+    __forceinline Quat & operator -=( const Quat &quat );
+
+    // Perform compound assignment and multiplication by a quaternion
+    // 
+    __forceinline Quat & operator *=( const Quat &quat );
+
+    // Perform compound assignment and multiplication by a scalar
+    // 
+    __forceinline Quat & operator *=( float scalar );
+
+    // Perform compound assignment and division by a scalar
+    // 
+    __forceinline Quat & operator /=( float scalar );
+
+    // Perform compound assignment and multiplication by a scalar (scalar data contained in vector data type)
+    // 
+    __forceinline Quat & operator *=( const floatInVec &scalar );
+
+    // Perform compound assignment and division by a scalar (scalar data contained in vector data type)
+    // 
+    __forceinline Quat & operator /=( const floatInVec &scalar );
+
+    // Negate all elements of a quaternion
+    // 
+    __forceinline const Quat operator -( ) const;
+
+    // Construct an identity quaternion
+    // 
+    static __forceinline const Quat identity( );
+
+    // Construct a quaternion to rotate between two unit-length 3-D vectors
+    // NOTE: 
+    // The result is unpredictable if unitVec0 and unitVec1 point in opposite directions.
+    // 
+    static __forceinline const Quat rotation( const Vector3 &unitVec0, const Vector3 &unitVec1 );
+
+    // Construct a quaternion to rotate around a unit-length 3-D vector
+    // 
+    static __forceinline const Quat rotation( float radians, const Vector3 &unitVec );
+
+    // Construct a quaternion to rotate around a unit-length 3-D vector (scalar data contained in vector data type)
+    // 
+    static __forceinline const Quat rotation( const floatInVec &radians, const Vector3 &unitVec );
+
+    // Construct a quaternion to rotate around the x axis
+    // The angle is given in radians (per the parameter name); returns a const Quat by value.
+    static __forceinline const Quat rotationX( float radians );
+
+    // Construct a quaternion to rotate around the y axis
+    // The angle is given in radians (per the parameter name); returns a const Quat by value.
+    static __forceinline const Quat rotationY( float radians );
+
+    // Construct a quaternion to rotate around the z axis
+    // The angle is given in radians (per the parameter name); returns a const Quat by value.
+    static __forceinline const Quat rotationZ( float radians );
+
+    // Construct a quaternion to rotate around the x axis (scalar data contained in vector data type)
+    // floatInVec overload: the angle in radians is passed as a floatInVec by const reference.
+    static __forceinline const Quat rotationX( const floatInVec &radians );
+
+    // Construct a quaternion to rotate around the y axis (scalar data contained in vector data type)
+    // floatInVec overload: the angle in radians is passed as a floatInVec by const reference.
+    static __forceinline const Quat rotationY( const floatInVec &radians );
+
+    // Construct a quaternion to rotate around the z axis (scalar data contained in vector data type)
+    // floatInVec overload: the angle in radians is passed as a floatInVec by const reference.
+    static __forceinline const Quat rotationZ( const floatInVec &radians );
+
+};
+
+// Multiply a quaternion by a scalar
+// Non-member overload with the float scalar as the left-hand operand.
+__forceinline const Quat operator *( float scalar, const Quat &quat );
+
+// Multiply a quaternion by a scalar (scalar data contained in vector data type)
+// Non-member overload with the floatInVec scalar as the left-hand operand.
+__forceinline const Quat operator *( const floatInVec &scalar, const Quat &quat );
+
+// Compute the conjugate of a quaternion
+// The argument is taken by const reference; the conjugate is returned by value.
+__forceinline const Quat conj( const Quat &quat );
+
+// Use a unit-length quaternion to rotate a 3-D vector
+// unitQuat is expected to be unit length; the rotated vector is returned by value.
+__forceinline const Vector3 rotate( const Quat &unitQuat, const Vector3 &vec );
+
+// Compute the dot product of two quaternions
+// The result is returned as a floatInVec rather than a plain float.
+__forceinline const floatInVec dot( const Quat &quat0, const Quat &quat1 );
+
+// Compute the norm of a quaternion
+// NOTE(review): length() is declared separately, so this is presumably the squared length - confirm in the definition.
+__forceinline const floatInVec norm( const Quat &quat );
+
+// Compute the length of a quaternion
+// The result is returned as a floatInVec rather than a plain float.
+__forceinline const floatInVec length( const Quat &quat );
+
+// Normalize a quaternion
+// NOTE:
+// The result is unpredictable when all elements of quat are at or near zero.
+// The argument is taken by const reference; the normalized quaternion is returned by value.
+__forceinline const Quat normalize( const Quat &quat );
+
+// Linear interpolation between two quaternions
+// NOTE:
+// Does not clamp t between 0 and 1.
+// NOTE(review): presumably t = 0 yields quat0 and t = 1 yields quat1 - confirm in the definition.
+__forceinline const Quat lerp( float t, const Quat &quat0, const Quat &quat1 );
+
+// Linear interpolation between two quaternions (scalar data contained in vector data type)
+// NOTE:
+// Does not clamp t between 0 and 1.
+// floatInVec overload: t is passed as a floatInVec by const reference.
+__forceinline const Quat lerp( const floatInVec &t, const Quat &quat0, const Quat &quat1 );
+
+// Spherical linear interpolation between two quaternions
+// NOTE:
+// Interpolates along the shortest path between orientations.
+// Does not clamp t between 0 and 1.
+// Both quaternions are expected to be unit length (per the parameter names).
+__forceinline const Quat slerp( float t, const Quat &unitQuat0, const Quat &unitQuat1 );
+
+// Spherical linear interpolation between two quaternions (scalar data contained in vector data type)
+// NOTE:
+// Interpolates along the shortest path between orientations.
+// Does not clamp t between 0 and 1.
+// floatInVec overload; both quaternions are expected to be unit length (per the parameter names).
+__forceinline const Quat slerp( const floatInVec &t, const Quat &unitQuat0, const Quat &unitQuat1 );
+
+// Spherical quadrangle interpolation
+// Takes four unit-length quaternions; NOTE(review): the role of each one is not stated here - see the definition.
+__forceinline const Quat squad( float t, const Quat &unitQuat0, const Quat &unitQuat1, const Quat &unitQuat2, const Quat &unitQuat3 );
+
+// Spherical quadrangle interpolation (scalar data contained in vector data type)
+// floatInVec overload: t is passed as a floatInVec by const reference.
+__forceinline const Quat squad( const floatInVec &t, const Quat &unitQuat0, const Quat &unitQuat1, const Quat &unitQuat2, const Quat &unitQuat3 );
+
+// Conditionally select between two quaternions
+// NOTE:
+// This function uses a conditional select instruction to avoid a branch.
+// However, the transfer of select1 to a VMX register may use more processing time than a branch.
+// Use the boolInVec version for better performance.
+// NOTE(review): "VMX" is PowerPC terminology, presumably carried over from another port; read as "SIMD register" - confirm.
+__forceinline const Quat select( const Quat &quat0, const Quat &quat1, bool select1 );
+
+// Conditionally select between two quaternions (scalar data contained in vector data type)
+// NOTE:
+// This function uses a conditional select instruction to avoid a branch.
+// Here select1 is a boolInVec, i.e. the condition is already held in a vector data type.
+__forceinline const Quat select( const Quat &quat0, const Quat &quat1, const boolInVec &select1 );
+
+#ifdef _VECTORMATH_DEBUG
+
+// Print a quaternion
+// NOTE:
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// This declaration sits inside the same #ifdef, so it is also only visible in that configuration.
+__forceinline void print( const Quat &quat );
+
+// Print a quaternion and an associated string identifier
+// NOTE:
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// name is the string identifier, passed as a pointer to const char.
+__forceinline void print( const Quat &quat, const char * name );
+
+#endif
+
+// A 3x3 matrix in array-of-structures format
+// Stored as three Vector3 columns (mCol0, mCol1, mCol2).
+class Matrix3
+{
+    Vector3 mCol0;
+    Vector3 mCol1;
+    Vector3 mCol2;
+
+public:
+    // Default constructor; does no initialization
+    // The body is empty, so the column members are left to their own default construction.
+    __forceinline Matrix3( ) { };
+
+    // Copy a 3x3 matrix
+    // User-declared copy constructor; mat is taken by const reference.
+    __forceinline Matrix3( const Matrix3 & mat );
+
+    // Construct a 3x3 matrix containing the specified columns
+    // The arguments are columns 0, 1 and 2, in that order (per the parameter names).
+    __forceinline Matrix3( const Vector3 &col0, const Vector3 &col1, const Vector3 &col2 );
+
+    // Construct a 3x3 rotation matrix from a unit-length quaternion
+    // Declared explicit, so a Quat is never converted to a Matrix3 implicitly.
+    explicit __forceinline Matrix3( const Quat &unitQuat );
+
+    // Set all elements of a 3x3 matrix to the same scalar value
+    // Declared explicit, so a float is never converted to a Matrix3 implicitly.
+    explicit __forceinline Matrix3( float scalar );
+
+    // Set all elements of a 3x3 matrix to the same scalar value (scalar data contained in vector data type)
+    // Declared explicit, so a floatInVec is never converted to a Matrix3 implicitly.
+    explicit __forceinline Matrix3( const floatInVec &scalar );
+
+    // Assign one 3x3 matrix to another
+    // User-declared copy assignment; yields a non-const Matrix3 &.
+    __forceinline Matrix3 & operator =( const Matrix3 & mat );
+
+    // Set column 0 of a 3x3 matrix
+    // Yields a Matrix3 & (presumably *this, so setters can be chained - confirm in the definition).
+    __forceinline Matrix3 & setCol0( const Vector3 &col0 );
+
+    // Set column 1 of a 3x3 matrix
+    // Yields a Matrix3 & (presumably *this, so setters can be chained - confirm in the definition).
+    __forceinline Matrix3 & setCol1( const Vector3 &col1 );
+
+    // Set column 2 of a 3x3 matrix
+    // Yields a Matrix3 & (presumably *this, so setters can be chained - confirm in the definition).
+    __forceinline Matrix3 & setCol2( const Vector3 &col2 );
+
+    // Get column 0 of a 3x3 matrix
+    // Const member; the column is returned by value as a const Vector3.
+    __forceinline const Vector3 getCol0( ) const;
+
+    // Get column 1 of a 3x3 matrix
+    // Const member; the column is returned by value as a const Vector3.
+    __forceinline const Vector3 getCol1( ) const;
+
+    // Get column 2 of a 3x3 matrix
+    // Const member; the column is returned by value as a const Vector3.
+    __forceinline const Vector3 getCol2( ) const;
+
+    // Set the column of a 3x3 matrix referred to by the specified index
+    // col is the column index; NOTE(review): no range check is visible here - presumably 0..2 is required.
+    __forceinline Matrix3 & setCol( int col, const Vector3 &vec );
+
+    // Set the row of a 3x3 matrix referred to by the specified index
+    // row is the row index; vec is the Vector3 holding the new row.
+    __forceinline Matrix3 & setRow( int row, const Vector3 &vec );
+
+    // Get the column of a 3x3 matrix referred to by the specified index
+    // Const member; the selected column is returned by value.
+    __forceinline const Vector3 getCol( int col ) const;
+
+    // Get the row of a 3x3 matrix referred to by the specified index
+    // Const member; the selected row is returned by value as a Vector3.
+    __forceinline const Vector3 getRow( int row ) const;
+
+    // Subscripting operator to set or get a column
+    // Non-const overload: yields a Vector3 &, so the column can be assigned through the result.
+    __forceinline Vector3 & operator []( int col );
+
+    // Subscripting operator to get a column
+    // Const overload: the column is returned by value (const Vector3), not by reference.
+    __forceinline const Vector3 operator []( int col ) const;
+
+    // Set the element of a 3x3 matrix referred to by column and row indices
+    // Note the argument order: column index first, then row index, then the value.
+    __forceinline Matrix3 & setElem( int col, int row, float val );
+
+    // Set the element of a 3x3 matrix referred to by column and row indices (scalar data contained in vector data type)
+    // floatInVec overload; the argument order is again column index, row index, value.
+    __forceinline Matrix3 & setElem( int col, int row, const floatInVec &val );
+
+    // Get the element of a 3x3 matrix referred to by column and row indices
+    // Column index first, then row index; the element is returned as a floatInVec.
+    __forceinline const floatInVec getElem( int col, int row ) const;
+
+    // Add two 3x3 matrices
+    // Const member; the sum is returned by value as a const Matrix3.
+    __forceinline const Matrix3 operator +( const Matrix3 & mat ) const;
+
+    // Subtract a 3x3 matrix from another 3x3 matrix
+    // Const member; mat is the right-hand operand (this - mat).
+    __forceinline const Matrix3 operator -( const Matrix3 & mat ) const;
+
+    // Negate all elements of a 3x3 matrix
+    // Unary minus; const member returning the negated matrix by value.
+    __forceinline const Matrix3 operator -( ) const;
+
+    // Multiply a 3x3 matrix by a scalar
+    // Member form with the scalar on the right (matrix * scalar).
+    __forceinline const Matrix3 operator *( float scalar ) const;
+
+    // Multiply a 3x3 matrix by a scalar (scalar data contained in vector data type)
+    // Member form with the floatInVec scalar on the right (matrix * scalar).
+    __forceinline const Matrix3 operator *( const floatInVec &scalar ) const;
+
+    // Multiply a 3x3 matrix by a 3-D vector
+    // The vector is the right-hand operand (matrix * vector); the result is a const Vector3.
+    __forceinline const Vector3 operator *( const Vector3 &vec ) const;
+
+    // Multiply two 3x3 matrices
+    // mat is the right-hand operand (this * mat); the product is returned by value.
+    __forceinline const Matrix3 operator *( const Matrix3 & mat ) const;
+
+    // Perform compound assignment and addition with a 3x3 matrix
+    // The operand is taken by const reference; the operator yields a non-const Matrix3 &.
+    __forceinline Matrix3 & operator +=( const Matrix3 & mat );
+
+    // Perform compound assignment and subtraction by a 3x3 matrix
+    // The operand is taken by const reference; the operator yields a non-const Matrix3 &.
+    __forceinline Matrix3 & operator -=( const Matrix3 & mat );
+
+    // Perform compound assignment and multiplication by a scalar
+    // Plain-float overload; a floatInVec overload of *= is also declared in this class.
+    __forceinline Matrix3 & operator *=( float scalar );
+
+    // Perform compound assignment and multiplication by a scalar (scalar data contained in vector data type)
+    // floatInVec overload of the scalar *=; the scalar is passed by const reference.
+    __forceinline Matrix3 & operator *=( const floatInVec &scalar );
+
+    // Perform compound assignment and multiplication by a 3x3 matrix
+    // NOTE(review): presumably equivalent to *this = *this * mat - confirm the operand order in the definition.
+    __forceinline Matrix3 & operator *=( const Matrix3 & mat );
+
+    // Construct an identity 3x3 matrix
+    // Static factory: takes no arguments and returns a const Matrix3 by value.
+    static __forceinline const Matrix3 identity( );
+
+    // Construct a 3x3 matrix to rotate around the x axis
+    // The angle is given in radians (per the parameter name).
+    static __forceinline const Matrix3 rotationX( float radians );
+
+    // Construct a 3x3 matrix to rotate around the y axis
+    // The angle is given in radians (per the parameter name).
+    static __forceinline const Matrix3 rotationY( float radians );
+
+    // Construct a 3x3 matrix to rotate around the z axis
+    // The angle is given in radians (per the parameter name).
+    static __forceinline const Matrix3 rotationZ( float radians );
+
+    // Construct a 3x3 matrix to rotate around the x axis (scalar data contained in vector data type)
+    // floatInVec overload: the angle in radians is passed as a floatInVec by const reference.
+    static __forceinline const Matrix3 rotationX( const floatInVec &radians );
+
+    // Construct a 3x3 matrix to rotate around the y axis (scalar data contained in vector data type)
+    // floatInVec overload: the angle in radians is passed as a floatInVec by const reference.
+    static __forceinline const Matrix3 rotationY( const floatInVec &radians );
+
+    // Construct a 3x3 matrix to rotate around the z axis (scalar data contained in vector data type)
+    // floatInVec overload: the angle in radians is passed as a floatInVec by const reference.
+    static __forceinline const Matrix3 rotationZ( const floatInVec &radians );
+
+    // Construct a 3x3 matrix to rotate around the x, y, and z axes
+    // NOTE(review): presumably radiansXYZ holds the x, y and z angles in its components - confirm the composition order.
+    static __forceinline const Matrix3 rotationZYX( const Vector3 &radiansXYZ );
+
+    // Construct a 3x3 matrix to rotate around a unit-length 3-D vector
+    // The angle is given in radians; unitVec is the vector rotated around and is expected to be unit length.
+    static __forceinline const Matrix3 rotation( float radians, const Vector3 &unitVec );
+
+    // Construct a 3x3 matrix to rotate around a unit-length 3-D vector (scalar data contained in vector data type)
+    // floatInVec overload: the angle in radians is passed as a floatInVec by const reference.
+    static __forceinline const Matrix3 rotation( const floatInVec &radians, const Vector3 &unitVec );
+
+    // Construct a rotation matrix from a unit-length quaternion
+    // Static-function form; an explicit constructor taking a Quat is also declared in this class.
+    static __forceinline const Matrix3 rotation( const Quat &unitQuat );
+
+    // Construct a 3x3 matrix to perform scaling
+    // NOTE(review): presumably scaleVec holds one scale factor per axis - confirm in the definition.
+    static __forceinline const Matrix3 scale( const Vector3 &scaleVec );
+
+};
+// Multiply a 3x3 matrix by a scalar
+// Non-member overload with the float scalar as the left-hand operand.
+__forceinline const Matrix3 operator *( float scalar, const Matrix3 & mat );
+
+// Multiply a 3x3 matrix by a scalar (scalar data contained in vector data type)
+// Non-member overload with the floatInVec scalar as the left-hand operand.
+__forceinline const Matrix3 operator *( const floatInVec &scalar, const Matrix3 & mat );
+
+// Append (post-multiply) a scale transformation to a 3x3 matrix
+// NOTE:
+// Faster than creating and multiplying a scale transformation matrix.
+// The argument order follows the product: the matrix first, then the scale vector.
+__forceinline const Matrix3 appendScale( const Matrix3 & mat, const Vector3 &scaleVec );
+
+// Prepend (pre-multiply) a scale transformation to a 3x3 matrix
+// NOTE:
+// Faster than creating and multiplying a scale transformation matrix.
+// The argument order follows the product: the scale vector first, then the matrix.
+__forceinline const Matrix3 prependScale( const Vector3 &scaleVec, const Matrix3 & mat );
+
+// Multiply two 3x3 matrices per element
+// Element-wise product, as opposed to the matrix product declared as Matrix3::operator *.
+__forceinline const Matrix3 mulPerElem( const Matrix3 & mat0, const Matrix3 & mat1 );
+
+// Compute the absolute value of a 3x3 matrix per element
+// The argument is taken by const reference; the result is returned by value.
+__forceinline const Matrix3 absPerElem( const Matrix3 & mat );
+
+// Transpose of a 3x3 matrix
+// The argument is taken by const reference; the transpose is returned by value.
+__forceinline const Matrix3 transpose( const Matrix3 & mat );
+
+// Compute the inverse of a 3x3 matrix
+// NOTE:
+// Result is unpredictable when the determinant of mat is equal to or near 0.
+// No error is reported through this signature; the caller is responsible for passing an invertible mat.
+__forceinline const Matrix3 inverse( const Matrix3 & mat );
+
+// Determinant of a 3x3 matrix
+// The result is returned as a floatInVec rather than a plain float.
+__forceinline const floatInVec determinant( const Matrix3 & mat );
+
+// Conditionally select between two 3x3 matrices
+// NOTE:
+// This function uses a conditional select instruction to avoid a branch.
+// However, the transfer of select1 to a VMX register may use more processing time than a branch.
+// Use the boolInVec version for better performance.
+// NOTE(review): "VMX" is PowerPC terminology, presumably carried over from another port; read as "SIMD register" - confirm.
+__forceinline const Matrix3 select( const Matrix3 & mat0, const Matrix3 & mat1, bool select1 );
+
+// Conditionally select between two 3x3 matrices (scalar data contained in vector data type)
+// NOTE:
+// This function uses a conditional select instruction to avoid a branch.
+// Here select1 is a boolInVec, i.e. the condition is already held in a vector data type.
+__forceinline const Matrix3 select( const Matrix3 & mat0, const Matrix3 & mat1, const boolInVec &select1 );
+
+#ifdef _VECTORMATH_DEBUG
+
+// Print a 3x3 matrix
+// NOTE:
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// This declaration sits inside the same #ifdef, so it is also only visible in that configuration.
+__forceinline void print( const Matrix3 & mat );
+
+// Print a 3x3 matrix and an associated string identifier
+// NOTE:
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// name is the string identifier, passed as a pointer to const char.
+__forceinline void print( const Matrix3 & mat, const char * name );
+
+#endif
+
+// A 4x4 matrix in array-of-structures format
+// Stored as four Vector4 columns (mCol0, mCol1, mCol2, mCol3).
+class Matrix4
+{
+    Vector4 mCol0;
+    Vector4 mCol1;
+    Vector4 mCol2;
+    Vector4 mCol3;
+
+public:
+    // Default constructor; does no initialization
+    // The body is empty, so the column members are left to their own default construction.
+    __forceinline Matrix4( ) { };
+
+    // Copy a 4x4 matrix
+    // User-declared copy constructor; mat is taken by const reference.
+    __forceinline Matrix4( const Matrix4 & mat );
+
+    // Construct a 4x4 matrix containing the specified columns
+    // The arguments are columns 0 through 3, in that order (per the parameter names).
+    __forceinline Matrix4( const Vector4 &col0, const Vector4 &col1, const Vector4 &col2, const Vector4 &col3 );
+
+    // Construct a 4x4 matrix from a 3x4 transformation matrix
+    // Declared explicit, so a Transform3 is never converted to a Matrix4 implicitly.
+    explicit __forceinline Matrix4( const Transform3 & mat );
+
+    // Construct a 4x4 matrix from a 3x3 matrix and a 3-D vector
+    // NOTE(review): the bottom row is not specified here - presumably (0, 0, 0, 1); confirm in the definition.
+    __forceinline Matrix4( const Matrix3 & mat, const Vector3 &translateVec );
+
+    // Construct a 4x4 matrix from a unit-length quaternion and a 3-D vector
+    // unitQuat is expected to be unit length; translateVec is the 3-D vector part (per the parameter names).
+    __forceinline Matrix4( const Quat &unitQuat, const Vector3 &translateVec );
+
+    // Set all elements of a 4x4 matrix to the same scalar value
+    // Declared explicit, so a float is never converted to a Matrix4 implicitly.
+    explicit __forceinline Matrix4( float scalar );
+
+    // Set all elements of a 4x4 matrix to the same scalar value (scalar data contained in vector data type)
+    // Declared explicit, so a floatInVec is never converted to a Matrix4 implicitly.
+    explicit __forceinline Matrix4( const floatInVec &scalar );
+
+    // Assign one 4x4 matrix to another
+    // User-declared copy assignment; yields a non-const Matrix4 &.
+    __forceinline Matrix4 & operator =( const Matrix4 & mat );
+
+    // Set the upper-left 3x3 submatrix
+    // NOTE:
+    // This function does not change the bottom row elements.
+    // The new submatrix is passed as a Matrix3 by const reference.
+    __forceinline Matrix4 & setUpper3x3( const Matrix3 & mat3 );
+
+    // Get the upper-left 3x3 submatrix of a 4x4 matrix
+    // Const member; the submatrix is returned by value as a const Matrix3.
+    __forceinline const Matrix3 getUpper3x3( ) const;
+
+    // Set translation component
+    // NOTE:
+    // This function does not change the bottom row elements.
+    // The translation is passed as a Vector3 by const reference.
+    __forceinline Matrix4 & setTranslation( const Vector3 &translateVec );
+
+    // Get the translation component of a 4x4 matrix
+    // Const member; the translation is returned by value as a const Vector3.
+    __forceinline const Vector3 getTranslation( ) const;
+
+    // Set column 0 of a 4x4 matrix
+    // Yields a Matrix4 & (presumably *this, so setters can be chained - confirm in the definition).
+    __forceinline Matrix4 & setCol0( const Vector4 &col0 );
+
+    // Set column 1 of a 4x4 matrix
+    // Yields a Matrix4 & (presumably *this, so setters can be chained - confirm in the definition).
+    __forceinline Matrix4 & setCol1( const Vector4 &col1 );
+
+    // Set column 2 of a 4x4 matrix
+    // Yields a Matrix4 & (presumably *this, so setters can be chained - confirm in the definition).
+    __forceinline Matrix4 & setCol2( const Vector4 &col2 );
+
+    // Set column 3 of a 4x4 matrix
+    // Yields a Matrix4 & (presumably *this, so setters can be chained - confirm in the definition).
+    __forceinline Matrix4 & setCol3( const Vector4 &col3 );
+
+    // Get column 0 of a 4x4 matrix
+    // Const member; the column is returned by value as a const Vector4.
+    __forceinline const Vector4 getCol0( ) const;
+
+    // Get column 1 of a 4x4 matrix
+    // Const member; the column is returned by value as a const Vector4.
+    __forceinline const Vector4 getCol1( ) const;
+
+    // Get column 2 of a 4x4 matrix
+    // Const member; the column is returned by value as a const Vector4.
+    __forceinline const Vector4 getCol2( ) const;
+
+    // Get column 3 of a 4x4 matrix
+    // Const member; the column is returned by value as a const Vector4.
+    __forceinline const Vector4 getCol3( ) const;
+
+    // Set the column of a 4x4 matrix referred to by the specified index
+    // col is the column index; NOTE(review): no range check is visible here - presumably 0..3 is required.
+    __forceinline Matrix4 & setCol( int col, const Vector4 &vec );
+
+    // Set the row of a 4x4 matrix referred to by the specified index
+    // row is the row index; vec is the Vector4 holding the new row.
+    __forceinline Matrix4 & setRow( int row, const Vector4 &vec );
+
+    // Get the column of a 4x4 matrix referred to by the specified index
+    // Const member; the selected column is returned by value.
+    __forceinline const Vector4 getCol( int col ) const;
+
+    // Get the row of a 4x4 matrix referred to by the specified index
+    // Const member; the selected row is returned by value as a Vector4.
+    __forceinline const Vector4 getRow( int row ) const;
+
+    // Subscripting operator to set or get a column
+    // Non-const overload: yields a Vector4 &, so the column can be assigned through the result.
+    __forceinline Vector4 & operator []( int col );
+
+    // Subscripting operator to get a column
+    // Const overload: the column is returned by value (const Vector4), not by reference.
+    __forceinline const Vector4 operator []( int col ) const;
+
+    // Set the element of a 4x4 matrix referred to by column and row indices
+    // Note the argument order: column index first, then row index, then the value.
+    __forceinline Matrix4 & setElem( int col, int row, float val );
+
+    // Set the element of a 4x4 matrix referred to by column and row indices (scalar data contained in vector data type)
+    // floatInVec overload; the argument order is again column index, row index, value.
+    __forceinline Matrix4 & setElem( int col, int row, const floatInVec &val );
+
+    // Get the element of a 4x4 matrix referred to by column and row indices
+    // Column index first, then row index; the element is returned as a floatInVec.
+    __forceinline const floatInVec getElem( int col, int row ) const;
+
+    // Add two 4x4 matrices
+    // Const member; the sum is returned by value as a const Matrix4.
+    __forceinline const Matrix4 operator +( const Matrix4 & mat ) const;
+
+    // Subtract a 4x4 matrix from another 4x4 matrix
+    // Const member; mat is the right-hand operand (this - mat).
+    __forceinline const Matrix4 operator -( const Matrix4 & mat ) const;
+
+    // Negate all elements of a 4x4 matrix
+    // Unary minus; const member returning the negated matrix by value.
+    __forceinline const Matrix4 operator -( ) const;
+
+    // Multiply a 4x4 matrix by a scalar
+    // Member form with the scalar on the right (matrix * scalar).
+    __forceinline const Matrix4 operator *( float scalar ) const;
+
+    // Multiply a 4x4 matrix by a scalar (scalar data contained in vector data type)
+    // Member form with the floatInVec scalar on the right (matrix * scalar).
+    __forceinline const Matrix4 operator *( const floatInVec &scalar ) const;
+
+    // Multiply a 4x4 matrix by a 4-D vector
+    // The vector is the right-hand operand (matrix * vector); the result is a const Vector4.
+    __forceinline const Vector4 operator *( const Vector4 &vec ) const;
+
+    // Multiply a 4x4 matrix by a 3-D vector
+    // Takes a Vector3 but returns a Vector4; NOTE(review): presumably vec is treated as having w = 0 - confirm.
+    __forceinline const Vector4 operator *( const Vector3 &vec ) const;
+
+    // Multiply a 4x4 matrix by a 3-D point
+    // Takes a Point3 but returns a Vector4; NOTE(review): presumably pnt is treated as having w = 1 - confirm.
+    __forceinline const Vector4 operator *( const Point3 &pnt ) const;
+
+    // Multiply two 4x4 matrices
+    // mat is the right-hand operand (this * mat); the product is returned by value.
+    __forceinline const Matrix4 operator *( const Matrix4 & mat ) const;
+
+    // Multiply a 4x4 matrix by a 3x4 transformation matrix
+    // tfrm is the right-hand operand (this * tfrm); the result is a full Matrix4.
+    __forceinline const Matrix4 operator *( const Transform3 & tfrm ) const;
+
+    // Perform compound assignment and addition with a 4x4 matrix
+    // The operand is taken by const reference; the operator yields a non-const Matrix4 &.
+    __forceinline Matrix4 & operator +=( const Matrix4 & mat );
+
+    // Perform compound assignment and subtraction by a 4x4 matrix
+    // The operand is taken by const reference; the operator yields a non-const Matrix4 &.
+    __forceinline Matrix4 & operator -=( const Matrix4 & mat );
+
+    // Perform compound assignment and multiplication by a scalar
+    // Plain-float overload; a floatInVec overload of *= is also declared in this class.
+    __forceinline Matrix4 & operator *=( float scalar );
+
+    // Perform compound assignment and multiplication by a scalar (scalar data contained in vector data type)
+    // floatInVec overload of the scalar *=; the scalar is passed by const reference.
+    __forceinline Matrix4 & operator *=( const floatInVec &scalar );
+
+    // Perform compound assignment and multiplication by a 4x4 matrix
+    // NOTE(review): presumably equivalent to *this = *this * mat - confirm the operand order in the definition.
+    __forceinline Matrix4 & operator *=( const Matrix4 & mat );
+
+    // Perform compound assignment and multiplication by a 3x4 transformation matrix
+    // NOTE(review): presumably equivalent to *this = *this * tfrm - confirm the operand order in the definition.
+    __forceinline Matrix4 & operator *=( const Transform3 & tfrm );
+
+    // Construct an identity 4x4 matrix
+    // Static factory: takes no arguments and returns a const Matrix4 by value.
+    static __forceinline const Matrix4 identity( );
+
+    // Construct a 4x4 matrix to rotate around the x axis
+    // The angle is given in radians (per the parameter name).
+    static __forceinline const Matrix4 rotationX( float radians );
+
+    // Construct a 4x4 matrix to rotate around the y axis
+    // The angle is given in radians (per the parameter name).
+    static __forceinline const Matrix4 rotationY( float radians );
+
+    // Construct a 4x4 matrix to rotate around the z axis
+    // The angle is given in radians (per the parameter name).
+    static __forceinline const Matrix4 rotationZ( float radians );
+
+    // Construct a 4x4 matrix to rotate around the x axis (scalar data contained in vector data type)
+    // floatInVec overload: the angle in radians is passed as a floatInVec by const reference.
+    static __forceinline const Matrix4 rotationX( const floatInVec &radians );
+
+    // Construct a 4x4 matrix to rotate around the y axis (scalar data contained in vector data type)
+    // floatInVec overload: the angle in radians is passed as a floatInVec by const reference.
+    static __forceinline const Matrix4 rotationY( const floatInVec &radians );
+
+    // Construct a 4x4 matrix to rotate around the z axis (scalar data contained in vector data type)
+    // floatInVec overload: the angle in radians is passed as a floatInVec by const reference.
+    static __forceinline const Matrix4 rotationZ( const floatInVec &radians );
+
+    // Construct a 4x4 matrix to rotate around the x, y, and z axes
+    // NOTE(review): presumably radiansXYZ holds the x, y and z angles in its components - confirm the composition order.
+    static __forceinline const Matrix4 rotationZYX( const Vector3 &radiansXYZ );
+
+    // Construct a 4x4 matrix to rotate around a unit-length 3-D vector
+    // The angle is given in radians; unitVec is the vector rotated around and is expected to be unit length.
+    static __forceinline const Matrix4 rotation( float radians, const Vector3 &unitVec );
+
+    // Construct a 4x4 matrix to rotate around a unit-length 3-D vector (scalar data contained in vector data type)
+    // floatInVec overload: the angle in radians is passed as a floatInVec by const reference.
+    static __forceinline const Matrix4 rotation( const floatInVec &radians, const Vector3 &unitVec );
+
+    // Construct a rotation matrix from a unit-length quaternion
+    // unitQuat is expected to be unit length; the result is a full Matrix4 returned by value.
+    static __forceinline const Matrix4 rotation( const Quat &unitQuat );
+
+    // Construct a 4x4 matrix to perform scaling
+    // NOTE(review): presumably scaleVec holds one scale factor per axis - confirm in the definition.
+    static __forceinline const Matrix4 scale( const Vector3 &scaleVec );
+
+    // Construct a 4x4 matrix to perform translation
+    // The translation is passed as a Vector3 by const reference.
+    static __forceinline const Matrix4 translation( const Vector3 &translateVec );
+
+    // Construct viewing matrix based on eye, position looked at, and up direction
+    // eyePos and lookAtPos are positions (Point3); upVec is a direction (Vector3).
+    static __forceinline const Matrix4 lookAt( const Point3 &eyePos, const Point3 &lookAtPos, const Vector3 &upVec );
+
+    // Construct a perspective projection matrix
+    // NOTE(review): presumably fovyRadians is the vertical field of view and aspect is width / height - confirm.
+    static __forceinline const Matrix4 perspective( float fovyRadians, float aspect, float zNear, float zFar );
+
+    // Construct a perspective projection matrix based on frustum
+    // The six floats are given in the order left, right, bottom, top, zNear, zFar.
+    static __forceinline const Matrix4 frustum( float left, float right, float bottom, float top, float zNear, float zFar );
+
+    // Construct an orthographic projection matrix
+    // The six floats are given in the order left, right, bottom, top, zNear, zFar.
+    static __forceinline const Matrix4 orthographic( float left, float right, float bottom, float top, float zNear, float zFar );
+
+};
+// Multiply a 4x4 matrix by a scalar
+// Non-member overload with the float scalar as the left-hand operand.
+__forceinline const Matrix4 operator *( float scalar, const Matrix4 & mat );
+
+// Multiply a 4x4 matrix by a scalar (scalar data contained in vector data type)
+// Non-member overload with the floatInVec scalar as the left-hand operand.
+__forceinline const Matrix4 operator *( const floatInVec &scalar, const Matrix4 & mat );
+
+// Append (post-multiply) a scale transformation to a 4x4 matrix
+// NOTE:
+// Faster than creating and multiplying a scale transformation matrix.
+// The argument order follows the product: the matrix first, then the scale vector.
+__forceinline const Matrix4 appendScale( const Matrix4 & mat, const Vector3 &scaleVec );
+
+// Prepend (pre-multiply) a scale transformation to a 4x4 matrix
+// NOTE:
+// Faster than creating and multiplying a scale transformation matrix.
+// The argument order follows the product: the scale vector first, then the matrix.
+__forceinline const Matrix4 prependScale( const Vector3 &scaleVec, const Matrix4 & mat );
+
+// Multiply two 4x4 matrices per element
+// Element-wise product, as opposed to the matrix product declared as Matrix4::operator *.
+__forceinline const Matrix4 mulPerElem( const Matrix4 & mat0, const Matrix4 & mat1 );
+
+// Compute the absolute value of a 4x4 matrix per element
+// The argument is taken by const reference; the result is returned by value.
+__forceinline const Matrix4 absPerElem( const Matrix4 & mat );
+
+// Transpose of a 4x4 matrix
+// The argument is taken by const reference; the transpose is returned by value.
+__forceinline const Matrix4 transpose( const Matrix4 & mat );
+
+// Compute the inverse of a 4x4 matrix
+// NOTE:
+// Result is unpredictable when the determinant of mat is equal to or near 0.
+// No error is reported through this signature; the caller is responsible for passing an invertible mat.
+__forceinline const Matrix4 inverse( const Matrix4 & mat );
+
+// Compute the inverse of a 4x4 matrix, which is expected to be an affine matrix
+// NOTE:
+// This can be used to achieve better performance than a general inverse when the specified 4x4 matrix meets the given restrictions.  The result is unpredictable when the determinant of mat is equal to or near 0.
+// NOTE(review): presumably "affine" means the bottom row is (0, 0, 0, 1) - confirm in the definition.
+__forceinline const Matrix4 affineInverse( const Matrix4 & mat );
+
+// Compute the inverse of a 4x4 matrix, which is expected to be an affine matrix with an orthogonal upper-left 3x3 submatrix
+// NOTE:
+// This can be used to achieve better performance than a general inverse when the specified 4x4 matrix meets the given restrictions.
+// The restrictions are those in the summary line: affine, with an orthogonal upper-left 3x3 submatrix.
+__forceinline const Matrix4 orthoInverse( const Matrix4 & mat );
+
+// Determinant of a 4x4 matrix
+// The result is returned as a floatInVec rather than a plain float.
+__forceinline const floatInVec determinant( const Matrix4 & mat );
+
+// Conditionally select between two 4x4 matrices
+// NOTE:
+// This function uses a conditional select instruction to avoid a branch.
+// However, the transfer of select1 to a VMX register may use more processing time than a branch.
+// Use the boolInVec version for better performance.
+// NOTE(review): "VMX" is PowerPC terminology, presumably carried over from another port; read as "SIMD register" - confirm.
+__forceinline const Matrix4 select( const Matrix4 & mat0, const Matrix4 & mat1, bool select1 );
+
+// Conditionally select between two 4x4 matrices (scalar data contained in vector data type)
+// NOTE:
+// This function uses a conditional select instruction to avoid a branch.
+// Here select1 is a boolInVec, i.e. the condition is already held in a vector data type.
+__forceinline const Matrix4 select( const Matrix4 & mat0, const Matrix4 & mat1, const boolInVec &select1 );
+
+#ifdef _VECTORMATH_DEBUG
+
+// Print a 4x4 matrix
+// NOTE:
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// This declaration sits inside the same #ifdef, so it is also only visible in that configuration.
+__forceinline void print( const Matrix4 & mat );
+
+// Print a 4x4 matrix and an associated string identifier
+// NOTE:
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// name is the string identifier, passed as a pointer to const char.
+__forceinline void print( const Matrix4 & mat, const char * name );
+
+#endif
+
+// A 3x4 transformation matrix in array-of-structures format
+//
+class Transform3
+{
+    Vector3 mCol0;
+    Vector3 mCol1;
+    Vector3 mCol2;
+    Vector3 mCol3;
+
+public:
+    // Default constructor; does no initialization
+    // 
+    __forceinline Transform3( ) { };
+
+    // Copy a 3x4 transformation matrix
+    // 
+    __forceinline Transform3( const Transform3 & tfrm );
+
+    // Construct a 3x4 transformation matrix containing the specified columns
+    // 
+    __forceinline Transform3( const Vector3 &col0, const Vector3 &col1, const Vector3 &col2, const Vector3 &col3 );
+
+    // Construct a 3x4 transformation matrix from a 3x3 matrix and a 3-D vector
+    // 
+    __forceinline Transform3( const Matrix3 & tfrm, const Vector3 &translateVec );
+
+    // Construct a 3x4 transformation matrix from a unit-length quaternion and a 3-D vector
+    // 
+    __forceinline Transform3( const Quat &unitQuat, const Vector3 &translateVec );
+
+    // Set all elements of a 3x4 transformation matrix to the same scalar value
+    // 
+    explicit __forceinline Transform3( float scalar );
+
+    // Set all elements of a 3x4 transformation matrix to the same scalar value (scalar data contained in vector data type)
+    // 
+    explicit __forceinline Transform3( const floatInVec &scalar );
+
+    // Assign one 3x4 transformation matrix to another
+    // 
+    __forceinline Transform3 & operator =( const Transform3 & tfrm );
+
+    // Set the upper-left 3x3 submatrix
+    // 
+    __forceinline Transform3 & setUpper3x3( const Matrix3 & mat3 );
+
+    // Get the upper-left 3x3 submatrix of a 3x4 transformation matrix
+    // 
+    __forceinline const Matrix3 getUpper3x3( ) const;
+
+    // Set translation component
+    // 
+    __forceinline Transform3 & setTranslation( const Vector3 &translateVec );
+
+    // Get the translation component of a 3x4 transformation matrix
+    // 
+    __forceinline const Vector3 getTranslation( ) const;
+
+    // Set column 0 of a 3x4 transformation matrix
+    // 
+    __forceinline Transform3 & setCol0( const Vector3 &col0 );
+
+    // Set column 1 of a 3x4 transformation matrix
+    // 
+    __forceinline Transform3 & setCol1( const Vector3 &col1 );
+
+    // Set column 2 of a 3x4 transformation matrix
+    // 
+    __forceinline Transform3 & setCol2( const Vector3 &col2 );
+
+    // Set column 3 of a 3x4 transformation matrix
+    // 
+    __forceinline Transform3 & setCol3( const Vector3 &col3 );
+
+    // Get column 0 of a 3x4 transformation matrix
+    // 
+    __forceinline const Vector3 getCol0( ) const;
+
+    // Get column 1 of a 3x4 transformation matrix
+    // 
+    __forceinline const Vector3 getCol1( ) const;
+
+    // Get column 2 of a 3x4 transformation matrix
+    // 
+    __forceinline const Vector3 getCol2( ) const;
+
+    // Get column 3 of a 3x4 transformation matrix
+    // 
+    __forceinline const Vector3 getCol3( ) const;
+
+    // Set the column of a 3x4 transformation matrix referred to by the specified index
+    // 
+    __forceinline Transform3 & setCol( int col, const Vector3 &vec );
+
+    // Set the row of a 3x4 transformation matrix referred to by the specified index
+    // 
+    __forceinline Transform3 & setRow( int row, const Vector4 &vec );
+
+    // Get the column of a 3x4 transformation matrix referred to by the specified index
+    // 
+    __forceinline const Vector3 getCol( int col ) const;
+
+    // Get the row of a 3x4 transformation matrix referred to by the specified index
+    // 
+    __forceinline const Vector4 getRow( int row ) const;
+
+    // Subscripting operator to set or get a column
+    // 
+    __forceinline Vector3 & operator []( int col );
+
+    // Subscripting operator to get a column
+    // 
+    __forceinline const Vector3 operator []( int col ) const;
+
+    // Set the element of a 3x4 transformation matrix referred to by column and row indices
+    // 
+    __forceinline Transform3 & setElem( int col, int row, float val );
+
+    // Set the element of a 3x4 transformation matrix referred to by column and row indices (scalar data contained in vector data type)
+    // 
+    __forceinline Transform3 & setElem( int col, int row, const floatInVec &val );
+
+    // Get the element of a 3x4 transformation matrix referred to by column and row indices
+    // 
+    __forceinline const floatInVec getElem( int col, int row ) const;
+
+    // Multiply a 3x4 transformation matrix by a 3-D vector
+    // 
+    __forceinline const Vector3 operator *( const Vector3 &vec ) const;
+
+    // Multiply a 3x4 transformation matrix by a 3-D point
+    // 
+    __forceinline const Point3 operator *( const Point3 &pnt ) const;
+
+    // Multiply two 3x4 transformation matrices
+    // 
+    __forceinline const Transform3 operator *( const Transform3 & tfrm ) const;
+
+    // Perform compound assignment and multiplication by a 3x4 transformation matrix
+    // 
+    __forceinline Transform3 & operator *=( const Transform3 & tfrm );
+
+    // Construct an identity 3x4 transformation matrix
+    // 
+    static __forceinline const Transform3 identity( );
+
+    // Construct a 3x4 transformation matrix to rotate around the x axis
+    // 
+    static __forceinline const Transform3 rotationX( float radians );
+
+    // Construct a 3x4 transformation matrix to rotate around the y axis
+    // 
+    static __forceinline const Transform3 rotationY( float radians );
+
+    // Construct a 3x4 transformation matrix to rotate around the z axis
+    // 
+    static __forceinline const Transform3 rotationZ( float radians );
+
+    // Construct a 3x4 transformation matrix to rotate around the x axis (scalar data contained in vector data type)
+    // 
+    static __forceinline const Transform3 rotationX( const floatInVec &radians );
+
+    // Construct a 3x4 transformation matrix to rotate around the y axis (scalar data contained in vector data type)
+    // 
+    static __forceinline const Transform3 rotationY( const floatInVec &radians );
+
+    // Construct a 3x4 transformation matrix to rotate around the z axis (scalar data contained in vector data type)
+    // 
+    static __forceinline const Transform3 rotationZ( const floatInVec &radians );
+
+    // Construct a 3x4 transformation matrix to rotate around the x, y, and z axes
+    // NOTE(review): presumably radiansXYZ holds one angle per axis and the name gives the composition order - confirm in mat_aos.h.
+    static __forceinline const Transform3 rotationZYX( const Vector3 &radiansXYZ );
+
+    // Construct a 3x4 transformation matrix to rotate around a unit-length 3-D vector
+    // 
+    static __forceinline const Transform3 rotation( float radians, const Vector3 &unitVec );
+
+    // Construct a 3x4 transformation matrix to rotate around a unit-length 3-D vector (scalar data contained in vector data type)
+    // 
+    static __forceinline const Transform3 rotation( const floatInVec &radians, const Vector3 &unitVec );
+
+    // Construct a 3x4 transformation matrix to rotate by a unit-length quaternion
+    // 
+    static __forceinline const Transform3 rotation( const Quat &unitQuat );
+
+    // Construct a 3x4 transformation matrix to perform scaling
+    // 
+    static __forceinline const Transform3 scale( const Vector3 &scaleVec );
+
+    // Construct a 3x4 transformation matrix to perform translation
+    // 
+    static __forceinline const Transform3 translation( const Vector3 &translateVec );
+
+};
+// Append (post-multiply) a scale transformation to a 3x4 transformation matrix
+// NOTE: 
+// Faster than creating and multiplying a scale transformation matrix.
+// Per the description above, stands in for tfrm * Transform3::scale( scaleVec ); tfrm itself is not modified.
+__forceinline const Transform3 appendScale( const Transform3 & tfrm, const Vector3 &scaleVec );
+
+// Prepend (pre-multiply) a scale transformation to a 3x4 transformation matrix
+// NOTE: 
+// Faster than creating and multiplying a scale transformation matrix.
+// Per the description above, stands in for Transform3::scale( scaleVec ) * tfrm; tfrm itself is not modified.
+__forceinline const Transform3 prependScale( const Vector3 &scaleVec, const Transform3 & tfrm );
+
+// Multiply two 3x4 transformation matrices per element
+// This is an element-by-element product, not the matrix product; use Transform3::operator * for the latter.
+__forceinline const Transform3 mulPerElem( const Transform3 & tfrm0, const Transform3 & tfrm1 );
+
+// Compute the absolute value of a 3x4 transformation matrix per element
+// The result is returned by value; tfrm is taken by const reference and is not modified.
+__forceinline const Transform3 absPerElem( const Transform3 & tfrm );
+
+// Compute the inverse of a 3x4 transformation matrix
+// NOTE: 
+// Result is unpredictable when the determinant of the upper-left 3x3 submatrix is equal to or near 0.
+// 
+__forceinline const Transform3 inverse( const Transform3 & tfrm );
+
+// Compute the inverse of a 3x4 transformation matrix, expected to have an orthogonal upper-left 3x3 submatrix
+// NOTE: 
+// This can be used to achieve better performance than a general inverse when the specified 3x4 transformation matrix meets the given restrictions.
+// Use inverse() when the upper-left 3x3 submatrix is not known to be orthogonal.
+__forceinline const Transform3 orthoInverse( const Transform3 & tfrm );
+
+// Conditionally select between two 3x4 transformation matrices
+// NOTE: 
+// This function uses a conditional select instruction to avoid a branch.
+// However, the transfer of select1 to a vector (SIMD) register may use more processing time than a branch.
+// Use the boolInVec version for better performance.
+// NOTE(review): presumably yields tfrm1 when select1 is true and tfrm0 otherwise (inferred from the parameter name) - confirm.
+__forceinline const Transform3 select( const Transform3 & tfrm0, const Transform3 & tfrm1, bool select1 );
+
+// Conditionally select between two 3x4 transformation matrices (scalar data contained in vector data type)
+// NOTE: 
+// This function uses a conditional select instruction to avoid a branch.
+// NOTE(review): presumably yields tfrm1 when select1 is true and tfrm0 otherwise (inferred from the parameter name) - confirm.
+__forceinline const Transform3 select( const Transform3 & tfrm0, const Transform3 & tfrm1, const boolInVec &select1 );
+
+#ifdef _VECTORMATH_DEBUG
+
+// Print a 3x4 transformation matrix
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// Callers therefore need the same _VECTORMATH_DEBUG guard around any use of it.
+__forceinline void print( const Transform3 & tfrm );
+
+// Print a 3x4 transformation matrix and an associated string identifier
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// Callers therefore need the same _VECTORMATH_DEBUG guard; name is passed as a C string (const char *).
+__forceinline void print( const Transform3 & tfrm, const char * name );
+
+#endif
+
+} // namespace Aos
+} // namespace Vectormath
+
+#include "vec_aos.h"
+#include "quat_aos.h"
+#include "mat_aos.h"
+
+#endif