Add PhysicsEffects to Extras. The build is only tested on Windows and Android.

The Android/NEON optimized version of Physics Effects is thanks to Graham Rhodes and Anthony Hamilton. See Issue 587.
2012-03-05 04:59:58 +00:00
parent 6cf8dfc202
commit a93a661b94
462 changed files with 86626 additions and 0 deletions
--- a/Extras/PhysicsEffects/include/vecmath/neon/boolInVec.h
+++ b/Extras/PhysicsEffects/include/vecmath/neon/boolInVec.h
@@ -0,0 +1,238 @@
+/*
+   Copyright (C) 2006-2010 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+   Redistribution and use in source and binary forms,
+   with or without modification, are permitted provided that the
+   following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the Sony Computer Entertainment Inc nor the names
+      of its contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _BOOLINVEC_SCALAR_H
+#define _BOOLINVEC_SCALAR_H
+
+#include <math.h>
+namespace Vectormath {
+
+class floatInVec;
+
+//--------------------------------------------------------------------------------------------------
+// boolInVec class
+//
+
+// Scalar stand-in for a boolean value.
+// The value is kept as an unsigned integer mask: the bool constructor stores
+// all bits set for true and zero for false, and getAsBool() reports any
+// non-zero mask as true.
+class boolInVec
+{
+private:
+    // Mask form of the boolean: zero means false, non-zero means true.
+    unsigned int mData;
+
+public:
+    // Default constructor; does no initialization, so mData is indeterminate
+    // until a value is assigned
+    //
+    inline boolInVec( ) { };
+
+    // Construct from a float value: true when the float compares not-equal
+    // to 0.0f. This constructor is not explicit, so it also acts as an
+    // implicit conversion from floatInVec
+    //
+    inline boolInVec(floatInVec vec);
+
+    // Explicit cast from bool
+    //
+    explicit inline boolInVec(bool scalar);
+
+    // Explicit cast to bool
+    //
+    inline bool getAsBool() const;
+
+#ifndef _VECTORMATH_NO_SCALAR_CAST
+    // Implicit cast to bool; compiled out when _VECTORMATH_NO_SCALAR_CAST
+    // is defined
+    //
+    inline operator bool() const;
+#endif
+
+    // Boolean negation operator; returns a new value, *this is unchanged
+    //
+    inline const boolInVec operator ! () const;
+
+    // Assignment operator; takes its argument by value
+    //
+    inline boolInVec& operator = (boolInVec vec);
+
+    // Boolean and assignment operator
+    //
+    inline boolInVec& operator &= (boolInVec vec);
+
+    // Boolean exclusive or assignment operator
+    //
+    inline boolInVec& operator ^= (boolInVec vec);
+
+    // Boolean or assignment operator
+    //
+    inline boolInVec& operator |= (boolInVec vec);
+
+};
+
+// Equal operator
+//
+inline const boolInVec operator == (boolInVec vec0, boolInVec vec1);
+
+// Not equal operator
+//
+inline const boolInVec operator != (boolInVec vec0, boolInVec vec1);
+
+// And operator
+//
+inline const boolInVec operator & (boolInVec vec0, boolInVec vec1);
+
+// Exclusive or operator
+//
+inline const boolInVec operator ^ (boolInVec vec0, boolInVec vec1);
+
+// Or operator
+//
+inline const boolInVec operator | (boolInVec vec0, boolInVec vec1);
+
+// Conditionally select between two values
+//
+inline const boolInVec select(boolInVec vec0, boolInVec vec1, boolInVec select_vec1);
+
+
+} // namespace Vectormath
+
+
+//--------------------------------------------------------------------------------------------------
+// boolInVec implementation
+//
+
+#include "floatInVec.h"
+
+namespace Vectormath {
+
+// Builds a boolean from a float: true when vec compares not-equal to 0.0f.
+inline
+boolInVec::boolInVec(floatInVec vec)
+{
+    mData = (vec != floatInVec(0.0f)).mData;
+}
+
+// Builds the mask form of a bool: all bits set for true, zero for false.
+inline
+boolInVec::boolInVec(bool scalar)
+{
+    mData = scalar ? ~0u : 0u;
+}
+
+// Returns true when the stored mask is non-zero.
+inline
+bool
+boolInVec::getAsBool() const
+{
+    return mData != 0u;
+}
+
+#ifndef _VECTORMATH_NO_SCALAR_CAST
+// Implicit conversion to bool; yields the same result as getAsBool().
+inline
+boolInVec::operator bool() const
+{
+    const bool asBool = this->getAsBool();
+    return asBool;
+}
+#endif
+
+// Logical negation: the result is true exactly when the stored mask is zero.
+inline
+const boolInVec
+boolInVec::operator ! () const
+{
+    return boolInVec(mData == 0u);
+}
+
+// Copies the mask from vec and returns a reference to this object.
+inline
+boolInVec&
+boolInVec::operator = (boolInVec vec)
+{
+    this->mData = vec.mData;
+    return *this;
+}
+
+// Compound assignments: each one applies the matching binary operator to
+// (*this, vec), stores the result in *this and returns *this.
+inline
+boolInVec&
+boolInVec::operator &= (boolInVec vec)
+{
+    return *this = (*this & vec);
+}
+
+inline
+boolInVec&
+boolInVec::operator ^= (boolInVec vec)
+{
+    return *this = (*this ^ vec);
+}
+
+inline
+boolInVec&
+boolInVec::operator |= (boolInVec vec)
+{
+    return *this = (*this | vec);
+}
+
+// True when both operands hold the same truth value.
+inline
+const boolInVec
+operator == (boolInVec vec0, boolInVec vec1)
+{
+    const bool lhs = vec0.getAsBool();
+    const bool rhs = vec1.getAsBool();
+    return boolInVec(lhs == rhs);
+}
+
+// True when the operands hold different truth values.
+inline
+const boolInVec
+operator != (boolInVec vec0, boolInVec vec1)
+{
+    return boolInVec(vec0.getAsBool() != vec1.getAsBool());
+}
+
+// Boolean AND of the two truth values.
+inline
+const boolInVec
+operator & (boolInVec vec0, boolInVec vec1)
+{
+    const bool lhs = vec0.getAsBool();
+    const bool rhs = vec1.getAsBool();
+    return boolInVec(lhs && rhs);
+}
+
+// Boolean OR of the two truth values.
+inline
+const boolInVec
+operator | (boolInVec vec0, boolInVec vec1)
+{
+    const bool lhs = vec0.getAsBool();
+    const bool rhs = vec1.getAsBool();
+    return boolInVec(lhs || rhs);
+}
+
+// Boolean exclusive OR: true when exactly one operand is true.
+inline
+const boolInVec
+operator ^ (boolInVec vec0, boolInVec vec1)
+{
+    const bool lhs = vec0.getAsBool();
+    const bool rhs = vec1.getAsBool();
+    return boolInVec(lhs != rhs);
+}
+
+// Returns vec1 when select_vec1 is true, otherwise vec0.
+inline
+const boolInVec
+select(boolInVec vec0, boolInVec vec1, boolInVec select_vec1)
+{
+    if (select_vec1.getAsBool()) {
+        return vec1;
+    }
+    return vec0;
+}
+
+} // namespace Vectormath
+
+#endif // _BOOLINVEC_SCALAR_H
--- a/Extras/PhysicsEffects/include/vecmath/neon/floatInVec.h
+++ b/Extras/PhysicsEffects/include/vecmath/neon/floatInVec.h
@@ -0,0 +1,357 @@
+/*
+   Copyright (C) 2006-2010 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+   Redistribution and use in source and binary forms,
+   with or without modification, are permitted provided that the
+   following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the Sony Computer Entertainment Inc nor the names
+      of its contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _FLOATINVEC__SCALAR_H
+#define _FLOATINVEC__SCALAR_H
+
+#include <math.h>
+namespace Vectormath {
+
+class boolInVec;
+
+//--------------------------------------------------------------------------------------------------
+// floatInVec class
+//
+
+// A class representing a scalar float value contained in a vector register.
+// In this implementation the value is held in a plain float member.
+// This class does not support fastmath
+class floatInVec
+{
+private:
+    // The wrapped scalar value.
+    float mData;
+
+public:
+    // Default constructor; does no initialization, so mData is indeterminate
+    // until a value is assigned
+    //
+    inline floatInVec( ) { };
+
+    // Construct from a value converted from bool. This constructor is not
+    // explicit, so it also acts as an implicit conversion from boolInVec
+    //
+    inline floatInVec(boolInVec vec);
+
+    // Explicit cast from float
+    //
+    explicit inline floatInVec(float scalar);
+
+    // Explicit cast to float
+    //
+    inline float getAsFloat() const;
+
+#ifndef _VECTORMATH_NO_SCALAR_CAST
+    // Implicit cast to float; compiled out when _VECTORMATH_NO_SCALAR_CAST
+    // is defined
+    //
+    inline operator float() const;
+#endif
+
+    // Post increment (add 1.0f); returns the value held before the increment
+    //
+    inline const floatInVec operator ++ (int);
+
+    // Post decrement (subtract 1.0f); returns the value held before the decrement
+    //
+    inline const floatInVec operator -- (int);
+
+    // Pre increment (add 1.0f); returns a reference to this object
+    //
+    inline floatInVec& operator ++ ();
+
+    // Pre decrement (subtract 1.0f); returns a reference to this object
+    //
+    inline floatInVec& operator -- ();
+
+    // Negation operator; returns a new value, *this is unchanged
+    //
+    inline const floatInVec operator - () const;
+
+    // Assignment operator; takes its argument by value
+    //
+    inline floatInVec& operator = (floatInVec vec);
+
+    // Multiplication assignment operator
+    //
+    inline floatInVec& operator *= (floatInVec vec);
+
+    // Division assignment operator
+    //
+    inline floatInVec& operator /= (floatInVec vec);
+
+    // Addition assignment operator
+    //
+    inline floatInVec& operator += (floatInVec vec);
+
+    // Subtraction assignment operator
+    //
+    inline floatInVec& operator -= (floatInVec vec);
+
+};
+
+// Multiplication operator
+//
+inline const floatInVec operator * (floatInVec vec0, floatInVec vec1);
+
+// Division operator
+//
+inline const floatInVec operator / (floatInVec vec0, floatInVec vec1);
+
+// Addition operator
+//
+inline const floatInVec operator + (floatInVec vec0, floatInVec vec1);
+
+// Subtraction operator
+//
+inline const floatInVec operator - (floatInVec vec0, floatInVec vec1);
+
+// Less than operator
+//
+inline const boolInVec operator < (floatInVec vec0, floatInVec vec1);
+
+// Less than or equal operator
+//
+inline const boolInVec operator <= (floatInVec vec0, floatInVec vec1);
+
+// Greater than operator
+//
+inline const boolInVec operator > (floatInVec vec0, floatInVec vec1);
+
+// Greater than or equal operator
+//
+inline const boolInVec operator >= (floatInVec vec0, floatInVec vec1);
+
+// Equal operator
+//
+inline const boolInVec operator == (floatInVec vec0, floatInVec vec1);
+
+// Not equal operator
+//
+inline const boolInVec operator != (floatInVec vec0, floatInVec vec1);
+
+// Conditionally select between two values
+//
+inline const floatInVec select(floatInVec vec0, floatInVec vec1, boolInVec select_vec1);
+
+
+} // namespace Vectormath
+
+
+//--------------------------------------------------------------------------------------------------
+// floatInVec implementation
+//
+
+#include "boolInVec.h"
+
+namespace Vectormath {
+
+// Converts a boolean to a float: 1.0f for true, 0.0f for false.
+inline
+floatInVec::floatInVec(boolInVec vec)
+{
+    mData = vec.getAsBool() ? 1.0f : 0.0f;
+}
+
+// Wraps a plain float.
+inline
+floatInVec::floatInVec(float scalar)
+{
+    this->mData = scalar;
+}
+
+// Returns the wrapped float value.
+inline
+float
+floatInVec::getAsFloat() const
+{
+    return this->mData;
+}
+
+#ifndef _VECTORMATH_NO_SCALAR_CAST
+// Implicit conversion to float; yields the same result as getAsFloat().
+inline
+floatInVec::operator float() const
+{
+    const float asFloat = this->getAsFloat();
+    return asFloat;
+}
+#endif
+
+// Post increment: adds 1.0f and returns the value held beforehand.
+inline
+const floatInVec
+floatInVec::operator ++ (int)
+{
+    const floatInVec previous(mData);
+    ++(*this);
+    return previous;
+}
+
+// Post decrement: subtracts 1.0f and returns the value held beforehand.
+inline
+const floatInVec
+floatInVec::operator -- (int)
+{
+    const floatInVec previous(mData);
+    --(*this);
+    return previous;
+}
+
+// Pre increment: adds 1.0f and returns this object.
+inline
+floatInVec&
+floatInVec::operator ++ ()
+{
+    return (*this += floatInVec(1.0f));
+}
+
+// Pre decrement: subtracts 1.0f and returns this object.
+inline
+floatInVec&
+floatInVec::operator -- ()
+{
+    return (*this -= floatInVec(1.0f));
+}
+
+// Unary minus: returns a new value with the sign flipped.
+inline
+const floatInVec
+floatInVec::operator - () const
+{
+    const float negated = -mData;
+    return floatInVec(negated);
+}
+
+// Copies the value from vec and returns a reference to this object.
+inline
+floatInVec&
+floatInVec::operator = (floatInVec vec)
+{
+    this->mData = vec.mData;
+    return *this;
+}
+
+// Compound assignments: each one applies the matching binary operator to
+// (*this, vec), stores the result in *this and returns *this.
+inline
+floatInVec&
+floatInVec::operator *= (floatInVec vec)
+{
+    return *this = (*this * vec);
+}
+
+inline
+floatInVec&
+floatInVec::operator /= (floatInVec vec)
+{
+    return *this = (*this / vec);
+}
+
+inline
+floatInVec&
+floatInVec::operator += (floatInVec vec)
+{
+    return *this = (*this + vec);
+}
+
+inline
+floatInVec&
+floatInVec::operator -= (floatInVec vec)
+{
+    return *this = (*this - vec);
+}
+
+// Product of the two wrapped floats.
+inline
+const floatInVec
+operator * (floatInVec vec0, floatInVec vec1)
+{
+    const float product = vec0.getAsFloat() * vec1.getAsFloat();
+    return floatInVec(product);
+}
+
+// Quotient num / den; no check is made for a zero denominator.
+inline
+const floatInVec
+operator / (floatInVec num, floatInVec den)
+{
+    const float quotient = num.getAsFloat() / den.getAsFloat();
+    return floatInVec(quotient);
+}
+
+// Sum of the two wrapped floats.
+inline
+const floatInVec
+operator + (floatInVec vec0, floatInVec vec1)
+{
+    const float sum = vec0.getAsFloat() + vec1.getAsFloat();
+    return floatInVec(sum);
+}
+
+// Difference vec0 - vec1 of the two wrapped floats.
+inline
+const floatInVec
+operator - (floatInVec vec0, floatInVec vec1)
+{
+    const float difference = vec0.getAsFloat() - vec1.getAsFloat();
+    return floatInVec(difference);
+}
+
+// True when vec0 is strictly less than vec1.
+inline
+const boolInVec
+operator < (floatInVec vec0, floatInVec vec1)
+{
+    const bool isLess = vec0.getAsFloat() < vec1.getAsFloat();
+    return boolInVec(isLess);
+}
+
+// True when vec0 is less than or equal to vec1.
+// The floats are compared directly rather than as !(vec0 > vec1): the negated
+// form reports true when either operand is NaN, whereas a direct comparison
+// involving NaN is false.
+inline
+const boolInVec
+operator <= (floatInVec vec0, floatInVec vec1)
+{
+    return boolInVec(vec0.getAsFloat() <= vec1.getAsFloat());
+}
+
+// True when vec0 is strictly greater than vec1.
+inline
+const boolInVec
+operator > (floatInVec vec0, floatInVec vec1)
+{
+    const bool isGreater = vec0.getAsFloat() > vec1.getAsFloat();
+    return boolInVec(isGreater);
+}
+
+// True when vec0 is greater than or equal to vec1.
+// The floats are compared directly rather than as !(vec0 < vec1): the negated
+// form reports true when either operand is NaN, whereas a direct comparison
+// involving NaN is false.
+inline
+const boolInVec
+operator >= (floatInVec vec0, floatInVec vec1)
+{
+    return boolInVec(vec0.getAsFloat() >= vec1.getAsFloat());
+}
+
+// True when the two wrapped floats compare equal.
+inline
+const boolInVec
+operator == (floatInVec vec0, floatInVec vec1)
+{
+    const bool isEqual = vec0.getAsFloat() == vec1.getAsFloat();
+    return boolInVec(isEqual);
+}
+
+// True when the two wrapped floats do not compare equal.
+inline
+const boolInVec
+operator != (floatInVec vec0, floatInVec vec1)
+{
+    return boolInVec(vec0.getAsFloat() != vec1.getAsFloat());
+}
+
+// Returns vec1 when select_vec1 is true, otherwise vec0.
+inline
+const floatInVec
+select(floatInVec vec0, floatInVec vec1, boolInVec select_vec1)
+{
+    if (select_vec1.getAsBool()) {
+        return vec1;
+    }
+    return vec0;
+}
+
+} // namespace Vectormath
+
+#endif // _FLOATINVEC__SCALAR_H
--- a/Extras/PhysicsEffects/include/vecmath/neon/mat_aos.h
+++ b/Extras/PhysicsEffects/include/vecmath/neon/mat_aos.h
--- a/Extras/PhysicsEffects/include/vecmath/neon/quat_aos.h
+++ b/Extras/PhysicsEffects/include/vecmath/neon/quat_aos.h
@@ -0,0 +1,446 @@
+/*
+   Copyright (C) 2006-2010 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+   Redistribution and use in source and binary forms,
+   with or without modification, are permitted provided that the
+   following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the Sony Computer Entertainment Inc nor the names
+      of its contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _VECTORMATH_QUAT_AOS_CPP_H
+#define _VECTORMATH_QUAT_AOS_CPP_H
+
+//-----------------------------------------------------------------------------
+// Definitions
+
+#ifndef _VECTORMATH_INTERNAL_FUNCTIONS
+#define _VECTORMATH_INTERNAL_FUNCTIONS
+
+#endif
+
+namespace Vectormath {
+namespace Aos {
+
+// Copy constructor: duplicates all four components of quat.
+inline Quat::Quat( const Quat & quat )
+{
+    mX = quat.getX();
+    mY = quat.getY();
+    mZ = quat.getZ();
+    mW = quat.getW();
+}
+
+// Builds a quaternion from its four components, given in x, y, z, w order.
+inline Quat::Quat( float _x, float _y, float _z, float _w )
+{
+    setX( _x ).setY( _y ).setZ( _z ).setW( _w );
+}
+
+// Builds a quaternion whose x, y, z come from xyz and whose w is _w.
+inline Quat::Quat( const Vector3 & xyz, float _w )
+{
+    setXYZ( xyz ).setW( _w );
+}
+
+// Copies the x, y, z, w elements of a 4-D vector into the quaternion.
+inline Quat::Quat( const Vector4 & vec )
+{
+    const float x = vec.getX();
+    const float y = vec.getY();
+    const float z = vec.getZ();
+    const float w = vec.getW();
+    mX = x;
+    mY = y;
+    mZ = z;
+    mW = w;
+}
+
+// Sets all four components to the same scalar.
+inline Quat::Quat( float scalar )
+{
+    mX = mY = mZ = mW = scalar;
+}
+
+// Returns the quaternion ( 0, 0, 0, 1 ).
+inline const Quat Quat::identity( )
+{
+    const Quat unit( 0.0f, 0.0f, 0.0f, 1.0f );
+    return unit;
+}
+
+// Linear interpolation: quat0 + ( quat1 - quat0 ) * t.
+// t is not clamped and the result is not normalized.
+inline const Quat lerp( float t, const Quat & quat0, const Quat & quat1 )
+{
+    const Quat delta = quat1 - quat0;
+    return quat0 + ( delta * t );
+}
+
+// Spherical linear interpolation between two quaternions.
+// When their dot product is negative, the start quaternion is negated so the
+// cosine used below is non-negative. If that cosine is not below
+// _VECTORMATH_SLERP_TOL, the weights fall back to the linear ( 1 - t ) and t.
+inline const Quat slerp( float t, const Quat & unitQuat0, const Quat & unitQuat1 )
+{
+    float cosAngle = dot( unitQuat0, unitQuat1 );
+    Quat start( unitQuat0 );
+    if ( cosAngle < 0.0f ) {
+        cosAngle = -cosAngle;
+        start = ( -unitQuat0 );
+    }
+
+    if ( !( cosAngle < _VECTORMATH_SLERP_TOL ) ) {
+        // sinf( angle ) would be close to zero here, so avoid dividing by it
+        return ( ( start * ( 1.0f - t ) ) + ( unitQuat1 * t ) );
+    }
+
+    const float angle = acosf( cosAngle );
+    const float recipSinAngle = ( 1.0f / sinf( angle ) );
+    const float scale0 = ( sinf( ( ( 1.0f - t ) * angle ) ) * recipSinAngle );
+    const float scale1 = ( sinf( ( t * angle ) ) * recipSinAngle );
+    return ( ( start * scale0 ) + ( unitQuat1 * scale1 ) );
+}
+
+// Spherical quadrangle interpolation: slerps the outer pair (0, 3) and the
+// inner pair (1, 2) by t, then slerps between those results by 2t( 1 - t ).
+inline const Quat squad( float t, const Quat & unitQuat0, const Quat & unitQuat1, const Quat & unitQuat2, const Quat & unitQuat3 )
+{
+    const Quat outer = slerp( t, unitQuat0, unitQuat3 );
+    const Quat inner = slerp( t, unitQuat1, unitQuat2 );
+    const float blend = ( ( 2.0f * t ) * ( 1.0f - t ) );
+    return slerp( blend, outer, inner );
+}
+
+// Loads x, y, z, w from fptr[0..3] into quat.
+// fptr must point at four or more readable floats; no check is made.
+inline void loadXYZW( Quat & quat, const float * fptr )
+{
+    const Quat loaded( fptr[0], fptr[1], fptr[2], fptr[3] );
+    quat = loaded;
+}
+
+// Stores x, y, z, w of quat into fptr[0..3].
+// fptr must point at four or more writable floats; no check is made.
+inline void storeXYZW( const Quat & quat, float * fptr )
+{
+    float * out = fptr;
+    *out++ = quat.getX();
+    *out++ = quat.getY();
+    *out++ = quat.getZ();
+    *out = quat.getW();
+}
+
+// Copies all four components from quat and returns this quaternion.
+inline Quat & Quat::operator =( const Quat & quat )
+{
+    return setX( quat.mX ).setY( quat.mY ).setZ( quat.mZ ).setW( quat.mW );
+}
+
+// Replaces x, y, z with the elements of vec; w is left untouched.
+inline Quat & Quat::setXYZ( const Vector3 & vec )
+{
+    return setX( vec.getX() ).setY( vec.getY() ).setZ( vec.getZ() );
+}
+
+// Returns the x, y, z components as a 3-D vector.
+inline const Vector3 Quat::getXYZ( ) const
+{
+    const Vector3 xyz( mX, mY, mZ );
+    return xyz;
+}
+
+// Per-component accessors. Each setter stores one component and returns this
+// quaternion so calls can be chained; each getter returns the stored value.
+inline Quat & Quat::setX( float _x )
+{
+    this->mX = _x;
+    return *this;
+}
+
+inline float Quat::getX( ) const
+{
+    return this->mX;
+}
+
+inline Quat & Quat::setY( float _y )
+{
+    this->mY = _y;
+    return *this;
+}
+
+inline float Quat::getY( ) const
+{
+    return this->mY;
+}
+
+inline Quat & Quat::setZ( float _z )
+{
+    this->mZ = _z;
+    return *this;
+}
+
+inline float Quat::getZ( ) const
+{
+    return this->mZ;
+}
+
+inline Quat & Quat::setW( float _w )
+{
+    this->mW = _w;
+    return *this;
+}
+
+inline float Quat::getW( ) const
+{
+    return this->mW;
+}
+
+// Indexed component access. The element is addressed as an offset of idx
+// floats from mX; idx is not range-checked.
+// NOTE(review): presumably idx 0..3 maps to x, y, z, w; this relies on the
+// member layout in the class declaration -- confirm there.
+inline Quat & Quat::setElem( int idx, float value )
+{
+    ( &mX )[idx] = value;
+    return *this;
+}
+
+inline float Quat::getElem( int idx ) const
+{
+    return ( &mX )[idx];
+}
+
+inline float & Quat::operator []( int idx )
+{
+    return ( &mX )[idx];
+}
+
+inline float Quat::operator []( int idx ) const
+{
+    return ( &mX )[idx];
+}
+
+// Component-wise sum of two quaternions.
+inline const Quat Quat::operator +( const Quat & quat ) const
+{
+    const float x = ( mX + quat.mX );
+    const float y = ( mY + quat.mY );
+    const float z = ( mZ + quat.mZ );
+    const float w = ( mW + quat.mW );
+    return Quat( x, y, z, w );
+}
+
+// Component-wise difference of two quaternions.
+inline const Quat Quat::operator -( const Quat & quat ) const
+{
+    const float x = ( mX - quat.mX );
+    const float y = ( mY - quat.mY );
+    const float z = ( mZ - quat.mZ );
+    const float w = ( mW - quat.mW );
+    return Quat( x, y, z, w );
+}
+
+// Multiplies every component by scalar.
+inline const Quat Quat::operator *( float scalar ) const
+{
+    const float x = ( mX * scalar );
+    const float y = ( mY * scalar );
+    const float z = ( mZ * scalar );
+    const float w = ( mW * scalar );
+    return Quat( x, y, z, w );
+}
+
+// Compound assignments: each one applies the matching binary operator,
+// stores the result in *this and returns *this.
+inline Quat & Quat::operator +=( const Quat & quat )
+{
+    return ( *this = *this + quat );
+}
+
+inline Quat & Quat::operator -=( const Quat & quat )
+{
+    return ( *this = *this - quat );
+}
+
+inline Quat & Quat::operator *=( float scalar )
+{
+    return ( *this = *this * scalar );
+}
+
+// Divides every component by scalar; no check is made for a zero divisor.
+inline const Quat Quat::operator /( float scalar ) const
+{
+    const float x = ( mX / scalar );
+    const float y = ( mY / scalar );
+    const float z = ( mZ / scalar );
+    const float w = ( mW / scalar );
+    return Quat( x, y, z, w );
+}
+
+// Divides this quaternion by scalar in place and returns it.
+inline Quat & Quat::operator /=( float scalar )
+{
+    return ( *this = *this / scalar );
+}
+
+// Unary minus: negates all four components.
+inline const Quat Quat::operator -( ) const
+{
+    return Quat( -mX, -mY, -mZ, -mW );
+}
+
+// scalar * quat; forwards to Quat::operator *( float ).
+inline const Quat operator *( float scalar, const Quat & quat )
+{
+    const Quat scaled = quat * scalar;
+    return scaled;
+}
+
+// Four-component dot product, accumulated in x, y, z, w order.
+inline float dot( const Quat & quat0, const Quat & quat1 )
+{
+    float sum = ( quat0.getX() * quat1.getX() );
+    sum += ( quat0.getY() * quat1.getY() );
+    sum += ( quat0.getZ() * quat1.getZ() );
+    sum += ( quat0.getW() * quat1.getW() );
+    return sum;
+}
+
+// Returns the sum of the squared components, i.e. the squared length;
+// no square root is taken here.
+inline float norm( const Quat & quat )
+{
+    return dot( quat, quat );
+}
+
+// Returns the length of the quaternion: the square root of norm( quat ).
+inline float length( const Quat & quat )
+{
+    const float lenSqr = norm( quat );
+    return ::sqrtf( lenSqr );
+}
+
+// Scales quat by the reciprocal of its length.
+// A quaternion whose norm is zero makes the reciprocal a division by zero;
+// no guard is applied here.
+inline const Quat normalize( const Quat & quat )
+{
+    const float lenSqr = norm( quat );
+    const float lenInv = ( 1.0f / sqrtf( lenSqr ) );
+    return quat * lenInv;
+}
+
+// Builds the quaternion from two unit vectors: xyz = cross( v0, v1 ) / c and
+// w = c / 2, where c = sqrt( 2 * ( 1 + dot( v0, v1 ) ) ).
+// When the inputs point in opposite directions ( dot == -1 ), c is zero and
+// the reciprocal below is a division by zero; no guard is applied here.
+inline const Quat Quat::rotation( const Vector3 & unitVec0, const Vector3 & unitVec1 )
+{
+    const float cosHalfAngleX2 = sqrtf( ( 2.0f * ( 1.0f + dot( unitVec0, unitVec1 ) ) ) );
+    const float recipCosHalfAngleX2 = ( 1.0f / cosHalfAngleX2 );
+    const Vector3 axisPart = ( cross( unitVec0, unitVec1 ) * recipCosHalfAngleX2 );
+    return Quat( axisPart, ( cosHalfAngleX2 * 0.5f ) );
+}
+
+// Builds the quaternion ( unitVec * sin( radians / 2 ), cos( radians / 2 ) ).
+inline const Quat Quat::rotation( float radians, const Vector3 & unitVec )
+{
+    const float halfAngle = ( radians * 0.5f );
+    const float sinHalf = sinf( halfAngle );
+    const float cosHalf = cosf( halfAngle );
+    return Quat( ( unitVec * sinHalf ), cosHalf );
+}
+
+// Axis rotations: sin( radians / 2 ) goes in the slot of the named axis,
+// cos( radians / 2 ) goes in w, and the other two components are zero.
+inline const Quat Quat::rotationX( float radians )
+{
+    const float halfAngle = ( radians * 0.5f );
+    const float sinHalf = sinf( halfAngle );
+    const float cosHalf = cosf( halfAngle );
+    return Quat( sinHalf, 0.0f, 0.0f, cosHalf );
+}
+
+inline const Quat Quat::rotationY( float radians )
+{
+    const float halfAngle = ( radians * 0.5f );
+    const float sinHalf = sinf( halfAngle );
+    const float cosHalf = cosf( halfAngle );
+    return Quat( 0.0f, sinHalf, 0.0f, cosHalf );
+}
+
+inline const Quat Quat::rotationZ( float radians )
+{
+    const float halfAngle = ( radians * 0.5f );
+    const float sinHalf = sinf( halfAngle );
+    const float cosHalf = cosf( halfAngle );
+    return Quat( 0.0f, 0.0f, sinHalf, cosHalf );
+}
+
+// Quaternion product ( *this ) * quat. The result is not commutative:
+//   xyz = mW * quat.xyz + quat.mW * xyz + cross( xyz, quat.xyz )
+//   w   = mW * quat.mW - dot( xyz, quat.xyz )
+inline const Quat Quat::operator *( const Quat & quat ) const
+{
+    return Quat(
+        ( ( ( ( mW * quat.mX ) + ( mX * quat.mW ) ) + ( mY * quat.mZ ) ) - ( mZ * quat.mY ) ),
+        ( ( ( ( mW * quat.mY ) + ( mY * quat.mW ) ) + ( mZ * quat.mX ) ) - ( mX * quat.mZ ) ),
+        ( ( ( ( mW * quat.mZ ) + ( mZ * quat.mW ) ) + ( mX * quat.mY ) ) - ( mY * quat.mX ) ),
+        ( ( ( ( mW * quat.mW ) - ( mX * quat.mX ) ) - ( mY * quat.mY ) ) - ( mZ * quat.mZ ) )
+    );
+}
+
+// Replaces this quaternion with ( *this ) * quat and returns it.
+inline Quat & Quat::operator *=( const Quat & quat )
+{
+    return ( *this = *this * quat );
+}
+
+// Applies quat to vec in two steps, without normalizing quat.
+// Step 1 builds an intermediate 4-tuple:
+//   tmpXYZ = quat.w * vec + cross( quat.xyz, vec )
+//   tmpW   = dot( quat.xyz, vec )
+// Step 2 combines it with quat again:
+//   result = tmpW * quat.xyz + quat.w * tmpXYZ + cross( quat.xyz, tmpXYZ )
+inline const Vector3 rotate( const Quat & quat, const Vector3 & vec )
+{
+    float tmpX, tmpY, tmpZ, tmpW;
+    tmpX = ( ( ( quat.getW() * vec.getX() ) + ( quat.getY() * vec.getZ() ) ) - ( quat.getZ() * vec.getY() ) );
+    tmpY = ( ( ( quat.getW() * vec.getY() ) + ( quat.getZ() * vec.getX() ) ) - ( quat.getX() * vec.getZ() ) );
+    tmpZ = ( ( ( quat.getW() * vec.getZ() ) + ( quat.getX() * vec.getY() ) ) - ( quat.getY() * vec.getX() ) );
+    tmpW = ( ( ( quat.getX() * vec.getX() ) + ( quat.getY() * vec.getY() ) ) + ( quat.getZ() * vec.getZ() ) );
+    return Vector3(
+        ( ( ( ( tmpW * quat.getX() ) + ( tmpX * quat.getW() ) ) - ( tmpY * quat.getZ() ) ) + ( tmpZ * quat.getY() ) ),
+        ( ( ( ( tmpW * quat.getY() ) + ( tmpY * quat.getW() ) ) - ( tmpZ * quat.getX() ) ) + ( tmpX * quat.getZ() ) ),
+        ( ( ( ( tmpW * quat.getZ() ) + ( tmpZ * quat.getW() ) ) - ( tmpX * quat.getY() ) ) + ( tmpY * quat.getX() ) )
+    );
+}
+
+// Conjugate: negates x, y, z and keeps w.
+inline const Quat conj( const Quat & quat )
+{
+    const float negX = -quat.getX();
+    const float negY = -quat.getY();
+    const float negZ = -quat.getZ();
+    return Quat( negX, negY, negZ, quat.getW() );
+}
+
+// Returns a copy of quat1 when select1 is true, otherwise a copy of quat0.
+inline const Quat select( const Quat & quat0, const Quat & quat1, bool select1 )
+{
+    const Quat & chosen = select1 ? quat1 : quat0;
+    return Quat( chosen.getX(), chosen.getY(), chosen.getZ(), chosen.getW() );
+}
+
+#ifdef _VECTORMATH_DEBUG
+
+// Debug helper: prints the four components on one line via printf.
+inline void print( const Quat & quat )
+{
+    const float x = quat.getX();
+    const float y = quat.getY();
+    const float z = quat.getZ();
+    const float w = quat.getW();
+    printf( "( %f %f %f %f )\n", x, y, z, w );
+}
+
+// Debug helper: same output as above, prefixed with "name: ".
+inline void print( const Quat & quat, const char * name )
+{
+    const float x = quat.getX();
+    const float y = quat.getY();
+    const float z = quat.getZ();
+    const float w = quat.getW();
+    printf( "%s: ( %f %f %f %f )\n", name, x, y, z, w );
+}
+
+#endif
+
+} // namespace Aos
+} // namespace Vectormath
+
+#endif
--- a/Extras/PhysicsEffects/include/vecmath/neon/vec_aos.h
+++ b/Extras/PhysicsEffects/include/vecmath/neon/vec_aos.h
--- a/Extras/PhysicsEffects/include/vecmath/neon/vectormath_aos.h
+++ b/Extras/PhysicsEffects/include/vecmath/neon/vectormath_aos.h
--- a/Extras/PhysicsEffects/include/vecmath/neon/vectormath_neon_assembly_implementations.S
+++ b/Extras/PhysicsEffects/include/vecmath/neon/vectormath_neon_assembly_implementations.S
@@ -0,0 +1,310 @@
+@
+@ Applied Research Associates Inc. (c)2011
+@
+@ Redistribution and use in source and binary forms,
+@   with or without modification, are permitted provided that the
+@   following conditions are met:
+@    * Redistributions of source code must retain the above copyright
+@      notice, this list of conditions and the following disclaimer.
+@    * Redistributions in binary form must reproduce the above copyright
+@      notice, this list of conditions and the following disclaimer in the
+@      documentation and/or other materials provided with the distribution.
+@    * Neither the name of the Applied Research Associates Inc nor the names
+@      of its contributors may be used to endorse or promote products derived
+@      from this software without specific prior written permission.
+@
+@   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+@   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+@   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+@   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+@   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+@   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+@   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+@   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+@   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+@   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+@   POSSIBILITY OF SUCH DAMAGE.
+@
+@----------------------------------------------------------------
+@
+@ This file contains ARM/NEON assembly versions of some vectormath
+@ atomic functions. We have implemented here instead of inline assembly
+@ because we have found gcc 4.4.3 to be too inconsistent and inadequate
+@ to properly support either NEON intrinsics or inline assembly. (See
+@ the inline assembly version of the vector3 dot product, which is
+@ contained in vec_aos.h but compiled out, for an example of a simple
+@ inline assembly function that wreaks havoc if used.)
+@
+@ Note that certain NEON registers must be preserved across
+@ function calls according to the following document:
+@
+@ "Procedure Call Standard for the ARM Architecture," ARM document
+@ number ARM IHI 0042D, current through ABI release 2.08,
+@ 16th October, 2009, section 5.1.2.1
+@
+@ The registers are: q4-q7 (and their double-word and single word
+@ counterparts)
+@
+@ These functions preserve all non-scratch general purpose registers
+@ as well as the ones listed, and so we do not need to have extra
+@ code to do the register preservation
+@
+	.syntax unified
+	.arch armv7-a
+	.fpu neon
+	.thumb
+	.text
+	.align 2
+
+@----------------------------------------------------------------
+@ pfxVector3DotProductNEON
+@
+@ Vector3 dot product, result stored directly to memory
+@
+@ Result stored in memory pointed to by r2. r2 must point to
+@ memory sufficient to store two 32 bit floats, though only the
+@ first value is of interest
+@----------------------------------------------------------------
+	.global	pfxVector3DotProductNEON
+	.thumb_func
+pfxVector3DotProductNEON:
+	.fnstart
+	@ r0 = address of first vector, r1 = address of second vector,
+	@ r2 = address of the result. Only d0-d4 are written.
+	vld1.32 {d0,d1}, [r1]		@ load <x2,y2,z2,?> into d0,d1
+	vld1.32 {d2,d3}, [r0]		@ load <x1,y1,z1,?> into d2,d3
+	vmul.f32 d4, d0, d2			@ d4 = <x1*x2, y1*y2>
+	vpadd.f32 d4, d4, d4		@ d4 = <x1*x2 + y1*y2, x1*x2 + y1*y2>
+	vmla.f32 s8, s2, s6			@ s8 (low lane of d4) += z2*z1, giving x1*x2 + y1*y2 + z1*z2
+	vst1.32 {d4}, [r2]			@ store both lanes of d4 (two floats); only the first lane holds the dot product
+	bx	lr
+	.fnend
+
+@----------------------------------------------------------------
+@ pfxVector4DotProductNEON
+@
+@ Vector4 dot product, result stored directly to memory
+@
+@ Result stored in memory pointed to by r2. r2 must point to
+@ memory sufficient to store two 32 bit floats, though only the
+@ first value is of interest
+@----------------------------------------------------------------
+	.global	pfxVector4DotProductNEON
+	.thumb_func
+pfxVector4DotProductNEON:
+	.fnstart
+	@ r0 = address of first vector, r1 = address of second vector,
+	@ r2 = address of the result (two floats are written).
+	@
+	@ The accumulator is kept in d0, a scratch register. It must not live in
+	@ d8-d15 (q4-q7): the procedure call standard requires those registers to
+	@ be preserved across calls, and this function saves and restores nothing.
+	vld1.32	{d16,d17}, [r0]		@ input <x1,y1,z1,w1>
+	vld1.32	{d18,d19}, [r1]		@ input <x2,y2,z2,w2>
+	vmul.f32 d0, d16, d18		@ d0=<x1*x2,y1*y2>
+	@ non-fused multiply accumulate
+	vmla.f32 d0, d17, d19		@ d0=d0+<z1*z2,w1*w2>=<x1*x2+z1*z2,y1*y2+w1*w2>
+	@ fused multiply accumulate - listed here for reference but we use vmla above
+	@ instead since the fused version is not recognized by GNU assembler (as part
+	@ of the gcc 4.4.3 Android distribution)
+	@	vfma.f32 d0, d17, d19	@ d0=d0+<z1*z2,w1*w2>=<x1*x2+z1*z2,y1*y2+w1*w2>
+	vpadd.f32 d0, d0, d0		@ add the two halves of d0 together, same result in each lane
+	vst1.32	{d0}, [r2]			@ store both lanes (two floats); each holds the dot product
+	bx	lr
+	.fnend
+
+@----------------------------------------------------------------
+@ pfxVector3CrossProductNEON
+@
+@ Vector3 cross product, result stored directly to memory
+@
+@ Result stored in memory pointed to by r2. r2 must point to
+@ memory sufficient to store four 32 bit floats, though only the
+@ first 3 values are of interest
+@----------------------------------------------------------------
+	.global	pfxVector3CrossProductNEON
+	.thumb_func
+pfxVector3CrossProductNEON:
+	.fnstart
+	@ r0 = address of first vector, r1 = address of second vector,
+	@ r2 = address of the result (four floats are written).
+	@ Only q8, q9 and q10 are written. The lane shuffles below are done with
+	@ vtrn/vrev64 on the d halves so that one quad multiply and one quad
+	@ multiply-subtract produce all three components at once.
+	vld1.32	{d18,d19}, [r1]	@ input <x2,y2,z2,w2> = d18,d19
+	vld1.32	{d16,d17}, [r0]	@ input <x1,y1,z1,w1> = d16,d17
+	@ rearrange inputs into convenient order
+	vtrn.32 d18,d19			@  q9 = <x2,z2,y2,w2> = d18,d19
+	vrev64.32 d16,d16		@  q8 = <y1,x1,z1,w1> = d16,d17
+	vrev64.32 d18,d18		@  q9 = <z2,x2,y2,w2> = d18,d19
+	vtrn.32 d16,d17			@  q8 = <y1,z1,x1,w1> = d16,d17
+	@ perform first half of cross product using rearranged inputs
+	vmul.f32 q10, q8, q9	@ q10 = <y1*z2,z1*x2,x1*y2,w1*w2>
+	@ rearrange inputs again
+	vtrn.32 d18,d19			@  q9 = <z2,y2,x2,w2> = d18,d19
+	vrev64.32 d16,d16		@  q8 = <z1,y1,x1,w1> = d16,d17
+	vrev64.32 d18,d18		@  q9 = <y2,z2,x2,w2> = d18,d19
+	vtrn.32 d16,d17			@  q8 = <z1,x1,y1,w1> = d16,d17
+	@ perform last half of cross product using rearranged inputs
+	@ (the fourth lane is w1*w2 - w1*w2, computed from the inputs' padding lanes)
+	vmls.f32 q10, q8, q9	@ q10 = <y1*z2-y2*z1,z1*x2-z2*x1,x1*y2-x2*y1,w1*w2-w2*w1>
+	@ and store the result
+	vst1.32	{q10}, [r2]
+	bx	lr
+	.fnend
+
+@----------------------------------------------------------------
+@ pfxMatrix3Matrix3ProductNEON
+@
+@ Matrix3 * Matrix3 product, result stored directly to memory
+@
+@ Result stored in memory pointed to by r2. r2 must point to
+@ memory sufficient to store 12 32-bit floats. The reason for
+@ 12 rather than 9 is that each column vector actually has
+@ 4 32-bit floats rather than just 3....since NEON works with
+@ double and quad vectors.
+@
+@ Note that, since the inputs are loaded into registers then
+@ never used again, r2 can point to one of the inputs, e.g.,
+@ result can be stored back out to one of the input memory
+@ locations.
+@----------------------------------------------------------------
+	.global	pfxMatrix3Matrix3ProductNEON
+	.thumb_func
+pfxMatrix3Matrix3ProductNEON:
+	.fnstart
+	@ r0 = address of matrix 0, r1 = address of matrix 1, r2 = address of result.
+	@ Each matrix is read as three columns of four floats (12 floats in total).
+	@ Matrix 0 columns end up in q8-q10, matrix 1 columns in q0-q2, and the
+	@ result columns are accumulated in q12-q14. r0, r1 and r2 are advanced
+	@ by the post-increment addressing.
+	vld1.32 {d16-d19}, [r0]!	@ load first eight elements of matrix 0 (columns 0 and 1)
+	vld1.32 {d20-d21}, [r0]		@ load last four elements of matrix 0 (column 2)
+	vld1.32 {d0-d3}, [r1]!		@ load first eight elements of matrix 1 (columns 0 and 1)
+	vld1.32 {d4-d5}, [r1]		@ load last four elements of matrix 1 (column 2)
+	vmul.f32 q12, q8, d0[0]		@ rslt col0  = (mat0 col0) * (mat1 col0 elt0)
+	vmul.f32 q13, q8, d2[0]		@ rslt col1  = (mat0 col0) * (mat1 col1 elt0)
+	vmul.f32 q14, q8, d4[0]		@ rslt col2  = (mat0 col0) * (mat1 col2 elt0)
+	vmla.f32 q12, q9, d0[1]		@ rslt col0 += (mat0 col1) * (mat1 col0 elt1)
+	vmla.f32 q13, q9, d2[1]		@ rslt col1 += (mat0 col1) * (mat1 col1 elt1)
+	vmla.f32 q14, q9, d4[1]		@ rslt col2 += (mat0 col1) * (mat1 col2 elt1)
+	vmla.f32 q12, q10, d1[0]	@ rslt col0 += (mat0 col2) * (mat1 col0 elt2)
+	vmla.f32 q13, q10, d3[0]	@ rslt col1 += (mat0 col2) * (mat1 col1 elt2)
+	vmla.f32 q14, q10, d5[0]	@ rslt col2 += (mat0 col2) * (mat1 col2 elt2)
+	vst1.32 {d24-d27}, [r2]!	@ store first eight elements of result (each column has an extra float as stored)
+	vst1.32 {d28-d29}, [r2]		@ store last four elements of result (each column has an extra float as stored)
+	bx	lr
+	.fnend
+
+@----------------------------------------------------------------
+@ pfxMatrix4Matrix4ProductNEON
+@
+@ Matrix4 * Matrix4 product, result stored directly to memory
+@
+@ Result stored in memory pointed to by r2. r2 must point to
+@ memory sufficient to store 16 32 bit floats.
+@
+@ Note that, since the inputs are loaded into registers then
+@ never used again, r2 can point to one of the inputs, e.g.,
+@ result can be stored back out to one of the input memory
+@ locations.
+@----------------------------------------------------------------
+	.global	pfxMatrix4Matrix4ProductNEON
+	.thumb_func
+pfxMatrix4Matrix4ProductNEON:
+	.fnstart
+	@ r0 = address of matrix 0, r1 = address of matrix 1, r2 = address of result.
+	@ Matrix 0 columns end up in q8-q11, matrix 1 columns in q0-q3, and the
+	@ result columns are accumulated in q12-q15. All loads complete before the
+	@ first store. r0, r1 and r2 are advanced by the post-increment addressing.
+	vld1.32 {d16-d19}, [r0]!	@ load first eight elements of matrix 0
+	vld1.32 {d20-d23}, [r0]		@ load second eight elements of matrix 0
+	vld1.32 {d0-d3}, [r1]!		@ load first eight elements of matrix 1
+	vld1.32 {d4-d7}, [r1]		@ load second eight elements of matrix 1
+	vmul.f32 q12, q8, d0[0]		@ rslt col0  = (mat0 col0) * (mat1 col0 elt0)
+	vmul.f32 q13, q8, d2[0]		@ rslt col1  = (mat0 col0) * (mat1 col1 elt0)
+	vmul.f32 q14, q8, d4[0]		@ rslt col2  = (mat0 col0) * (mat1 col2 elt0)
+	vmul.f32 q15, q8, d6[0]		@ rslt col3  = (mat0 col0) * (mat1 col3 elt0)
+	vmla.f32 q12, q9, d0[1]		@ rslt col0 += (mat0 col1) * (mat1 col0 elt1)
+	vmla.f32 q13, q9, d2[1]		@ rslt col1 += (mat0 col1) * (mat1 col1 elt1)
+	vmla.f32 q14, q9, d4[1]		@ rslt col2 += (mat0 col1) * (mat1 col2 elt1)
+	vmla.f32 q15, q9, d6[1]		@ rslt col3 += (mat0 col1) * (mat1 col3 elt1)
+	vmla.f32 q12, q10, d1[0]	@ rslt col0 += (mat0 col2) * (mat1 col0 elt2)
+	vmla.f32 q13, q10, d3[0]	@ rslt col1 += (mat0 col2) * (mat1 col1 elt2)
+	vmla.f32 q14, q10, d5[0]	@ rslt col2 += (mat0 col2) * (mat1 col2 elt2)
+	vmla.f32 q15, q10, d7[0]	@ rslt col3 += (mat0 col2) * (mat1 col3 elt2)
+	vmla.f32 q12, q11, d1[1]	@ rslt col0 += (mat0 col3) * (mat1 col0 elt3)
+	vmla.f32 q13, q11, d3[1]	@ rslt col1 += (mat0 col3) * (mat1 col1 elt3)
+	vmla.f32 q14, q11, d5[1]	@ rslt col2 += (mat0 col3) * (mat1 col2 elt3)
+	vmla.f32 q15, q11, d7[1]	@ rslt col3 += (mat0 col3) * (mat1 col3 elt3)
+	vst1.32 {d24-d27}, [r2]!	@ store first eight elements of result
+	vst1.32 {d28-d31}, [r2]		@ store second eight elements of result
+	bx	lr
+	.fnend
+
+@----------------------------------------------------------------
+@ pfxTransform3OrthoInverseNEON
+@
+@ Computes the ortho inverse of a Transform 3.
+@
+@ Result stored in memory pointed to by r1. r1 must point to
+@ memory sufficient to store 16 32 bit floats.
+@
+@ Note that r1 can point to the input itself (in-place operation).
+@ Loads and stores are interleaved in this routine, but each store
+@ only overwrites input elements that have already been loaded.
+@ Partially overlapping input and output are not supported.
+@
+@ Note that this expects the inputs to have 4 floats per row,
+@ (to be consistent with NEON quad vector), and the last float
+@ in each row should be 0.0 for the math to work out.
+@----------------------------------------------------------------
+	.global	pfxTransform3OrthoInverseNEON
+	.thumb_func
+pfxTransform3OrthoInverseNEON:
+	.fnstart
+	@ r0 = address of the input (16 floats, read as four columns of four),
+	@ r1 = address of the result. Both are advanced by post-increment.
+	@ The first three result columns are built in q0, q1 and q2 by loading the
+	@ input one float at a time into the transposed lane position.
+
+	@ input column 0: elements x,y,z go to lane 0 of result columns 0,1,2;
+	@ its fourth (padding) float goes to lane 3 of result column 0
+	vld1.f32 d0[0], [r0]!
+	vld1.f32 d2[0], [r0]!
+	vld1.f32 d4[0], [r0]!
+	vld1.f32 d1[1], [r0]!
+
+	@ input column 1: elements x,y,z go to lane 1 of result columns 0,1,2;
+	@ its fourth (padding) float goes to lane 3 of result column 1
+	vld1.f32 d0[1], [r0]!
+	vld1.f32 d2[1], [r0]!
+	vld1.f32 d4[1], [r0]!
+	vld1.f32 d3[1], [r0]!
+
+	@ input column 2: elements x,y go to lane 2 of result columns 0,1,
+	@ which completes q0 and q1
+	vld1.f32 d1[0], [r0]!
+	vld1.f32 d3[0], [r0]!
+
+	vst1.f32 {d0-d3}, [r1]!		@ store first eight elements of result (1st two columns)
+
+	@ rest of input column 2: z goes to lane 2 of result column 2, and the
+	@ fourth (padding) float goes to lane 3 of result column 2
+	vld1.f32 d5[0], [r0]!
+	vld1.f32 d5[1], [r0]!
+
+	vst1.f32 {d4-d5}, [r1]!		@ store next four elements of result (3rd column)
+
+	@ move q0 into q8 so we can reuse q0 to load fourth column
+	@ of input. We do this to avoid using q4-q7 (which have to
+	@ be preserved during the function call)....needed to work
+	@ around some limitation rules that prevent us from accessing
+	@ single s registers associated with q8 and above.
+	vmov.f32 q8, q0
+
+	@ direct load the last column of the input
+	vld1.f32 {q0}, [r0]
+
+	@ prepare the last column of the output: the negated sum of the three
+	@ result columns, each scaled by one coordinate of input column 3.
+	@ Lane 3 of every result column holds an input padding float, so lane 3
+	@ of this sum is only zero when those padding floats are zero.
+	vmul.f32 q3, q8, d0[0]		@ multiply result column 1 by x coord of input column 3
+	vmla.f32 q3, q1, d0[1]		@ multiply result column 2 by y coord of input column 3 and add
+	vmla.f32 q3, q2, d1[0]		@ multiply result column 3 by z coord of input column 3 and add
+	vneg.f32 q3, q3				@ negate final column
+
+	vst1.f32 {q3}, [r1]			@ store last four elements of result (4th column)
+
+	bx	lr
+	.fnend
+
+@----------------------------------------------------------------
+@ pfxTransform3Vector3MultiplyNEON
+@
+@ Computes the product of a Transform3 and a Vector3, e.g., it
+@ applies the transform to the vector.
+@
+@ Result stored in memory pointed to by r2. r2 must point to
+@ memory sufficient to store 4 32-bit floats.
+@----------------------------------------------------------------
+	.global	pfxTransform3Vector3MultiplyNEON
+	.thumb_func
+pfxTransform3Vector3MultiplyNEON:
+	.fnstart
+
+	@ r0 = address of the transform, r1 = address of the vector,
+	@ r2 = address of the result (four floats are written).
+	@ Only the first three columns (12 floats) of the transform are read; the
+	@ fourth column is never loaded, so it does not contribute to the result.
+	vld1.32 {d16-d19}, [r0]!	@ load first eight elements of transform matrix (columns 0 and 1)
+	vld1.32 {d20-d21}, [r0]		@ load next four elements of transform matrix (column 2)
+	vld1.32 {d0-d1}, [r1]		@ load the four elements of vector3 (last one is just padding)
+	vmul.f32 q12, q8, d0[0]		@ rslt  = (transform col0) * (vector x)
+	vmla.f32 q12, q9, d0[1]		@ rslt += (transform col1) * (vector y)
+	vmla.f32 q12, q10, d1[0]	@ rslt += (transform col2) * (vector z)
+	vst1.32 {d24-d25}, [r2]		@ store four elements of result (last one is padding)
+
+	bx	lr
+	.fnend
--- a/Extras/PhysicsEffects/include/vecmath/neon/vectormath_neon_assembly_prototypes.h
+++ b/Extras/PhysicsEffects/include/vecmath/neon/vectormath_neon_assembly_prototypes.h
@@ -0,0 +1,57 @@
+/*
+ Applied Research Associates Inc. (c)2011
+
+ Redistribution and use in source and binary forms,
+   with or without modification, are permitted provided that the
+   following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the Applied Research Associates Inc nor the names
+      of its contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _VECTORMATH_NEON_ASSEMBLY_PROTOTYPES_H
+#define _VECTORMATH_NEON_ASSEMBLY_PROTOTYPES_H
+
+// Prototypes for NEON assembly implementations
+// The functions below are implemented in
+// vectormath_neon_assembly_implementations.S. They are declared with C linkage
+// so the names match the unmangled .global symbols defined there. The assembly
+// treats the first arguments as input addresses (r0, r1) and writes its output
+// through the last argument (r2, or r1 for the ortho inverse).
+//
+// This header includes nothing itself: the Vectormath::Aos types must already
+// be declared by the including file.
+extern "C"
+{
+	// NEON assembly implementations of Vector3 functions
+	// The dot product stores TWO floats through result (only the first is the
+	// dot product), so result must point to storage for at least two floats.
+	void pfxVector3DotProductNEON(const Vectormath::Aos::Vector3 &vec0, const Vectormath::Aos::Vector3 &vec1, float *result);
+	// The cross product stores four floats through result.
+	void pfxVector3CrossProductNEON(const Vectormath::Aos::Vector3 &vec0, const Vectormath::Aos::Vector3 &vec1,
+									Vectormath::Aos::Vector3 &result);
+
+	// NEON assembly implementations of Matrix3 functions
+	// Reads and writes 12 floats per matrix (three columns of four floats).
+	void pfxMatrix3Matrix3ProductNEON(const Vectormath::Aos::Matrix3 &mat0, const Vectormath::Aos::Matrix3 &mat1,
+										Vectormath::Aos::Matrix3 &result);
+
+	// NEON assembly implementations of Transform3 functions
+	// The ortho inverse reads and writes 16 floats.
+	void pfxTransform3OrthoInverseNEON(const Vectormath::Aos::Transform3 &trn, Vectormath::Aos::Transform3 &result);
+	// Reads only the first 12 floats of trn and stores four floats through result.
+	void pfxTransform3Vector3MultiplyNEON(const Vectormath::Aos::Transform3 &trn, const Vectormath::Aos::Vector3 &vec,
+											Vectormath::Aos::Vector3 &result);
+
+	// NEON assembly implementations of Vector4 functions
+	// Stores TWO floats through result (both hold the dot product), so result
+	// must point to storage for at least two floats.
+	void pfxVector4DotProductNEON(const Vectormath::Aos::Vector4 &vec0, const Vectormath::Aos::Vector4 &vec1, float *result);
+
+	// NEON assembly implementations of Matrix4 functions
+	// Reads and writes 16 floats per matrix.
+	void pfxMatrix4Matrix4ProductNEON(const Vectormath::Aos::Matrix4 &mat0, const Vectormath::Aos::Matrix4 &mat1,
+										Vectormath::Aos::Matrix4 &result);
+}
+
+#endif // _VECTORMATH_NEON_ASSEMBLY_PROTOTYPES_H