Merge pull request #170 from erwincoumans/master

sync repos
2014-05-07 09:00:24 -07:00
parent a5d050bc5e f98f24b697
commit 6c01c83986
172 changed files with 42949 additions and 0 deletions
--- a/test/Bullet2/Info.plist
+++ b/test/Bullet2/Info.plist
@@ -0,0 +1,47 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>CFBundleDevelopmentRegion</key>
+	<string>en</string>
+	<key>CFBundleDisplayName</key>
+	<string>${PRODUCT_NAME}</string>
+	<key>CFBundleExecutable</key>
+	<string>${EXECUTABLE_NAME}</string>
+	<key>CFBundleIconFiles</key>
+	<array/>
+	<key>CFBundleIdentifier</key>
+	<string>Apple.${PRODUCT_NAME:rfc1034identifier}</string>
+	<key>CFBundleInfoDictionaryVersion</key>
+	<string>6.0</string>
+	<key>CFBundleName</key>
+	<string>${PRODUCT_NAME}</string>
+	<key>CFBundlePackageType</key>
+	<string>APPL</string>
+	<key>CFBundleShortVersionString</key>
+	<string>1.0</string>
+	<key>CFBundleSignature</key>
+	<string>????</string>
+	<key>CFBundleVersion</key>
+	<string>1.0</string>
+	<key>LSRequiresIPhoneOS</key>
+	<true/>
+	<key>UIRequiredDeviceCapabilities</key>
+	<array>
+		<string>armv7</string>
+	</array>
+	<key>UISupportedInterfaceOrientations</key>
+	<array>
+		<string>UIInterfaceOrientationPortrait</string>
+		<string>UIInterfaceOrientationLandscapeLeft</string>
+		<string>UIInterfaceOrientationLandscapeRight</string>
+	</array>
+	<key>UISupportedInterfaceOrientations~ipad</key>
+	<array>
+		<string>UIInterfaceOrientationPortrait</string>
+		<string>UIInterfaceOrientationPortraitUpsideDown</string>
+		<string>UIInterfaceOrientationLandscapeLeft</string>
+		<string>UIInterfaceOrientationLandscapeRight</string>
+	</array>
+</dict>
+</plist>
--- a/test/Bullet2/README.txt
+++ b/test/Bullet2/README.txt
@@ -0,0 +1,28 @@
+1) Add a .cpp and .h file for your test function.  The header should declare the function like this:
+
+    #ifdef __cplusplus
+        extern "C" {
+    #endif
+
+        #include "Utils.h"
+        #include "main.h"
+        #include "vector.h"
+
+        // Your test function
+        int MyTestFunc(void);
+
+    #ifdef __cplusplus
+        }
+    #endif
+
+    The rest of the program neither knows nor cares what MyTestFunc does, except that it must return non-zero on failure. There are some handy functions in Utils.h that you might want to use. Please use vlog instead of printf to print output, and random_number32/64() in place of rand(), so the tests can be multithreaded later if it comes to that.  There are some read-only globals that you may wish to respond to, declared in Utils.h:
+
+        gReportAverageTimes	if you do timing, report times as averages instead of best times if non-zero
+        gExitOnError		if non-zero, return non-zero immediately if you encounter an error
+        gAppName			(const char*) the name of the application
+
+    As a convenience, vector.h declares some cross-platform vector types and includes the appropriate vector headers according to the compiler flags.
+
+
+2) Add an entry to gTestList in TestList.cpp for your test function, so the rest of the app knows to call it
+
--- a/test/Bullet2/Source/TestList.cpp
+++ b/test/Bullet2/Source/TestList.cpp
@@ -0,0 +1,97 @@
+//
+//  TestList.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#include <stdlib.h>
+#include "TestList.h"
+
+#include "Test_qtmul.h"
+#include "Test_qtmulQV3.h"
+#include "Test_qtmulV3Q.h"
+#include "Test_qtdot.h"
+#include "Test_qtnorm.h"
+
+#include "Test_v3dot.h"
+#include "Test_v3sdiv.h"
+#include "Test_v3norm.h"
+#include "Test_v3cross.h"
+#include "Test_v3triple.h"
+#include "Test_v3interp.h"
+#include "Test_v3lerp.h"
+#include "Test_v3skew.h"
+#include "Test_v3div.h"
+#include "Test_v3rotate.h"
+
+#include "Test_maxdot.h"
+#include "Test_mindot.h"
+#include "Test_dot3.h"
+#include "Test_3x3transpose.h"
+#include "Test_3x3transposeTimes.h"
+#include "Test_3x3timesTranspose.h"
+#include "Test_3x3mulM.h"
+#include "Test_3x3mulM1M2.h"
+#include "Test_3x3mulMV.h"
+#include "Test_3x3mulVM.h"
+#include "Test_3x3setRot.h"
+#include "Test_3x3getRot.h"
+
+#include "Test_btDbvt.h"
+#include "Test_quat_aos_neon.h"
+
+#include "LinearMath/btScalar.h"
+#define ENTRY( _name, _func )       { _name, _func }
+
+//
+// Test functions have the form  int (*TestFunc)( void )
+// They return a non-zero result in case of failure.
+//
+// Please see handy stuff in Utils.h, vector.h when writing your test code.
+//
+#if defined (BT_USE_NEON) || defined (BT_USE_SSE_IN_API)
+
+TestDesc  gTestList[] =     // table of { name, test function } pairs
+{
+    ENTRY( "maxdot", Test_maxdot ),
+    ENTRY( "mindot", Test_mindot ),
+
+    ENTRY( "qtmul", Test_qtmul ),
+    ENTRY( "qtmulQV3", Test_qtmulQV3 ),
+    ENTRY( "qtmulV3Q", Test_qtmulV3Q ),
+    ENTRY( "qtdot", Test_qtdot ),
+    ENTRY( "qtnorm", Test_qtnorm ),
+
+    ENTRY( "v3dot", Test_v3dot ),
+    ENTRY( "v3sdiv", Test_v3sdiv ),
+    ENTRY( "v3norm", Test_v3norm ),
+    ENTRY( "v3cross", Test_v3cross ),
+    ENTRY( "v3triple", Test_v3triple ),
+    ENTRY( "v3interp", Test_v3interp ),
+    ENTRY( "v3lerp", Test_v3lerp ),
+    ENTRY( "v3skew", Test_v3skew ),
+    ENTRY( "v3div", Test_v3div ),
+    ENTRY( "v3rotate", Test_v3rotate ),
+
+    ENTRY( "dot3", Test_dot3 ),
+    ENTRY( "3x3transpose", Test_3x3transpose ),
+    ENTRY( "3x3transposeTimes", Test_3x3transposeTimes ),
+    ENTRY( "3x3timesTranspose", Test_3x3timesTranspose ),
+    ENTRY( "3x3mulM", Test_3x3mulM ),
+    ENTRY( "3x3mulM1M2", Test_3x3mulM1M2 ),
+    ENTRY( "3x3mulMV", Test_3x3mulMV ),
+    ENTRY( "3x3mulVM", Test_3x3mulVM ),
+    ENTRY( "3x3setRot", Test_3x3setRot ),
+    ENTRY( "3x3getRot", Test_3x3getRot ),
+
+    ENTRY( "btDbvt", Test_btDbvt ),
+    ENTRY("quat_aos_neon", Test_quat_aos_neon),
+    // Terminating entry: both the name and the function pointer are NULL
+    { NULL, NULL }
+};
+#else
+TestDesc  gTestList[]={{NULL,NULL}};    // neither BT_USE_NEON nor BT_USE_SSE_IN_API is defined: only the terminating entry
+
+#endif
+
--- a/test/Bullet2/Source/TestList.h
+++ b/test/Bullet2/Source/TestList.h
@@ -0,0 +1,28 @@
+//
+//  TestList.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_TestList_h
+#define BulletTest_TestList_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+
+    
+typedef struct TestDesc         // one entry of gTestList
+{
+    const char *name;           // name the test is listed under; NULL in the terminating entry
+    int (*test_func)(void);     // return 0 for success, non-zero for failure
+}TestDesc;
+
+extern TestDesc  gTestList[];
+    
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/test/Bullet2/Source/Tests/Test_3x3getRot.cpp
+++ b/test/Bullet2/Source/Tests/Test_3x3getRot.cpp
@@ -0,0 +1,158 @@
+//
+//  Test_3x3getRot.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+#include "Test_3x3getRot.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btMatrix3x3.h>
+
+#define LOOPCOUNT 1000
+#define ARRAY_SIZE 128
+
+static inline btSimdFloat4 rand_f4(void)   // builds one input vector for the matrices under test
+{
+    return btAssign128( RANDF_m1p1, RANDF_m1p1, RANDF_m1p1, BT_NAN );      // w channel NaN; RANDF_m1p1 is presumably a random float in [-1, 1] -- confirm at its definition
+}
+
+static inline btSimdFloat4 qtNAN_f4(void)  // all four channels NaN; used to pre-fill output quaternions
+{
+    return btAssign128( BT_NAN, BT_NAN, BT_NAN, BT_NAN );
+}
+
+static void M3x3getRot_ref( const btMatrix3x3 &m, btQuaternion &q )
+{   // Scalar reference: derives a quaternion from matrix m and stores it in q.
+    btVector3 m_el[3] = { m[0], m[1], m[2] };
+    // Sum of the diagonal entries; its sign selects the branch below.
+    btScalar trace = m_el[0].x() + m_el[1].y() + m_el[2].z();
+    // Quaternion components in the order handed to q.setValue() at the end.
+    btScalar temp[4];
+
+    if (trace > btScalar(0.0)) 
+    {
+        btScalar s = btSqrt(trace + btScalar(1.0));
+        temp[3]=(s * btScalar(0.5));
+        s = btScalar(0.5) / s;      // s is reused: from here on it is the common scale factor
+
+        temp[0]=((m_el[2].y() - m_el[1].z()) * s);
+        temp[1]=((m_el[0].z() - m_el[2].x()) * s);
+        temp[2]=((m_el[1].x() - m_el[0].y()) * s);
+    } 
+    else 
+    {
+        int i = m_el[0].x() < m_el[1].y() ?         // i = index of the largest diagonal entry
+            (m_el[1].y() < m_el[2].z() ? 2 : 1) :
+            (m_el[0].x() < m_el[2].z() ? 2 : 0); 
+        int j = (i + 1) % 3;        // j and k are the remaining two indices, in cyclic order
+        int k = (i + 2) % 3;
+
+        btScalar s = btSqrt(m_el[i][i] - m_el[j][j] - m_el[k][k] + btScalar(1.0));
+        temp[i] = s * btScalar(0.5);
+        s = btScalar(0.5) / s;      // s is reused: from here on it is the common scale factor
+
+        temp[3] = (m_el[k][j] - m_el[j][k]) * s;
+        temp[j] = (m_el[j][i] + m_el[i][j]) * s;
+        temp[k] = (m_el[k][i] + m_el[i][k]) * s;
+    }
+    q.setValue(temp[0],temp[1],temp[2],temp[3]);
+}
+
+static int operator!= ( const btQuaternion &a, const btQuaternion &b )
+{
+    // Tolerant inequality: 1 when the summed absolute component differences exceed
+    // 4 * FLT_EPSILON, otherwise 0.
+    // NOTE(review): a NaN in the sum makes the '>' test false, so NaN results count as equal.
+    return ( fabs(a.x() - b.x()) +
+             fabs(a.y() - b.y()) +
+             fabs(a.z() - b.z()) +
+             fabs(a.w() - b.w()) > FLT_EPSILON * 4 ) ? 1 : 0;
+}
+
+int Test_3x3getRot(void)
+{
+    // Compares btMatrix3x3::getRotation() with M3x3getRot_ref() on ARRAY_SIZE generated
+    // matrices, then times both paths. Returns 0 on success, -1 at the first mismatch.
+    btMatrix3x3     in1[ARRAY_SIZE];
+    btQuaternion    out[ARRAY_SIZE];
+    btQuaternion    out2[ARRAY_SIZE];
+    // Generate the inputs and verify each result as it is produced
+    size_t i, j;
+    for( i = 0; i < ARRAY_SIZE; i++ )
+    {
+        in1[i] = btMatrix3x3(rand_f4(), rand_f4(), rand_f4() );
+        out[i] = btQuaternion(qtNAN_f4());      // outputs start as NaN before each path writes them
+        out2[i] = btQuaternion(qtNAN_f4());
+
+        M3x3getRot_ref(in1[i], out[i]);
+        in1[i].getRotation(out2[i]);
+
+        if( out[i] != out2[i] )
+        {
+            vlog( "Error - M3x3getRot result error! ");
+            vlog( "failure @ %lu\n", (unsigned long)i);    // i is a size_t; cast so it matches the format
+            vlog(   "\ncorrect = (%10.7f, %10.7f, %10.7f, %10.7f) "
+                    "\ntested  = (%10.7f, %10.7f, %10.7f, %10.7f) \n",
+                    out[i].x(), out[i].y(), out[i].z(), out[i].w(),
+                    out2[i].x(), out2[i].y(), out2[i].z(), out2[i].w());
+
+            return -1;
+        }
+    }
+
+    uint64_t scalarTime, vectorTime;
+    uint64_t startTime, bestTime, currentTime;
+    bestTime = ~(uint64_t)0;    // start at the maximum so any smaller sample replaces it
+    scalarTime = 0;
+    for (j = 0; j < LOOPCOUNT; j++)
+    {
+        startTime = ReadTicks();
+        for( i = 0; i < ARRAY_SIZE; i++ )
+            M3x3getRot_ref(in1[i], out[i]);
+        currentTime = ReadTicks() - startTime;
+        scalarTime += currentTime;
+        if( currentTime < bestTime )
+            bestTime = currentTime;
+    }
+    if( 0 == gReportAverageTimes )
+        scalarTime = bestTime;
+    else
+        scalarTime /= LOOPCOUNT;
+
+    bestTime = ~(uint64_t)0;    // reset before timing the second path
+    vectorTime = 0;
+    for (j = 0; j < LOOPCOUNT; j++)
+    {
+        startTime = ReadTicks();
+        for( i = 0; i < ARRAY_SIZE; i++ )
+        {
+            in1[i].getRotation(out2[i]);
+        }
+        currentTime = ReadTicks() - startTime;
+        vectorTime += currentTime;
+        if( currentTime < bestTime )
+            bestTime = currentTime;
+    }
+    if( 0 == gReportAverageTimes )
+        vectorTime = bestTime;
+    else
+        vectorTime /= LOOPCOUNT;
+
+    vlog( "Timing:\n" );
+    vlog( "\t    scalar\t    vector\n" );
+    vlog( "\t%10.2f\t%10.2f\n", TicksToCycles( scalarTime ) / ARRAY_SIZE, TicksToCycles( vectorTime ) / ARRAY_SIZE );
+
+    return 0;
+}
+
+#endif//BT_USE_SSE
--- a/test/Bullet2/Source/Tests/Test_3x3getRot.h
+++ b/test/Bullet2/Source/Tests/Test_3x3getRot.h
@@ -0,0 +1,22 @@
+//
+//  Test_3x3getRot.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_3x3getRot_h
+#define BulletTest_Test_3x3getRot_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+
+int Test_3x3getRot(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+    
+#endif
--- a/test/Bullet2/Source/Tests/Test_3x3mulM.cpp
+++ b/test/Bullet2/Source/Tests/Test_3x3mulM.cpp
@@ -0,0 +1,169 @@
+//
+//  Test_3x3mulM.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+#include "Test_3x3mulM.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btMatrix3x3.h>
+
+#define LOOPCOUNT 1000
+#define ARRAY_SIZE 128
+
+static inline btSimdFloat4 rand_f4(void)   // builds one input vector for the matrices under test
+{
+    return btAssign128( RANDF_01, RANDF_01, RANDF_01, BT_NAN );      // w channel NaN; RANDF_01 is presumably a random float in [0, 1] -- confirm at its definition
+}
+
+static btMatrix3x3 M3x3mulM_ref( btMatrix3x3 &in, const btMatrix3x3 &m )
+{
+    // Scalar reference for "in *= m". Copy the rows first: setValue() overwrites 'in'.
+    const btVector3 row0 = in[0], row1 = in[1], row2 = in[2];
+
+    in.setValue(
+        m.tdotx(row0), m.tdoty(row0), m.tdotz(row0),
+        m.tdotx(row1), m.tdoty(row1), m.tdotz(row1),
+        m.tdotx(row2), m.tdoty(row2), m.tdotz(row2));
+    return in;      // returned by value: the updated 'in' is copied out
+}
+
+static SIMD_FORCE_INLINE bool fuzzyEqualSlow(const btVector3& ref, const btVector3& other)
+{
+    // True when all four components (checked w first, then z, y, x) agree within SIMD_EPSILON.
+    const btScalar tolerance = SIMD_EPSILON;
+    for( int c = 3; c >= 0; c-- )
+        if( !(btFabs(ref.m_floats[c]-other.m_floats[c]) <= tolerance) ) return false;
+    return true;
+}
+	
+
+static int operator!= ( const btMatrix3x3 &a, const btMatrix3x3 &b )
+{
+    // Tolerant inequality for two 3x3 matrices, compared row by row.
+    //
+    // A row pair counts as different only when both checks report a difference:
+    //   1. btVector3's own operator!=, and
+    //   2. the per-component epsilon comparison in fuzzyEqualSlow().
+    //
+    // Returns 1 as soon as one row differs, 0 when all three rows match.
+    for( int row = 0; row < 3; row++ )
+    {
+        const btVector3 lhs = a.getRow(row);
+        const btVector3 rhs = b.getRow(row);
+
+        if( !(lhs != rhs) )
+            continue;       // rows compare equal, no fuzzy check needed
+
+        if( !fuzzyEqualSlow(lhs, rhs) )
+            return 1;       // outside tolerance
+    }
+
+    return 0;
+}
+
+int Test_3x3mulM(void)
+{
+    // Compares btMatrix3x3::operator*= with M3x3mulM_ref() on ARRAY_SIZE generated matrix
+    // pairs, then times both paths. Returns 0 on success, -1 at the first mismatch.
+    btMatrix3x3 in1[ARRAY_SIZE];
+    btMatrix3x3 in2[ARRAY_SIZE];
+    btMatrix3x3 in3[ARRAY_SIZE];
+    btMatrix3x3 out[ARRAY_SIZE];
+    btMatrix3x3 out2[ARRAY_SIZE];
+    // Generate the inputs and verify each result as it is produced
+    size_t i, j;
+    for( i = 0; i < ARRAY_SIZE; i++ )
+    {
+        in1[i] = btMatrix3x3(rand_f4(), rand_f4(), rand_f4() );
+        in2[i] = btMatrix3x3(rand_f4(), rand_f4(), rand_f4() );
+        in3[i] = in1[i];    // both paths overwrite their left operand, so each gets its own copy
+
+        out[i] = M3x3mulM_ref(in1[i], in2[i]);
+        out2[i] = (in3[i] *= in2[i]);
+
+        if( out[i] != out2[i] )
+        {
+            vlog( "Error - M3x3mulM result error! ");
+            vlog( "failure @ %lu\n", (unsigned long)i);    // i is a size_t; cast so it matches the format
+            btVector3 m0, m1, m2;
+            m0 = out[i].getRow(0);
+            m1 = out[i].getRow(1);
+            m2 = out[i].getRow(2);
+
+            vlog(   "\ncorrect = (%10.4f, %10.4f, %10.4f, %10.4f) "
+                    "\n          (%10.4f, %10.4f, %10.4f, %10.4f) "
+                    "\n          (%10.4f, %10.4f, %10.4f, %10.4f) \n",
+                    m0.m_floats[0], m0.m_floats[1], m0.m_floats[2], m0.m_floats[3],
+                    m1.m_floats[0], m1.m_floats[1], m1.m_floats[2], m1.m_floats[3],
+                    m2.m_floats[0], m2.m_floats[1], m2.m_floats[2], m2.m_floats[3]);
+
+            m0 = out2[i].getRow(0);
+            m1 = out2[i].getRow(1);
+            m2 = out2[i].getRow(2);
+
+            vlog(   "\ntested  = (%10.4f, %10.4f, %10.4f, %10.4f) "
+                    "\n          (%10.4f, %10.4f, %10.4f, %10.4f) "
+                    "\n          (%10.4f, %10.4f, %10.4f, %10.4f) \n",
+                    m0.m_floats[0], m0.m_floats[1], m0.m_floats[2], m0.m_floats[3],
+                    m1.m_floats[0], m1.m_floats[1], m1.m_floats[2], m1.m_floats[3],
+                    m2.m_floats[0], m2.m_floats[1], m2.m_floats[2], m2.m_floats[3]);
+
+            return -1;
+        }
+    }
+
+    uint64_t scalarTime, vectorTime;
+    uint64_t startTime, bestTime, currentTime;
+    bestTime = -1LL;    // converts to the largest uint64_t, so any smaller sample replaces it
+    scalarTime = 0;
+    for (j = 0; j < LOOPCOUNT; j++)
+    {
+        startTime = ReadTicks();
+        for( i = 0; i < ARRAY_SIZE; i++ )
+            out[i] = M3x3mulM_ref(in1[i], in2[i]);      // in1[i] keeps accumulating across passes
+        currentTime = ReadTicks() - startTime;
+        scalarTime += currentTime;
+        if( currentTime < bestTime )
+            bestTime = currentTime;
+    }
+    if( 0 == gReportAverageTimes )
+        scalarTime = bestTime;
+    else
+        scalarTime /= LOOPCOUNT;
+
+    bestTime = -1LL;
+    vectorTime = 0;
+    for (j = 0; j < LOOPCOUNT; j++)
+    {
+        startTime = ReadTicks();
+        for( i = 0; i < ARRAY_SIZE; i++ )
+            out2[i] = (in3[i] *= in2[i]);               // in3[i] keeps accumulating across passes
+        currentTime = ReadTicks() - startTime;
+        vectorTime += currentTime;
+        if( currentTime < bestTime )
+            bestTime = currentTime;
+    }
+    if( 0 == gReportAverageTimes )
+        vectorTime = bestTime;
+    else
+        vectorTime /= LOOPCOUNT;
+
+    vlog( "Timing:\n" );
+    vlog( "\t    scalar\t    vector\n" );
+    vlog( "\t%10.2f\t%10.2f\n", TicksToCycles( scalarTime ) / ARRAY_SIZE, TicksToCycles( vectorTime ) / ARRAY_SIZE );
+
+    return 0;
+}
+
+#endif //BT_USE_SSE
--- a/test/Bullet2/Source/Tests/Test_3x3mulM.h
+++ b/test/Bullet2/Source/Tests/Test_3x3mulM.h
@@ -0,0 +1,22 @@
+//
+//  Test_3x3mulM.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_3x3mulM_h
+#define BulletTest_Test_3x3mulM_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+
+int Test_3x3mulM(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+    
+#endif
--- a/test/Bullet2/Source/Tests/Test_3x3mulM1M2.cpp
+++ b/test/Bullet2/Source/Tests/Test_3x3mulM1M2.cpp
@@ -0,0 +1,164 @@
+//
+//  Test_3x3mulM1M2.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+
+#include "Test_3x3mulM1M2.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btMatrix3x3.h>
+
+#define LOOPCOUNT 1000
+#define ARRAY_SIZE 128
+
+static inline btSimdFloat4 rand_f4(void)   // builds one input vector for the matrices under test
+{
+    return btAssign128( RANDF_01, RANDF_01, RANDF_01, BT_NAN );      // w channel NaN; RANDF_01 is presumably a random float in [0, 1] -- confirm at its definition
+}
+
+static btMatrix3x3 M3x3mulM1M2_ref( const btMatrix3x3 &m1, const btMatrix3x3 &m2 )
+{
+    const btVector3 &r0 = m1[0], &r1 = m1[1], &r2 = m1[2];     // scalar reference for m1 * m2
+    return btMatrix3x3( m2.tdotx(r0), m2.tdoty(r0), m2.tdotz(r0),
+                        m2.tdotx(r1), m2.tdoty(r1), m2.tdotz(r1),
+                        m2.tdotx(r2), m2.tdoty(r2), m2.tdotz(r2) );
+}
+
+static bool fuzzyEqualSlow(const btVector3& ref, const btVector3& other)
+{
+    // True when all four components (checked w first, then z, y, x) agree within SIMD_EPSILON.
+    const btScalar tolerance = SIMD_EPSILON;
+    for( int c = 3; c >= 0; c-- )
+        if( !(btFabs(ref.m_floats[c]-other.m_floats[c]) <= tolerance) ) return false;
+    return true;
+}
+	
+
+static int operator!= ( const btMatrix3x3 &a, const btMatrix3x3 &b )
+{
+    // Tolerant inequality for two 3x3 matrices, compared row by row.
+    //
+    // A row pair counts as different only when both checks report a difference:
+    //   1. btVector3's own operator!=, and
+    //   2. the per-component epsilon comparison in fuzzyEqualSlow().
+    //
+    // Returns 1 as soon as one row differs, 0 when all three rows match.
+    for( int row = 0; row < 3; row++ )
+    {
+        const btVector3 lhs = a.getRow(row);
+        const btVector3 rhs = b.getRow(row);
+
+        if( !(lhs != rhs) )
+            continue;       // rows compare equal, no fuzzy check needed
+
+        if( !fuzzyEqualSlow(lhs, rhs) )
+            return 1;       // outside tolerance
+    }
+
+    return 0;
+}
+
+int Test_3x3mulM1M2(void)
+{
+    // Compares btMatrix3x3 operator* (matrix * matrix) with M3x3mulM1M2_ref() on ARRAY_SIZE
+    // generated pairs, then times both paths. Returns 0 on success, -1 at the first mismatch.
+    btMatrix3x3 in1[ARRAY_SIZE];
+    btMatrix3x3 in2[ARRAY_SIZE];
+    btMatrix3x3 out[ARRAY_SIZE];
+    btMatrix3x3 out2[ARRAY_SIZE];
+    // Generate the inputs and verify each result as it is produced
+    size_t i, j;
+    for( i = 0; i < ARRAY_SIZE; i++ )
+    {
+        in1[i] = btMatrix3x3(rand_f4(), rand_f4(), rand_f4() );
+        in2[i] = btMatrix3x3(rand_f4(), rand_f4(), rand_f4() );
+
+        out[i] = M3x3mulM1M2_ref(in1[i], in2[i]);
+        out2[i] = (in1[i] * in2[i]);
+
+        if( out[i] != out2[i] )
+        {
+            vlog( "Error - M3x3mulM1M2 result error! ");
+            vlog( "failure @ %lu\n", (unsigned long)i);    // i is a size_t; cast so it matches the format
+            btVector3 m0, m1, m2;
+            m0 = out[i].getRow(0);
+            m1 = out[i].getRow(1);
+            m2 = out[i].getRow(2);
+
+            vlog(   "\ncorrect = (%10.4f, %10.4f, %10.4f, %10.4f) "
+                    "\n          (%10.4f, %10.4f, %10.4f, %10.4f) "
+                    "\n          (%10.4f, %10.4f, %10.4f, %10.4f) \n",
+                    m0.m_floats[0], m0.m_floats[1], m0.m_floats[2], m0.m_floats[3],
+                    m1.m_floats[0], m1.m_floats[1], m1.m_floats[2], m1.m_floats[3],
+                    m2.m_floats[0], m2.m_floats[1], m2.m_floats[2], m2.m_floats[3]);
+
+            m0 = out2[i].getRow(0);
+            m1 = out2[i].getRow(1);
+            m2 = out2[i].getRow(2);
+
+            vlog(   "\ntested  = (%10.4f, %10.4f, %10.4f, %10.4f) "
+                    "\n          (%10.4f, %10.4f, %10.4f, %10.4f) "
+                    "\n          (%10.4f, %10.4f, %10.4f, %10.4f) \n",
+                    m0.m_floats[0], m0.m_floats[1], m0.m_floats[2], m0.m_floats[3],
+                    m1.m_floats[0], m1.m_floats[1], m1.m_floats[2], m1.m_floats[3],
+                    m2.m_floats[0], m2.m_floats[1], m2.m_floats[2], m2.m_floats[3]);
+
+            return -1;
+        }
+    }
+
+    uint64_t scalarTime, vectorTime;
+    uint64_t startTime, bestTime, currentTime;
+    bestTime = -1LL;    // converts to the largest uint64_t, so any smaller sample replaces it
+    scalarTime = 0;
+    for (j = 0; j < LOOPCOUNT; j++)
+    {
+        startTime = ReadTicks();
+        for( i = 0; i < ARRAY_SIZE; i++ )
+            out[i] = M3x3mulM1M2_ref(in1[i], in2[i]);
+        currentTime = ReadTicks() - startTime;
+        scalarTime += currentTime;
+        if( currentTime < bestTime )
+            bestTime = currentTime;
+    }
+    if( 0 == gReportAverageTimes )
+        scalarTime = bestTime;
+    else
+        scalarTime /= LOOPCOUNT;
+
+    bestTime = -1LL;
+    vectorTime = 0;
+    for (j = 0; j < LOOPCOUNT; j++)
+    {
+        startTime = ReadTicks();
+        for( i = 0; i < ARRAY_SIZE; i++ )
+            out2[i] = (in1[i] * in2[i]);
+        currentTime = ReadTicks() - startTime;
+        vectorTime += currentTime;
+        if( currentTime < bestTime )
+            bestTime = currentTime;
+    }
+    if( 0 == gReportAverageTimes )
+        vectorTime = bestTime;
+    else
+        vectorTime /= LOOPCOUNT;
+
+    vlog( "Timing:\n" );
+    vlog( "\t    scalar\t    vector\n" );
+    vlog( "\t%10.2f\t%10.2f\n", TicksToCycles( scalarTime ) / ARRAY_SIZE, TicksToCycles( vectorTime ) / ARRAY_SIZE );
+
+    return 0;
+}
+
+#endif //BT_USE_SSE
--- a/test/Bullet2/Source/Tests/Test_3x3mulM1M2.h
+++ b/test/Bullet2/Source/Tests/Test_3x3mulM1M2.h
@@ -0,0 +1,22 @@
+//
+//  Test_3x3mulM1M2.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_3x3mulM1M2_h
+#define BulletTest_Test_3x3mulM1M2_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+
+int Test_3x3mulM1M2(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+    
+#endif
--- a/test/Bullet2/Source/Tests/Test_3x3mulMV.cpp
+++ b/test/Bullet2/Source/Tests/Test_3x3mulMV.cpp
@@ -0,0 +1,112 @@
+//
+//  Test_3x3mulMV.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+
+#include "Test_3x3mulMV.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btMatrix3x3.h>
+
+#define LOOPCOUNT 1000
+#define ARRAY_SIZE 128
+
+static inline btSimdFloat4 rand_f4(void)   // builds one input vector for the operands under test
+{
+	return btAssign128(RANDF_01, RANDF_01, RANDF_01, BT_NAN );      // w channel NaN; RANDF_01 is presumably a random float in [0, 1] -- confirm at its definition
+}
+
+static btVector3 M3x3mulMV_ref( const btMatrix3x3 &m, const btVector3 &v )    // scalar reference for m * v
+{
+	return btVector3(m[0].dot(v), m[1].dot(v), m[2].dot(v));    // component r is m[r] dotted with v
+}
+
+int Test_3x3mulMV(void)
+{
+    // Compares operator* (matrix * vector) with M3x3mulMV_ref() on ARRAY_SIZE generated
+    // inputs, then times both paths. Returns 0 on success, 1 at the first mismatch.
+    btMatrix3x3 in1[ARRAY_SIZE];
+    btVector3   in2[ARRAY_SIZE];
+    btVector3   out[ARRAY_SIZE];
+    btVector3   out2[ARRAY_SIZE];
+    // Generate the inputs and verify each result as it is produced
+    size_t i, j;
+    for( i = 0; i < ARRAY_SIZE; i++ )
+    {
+        in1[i] = btMatrix3x3(rand_f4(), rand_f4(), rand_f4() );
+        in2[i] = btVector3(rand_f4());
+
+        out[i] = M3x3mulMV_ref(in1[i], in2[i]);
+        out2[i] = (in1[i] * in2[i]);
+
+        if( fabsf(out[i].m_floats[0] - out2[i].m_floats[0]) +
+            fabsf(out[i].m_floats[1] - out2[i].m_floats[1]) +
+            fabsf(out[i].m_floats[2] - out2[i].m_floats[2]) +
+            fabsf(out[i].m_floats[3] - out2[i].m_floats[3]) > FLT_EPSILON*4 )
+        {
+            vlog( "Error - M3x3mulMV result error! ");
+            vlog( "failure @ %lu\n", (unsigned long)i);    // i is a size_t; cast so it matches the format
+            vlog(   "\ncorrect = (%10.4f, %10.4f, %10.4f, %10.4f) "
+                    "\ntested  = (%10.4f, %10.4f, %10.4f, %10.4f) \n",
+                    out[i].m_floats[0], out[i].m_floats[1], out[i].m_floats[2], out[i].m_floats[3],
+                    out2[i].m_floats[0], out2[i].m_floats[1], out2[i].m_floats[2], out2[i].m_floats[3]);
+
+            return 1;
+        }
+    }
+
+    uint64_t scalarTime, vectorTime;
+    uint64_t startTime, bestTime, currentTime;
+    bestTime = -1LL;    // converts to the largest uint64_t, so any smaller sample replaces it
+    scalarTime = 0;
+    for (j = 0; j < LOOPCOUNT; j++)
+    {
+        startTime = ReadTicks();
+        for( i = 0; i < ARRAY_SIZE; i++ )
+            out[i] = M3x3mulMV_ref(in1[i], in2[i]);
+        currentTime = ReadTicks() - startTime;
+        scalarTime += currentTime;
+        if( currentTime < bestTime )
+            bestTime = currentTime;
+    }
+    if( 0 == gReportAverageTimes )
+        scalarTime = bestTime;
+    else
+        scalarTime /= LOOPCOUNT;
+
+    bestTime = -1LL;
+    vectorTime = 0;
+    for (j = 0; j < LOOPCOUNT; j++)
+    {
+        startTime = ReadTicks();
+        for( i = 0; i < ARRAY_SIZE; i++ )
+            out2[i] = (in1[i] * in2[i]);
+        currentTime = ReadTicks() - startTime;
+        vectorTime += currentTime;
+        if( currentTime < bestTime )
+            bestTime = currentTime;
+    }
+    if( 0 == gReportAverageTimes )
+        vectorTime = bestTime;
+    else
+        vectorTime /= LOOPCOUNT;
+
+    vlog( "Timing:\n" );
+    vlog( "\t    scalar\t    vector\n" );
+    vlog( "\t%10.2f\t%10.2f\n", TicksToCycles( scalarTime ) / ARRAY_SIZE, TicksToCycles( vectorTime ) / ARRAY_SIZE );
+
+    return 0;
+}
+#endif //BT_USE_SSE
--- a/test/Bullet2/Source/Tests/Test_3x3mulMV.h
+++ b/test/Bullet2/Source/Tests/Test_3x3mulMV.h
@@ -0,0 +1,23 @@
+//
+//  Test_3x3mulMV.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_3x3mulMV_h
+#define BulletTest_Test_3x3mulMV_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+
+int Test_3x3mulMV(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+    
+#endif
+
--- a/test/Bullet2/Source/Tests/Test_3x3mulVM.cpp
+++ b/test/Bullet2/Source/Tests/Test_3x3mulVM.cpp
@@ -0,0 +1,112 @@
+//
+//  Test_3x3mulVM.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+
+#include "Test_3x3mulVM.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btMatrix3x3.h>
+
+#define LOOPCOUNT 1000
+#define ARRAY_SIZE 128
+
+static inline btSimdFloat4 rand_f4(void)   // builds one input vector for the operands under test
+{
+    return btAssign128( RANDF_01, RANDF_01, RANDF_01, BT_NAN );      // w channel NaN; RANDF_01 is presumably a random float in [0, 1] -- confirm at its definition
+}
+
+static btVector3 M3x3mulVM_ref(  const btVector3 &v, const btMatrix3x3 &m)    // scalar reference for v * m
+{
+	return btVector3(m.tdotx(v), m.tdoty(v), m.tdotz(v));    // x, y, z come from m.tdotx/tdoty/tdotz applied to v
+}
+
+int Test_3x3mulVM(void)
+{
+    // Compares operator* (vector * matrix) with M3x3mulVM_ref() on ARRAY_SIZE generated
+    // inputs, then times both paths. Returns 0 on success, 1 at the first mismatch.
+    btVector3   in1[ARRAY_SIZE];
+    btMatrix3x3 in2[ARRAY_SIZE];
+    btVector3   out[ARRAY_SIZE];
+    btVector3   out2[ARRAY_SIZE];
+    // Generate the inputs and verify each result as it is produced
+    size_t i, j;
+    for( i = 0; i < ARRAY_SIZE; i++ )
+    {
+        in1[i] = btVector3(rand_f4());
+        in2[i] = btMatrix3x3(rand_f4(), rand_f4(), rand_f4() );
+
+        out[i] = M3x3mulVM_ref(in1[i], in2[i]);
+        out2[i] = (in1[i] * in2[i]);
+
+        if( fabsf(out[i].m_floats[0] - out2[i].m_floats[0]) +
+            fabsf(out[i].m_floats[1] - out2[i].m_floats[1]) +
+            fabsf(out[i].m_floats[2] - out2[i].m_floats[2]) +
+            fabsf(out[i].m_floats[3] - out2[i].m_floats[3]) > FLT_EPSILON*4 )
+        {
+            vlog( "Error - M3x3mulVM result error! ");
+            vlog( "failure @ %lu\n", (unsigned long)i);    // i is a size_t; cast so it matches the format
+            vlog(   "\ncorrect = (%10.4f, %10.4f, %10.4f, %10.4f) "
+                    "\ntested  = (%10.4f, %10.4f, %10.4f, %10.4f) \n",
+                    out[i].m_floats[0], out[i].m_floats[1], out[i].m_floats[2], out[i].m_floats[3],
+                    out2[i].m_floats[0], out2[i].m_floats[1], out2[i].m_floats[2], out2[i].m_floats[3]);
+
+            return 1;
+        }
+    }
+
+    uint64_t scalarTime, vectorTime;
+    uint64_t startTime, bestTime, currentTime;
+    bestTime = -1LL;    // converts to the largest uint64_t, so any smaller sample replaces it
+    scalarTime = 0;
+    for (j = 0; j < LOOPCOUNT; j++)
+    {
+        startTime = ReadTicks();
+        for( i = 0; i < ARRAY_SIZE; i++ )
+            out[i] = M3x3mulVM_ref(in1[i], in2[i]);
+        currentTime = ReadTicks() - startTime;
+        scalarTime += currentTime;
+        if( currentTime < bestTime )
+            bestTime = currentTime;
+    }
+    if( 0 == gReportAverageTimes )
+        scalarTime = bestTime;
+    else
+        scalarTime /= LOOPCOUNT;
+
+    bestTime = -1LL;
+    vectorTime = 0;
+    for (j = 0; j < LOOPCOUNT; j++)
+    {
+        startTime = ReadTicks();
+        for( i = 0; i < ARRAY_SIZE; i++ )
+            out2[i] = (in1[i] * in2[i]);
+        currentTime = ReadTicks() - startTime;
+        vectorTime += currentTime;
+        if( currentTime < bestTime )
+            bestTime = currentTime;
+    }
+    if( 0 == gReportAverageTimes )
+        vectorTime = bestTime;
+    else
+        vectorTime /= LOOPCOUNT;
+
+    vlog( "Timing:\n" );
+    vlog( "\t    scalar\t    vector\n" );
+    vlog( "\t%10.2f\t%10.2f\n", TicksToCycles( scalarTime ) / ARRAY_SIZE, TicksToCycles( vectorTime ) / ARRAY_SIZE );
+
+    return 0;
+}
+#endif //BT_USE_SSE
--- a/test/Bullet2/Source/Tests/Test_3x3mulVM.h
+++ b/test/Bullet2/Source/Tests/Test_3x3mulVM.h
@@ -0,0 +1,22 @@
+//
+//  Test_3x3mulVM.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_3x3mulVM_h
+#define BulletTest_Test_3x3mulVM_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+
+int Test_3x3mulVM(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+    
+#endif
--- a/test/Bullet2/Source/Tests/Test_3x3setRot.cpp
+++ b/test/Bullet2/Source/Tests/Test_3x3setRot.cpp
@@ -0,0 +1,171 @@
+//
+//  Test_3x3setRot.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+#include "Test_3x3setRot.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btMatrix3x3.h>
+
+#define LOOPCOUNT 1000
+#define ARRAY_SIZE 128
+
+static inline btSimdFloat4 rand_f4(void)   // builds one input vector for the matrices under test
+{
+    return btAssign128( RANDF_01, RANDF_01, RANDF_01, BT_NAN );      // w channel NaN; RANDF_01 is presumably a random float in [0, 1] -- confirm at its definition
+}
+
+static inline btSimdFloat4 qtrand_f4(void) // builds one input quaternion; unlike rand_f4, the w channel is a generated value too
+{
+    return btAssign128( RANDF_01, RANDF_01, RANDF_01, RANDF_01 );
+}
+
+static btMatrix3x3 M3x3setRot_ref( btMatrix3x3 &m, const btQuaternion &q )
+{   // Scalar reference: fills m from quaternion q and returns a copy of the result.
+    btScalar d = q.length2();
+    btScalar s = btScalar(2.0) / d;     // NOTE(review): no guard against d == 0
+    // Each quaternion component pre-scaled by s = 2 / d.
+    btScalar xs = q.x() * s,   ys = q.y() * s,   zs = q.z() * s;
+    // Pairwise products of the components with the scaled values above.
+    btScalar wx = q.w() * xs,  wy = q.w() * ys,  wz = q.w() * zs;
+    btScalar xx = q.x() * xs,  xy = q.x() * ys,  xz = q.x() * zs;
+    btScalar yy = q.y() * ys,  yz = q.y() * zs,  zz = q.z() * zs;
+    m.setValue(
+        btScalar(1.0) - (yy + zz), xy - wz, xz + wy,
+        xy + wz, btScalar(1.0) - (xx + zz), yz - wx,
+        xz - wy, yz + wx, btScalar(1.0) - (xx + yy));
+
+    return m;
+}
+
+
+static int operator!= ( const btMatrix3x3 &a, const btMatrix3x3 &b )
+{
+    // Tolerant inequality: a row pair differs when the summed absolute difference of
+    // its first three components exceeds 4 * FLT_EPSILON. m_floats[3] is not compared.
+    // NOTE(review): a NaN in the sum makes the '>' test false, so that row counts as equal.
+    int row = 0;
+    while( row < 3 )
+    {
+        const btVector3 lhs = a.getRow(row);
+        const btVector3 rhs = b.getRow(row);
+        if( fabs(lhs.m_floats[0] - rhs.m_floats[0]) +
+            fabs(lhs.m_floats[1] - rhs.m_floats[1]) +
+            fabs(lhs.m_floats[2] - rhs.m_floats[2]) > FLT_EPSILON * 4)
+            return 1;
+        row++;
+    }
+    return 0;
+}
+
+int Test_3x3setRot(void)
+{
+    // Compares btMatrix3x3::setRotation() with M3x3setRot_ref() on ARRAY_SIZE generated
+    // quaternions, then times both paths. Returns 0 on success, -1 at the first mismatch.
+    btMatrix3x3     in1[ARRAY_SIZE];
+    btQuaternion    in2[ARRAY_SIZE];
+    btMatrix3x3     in3[ARRAY_SIZE];
+    btMatrix3x3     out[ARRAY_SIZE];
+    btMatrix3x3     out2[ARRAY_SIZE];
+    // Generate the inputs and verify each result as it is produced
+    size_t i, j;
+    for( i = 0; i < ARRAY_SIZE; i++ )
+    {
+        in1[i] = btMatrix3x3(rand_f4(), rand_f4(), rand_f4() );
+        in2[i] = btQuaternion(qtrand_f4());
+        in3[i] = in1[i];    // both paths overwrite their matrix, so each gets its own copy
+
+        out[i] = M3x3setRot_ref(in1[i], in2[i]);
+        in3[i].setRotation(in2[i]);
+        out2[i] = in3[i];
+
+        if( out[i] != out2[i] )
+        {
+            vlog( "Error - M3x3setRot result error! ");
+            vlog( "failure @ %lu\n", (unsigned long)i);    // i is a size_t; cast so it matches the format
+            btVector3 m0, m1, m2;
+            m0 = out[i].getRow(0);
+            m1 = out[i].getRow(1);
+            m2 = out[i].getRow(2);
+
+            vlog(   "\ncorrect = (%10.7f, %10.7f, %10.7f, %10.7f) "
+                    "\n          (%10.7f, %10.7f, %10.7f, %10.7f) "
+                    "\n          (%10.7f, %10.7f, %10.7f, %10.7f) \n",
+                    m0.m_floats[0], m0.m_floats[1], m0.m_floats[2], m0.m_floats[3],
+                    m1.m_floats[0], m1.m_floats[1], m1.m_floats[2], m1.m_floats[3],
+                    m2.m_floats[0], m2.m_floats[1], m2.m_floats[2], m2.m_floats[3]);
+
+            m0 = out2[i].getRow(0);
+            m1 = out2[i].getRow(1);
+            m2 = out2[i].getRow(2);
+
+            vlog(   "\ntested  = (%10.7f, %10.7f, %10.7f, %10.7f) "
+                    "\n          (%10.7f, %10.7f, %10.7f, %10.7f) "
+                    "\n          (%10.7f, %10.7f, %10.7f, %10.7f) \n",
+                    m0.m_floats[0], m0.m_floats[1], m0.m_floats[2], m0.m_floats[3],
+                    m1.m_floats[0], m1.m_floats[1], m1.m_floats[2], m1.m_floats[3],
+                    m2.m_floats[0], m2.m_floats[1], m2.m_floats[2], m2.m_floats[3]);
+
+            return -1;
+        }
+    }
+
+    uint64_t scalarTime, vectorTime;
+    uint64_t startTime, bestTime, currentTime;
+    bestTime = -1LL;    // converts to the largest uint64_t, so any smaller sample replaces it
+    scalarTime = 0;
+    for (j = 0; j < LOOPCOUNT; j++)
+    {
+        startTime = ReadTicks();
+        for( i = 0; i < ARRAY_SIZE; i++ )
+            out[i] = M3x3setRot_ref(in1[i], in2[i]);
+        currentTime = ReadTicks() - startTime;
+        scalarTime += currentTime;
+        if( currentTime < bestTime )
+            bestTime = currentTime;
+    }
+    if( 0 == gReportAverageTimes )
+        scalarTime = bestTime;
+    else
+        scalarTime /= LOOPCOUNT;
+
+    bestTime = -1LL;
+    vectorTime = 0;
+    for (j = 0; j < LOOPCOUNT; j++)
+    {
+        startTime = ReadTicks();
+        for( i = 0; i < ARRAY_SIZE; i++ )
+        {
+            in3[i].setRotation(in2[i]);
+            out2[i] = in3[i];
+        }
+        currentTime = ReadTicks() - startTime;
+        vectorTime += currentTime;
+        if( currentTime < bestTime )
+            bestTime = currentTime;
+    }
+    if( 0 == gReportAverageTimes )
+        vectorTime = bestTime;
+    else
+        vectorTime /= LOOPCOUNT;
+
+    vlog( "Timing:\n" );
+    vlog( "\t    scalar\t    vector\n" );
+    vlog( "\t%10.2f\t%10.2f\n", TicksToCycles( scalarTime ) / ARRAY_SIZE, TicksToCycles( vectorTime ) / ARRAY_SIZE );
+
+    return 0;
+}
+#endif //BT_USE_SSE
+
--- a/test/Bullet2/Source/Tests/Test_3x3setRot.h
+++ b/test/Bullet2/Source/Tests/Test_3x3setRot.h
@@ -0,0 +1,22 @@
+//
+//  Test_3x3setRot.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_3x3setRot_h
+#define BulletTest_Test_3x3setRot_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+// Test entry point; the surrounding guard gives it C linkage when compiled as C++.
+int Test_3x3setRot(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+    
+#endif
--- a/test/Bullet2/Source/Tests/Test_3x3timesTranspose.cpp
+++ b/test/Bullet2/Source/Tests/Test_3x3timesTranspose.cpp
@@ -0,0 +1,117 @@
+//
+//  Test_3x3timesTranspose.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+
+#include "Test_3x3timesTranspose.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btMatrix3x3.h>
+
+#define LOOPCOUNT 1000
+#define ARRAY_SIZE 128
+
+static inline btSimdFloat4 rand_f4(void)
+{   // Builds one 4-lane test vector: x, y and z each come from a separate RANDF expansion.
+    return btAssign128( RANDF, RANDF, RANDF, BT_NAN );      // w channel NaN; presumably to expose code that wrongly depends on w (TODO confirm)
+}
+
+static btMatrix3x3 timesTranspose( const btMatrix3x3 &in, const btMatrix3x3 &m )
+{   // Scalar reference for in * transpose(m): entry (r,c) is dot(row r of in, row c of m).
+    const btVector3 a0 = in[0], a1 = in[1], a2 = in[2];
+    return btMatrix3x3(
+        a0.dot(m[0]), a0.dot(m[1]), a0.dot(m[2]),
+        a1.dot(m[0]), a1.dot(m[1]), a1.dot(m[2]),
+        a2.dot(m[0]), a2.dot(m[1]), a2.dot(m[2]));
+}
+
+static int operator!= ( const btMatrix3x3 &a, const btMatrix3x3 &b )
+{
+    // Row-wise comparison: 1 as soon as any pair of rows differs, else 0.
+    for( int row = 0; row < 3; row++ )
+    {
+        if( a.getRow(row) != b.getRow(row) )
+            return 1;
+    }
+    return 0;
+}
+
+int Test_3x3timesTranspose(void)
+{
+    // Checks btMatrix3x3::timesTranspose against the scalar timesTranspose() above,
+    // then times both. Returns 0 on success, -1 at the first mismatching matrix.
+    btMatrix3x3 in1[ARRAY_SIZE];     // plain stack arrays (no guard pages involved)
+    btMatrix3x3 in2[ARRAY_SIZE];
+    btMatrix3x3 out[ARRAY_SIZE];     // reference results
+    btMatrix3x3 out2[ARRAY_SIZE];    // results of the method under test
+    // Init the data, verifying each result as soon as it is produced
+    size_t i, j;
+    for( i = 0; i < ARRAY_SIZE; i++ )
+    {
+        in1[i] = btMatrix3x3(rand_f4(), rand_f4(), rand_f4() );
+        in2[i] = btMatrix3x3(rand_f4(), rand_f4(), rand_f4() );
+
+        out[i] = timesTranspose(in1[i], in2[i]);
+        out2[i] = in1[i].timesTranspose(in2[i]);
+
+        if( out[i] != out2[i] )
+        {
+            printf( "failure @ %lu\n", (unsigned long) i );   // i is size_t, so cast to match %lu
+            return -1;
+        }
+    }
+    // Time the scalar reference: keep the best run, or the average if gReportAverageTimes is set
+    uint64_t scalarTime, vectorTime;
+    uint64_t startTime, bestTime, currentTime;
+    bestTime = -1LL;    // converts to the largest uint64_t, so any measurement replaces it
+    scalarTime = 0;
+    for (j = 0; j < LOOPCOUNT; j++) {
+        startTime = ReadTicks();
+        for( i = 0; i < ARRAY_SIZE; i++ )
+            out[i] = timesTranspose(in1[i], in2[i]);
+        currentTime = ReadTicks() - startTime;
+        scalarTime += currentTime;
+        if( currentTime < bestTime )
+            bestTime = currentTime;
+    }
+    if( 0 == gReportAverageTimes )
+        scalarTime = bestTime;
+    else
+        scalarTime /= LOOPCOUNT;
+    // Time the method under test the same way, writing to out2 so the reference results stay intact
+    bestTime = -1LL;
+    vectorTime = 0;
+    for (j = 0; j < LOOPCOUNT; j++) {
+        startTime = ReadTicks();
+        for( i = 0; i < ARRAY_SIZE; i++ )
+            out2[i] = in1[i].timesTranspose(in2[i]);
+        currentTime = ReadTicks() - startTime;
+        vectorTime += currentTime;
+        if( currentTime < bestTime )
+            bestTime = currentTime;
+    }
+    if( 0 == gReportAverageTimes )
+        vectorTime = bestTime;
+    else
+        vectorTime /= LOOPCOUNT;
+
+    vlog( "Timing:\n" );
+    vlog( "\t    scalar\t    vector\n" );
+    vlog( "\t%10.2f\t%10.2f\n", TicksToCycles( scalarTime ) / ARRAY_SIZE, TicksToCycles( vectorTime ) / ARRAY_SIZE );
+
+    return 0;
+}
+
+#endif // BT_USE_SSE_IN_API || BT_USE_NEON
--- a/test/Bullet2/Source/Tests/Test_3x3timesTranspose.h
+++ b/test/Bullet2/Source/Tests/Test_3x3timesTranspose.h
@@ -0,0 +1,22 @@
+//
+//  Test_3x3timesTranspose.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_3x3timesTranspose_h
+#define BulletTest_Test_3x3timesTranspose_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+// Checks and times btMatrix3x3::timesTranspose; returns 0 on success, -1 on a mismatch.
+int Test_3x3timesTranspose(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+    
+#endif
--- a/test/Bullet2/Source/Tests/Test_3x3transpose.cpp
+++ b/test/Bullet2/Source/Tests/Test_3x3transpose.cpp
@@ -0,0 +1,116 @@
+//
+//  Test_3x3transpose.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+
+#include "Test_3x3transpose.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btMatrix3x3.h>
+
+#define LOOPCOUNT 1000
+#define ARRAY_SIZE 1024 
+
+static inline btSimdFloat4 rand_f4(void)
+{   // Builds one 4-lane test vector: x, y and z each come from a separate RANDF expansion.
+    return btAssign128( RANDF, RANDF, RANDF, BT_NAN );      // w channel NaN; presumably to expose code that wrongly depends on w (TODO confirm)
+}
+
+static btMatrix3x3 Transpose( const btMatrix3x3 &in )
+{   // Scalar reference transpose: output row k gathers component k of each input row; w lanes are 0.
+    const btVector3 row0 = in.getRow(0);
+    const btVector3 row1 = in.getRow(1);
+    const btVector3 row2 = in.getRow(2);
+	btVector3 col0 = btAssign128(row0.x(), row1.x(), row2.x(), 0 );
+	btVector3 col1 = btAssign128(row0.y(), row1.y(), row2.y(), 0 );
+	btVector3 col2 = btAssign128(row0.z(), row1.z(), row2.z(), 0);
+	return btMatrix3x3( col0, col1, col2);
+}
+
+static int operator!= ( const btMatrix3x3 &a, const btMatrix3x3 &b )
+{
+    // Reports 1 if any of the three rows differ between 'a' and 'b', otherwise 0.
+    int rowIndex = 0;
+    for( ; rowIndex < 3; ++rowIndex )
+    {
+        if( a.getRow(rowIndex) != b.getRow(rowIndex) ) return 1;
+    }
+    return 0;
+}
+
+int Test_3x3transpose(void)
+{
+    // Checks btMatrix3x3::transpose against the scalar Transpose() above, then times
+    // both. Returns 0 on success, -1 at the first mismatching matrix.
+    btMatrix3x3 in[ARRAY_SIZE];      // plain stack arrays (no guard pages involved)
+    btMatrix3x3 out[ARRAY_SIZE];     // reference results
+    btMatrix3x3 out2[ARRAY_SIZE];    // results of the method under test
+    // Init the data, verifying each result as soon as it is produced
+    size_t i, j;
+    for( i = 0; i < ARRAY_SIZE; i++ )
+    {
+        in[i] = btMatrix3x3(rand_f4(), rand_f4(), rand_f4() );
+
+        out[i] = Transpose(in[i]);
+        out2[i] = in[i].transpose();
+
+        if( out[i] != out2[i] )
+        {
+            printf( "failure @ %lu\n", (unsigned long) i );   // i is size_t, so cast to match %lu
+            return -1;
+        }
+    }
+    // Time the scalar reference: keep the best run, or the average if gReportAverageTimes is set
+    uint64_t scalarTime, vectorTime;
+    uint64_t startTime, bestTime, currentTime;
+    bestTime = -1LL;    // converts to the largest uint64_t, so any measurement replaces it
+    scalarTime = 0;
+    for (j = 0; j < LOOPCOUNT; j++) {
+        startTime = ReadTicks();
+        for( i = 0; i < ARRAY_SIZE; i++ )
+            out[i] = Transpose(in[i]);
+        currentTime = ReadTicks() - startTime;
+        scalarTime += currentTime;
+        if( currentTime < bestTime )
+            bestTime = currentTime;
+    }
+    if( 0 == gReportAverageTimes )
+        scalarTime = bestTime;
+    else
+        scalarTime /= LOOPCOUNT;
+    // Time the method under test the same way, writing to out2 so the reference results stay intact
+    bestTime = -1LL;
+    vectorTime = 0;
+    for (j = 0; j < LOOPCOUNT; j++) {
+        startTime = ReadTicks();
+        for( i = 0; i < ARRAY_SIZE; i++ )
+            out2[i] = in[i].transpose();
+        currentTime = ReadTicks() - startTime;
+        vectorTime += currentTime;
+        if( currentTime < bestTime )
+            bestTime = currentTime;
+    }
+    if( 0 == gReportAverageTimes )
+        vectorTime = bestTime;
+    else
+        vectorTime /= LOOPCOUNT;
+
+    vlog( "Timing:\n" );
+    vlog( "\t    scalar\t    vector\n" );
+    vlog( "\t%10.2f\t%10.2f\n", TicksToCycles( scalarTime ) / ARRAY_SIZE, TicksToCycles( vectorTime ) / ARRAY_SIZE );
+
+    return 0;
+}
+#endif // BT_USE_SSE_IN_API || BT_USE_NEON
+
--- a/test/Bullet2/Source/Tests/Test_3x3transpose.h
+++ b/test/Bullet2/Source/Tests/Test_3x3transpose.h
@@ -0,0 +1,22 @@
+//
+//  Test_3x3transpose.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_3x3transpose_h
+#define BulletTest_Test_3x3transpose_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+    // Checks and times btMatrix3x3::transpose; returns 0 on success, -1 on a mismatch.
+    int Test_3x3transpose(void);
+    
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif
--- a/test/Bullet2/Source/Tests/Test_3x3transposeTimes.cpp
+++ b/test/Bullet2/Source/Tests/Test_3x3transposeTimes.cpp
@@ -0,0 +1,168 @@
+//
+//  Test_3x3transposeTimes.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+
+#include "Test_3x3transposeTimes.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btMatrix3x3.h>
+
+#define LOOPCOUNT 1000
+#define ARRAY_SIZE 128
+
+static inline btSimdFloat4 rand_f4(void)
+{   // Builds one 4-lane test vector: x, y and z each come from a separate RANDF_01 expansion.
+    return btAssign128( RANDF_01, RANDF_01, RANDF_01, BT_NAN );      // w channel NaN; presumably to expose code that wrongly depends on w (TODO confirm)
+}
+
+static btMatrix3x3 TransposeTimesReference( const btMatrix3x3 &in, const btMatrix3x3 &m )
+{   // Scalar reference for transpose(in) * m: entry (r,c) sums in[k][r] * m[k][c] over k; w lanes are 0.
+    const btVector3 a0 = in[0], a1 = in[1], a2 = in[2];
+    btSimdFloat4 r0 = btAssign128( a0.x() * m[0].x() + a1.x() * m[1].x() + a2.x() * m[2].x(),
+                                   a0.x() * m[0].y() + a1.x() * m[1].y() + a2.x() * m[2].y(),
+                                   a0.x() * m[0].z() + a1.x() * m[1].z() + a2.x() * m[2].z(),
+                                   0.0f );
+    btSimdFloat4 r1 = btAssign128( a0.y() * m[0].x() + a1.y() * m[1].x() + a2.y() * m[2].x(),
+                                   a0.y() * m[0].y() + a1.y() * m[1].y() + a2.y() * m[2].y(),
+                                   a0.y() * m[0].z() + a1.y() * m[1].z() + a2.y() * m[2].z(),
+                                   0.0f );
+    btSimdFloat4 r2 = btAssign128( a0.z() * m[0].x() + a1.z() * m[1].x() + a2.z() * m[2].x(),
+                                   a0.z() * m[0].y() + a1.z() * m[1].y() + a2.z() * m[2].y(),
+                                   a0.z() * m[0].z() + a1.z() * m[1].z() + a2.z() * m[2].z(),
+                                   0.0f );
+    return btMatrix3x3( r0, r1, r2 );
+}
+
+static int operator!= ( const btMatrix3x3 &a, const btMatrix3x3 &b )
+{
+    // 1 when at least one of the three rows differs, 0 when all rows compare equal.
+    // The || chain stops at the first differing row, like the original early returns.
+    const bool anyRowDiffers =
+        ( a.getRow(0) != b.getRow(0) ) ||
+        ( a.getRow(1) != b.getRow(1) ) ||
+        ( a.getRow(2) != b.getRow(2) );
+    return anyRowDiffers ? 1 : 0;
+}
+
+int Test_3x3transposeTimes(void)
+{
+    // Checks btMatrix3x3::transposeTimes against TransposeTimesReference() and then times
+    // both. Returns 0 on success, -1 when an element's relative error exceeds 1e-6.
+    btMatrix3x3 in1[ARRAY_SIZE];     // plain stack arrays (no guard pages involved)
+    btMatrix3x3 in2[ARRAY_SIZE];
+    btMatrix3x3 out[ARRAY_SIZE];     // reference results
+    btMatrix3x3 out2[ARRAY_SIZE];    // results of the method under test
+	float maxRelativeError = 0.f;    // largest error seen that was still within tolerance
+    // Init the data
+    size_t i, j;
+    for( i = 0; i < ARRAY_SIZE; i++ )
+    {
+        in1[i] = btMatrix3x3(rand_f4(), rand_f4(), rand_f4() );
+        in2[i] = btMatrix3x3(rand_f4(), rand_f4(), rand_f4() );
+
+        out[i] = TransposeTimesReference(in1[i], in2[i]);
+        out2[i] = in1[i].transposeTimes(in2[i]);
+
+        if( out[i] != out2[i] )
+        {
+			float relativeError = 0.f;
+			for (int column=0;column<3;column++)
+			for (int row=0;row<3;row++)
+			{
+				const btScalar expected = out[i][row][column];
+				const btScalar delta = btFabs(out2[i][row][column] - expected);
+				if (delta != 0)   // exact matches are skipped so a zero reference cannot give 0/0
+					relativeError = btMax(relativeError, delta / btFabs(expected));   // magnitude: a negative reference must not yield a negative ratio
+			}
+			if (relativeError>1e-6)
+			{
+				vlog( "failure @ %lu\n", (unsigned long) i);   // i is size_t, so cast to match %lu
+				btVector3 m0, m1, m2;
+				m0 = out[i].getRow(0);
+				m1 = out[i].getRow(1);
+				m2 = out[i].getRow(2);
+				vlog(   "\ncorrect = (%10.4f, %10.4f, %10.4f, %10.4f) "
+						"\n          (%10.4f, %10.4f, %10.4f, %10.4f) "
+						"\n          (%10.4f, %10.4f, %10.4f, %10.4f) \n",
+						m0.m_floats[0], m0.m_floats[1], m0.m_floats[2], m0.m_floats[3], 
+						m1.m_floats[0], m1.m_floats[1], m1.m_floats[2], m1.m_floats[3],
+						m2.m_floats[0], m2.m_floats[1], m2.m_floats[2], m2.m_floats[3]); 
+
+				m0 = out2[i].getRow(0);
+				m1 = out2[i].getRow(1);
+				m2 = out2[i].getRow(2);
+				vlog(   "\ntested  = (%10.4f, %10.4f, %10.4f, %10.4f) "
+						"\n          (%10.4f, %10.4f, %10.4f, %10.4f) " 
+						"\n          (%10.4f, %10.4f, %10.4f, %10.4f) \n", 
+						m0.m_floats[0], m0.m_floats[1], m0.m_floats[2], m0.m_floats[3], 
+						m1.m_floats[0], m1.m_floats[1], m1.m_floats[2], m1.m_floats[3],
+						m2.m_floats[0], m2.m_floats[1], m2.m_floats[2], m2.m_floats[3]); 
+
+				return -1;
+			} else
+			{
+				if (relativeError>maxRelativeError)
+					maxRelativeError = relativeError;
+			}
+        }
+    }
+
+	if (maxRelativeError)
+	{
+		printf("Warning: maxRelativeError = %e\n",maxRelativeError);
+	}
+    uint64_t scalarTime, vectorTime;
+    uint64_t startTime, bestTime, currentTime;
+    bestTime = -1LL;    // converts to the largest uint64_t, so any measurement replaces it
+    scalarTime = 0;
+    for (j = 0; j < LOOPCOUNT; j++) {
+        startTime = ReadTicks();
+        for( i = 0; i < ARRAY_SIZE; i++ )
+            out[i] = TransposeTimesReference(in1[i], in2[i]);
+        currentTime = ReadTicks() - startTime;
+        scalarTime += currentTime;
+        if( currentTime < bestTime )
+            bestTime = currentTime;
+    }
+    if( 0 == gReportAverageTimes )
+        scalarTime = bestTime;
+    else
+        scalarTime /= LOOPCOUNT;
+    // Time the method under test the same way, writing to out2 so the reference results stay intact
+    bestTime = -1LL;
+    vectorTime = 0;
+    for (j = 0; j < LOOPCOUNT; j++) {
+        startTime = ReadTicks();
+        for( i = 0; i < ARRAY_SIZE; i++ )
+            out2[i] = in1[i].transposeTimes(in2[i]);
+        currentTime = ReadTicks() - startTime;
+        vectorTime += currentTime;
+        if( currentTime < bestTime )
+            bestTime = currentTime;
+    }
+    if( 0 == gReportAverageTimes )
+        vectorTime = bestTime;
+    else
+        vectorTime /= LOOPCOUNT;
+
+    vlog( "Timing:\n" );
+    vlog( "\t    scalar\t    vector\n" );
+    vlog( "\t%10.2f\t%10.2f\n", TicksToCycles( scalarTime ) / ARRAY_SIZE, TicksToCycles( vectorTime ) / ARRAY_SIZE );
+
+    return 0;
+}
+
+#endif // BT_USE_SSE_IN_API || BT_USE_NEON
+
--- a/test/Bullet2/Source/Tests/Test_3x3transposeTimes.h
+++ b/test/Bullet2/Source/Tests/Test_3x3transposeTimes.h
@@ -0,0 +1,22 @@
+//
+//  Test_3x3transposeTimes.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_3x3transposeTimes_h
+#define BulletTest_Test_3x3transposeTimes_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+// Checks and times btMatrix3x3::transposeTimes; returns 0 on success, -1 on failure.
+int Test_3x3transposeTimes(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+    
+#endif
--- a/test/Bullet2/Source/Tests/Test_btDbvt.cpp
+++ b/test/Bullet2/Source/Tests/Test_btDbvt.cpp
@@ -0,0 +1,495 @@
+//
+//  Test_btDbvt.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc., Inc.
+//
+
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+#include "Test_btDbvt.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <BulletCollision/BroadphaseCollision/btDbvt.h>
+
+// Scalar reference implementations that Test_btDbvt compares against Intersect, Merge and Select.
+SIMD_FORCE_INLINE bool Intersect_ref( btDbvtAabbMm& a,  btDbvtAabbMm& b)
+{
+    // True when the [tMins, tMaxs] ranges of 'a' and 'b' overlap on x, y and z.
+    // The comparisons are inclusive, so ranges that merely touch still count.
+    const bool overlapX = (a.tMins().x() <= b.tMaxs().x()) && (a.tMaxs().x() >= b.tMins().x());
+    const bool overlapY = (a.tMins().y() <= b.tMaxs().y()) && (a.tMaxs().y() >= b.tMins().y());
+    const bool overlapZ = (a.tMins().z() <= b.tMaxs().z()) && (a.tMaxs().z() >= b.tMins().z());
+
+    return overlapX && overlapY && overlapZ;
+}
+
+
+SIMD_FORCE_INLINE btScalar Proximity_ref( btDbvtAabbMm& a, btDbvtAabbMm& b )
+{   // Sum of absolute x, y, z components of (a.tMins + a.tMaxs) - (b.tMins + b.tMaxs).
+	const btVector3 sumA = a.tMins() + a.tMaxs();
+	const btVector3 offset = sumA - (b.tMins() + b.tMaxs());
+	return btFabs(offset.x()) + btFabs(offset.y()) + btFabs(offset.z());
+}
+
+
+
+SIMD_FORCE_INLINE int Select_ref( btDbvtAabbMm& o, btDbvtAabbMm& a, btDbvtAabbMm& b )
+{   // 0 when Proximity_ref(o,a) is strictly smaller than Proximity_ref(o,b), otherwise 1.
+	if( Proximity_ref(o,a) < Proximity_ref(o,b) )
+		return 0;
+	return 1;
+}
+
+
+SIMD_FORCE_INLINE void Merge_ref( btDbvtAabbMm& a,
+                                  btDbvtAabbMm& b,
+                                  btDbvtAabbMm& r )
+{
+    // Scalar reference for Merge(): stores in 'r' the per-component minimum of the
+    // two tMins() and the per-component maximum of the two tMaxs().
+    //
+    // Only components 0..2 are processed. Looping to 4 would mirror the vector code,
+    // which writes all 4 floats, but (per Erwin) that must not be done: the 4th
+    // component is ignored and not computed by non-vector code (there is no NEON
+    // version, and the scalar path handles just 3 components).
+	for( int axis = 0; axis < 3; ++axis )
+	{
+		r.tMins().m_floats[axis] = ( a.tMins().m_floats[axis] < b.tMins().m_floats[axis] )
+		                         ? a.tMins().m_floats[axis]
+		                         : b.tMins().m_floats[axis];
+
+		r.tMaxs().m_floats[axis] = ( a.tMaxs().m_floats[axis] > b.tMaxs().m_floats[axis] )
+		                         ? a.tMaxs().m_floats[axis]
+		                         : b.tMaxs().m_floats[axis];
+	}
+}
+/*
+[0]	float32_t	0.0318338
+[1]	float32_t	0.0309355
+[2]	float32_t	0.93264
+[3]	float32_t	0.88788
+
+[0]	float32_t	0.59133
+[1]	float32_t	0.478779
+[2]	float32_t	0.833354
+[3]	float32_t	0.186335
+
+[0]	float32_t	0.242578
+[1]	float32_t	0.0134696
+[2]	float32_t	0.383139
+[3]	float32_t	0.414653
+
+[0]	float32_t	0.067769
+[1]	float32_t	0.993127
+[2]	float32_t	0.484308
+[3]	float32_t	0.765338
+*/
+
+#define LOOPCOUNT 1000
+#define NUM_CYCLES 10000
+#define DATA_SIZE 1024
+
+int Test_btDbvt(void)
+{
+    btDbvtAabbMm a[DATA_SIZE], b[DATA_SIZE], c[DATA_SIZE];
+    btDbvtAabbMm a_ref[DATA_SIZE], b_ref[DATA_SIZE], c_ref[DATA_SIZE];
+        
+    int i;
+        
+    bool Intersect_Test_Res[DATA_SIZE], Intersect_Ref_Res[DATA_SIZE];
+    int Select_Test_Res[DATA_SIZE], Select_Ref_Res[DATA_SIZE];
+    
+    
+    for (i = 0; i < DATA_SIZE; i++)
+    {
+        a[i].tMins().m_floats[0] = (float)rand() / (float)RAND_MAX;
+        a[i].tMins().m_floats[1] = (float)rand() / (float)RAND_MAX;
+        a[i].tMins().m_floats[2] = (float)rand() / (float)RAND_MAX;
+        a[i].tMins().m_floats[3] = (float)rand() / (float)RAND_MAX;
+        
+        a[i].tMaxs().m_floats[0] = (float)rand() / (float)RAND_MAX;
+        a[i].tMaxs().m_floats[1] = (float)rand() / (float)RAND_MAX;
+        a[i].tMaxs().m_floats[2] = (float)rand() / (float)RAND_MAX;
+        a[i].tMaxs().m_floats[3] = (float)rand() / (float)RAND_MAX;
+        
+        b[i].tMins().m_floats[0] = (float)rand() / (float)RAND_MAX;
+        b[i].tMins().m_floats[1] = (float)rand() / (float)RAND_MAX;
+        b[i].tMins().m_floats[2] = (float)rand() / (float)RAND_MAX;
+        b[i].tMins().m_floats[3] = (float)rand() / (float)RAND_MAX;
+        
+        b[i].tMaxs().m_floats[0] = (float)rand() / (float)RAND_MAX;
+        b[i].tMaxs().m_floats[1] = (float)rand() / (float)RAND_MAX;
+        b[i].tMaxs().m_floats[2] = (float)rand() / (float)RAND_MAX;
+        b[i].tMaxs().m_floats[3] = (float)rand() / (float)RAND_MAX;
+        
+        c[i].tMins().m_floats[0] = (float)rand() / (float)RAND_MAX;
+        c[i].tMins().m_floats[1] = (float)rand() / (float)RAND_MAX;
+        c[i].tMins().m_floats[2] = (float)rand() / (float)RAND_MAX;
+        c[i].tMins().m_floats[3] = (float)rand() / (float)RAND_MAX;
+        
+        c[i].tMaxs().m_floats[0] = (float)rand() / (float)RAND_MAX;
+        c[i].tMaxs().m_floats[1] = (float)rand() / (float)RAND_MAX;
+        c[i].tMaxs().m_floats[2] = (float)rand() / (float)RAND_MAX;
+        c[i].tMaxs().m_floats[3] = (float)rand() / (float)RAND_MAX;
+        
+        
+        a_ref[i].tMins().m_floats[0] = a[i].tMins().m_floats[0];
+        a_ref[i].tMins().m_floats[1] = a[i].tMins().m_floats[1];
+        a_ref[i].tMins().m_floats[2] = a[i].tMins().m_floats[2];
+        a_ref[i].tMins().m_floats[3] = a[i].tMins().m_floats[3];
+        
+        a_ref[i].tMaxs().m_floats[0] = a[i].tMaxs().m_floats[0];
+        a_ref[i].tMaxs().m_floats[1] = a[i].tMaxs().m_floats[1];
+        a_ref[i].tMaxs().m_floats[2] = a[i].tMaxs().m_floats[2];
+        a_ref[i].tMaxs().m_floats[3] = a[i].tMaxs().m_floats[3];
+        
+        b_ref[i].tMins().m_floats[0] = b[i].tMins().m_floats[0];
+        b_ref[i].tMins().m_floats[1] = b[i].tMins().m_floats[1];
+        b_ref[i].tMins().m_floats[2] = b[i].tMins().m_floats[2];
+        b_ref[i].tMins().m_floats[3] = b[i].tMins().m_floats[3];
+        
+        b_ref[i].tMaxs().m_floats[0] = b[i].tMaxs().m_floats[0];
+        b_ref[i].tMaxs().m_floats[1] = b[i].tMaxs().m_floats[1];
+        b_ref[i].tMaxs().m_floats[2] = b[i].tMaxs().m_floats[2];
+        b_ref[i].tMaxs().m_floats[3] = b[i].tMaxs().m_floats[3];
+        
+        c_ref[i].tMins().m_floats[0] = c[i].tMins().m_floats[0];
+        c_ref[i].tMins().m_floats[1] = c[i].tMins().m_floats[1];
+        c_ref[i].tMins().m_floats[2] = c[i].tMins().m_floats[2];
+        c_ref[i].tMins().m_floats[3] = c[i].tMins().m_floats[3];
+        
+        c_ref[i].tMaxs().m_floats[0] = c[i].tMaxs().m_floats[0];
+        c_ref[i].tMaxs().m_floats[1] = c[i].tMaxs().m_floats[1];
+        c_ref[i].tMaxs().m_floats[2] = c[i].tMaxs().m_floats[2];
+        c_ref[i].tMaxs().m_floats[3] = c[i].tMaxs().m_floats[3];
+        
+    }
+    
+    
+#if 1
+    for (i = 0; i < DATA_SIZE; i++)
+    {
+        
+        Intersect_Test_Res[i] = Intersect(a[i], b[i]);
+        Intersect_Ref_Res[i]  = Intersect_ref(a_ref[i], b_ref[i]);
+        
+        if(Intersect_Test_Res[i] != Intersect_Ref_Res[i])
+        {
+            printf("Diff on %d\n", i); 
+            
+            printf("a_mx_f[0] = %.3f, a_mx_f[1] = %.3f, a_mx_f[2] = %.3f, a_mx_f[3] = %.3f\n", a[i].tMaxs().m_floats[0], a[i].tMaxs().m_floats[1], a[i].tMaxs().m_floats[2], a[i].tMaxs().m_floats[3]);
+            printf("a_mi_f[0] = %.3f, a_mi_f[1] = %.3f, a_mi_f[2] = %.3f, a_mi_f[3] = %.3f\n", a[i].tMins().m_floats[0], a[i].tMins().m_floats[1], a[i].tMins().m_floats[2], a[i].tMins().m_floats[3]);
+            printf("b_mx_f[0] = %.3f, b_mx_f[1] = %.3f, b_mx_f[2] = %.3f, b_mx_f[3] = %.3f\n", b[i].tMaxs().m_floats[0], b[i].tMaxs().m_floats[1], b[i].tMaxs().m_floats[2], b[i].tMaxs().m_floats[3]);
+            printf("b_mi_f[0] = %.3f, b_mi_f[1] = %.3f, b_mi_f[2] = %.3f, b_mi_f[3] = %.3f\n", b[i].tMins().m_floats[0], b[i].tMins().m_floats[1], b[i].tMins().m_floats[2], b[i].tMins().m_floats[3]);
+            
+            printf("a_mx_f_ref[0] = %.3f, a_mx_f_ref[1] = %.3f, a_mx_f_ref[2] = %.3f, a_mx_f_ref[3] = %.3f\n", a_ref[i].tMaxs().m_floats[0], a_ref[i].tMaxs().m_floats[1], a_ref[i].tMaxs().m_floats[2], a_ref[i].tMaxs().m_floats[3]);
+            printf("a_mi_f_ref[0] = %.3f, a_mi_f_ref[1] = %.3f, a_mi_f_ref[2] = %.3f, a_mi_f_ref[3] = %.3f\n", a_ref[i].tMins().m_floats[0], a_ref[i].tMins().m_floats[1], a_ref[i].tMins().m_floats[2], a_ref[i].tMins().m_floats[3]);
+            printf("b_mx_f_ref[0] = %.3f, b_mx_f_ref[1] = %.3f, b_mx_f_ref[2] = %.3f, b_mx_f_ref[3] = %.3f\n", b_ref[i].tMaxs().m_floats[0], b_ref[i].tMaxs().m_floats[1], b_ref[i].tMaxs().m_floats[2], b_ref[i].tMaxs().m_floats[3]);
+            printf("b_mi_f_ref[0] = %.3f, b_mi_f_ref[1] = %.3f, b_mi_f_ref[2] = %.3f, b_mi_f_ref[3] = %.3f\n", b_ref[i].tMins().m_floats[0], b_ref[i].tMins().m_floats[1], b_ref[i].tMins().m_floats[2], b_ref[i].tMins().m_floats[3]);   
+        }
+    }
+#endif    
+        
+    uint64_t scalarTime;
+    uint64_t vectorTime;
+    size_t j;
+        
+    
+    ////////////////////////////////////
+    //
+    // Time and Test Intersect
+    //
+    ////////////////////////////////////
+	{
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        scalarTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+            startTime = ReadTicks();
+            
+            
+            for (i = 0; i < DATA_SIZE; i++)
+            {
+                Intersect_Ref_Res[i]  = Intersect_ref(a_ref[i], b_ref[i]);
+            }
+            
+			currentTime = ReadTicks() - startTime;
+            scalarTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            scalarTime = bestTime;        
+        else
+            scalarTime /= NUM_CYCLES;
+    }
+    
+    {
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        vectorTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+            startTime = ReadTicks();
+            
+            for (i = 0; i < DATA_SIZE; i++)
+            {
+                Intersect_Test_Res[i] = Intersect(a[i], b[i]);
+            }
+
+			currentTime = ReadTicks() - startTime;
+            vectorTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            vectorTime = bestTime;        
+        else
+            vectorTime /= NUM_CYCLES;
+    }
+    
+    vlog( "Intersect Timing:\n" );
+    vlog( "     \t    scalar\t    vector\n" );
+    vlog( "    \t%10.4f\t%10.4f\n", TicksToCycles( scalarTime ) / LOOPCOUNT, TicksToCycles( vectorTime ) / LOOPCOUNT );
+    
+    //printf("scalar = %llu, vector = %llu\n", scalarTime, vectorTime);
+    
+    for (i = 0; i < DATA_SIZE; i++)
+    {
+        if(Intersect_Test_Res[i] != Intersect_Ref_Res[i])
+        {
+            printf("Intersect fail at %d\n", i);
+			return 1;
+        }
+    }
+    
+    ////////////////////////////////////
+    //
+    // Time and Test Merge
+    //
+    ////////////////////////////////////
+	{
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        scalarTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+            startTime = ReadTicks();
+            
+            
+            for (i = 0; i < DATA_SIZE; i++)
+            {
+                Merge_ref(a_ref[i], b_ref[i], c_ref[i]);
+            }
+            
+			currentTime = ReadTicks() - startTime;
+            scalarTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            scalarTime = bestTime;        
+        else
+            scalarTime /= NUM_CYCLES;
+    }
+    
+    
+    {
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        vectorTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+            startTime = ReadTicks();
+            
+            for (i = 0; i < DATA_SIZE; i++)
+            {
+                Merge(a[i], b[i], c[i]);
+            }
+            
+			currentTime = ReadTicks() - startTime;
+            vectorTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            vectorTime = bestTime;        
+        else
+            vectorTime /= NUM_CYCLES;
+    }
+    
+    vlog( "Merge Timing:\n" );
+    vlog( "     \t    scalar\t    vector\n" );
+    vlog( "    \t%10.4f\t%10.4f\n", TicksToCycles( scalarTime ) / LOOPCOUNT, TicksToCycles( vectorTime ) / LOOPCOUNT );
+    
+    //printf("scalar = %llu, vector = %llu\n", scalarTime, vectorTime);
+    /*
+ c  [0]	float32_t	0.00455523
+    [1]	float32_t	0.559712
+    [2]	float32_t	0.0795838
+    [3]	float32_t	0.10182
+    
+c_ref
+    [0]	float32_t	0.00455523
+    [1]	float32_t	0.559712
+    [2]	float32_t	0.0795838
+    [3]	float32_t	0.552081
+    
+    
+c   [0]	float32_t	0.829904
+    [1]	float32_t	0.692891
+    [2]	float32_t	0.961654
+    [3]	float32_t	0.666956
+    
+ c_ref
+    [0]	float32_t	0.829904
+    [1]	float32_t	0.692891
+    [2]	float32_t	0.961654
+    [3]	float32_t	0.522878
+    */
+    for (i = 0; i < DATA_SIZE; i++)
+    {
+        //ignore 4th component because it is not computed in all code-paths
+        if( (fabs(c[i].tMaxs().m_floats[0] - c_ref[i].tMaxs().m_floats[0]) > 0.001) || 
+           (fabs(c[i].tMaxs().m_floats[1] - c_ref[i].tMaxs().m_floats[1]) > 0.001) || 
+           (fabs(c[i].tMaxs().m_floats[2] - c_ref[i].tMaxs().m_floats[2]) > 0.001) || 
+          // (fabs(c[i].tMaxs().m_floats[3] - c_ref[i].tMaxs().m_floats[3]) > 0.001) || 
+           (fabs(c[i].tMins().m_floats[0] - c_ref[i].tMins().m_floats[0]) > 0.001) || 
+           (fabs(c[i].tMins().m_floats[1] - c_ref[i].tMins().m_floats[1]) > 0.001) || 
+           (fabs(c[i].tMins().m_floats[2] - c_ref[i].tMins().m_floats[2]) > 0.001)  
+          //|| (fabs(c[i].tMins().m_floats[3] - c_ref[i].tMins().m_floats[3]) > 0.001) 
+           )
+           
+        
+        //if((c[i].tMaxs().m_floats[0] != c_ref[i].tMaxs().m_floats[0]) || (c[i].tMaxs().m_floats[1] != c_ref[i].tMaxs().m_floats[1]) || (c[i].tMaxs().m_floats[2] != c_ref[i].tMaxs().m_floats[2]) || (c[i].tMaxs().m_floats[3] != c_ref[i].tMaxs().m_floats[3]) || (c[i].tMins().m_floats[0] != c_ref[i].tMins().m_floats[0]) || (c[i].tMins().m_floats[1] != c_ref[i].tMins().m_floats[1]) || (c[i].tMins().m_floats[2] != c_ref[i].tMins().m_floats[2]) || (c[i].tMins().m_floats[3] != c_ref[i].tMins().m_floats[3]))
+        {
+            printf("Merge fail at %d with test = %d, ref = %d\n", i, Select_Test_Res[i], Select_Ref_Res[i]);
+            
+            printf("a_mx_f[0] = %.3f, a_mx_f[1] = %.3f, a_mx_f[2] = %.3f, a_mx_f[3] = %.3f\n", a[i].tMaxs().m_floats[0], a[i].tMaxs().m_floats[1], a[i].tMaxs().m_floats[2], a[i].tMaxs().m_floats[3]);
+            printf("a_mi_f[0] = %.3f, a_mi_f[1] = %.3f, a_mi_f[2] = %.3f, a_mi_f[3] = %.3f\n", a[i].tMins().m_floats[0], a[i].tMins().m_floats[1], a[i].tMins().m_floats[2], a[i].tMins().m_floats[3]);
+            printf("b_mx_f[0] = %.3f, b_mx_f[1] = %.3f, b_mx_f[2] = %.3f, b_mx_f[3] = %.3f\n", b[i].tMaxs().m_floats[0], b[i].tMaxs().m_floats[1], b[i].tMaxs().m_floats[2], b[i].tMaxs().m_floats[3]);
+            printf("b_mi_f[0] = %.3f, b_mi_f[1] = %.3f, b_mi_f[2] = %.3f, b_mi_f[3] = %.3f\n", b[i].tMins().m_floats[0], b[i].tMins().m_floats[1], b[i].tMins().m_floats[2], b[i].tMins().m_floats[3]);
+            printf("c_mx_f[0] = %.3f, c_mx_f[1] = %.3f, c_mx_f[2] = %.3f, c_mx_f[3] = %.3f\n", c[i].tMaxs().m_floats[0], c[i].tMaxs().m_floats[1], c[i].tMaxs().m_floats[2], c[i].tMaxs().m_floats[3]);
+            printf("c_mi_f[0] = %.3f, c_mi_f[1] = %.3f, c_mi_f[2] = %.3f, c_mi_f[3] = %.3f\n", c[i].tMins().m_floats[0], c[i].tMins().m_floats[1], c[i].tMins().m_floats[2], c[i].tMins().m_floats[3]);
+            
+            printf("a_mx_f_ref[0] = %.3f, a_mx_f_ref[1] = %.3f, a_mx_f_ref[2] = %.3f, a_mx_f_ref[3] = %.3f\n", a_ref[i].tMaxs().m_floats[0], a_ref[i].tMaxs().m_floats[1], a_ref[i].tMaxs().m_floats[2], a_ref[i].tMaxs().m_floats[3]);
+            printf("a_mi_f_ref[0] = %.3f, a_mi_f_ref[1] = %.3f, a_mi_f_ref[2] = %.3f, a_mi_f_ref[3] = %.3f\n", a_ref[i].tMins().m_floats[0], a_ref[i].tMins().m_floats[1], a_ref[i].tMins().m_floats[2], a_ref[i].tMins().m_floats[3]);
+            printf("b_mx_f_ref[0] = %.3f, b_mx_f_ref[1] = %.3f, b_mx_f_ref[2] = %.3f, b_mx_f_ref[3] = %.3f\n", b_ref[i].tMaxs().m_floats[0], b_ref[i].tMaxs().m_floats[1], b_ref[i].tMaxs().m_floats[2], b_ref[i].tMaxs().m_floats[3]);
+            printf("b_mi_f_ref[0] = %.3f, b_mi_f_ref[1] = %.3f, b_mi_f_ref[2] = %.3f, b_mi_f_ref[3] = %.3f\n", b_ref[i].tMins().m_floats[0], b_ref[i].tMins().m_floats[1], b_ref[i].tMins().m_floats[2], b_ref[i].tMins().m_floats[3]);
+            printf("c_mx_f_ref[0] = %.3f, c_mx_f_ref[1] = %.3f, c_mx_f_ref[2] = %.3f, c_mx_f_ref[3] = %.3f\n", c_ref[i].tMaxs().m_floats[0], c_ref[i].tMaxs().m_floats[1], c_ref[i].tMaxs().m_floats[2], c_ref[i].tMaxs().m_floats[3]);
+            printf("c_mi_f_ref[0] = %.3f, c_mi_f_ref[1] = %.3f, c_mi_f_ref[2] = %.3f, c_mi_f_ref[3] = %.3f\n", c_ref[i].tMins().m_floats[0], c_ref[i].tMins().m_floats[1], c_ref[i].tMins().m_floats[2], c_ref[i].tMins().m_floats[3]);
+			return 1;
+
+		}
+        
+    }
+    
+    ////////////////////////////////////
+    //
+    // Time and Test Select
+    //
+    ////////////////////////////////////
+	{
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        scalarTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+            startTime = ReadTicks();
+            
+            
+            for (i = 0; i < DATA_SIZE; i++)
+            {
+                Select_Ref_Res[i]  = Select_ref(a_ref[i], b_ref[i], c_ref[i]);
+            }
+            
+			currentTime = ReadTicks() - startTime;
+            scalarTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            scalarTime = bestTime;        
+        else
+            scalarTime /= NUM_CYCLES;
+    }
+    
+    {
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        vectorTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+            startTime = ReadTicks();
+            
+            for (i = 0; i < DATA_SIZE; i++)
+            {
+                Select_Test_Res[i] = Select(a[i], b[i], c[i]);
+            }
+            
+			currentTime = ReadTicks() - startTime;
+            vectorTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            vectorTime = bestTime;        
+        else
+            vectorTime /= NUM_CYCLES;
+    }
+    
+    vlog( "Select Timing:\n" );
+    vlog( "     \t    scalar\t    vector\n" );
+    vlog( "    \t%10.4f\t%10.4f\n", TicksToCycles( scalarTime ) / LOOPCOUNT, TicksToCycles( vectorTime ) / LOOPCOUNT );
+    
+    //printf("scalar = %llu, vector = %llu\n", scalarTime, vectorTime);
+    
+    for (i = 0; i < DATA_SIZE; i++)
+    {
+        Select_Ref_Res[i]  = Select_ref(a_ref[i], b_ref[i], c_ref[i]);
+        Select_Test_Res[i] = Select(a[i], b[i], c[i]);
+        
+        if(Select_Test_Res[i] != Select_Ref_Res[i])
+        {
+            printf("Select fail at %d with test = %d, ref = %d\n", i, Select_Test_Res[i], Select_Ref_Res[i]);
+            
+            printf("a_mx_f[0] = %.3f, a_mx_f[1] = %.3f, a_mx_f[2] = %.3f, a_mx_f[3] = %.3f\n", a[i].tMaxs().m_floats[0], a[i].tMaxs().m_floats[1], a[i].tMaxs().m_floats[2], a[i].tMaxs().m_floats[3]);
+            printf("a_mi_f[0] = %.3f, a_mi_f[1] = %.3f, a_mi_f[2] = %.3f, a_mi_f[3] = %.3f\n", a[i].tMins().m_floats[0], a[i].tMins().m_floats[1], a[i].tMins().m_floats[2], a[i].tMins().m_floats[3]);
+            printf("b_mx_f[0] = %.3f, b_mx_f[1] = %.3f, b_mx_f[2] = %.3f, b_mx_f[3] = %.3f\n", b[i].tMaxs().m_floats[0], b[i].tMaxs().m_floats[1], b[i].tMaxs().m_floats[2], b[i].tMaxs().m_floats[3]);
+            printf("b_mi_f[0] = %.3f, b_mi_f[1] = %.3f, b_mi_f[2] = %.3f, b_mi_f[3] = %.3f\n", b[i].tMins().m_floats[0], b[i].tMins().m_floats[1], b[i].tMins().m_floats[2], b[i].tMins().m_floats[3]);
+            printf("c_mx_f[0] = %.3f, c_mx_f[1] = %.3f, c_mx_f[2] = %.3f, c_mx_f[3] = %.3f\n", c[i].tMaxs().m_floats[0], c[i].tMaxs().m_floats[1], c[i].tMaxs().m_floats[2], c[i].tMaxs().m_floats[3]);
+            printf("c_mi_f[0] = %.3f, c_mi_f[1] = %.3f, c_mi_f[2] = %.3f, c_mi_f[3] = %.3f\n", c[i].tMins().m_floats[0], c[i].tMins().m_floats[1], c[i].tMins().m_floats[2], c[i].tMins().m_floats[3]);
+            
+            printf("a_mx_f_ref[0] = %.3f, a_mx_f_ref[1] = %.3f, a_mx_f_ref[2] = %.3f, a_mx_f_ref[3] = %.3f\n", a_ref[i].tMaxs().m_floats[0], a_ref[i].tMaxs().m_floats[1], a_ref[i].tMaxs().m_floats[2], a_ref[i].tMaxs().m_floats[3]);
+            printf("a_mi_f_ref[0] = %.3f, a_mi_f_ref[1] = %.3f, a_mi_f_ref[2] = %.3f, a_mi_f_ref[3] = %.3f\n", a_ref[i].tMins().m_floats[0], a_ref[i].tMins().m_floats[1], a_ref[i].tMins().m_floats[2], a_ref[i].tMins().m_floats[3]);
+            printf("b_mx_f_ref[0] = %.3f, b_mx_f_ref[1] = %.3f, b_mx_f_ref[2] = %.3f, b_mx_f_ref[3] = %.3f\n", b_ref[i].tMaxs().m_floats[0], b_ref[i].tMaxs().m_floats[1], b_ref[i].tMaxs().m_floats[2], b_ref[i].tMaxs().m_floats[3]);
+            printf("b_mi_f_ref[0] = %.3f, b_mi_f_ref[1] = %.3f, b_mi_f_ref[2] = %.3f, b_mi_f_ref[3] = %.3f\n", b_ref[i].tMins().m_floats[0], b_ref[i].tMins().m_floats[1], b_ref[i].tMins().m_floats[2], b_ref[i].tMins().m_floats[3]);
+            printf("c_mx_f_ref[0] = %.3f, c_mx_f_ref[1] = %.3f, c_mx_f_ref[2] = %.3f, c_mx_f_ref[3] = %.3f\n", c_ref[i].tMaxs().m_floats[0], c_ref[i].tMaxs().m_floats[1], c_ref[i].tMaxs().m_floats[2], c_ref[i].tMaxs().m_floats[3]);
+            printf("c_mi_f_ref[0] = %.3f, c_mi_f_ref[1] = %.3f, c_mi_f_ref[2] = %.3f, c_mi_f_ref[3] = %.3f\n", c_ref[i].tMins().m_floats[0], c_ref[i].tMins().m_floats[1], c_ref[i].tMins().m_floats[2], c_ref[i].tMins().m_floats[3]);
+			return 1;
+		}
+        
+    }
+    
+    return 0;
+}
+#endif
+
+
+
+
--- a/test/Bullet2/Source/Tests/Test_btDbvt.h
+++ b/test/Bullet2/Source/Tests/Test_btDbvt.h
@@ -0,0 +1,21 @@
+//
+//  Test_btDbvt.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc., Inc.
+//
+
+#ifndef BulletTest_Test_btDbvt_h
+#define BulletTest_Test_btDbvt_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+    
+    // Entry point of the btDbvt test, callable from C.
+    // NOTE(review): presumably returns 0 on success and non-zero on failure like the
+    // other Test_* entry points - confirm against Test_btDbvt.cpp.
+    int Test_btDbvt(void);
+    
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/test/Bullet2/Source/Tests/Test_dot3.cpp
+++ b/test/Bullet2/Source/Tests/Test_dot3.cpp
@@ -0,0 +1,153 @@
+//
+//  Test_dot3.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+
+#include "Test_dot3.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btVector3.h>
+
+// Reference implementation used to validate btVector3::dot3(): builds the
+// result from three independent dot() calls, so that the x, y and z components
+// are v.v1, v.v2 and v.v3 respectively.
+static btVector3 dot3_ref( const btVector3 &v, const btVector3 &v1, const btVector3 &v2, const btVector3 &v3);
+
+static btVector3 dot3_ref( const btVector3 &v, const btVector3 &v1, const btVector3 &v2, const btVector3 &v3)
+{
+    const btScalar dotWithV1 = v.dot(v1);
+    const btScalar dotWithV2 = v.dot(v2);
+    const btScalar dotWithV3 = v.dot(v3);
+
+    return btVector3( dotWithV1, dotWithV2, dotWithV3 );
+}
+
+/*
+SIMD_FORCE_INLINE int operator!=(const btVector3 &s, const btVector3 &v)
+{
+#ifdef __SSE__
+    __m128 test = _mm_cmpneq_ps( s.mVec128, v.mVec128 );
+    return (_mm_movemask_ps( test ) & 7) != 0;
+#elif defined __ARM_NEON_H
+    uint32x4_t test = vandq_u32( vceqq_f32( s.mVec128, v.mVec128 ), (uint32x4_t){-1,-1,-1,0});
+    uint32x2_t t = vpadd_u32( vget_low_u32(test), vget_high_u32(test));
+    t = vpadd_u32(t, t);
+    return -3 != (int32_t) vget_lane_u32(t, 0);
+#else
+    return  s.m_floats[0] != v.m_floats[0] ||
+    s.m_floats[1] != v.m_floats[1] ||
+    s.m_floats[2] != v.m_floats[2];
+#endif
+}
+*/
+ 
+
+
+// LOOPCOUNT:  upper bound on dot3 evaluations in one timed pass; also the divisor
+//             used when the per-call cost is reported.
+// NUM_CYCLES: number of timed passes from which the best or average time is taken.
+#define LOOPCOUNT 1000
+#define NUM_CYCLES 10000
+
+// Correctness and timing test for btVector3::dot3().
+//
+// DATA_SIZE random vector quadruples are generated and, for each of them, the
+// result of dot3() is compared with dot3_ref(). Both implementations are then
+// timed and the per-call cost is logged through vlog().
+//
+// Returns 0 on success and 1 on the first mismatch.
+int Test_dot3(void)
+{
+#define DATA_SIZE 1024
+    
+	btVector3 vec3_arr[DATA_SIZE];
+	btVector3 vec3_arr1[DATA_SIZE];
+	btVector3 vec3_arr2[DATA_SIZE];
+	btVector3 vec3_arr3[DATA_SIZE];
+    btVector3 res_arr[DATA_SIZE];
+    
+    uint64_t scalarTime;
+    uint64_t vectorTime;
+    size_t j, k;
+    btVector3 correct, test;
+    
+    // Generate the inputs (fourth component set to BT_NAN) and validate each
+    // quadruple as soon as it has been generated.
+	for( k = 0; k < DATA_SIZE; k++ )
+	{
+        
+        vec3_arr[k]  = btVector3( btAssign128( RANDF, RANDF, RANDF, BT_NAN));
+        vec3_arr1[k] = btVector3( btAssign128( RANDF, RANDF, RANDF, BT_NAN));
+        vec3_arr2[k] = btVector3( btAssign128( RANDF, RANDF, RANDF, BT_NAN ));
+        vec3_arr3[k] = btVector3( btAssign128( RANDF, RANDF, RANDF, BT_NAN));
+
+		correct = dot3_ref(vec3_arr[k], vec3_arr1[k], vec3_arr2[k], vec3_arr3[k]);
+		test = vec3_arr[k].dot3( vec3_arr1[k], vec3_arr2[k], vec3_arr3[k]);
+        
+		if( correct != test )
+		{
+            // k is a size_t; cast it so the argument matches the %ld conversion.
+			vlog( "Error (%ld) - dot3 result error! *{%a, %a, %a, %a} != {%a, %a, %a, %a} \n", (long) k,
+                   correct.x(), correct.y(), correct.z(), correct.w(),
+                   test.x(), test.y(), test.z(), test.w() );
+            
+			return 1;
+		}
+    }
+    
+    
+    // Time the reference implementation.
+	{
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;    // largest uint64_t value, so the first sample always wins
+        scalarTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+            startTime = ReadTicks();
+            for( k = 0; k+4 <= LOOPCOUNT; k+=4 )
+			{
+                // Wrap the index into the arrays; the mask relies on DATA_SIZE being a power of two.
+				size_t k32 = (k & (DATA_SIZE-1)); 
+                res_arr[k32] = dot3_ref( vec3_arr[k32], vec3_arr1[k32], vec3_arr2[k32], vec3_arr3[k32]); k32++;
+				res_arr[k32] = dot3_ref( vec3_arr[k32], vec3_arr1[k32], vec3_arr2[k32], vec3_arr3[k32]); k32++;
+				res_arr[k32] = dot3_ref( vec3_arr[k32], vec3_arr1[k32], vec3_arr2[k32], vec3_arr3[k32]); k32++;
+				res_arr[k32] = dot3_ref( vec3_arr[k32], vec3_arr1[k32], vec3_arr2[k32], vec3_arr3[k32]); 
+			}
+			currentTime = ReadTicks() - startTime;
+            scalarTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        // Report either the best pass or the average over all passes.
+        if( 0 == gReportAverageTimes )
+            scalarTime = bestTime;        
+        else
+            scalarTime /= NUM_CYCLES;
+    }
+    
+    // Time btVector3::dot3() with the same access pattern.
+    {
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;    // largest uint64_t value, so the first sample always wins
+        vectorTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+            startTime = ReadTicks();
+            for( k = 0; k+4 <= LOOPCOUNT; k+=4 )
+			{
+				size_t k32 = (k & (DATA_SIZE-1)); 
+                res_arr[k32] = vec3_arr[k32].dot3( vec3_arr1[k32], vec3_arr2[k32], vec3_arr3[k32]); k32++;
+				res_arr[k32] = vec3_arr[k32].dot3( vec3_arr1[k32], vec3_arr2[k32], vec3_arr3[k32]); k32++;
+				res_arr[k32] = vec3_arr[k32].dot3( vec3_arr1[k32], vec3_arr2[k32], vec3_arr3[k32]); k32++;
+				res_arr[k32] = vec3_arr[k32].dot3( vec3_arr1[k32], vec3_arr2[k32], vec3_arr3[k32]); 
+			}
+			currentTime = ReadTicks() - startTime;
+            vectorTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            vectorTime = bestTime;        
+        else
+            vectorTime /= NUM_CYCLES;
+    }
+    
+    vlog( "Timing:\n" );
+    vlog( "     \t    scalar\t    vector\n" );
+    vlog( "    \t%10.4f\t%10.4f\n", TicksToCycles( scalarTime ) / LOOPCOUNT, TicksToCycles( vectorTime ) / LOOPCOUNT );
+    
+    return 0;
+}
+
+#endif // BT_USE_SSE_IN_API || BT_USE_NEON
--- a/test/Bullet2/Source/Tests/Test_dot3.h
+++ b/test/Bullet2/Source/Tests/Test_dot3.h
@@ -0,0 +1,22 @@
+//
+//  Test_dot3.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_dot3_h
+#define BulletTest_Test_dot3_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+    
+    // Runs the btVector3::dot3 correctness and timing test.
+    // Returns 0 on success and 1 when a result differs from the reference.
+    int Test_dot3(void);
+    
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif
--- a/test/Bullet2/Source/Tests/Test_maxdot.cpp
+++ b/test/Bullet2/Source/Tests/Test_maxdot.cpp
@@ -0,0 +1,281 @@
+//
+//  Test_maxdot.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+
+#include "Test_maxdot.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btVector3.h>
+
+
+// Reference implementation that btVector3::maxDot() is checked against.
+// Returns the index of the vertex with the largest dot product with vec and
+// writes that dot product to *dotResult (defined at the end of this file).
+static long maxdot_ref(    const btSimdFloat4 *vertices, 
+                float *vec,
+                size_t count, 
+                float *dotResult );
+
+
+
+
+
+// MAX_SIZE (2^MAX_LOG2_SIZE) is the number of btSimdFloat4 elements in the test
+// buffer; a smaller buffer is used when __arm__ is defined.
+// LOOPCOUNT is the number of back-to-back calls in each timed sample.
+#ifdef __arm__
+    #define MAX_LOG2_SIZE   9
+#else
+    #define MAX_LOG2_SIZE   10
+#endif
+#define MAX_SIZE        (1U << MAX_LOG2_SIZE)
+#define LOOPCOUNT 10
+
+// Correctness and timing test for btVector3::maxDot().
+//
+// Correctness: for every array length from 1 to MAX_SIZE and every position in
+// that array, one element is raised by LARGE_FLOAT17; maxdot_ref() and maxDot()
+// must then both report that position, with dot products that agree to within a
+// relative error of 1e-6.
+// Timing: both implementations are timed for sizes 1..32 and for the powers of
+// two from 64 to MAX_SIZE, and the results are logged as a table.
+//
+// Returns 0 on success, and 1 on the first hard failure or when the test buffer
+// cannot be allocated. The buffer is released on every exit path.
+int Test_maxdot(void)
+{
+    // Init an array flanked by guard pages
+    btSimdFloat4 *data = (btSimdFloat4*) GuardCalloc( 1, MAX_SIZE * sizeof(btSimdFloat4), NULL );
+    if( NULL == data )
+    {
+        vlog( "Error: could not allocate the test buffer\n" );
+        return 1;
+    }
+    
+    float *fp = (float*) data;
+    long correct, test;
+    btVector3 localScaling( 0.1f, 0.2f, 0.3f);
+    size_t size;
+    
+    // Init the data
+    size_t i;
+    for( i = 0; i < MAX_SIZE; i++ )
+    {
+        fp[4*i] = (int32_t) RANDF_16;
+        fp[4*i+1] = (int32_t) RANDF_16;
+        fp[4*i+2] = (int32_t) RANDF_16;
+        fp[4*i+3] = BT_NAN;     // w channel NaN
+    }
+    
+    float correctDot, testDot;
+    // From here on fp refers to the components of localScaling; it is only used
+    // to recompute dot products for the diagnostics below.
+    fp = (float*) localScaling;
+	float maxRelativeError = 0.f;
+	
+    for( size = 1; size <= MAX_SIZE; size++ )
+    {
+        // The tested range always ends exactly at the end of the buffer.
+        float *in = (float*)(data + MAX_SIZE - size);
+        size_t position;
+        
+        for( position = 0; position < size; position++ )
+        {
+            // Temporarily turn the element at 'position' into the expected winner.
+            float *biggest = in + position * 4;
+            float old[4] = { biggest[0], biggest[1], biggest[2], biggest[3] };
+            biggest[0] += LARGE_FLOAT17;
+            biggest[1] += LARGE_FLOAT17;
+            biggest[2] += LARGE_FLOAT17;
+            biggest[3] += LARGE_FLOAT17;
+            
+            correctDot = BT_NAN;
+            testDot = BT_NAN;
+            correct = maxdot_ref( (btSimdFloat4*) in, (float*) &localScaling, size, &correctDot);
+            test = localScaling.maxDot( (btVector3*) in, size, testDot);
+            if( test < 0 || (size_t) test >= size )
+            {
+                vlog( "Error @ %ld: index out of bounds! *%ld vs %ld \n", (long) size, correct, test);
+                // Undo the perturbation before skipping ahead; otherwise the modified
+                // element would remain in the buffer for all later iterations.
+                // NOTE(review): this case is logged but does not fail the test - confirm that is intended.
+                memcpy( biggest, old, sizeof(old) );
+                continue;
+            }
+            if( correct != test )
+			{
+                vlog( "Error @ %ld: index misreported! *%ld vs %ld  (*%f, %f)\n", (long) size, correct, test, 
+                       fp[0] * in[4*correct] + fp[1] * in[4*correct+1]  + fp[2] * in[4*correct+2], 
+                       fp[0] * in[4*test] + fp[1] * in[4*test+1]  + fp[2] * in[4*test+2] );
+                GuardFree(data);
+				return 1;
+			}
+            if( (size_t) test != position )
+			{
+                vlog( "Biggest not found where it is supposed to be: *%ld vs %ld (*%f, %f)\n", (long) position, test, 
+                       fp[0] * in[4*test] + fp[1] * in[4*test+1]  + fp[2] * in[4*test+2],
+                       fp[0] * in[4*position] + fp[1] * in[4*position+1]  + fp[2] * in[4*position+2]  );
+                GuardFree(data);
+				return 1;
+			}
+
+            if( correctDot != testDot )
+			{
+                // Small differences are tolerated and only tracked; larger ones fail the test.
+				float relativeError = btFabs((testDot - correctDot) / correctDot);
+				if (relativeError>1e-6)
+				{
+                    vlog( "Error @ %ld: dotpr misreported! *%f vs %f    (*%f, %f)\n", (long) size, correctDot, testDot, 
+                           fp[0] * in[4*correct] + fp[1] * in[4*correct+1]  + fp[2] * in[4*correct+2], 
+                           fp[0] * in[4*test] + fp[1] * in[4*test+1]  + fp[2] * in[4*test+2]  );
+                    GuardFree(data);
+                    return 1;
+				} else
+				{
+					if (maxRelativeError < relativeError)
+					{
+						maxRelativeError = relativeError;
+#ifdef VERBOSE_WARNING
+                        // NOTE(review): errStr is not declared in this function; presumably it is
+                        // provided elsewhere - confirm before enabling VERBOSE_WARNING.
+						sprintf(errStr,"Warning @ %ld: dotpr misreported! *%f vs %f    (*%f, %f)\n", size, correctDot, testDot, 
+						   fp[0] * in[4*correct] + fp[1] * in[4*correct+1]  + fp[2] * in[4*correct+2], 
+						   fp[0] * in[4*test] + fp[1] * in[4*test+1]  + fp[2] * in[4*test+2]);
+#endif //VERBOSE_WARNING
+					}
+				}
+			}
+            
+            // Restore the original element for the next position.
+            memcpy( biggest, old, sizeof(old) );
+        }
+    }
+    
+	
+	if (maxRelativeError)
+	{
+		printf("Warning: relative error = %e\n", maxRelativeError);
+#ifdef VERBOSE_WARNING
+		vlog(errStr);
+#endif
+	}
+
+    // Slots 1..32 hold the timings for sizes 1..32 (slot 0 is unused); slots 33
+    // and up hold the timings for the power-of-two sizes 64..MAX_SIZE.
+    uint64_t scalarTimes[33 + (MAX_LOG2_SIZE-5)];
+    uint64_t vectorTimes[33 + (MAX_LOG2_SIZE-5)];
+    size_t j, k;
+    float *in = (float*) data;
+    
+    // Reference implementation, sizes 1..32.
+    for( size = 1; size <= 32; size++ )
+    {
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;    // largest uint64_t value, so the first sample always wins
+        scalarTimes[size] = 0;
+        for (j = 0; j < 100; j++) {
+            startTime = ReadTicks();
+            for( k = 0; k < LOOPCOUNT; k++ )
+                correct += maxdot_ref( (btSimdFloat4*) in, (float*) &localScaling, size, &correctDot);
+            currentTime = ReadTicks() - startTime;
+            scalarTimes[size] += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            scalarTimes[size] = bestTime;        
+        else
+            scalarTimes[size] /= 100;
+    }
+    
+    // Reference implementation, power-of-two sizes.
+    uint64_t *timep = &scalarTimes[33];
+    for( size = 64; size <= MAX_SIZE; size *= 2 )
+    {
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        timep[0] =0;
+        for (j = 0; j < 100; j++) {
+            startTime = ReadTicks();
+            for( k = 0; k < LOOPCOUNT; k++ )
+                correct += maxdot_ref( (btSimdFloat4*) in, (float*) &localScaling, size, &correctDot);
+            currentTime = ReadTicks() - startTime;
+            timep[0] += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            timep[0] = bestTime;        
+        else
+            timep[0] /= 100;
+
+        timep++;
+    }
+
+    // btVector3::maxDot(), sizes 1..32.
+    for( size = 1; size <= 32; size++ )
+    {
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        vectorTimes[size] = 0;
+        for (j = 0; j < 100; j++) {
+            startTime = ReadTicks();
+            for( k = 0; k < LOOPCOUNT; k++ )
+                test += localScaling.maxDot( (btVector3*) in, size, testDot);
+            currentTime = ReadTicks() - startTime;
+            vectorTimes[size] += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            vectorTimes[size] = bestTime;        
+        else
+            vectorTimes[size] /= 100;
+    }
+    
+    // btVector3::maxDot(), power-of-two sizes.
+    timep = &vectorTimes[33];
+    for( size = 64; size <= MAX_SIZE; size *= 2 )
+    {
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        timep[0] =0;
+        for (j = 0; j < 100; j++) {
+            startTime = ReadTicks();
+            for( k = 0; k < LOOPCOUNT; k++ )
+                test += localScaling.maxDot( (btVector3*) in, size, testDot);
+            currentTime = ReadTicks() - startTime;
+            timep[0] += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            timep[0] = bestTime;        
+        else
+            timep[0] /= 100;
+        
+        timep++;
+    }
+    
+    vlog( "Timing:\n" );
+    vlog( " size\t    scalar\t    vector\n" );
+    for( size = 1; size <= 32; size++ )
+        vlog( "%5lu\t%10.2f\t%10.2f\n", (unsigned long) size, TicksToCycles( scalarTimes[size] ) / LOOPCOUNT, TicksToCycles( vectorTimes[size] ) / LOOPCOUNT );
+    size_t index = 33;
+    for( size = 64; size <= MAX_SIZE; size *= 2 )
+    {
+        vlog( "%5lu\t%10.2f\t%10.2f\n", (unsigned long) size, TicksToCycles( scalarTimes[index] ) / LOOPCOUNT, TicksToCycles( vectorTimes[index] ) / LOOPCOUNT );
+        index++;
+    }
+    
+    // Useless check to make sure that the timing loops are not optimized away
+    if( test != correct )
+        vlog( "Error: Test != correct: *%ld vs. %ld\n", correct, test);
+    
+    GuardFree(data);
+    
+    return 0;
+}
+
+
+// Reference search for the vertex with the largest dot product with vec.
+//
+// vertices:  array of count elements, read with a stride of four floats; only
+//            the first three floats of each element take part in the dot product.
+// vec:       at least three floats.
+// dotResult: receives the largest dot product found, or -BT_INFINITY if no
+//            element compared greater than that.
+//
+// Returns the index of the first element that attains the maximum, or -1 when
+// no element was selected.
+static long maxdot_ref(    const btSimdFloat4 *vertices, 
+                float *vec,
+                size_t count, 
+                float *dotResult )
+{
+    const float *components = (const float*) vertices;
+    float bestDot = -BT_INFINITY;
+    long bestIndex = -1;
+    
+    for( long n = 0; n < count; n++ )
+    {
+        const float *vertex = components + 4 * n;
+        const float candidate = vec[0] * vertex[0] + vec[1] * vertex[1] + vec[2] * vertex[2];
+        
+        // Strictly greater, so ties keep the earliest index.
+        if( candidate > bestDot )
+        {
+            bestDot = candidate;
+            bestIndex = n;
+        }
+    }
+    
+    *dotResult = bestDot;
+    
+    return bestIndex;
+}
+
+#endif // BT_USE_SSE_IN_API || BT_USE_NEON
--- a/test/Bullet2/Source/Tests/Test_maxdot.h
+++ b/test/Bullet2/Source/Tests/Test_maxdot.h
@@ -0,0 +1,22 @@
+//
+//  Test_maxdot.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_maxdot_h
+#define BulletTest_Test_maxdot_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+
+// Runs the btVector3::maxDot correctness and timing test.
+// Returns 0 on success and 1 on failure.
+int Test_maxdot(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+    
+#endif
--- a/test/Bullet2/Source/Tests/Test_mindot.cpp
+++ b/test/Bullet2/Source/Tests/Test_mindot.cpp
@@ -0,0 +1,269 @@
+//
+//  Test_mindot.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+
+#include "Test_mindot.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btVector3.h>
+
+
+// Reference implementation that btVector3::minDot() is checked against.
+// Returns the index of the vertex with the smallest dot product with vec and
+// writes that dot product to *dotResult (defined at the end of this file).
+static long mindot_ref(    const btSimdFloat4 *vertices, 
+                       float *vec,
+                       size_t count, 
+                       float *dotResult );
+
+// MAX_SIZE (2^MAX_LOG2_SIZE) is the number of btSimdFloat4 elements in the test buffer.
+// NOTE(review): both branches of the __arm__ check define the same value, so the
+// conditional currently has no effect.
+// LOOPCOUNT is the number of back-to-back calls in each timed sample.
+#ifdef __arm__
+    #define MAX_LOG2_SIZE   9
+#else
+    #define MAX_LOG2_SIZE   9
+#endif
+#define MAX_SIZE        (1U << MAX_LOG2_SIZE)
+#define LOOPCOUNT 100
+
+// Correctness and timing test for btVector3::minDot().
+//
+// Correctness: for every array length from 1 to MAX_SIZE and every position in
+// that array, one element is lowered by LARGE_FLOAT17; mindot_ref() and minDot()
+// must then both report that position, with dot products that agree to within a
+// relative error of 1e-6.
+// Timing: both implementations are timed for sizes 1..32 and for the powers of
+// two from 64 to MAX_SIZE, and the results are logged as a table.
+//
+// Returns 0 on success, and 1 on the first hard failure or when the test buffer
+// cannot be allocated. The buffer is released on every exit path.
+int Test_mindot(void)
+{
+    // Init an array flanked by guard pages
+    btSimdFloat4 *data = (btSimdFloat4*) GuardCalloc( 1, MAX_SIZE * sizeof(btSimdFloat4), NULL );
+    if( NULL == data )
+    {
+        vlog( "Error: could not allocate the test buffer\n" );
+        return 1;
+    }
+    
+    float *fp = (float*) data;
+    long correct, test;
+    btVector3 localScaling( 0.1f, 0.2f, 0.3f);
+    size_t size;
+    
+    // Init the data
+    size_t i;
+    for( i = 0; i < MAX_SIZE; i++ )
+    {
+        fp[4*i] = (int32_t) RANDF_16;
+        fp[4*i+1] = (int32_t) RANDF_16;
+        fp[4*i+2] = (int32_t) RANDF_16;
+        fp[4*i+3] = BT_NAN;     // w channel NaN
+    }
+    
+    float correctDot, testDot;
+    // From here on fp refers to the components of localScaling; it is only used
+    // to recompute dot products for the diagnostics below.
+    fp = (float*) localScaling;
+	float maxRelativeError = 0.f;
+
+    for( size = 1; size <= MAX_SIZE; size++ )
+    {
+        // The tested range always ends exactly at the end of the buffer.
+        float *in = (float*)(data + MAX_SIZE - size);
+        size_t position;
+        
+        for( position = 0; position < size; position++ )
+        {
+            // Temporarily turn the element at 'position' into the expected winner.
+            float *smallest = in + position * 4;
+            float old[4] = { smallest[0], smallest[1], smallest[2], smallest[3] };
+            smallest[0] -= LARGE_FLOAT17;
+            smallest[1] -= LARGE_FLOAT17;
+            smallest[2] -= LARGE_FLOAT17;
+            smallest[3] -= LARGE_FLOAT17;
+            
+            correctDot = BT_NAN;
+            testDot = BT_NAN;
+            correct = mindot_ref( (btSimdFloat4*) in, (float*) &localScaling, size, &correctDot);
+            test = localScaling.minDot( (btVector3*) in, size, testDot);
+            if( test < 0 || (size_t) test >= size )
+            {
+                vlog( "Error @ %ld: index out of bounds! *%ld vs %ld \n", (long) size, correct, test);
+                // Undo the perturbation before skipping ahead; otherwise the modified
+                // element would remain in the buffer for all later iterations.
+                // NOTE(review): this case is logged but does not fail the test - confirm that is intended.
+                memcpy( smallest, old, sizeof(old) );
+                continue;
+            }
+            if( correct != test )
+			{
+                vlog( "Error @ %ld: index misreported! *%ld vs %ld  (*%f, %f)\n", (long) size, correct, test, 
+                       fp[0] * in[4*correct] + fp[1] * in[4*correct+1]  + fp[2] * in[4*correct+2], 
+                       fp[0] * in[4*test] + fp[1] * in[4*test+1]  + fp[2] * in[4*test+2] );
+                GuardFree(data);
+				return 1;
+			}
+            if( (size_t) test != position )
+			{
+                vlog( "Biggest not found where it is supposed to be: *%ld vs %ld (*%f, %f)\n", (long) position, test, 
+                       fp[0] * in[4*test] + fp[1] * in[4*test+1]  + fp[2] * in[4*test+2],
+                       fp[0] * in[4*position] + fp[1] * in[4*position+1]  + fp[2] * in[4*position+2]  );
+                GuardFree(data);
+				return 1;
+			}
+
+            if( correctDot != testDot )
+			{
+                // Small differences are tolerated and only tracked; larger ones fail the
+                // test. The tolerance matches the one used by the maxDot test (1e-6); the
+                // previous value of 1e6 could practically never be exceeded.
+				float relativeError = btFabs((testDot - correctDot) / correctDot);
+				if (relativeError>1e-6)
+				{
+					vlog( "Error @ %ld: dotpr misreported! *%f vs %f    (*%f, %f)\n", (long) size, correctDot, testDot, 
+						   fp[0] * in[4*correct] + fp[1] * in[4*correct+1]  + fp[2] * in[4*correct+2], 
+						   fp[0] * in[4*test] + fp[1] * in[4*test+1]  + fp[2] * in[4*test+2]  );
+					GuardFree(data);
+					return 1;
+				} else
+				{
+					if (maxRelativeError < relativeError)
+					{
+						maxRelativeError = relativeError;
+					}
+				}
+			}
+
+            
+            // Restore the original element for the next position.
+            memcpy( smallest, old, sizeof(old) );
+        }
+    }
+    
+	if (maxRelativeError)
+	{
+		printf("Warning: relative error = %e\n", maxRelativeError);
+	}
+    
+    // Slots 1..32 hold the timings for sizes 1..32 (slot 0 is unused); slots 33
+    // and up hold the timings for the power-of-two sizes 64..MAX_SIZE.
+    uint64_t scalarTimes[33 + (MAX_LOG2_SIZE-5)];
+    uint64_t vectorTimes[33 + (MAX_LOG2_SIZE-5)];
+    size_t j, k;
+    float *in = (float*) data;
+    
+    // Reference implementation, sizes 1..32.
+    for( size = 1; size <= 32; size++ )
+    {
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;    // largest uint64_t value, so the first sample always wins
+        scalarTimes[size] = 0;
+        for (j = 0; j < 100; j++) {
+            startTime = ReadTicks();
+            for( k = 0; k < LOOPCOUNT; k++ )
+                correct += mindot_ref( (btSimdFloat4*) in, (float*) &localScaling, size, &correctDot);
+            currentTime = ReadTicks() - startTime;
+            scalarTimes[size] += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            scalarTimes[size] = bestTime;        
+        else
+            scalarTimes[size] /= 100;
+    }
+    
+    // Reference implementation, power-of-two sizes.
+    uint64_t *timep = &scalarTimes[33];
+    for( size = 64; size <= MAX_SIZE; size *= 2 )
+    {
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        timep[0] =0;
+        for (j = 0; j < 100; j++) {
+            startTime = ReadTicks();
+            for( k = 0; k < LOOPCOUNT; k++ )
+                correct += mindot_ref( (btSimdFloat4*) in, (float*) &localScaling, size, &correctDot);
+            currentTime = ReadTicks() - startTime;
+            timep[0] += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            timep[0] = bestTime;        
+        else
+            timep[0] /= 100;
+        
+        timep++;
+    }
+    
+    // btVector3::minDot(), sizes 1..32.
+    for( size = 1; size <= 32; size++ )
+    {
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        vectorTimes[size] = 0;
+        for (j = 0; j < 100; j++) {
+            startTime = ReadTicks();
+            for( k = 0; k < LOOPCOUNT; k++ )
+                test += localScaling.minDot( (btVector3*) in, size, testDot);
+            currentTime = ReadTicks() - startTime;
+            vectorTimes[size] += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            vectorTimes[size] = bestTime;        
+        else
+            vectorTimes[size] /= 100;
+    }
+    
+    // btVector3::minDot(), power-of-two sizes.
+    timep = &vectorTimes[33];
+    for( size = 64; size <= MAX_SIZE; size *= 2 )
+    {
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        timep[0] =0;
+        for (j = 0; j < 100; j++) {
+            startTime = ReadTicks();
+            for( k = 0; k < LOOPCOUNT; k++ )
+                test += localScaling.minDot( (btVector3*) in, size, testDot);
+            currentTime = ReadTicks() - startTime;
+            timep[0] += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            timep[0] = bestTime;        
+        else
+            timep[0] /= 100;
+        
+        timep++;
+    }
+    
+    vlog( "Timing:\n" );
+    vlog( " size\t    scalar\t    vector\n" );
+    for( size = 1; size <= 32; size++ )
+        vlog( "%5lu\t%10.2f\t%10.2f\n", (unsigned long) size, TicksToCycles( scalarTimes[size] ) / LOOPCOUNT, TicksToCycles( vectorTimes[size] ) / LOOPCOUNT );
+    size_t index = 33;
+    for( size = 64; size <= MAX_SIZE; size *= 2 )
+    {
+        vlog( "%5lu\t%10.2f\t%10.2f\n", (unsigned long) size, TicksToCycles( scalarTimes[index] ) / LOOPCOUNT, TicksToCycles( vectorTimes[index] ) / LOOPCOUNT );
+        index++;
+    }
+    
+    // Useless check to make sure that the timing loops are not optimized away
+    if( test != correct )
+        vlog( "Error: Test != correct: *%ld vs. %ld\n", correct, test);
+    
+    GuardFree(data);
+    
+    return 0;
+}
+
+
+
+// Reference search for the vertex with the smallest dot product with vec.
+//
+// vertices:  array of count elements, read with a stride of four floats; only
+//            the first three floats of each element take part in the dot product.
+// vec:       at least three floats.
+// dotResult: receives the smallest dot product found, or BT_INFINITY if no
+//            element compared less than that.
+//
+// Returns the index of the first element that attains the minimum, or -1 when
+// no element was selected.
+static long mindot_ref(    const btSimdFloat4 *vertices, 
+                       float *vec,
+                       size_t count, 
+                       float *dotResult )
+{
+    const float *components = (const float*) vertices;
+    float bestDot = BT_INFINITY;
+    long bestIndex = -1;
+    
+    for( long n = 0; n < count; n++ )
+    {
+        const float *vertex = components + 4 * n;
+        const float candidate = vec[0] * vertex[0] + vec[1] * vertex[1] + vec[2] * vertex[2];
+        
+        // Strictly less, so ties keep the earliest index.
+        if( candidate < bestDot )
+        {
+            bestDot = candidate;
+            bestIndex = n;
+        }
+    }
+    
+    *dotResult = bestDot;
+    
+    return bestIndex;
+}
+
+#endif // BT_USE_SSE_IN_API || BT_USE_NEON
--- a/test/Bullet2/Source/Tests/Test_mindot.h
+++ b/test/Bullet2/Source/Tests/Test_mindot.h
@@ -0,0 +1,22 @@
+//
+//  Test_mindot.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_mindot_h
+#define BulletTest_Test_mindot_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+    
+    // Runs the btVector3::minDot correctness and timing test.
+    // Returns 0 on success and 1 on failure.
+    int Test_mindot(void);
+    
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif
--- a/test/Bullet2/Source/Tests/Test_qtdot.cpp
+++ b/test/Bullet2/Source/Tests/Test_qtdot.cpp
@@ -0,0 +1,162 @@
+//
+//  Test_qtdot.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+#include "Test_qtdot.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btQuaternion.h>
+
+#define BT_OP(a, b)     (a.dot(b))
+// Reference four-component dot product of two quaternions, written out
+// component by component; btQuaternion::dot() is validated against it.
+static inline btScalar qtdot_ref(btQuaternion& q1, btQuaternion& q2);
+
+static inline btScalar qtdot_ref(btQuaternion& q1, btQuaternion& q2)
+{
+    const btScalar &ax = q1.x(), &ay = q1.y(), &az = q1.z(), &aw = q1.w();
+    const btScalar &bx = q2.x(), &by = q2.y(), &bz = q2.z(), &bw = q2.w();
+
+    // Kept as a single expression, summed in x, y, z, w order.
+    return ax * bx + ay * by + az * bz + aw * bw;
+}
+
+#define LOOPCOUNT 1024
+#define NUM_CYCLES 1000
+
+// Correctness and timing test for btQuaternion::dot(), invoked through BT_OP.
+//
+// One random quaternion pair is checked against qtdot_ref() with a tolerance of
+// 4 * FLT_EPSILON. Both implementations are then timed over DATA_SIZE random
+// pairs and the per-call cost is logged through vlog().
+//
+// Returns 0 on success and 1 when the correctness check fails.
+int Test_qtdot(void)
+{
+    btQuaternion q1, q2;
+	float x, y, z, w;
+    
+    // Init the data
+    x = RANDF_01;
+    y = RANDF_01;
+    z = RANDF_01;
+    w = RANDF_01;
+    q1.setValue(x,y,z,w);
+	
+    x = RANDF_01;
+    y = RANDF_01;
+    z = RANDF_01;
+    w = RANDF_01;
+    q2.setValue(x,y,z,w);
+
+    // Single-sample correctness check.
+    {
+		btScalar correct_res = qtdot_ref(q1, q2);
+		btScalar test_res = BT_OP(q1,q2);
+	   
+		if( fabsf(correct_res - test_res) > FLT_EPSILON*4 )
+		{	
+			vlog( "Error - qtdot result error! "
+					"\ncorrect = %10.4f "
+					"\ntested  = %10.4f \n", 
+					correct_res, test_res);
+		
+			return 1;
+		}
+	}
+    
+#define DATA_SIZE LOOPCOUNT
+
+	btQuaternion qt_arr1[DATA_SIZE];
+	btQuaternion qt_arr2[DATA_SIZE];
+    btScalar     res_arr[DATA_SIZE];
+
+    uint64_t scalarTime;
+    uint64_t vectorTime;
+    size_t j, k;
+
+    // Random inputs for the timing loops.
+    for( k = 0; k < DATA_SIZE; k++ )
+    {
+        x = RANDF_01;
+        y = RANDF_01;
+        z = RANDF_01;
+        w = RANDF_01;
+        qt_arr1[k].setValue(x,y,z,w);
+
+        x = RANDF_01;
+        y = RANDF_01;
+        z = RANDF_01;
+        w = RANDF_01;
+        qt_arr2[k].setValue(x,y,z,w);
+    }
+
+    // Time the reference implementation.
+	{
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;    // largest uint64_t value, so the first sample always wins
+        scalarTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+            startTime = ReadTicks();
+            for( k = 0; k+4 <= LOOPCOUNT; k+=4 )
+			{
+                // Wrap the index into the arrays; the mask relies on DATA_SIZE being a power of two.
+				size_t km = (k & (DATA_SIZE-1)); 
+                res_arr[km] = qtdot_ref(qt_arr1[km], qt_arr2[km]);km++;
+                res_arr[km] = qtdot_ref(qt_arr1[km], qt_arr2[km]);km++;
+                res_arr[km] = qtdot_ref(qt_arr1[km], qt_arr2[km]);km++;
+                res_arr[km] = qtdot_ref(qt_arr1[km], qt_arr2[km]);
+			}
+			currentTime = ReadTicks() - startTime;
+            scalarTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        // Report either the best pass or the average over all passes.
+        if( 0 == gReportAverageTimes )
+            scalarTime = bestTime;        
+        else
+            scalarTime /= NUM_CYCLES;
+    }
+    
+    // Time btQuaternion::dot() with the same access pattern.
+    {
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        vectorTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+            startTime = ReadTicks();
+            for( k = 0; k+4 <= LOOPCOUNT; k+=4 )
+			{
+				size_t km = (k & (DATA_SIZE-1)); 
+            	res_arr[km] = BT_OP(qt_arr1[km], qt_arr2[km]);km++;
+            	res_arr[km] = BT_OP(qt_arr1[km], qt_arr2[km]);km++;
+            	res_arr[km] = BT_OP(qt_arr1[km], qt_arr2[km]);km++;
+            	res_arr[km] = BT_OP(qt_arr1[km], qt_arr2[km]);
+			}
+			currentTime = ReadTicks() - startTime;
+            vectorTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            vectorTime = bestTime;        
+        else
+            vectorTime /= NUM_CYCLES;
+    }
+
+    vlog( "Timing:\n" );
+    vlog( "     \t    scalar\t    vector\n" );
+    vlog( "    \t%10.4f\t%10.4f\n", TicksToCycles( scalarTime ) / LOOPCOUNT, 
+									TicksToCycles( vectorTime ) / LOOPCOUNT );
+
+    return 0;
+}
+#endif // BT_USE_SSE_IN_API || BT_USE_NEON
--- a/test/Bullet2/Source/Tests/Test_qtdot.h
+++ b/test/Bullet2/Source/Tests/Test_qtdot.h
@@ -0,0 +1,22 @@
+//
+//  Test_qtdot.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_qtdot_h
+#define BulletTest_Test_qtdot_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+
+int Test_qtdot(void);   // Runs the qtdot test; returns 0 when it completes (failure value presumably non-zero - confirm in Test_qtdot.cpp).
+
+#ifdef __cplusplus
+}
+#endif
+
+    
+#endif
--- a/test/Bullet2/Source/Tests/Test_qtmul.cpp
+++ b/test/Bullet2/Source/Tests/Test_qtmul.cpp
@@ -0,0 +1,183 @@
+//
+//  Test_qtmul.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+
+#include "Test_qtmul.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btQuaternion.h>
+
+#define BT_OP(a, b)     ((a) *= (b))   // operation under test: in-place multiply, a *= b
+// reference code for testing purposes
+static inline btQuaternion& qtmul_ref(btQuaternion& q1, const btQuaternion& q2);
+// Scalar reference for q1 *= q2 (quaternion product q1*q2): stores the product in q1 and returns q1.
+static inline btQuaternion& qtmul_ref(btQuaternion& q1, const btQuaternion& q2)
+{
+    // All four components are computed from the original q1 before q1 is overwritten below.
+    const float x = q1.w() * q2.x() + q1.x() * q2.w() + q1.y() * q2.z() - q1.z() * q2.y();
+    const float y = q1.w() * q2.y() + q1.y() * q2.w() + q1.z() * q2.x() - q1.x() * q2.z();
+    const float z = q1.w() * q2.z() + q1.z() * q2.w() + q1.x() * q2.y() - q1.y() * q2.x();
+    const float w = q1.w() * q2.w() - q1.x() * q2.x() - q1.y() * q2.y() - q1.z() * q2.z();
+
+    q1.setValue(x, y, z, w);
+    return q1;
+}
+
+#define LOOPCOUNT 1024    // operations timed per cycle; also the array size (DATA_SIZE is defined as LOOPCOUNT)
+#define NUM_CYCLES 1000   // number of timed repetitions; the best or the average time is reported
+// Test_qtmul: checks btQuaternion operator*= against qtmul_ref, then times both. Returns 0 on success, 1 on mismatch.
+int Test_qtmul(void)
+{
+    btQuaternion q1, q2, q3;
+	// q1 is the reference's left operand, q3 (a copy of q1) the tested operator's; q2 is the shared right operand.
+    float x, y, z, w;
+    
+    // Init the data
+    x = RANDF_01;
+    y = RANDF_01;
+    z = RANDF_01;
+    w = RANDF_01;
+    // Both multiplies overwrite their left operand, hence the separate q1/q3 copies.
+    q1.setValue(x,y,z,w);
+	
+    x = RANDF_01;
+    y = RANDF_01;
+    z = RANDF_01;
+    w = RANDF_01;
+    q2.setValue(x,y,z,w);
+
+	q3 = q1;
+		
+    btQuaternion correct_res, test_res;
+	 
+    {
+		float vNaN = BT_NAN;
+		correct_res.setValue(vNaN, vNaN, vNaN, vNaN); 
+		test_res.setValue(vNaN, vNaN, vNaN, vNaN);
+		correct_res = qtmul_ref(q1, q2);
+		test_res = BT_OP(q3,q2);
+		// Written as !(sum <= tol): any comparison with NaN is false, so "sum > tol" would let a NaN result pass.
+		if( !( fabsf(correct_res.x() - test_res.x()) + 
+			fabsf(correct_res.y() - test_res.y()) +
+			fabsf(correct_res.z() - test_res.z()) +
+			fabsf(correct_res.w() - test_res.w()) <= FLT_EPSILON*10 ) )
+		{	
+			vlog( "Error - qtmul result error! "
+					"\ncorrect = (%10.4f, %10.4f, %10.4f, %10.4f) "
+					"\ntested  = (%10.4f, %10.4f, %10.4f, %10.4f) \n", 
+					correct_res.x(), correct_res.y(), 
+                    correct_res.z(), correct_res.w(),
+					test_res.x(), test_res.y(), 
+                    test_res.z(), test_res.w());
+		
+			return 1;
+		}
+	}
+    
+#define DATA_SIZE LOOPCOUNT
+
+	btQuaternion qt_arr1[DATA_SIZE];
+	btQuaternion qt_arr2[DATA_SIZE];
+
+    uint64_t scalarTime;
+    uint64_t vectorTime;
+    size_t j, k;
+
+	{
+        uint64_t startTime, bestTime, currentTime;
+        // Scalar pass: time qtmul_ref over the whole array, NUM_CYCLES times.
+        bestTime = -1LL;   // converts to the largest uint64_t, so the first sample always replaces it
+        scalarTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+			for( k = 0; k < DATA_SIZE; k++ )
+			{
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+				w = RANDF_01;
+				qt_arr1[k].setValue(x,y,z,w);
+
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+				w = RANDF_01;
+				qt_arr2[k].setValue(x,y,z,w);
+			}
+			// Inputs are regenerated outside the timed region on every cycle.
+            startTime = ReadTicks();
+            for( k = 0; k < LOOPCOUNT; k++ )
+			{
+	            qt_arr1[k] = qtmul_ref(qt_arr1[k], qt_arr2[k]);
+			}
+			currentTime = ReadTicks() - startTime;
+            scalarTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            scalarTime = bestTime;        
+        else
+            scalarTime /= NUM_CYCLES;
+    }
+    
+    {
+        uint64_t startTime, bestTime, currentTime;
+        // Vector pass: same procedure with the operator under test (BT_OP).
+        bestTime = -1LL;   // converts to the largest uint64_t, so the first sample always replaces it
+        vectorTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+			for( k = 0; k < DATA_SIZE; k++ )
+			{
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+				w = RANDF_01;
+				qt_arr1[k].setValue(x,y,z,w);
+
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+				w = RANDF_01;
+				qt_arr2[k].setValue(x,y,z,w);
+			}
+
+            startTime = ReadTicks();
+            for( k = 0; k < LOOPCOUNT; k++ )
+			{
+				qt_arr1[k] = BT_OP(qt_arr1[k], qt_arr2[k]);
+			}
+			currentTime = ReadTicks() - startTime;
+            vectorTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            vectorTime = bestTime;        
+        else
+            vectorTime /= NUM_CYCLES;
+    }
+
+    vlog( "Timing:\n" );
+    vlog( "     \t    scalar\t    vector\n" );
+    vlog( "    \t%10.4f\t%10.4f\n", TicksToCycles( scalarTime ) / LOOPCOUNT, 
+									TicksToCycles( vectorTime ) / LOOPCOUNT );
+
+    return 0;
+}
+
+#endif // BT_USE_SSE_IN_API || BT_USE_NEON
--- a/test/Bullet2/Source/Tests/Test_qtmul.h
+++ b/test/Bullet2/Source/Tests/Test_qtmul.h
@@ -0,0 +1,22 @@
+//
+//  Test_qtmul.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_qtmul_h
+#define BulletTest_Test_qtmul_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+
+int Test_qtmul(void);   // Runs the qtmul test (see Test_qtmul.cpp); returns 0 on success, 1 on a result mismatch.
+
+#ifdef __cplusplus
+}
+#endif
+
+    
+#endif
--- a/test/Bullet2/Source/Tests/Test_qtmulQV3.cpp
+++ b/test/Bullet2/Source/Tests/Test_qtmulQV3.cpp
@@ -0,0 +1,162 @@
+//
+//  Test_qtmulQV3.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+#include "Test_qtmulQV3.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btQuaternion.h>
+
+#define BT_OP(a, b)     ((a) * (b))   // operation under test: a * b (used below as quaternion * vector3)
+// reference code for testing purposes
+static inline btQuaternion qtmulQV3_ref(const btQuaternion& q, const btVector3& w);
+// Scalar reference for q * w, with w treated as the quaternion (w.x, w.y, w.z, 0); w's fourth lane is never read.
+static inline btQuaternion qtmulQV3_ref(const btQuaternion& q, const btVector3& w)
+{
+	const btScalar rx =  q.w() * w.x() + q.y() * w.z() - q.z() * w.y();
+	const btScalar ry =  q.w() * w.y() + q.z() * w.x() - q.x() * w.z();
+	const btScalar rz =  q.w() * w.z() + q.x() * w.y() - q.y() * w.x();
+	const btScalar rw = -q.x() * w.x() - q.y() * w.y() - q.z() * w.z();
+	return btQuaternion( rx, ry, rz, rw );
+}
+
+#define LOOPCOUNT 1024    // operations timed per cycle; also the array size (DATA_SIZE is defined as LOOPCOUNT)
+#define NUM_CYCLES 1000   // number of timed repetitions; the best or the average time is reported
+// Vector3 test input: three RANDF_m1p1 values (presumably random in [-1, 1] - confirm) plus a NaN canary in the last slot.
+static inline btSimdFloat4 rand_f4(void)
+{
+    return btAssign128( RANDF_m1p1, RANDF_m1p1, RANDF_m1p1, BT_NAN );      // w channel NaN
+}
+// Quaternion test input: four RANDF_m1p1 values (presumably random in [-1, 1] - confirm), no NaN lane.
+static inline btSimdFloat4 qtrand_f4(void)
+{
+    return btAssign128( RANDF_m1p1, RANDF_m1p1, RANDF_m1p1, RANDF_m1p1 );
+}
+// All-NaN value, used to pre-fill result quaternions before they are overwritten by the computed results.
+static inline btSimdFloat4 qtNAN_f4(void)
+{
+    return btAssign128( BT_NAN, BT_NAN, BT_NAN, BT_NAN );
+}
+// Test_qtmulQV3: checks btQuaternion * btVector3 against qtmulQV3_ref, then times both. Returns 0 on success, 1 on mismatch.
+int Test_qtmulQV3(void)
+{
+    btQuaternion q;
+	btVector3 v3;
+    
+    // Init the data
+    q = btQuaternion(qtrand_f4()); 
+    v3 = btVector3(rand_f4());   // fourth lane is NaN on purpose (see rand_f4)
+
+    btQuaternion correct_res, test_res;
+    correct_res = btQuaternion(qtNAN_f4());
+    test_res = btQuaternion(qtNAN_f4());
+	 
+    {
+		correct_res = qtmulQV3_ref(q, v3);
+		test_res = BT_OP(q, v3);
+		// v3 carries NaN in w; if it leaks into the result the sum is NaN and "sum > tol" is false, so test !(sum <= tol).
+		if( !( fabsf(correct_res.x() - test_res.x()) + 
+			fabsf(correct_res.y() - test_res.y()) +
+			fabsf(correct_res.z() - test_res.z()) +
+			fabsf(correct_res.w() - test_res.w()) <= FLT_EPSILON*8 ) )
+		{	
+			vlog( "Error - qtmulQV3 result error! "
+					"\ncorrect = (%10.4f, %10.4f, %10.4f, %10.4f) "
+					"\ntested  = (%10.4f, %10.4f, %10.4f, %10.4f) \n", 
+					correct_res.x(), correct_res.y(), 
+                    correct_res.z(), correct_res.w(),
+					test_res.x(), test_res.y(), 
+                    test_res.z(), test_res.w());
+		
+			return 1;
+		}
+	}
+    
+#define DATA_SIZE LOOPCOUNT
+
+	btQuaternion qt_arrR[DATA_SIZE];
+	btQuaternion qt_arr[DATA_SIZE];
+	btVector3 v3_arr[DATA_SIZE];
+
+    uint64_t scalarTime;
+    uint64_t vectorTime;
+    size_t j, k;
+
+	{
+        uint64_t startTime, bestTime, currentTime;
+        // Scalar pass: time qtmulQV3_ref over the whole array, NUM_CYCLES times.
+        bestTime = -1LL;   // converts to the largest uint64_t, so the first sample always replaces it
+        scalarTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+			for( k = 0; k < DATA_SIZE; k++ )
+			{
+                qt_arr[k] = btQuaternion(qtrand_f4()); 
+                v3_arr[k] = btVector3(rand_f4());
+			}
+			// Inputs are regenerated outside the timed region on every cycle.
+            startTime = ReadTicks();
+            for( k = 0; k < LOOPCOUNT; k++ )
+			{
+	            qt_arrR[k] = qtmulQV3_ref(qt_arr[k], v3_arr[k]);
+			}
+			currentTime = ReadTicks() - startTime;
+            scalarTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            scalarTime = bestTime;        
+        else
+            scalarTime /= NUM_CYCLES;
+    }
+    
+    {
+        uint64_t startTime, bestTime, currentTime;
+        // Vector pass: same procedure with the operator under test (BT_OP).
+        bestTime = -1LL;   // converts to the largest uint64_t, so the first sample always replaces it
+        vectorTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+			for( k = 0; k < DATA_SIZE; k++ )
+			{
+                qt_arr[k] = btQuaternion(qtrand_f4()); 
+                v3_arr[k] = btVector3(rand_f4());
+			}
+
+            startTime = ReadTicks();
+            for( k = 0; k < LOOPCOUNT; k++ )
+			{
+				qt_arrR[k] = BT_OP(qt_arr[k], v3_arr[k]);
+			}
+			currentTime = ReadTicks() - startTime;
+            vectorTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            vectorTime = bestTime;        
+        else
+            vectorTime /= NUM_CYCLES;
+    }
+
+    vlog( "Timing:\n" );
+    vlog( "     \t    scalar\t    vector\n" );
+    vlog( "    \t%10.4f\t%10.4f\n", TicksToCycles( scalarTime ) / LOOPCOUNT, 
+									TicksToCycles( vectorTime ) / LOOPCOUNT );
+
+    return 0;
+}
+#endif // BT_USE_SSE_IN_API || BT_USE_NEON
--- a/test/Bullet2/Source/Tests/Test_qtmulQV3.h
+++ b/test/Bullet2/Source/Tests/Test_qtmulQV3.h
@@ -0,0 +1,22 @@
+//
+//  Test_qtmulQV3.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_qtmulQV3_h
+#define BulletTest_Test_qtmulQV3_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+
+int Test_qtmulQV3(void);   // Runs the qtmulQV3 test (see Test_qtmulQV3.cpp); returns 0 on success, 1 on a result mismatch.
+
+#ifdef __cplusplus
+}
+#endif
+
+    
+#endif
--- a/test/Bullet2/Source/Tests/Test_qtmulV3Q.cpp
+++ b/test/Bullet2/Source/Tests/Test_qtmulV3Q.cpp
@@ -0,0 +1,161 @@
+//
+//  Test_qtmulV3Q.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+#include "Test_qtmulV3Q.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btQuaternion.h>
+
+#define BT_OP(a, b)     ((a) * (b))   // operation under test: a * b (used below as vector3 * quaternion)
+// reference code for testing purposes
+static inline btQuaternion qtmulV3Q_ref(const btVector3& w, const btQuaternion& q);
+// Scalar reference for w * q, with w treated as the quaternion (w.x, w.y, w.z, 0); w's fourth lane is never read.
+static inline btQuaternion qtmulV3Q_ref(const btVector3& w, const btQuaternion& q)
+{
+	const btScalar rx =  w.x() * q.w() + w.y() * q.z() - w.z() * q.y();
+	const btScalar ry =  w.y() * q.w() + w.z() * q.x() - w.x() * q.z();
+	const btScalar rz =  w.z() * q.w() + w.x() * q.y() - w.y() * q.x();
+	const btScalar rw = -w.x() * q.x() - w.y() * q.y() - w.z() * q.z();
+	return btQuaternion( rx, ry, rz, rw );
+}
+
+#define LOOPCOUNT 1024    // operations timed per cycle; also the array size (DATA_SIZE is defined as LOOPCOUNT)
+#define NUM_CYCLES 1000   // number of timed repetitions; the best or the average time is reported
+// Vector3 test input: three RANDF_m1p1 values (presumably random in [-1, 1] - confirm) plus a NaN canary in the last slot.
+static inline btSimdFloat4 rand_f4(void)
+{
+    return btAssign128( RANDF_m1p1, RANDF_m1p1, RANDF_m1p1, BT_NAN );      // w channel NaN
+}
+// Quaternion test input: four RANDF_m1p1 values (presumably random in [-1, 1] - confirm), no NaN lane.
+static inline btSimdFloat4 qtrand_f4(void)
+{
+    return btAssign128( RANDF_m1p1, RANDF_m1p1, RANDF_m1p1, RANDF_m1p1 );
+}
+// All-NaN value, used to pre-fill result quaternions before they are overwritten by the computed results.
+static inline btSimdFloat4 qtNAN_f4(void)
+{
+    return btAssign128( BT_NAN, BT_NAN, BT_NAN, BT_NAN );
+}
+// Test_qtmulV3Q: checks btVector3 * btQuaternion against qtmulV3Q_ref, then times both. Returns 0 on success, 1 on mismatch.
+int Test_qtmulV3Q(void)
+{
+    btQuaternion q;
+	btVector3 v3;
+    
+    // Init the data
+    q = btQuaternion(qtrand_f4()); 
+    v3 = btVector3(rand_f4());   // fourth lane is NaN on purpose (see rand_f4)
+
+    btQuaternion correct_res, test_res;
+    correct_res = btQuaternion(qtNAN_f4());
+    test_res = btQuaternion(qtNAN_f4());
+	 
+    {
+		correct_res = qtmulV3Q_ref(v3, q);
+		test_res = BT_OP(v3, q);
+		// v3 carries NaN in w; if it leaks into the result the sum is NaN and "sum > tol" is false, so test !(sum <= tol).
+		if( !( fabsf(correct_res.x() - test_res.x()) + 
+			fabsf(correct_res.y() - test_res.y()) +
+			fabsf(correct_res.z() - test_res.z()) +
+			fabsf(correct_res.w() - test_res.w()) <= FLT_EPSILON*8 ) )
+		{	
+			vlog( "Error - qtmulV3Q result error! "
+					"\ncorrect = (%10.4f, %10.4f, %10.4f, %10.4f) "
+					"\ntested  = (%10.4f, %10.4f, %10.4f, %10.4f) \n", 
+					correct_res.x(), correct_res.y(), 
+                    correct_res.z(), correct_res.w(),
+					test_res.x(), test_res.y(), 
+                    test_res.z(), test_res.w());
+		
+			return 1;
+		}
+	}
+    
+#define DATA_SIZE LOOPCOUNT
+
+	btQuaternion qt_arrR[DATA_SIZE];
+	btQuaternion qt_arr[DATA_SIZE];
+	btVector3 v3_arr[DATA_SIZE];
+
+    uint64_t scalarTime;
+    uint64_t vectorTime;
+    size_t j, k;
+
+	{
+        uint64_t startTime, bestTime, currentTime;
+        // Scalar pass: time qtmulV3Q_ref over the whole array, NUM_CYCLES times.
+        bestTime = -1LL;   // converts to the largest uint64_t, so the first sample always replaces it
+        scalarTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+			for( k = 0; k < DATA_SIZE; k++ )
+			{
+                qt_arr[k] = btQuaternion(qtrand_f4()); 
+                v3_arr[k] = btVector3(rand_f4());
+			}
+			// Inputs are regenerated outside the timed region on every cycle.
+            startTime = ReadTicks();
+            for( k = 0; k < LOOPCOUNT; k++ )
+			{
+	            qt_arrR[k] = qtmulV3Q_ref(v3_arr[k], qt_arr[k]);
+			}
+			currentTime = ReadTicks() - startTime;
+            scalarTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            scalarTime = bestTime;        
+        else
+            scalarTime /= NUM_CYCLES;
+    }
+    
+    {
+        uint64_t startTime, bestTime, currentTime;
+        // Vector pass: same procedure with the operator under test (BT_OP).
+        bestTime = -1LL;   // converts to the largest uint64_t, so the first sample always replaces it
+        vectorTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+			for( k = 0; k < DATA_SIZE; k++ )
+			{
+                qt_arr[k] = btQuaternion(qtrand_f4()); 
+                v3_arr[k] = btVector3(rand_f4());
+			}
+
+            startTime = ReadTicks();
+            for( k = 0; k < LOOPCOUNT; k++ )
+			{
+				qt_arrR[k] = BT_OP(v3_arr[k], qt_arr[k]);
+			}
+			currentTime = ReadTicks() - startTime;
+            vectorTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            vectorTime = bestTime;        
+        else
+            vectorTime /= NUM_CYCLES;
+    }
+
+    vlog( "Timing:\n" );
+    vlog( "     \t    scalar\t    vector\n" );
+    vlog( "    \t%10.4f\t%10.4f\n", TicksToCycles( scalarTime ) / LOOPCOUNT, 
+									TicksToCycles( vectorTime ) / LOOPCOUNT );
+
+    return 0;
+}
+#endif // BT_USE_SSE_IN_API || BT_USE_NEON
--- a/test/Bullet2/Source/Tests/Test_qtmulV3Q.h
+++ b/test/Bullet2/Source/Tests/Test_qtmulV3Q.h
@@ -0,0 +1,22 @@
+//
+//  Test_qtmulV3Q.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_qtmulV3Q_h
+#define BulletTest_Test_qtmulV3Q_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+
+int Test_qtmulV3Q(void);   // Runs the qtmulV3Q test (see Test_qtmulV3Q.cpp); returns 0 on success, 1 on a result mismatch.
+
+#ifdef __cplusplus
+}
+#endif
+
+    
+#endif
--- a/test/Bullet2/Source/Tests/Test_qtnorm.cpp
+++ b/test/Bullet2/Source/Tests/Test_qtnorm.cpp
@@ -0,0 +1,176 @@
+//
+//  Test_qtnorm.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+
+#include "Test_qtnorm.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btQuaternion.h>
+
+#define BT_OP(a)     ((a).normalize())   // operation under test; the argument is parenthesized so any expression can be passed
+// reference code for testing purposes
+static inline btQuaternion& qtnorm_ref(btQuaternion& q1);
+// Scalar reference for normalize(): scales q1 in place by 1/sqrt(x*x + y*y + z*z + w*w) and returns q1.
+static inline btQuaternion& qtnorm_ref(btQuaternion& q1)
+{
+    const float qx = q1.x();
+    const float qy = q1.y();
+    const float qz = q1.z();
+    const float qw = q1.w();
+    // Summed left to right: x*x, then y*y, z*z, w*w.
+    const float lenSq = qx * qx + qy * qy + qz * qz + qw * qw;
+    const float invLen = 1.0f / sqrtf(lenSq);
+
+    q1.setValue(qx * invLen, qy * invLen, qz * invLen, qw * invLen);
+
+    return q1;
+}
+
+#define LOOPCOUNT 1024    // correctness iterations and operations timed per cycle; also the array size (DATA_SIZE)
+#define NUM_CYCLES 1000   // number of timed repetitions; the best or the average time is reported
+// Test_qtnorm: checks btQuaternion::normalize() against qtnorm_ref on LOOPCOUNT random inputs, then times both. Returns 0 on success, 1 on mismatch.
+int Test_qtnorm(void)
+{
+    int i;
+    btQuaternion q1, q2;
+	float x, y, z, w, vNaN;
+    vNaN = BT_NAN;     // pre-fill value for all four lanes of the result quaternions
+    
+	btQuaternion correct_res, test_res;
+	
+    for (i=0; i<LOOPCOUNT; i++)
+    {
+        // Init the data
+        x = RANDF_01;
+        y = RANDF_01;
+        z = RANDF_01;
+        w = RANDF_01;
+        q1.setValue(x,y,z,w);
+        
+        q2 = q1;   // both normalizations work in place, so each gets its own copy
+
+		correct_res.setValue(vNaN, vNaN, vNaN, vNaN); 
+		test_res.setValue(vNaN, vNaN, vNaN, vNaN);
+		correct_res = qtnorm_ref(q1);
+		test_res = BT_OP(q2);
+		// Written as !(sum <= tol): any comparison with NaN is false, so "sum > tol" would let a NaN result pass.
+		if( !( fabsf(correct_res.x() - test_res.x()) + 
+			fabsf(correct_res.y() - test_res.y()) +
+			fabsf(correct_res.z() - test_res.z()) +
+			fabsf(correct_res.w() - test_res.w()) <= FLT_EPSILON*10 ) )
+        {	
+			vlog( "Error - qtnorm result error! "
+					"\ncorrect = (%10.7f, %10.7f, %10.7f, %10.7f) "
+					"\ntested  = (%10.7f, %10.7f, %10.7f, %10.7f) \n", 
+					correct_res.x(), correct_res.y(), 
+                    correct_res.z(), correct_res.w(),
+					test_res.x(), test_res.y(), 
+                    test_res.z(), test_res.w());
+		
+			return 1;
+		}
+	}
+    
+#define DATA_SIZE LOOPCOUNT
+
+	btQuaternion qt_arr0[DATA_SIZE];
+	btQuaternion qt_arr1[DATA_SIZE];
+
+    uint64_t scalarTime;
+    uint64_t vectorTime;
+    size_t j, k;
+
+	{
+        uint64_t startTime, bestTime, currentTime;
+        // Scalar pass: time qtnorm_ref, four elements per loop iteration, NUM_CYCLES times.
+        bestTime = -1LL;   // converts to the largest uint64_t, so the first sample always replaces it
+        scalarTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+            for( k = 0; k < DATA_SIZE; k++ )
+            {
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+                w = RANDF_01;
+                qt_arr1[k].setValue(x,y,z,w);
+            }
+            // Inputs are regenerated outside the timed region on every cycle.
+            startTime = ReadTicks();
+            for( k = 0; k+4 <= LOOPCOUNT; k+=4 )
+			{
+				size_t km = (k & (DATA_SIZE-1)); 
+                qt_arr0[km] = qtnorm_ref(qt_arr1[km]);km++;
+                qt_arr0[km] = qtnorm_ref(qt_arr1[km]);km++;
+                qt_arr0[km] = qtnorm_ref(qt_arr1[km]);km++;
+                qt_arr0[km] = qtnorm_ref(qt_arr1[km]);
+			}
+			currentTime = ReadTicks() - startTime;
+            scalarTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            scalarTime = bestTime;        
+        else
+            scalarTime /= NUM_CYCLES;
+    }
+    
+    {
+        uint64_t startTime, bestTime, currentTime;
+        // Vector pass: same procedure with the operation under test (BT_OP).
+        bestTime = -1LL;   // converts to the largest uint64_t, so the first sample always replaces it
+        vectorTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+            for( k = 0; k < DATA_SIZE; k++ )
+            {
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+                w = RANDF_01;
+                qt_arr1[k].setValue(x,y,z,w);
+            }
+        
+            startTime = ReadTicks();
+            for( k = 0; k+4 <= LOOPCOUNT; k+=4 )
+			{
+				size_t km = (k & (DATA_SIZE-1)); 
+            	qt_arr0[km] = BT_OP(qt_arr1[km]);km++;
+            	qt_arr0[km] = BT_OP(qt_arr1[km]);km++;
+            	qt_arr0[km] = BT_OP(qt_arr1[km]);km++;
+            	qt_arr0[km] = BT_OP(qt_arr1[km]);   // no trailing km++: matches the scalar loop above
+			}
+			currentTime = ReadTicks() - startTime;
+            vectorTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            vectorTime = bestTime;        
+        else
+            vectorTime /= NUM_CYCLES;
+    }
+
+    vlog( "Timing:\n" );
+    vlog( "     \t    scalar\t    vector\n" );
+    vlog( "    \t%10.4f\t%10.4f\n", TicksToCycles( scalarTime ) / LOOPCOUNT, 
+									TicksToCycles( vectorTime ) / LOOPCOUNT );
+
+    return 0;
+}
+
+#endif // BT_USE_SSE_IN_API || BT_USE_NEON
--- a/test/Bullet2/Source/Tests/Test_qtnorm.h
+++ b/test/Bullet2/Source/Tests/Test_qtnorm.h
@@ -0,0 +1,22 @@
+//
+//  Test_qtnorm.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_qtnorm_h
+#define BulletTest_Test_qtnorm_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+
+int Test_qtnorm(void);   // Runs the qtnorm test (see Test_qtnorm.cpp); returns 0 on success, 1 on a result mismatch.
+
+#ifdef __cplusplus
+}
+#endif
+
+    
+#endif
--- a/test/Bullet2/Source/Tests/Test_quat_aos_neon.cpp
+++ b/test/Bullet2/Source/Tests/Test_quat_aos_neon.cpp
@@ -0,0 +1,599 @@
+//
+//  Test_quat_aos_neon.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc., Inc.
+//
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+#include "Test_quat_aos_neon.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+
+#include <vectormath/vmInclude.h>
+
+
+//typedef Vectormath::Aos::Vector3    vmVector3;
+//typedef Vectormath::Aos::Quat       vmQuat;
+//typedef Vectormath::Aos::Matrix3    vmMatrix3;
+//typedef Vectormath::Aos::Transform3 vmTransform3;
+//typedef Vectormath::Aos::Point3     vmPoint3;
+
+
+typedef Vectormath::Aos::Vector4    vmVector4;
+
+// Quat_ref: quaternion stored as four plain floats (x, y, z, w); reference code for testing purposes
+ATTRIBUTE_ALIGNED16(class) Quat_ref
+{
+    float mX;
+    float mY;
+    float mZ;
+    float mW;
+    
+public:
+    // Default constructor; does no initialization
+    // 
+    inline Quat_ref( ) { };
+    
+    // Copy a quaternion
+    // 
+    inline Quat_ref( const Quat_ref & quat );
+    
+    // Construct a quaternion from x, y, z, and w elements
+    // 
+    inline Quat_ref( float x, float y, float z, float w );
+    
+    // Construct a quaternion from a 3-D vector and a scalar
+    // 
+    inline Quat_ref( const vmVector3 & xyz, float w );
+    
+    // Copy elements from a 4-D vector into a quaternion
+    // 
+    explicit inline Quat_ref( const vmVector4 & vec );
+    
+    // Convert a rotation matrix to a unit-length quaternion
+    // NOTE(review): no definition of this constructor appears in the part of the file reviewed - confirm one exists.
+    explicit inline Quat_ref( const vmMatrix3 & rotMat );
+    
+    // Set all elements of a quaternion to the same scalar value
+    // 
+    explicit inline Quat_ref( float scalar );
+    
+    // Assign one quaternion to another
+    // 
+    inline Quat_ref & operator =( const Quat_ref & quat );
+    
+    // Set the x, y, and z elements of a quaternion
+    // NOTE: 
+    // This function does not change the w element.
+    // 
+    inline Quat_ref & setXYZ( const vmVector3 & vec );
+    
+    // Get the x, y, and z elements of a quaternion
+    // 
+    inline const vmVector3 getXYZ( ) const;
+    
+    // Set the x element of a quaternion
+    // 
+    inline Quat_ref & setX( float x );
+    
+    // Set the y element of a quaternion
+    // 
+    inline Quat_ref & setY( float y );
+    
+    // Set the z element of a quaternion
+    // 
+    inline Quat_ref & setZ( float z );
+    
+    // Set the w element of a quaternion
+    // 
+    inline Quat_ref & setW( float w );
+    
+    // Get the x element of a quaternion
+    // 
+    inline float getX( ) const;
+    
+    // Get the y element of a quaternion
+    // 
+    inline float getY( ) const;
+    
+    // Get the z element of a quaternion
+    // 
+    inline float getZ( ) const;
+    
+    // Get the w element of a quaternion
+    // 
+    inline float getW( ) const;
+    
+    // Set an x, y, z, or w element of a quaternion by index
+    // (idx: 0 = x, 1 = y, 2 = z, 3 = w; other values are not supported)
+    inline Quat_ref & setElem( int idx, float value );
+    
+    // Get an x, y, z, or w element of a quaternion by index
+    // (idx: 0 = x, 1 = y, 2 = z, 3 = w; other values are not supported)
+    inline float getElem( int idx ) const;
+    
+    // Subscripting operator to set or get an element
+    // (idx: 0 = x, 1 = y, 2 = z, 3 = w; other values are not supported)
+    inline float & operator []( int idx );
+    
+    // Subscripting operator to get an element
+    // (idx: 0 = x, 1 = y, 2 = z, 3 = w; other values are not supported)
+    inline float operator []( int idx ) const;
+    
+    // Add two quaternions
+    // 
+    inline const Quat_ref operator +( const Quat_ref & quat ) const;
+    
+    // Subtract a quaternion from another quaternion
+    // 
+    inline const Quat_ref operator -( const Quat_ref & quat ) const;
+    
+    // Multiply two quaternions
+    // 
+    inline const Quat_ref operator *( const Quat_ref & quat ) const;
+    
+    // Multiply a quaternion by a scalar
+    // 
+    inline const Quat_ref operator *( float scalar ) const;
+    
+    // Divide a quaternion by a scalar
+    // 
+    inline const Quat_ref operator /( float scalar ) const;
+    
+    // Perform compound assignment and addition with a quaternion
+    // 
+    inline Quat_ref & operator +=( const Quat_ref & quat );
+    
+    // Perform compound assignment and subtraction by a quaternion
+    // 
+    inline Quat_ref & operator -=( const Quat_ref & quat );
+    
+    // Perform compound assignment and multiplication by a quaternion
+    // 
+    inline Quat_ref & operator *=( const Quat_ref & quat );
+    
+    // Perform compound assignment and multiplication by a scalar
+    // 
+    inline Quat_ref & operator *=( float scalar );
+    
+    // Perform compound assignment and division by a scalar
+    // 
+    inline Quat_ref & operator /=( float scalar );
+    
+    // Negate all elements of a quaternion
+    // 
+    inline const Quat_ref operator -( ) const;
+    
+    // Construct an identity quaternion
+    // (x = y = z = 0, w = 1)
+    static inline const Quat_ref identity( );
+    
+    // Construct a quaternion to rotate between two unit-length 3-D vectors
+    // NOTE: 
+    // The result is unpredictable if unitVec0 and unitVec1 point in opposite directions.
+    // 
+    static inline const Quat_ref rotation( const vmVector3 & unitVec0, const vmVector3 & unitVec1 );
+    
+    // Construct a quaternion to rotate around a unit-length 3-D vector
+    // 
+    static inline const Quat_ref rotation( float radians, const vmVector3 & unitVec );
+    
+    // Construct a quaternion to rotate around the x axis
+    // 
+    static inline const Quat_ref rotationX( float radians );
+    
+    // Construct a quaternion to rotate around the y axis
+    // 
+    static inline const Quat_ref rotationY( float radians );
+    
+    // Construct a quaternion to rotate around the z axis
+    // 
+    static inline const Quat_ref rotationZ( float radians );
+    
+};
+// Copy constructor: takes x, y, z and w from quat.
+inline Quat_ref::Quat_ref( const Quat_ref & quat )
+    : mX( quat.mX ),
+      mY( quat.mY ),
+      mZ( quat.mZ ),
+      mW( quat.mW )
+{
+}
+// Element-wise constructor: stores _x, _y, _z, _w as given.
+inline Quat_ref::Quat_ref( float _x, float _y, float _z, float _w )
+    : mX( _x ),
+      mY( _y ),
+      mZ( _z ),
+      mW( _w )
+{
+}
+// Build from a 3-D vector (supplies x, y, z) and a separate w value.
+inline Quat_ref::Quat_ref( const vmVector3 & xyz, float _w )
+{
+    setXYZ( xyz );
+    setW( _w );
+}
+// Build from a 4-D vector: x, y, z, w are read through the vector's getters.
+inline Quat_ref::Quat_ref( const vmVector4 & vec )
+    : mX( vec.getX() ),
+      mY( vec.getY() ),
+      mZ( vec.getZ() ),
+      mW( vec.getW() )
+{
+}
+// Broadcast constructor: every element is set to the same scalar.
+inline Quat_ref::Quat_ref( float scalar )
+    : mX( scalar ),
+      mY( scalar ),
+      mZ( scalar ),
+      mW( scalar )
+{
+}
+// Returns the quaternion (0, 0, 0, 1).
+inline const Quat_ref Quat_ref::identity( )
+{
+    return Quat_ref( 0.0f, 0.0f, 0.0f, 1.0f );
+}
+
+// Overwrites quat with x, y, z, w read from fptr[0..3]; fptr must point to at least four floats.
+inline void loadXYZW_ref( Quat_ref & quat, const float * fptr )
+{
+    quat = Quat_ref( fptr[0], fptr[1], fptr[2], fptr[3] );
+}
+// Writes quat's x, y, z, w to fptr[0..3]; fptr must point to at least four floats.
+inline void storeXYZW_ref( const Quat_ref & quat, float * fptr )
+{
+    *fptr++ = quat.getX();
+    *fptr++ = quat.getY();
+    *fptr++ = quat.getZ();
+    *fptr   = quat.getW();
+}
+// Copy assignment: overwrites all four elements with those of quat and returns *this.
+inline Quat_ref & Quat_ref::operator =( const Quat_ref & quat )
+{
+    setX( quat.getX() );
+    setY( quat.getY() );
+    setZ( quat.getZ() );
+    setW( quat.getW() );
+    return *this;
+}
+// Replaces x, y, z with the elements of vec; w is left untouched. Returns *this.
+inline Quat_ref & Quat_ref::setXYZ( const vmVector3 & vec )
+{
+    setX( vec.getX() );
+    setY( vec.getY() );
+    setZ( vec.getZ() );
+    return *this;
+}
+// Returns x, y, z packed into a new vmVector3; w is not included.
+inline const vmVector3 Quat_ref::getXYZ( ) const
+{
+    return vmVector3( mX, mY, mZ );
+}
+// Stores _x as the x element; returns *this so calls can be chained.
+inline Quat_ref & Quat_ref::setX( float _x )
+{
+    mX = _x;
+    return *this;
+}
+// Returns the x element.
+inline float Quat_ref::getX( ) const
+{
+    return mX;
+}
+// Stores _y as the y element; returns *this so calls can be chained.
+inline Quat_ref & Quat_ref::setY( float _y )
+{
+    mY = _y;
+    return *this;
+}
+// Returns the y element.
+inline float Quat_ref::getY( ) const
+{
+    return mY;
+}
+// Stores _z as the z element; returns *this so calls can be chained.
+inline Quat_ref & Quat_ref::setZ( float _z )
+{
+    mZ = _z;
+    return *this;
+}
+// Returns the z element.
+inline float Quat_ref::getZ( ) const
+{
+    return mZ;
+}
+// Stores _w as the w element; returns *this so calls can be chained.
+inline Quat_ref & Quat_ref::setW( float _w )
+{
+    mW = _w;
+    return *this;
+}
+// Returns the w element.
+inline float Quat_ref::getW( ) const
+{
+    return mW;
+}
+// Sets element idx (0 = x, 1 = y, 2 = z, otherwise w) by member selection, not pointer arithmetic from &mX; returns *this.
+inline Quat_ref & Quat_ref::setElem( int idx, float value )
+{
+    ( ( idx == 0 ) ? mX : ( idx == 1 ) ? mY : ( idx == 2 ) ? mZ : mW ) = value;
+    return *this;
+}
+// Returns element idx (0 = x, 1 = y, 2 = z, otherwise w) by member selection, not pointer arithmetic from &mX.
+inline float Quat_ref::getElem( int idx ) const
+{
+    return ( idx == 0 ) ? mX : ( idx == 1 ) ? mY : ( idx == 2 ) ? mZ : mW;
+}
+// Returns a writable reference to element idx (0 = x, 1 = y, 2 = z, otherwise w), selected by member, not by pointer offset.
+inline float & Quat_ref::operator []( int idx )
+{
+    return ( idx == 0 ) ? mX : ( idx == 1 ) ? mY : ( idx == 2 ) ? mZ : mW;
+}
+// Returns the value of element idx (0 = x, 1 = y, 2 = z, otherwise w), selected by member, not by pointer offset.
+inline float Quat_ref::operator []( int idx ) const
+{
+    return ( idx == 0 ) ? mX : ( idx == 1 ) ? mY : ( idx == 2 ) ? mZ : mW;
+}
+// Element-wise sum of *this and quat, returned as a new quaternion.
+inline const Quat_ref Quat_ref::operator +( const Quat_ref & quat ) const
+{
+    const float sx = mX + quat.mX;
+    const float sy = mY + quat.mY;
+    const float sz = mZ + quat.mZ;
+    const float sw = mW + quat.mW;
+
+    return Quat_ref( sx, sy, sz, sw );
+}
+// Element-wise difference *this - quat, returned as a new quaternion.
+inline const Quat_ref Quat_ref::operator -( const Quat_ref & quat ) const
+{
+    const float dx = mX - quat.mX;
+    const float dy = mY - quat.mY;
+    const float dz = mZ - quat.mZ;
+    const float dw = mW - quat.mW;
+
+    return Quat_ref( dx, dy, dz, dw );
+}
+// Every element multiplied by scalar, returned as a new quaternion.
+inline const Quat_ref Quat_ref::operator *( float scalar ) const
+{
+    const float px = mX * scalar;
+    const float py = mY * scalar;
+    const float pz = mZ * scalar;
+    const float pw = mW * scalar;
+
+    return Quat_ref( px, py, pz, pw );
+}
+// Adds quat to *this element-wise (via operator +) and returns *this.
+inline Quat_ref & Quat_ref::operator +=( const Quat_ref & quat )
+{
+    const Quat_ref sum = *this + quat;
+    return ( *this = sum );
+}
+// Subtracts quat from *this element-wise (via operator -) and returns *this.
+inline Quat_ref & Quat_ref::operator -=( const Quat_ref & quat )
+{
+    const Quat_ref difference = *this - quat;
+    return ( *this = difference );
+}
+// Multiplies every element of *this by scalar (via operator *) and returns *this.
+inline Quat_ref & Quat_ref::operator *=( float scalar )
+{
+    const Quat_ref scaled = *this * scalar;
+    return ( *this = scaled );
+}
+// Every element divided by scalar, returned as a new quaternion; scalar is not checked for zero.
+inline const Quat_ref Quat_ref::operator /( float scalar ) const
+{
+    const float qx = mX / scalar;
+    const float qy = mY / scalar;
+    const float qz = mZ / scalar;
+    const float qw = mW / scalar;
+
+    return Quat_ref( qx, qy, qz, qw );
+}
+// Divides every element of *this by scalar (via operator /) and returns *this.
+inline Quat_ref & Quat_ref::operator /=( float scalar )
+{
+    const Quat_ref quotient = *this / scalar;
+    return ( *this = quotient );
+}
+// Unary minus: a new quaternion with every element negated.
+inline const Quat_ref Quat_ref::operator -( ) const
+{
+    const float nx = -mX;
+    const float ny = -mY;
+    const float nz = -mZ;
+    const float nw = -mW;
+
+    return Quat_ref( nx, ny, nz, nw );
+}
+// scalar * quat: forwards to Quat_ref::operator *( float ), so the result equals quat * scalar.
+inline const Quat_ref operator *( float scalar, const Quat_ref & quat )
+{
+    return quat * scalar;
+}
+// 4-D dot product of two quaternions: x0*x1 + y0*y1 + z0*z1 + w0*w1.
+inline float dot( const Quat_ref & quat0, const Quat_ref & quat1 )
+{
+    // Accumulated in x, y, z, w order.
+    float acc = quat0.getX() * quat1.getX();
+    acc += quat0.getY() * quat1.getY();
+    acc += quat0.getZ() * quat1.getZ();
+    acc += quat0.getW() * quat1.getW();
+    return acc;
+}
+// Linear interpolation quat0 + (quat1 - quat0) * t; the result is not renormalized.
+inline const Quat_ref lerp( float t, const Quat_ref & quat0, const Quat_ref & quat1 )
+{
+    return ( quat0 + ( ( quat1 - quat0 ) * t ) );
+}
+// Spherical linear interpolation between two unit quaternions; t weights unitQuat1, (1 - t) weights the start.
+inline const Quat_ref slerp( float t, const Quat_ref & unitQuat0, const Quat_ref & unitQuat1 )
+{
+    float cosAngle = dot( unitQuat0, unitQuat1 );
+    // When the dot product is negative, interpolate from -unitQuat0 instead,
+    // which keeps the working cosine non-negative.
+    const bool flip = ( cosAngle < 0.0f );
+    if ( flip ) { cosAngle = -cosAngle; }
+    const Quat_ref start( flip ? -unitQuat0 : unitQuat0 );
+
+    // Plain linear weights are used when cosAngle is at or above the tolerance
+    // (inputs nearly parallel); otherwise they are replaced by sine-based weights.
+    float scale0 = 1.0f - t;
+    float scale1 = t;
+    if ( cosAngle < _VECTORMATH_SLERP_TOL )
+    {
+        const float angle = acosf( cosAngle );
+        const float recipSinAngle = 1.0f / sinf( angle );
+        scale0 = sinf( ( 1.0f - t ) * angle ) * recipSinAngle;
+        scale1 = sinf( t * angle ) * recipSinAngle;
+    }
+    return ( start * scale0 ) + ( unitQuat1 * scale1 );
+}
+// Slerps (0 -> 3) and (1 -> 2) at t, then slerps between those two results with weight 2t(1 - t).
+inline const Quat_ref squad( float t, const Quat_ref & unitQuat0, const Quat_ref & unitQuat1, const Quat_ref & unitQuat2, const Quat_ref & unitQuat3 )
+{
+    const Quat_ref outer = slerp( t, unitQuat0, unitQuat3 );
+    const Quat_ref inner = slerp( t, unitQuat1, unitQuat2 );
+    const float blend = ( 2.0f * t ) * ( 1.0f - t );
+    return slerp( blend, outer, inner );
+}
+// Sum of squared elements (x*x + y*y + z*z + w*w); note that no square root is taken here.
+inline float norm( const Quat_ref & quat )
+{
+    // Accumulated in x, y, z, w order.
+    float acc = quat.getX() * quat.getX();
+    acc += quat.getY() * quat.getY();
+    acc += quat.getZ() * quat.getZ();
+    acc += quat.getW() * quat.getW();
+    return acc;
+}
+
+// Length of the quaternion: square root of norm( quat ).
+inline float length( const Quat_ref & quat )
+{
+    const float lenSqr = norm( quat );
+    return ::sqrtf( lenSqr );
+}
+
+// Returns quat scaled by 1 / sqrt(norm(quat)).
+// There is no guard for a zero-length input.
+inline const Quat_ref normalize( const Quat_ref & quat )
+{
+    const float invLen = ( 1.0f / sqrtf( norm( quat ) ) );
+    return Quat_ref(
+                ( quat.getX() * invLen ),
+                ( quat.getY() * invLen ),
+                ( quat.getZ() * invLen ),
+                ( quat.getW() * invLen )
+                );
+}
+
+// Builds the quaternion from two direction vectors (supplied as unit length):
+// vector part = cross(v0, v1) / k, scalar part = k / 2, with k = sqrt(2 * (1 + dot(v0, v1))).
+// k is zero when dot(v0, v1) == -1, and the division is not guarded.
+inline const Quat_ref Quat_ref::rotation( const vmVector3 & unitVec0, const vmVector3 & unitVec1 )
+{
+    const float twoCosHalf = sqrtf( ( 2.0f * ( 1.0f + dot( unitVec0, unitVec1 ) ) ) );
+    const float invTwoCosHalf = ( 1.0f / twoCosHalf );
+    return Quat_ref( ( cross( unitVec0, unitVec1 ) * invTwoCosHalf ), ( twoCosHalf * 0.5f ) );
+}
+
+// Axis/angle constructor: vector part = unitVec * sin(radians / 2),
+// scalar part = cos(radians / 2).
+inline const Quat_ref Quat_ref::rotation( float radians, const vmVector3 & unitVec )
+{
+    const float halfAngle = ( radians * 0.5f );
+    const float sinHalf = sinf( halfAngle );
+    const float cosHalf = cosf( halfAngle );
+    return Quat_ref( ( unitVec * sinHalf ), cosHalf );
+}
+
+// Quaternion ( sin(radians / 2), 0, 0, cos(radians / 2) ).
+inline const Quat_ref Quat_ref::rotationX( float radians )
+{
+    const float halfAngle = ( radians * 0.5f );
+    const float sinHalf = sinf( halfAngle );
+    const float cosHalf = cosf( halfAngle );
+    return Quat_ref( sinHalf, 0.0f, 0.0f, cosHalf );
+}
+
+// Quaternion ( 0, sin(radians / 2), 0, cos(radians / 2) ).
+inline const Quat_ref Quat_ref::rotationY( float radians )
+{
+    const float halfAngle = ( radians * 0.5f );
+    const float sinHalf = sinf( halfAngle );
+    const float cosHalf = cosf( halfAngle );
+    return Quat_ref( 0.0f, sinHalf, 0.0f, cosHalf );
+}
+
+// Quaternion ( 0, 0, sin(radians / 2), cos(radians / 2) ).
+inline const Quat_ref Quat_ref::rotationZ( float radians )
+{
+    const float halfAngle = ( radians * 0.5f );
+    const float sinHalf = sinf( halfAngle );
+    const float cosHalf = cosf( halfAngle );
+    return Quat_ref( 0.0f, 0.0f, sinHalf, cosHalf );
+}
+
+// Quaternion product (*this) * quat, written out per component.
+inline const Quat_ref Quat_ref::operator *( const Quat_ref & quat ) const
+{
+    const float prodX = ( ( ( ( mW * quat.mX ) + ( mX * quat.mW ) ) + ( mY * quat.mZ ) ) - ( mZ * quat.mY ) );
+    const float prodY = ( ( ( ( mW * quat.mY ) + ( mY * quat.mW ) ) + ( mZ * quat.mX ) ) - ( mX * quat.mZ ) );
+    const float prodZ = ( ( ( ( mW * quat.mZ ) + ( mZ * quat.mW ) ) + ( mX * quat.mY ) ) - ( mY * quat.mX ) );
+    const float prodW = ( ( ( ( mW * quat.mW ) - ( mX * quat.mX ) ) - ( mY * quat.mY ) ) - ( mZ * quat.mZ ) );
+    return Quat_ref( prodX, prodY, prodZ, prodW );
+}
+
+// In-place product: replaces *this with (*this) * quat and returns *this.
+inline Quat_ref & Quat_ref::operator *=( const Quat_ref & quat )
+{
+    const Quat_ref product = ( *this * quat );
+    *this = product;
+    return *this;
+}
+
+// Applies quat to vec and returns the resulting vector.
+// Step 1: tmpX/tmpY/tmpZ are the vector part of quat * (vec, 0); tmpW is the dot
+//         product of quat's x/y/z with vec (the negated scalar part of that product).
+// Step 2: the returned components are the vector part of that intermediate
+//         multiplied by conj(quat).
+// NOTE(review): this is a pure rotation only if quat is unit length - nothing
+// here normalizes it; confirm callers guarantee that.
+inline const vmVector3 rotate( const Quat_ref & quat, const vmVector3 & vec )
+{
+    float tmpX, tmpY, tmpZ, tmpW;
+    tmpX = ( ( ( quat.getW() * vec.getX() ) + ( quat.getY() * vec.getZ() ) ) - ( quat.getZ() * vec.getY() ) );
+    tmpY = ( ( ( quat.getW() * vec.getY() ) + ( quat.getZ() * vec.getX() ) ) - ( quat.getX() * vec.getZ() ) );
+    tmpZ = ( ( ( quat.getW() * vec.getZ() ) + ( quat.getX() * vec.getY() ) ) - ( quat.getY() * vec.getX() ) );
+    tmpW = ( ( ( quat.getX() * vec.getX() ) + ( quat.getY() * vec.getY() ) ) + ( quat.getZ() * vec.getZ() ) );
+    return vmVector3(
+                   ( ( ( ( tmpW * quat.getX() ) + ( tmpX * quat.getW() ) ) - ( tmpY * quat.getZ() ) ) + ( tmpZ * quat.getY() ) ),
+                   ( ( ( ( tmpW * quat.getY() ) + ( tmpY * quat.getW() ) ) - ( tmpZ * quat.getX() ) ) + ( tmpX * quat.getZ() ) ),
+                   ( ( ( ( tmpW * quat.getZ() ) + ( tmpZ * quat.getW() ) ) - ( tmpX * quat.getY() ) ) + ( tmpY * quat.getX() ) )
+                   );
+}
+
+// Conjugate: x, y and z are negated, w is kept.
+inline const Quat_ref conj( const Quat_ref & quat )
+{
+    const float negX = -quat.getX();
+    const float negY = -quat.getY();
+    const float negZ = -quat.getZ();
+    return Quat_ref( negX, negY, negZ, quat.getW() );
+}
+
+// Returns a copy of the components of quat1 when select1 is true, otherwise of quat0.
+inline const Quat_ref select( const Quat_ref & quat0, const Quat_ref & quat1, bool select1 )
+{
+    const Quat_ref & chosen = select1 ? quat1 : quat0;
+    return Quat_ref( chosen.getX(), chosen.getY(), chosen.getZ(), chosen.getW() );
+}
+
+
+
+#define LOOPCOUNT 1000
+#define NUM_CYCLES 10000
+#define DATA_SIZE 1024
+
+// Placeholder test entry point: performs no checks and always reports success (0).
+int Test_quat_aos_neon(void)
+{
+    const int status = 0;
+    return status;
+}
+
+#endif
+
--- a/test/Bullet2/Source/Tests/Test_quat_aos_neon.h
+++ b/test/Bullet2/Source/Tests/Test_quat_aos_neon.h
@@ -0,0 +1,21 @@
+//
+//  Test_quat_aos_neon.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_quat_aos_neon_h
+#define BulletTest_Test_quat_aos_neon_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+    
+    // Test entry point with C linkage. The current implementation is an empty
+    // stub that always returns 0.
+    int Test_quat_aos_neon(void);
+    
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/test/Bullet2/Source/Tests/Test_v3cross.cpp
+++ b/test/Bullet2/Source/Tests/Test_v3cross.cpp
@@ -0,0 +1,181 @@
+//
+//  Test_v3cross.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+
+#include "Test_v3cross.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btVector3.h>
+
+// reference code for testing purposes
+static btVector3& v3cross_ref(btVector3& v1, btVector3& v2);
+
+#define LOOPCOUNT 1024
+#define NUM_CYCLES 1000
+
+// Checks btVector3::cross() against the scalar reference v3cross_ref() on one
+// random input pair, then times both implementations over LOOPCOUNT elements
+// for NUM_CYCLES cycles and logs the per-element cost.
+// Returns 0 on success, 1 if the two results disagree (or the error is NaN).
+int Test_v3cross(void)
+{
+    btVector3 v1, v2, v3;
+
+    float x,y,z,w;
+
+    // Init the data
+    x = RANDF_01;
+    y = RANDF_01;
+    z = RANDF_01;
+    w = BT_NAN;     // w channel NaN
+    v1.setValue(x,y,z);
+    v1.setW(w);
+
+    x = RANDF_01;
+    y = RANDF_01;
+    z = RANDF_01;
+    v2.setValue(x,y,z);
+    v2.setW(w);
+
+    // v3cross_ref() overwrites its first argument, so the tested call gets a copy.
+    v3 = v1;
+
+    btVector3 correct_res, test_res;
+
+    {
+        float vNaN = BT_NAN;
+        correct_res.setValue(vNaN, vNaN, vNaN);
+        test_res.setValue(vNaN, vNaN, vNaN);
+        correct_res = v3cross_ref(v1, v2);
+        test_res = v3.cross(v2);
+
+        // Written as !(err <= tol) rather than (err > tol): every ordered
+        // comparison with NaN is false, so the ">" form would accept a NaN
+        // result instead of reporting it.
+        if( !( fabs(correct_res.m_floats[0] - test_res.m_floats[0]) +
+               fabs(correct_res.m_floats[1] - test_res.m_floats[1]) +
+               fabs(correct_res.m_floats[2] - test_res.m_floats[2]) <= FLT_EPSILON * 4 ) )
+        {
+            vlog( "Error - v3cross result error! "
+                    "\ncorrect = (%10.4f, %10.4f, %10.4f) "
+                    "\ntested  = (%10.4f, %10.4f, %10.4f) \n",
+                    correct_res.m_floats[0], correct_res.m_floats[1], correct_res.m_floats[2],
+                    test_res.m_floats[0], test_res.m_floats[1], test_res.m_floats[2]);
+
+            return 1;
+        }
+    }
+
+#define DATA_SIZE LOOPCOUNT
+
+    btVector3 vec3_arr1[DATA_SIZE];
+    btVector3 vec3_arr2[DATA_SIZE];
+
+    uint64_t scalarTime;
+    uint64_t vectorTime;
+    size_t j, k;
+
+    // Time the scalar reference.
+    {
+        uint64_t startTime, bestTime, currentTime;
+
+        bestTime = -1LL;    // converts to the largest uint64_t value
+        scalarTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++)
+        {
+            // Fresh random inputs every cycle, generated outside the timed region.
+            for( k = 0; k < DATA_SIZE; k++ )
+            {
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+                vec3_arr1[k].setValue(x,y,z);
+                vec3_arr1[k].setW(w);
+
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+                vec3_arr2[k].setValue(x,y,z);
+                vec3_arr2[k].setW(w);
+            }
+
+            startTime = ReadTicks();
+            for( k = 0; k < LOOPCOUNT; k++ )
+            {
+                vec3_arr1[k] = v3cross_ref(vec3_arr1[k], vec3_arr2[k]);
+            }
+            currentTime = ReadTicks() - startTime;
+            scalarTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        // Report either the fastest cycle or the mean over all cycles.
+        if( 0 == gReportAverageTimes )
+            scalarTime = bestTime;
+        else
+            scalarTime /= NUM_CYCLES;
+    }
+
+    // Time btVector3::cross().
+    {
+        uint64_t startTime, bestTime, currentTime;
+
+        bestTime = -1LL;    // converts to the largest uint64_t value
+        vectorTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++)
+        {
+            for( k = 0; k < DATA_SIZE; k++ )
+            {
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+                vec3_arr1[k].setValue(x,y,z);
+                vec3_arr1[k].setW(w);
+
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+                vec3_arr2[k].setValue(x,y,z);
+                vec3_arr2[k].setW(w);
+            }
+
+            startTime = ReadTicks();
+            for( k = 0; k < LOOPCOUNT; k++ )
+            {
+                vec3_arr1[k] = vec3_arr1[k].cross(vec3_arr2[k]);
+            }
+            currentTime = ReadTicks() - startTime;
+            vectorTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            vectorTime = bestTime;
+        else
+            vectorTime /= NUM_CYCLES;
+    }
+
+    vlog( "Timing:\n" );
+    vlog( "     \t    scalar\t    vector\n" );
+    vlog( "    \t%10.4f\t%10.4f\n", TicksToCycles( scalarTime ) / LOOPCOUNT,
+                                    TicksToCycles( vectorTime ) / LOOPCOUNT );
+
+    return 0;
+}
+
+// Scalar reference cross product. Stores v1 x v2 in the first three floats of
+// v1 (the fourth is not written) and returns v1.
+static btVector3& v3cross_ref(btVector3& v1, btVector3& v2)
+{
+    // All three components are computed before v1 is overwritten.
+    const btScalar cx = v1.m_floats[1] * v2.m_floats[2] - v1.m_floats[2] * v2.m_floats[1];
+    const btScalar cy = v1.m_floats[2] * v2.m_floats[0] - v1.m_floats[0] * v2.m_floats[2];
+    const btScalar cz = v1.m_floats[0] * v2.m_floats[1] - v1.m_floats[1] * v2.m_floats[0];
+
+    v1.m_floats[0] = cx;
+    v1.m_floats[1] = cy;
+    v1.m_floats[2] = cz;
+
+    return v1;
+}
+
+
+#endif // BT_USE_SSE_IN_API || BT_USE_NEON
--- a/test/Bullet2/Source/Tests/Test_v3cross.h
+++ b/test/Bullet2/Source/Tests/Test_v3cross.h
@@ -0,0 +1,22 @@
+//
+//  Test_v3cross.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_v3cross_h
+#define BulletTest_Test_v3cross_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+
+// Runs the btVector3 cross-product check and timing pass.
+// Returns 0 on success, 1 if the result differs from the scalar reference.
+int Test_v3cross(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+    
+#endif
--- a/test/Bullet2/Source/Tests/Test_v3div.cpp
+++ b/test/Bullet2/Source/Tests/Test_v3div.cpp
@@ -0,0 +1,178 @@
+//
+//  Test_v3div.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+
+#include "Test_v3div.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btVector3.h>
+
+#define BT_OP(a, b)     ((a) / (b))
+// Reference code for testing purposes.
+// Scalar component-wise quotient: writes BT_OP(v1[i], v2[i]) into v0 for
+// i = 0..2 (the fourth float of v0 is not written) and returns v0.
+// The previous two-argument forward declaration was dropped: it matched no
+// definition, and this definition precedes every use in the file.
+static inline btVector3& v3div_ref(btVector3& v0, btVector3& v1, btVector3& v2)
+{
+    v0.m_floats[0] = BT_OP(v1.m_floats[0] , v2.m_floats[0]);
+    v0.m_floats[1] = BT_OP(v1.m_floats[1] , v2.m_floats[1]);
+    v0.m_floats[2] = BT_OP(v1.m_floats[2] , v2.m_floats[2]);
+
+    return v0;
+}
+
+#define LOOPCOUNT 1024
+#define NUM_CYCLES 1000
+
+// Checks btVector3 operator/ (via BT_OP) against the scalar reference
+// v3div_ref() on one random input pair, then times both implementations over
+// LOOPCOUNT elements for NUM_CYCLES cycles and logs the per-element cost.
+// Returns 0 on success, 1 if the two results disagree (or the error is NaN).
+int Test_v3div(void)
+{
+    btVector3 v1, v2, v3;
+
+    float x,y,z,w;
+
+    // Init the data
+    x = RANDF_01;
+    y = RANDF_01;
+    z = RANDF_01;
+    w = BT_NAN;     // w channel NaN
+    v1.setValue(x,y,z);
+    v1.setW(w);
+
+    x = RANDF_01;
+    y = RANDF_01;
+    z = RANDF_01;
+    v2.setValue(x,y,z);
+    v2.setW(w);
+
+    v3 = v1;
+
+    btVector3 correct_res, test_res;
+
+    {
+        float vNaN = BT_NAN;
+        correct_res.setValue(vNaN, vNaN, vNaN);
+        test_res.setValue(vNaN, vNaN, vNaN);
+        correct_res = v3div_ref(correct_res, v1, v2);
+        test_res = BT_OP(v3,v2);
+
+        // Written as !(err <= tol) rather than (err > tol): every ordered
+        // comparison with NaN is false, so the ">" form would accept a NaN
+        // result instead of reporting it.
+        if( !( fabsf(correct_res.m_floats[0] - test_res.m_floats[0]) +
+               fabsf(correct_res.m_floats[1] - test_res.m_floats[1]) +
+               fabsf(correct_res.m_floats[2] - test_res.m_floats[2]) <= FLT_EPSILON*10 ) )
+        {
+            vlog( "Error - v3div result error! "
+                    "\ncorrect = (%10.4f, %10.4f, %10.4f) "
+                    "\ntested  = (%10.4f, %10.4f, %10.4f) \n",
+                    correct_res.m_floats[0], correct_res.m_floats[1], correct_res.m_floats[2],
+                    test_res.m_floats[0], test_res.m_floats[1], test_res.m_floats[2]);
+
+            return 1;
+        }
+    }
+
+#define DATA_SIZE LOOPCOUNT
+
+    btVector3 vec3_arr0[DATA_SIZE];
+    btVector3 vec3_arr1[DATA_SIZE];
+    btVector3 vec3_arr2[DATA_SIZE];
+
+    uint64_t scalarTime;
+    uint64_t vectorTime;
+    size_t j, k;
+
+    // Time the scalar reference.
+    {
+        uint64_t startTime, bestTime, currentTime;
+        w = BT_NAN;     // w channel NaN
+
+        bestTime = -1LL;    // converts to the largest uint64_t value
+        scalarTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++)
+        {
+            // Fresh random inputs every cycle, generated outside the timed region.
+            for( k = 0; k < DATA_SIZE; k++ )
+            {
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+                vec3_arr1[k].setValue(x,y,z);
+                vec3_arr1[k].setW(w);
+
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+                vec3_arr2[k].setValue(x,y,z);
+                vec3_arr2[k].setW(w);
+            }
+
+            startTime = ReadTicks();
+            for( k = 0; k < LOOPCOUNT; k++ )
+            {
+                vec3_arr0[k] = v3div_ref(vec3_arr0[k], vec3_arr1[k], vec3_arr2[k]);
+            }
+            currentTime = ReadTicks() - startTime;
+            scalarTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        // Report either the fastest cycle or the mean over all cycles.
+        if( 0 == gReportAverageTimes )
+            scalarTime = bestTime;
+        else
+            scalarTime /= NUM_CYCLES;
+    }
+
+    // Time the btVector3 operator.
+    {
+        uint64_t startTime, bestTime, currentTime;
+
+        bestTime = -1LL;    // converts to the largest uint64_t value
+        vectorTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++)
+        {
+            for( k = 0; k < DATA_SIZE; k++ )
+            {
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+                vec3_arr1[k].setValue(x,y,z);
+                vec3_arr1[k].setW(w);
+
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+                vec3_arr2[k].setValue(x,y,z);
+                vec3_arr2[k].setW(w);
+            }
+
+            startTime = ReadTicks();
+            for( k = 0; k < LOOPCOUNT; k++ )
+            {
+                vec3_arr0[k] = BT_OP(vec3_arr1[k] , vec3_arr2[k]);
+            }
+            currentTime = ReadTicks() - startTime;
+            vectorTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            vectorTime = bestTime;
+        else
+            vectorTime /= NUM_CYCLES;
+    }
+
+    vlog( "Timing:\n" );
+    vlog( "     \t    scalar\t    vector\n" );
+    vlog( "    \t%10.4f\t%10.4f\n", TicksToCycles( scalarTime ) / LOOPCOUNT,
+                                    TicksToCycles( vectorTime ) / LOOPCOUNT );
+
+    return 0;
+}
+#endif // BT_USE_SSE_IN_API || BT_USE_NEON
--- a/test/Bullet2/Source/Tests/Test_v3div.h
+++ b/test/Bullet2/Source/Tests/Test_v3div.h
@@ -0,0 +1,22 @@
+//
+//  Test_v3div.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_v3div_h
+#define BulletTest_Test_v3div_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+
+// Runs the btVector3 component-wise division check and timing pass.
+// Returns 0 on success, 1 if the result differs from the scalar reference.
+int Test_v3div(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+    
+#endif
--- a/test/Bullet2/Source/Tests/Test_v3dot.cpp
+++ b/test/Bullet2/Source/Tests/Test_v3dot.cpp
@@ -0,0 +1,164 @@
+//
+//  Test_v3dot.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+#include "Test_v3dot.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btVector3.h>
+
+// reference code for testing purposes
+static inline 
+btScalar v3dot_ref(
+    const btVector3& v1, 
+	const btVector3& v2);
+
+#define LOOPCOUNT 1000
+#define NUM_CYCLES 10000
+
+// Checks btVector3::dot() against the scalar reference v3dot_ref() on one
+// random input pair, then times both implementations (LOOPCOUNT evaluations
+// per cycle, NUM_CYCLES cycles) and logs the per-evaluation cost.
+// Returns 0 on success, 1 if the two results disagree (or the error is NaN).
+int Test_v3dot(void)
+{
+    btVector3 v1, v2;
+
+    float x,y,z,w;
+
+    // Init the data
+    x = RANDF_01;
+    y = RANDF_01;
+    z = RANDF_01;
+    w = BT_NAN;     // w channel NaN
+    v1.setValue(x,y,z);
+    v1.setW(w);
+
+    x = RANDF_01;
+    y = RANDF_01;
+    z = RANDF_01;
+    v2.setValue(x,y,z);
+    v2.setW(w);
+
+    float correctDot0, testDot0;
+
+    {
+        correctDot0 = w;
+        testDot0 = w;
+        correctDot0 = v3dot_ref(v1, v2);
+        testDot0 = v1.dot(v2);
+
+        // Written as !(err <= tol) rather than (err > tol): every ordered
+        // comparison with NaN is false, so the ">" form would accept a NaN
+        // dot product (for example one that picked up the NaN w lanes)
+        // instead of reporting it.
+        if( !( fabsf(correctDot0 - testDot0) <= FLT_EPSILON * 4 ) )
+        {
+            vlog( "Error - v3dot result error! %f != %f \n", correctDot0, testDot0);
+
+            return 1;
+        }
+    }
+
+#define DATA_SIZE 1024
+
+    btVector3 vec3_arr1[DATA_SIZE];
+    btVector3 vec3_arr2[DATA_SIZE];
+    btScalar res_arr[DATA_SIZE];
+
+    uint64_t scalarTime;
+    uint64_t vectorTime;
+    size_t j, k;
+
+    // Inputs are generated once and shared by both timing passes.
+    for( k = 0; k < DATA_SIZE; k++ )
+    {
+        x = RANDF_01;
+        y = RANDF_01;
+        z = RANDF_01;
+        vec3_arr1[k].setValue(x,y,z);
+        vec3_arr1[k].setW(w);
+
+        x = RANDF_01;
+        y = RANDF_01;
+        z = RANDF_01;
+        vec3_arr2[k].setValue(x,y,z);
+        vec3_arr2[k].setW(w);
+
+        res_arr[k] = w;
+    }
+
+    // Time the scalar reference (loop body unrolled four times).
+    {
+        uint64_t startTime, bestTime, currentTime;
+
+        bestTime = -1LL;    // converts to the largest uint64_t value
+        scalarTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++)
+        {
+            startTime = ReadTicks();
+            for( k = 0; k+4 <= LOOPCOUNT; k+=4 )
+            {
+                // Mask keeps the index inside the DATA_SIZE-element arrays.
+                size_t k32 = (k & (DATA_SIZE-1));
+                res_arr[k32] = v3dot_ref( vec3_arr1[k32], vec3_arr2[k32]); k32++;
+                res_arr[k32] = v3dot_ref( vec3_arr1[k32], vec3_arr2[k32]); k32++;
+                res_arr[k32] = v3dot_ref( vec3_arr1[k32], vec3_arr2[k32]); k32++;
+                res_arr[k32] = v3dot_ref( vec3_arr1[k32], vec3_arr2[k32]);
+            }
+            currentTime = ReadTicks() - startTime;
+            scalarTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        // Report either the fastest cycle or the mean over all cycles.
+        if( 0 == gReportAverageTimes )
+            scalarTime = bestTime;
+        else
+            scalarTime /= NUM_CYCLES;
+    }
+
+    // Time btVector3::dot() with the same unrolling.
+    {
+        uint64_t startTime, bestTime, currentTime;
+
+        bestTime = -1LL;    // converts to the largest uint64_t value
+        vectorTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++)
+        {
+            startTime = ReadTicks();
+            for( k = 0; k+4 <= LOOPCOUNT; k+=4 )
+            {
+                size_t k32 = k & (DATA_SIZE -1);
+                res_arr[k32] = vec3_arr1[k32].dot(vec3_arr2[k32]); k32++;
+                res_arr[k32] = vec3_arr1[k32].dot(vec3_arr2[k32]); k32++;
+                res_arr[k32] = vec3_arr1[k32].dot(vec3_arr2[k32]); k32++;
+                res_arr[k32] = vec3_arr1[k32].dot(vec3_arr2[k32]);
+            }
+            currentTime = ReadTicks() - startTime;
+            vectorTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            vectorTime = bestTime;
+        else
+            vectorTime /= NUM_CYCLES;
+    }
+
+    vlog( "Timing:\n" );
+    vlog( "     \t    scalar\t    vector\n" );
+    vlog( "    \t%10.4f\t%10.4f\n", TicksToCycles( scalarTime ) / LOOPCOUNT, TicksToCycles( vectorTime ) / LOOPCOUNT );
+
+    return 0;
+}
+
+
+// Scalar reference dot product over the first three floats of each vector;
+// the fourth float is ignored.
+static btScalar v3dot_ref(const btVector3& v1,
+                        const btVector3& v2)
+{
+    const btScalar* lhs = v1.m_floats;
+    const btScalar* rhs = v2.m_floats;
+
+    return (lhs[0] * rhs[0] + lhs[1] * rhs[1] + lhs[2] * rhs[2]);
+}
+
+#endif // BT_USE_SSE_IN_API || BT_USE_NEON
--- a/test/Bullet2/Source/Tests/Test_v3dot.h
+++ b/test/Bullet2/Source/Tests/Test_v3dot.h
@@ -0,0 +1,22 @@
+//
+//  Test_v3dot.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_v3dot_h
+#define BulletTest_Test_v3dot_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+
+// Runs the btVector3 dot-product check and timing pass.
+// Returns 0 on success, 1 if the result differs from the scalar reference.
+int Test_v3dot(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+    
+#endif
--- a/test/Bullet2/Source/Tests/Test_v3interp.cpp
+++ b/test/Bullet2/Source/Tests/Test_v3interp.cpp
@@ -0,0 +1,195 @@
+//
+//  Test_v3interp.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+#include "Test_v3interp.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btVector3.h>
+
+// reference code for testing purposes
+static inline 
+btVector3& v3interp_ref(
+    btVector3& vr, 
+    btVector3& v0, 
+    btVector3& v1, 
+    btScalar& rt);
+
+#define LOOPCOUNT 1024
+#define NUM_CYCLES 1000
+
+// Checks btVector3::setInterpolate3() against the scalar reference
+// v3interp_ref() for rt = 0, 0.1, ..., 1 on random inputs, then times both
+// implementations over LOOPCOUNT elements for NUM_CYCLES cycles.
+// Returns 0 on success, 1 if any pair of results disagrees (or the error is NaN).
+int Test_v3interp(void)
+{
+    btVector3 v1, v2;
+    btScalar rt;
+
+    float x,y,z,w;
+
+    float vNaN = BT_NAN;
+    w = BT_NAN;     // w channel NaN
+
+    btVector3 correct_res, test_res;
+
+    // rt is derived from an integer step. Accumulating rt += 0.1f overshoots
+    // 1.0 after ten additions, so the accumulated loop stopped near rt = 0.9
+    // and never exercised the rt == 1 endpoint.
+    for (int step = 0; step <= 10; step++)
+    {
+        rt = btScalar(step) / btScalar(10);
+
+        correct_res.setValue(vNaN, vNaN, vNaN);
+        test_res.setValue(vNaN, vNaN, vNaN);
+
+        // Init the data
+        x = RANDF_01;
+        y = RANDF_01;
+        z = RANDF_01;
+        v1.setValue(x,y,z);
+        v1.setW(w);
+
+        x = RANDF_01;
+        y = RANDF_01;
+        z = RANDF_01;
+        v2.setValue(x,y,z);
+        v2.setW(w);
+
+        correct_res = v3interp_ref(correct_res, v1, v2, rt);
+        test_res.setInterpolate3(v1, v2, rt);
+
+        // Written as !(err <= tol) rather than (err > tol): every ordered
+        // comparison with NaN is false, so the ">" form would accept a NaN
+        // result instead of reporting it.
+        if( !( fabs(correct_res.m_floats[0] - test_res.m_floats[0]) +
+               fabs(correct_res.m_floats[1] - test_res.m_floats[1]) +
+               fabs(correct_res.m_floats[2] - test_res.m_floats[2]) <= FLT_EPSILON * 4 ) )
+        {
+            vlog( "Error - v3interp result error! "
+                    "\ncorrect = (%10.4f, %10.4f, %10.4f) "
+                    "\ntested  = (%10.4f, %10.4f, %10.4f) \n"
+                    "\n rt=%10.4f",
+                    correct_res.m_floats[0], correct_res.m_floats[1], correct_res.m_floats[2],
+                    test_res.m_floats[0], test_res.m_floats[1], test_res.m_floats[2], rt);
+
+            return 1;
+        }
+    }
+
+#define DATA_SIZE LOOPCOUNT
+
+    btVector3 vec3_arr1[DATA_SIZE];
+    btVector3 vec3_arr2[DATA_SIZE];
+    btScalar  rt_arr[DATA_SIZE];
+
+    uint64_t scalarTime;
+    uint64_t vectorTime;
+    size_t j, k;
+
+    // Time the scalar reference.
+    {
+        uint64_t startTime, bestTime, currentTime;
+
+        bestTime = -1LL;    // converts to the largest uint64_t value
+        scalarTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++)
+        {
+            // Fresh random inputs every cycle, generated outside the timed region.
+            for( k = 0; k < DATA_SIZE; k++ )
+            {
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+                vec3_arr1[k].setValue(x,y,z);
+                vec3_arr1[k].setW(w);
+
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+                vec3_arr2[k].setValue(x,y,z);
+                vec3_arr2[k].setW(w);
+
+                rt_arr[k] = RANDF_01;
+            }
+
+            startTime = ReadTicks();
+            for( k = 0; k < LOOPCOUNT; k++ )
+            {
+                v3interp_ref(vec3_arr1[k], vec3_arr1[k], vec3_arr2[k], rt_arr[k]);
+            }
+            currentTime = ReadTicks() - startTime;
+            scalarTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        // Report either the fastest cycle or the mean over all cycles.
+        if( 0 == gReportAverageTimes )
+            scalarTime = bestTime;
+        else
+            scalarTime /= NUM_CYCLES;
+    }
+
+    // Time btVector3::setInterpolate3().
+    {
+        uint64_t startTime, bestTime, currentTime;
+
+        bestTime = -1LL;    // converts to the largest uint64_t value
+        vectorTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++)
+        {
+            for( k = 0; k < DATA_SIZE; k++ )
+            {
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+                vec3_arr1[k].setValue(x,y,z);
+                vec3_arr1[k].setW(w);
+
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+                vec3_arr2[k].setValue(x,y,z);
+                vec3_arr2[k].setW(w);
+
+                rt_arr[k] = RANDF_01;
+            }
+
+            startTime = ReadTicks();
+            for( k = 0; k < LOOPCOUNT; k++ )
+            {
+                vec3_arr1[k].setInterpolate3(vec3_arr1[k], vec3_arr2[k], rt_arr[k]);
+            }
+            currentTime = ReadTicks() - startTime;
+            vectorTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            vectorTime = bestTime;
+        else
+            vectorTime /= NUM_CYCLES;
+    }
+
+    vlog( "Timing:\n" );
+    vlog( "     \t    scalar\t    vector\n" );
+    vlog( "    \t%10.4f\t%10.4f\n", TicksToCycles( scalarTime ) / LOOPCOUNT,
+                                    TicksToCycles( vectorTime ) / LOOPCOUNT );
+
+    return 0;
+}
+
+// Scalar reference interpolation: for each of the first three floats,
+// vr[i] = (1 - rt) * v0[i] + rt * v1[i]. The fourth float of vr is not
+// written. Returns vr.
+static btVector3&
+v3interp_ref(
+    btVector3& vr,
+    btVector3& v0,
+    btVector3& v1,
+    btScalar& rt)
+{
+    const btScalar oneMinusRt = btScalar(1.0) - rt;
+
+    // Each output float depends only on the matching input floats, so vr may
+    // be the same object as v0.
+    for (int axis = 0; axis < 3; ++axis)
+    {
+        vr.m_floats[axis] = oneMinusRt * v0.m_floats[axis] + rt * v1.m_floats[axis];
+    }
+
+    return vr;
+}
+
+#endif // BT_USE_SSE_IN_API || BT_USE_NEON
--- a/test/Bullet2/Source/Tests/Test_v3interp.h
+++ b/test/Bullet2/Source/Tests/Test_v3interp.h
@@ -0,0 +1,22 @@
+//
+//  Test_v3interp.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_v3interp_h
+#define BulletTest_Test_v3interp_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+
+// Runs the btVector3::setInterpolate3 check and timing pass.
+// Returns 0 on success, 1 if a result differs from the scalar reference.
+int Test_v3interp(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+    
+#endif
--- a/test/Bullet2/Source/Tests/Test_v3lerp.cpp
+++ b/test/Bullet2/Source/Tests/Test_v3lerp.cpp
@@ -0,0 +1,198 @@
+//
+//  Test_v3lerp.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+
+#include "Test_v3lerp.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btVector3.h>
+
+// reference code for testing purposes
+static inline 
+btVector3& 
+v3lerp_ref(
+    btVector3& vr, 
+    btVector3& v0, 
+    btVector3& v1, 
+    btScalar& rt);
+
+#define LOOPCOUNT 1024
+#define NUM_CYCLES 1000
+
+// Checks btVector3::lerp() against the scalar reference v3lerp_ref() for
+// rt = 0, 0.1, ..., 1 on random inputs, then times both implementations over
+// LOOPCOUNT elements for NUM_CYCLES cycles.
+// Returns 0 on success, 1 if any pair of results disagrees (or the error is NaN).
+int Test_v3lerp(void)
+{
+    btVector3 v1, v2;
+    btScalar rt;
+
+    float x,y,z,w;
+
+    float vNaN =BT_NAN;
+    w =BT_NAN;     // w channel NaN
+
+    btVector3 correct_res, test_res;
+
+    // rt is derived from an integer step. Accumulating rt += 0.1f overshoots
+    // 1.0 after ten additions, so the accumulated loop stopped near rt = 0.9
+    // and never exercised the rt == 1 endpoint.
+    for (int step = 0; step <= 10; step++)
+    {
+        rt = btScalar(step) / btScalar(10);
+
+        correct_res.setValue(vNaN, vNaN, vNaN);
+        test_res.setValue(vNaN, vNaN, vNaN);
+
+        // Init the data
+        x = RANDF_01;
+        y = RANDF_01;
+        z = RANDF_01;
+        v1.setValue(x,y,z);
+        v1.setW(w);
+
+        x = RANDF_01;
+        y = RANDF_01;
+        z = RANDF_01;
+        v2.setValue(x,y,z);
+        v2.setW(w);
+
+        correct_res = v3lerp_ref(correct_res, v1, v2, rt);
+        test_res = v1.lerp(v2, rt);
+
+        // Written as !(err <= tol) rather than (err > tol): every ordered
+        // comparison with NaN is false, so the ">" form would accept a NaN
+        // result instead of reporting it.
+        if( !( fabs(correct_res.m_floats[0] - test_res.m_floats[0]) +
+               fabs(correct_res.m_floats[1] - test_res.m_floats[1]) +
+               fabs(correct_res.m_floats[2] - test_res.m_floats[2]) <= FLT_EPSILON * 4 ) )
+        {
+            vlog( "Error - v3lerp result error! "
+                    "\ncorrect = (%10.4f, %10.4f, %10.4f) "
+                    "\ntested  = (%10.4f, %10.4f, %10.4f) \n"
+                    "\n rt=%10.4f",
+                    correct_res.m_floats[0], correct_res.m_floats[1], correct_res.m_floats[2],
+                    test_res.m_floats[0], test_res.m_floats[1], test_res.m_floats[2], rt);
+
+            return 1;
+        }
+    }
+
+#define DATA_SIZE LOOPCOUNT
+
+    btVector3 vec3_arr1[DATA_SIZE];
+    btVector3 vec3_arr2[DATA_SIZE];
+    btScalar  rt_arr[DATA_SIZE];
+
+    uint64_t scalarTime;
+    uint64_t vectorTime;
+    size_t j, k;
+
+    // Time the scalar reference.
+    {
+        uint64_t startTime, bestTime, currentTime;
+        w =BT_NAN;     // w channel NaN
+
+        bestTime = -1LL;    // converts to the largest uint64_t value
+        scalarTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++)
+        {
+            // Fresh random inputs every cycle, generated outside the timed region.
+            for( k = 0; k < DATA_SIZE; k++ )
+            {
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+                vec3_arr1[k].setValue(x,y,z);
+                vec3_arr1[k].setW(w);
+
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+                vec3_arr2[k].setValue(x,y,z);
+                vec3_arr2[k].setW(w);
+
+                rt_arr[k] = RANDF_01;
+            }
+
+            startTime = ReadTicks();
+            for( k = 0; k < LOOPCOUNT; k++ )
+            {
+                v3lerp_ref(vec3_arr1[k], vec3_arr1[k], vec3_arr2[k], rt_arr[k]);
+            }
+            currentTime = ReadTicks() - startTime;
+            scalarTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        // Report either the fastest cycle or the mean over all cycles.
+        if( 0 == gReportAverageTimes )
+            scalarTime = bestTime;
+        else
+            scalarTime /= NUM_CYCLES;
+    }
+
+    // Time btVector3::lerp().
+    {
+        uint64_t startTime, bestTime, currentTime;
+
+        bestTime = -1LL;    // converts to the largest uint64_t value
+        vectorTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++)
+        {
+            for( k = 0; k < DATA_SIZE; k++ )
+            {
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+                vec3_arr1[k].setValue(x,y,z);
+                vec3_arr1[k].setW(w);
+
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+                vec3_arr2[k].setValue(x,y,z);
+                vec3_arr2[k].setW(w);
+
+                rt_arr[k] = RANDF_01;
+            }
+
+            startTime = ReadTicks();
+            for( k = 0; k < LOOPCOUNT; k++ )
+            {
+                vec3_arr1[k] = vec3_arr1[k].lerp(vec3_arr2[k], rt_arr[k]);
+            }
+            currentTime = ReadTicks() - startTime;
+            vectorTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            vectorTime = bestTime;
+        else
+            vectorTime /= NUM_CYCLES;
+    }
+
+    vlog( "Timing:\n" );
+    vlog( "     \t    scalar\t    vector\n" );
+    vlog( "    \t%10.4f\t%10.4f\n", TicksToCycles( scalarTime ) / LOOPCOUNT,
+                                    TicksToCycles( vectorTime ) / LOOPCOUNT );
+
+    return 0;
+}
+
+// Scalar reference lerp: for each of the first three floats,
+// vr[i] = v0[i] + rt * (v1[i] - v0[i]). The fourth float of vr is not
+// written. Returns vr.
+static
+btVector3&
+v3lerp_ref(
+    btVector3& vr,
+    btVector3& v0,
+    btVector3& v1,
+    btScalar& rt)
+{
+    // Each output float depends only on the matching input floats, so vr may
+    // be the same object as v0.
+    for (int axis = 0; axis < 3; ++axis)
+    {
+        vr.m_floats[axis] = v0.m_floats[axis] + rt * (v1.m_floats[axis] - v0.m_floats[axis]);
+    }
+
+    return vr;
+}
+
+#endif // BT_USE_SSE_IN_API || BT_USE_NEON
+
--- a/test/Bullet2/Source/Tests/Test_v3lerp.h
+++ b/test/Bullet2/Source/Tests/Test_v3lerp.h
@@ -0,0 +1,22 @@
+//
+//  Test_v3lerp.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_v3lerp_h
+#define BulletTest_Test_v3lerp_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+
+// Runs the btVector3::lerp check and timing pass.
+// Returns 0 on success, 1 if a result differs from the scalar reference.
+int Test_v3lerp(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+    
+#endif
--- a/test/Bullet2/Source/Tests/Test_v3norm.cpp
+++ b/test/Bullet2/Source/Tests/Test_v3norm.cpp
@@ -0,0 +1,170 @@
+//
+//  Test_v3norm.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+
+#include "Test_v3norm.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btVector3.h>
+
+// reference code for testing purposes
+static inline btVector3& v3norm_ref(btVector3& v);
+
+#define LOOPCOUNT 1024
+#define NUM_CYCLES 1000
+
+// Checks btVector3::normalize() against the scalar reference v3norm_ref() on
+// one random input, then times both implementations over LOOPCOUNT elements
+// for NUM_CYCLES cycles and logs the per-element cost.
+// Returns 0 on success, 1 if the two results disagree (or the error is NaN).
+int Test_v3norm(void)
+{
+    btVector3 v1, v2;
+
+    float x,y,z,w;
+
+    // Init the data
+    x = RANDF_01;
+    y = RANDF_01;
+    z = RANDF_01;
+    w = BT_NAN;     // w channel NaN
+    v1.setValue(x,y,z);
+    v1.setW(w);
+
+    // v3norm_ref() scales its argument in place, so the tested call gets a copy.
+    v2 = v1;
+
+    btVector3 correct_res, test_res;
+
+    {
+        float vNaN = BT_NAN;
+        correct_res.setValue(vNaN, vNaN, vNaN);
+        test_res.setValue(vNaN, vNaN, vNaN);
+        correct_res = v3norm_ref(v1);
+        test_res = v2.normalize();
+
+        // Written as !(err <= tol) rather than (err > tol): every ordered
+        // comparison with NaN is false, so the ">" form would accept a NaN
+        // result instead of reporting it.
+        if( !( fabs(correct_res.m_floats[0] - test_res.m_floats[0]) +
+               fabs(correct_res.m_floats[1] - test_res.m_floats[1]) +
+               fabs(correct_res.m_floats[2] - test_res.m_floats[2]) <= FLT_EPSILON * 4 ) )
+        {
+            vlog( "Error - v3norm result error! "
+                    "\ncorrect = (%10.4f, %10.4f, %10.4f) "
+                    "\ntested  = (%10.4f, %10.4f, %10.4f) \n",
+                    correct_res.m_floats[0], correct_res.m_floats[1], correct_res.m_floats[2],
+                    test_res.m_floats[0], test_res.m_floats[1], test_res.m_floats[2]);
+
+            return 1;
+        }
+    }
+
+#define DATA_SIZE LOOPCOUNT
+
+    btVector3 vec3_arr0[DATA_SIZE];
+    btVector3 vec3_arr1[DATA_SIZE];
+
+    uint64_t scalarTime;
+    uint64_t vectorTime;
+    size_t j, k;
+
+    // Time the scalar reference (loop body unrolled four times).
+    {
+        uint64_t startTime, bestTime, currentTime;
+
+        bestTime = -1LL;    // converts to the largest uint64_t value
+        scalarTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++)
+        {
+            // Fresh random inputs every cycle, generated outside the timed region.
+            for( k = 0; k < DATA_SIZE; k++ )
+            {
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+                vec3_arr1[k].setValue(x,y,z);
+                vec3_arr1[k].setW(w);
+            }
+
+            startTime = ReadTicks();
+            for( k = 0; k+4 <= LOOPCOUNT; k+=4 )
+            {
+                vec3_arr0[k] = v3norm_ref(vec3_arr1[k]);
+                vec3_arr0[k+1] = v3norm_ref(vec3_arr1[k+1]);
+                vec3_arr0[k+2] = v3norm_ref(vec3_arr1[k+2]);
+                vec3_arr0[k+3] = v3norm_ref(vec3_arr1[k+3]);
+            }
+            currentTime = ReadTicks() - startTime;
+            scalarTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        // Report either the fastest cycle or the mean over all cycles.
+        if( 0 == gReportAverageTimes )
+            scalarTime = bestTime;
+        else
+            scalarTime /= NUM_CYCLES;
+    }
+
+    // Time btVector3::normalize() with the same unrolling.
+    {
+        uint64_t startTime, bestTime, currentTime;
+
+        bestTime = -1LL;    // converts to the largest uint64_t value
+        vectorTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++)
+        {
+            for( k = 0; k < DATA_SIZE; k++ )
+            {
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+                vec3_arr1[k].setValue(x,y,z);
+                vec3_arr1[k].setW(w);
+            }
+
+            startTime = ReadTicks();
+            for( k = 0; k+4 <= LOOPCOUNT; k+=4 )
+            {
+                vec3_arr0[k] = vec3_arr1[k].normalize();
+                vec3_arr0[k+1] = vec3_arr1[k+1].normalize();
+                vec3_arr0[k+2] = vec3_arr1[k+2].normalize();
+                vec3_arr0[k+3] = vec3_arr1[k+3].normalize();
+            }
+            currentTime = ReadTicks() - startTime;
+            vectorTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            vectorTime = bestTime;
+        else
+            vectorTime /= NUM_CYCLES;
+    }
+
+    vlog( "Timing:\n" );
+    vlog( "     \t    scalar\t    vector\n" );
+    vlog( "    \t%10.4f\t%10.4f\n", TicksToCycles( scalarTime ) / LOOPCOUNT,
+                                    TicksToCycles( vectorTime ) / LOOPCOUNT );
+
+    return 0;
+}
+
+// Scalar reference normalize: scales the first three floats of v in place by
+// 1 / sqrt(x*x + y*y + z*z) and returns v. The fourth float is not touched and
+// a zero-length input is not guarded against.
+static btVector3& v3norm_ref(btVector3& v)
+{
+    float scale =	v.m_floats[0] * v.m_floats[0] +
+                    v.m_floats[1] * v.m_floats[1] +
+                    v.m_floats[2] * v.m_floats[2];
+
+    scale = 1.0f / sqrtf(scale);
+    for (int axis = 0; axis < 3; ++axis)
+    {
+        v.m_floats[axis] *= scale;
+    }
+
+    return v;
+}
+
+#endif // BT_USE_SSE_IN_API || BT_USE_NEON
+
--- a/test/Bullet2/Source/Tests/Test_v3norm.h
+++ b/test/Bullet2/Source/Tests/Test_v3norm.h
@@ -0,0 +1,22 @@
+//
+//  Test_v3norm.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_v3norm_h
+#define BulletTest_Test_v3norm_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+
+int Test_v3norm(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+    
+#endif
--- a/test/Bullet2/Source/Tests/Test_v3rotate.cpp
+++ b/test/Bullet2/Source/Tests/Test_v3rotate.cpp
@@ -0,0 +1,194 @@
+//
+//  Test_v3rotate.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+
+#include "Test_v3rotate.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btVector3.h>
+
+// reference code for testing purposes
+static inline 
+btVector3& v3rotate_ref(
+    btVector3& v0, 
+    btVector3& v1, 
+	const btScalar& s);
+
+#define LOOPCOUNT 2048
+#define NUM_CYCLES 1000
+
+int Test_v3rotate(void)	// returns 0 on success, 1 when rotate() and v3rotate_ref() disagree
+{
+    btVector3 v1, v2;
+	float s;
+    // v1 is the vector to rotate, v2 the axis and s the angle; all are filled with random values.
+    float x,y,z,w;
+    
+    // Init the data
+    x = RANDF_01;
+    y = RANDF_01;
+    z = RANDF_01;
+    w = BT_NAN;     // w channel NaN
+    v1.setValue(x,y,z);
+	v1.setW(w);
+
+    x = RANDF_01;
+    y = RANDF_01;
+    z = RANDF_01;
+    v2.setValue(x,y,z);
+	v2.setW(w);
+
+    s = RANDF_01 * (float) SIMD_PI;
+    
+    btVector3 correct_res, test_res;
+    // Correctness check. v3rotate_ref() stores its result back into v1, so the call order below matters.
+    {
+		float vNaN = BT_NAN;
+		correct_res.setValue(vNaN, vNaN, vNaN); 
+		test_res.setValue(vNaN, vNaN, vNaN);
+		test_res = v1.rotate(v2, s);
+        correct_res = v3rotate_ref(v1, v2, s);
+		
+		if( fabs(correct_res.m_floats[0] - test_res.m_floats[0]) + 
+			fabs(correct_res.m_floats[1] - test_res.m_floats[1]) +
+			fabs(correct_res.m_floats[2] - test_res.m_floats[2]) > FLT_EPSILON * 4)
+		{	
+			vlog( "Error - v3rotate result error! "
+					"\ncorrect = (%10.4f, %10.4f, %10.4f) "
+					"\ntested  = (%10.4f, %10.4f, %10.4f) \n", 
+					correct_res.m_floats[0], correct_res.m_floats[1], correct_res.m_floats[2], 
+					test_res.m_floats[0], test_res.m_floats[1], test_res.m_floats[2]);
+		
+			return 1;
+		}
+	}
+    // Timing: NUM_CYCLES passes over LOOPCOUNT elements, first the scalar reference, then btVector3::rotate().
+#define DATA_SIZE LOOPCOUNT
+
+	btVector3 vec3_arr0[DATA_SIZE];
+	btVector3 vec3_arr1[DATA_SIZE];
+    btScalar  s_arr[DATA_SIZE];
+
+    uint64_t scalarTime;
+    uint64_t vectorTime;
+    size_t j, k;
+
+	{
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        scalarTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+			for( k = 0; k < DATA_SIZE; k++ )
+			{
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+				vec3_arr0[k].setValue(x,y,z);
+				vec3_arr0[k].setW(w);
+
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+				vec3_arr1[k].setValue(x,y,z);
+				vec3_arr1[k].setW(w);
+				
+				s_arr[k] = RANDF_01 * (float)SIMD_PI;
+			}
+			// Only the loop between the two ReadTicks() calls is timed; the random fill above is not.
+            startTime = ReadTicks();
+            for( k = 0; k < LOOPCOUNT; k++ )
+			{
+                vec3_arr0[k] = v3rotate_ref(vec3_arr0[k], vec3_arr1[k], s_arr[k]);
+			}
+			currentTime = ReadTicks() - startTime;
+            scalarTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            scalarTime = bestTime;        
+        else
+            scalarTime /= NUM_CYCLES;
+    }
+    // Same measurement for btVector3::rotate().
+    {
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        vectorTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+			for( k = 0; k < DATA_SIZE; k++ )
+			{
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+				vec3_arr0[k].setValue(x,y,z);
+				vec3_arr0[k].setW(w);
+
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+				vec3_arr1[k].setValue(x,y,z);
+				vec3_arr1[k].setW(w);
+				
+				s_arr[k] = RANDF_01 * (float)SIMD_PI;
+			}
+
+            startTime = ReadTicks();
+            for( k = 0; k < LOOPCOUNT; k++ )
+			{
+				vec3_arr0[k  ] = vec3_arr0[k  ].rotate(vec3_arr1[k  ], s_arr[k]);
+			}
+			currentTime = ReadTicks() - startTime;
+            vectorTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            vectorTime = bestTime;        
+        else
+            vectorTime /= NUM_CYCLES;
+    }
+    // Report the per-element cost: the best pass, or the average when gReportAverageTimes is non-zero.
+    vlog( "Timing:\n" );
+    vlog( "     \t    scalar\t    vector\n" );
+    vlog( "    \t%10.4f\t%10.4f\n", TicksToCycles( scalarTime ) / LOOPCOUNT, 
+									TicksToCycles( vectorTime ) / LOOPCOUNT );
+
+    return 0;
+}
+
+// Scalar reference for btVector3::rotate(): rotates v0 about wAxis by _angle
+// (passed to cosf/sinf), stores the result back into v0 and returns v0.
+static inline btVector3&
+v3rotate_ref(
+    btVector3& v0,
+    btVector3& wAxis,
+    const btScalar& _angle)
+{
+	const btVector3 along = wAxis * wAxis.dot( v0 );	// v0 along wAxis (a true projection only for a unit axis)
+	const btVector3 rest  = v0 - along;					// what remains of v0
+	const btVector3 side  = wAxis.cross( v0 );			// wAxis x v0
+
+	// Summed left to right: (along + rest*cos) + side*sin.
+	v0 = along + rest * cosf( _angle ) + side * sinf( _angle );
+
+	return v0;
+}
+
+#endif // BT_USE_SSE_IN_API || BT_USE_NEON
--- a/test/Bullet2/Source/Tests/Test_v3rotate.h
+++ b/test/Bullet2/Source/Tests/Test_v3rotate.h
@@ -0,0 +1,22 @@
+//
+//  Test_v3rotate.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_v3rotate_h
+#define BulletTest_Test_v3rotate_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+
+int Test_v3rotate(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+    
+#endif
--- a/test/Bullet2/Source/Tests/Test_v3sdiv.cpp
+++ b/test/Bullet2/Source/Tests/Test_v3sdiv.cpp
@@ -0,0 +1,181 @@
+//
+//  Test_v3sdiv.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+
+#include "Test_v3sdiv.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btVector3.h>
+
+// reference code for testing purposes
+static inline 
+btVector3& v3sdiv_ref(
+    btVector3& v, 
+	const btScalar& s);
+
+#define LOOPCOUNT 2048
+#define NUM_CYCLES 1000
+
+int Test_v3sdiv(void)	// returns 0 on success, 1 when operator/= and v3sdiv_ref() disagree
+{
+    btVector3 v1, v2;
+	btScalar s;
+    // v1 and v2 receive the same x,y,z; v1 goes through the reference, v2 through operator/=.
+    float x,y,z,w;
+    
+    // Init the data
+    x = RANDF_01;
+    y = RANDF_01;
+    z = RANDF_01;
+    w = BT_NAN;     // w channel NaN
+    v1.setValue(x,y,z);
+	v1.setW(w);
+
+    v2.setValue(x,y,z);
+	v2.setW(w);
+
+    s = (float) RANDF_16;
+    
+    btVector3 correct_res, test_res;
+    // Correctness check: the summed absolute difference over x,y,z must stay within 4 * FLT_EPSILON.
+    {
+		float vNaN = BT_NAN;
+		correct_res.setValue(vNaN, vNaN, vNaN); 
+		test_res.setValue(vNaN, vNaN, vNaN);
+		correct_res = v3sdiv_ref(v1, s);
+		test_res = (v2 /= s);
+	   
+		if( fabs(correct_res.m_floats[0] - test_res.m_floats[0]) +  
+			fabs(correct_res.m_floats[1] - test_res.m_floats[1]) + 
+			fabs(correct_res.m_floats[2] - test_res.m_floats[2]) > FLT_EPSILON * 4)
+		{	
+			vlog( "Error - v3sdiv result error! "
+					"\ncorrect = (%10.4f, %10.4f, %10.4f) "
+					"\ntested  = (%10.4f, %10.4f, %10.4f) \n", 
+					correct_res.m_floats[0], correct_res.m_floats[1], correct_res.m_floats[2], 
+					test_res.m_floats[0], test_res.m_floats[1], test_res.m_floats[2]);
+		
+			return 1;
+		}
+	}
+    // Timing: NUM_CYCLES passes over LOOPCOUNT elements, first the scalar reference, then operator/=.
+#define DATA_SIZE LOOPCOUNT
+
+	btVector3 vec3_arr[DATA_SIZE];
+    btScalar  s_arr[DATA_SIZE];
+
+    uint64_t scalarTime;
+    uint64_t vectorTime;
+    size_t j, k;
+
+	{
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = uint64_t(-1LL);
+        scalarTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+			for( k = 0; k < DATA_SIZE; k++ )
+			{
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+				vec3_arr[k].setValue(x,y,z);
+				vec3_arr[k].setW(w);
+				// NOTE(review): a zero divisor is not excluded here - confirm that is acceptable for timing
+				s_arr[k] = RANDF_01;
+			}
+
+            startTime = ReadTicks();
+            for( k = 0; k+4 <= LOOPCOUNT; k+=4 )
+			{
+                v3sdiv_ref( vec3_arr[k], s_arr[k]);
+                v3sdiv_ref( vec3_arr[k+1], s_arr[k+1]);
+                v3sdiv_ref( vec3_arr[k+2], s_arr[k+2]);
+                v3sdiv_ref( vec3_arr[k+3], s_arr[k+3]);
+			}
+			currentTime = ReadTicks() - startTime;
+            scalarTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            scalarTime = bestTime;        
+        else
+            scalarTime /= NUM_CYCLES;
+    }
+    // Same measurement for btVector3::operator/=.
+    {
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        vectorTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+			for( k = 0; k < DATA_SIZE; k++ )
+			{
+                x = RANDF_01;
+                y = RANDF_01;
+                z = RANDF_01;
+				vec3_arr[k].setValue(x,y,z);
+				vec3_arr[k].setW(w);
+				
+				s_arr[k] = RANDF_01;
+			}
+
+            startTime = ReadTicks();
+            for( k = 0; k+4 <= LOOPCOUNT; k+=4 )
+			{
+				vec3_arr[k] /= s_arr[k];
+				vec3_arr[k+1] /= s_arr[k+1];
+				vec3_arr[k+2] /= s_arr[k+2];
+				vec3_arr[k+3] /= s_arr[k+3];
+			}
+			currentTime = ReadTicks() - startTime;
+            vectorTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            vectorTime = bestTime;        
+        else
+            vectorTime /= NUM_CYCLES;
+    }
+    // Report the per-element cost: the best pass, or the average when gReportAverageTimes is non-zero.
+    vlog( "Timing:\n" );
+    vlog( "     \t    scalar\t    vector\n" );
+    vlog( "    \t%10.4f\t%10.4f\n", TicksToCycles( scalarTime ) / LOOPCOUNT, 
+									TicksToCycles( vectorTime ) / LOOPCOUNT );
+
+    return 0;
+}
+
+// Scalar reference for btVector3::operator/=(s): multiplies x, y and z of v by
+// 1/s in place and returns v. The fourth lane (w) is not modified.
+static inline btVector3&
+v3sdiv_ref(
+    btVector3& v,
+    const btScalar& s)
+{
+	const btScalar inv = btScalar(1.0) / s;	// one divide, then three multiplies
+	btScalar* const lanes = v.m_floats;
+
+	for (int i = 0; i < 3; ++i)
+		lanes[i] *= inv;
+	return v;
+}
+
+#endif // BT_USE_SSE_IN_API || BT_USE_NEON
--- a/test/Bullet2/Source/Tests/Test_v3sdiv.h
+++ b/test/Bullet2/Source/Tests/Test_v3sdiv.h
@@ -0,0 +1,22 @@
+//
+//  Test_v3sdiv.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_v3sdiv_h
+#define BulletTest_Test_v3sdiv_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+
+int Test_v3sdiv(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+    
+#endif
--- a/test/Bullet2/Source/Tests/Test_v3skew.cpp
+++ b/test/Bullet2/Source/Tests/Test_v3skew.cpp
@@ -0,0 +1,197 @@
+//
+//  Test_v3skew.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+#include "Test_v3skew.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btVector3.h>
+
+// reference code for testing purposes
+static void 
+v3skew_ref(
+	const btVector3* v, 
+	btVector3* v1,
+	btVector3* v2,
+    btVector3* v3);
+
+#define LOOPCOUNT 2048
+#define NUM_CYCLES 10000
+
+int Test_v3skew(void)	// returns 0 on success, 1 when getSkewSymmetricMatrix() and v3skew_ref() disagree
+{
+    btVector3 v, v1, v2, v3, vt1, vt2, vt3;
+    // v is the input; v1..v3 receive the reference rows, vt1..vt3 the rows under test.
+    float x,y,z,w;
+    
+    // Init the data
+    x = RANDF_01;
+    y = RANDF_01;
+    z = RANDF_01;
+    w = BT_NAN;     // w channel NaN
+    v.setValue(x,y,z);
+	v.setW(w);
+
+    v1.setValue(w,w,w);
+	v1.setW(w);
+
+    vt3 = vt2 = vt1 = v3 = v2 = v1;
+    // Correctness check: each output row is compared with btVector3::operator==.
+    {
+		v3skew_ref(&v, &v1, &v2, &v3);
+		v.getSkewSymmetricMatrix(&vt1, &vt2, &vt3);
+	   /*
+		if( v1.m_floats[0] != vt1.m_floats[0] || 
+			v1.m_floats[1] != vt1.m_floats[1] ||
+			v1.m_floats[2] != vt1.m_floats[2] )
+		*/
+        if(!(v1 == vt1))
+        {	
+			vlog( "Error - v3skew result error! "
+					"\ncorrect v1 = (%10.4f, %10.4f, %10.4f) "
+					"\ntested  v1 = (%10.4f, %10.4f, %10.4f) \n", 
+					v1.m_floats[0], v1.m_floats[1], v1.m_floats[2], 
+					vt1.m_floats[0], vt1.m_floats[1], vt1.m_floats[2]);
+		
+			return 1;
+		}
+
+		/*
+        if( v2.m_floats[0] != vt2.m_floats[0] || 
+			v2.m_floats[1] != vt2.m_floats[1] ||
+			v2.m_floats[2] != vt2.m_floats[2] )
+		*/
+        if(!(v2 == vt2))
+        {	
+			vlog( "Error - v3skew result error! "
+					"\ncorrect v2 = (%10.4f, %10.4f, %10.4f) "
+					"\ntested  v2 = (%10.4f, %10.4f, %10.4f) \n", 
+					v2.m_floats[0], v2.m_floats[1], v2.m_floats[2], 
+					vt2.m_floats[0], vt2.m_floats[1], vt2.m_floats[2]);
+		
+			return 1;
+		}
+
+		/*
+        if( v3.m_floats[0] != vt3.m_floats[0] || 
+			v3.m_floats[1] != vt3.m_floats[1] ||
+			v3.m_floats[2] != vt3.m_floats[2] )
+		*/
+        if(!(v3 == vt3))
+        {	
+			vlog( "Error - v3skew result error! "
+					"\ncorrect v3 = (%10.4f, %10.4f, %10.4f) "
+					"\ntested  v3 = (%10.4f, %10.4f, %10.4f) \n", 
+					v3.m_floats[0], v3.m_floats[1], v3.m_floats[2], 
+					vt3.m_floats[0], vt3.m_floats[1], vt3.m_floats[2]);
+		
+			return 1;
+		}
+	}
+    // Timing data: DATA_SIZE entries, revisited LOOPCOUNT times per pass through the index mask below.
+#define DATA_SIZE 256
+
+	btVector3 v3_arr0[DATA_SIZE];
+	btVector3 v3_arr1[DATA_SIZE];
+	btVector3 v3_arr2[DATA_SIZE];
+	btVector3 v3_arr3[DATA_SIZE];
+
+    uint64_t scalarTime;
+    uint64_t vectorTime;
+    size_t j, k;
+
+	for( k = 0; k < DATA_SIZE; k++ )
+	{
+        x = RANDF_01;
+        y = RANDF_01;
+        z = RANDF_01;
+		v3_arr0[k].setValue(x,y,z);
+		v3_arr0[k].setW(w);
+
+		v3_arr1[k].setValue(w,w,w);
+		v3_arr1[k].setW(w);
+
+		v3_arr3[k] = v3_arr2[k] = v3_arr1[k];
+	}
+    // Time the scalar reference: NUM_CYCLES passes, each bracketed by ReadTicks().
+	{
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        scalarTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+            startTime = ReadTicks();
+            for( k = 0; k < LOOPCOUNT; k++ )
+			{
+				size_t k32 = (k & (DATA_SIZE-1)); 
+                v3skew_ref( &v3_arr0[k32], &v3_arr1[k32], &v3_arr2[k32], &v3_arr3[k32]); 
+			}
+			currentTime = ReadTicks() - startTime;
+            scalarTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            scalarTime = bestTime;        
+        else
+            scalarTime /= NUM_CYCLES;
+    }
+    // Same measurement for btVector3::getSkewSymmetricMatrix().
+    {
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = -1LL;
+        vectorTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+            startTime = ReadTicks();
+            for( k = 0; k < LOOPCOUNT; k++ )
+			{
+				size_t k32 = (k & (DATA_SIZE -1)); 
+                v3_arr0[k32].getSkewSymmetricMatrix(&v3_arr1[k32], &v3_arr2[k32], &v3_arr3[k32]); 
+			}
+			currentTime = ReadTicks() - startTime;
+            vectorTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            vectorTime = bestTime;        
+        else
+            vectorTime /= NUM_CYCLES;
+    }
+    // Report the per-element cost: the best pass, or the average when gReportAverageTimes is non-zero.
+    vlog( "Timing:\n" );
+    vlog( "    \t    scalar\t    vector\n" );
+    vlog( "    \t%10.4f\t%10.4f\n", TicksToCycles( scalarTime ) / LOOPCOUNT, TicksToCycles( vectorTime ) / LOOPCOUNT );
+
+    return 0;
+}
+
+
+// Scalar reference for btVector3::getSkewSymmetricMatrix(): writes the three
+// rows of the skew-symmetric (cross-product) matrix of *v into *v1, *v2, *v3.
+static void
+v3skew_ref(const btVector3* v, btVector3* v1, btVector3* v2, btVector3* v3)
+{
+    const btScalar zero = btScalar(0.);
+    //            col 0      col 1      col 2
+    v1->setValue( zero,     -v->z(),    v->y() );
+    v2->setValue( v->z(),    zero,     -v->x() );
+    v3->setValue(-v->y(),    v->x(),    zero   );
+}
+
+#endif // BT_USE_SSE_IN_API || BT_USE_NEON
--- a/test/Bullet2/Source/Tests/Test_v3skew.h
+++ b/test/Bullet2/Source/Tests/Test_v3skew.h
@@ -0,0 +1,22 @@
+//
+//  Test_v3skew.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_v3skew_h
+#define BulletTest_Test_v3skew_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+
+int Test_v3skew(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+    
+#endif
--- a/test/Bullet2/Source/Tests/Test_v3triple.cpp
+++ b/test/Bullet2/Source/Tests/Test_v3triple.cpp
@@ -0,0 +1,180 @@
+//
+//  Test_v3triple.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+
+
+#include "LinearMath/btScalar.h"
+#if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
+
+#include "Test_v3triple.h"
+#include "vector.h"
+#include "Utils.h"
+#include "main.h"
+#include <math.h>
+#include <string.h>
+
+#include <LinearMath/btVector3.h>
+
+// reference code for testing purposes
+static btScalar 
+v3triple_ref(
+	const btVector3& v, 
+	const btVector3& v1,
+	const btVector3& v2);
+
+#define LOOPCOUNT 1024
+#define NUM_CYCLES 10000
+
+int Test_v3triple(void)	// returns 0 on success, 1 when triple() and v3triple_ref() disagree
+{
+    btVector3 v1, v2, v3;
+    // Three random operands; the value under test is v1.triple(v2, v3).
+    float x,y,z,w;
+    
+    // Init the data
+    x = RANDF_01;
+    y = RANDF_01;
+    z = RANDF_01;
+    w = BT_NAN;     // w channel NaN
+    v1.setValue(x,y,z);
+	v1.setW(w);
+
+    x = RANDF_01;
+    y = RANDF_01;
+    z = RANDF_01;
+    v2.setValue(x,y,z);
+	v2.setW(w);
+
+    x = RANDF_01;
+    y = RANDF_01;
+    z = RANDF_01;
+    v3.setValue(x,y,z);
+	v3.setW(w);
+	
+    float correctTriple0, testTriple0;
+    // NOTE: testTriple0 holds the scalar reference result and correctTriple0 the triple() result; the check is symmetric.
+    {
+		correctTriple0 = w;
+		testTriple0 = w;
+		testTriple0 = v3triple_ref(v1,v2,v3);
+		correctTriple0 = v1.triple(v2, v3);
+	   
+		if( fabsf(correctTriple0 - testTriple0) > FLT_EPSILON * 4 )
+		{
+			vlog( "Error - v3triple result error! %f != %f \n", correctTriple0, testTriple0);
+		
+			return 1;
+		}
+	}
+    // Timing data: filled once with random operands, then reused by every pass.
+#define DATA_SIZE 1024
+
+	btVector3 v3_arr1[DATA_SIZE];
+	btVector3 v3_arr2[DATA_SIZE];
+	btVector3 v3_arr3[DATA_SIZE];
+    btScalar  res_arr[DATA_SIZE];
+
+    uint64_t scalarTime;
+    uint64_t vectorTime;
+    size_t j, k;
+
+	for( k = 0; k < DATA_SIZE; k++ )
+	{
+        x = RANDF_01;
+        y = RANDF_01;
+        z = RANDF_01;
+		v3_arr1[k].setValue(x,y,z);
+		v3_arr1[k].setW(w);
+
+        x = RANDF_01;
+        y = RANDF_01;
+        z = RANDF_01;
+		v3_arr2[k].setValue(x,y,z);
+		v3_arr2[k].setW(w);
+
+        x = RANDF_01;
+        y = RANDF_01;
+        z = RANDF_01;
+		v3_arr3[k].setValue(x,y,z);
+		v3_arr3[k].setW(w);
+	}
+    // Time the scalar reference: NUM_CYCLES passes, four elements per loop iteration.
+	{
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = uint64_t(-1LL);
+        scalarTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+            startTime = ReadTicks();
+            for( k = 0; k+4 <= LOOPCOUNT; k+=4 )
+			{
+				size_t k32 = (k & (DATA_SIZE-1)); 
+                res_arr[k32] = v3triple_ref( v3_arr1[k32], v3_arr2[k32], v3_arr3[k32]); k32++;
+				res_arr[k32] = v3triple_ref( v3_arr1[k32], v3_arr2[k32], v3_arr3[k32]); k32++;
+				res_arr[k32] = v3triple_ref( v3_arr1[k32], v3_arr2[k32], v3_arr3[k32]); k32++;
+				res_arr[k32] = v3triple_ref( v3_arr1[k32], v3_arr2[k32], v3_arr3[k32]); 
+			}
+			currentTime = ReadTicks() - startTime;
+            scalarTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            scalarTime = bestTime;        
+        else
+            scalarTime /= NUM_CYCLES;
+    }
+    // Same measurement for btVector3::triple().
+    {
+        uint64_t startTime, bestTime, currentTime;
+        
+        bestTime = uint64_t(-1LL);
+        vectorTime = 0;
+        for (j = 0; j < NUM_CYCLES; j++) 
+		{
+            startTime = ReadTicks();
+            for( k = 0; k+4 <= LOOPCOUNT; k+=4 )
+			{
+				size_t k32 = k & (DATA_SIZE -1); 
+                res_arr[k32] = v3_arr1[k32].triple(v3_arr2[k32], v3_arr3[k32]); k32++;
+                res_arr[k32] = v3_arr1[k32].triple(v3_arr2[k32], v3_arr3[k32]); k32++;
+                res_arr[k32] = v3_arr1[k32].triple(v3_arr2[k32], v3_arr3[k32]); k32++;
+                res_arr[k32] = v3_arr1[k32].triple(v3_arr2[k32], v3_arr3[k32]); 
+			}
+			currentTime = ReadTicks() - startTime;
+            vectorTime += currentTime;
+            if( currentTime < bestTime )
+                bestTime = currentTime;
+        }
+        if( 0 == gReportAverageTimes )
+            vectorTime = bestTime;        
+        else
+            vectorTime /= NUM_CYCLES;
+    }
+    // Report the per-element cost: the best pass, or the average when gReportAverageTimes is non-zero.
+    vlog( "Timing:\n" );
+    vlog( "     \t    scalar\t    vector\n" );
+    vlog( "    \t%10.4f\t%10.4f\n", TicksToCycles( scalarTime ) / LOOPCOUNT, TicksToCycles( vectorTime ) / LOOPCOUNT );
+
+    return 0;
+}
+
+
+// Scalar reference for btVector3::triple(): returns v . (v1 x v2) over x, y, z.
+static btScalar
+v3triple_ref(const btVector3& v, const btVector3& v1, const btVector3& v2)
+{
+	const btScalar* a = v1.m_floats;
+	const btScalar* b = v2.m_floats;
+	const btScalar cx = a[1] * b[2] - a[2] * b[1];	// (v1 x v2).x
+	const btScalar cy = a[2] * b[0] - a[0] * b[2];	// (v1 x v2).y
+	const btScalar cz = a[0] * b[1] - a[1] * b[0];	// (v1 x v2).z
+	return v.m_floats[0] * cx + v.m_floats[1] * cy + v.m_floats[2] * cz;
+}
+
+#endif // BT_USE_SSE_IN_API || BT_USE_NEON
--- a/test/Bullet2/Source/Tests/Test_v3triple.h
+++ b/test/Bullet2/Source/Tests/Test_v3triple.h
@@ -0,0 +1,22 @@
+//
+//  Test_v3triple.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Test_v3triple_h
+#define BulletTest_Test_v3triple_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+
+int Test_v3triple(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+    
+#endif
--- a/test/Bullet2/Source/Utils.cpp
+++ b/test/Bullet2/Source/Utils.cpp
@@ -0,0 +1,272 @@
+//
+//  Utils.cpp
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#include <stdio.h>
+#ifdef __APPLE__
+#include <mach/mach_time.h>
+#include <sys/sysctl.h>
+#include <sys/mman.h>
+#include <errno.h>
+#else
+#include "LinearMath/btAlignedAllocator.h"
+
+#endif //__APPLE__
+
+#include <stdlib.h>
+
+#include "Utils.h"
+
+#pragma mark Timing
+
+int gReportNanoseconds = 0;
+
+#ifdef _WIN32
+#include <intrin.h>
+uint64_t ReadTicks( void )	// _WIN32 build: the tick source is the __rdtsc() intrinsic
+{
+	 return __rdtsc();
+}
+double  TicksToCycles( uint64_t delta )	// _WIN32 build: the tick delta is returned unchanged as the cycle count
+{
+	return double(delta);
+}
+
+double  TicksToSeconds( uint64_t delta )	// NOTE(review): _WIN32 build returns the raw tick delta, not scaled to seconds - confirm no caller needs seconds
+{
+	return double(delta);
+}
+
+void *GuardCalloc( size_t count, size_t size, size_t *objectStride )	// _WIN32 build: one aligned block, no guard pages
+{
+	if (objectStride) *objectStride = size;	// objects sit back to back
+	// Return NULL instead of allocating when count * size would wrap around size_t.
+	return (size != 0 && count > (size_t) -1 / size) ? NULL : (void*) btAlignedAlloc(count * size,16);
+}
+void GuardFree( void *buf )	// _WIN32 build: hands the block from GuardCalloc back to btAlignedFree
+{
+	btAlignedFree(buf);
+}
+
+#endif
+
+
+#ifdef __APPLE__
+
+uint64_t ReadTicks( void )    // __APPLE__ build: ticks are mach_absolute_time() units
+{
+    return mach_absolute_time();
+}
+
+// Convert a ReadTicks() delta to CPU cycles, or to nanoseconds when
+// gReportNanoseconds is set or the CPU frequency cannot be queried. The scale
+// factor is computed on the first call and cached; NaN is returned if the mach
+// timebase query fails.
+double  TicksToCycles( uint64_t delta )
+{
+    static long double conversion = 0.0L;   // 0 means "not computed yet"
+    if( 0.0L != conversion )
+        return (double) (delta * conversion);
+    // ticks -> nanoseconds
+    mach_timebase_info_data_t timebase;
+    if( mach_timebase_info( &timebase ) )
+        return __builtin_nanf("");
+    conversion = (long double) timebase.numer / timebase.denom;
+    if( 0 != gReportNanoseconds )
+        vlog( "Reporting times as nanoseconds.\n" );
+    else
+    {
+        // nanoseconds -> cycles, if the maximum CPU frequency can be read
+        uint64_t hz = 0;
+        size_t hzSize = sizeof( hz );
+        int failed = sysctlbyname( "hw.cpufrequency_max", &hz, &hzSize, NULL, 0 );
+        if( !failed && 0 != hz )
+        {
+            conversion *= 1e-9L /* sec / ns */  * hz /* cycles / sec */;
+            vlog( "Reporting times as cycles. (%2.2f MHz)\n", 1e-6 * hz );
+        }
+        else
+            vlog( "Failed to get max cpu frequency. Reporting times as nanoseconds.\n" );
+    }
+    return (double) (delta * conversion);
+}
+
+// Convert a ReadTicks() delta to seconds using the mach timebase. The factor is
+// computed on the first call and cached; NaN is returned if the query fails.
+double  TicksToSeconds( uint64_t delta )
+{
+    static long double conversion = 0.0L;   // 0 means "not computed yet"
+    if( 0.0L != conversion )
+        return (double) (delta * conversion);
+
+    mach_timebase_info_data_t timebase;
+    if( mach_timebase_info( &timebase ) )
+        return __builtin_nanf("");
+    // numer/denom converts ticks to nanoseconds; the extra 1e9 turns that into seconds
+    conversion = timebase.numer / (1e9L * timebase.denom);
+    return (double) (delta * conversion);
+}
+
+
+
+#pragma mark -
+#pragma mark GuardCalloc
+
+#define kPageSize 4096
+
+
+typedef struct BufInfo   // record GuardCalloc stores immediately below the first object
+{
+    void    *head;       // start of the whole mapping (the leading guard page)
+    size_t  count;       // number of objects in the buffer
+    size_t  stride;      // object size rounded up to a page multiple, guard page excluded
+    size_t  totalSize;   // length of the mapping in bytes, as given to mmap and munmap
+}BufInfo;
+
+static int GuardMarkBuffer( void *buffer, int flag );
+
+// Allocate count objects of size bytes each. Every object is rounded up to whole
+// pages and followed by a PROT_NONE guard page; one more guard page precedes the
+// first object and hides the BufInfo record. Returns the first object, or NULL on
+// failure. *objectStride (optional) receives the object-to-object distance in
+// bytes, or 0 on failure. Free the result with GuardFree().
+void *GuardCalloc( size_t count, size_t size, size_t *objectStride )
+{
+    if( objectStride )
+        *objectStride = 0;
+
+    // Round size up to a multiple of a page size
+    size_t stride = (size + kPageSize - 1) & -kPageSize;
+    size_t slot = stride + kPageSize;   // one object plus its trailing guard page
+    // Refuse requests whose total size would wrap around size_t: an undersized
+    // mapping would let the mprotect calls below touch memory we do not own.
+    if( stride < size || slot < stride || count > ((size_t) -1 - kPageSize) / slot )
+    {
+        vlog( "GuardCalloc: request too large\n" );
+        return NULL;
+    }
+    size_t totalSize = count * slot + kPageSize;
+
+    // Allocate. Anonymous mappings conventionally pass fd -1.
+    char *buf = (char*)mmap( NULL, totalSize, PROT_READ | PROT_WRITE,
+                             MAP_ANON | MAP_SHARED, -1, 0 );
+    if( MAP_FAILED == buf )
+    {
+        vlog( "mmap failed: %d\n", errno );
+        return NULL;
+    }
+
+    // Find the first byte of user data
+    char *result = buf + kPageSize;
+
+    // Record what we did, at the tail of the leading guard page
+    BufInfo *bptr = (BufInfo*) result - 1;
+    bptr->head = buf;
+    bptr->count = count;
+    bptr->stride = stride;
+    bptr->totalSize = totalSize;
+
+    // Place the guard pages: g == 0 is the leading page (it masks the record
+    // above), g > 0 is the page that follows object g-1.
+    for( size_t g = 0; g <= count; g++ )
+    {
+        char *page = (0 == g) ? buf : result + (g - 1) * slot + stride;
+        if( mprotect(page, kPageSize, PROT_NONE) )
+        {
+            int err = errno;    // read before munmap() gets a chance to change it
+            munmap( buf, totalSize);
+            if( 0 == g ) vlog( "mprotect -1 failed: %d\n", err );
+            else         vlog( "mprotect %zu failed: %d\n", g - 1, err );
+            return NULL;
+        }
+    }
+
+    // record the stride from object to object
+    if( objectStride )
+        *objectStride = slot;
+
+    // return pointer to first object
+    return result;
+}
+
+
+// Release a GuardCalloc buffer: unlock the leading guard page, read BufInfo, unmap everything.
+void GuardFree( void *buf )
+{
+    char *guardPage = (char*) buf - kPageSize;
+    if( 0 != mprotect(guardPage, kPageSize, PROT_READ) )
+    {
+        vlog( "Unable to read buf info. GuardFree failed! %p  (%d)\n", buf, errno );
+        return;
+    }
+    const BufInfo *info = (const BufInfo*) buf - 1;
+    if( 0 != munmap( info->head, info->totalSize ) )
+        vlog( "Unable to unmap data. GuardFree failed! %p (%d)\n", buf, errno );
+}
+
+int GuardMarkReadOnly( void *buf )    // requests PROT_READ via GuardMarkBuffer; returns its result (0 on success)
+{
+    return GuardMarkBuffer(buf, PROT_READ);
+}
+
+int GuardMarkReadWrite( void *buf)    // requests PROT_READ | PROT_WRITE via GuardMarkBuffer; returns its result
+{
+    return GuardMarkBuffer(buf, PROT_READ | PROT_WRITE);
+}
+
+int GuardMarkWriteOnly( void *buf)    // requests PROT_WRITE via GuardMarkBuffer; returns its result
+{
+    return GuardMarkBuffer(buf, PROT_WRITE);
+}
+
+// Apply protection `flag` (PROT_* bits) to every object segment of a buffer
+// returned by GuardCalloc. Returns 0 on success, or the errno of the mprotect
+// call that failed. The BufInfo record is read from the end of the leading
+// guard page, which is made readable for the duration of the call.
+static int GuardMarkBuffer( void *buf, int flag )
+{
+    char *guard = (char*) buf - kPageSize;
+    if( mprotect(guard, kPageSize, PROT_READ) )
+    {
+        int err = errno;    // capture before vlog() can overwrite errno
+        vlog( "Unable to read buf info. GuardMarkBuffer %d failed! %p  (%d)\n", flag, buf, err );
+        return err;
+    }
+    const BufInfo *info = (const BufInfo*) buf - 1;
+    const size_t count = info->count;
+    const size_t stride = info->stride;
+    // Walk the objects: each is `stride` bytes followed by one guard page.
+    char *segment = (char*) buf;
+    for( size_t i = 0; i < count; i++, segment += stride + kPageSize )
+    {
+        if( 0 == mprotect(segment, stride, flag) )
+            continue;
+        int err = errno;
+        vlog( "Unable to protect segment %zu. GuardMarkBuffer %d failed! %p  (%d)\n", i, flag, buf, err );
+        return err;
+    }
+    if( 0 == mprotect(guard, kPageSize, PROT_NONE) )
+        return 0;
+    int err = errno;
+    vlog( "Unable to protect leading guard page. GuardMarkBuffer %d failed! %p  (%d)\n", flag, buf, err );
+    return err;
+}
+#endif
+
+// 32 pseudo-random bits built from two rand() draws; the high half is drawn first.
+uint32_t random_number32(void)
+{
+    const uint32_t hi = (uint32_t) rand() << 16;    // own statement: operand order of ^ is unspecified
+    return hi ^ (uint32_t) rand();
+}
+// 64 pseudo-random bits built from four rand() draws, most significant chunk first.
+uint64_t random_number64(void)
+{
+    uint64_t bits = 0;
+    for( int draw = 0; draw < 4; draw++ )   // one draw per statement keeps the order fixed
+        bits = (bits << 16) ^ (uint64_t) rand();
+    return bits;
+}
--- a/test/Bullet2/Source/Utils.h
+++ b/test/Bullet2/Source/Utils.h
@@ -0,0 +1,72 @@
+//
+//  Utils.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_Utils_h
+#define BulletTest_Utils_h
+
+#include "btIntDefines.h"
+
+
+
+#include <stddef.h>
+#include <stdio.h>
+
+#ifdef _WIN32   // powers of two come from powf() here (presumably hex float literals are unavailable)
+#define LARGE_FLOAT17 (1.f * powf(2,17))
+#define RANDF_16   (random_number32() * powf(2,-16))
+#define RANDF_01   ( random_number32() * powf(2,-32) )
+#define RANDF      ( random_number32() * powf(2,-8) )
+#define RANDF_m1p1 (2.0f*( random_number32() * powf(2,-32) )-1.0f)   // 2*r - 1, matching the #else branch
+#else
+#define LARGE_FLOAT17 (0x1.0p17f)                                 // 2^17
+#define RANDF_16   (random_number32() * 0x1.0p-16f)               // random_number32() scaled by 2^-16
+#define RANDF_01   ( random_number32() * 0x1.0p-32f )             // random_number32() scaled by 2^-32
+#define RANDF      ( random_number32() * 0x1.0p-8f )              // random_number32() scaled by 2^-8
+#define RANDF_m1p1 (2.0f*( random_number32() * 0x1.0p-32f )-1.0f) // 2 * (value scaled by 2^-32) - 1
+#endif//_WIN32
+
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+    
+    /*********************
+     *      Timing       *
+     *********************/
+    extern int gReportNanoseconds;
+
+    uint64_t ReadTicks( void );
+    double  TicksToCycles( uint64_t delta );        // Performance data should be reported in cycles most of the time.
+    double  TicksToSeconds( uint64_t delta );
+
+    
+    /*********************
+     *    Guard Heap     *
+     *********************/
+    // return buffer containing count objects of size size, with guard pages in between.
+    // The stride between one object and the next is given by objectStride. 
+    // objectStride may be NULL. Objects so created are freed with GuardFree
+    void *GuardCalloc( size_t count, size_t size, size_t *objectStride );
+    void GuardFree( void * );
+    // mark the contents of a guard buffer read-only or write-only. Return 0 on success.
+    int GuardMarkReadOnly( void *);
+    int GuardMarkWriteOnly( void *);
+    int GuardMarkReadWrite( void *);
+    
+    /*********************
+     *    Printing       *
+     *********************/
+    #define vlog( ... )        printf( __VA_ARGS__  )
+    uint32_t random_number32(void);    
+    uint64_t random_number64(void);
+    
+#ifdef __cplusplus
+    }
+#endif
+
+
+#endif
--- a/test/Bullet2/Source/btIntDefines.h
+++ b/test/Bullet2/Source/btIntDefines.h
@@ -0,0 +1,19 @@
+
+#ifndef BT_INT_DEFINES_H
+#define BT_INT_DEFINES_H
+
+#ifdef __GNUC__
+	#include <stdint.h>
+#elif defined(_MSC_VER)
+	typedef __int32 int32_t;
+	typedef __int64 int64_t;
+	typedef unsigned __int32 uint32_t;
+	typedef unsigned __int64 uint64_t;
+#else
+	typedef int int32_t;
+	typedef long long int int64_t;
+	typedef unsigned int uint32_t;
+	typedef unsigned long long int uint64_t;
+#endif
+
+#endif //BT_INT_DEFINES_H
--- a/test/Bullet2/Source/main.cpp
+++ b/test/Bullet2/Source/main.cpp
@@ -0,0 +1,326 @@
+//
+//  main.c
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+#include <stdio.h>
+#ifdef __APPLE__
+#include <libgen.h>
+#endif //__APPLE__
+
+#include <string.h>
+#include <stdlib.h>
+#include <limits.h>
+#include "main.h"
+#include "Utils.h"
+#include "TestList.h"
+#include "LinearMath/btScalar.h"
+
+#if defined (BT_USE_NEON) || defined (BT_USE_SSE_IN_API)
+
+#ifdef _WIN32
+#define strcasecmp _stricmp   // case-insensitive compare used to match test names
+#define basename(A) A         // no basename() here: the full path is used as the app name
+#endif
+
+#define EXIT_NO_ERROR INT_MIN   // ParseArgs() result meaning "stop now, but exit with status 0"
+
+//int gReportNanoseconds = 0;    // in Utils.c
+
+int gReportAverageTimes = 0;    // toggled by -a; if 0, report best times
+int gExitOnError = 0;           // toggled by -e; if non-zero, stop at the first failing test
+char *gFullPath = NULL;         // heap copy of argv[0], allocated in ParseArgs() and freed at the end of main()
+const char *gAppName = NULL;    // application name shown in the usage text; set in ParseArgs()
+int gArgc;                      // argc as passed to main(), saved by ParseArgs()
+const char **gArgv;             // argv as passed to main(), saved by ParseArgs()
+
+typedef struct TestNode
+{
+    struct TestNode *next;   // next requested test, or NULL at the end of the list
+    const char      *name;   // test name as given on the command line (points into argv, not owned)
+}TestNode;
+
+TestNode *gNodeList = NULL;  // tests requested on the command line; NULL means "run every test"
+
+static int ParseArgs( int argc, const char *argv[] );
+static void PrintUsage( void );
+static int Init( void );
+static void ListTests(void );
+
+// Name of the CPU architecture (or Windows flavor) this binary was compiled for; printed by main().
+const char *gArch = 
+#ifdef __i386__ 
+    "i386";
+#elif defined __x86_64__
+    "x86_64";
+#elif defined __arm__
+    "arm";
+#elif defined __aarch64__ || defined __arm64__
+    "arm64";
+#elif defined _WIN64
+	"win64";
+#elif defined _WIN32
+	"win32";
+#else
+    #error unknown arch
+#endif
+
+
+#include <stdio.h>
+
+int main (int argc, const char * argv[])
+{
+    // Test-harness entry point. Parses the command line, then runs either every
+    // entry in gTestList or only the tests named on the command line, printing a
+    // pass/fail line per test (with elapsed seconds on a pass) and a final summary.
+    // Returns 0 when every test passed; otherwise a non-zero code: a failing test's
+    // error, -1 for an unknown test name, or the error from ParseArgs()/Init().
+    // To run a single test without command-line arguments, put one TestNode on
+    // gNodeList before ParseArgs() is called.
+	srand(0);   // constant seed for rand(); srand() takes an unsigned int, not a float
+
+    int numPassedTests=0;
+	int numFailedTests= 0;
+
+    int err;
+    
+    // Parse arguments. Build gNodeList.
+    if( (err = ParseArgs( argc, argv ) ) )
+    {
+        if( EXIT_NO_ERROR == err )
+            return 0;
+        
+        PrintUsage();        
+        return err;
+    }
+    
+    printf("Arch: %s\n", gArch );
+    
+    if( gReportAverageTimes )
+        printf( "Reporting average times.\n" );
+    else
+        printf( "Reporting best times.\n" );
+    
+    // Set a few things up
+    if( (err = Init() ))
+    {
+        printf( "Init failed.\n" );
+        return err;
+    }
+    
+    if( NULL == gNodeList )
+    { // test everything
+        printf( "No function list found. Testing everything...\n" );
+        size_t i;
+        for( i = 0; NULL != gTestList[i].test_func; i++ )
+        {
+            printf( "\n----------------------------------------------\n" );
+            printf( "Testing %s:\n", gTestList[i].name );
+            printf( "----------------------------------------------\n" );
+            uint64_t startTime = ReadTicks();
+            int local_error = gTestList[i].test_func();
+            uint64_t currentTime = ReadTicks() - startTime;
+            if( local_error )
+            {
+				numFailedTests++;
+                printf( "*** %s test failed with error: %d\n", gTestList[i].name, local_error );
+                if( gExitOnError )
+                    return local_error;
+                if( 0 == err )
+                    err = local_error;
+            }
+            else
+			{
+				numPassedTests++;
+                printf("%s Passed.\t\t\t(%2.2gs)\n", gTestList[i].name, TicksToSeconds(currentTime));
+			}
+        }
+    }
+    else
+    { // test just the list
+        while( NULL != gNodeList )
+        {
+            TestNode *currentNode = gNodeList;
+            gNodeList = gNodeList->next;
+            
+            // Find the test with that name
+            size_t i;
+            for( i = 0; NULL != gTestList[i].test_func; i++ )
+                if( 0 == strcasecmp( currentNode->name, gTestList[i].name ) )
+                    break;
+            
+            if( NULL != gTestList[i].test_func )
+            {
+                printf( "\n----------------------------------------------\n" );
+                printf( "Testing %s:\n", gTestList[i].name );
+                printf( "----------------------------------------------\n" );
+                uint64_t startTime = ReadTicks();
+                int local_error = gTestList[i].test_func();
+                uint64_t currentTime = ReadTicks() - startTime;
+                if( local_error )
+                {
+					numFailedTests++;
+                    printf( "*** %s test failed with error: %d\n", gTestList[i].name, local_error );
+                    if( gExitOnError )
+                        return local_error;
+                    if( 0 == err )
+                        err = local_error;
+                }
+                else
+				{
+					numPassedTests++;
+                    printf("%s Passed.\t\t\t(%2.2gs)\n", gTestList[i].name, TicksToSeconds(currentTime));
+				}
+            }
+            else
+            {
+                printf( "\n***Error: Test name \"%s\" not found! Skipping.\n", currentNode->name );
+                err = -1;
+                if( gExitOnError )
+                    return -1;
+            }
+            
+            free( currentNode );
+        }
+    }
+	printf( "\n----------------------------------------------\n" );
+	printf("numPassedTests = %d, numFailedTests = %d\n",numPassedTests,numFailedTests);
+    
+    free(gFullPath);
+    return err;
+}
+
+static int Init( void )
+{
+    // One up-front call to TicksToCycles() initializes the timer
+    // before any test is timed. Always reports success (0).
+    (void) TicksToCycles( 0 );
+    return 0;
+}
+
+static int ParseArgs( int argc, const char *argv[] )
+{
+    // Parse the command line. Letters after a '-' are option flags (they may be combined,
+    // e.g. "-ae"); every other argument is a test name, queued on gNodeList in command-line order.
+    // Returns 0 to run tests, EXIT_NO_ERROR when main() should exit with status 0, or -1 on error.
+    int listTests = 0;
+    int skipremaining = 0;
+    TestNode *list = NULL;
+    gArgc = argc;
+    gArgv = argv;
+    gFullPath = (char*)malloc( strlen(argv[0]) + 1);
+    if( NULL != gFullPath )  // basename() may modify its argument, so give it a private copy
+        gAppName = basename( strcpy(gFullPath, argv[0]) );
+    if( NULL == gAppName )
+        gAppName = "<unknown app name>";
+    printf( "%s ", gAppName );
+    for( int i = 1; i < argc; i++ )  // int index: argc is signed
+    {
+        const char *arg = argv[i];
+        printf( "\t%s", arg );
+        if( arg[0] == '-' )
+        {
+            arg++;
+            while( arg[0] != '\0' )
+            {
+                int stop = 0;
+                switch( arg[0] )
+                {
+                    case 'a':
+                        gReportAverageTimes ^= 1;
+                        break;
+                    case 'e':
+                        gExitOnError ^= 1;
+                        break;
+                    case 'h':
+                        PrintUsage();
+                        return EXIT_NO_ERROR;
+                    case 'l':
+                        listTests ^= 1;  // acted on once all arguments have been parsed
+                        break;
+                    case 's':
+                        gReportNanoseconds ^= 1;
+                        break;
+                    case ' ':
+                        stop = 1;
+                        break;
+                    case 'N'://ignore the -NSDocumentRevisionsDebugMode argument from XCode 4.3.2
+                        skipremaining = 1;
+                        stop = 1;
+                        break;
+                    default:
+                        printf( "\nError: Unknown flag \'%c\'\n", arg[0] );
+                        return -1;
+                }
+                if( stop )
+                    break;
+                arg++;
+            }
+        }
+        else
+        { // add function name to the list
+            TestNode *node = (TestNode*) malloc( sizeof( TestNode ) );
+            if( NULL == node )
+                return -1;  // out of memory
+            node->name = arg;
+            node->next = list;
+            list = node;
+        }
+        if (skipremaining)
+            break;
+    }
+    
+    // reverse the list of test names, and stick on gNodeList
+    while( list )
+    {
+        TestNode *node = list;
+        TestNode *next = node->next;
+        node->next = gNodeList;
+        gNodeList = node;
+        list = next;
+    }
+    
+    printf( "\n" );
+    if( listTests )
+        ListTests();
+    return listTests ? EXIT_NO_ERROR : 0;  // after -l: list the tests, then exit without running them
+}
+
+
+static void PrintUsage( void )
+{ // Print the command-line synopsis and the meaning of each option to stdout.
+    printf("\nUsage:\n" );
+    printf("%s: <-aehls> <test names>\n", gAppName);
+    printf("Options:\n");
+    printf("\t-a\tToggle report average times vs. best times. (Default: best times)\n");
+    printf("\t-e\tToggle exit immediately on error behavior. (Default: off)\n");
+    printf("\t-h\tPrint this message.\n");
+    printf("\t-l\tToggle list available test names.  (Default: off)\n");
+    printf("\t-s\tToggle report times in cycles or nanoseconds. (Default: cycles)\n\n");
+    printf("\tOptions may be followed by one or more test names. If no test names \n" );
+    printf("\tare provided, then all tests are run.\n\n");
+}
+
+static void ListTests(void )
+{ // Print the name of every test in gTestList, comma-separated, four per line.
+    size_t i;
+    
+    printf("\nTests:\n");
+    for( i = 0; NULL != gTestList[i].test_func; i++ )
+    {
+        printf( "%19s", gTestList[i].name );
+        if( NULL != gTestList[i+1].test_func )  // separator only between entries, not after the last
+            printf( "," );
+        if( 3 == (i&3) || NULL == gTestList[i+1].test_func )  // end every 4th entry and the final row
+            printf( "\n" );
+    }
+}
+#else
+#include <stdio.h>
+int main(int argc, char* argv[])
+{	// Built without SIMD support: the harness is compiled out, so only explain why (exit status stays 0).
+	fputs("error: no SIMD enabled through BT_USE_NEON or BT_USE_SSE_IN_API \n(enable in LinearMath/btScalar.h or through build system)\n", stdout);
+	return 0;
+}
+#endif
--- a/test/Bullet2/Source/main.h
+++ b/test/Bullet2/Source/main.h
@@ -0,0 +1,25 @@
+//
+//  main.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_main_h
+#define BulletTest_main_h
+
+#ifdef __cplusplus
+extern "C" { 
+#endif
+
+    extern int gReportAverageTimes;     // if 0, report best times
+    extern int gExitOnError;            // if non-zero, exit as soon as an error is encountered
+    extern const char *gAppName;        // the name of this application
+    
+#ifdef __cplusplus
+}
+#endif
+
+
+
+#endif
--- a/test/Bullet2/Source/vector.h
+++ b/test/Bullet2/Source/vector.h
@@ -0,0 +1,70 @@
+//
+//  vector.h
+//  BulletTest
+//
+//  Copyright (c) 2011 Apple Inc.
+//
+
+#ifndef BulletTest_vector_h
+#define BulletTest_vector_h
+
+#ifdef __SSE__
+    typedef float float4            __attribute__ ((__vector_size__(16)));
+    #include <xmmintrin.h>
+#endif
+
+#ifdef __SSE2__ // 16-byte vector types (compiler vector_size extension) plus the SSE2 intrinsics header
+    typedef double double2          __attribute__ ((__vector_size__(16)));
+    typedef char char16             __attribute__ ((__vector_size__(16)));
+    typedef unsigned char uchar16   __attribute__ ((__vector_size__(16)));
+    typedef short short8            __attribute__ ((__vector_size__(16)));
+    typedef unsigned short ushort8  __attribute__ ((__vector_size__(16)));
+    typedef int int4                __attribute__ ((__vector_size__(16)));
+   // typedef unsigned int uint4      __attribute__ ((__vector_size__(16)));   // NOTE(review): disabled here but defined in the ARM branch - confirm this is intentional
+    #ifdef __LP64__ // long is 64 bits wide under LP64
+        typedef long long2              __attribute__ ((__vector_size__(16)));
+        typedef unsigned long ulong2    __attribute__ ((__vector_size__(16)));
+    #else // otherwise long long supplies the two 64-bit lanes
+        typedef long long long2         __attribute__ ((__vector_size__(16)));
+        typedef unsigned long long ulong2 __attribute__ ((__vector_size__(16)));
+    #endif
+    #include <emmintrin.h> 
+#endif
+
+#ifdef __SSE3__
+    #include <pmmintrin.h>
+#endif
+
+#ifdef __SSSE3__
+    #include <tmmintrin.h>
+#endif
+
+#ifdef __SSE4_1__
+    #include <smmintrin.h>
+#endif
+
+#ifdef __arm__ // 32-bit ARM: NEON header and the same 16-byte vector type names as the SSE branches
+    #include <arm/arch.h> // NOTE(review): presumably the platform header that defines _ARM_ARCH_7 - confirm
+    #ifdef _ARM_ARCH_7
+        #define ARM_NEON_GCC_COMPATIBILITY  1
+        #include <arm_neon.h>
+        typedef float float4            __attribute__ ((__vector_size__(16)));
+        typedef double double2          __attribute__ ((__vector_size__(16)));
+        typedef char char16             __attribute__ ((__vector_size__(16)));
+        typedef unsigned char uchar16   __attribute__ ((__vector_size__(16)));
+        typedef short short8            __attribute__ ((__vector_size__(16)));
+        typedef unsigned short ushort8  __attribute__ ((__vector_size__(16)));
+        typedef int int4                __attribute__ ((__vector_size__(16)));
+        typedef unsigned int uint4      __attribute__ ((__vector_size__(16)));
+        #ifdef __LP64__ // long is 64 bits wide under LP64
+            typedef long long2              __attribute__ ((__vector_size__(16)));
+            typedef unsigned long ulong2    __attribute__ ((__vector_size__(16)));
+        #else // otherwise long long supplies the two 64-bit lanes
+            typedef long long long2         __attribute__ ((__vector_size__(16)));
+            typedef unsigned long long ulong2 __attribute__ ((__vector_size__(16)));
+        #endif
+    #endif
+#endif
+
+
+#endif
--- a/test/Bullet2/premake4.lua
+++ b/test/Bullet2/premake4.lua
@@ -0,0 +1,23 @@
+
+project "AppUnitTest"
+-- Test application built from every .cpp/.h under Source/ and linked against the three Bullet libraries below.
+if _OPTIONS["ios"] then
+	kind "WindowedApp"
+else	
+	kind "ConsoleApp"
+end
+targetdir "bin"
+-- NOTE(review): "../src" is resolved relative to this script's directory; confirm it reaches the Bullet headers.
+includedirs {"../src","Source", "Source/Tests"}
+
+links {
+	"BulletDynamics","BulletCollision", "LinearMath"
+}
+
+language "C++"
+
+files {
+	"Source/**.cpp",
+	"Source/**.h",
+}
+