Several changes to sync Bullet trunk with PlayStation 3 spubullet version

Still needs some cross-platform fixes
2010-07-08 17:02:38 +00:00
parent 76a58e1f4e
commit fbc17731ec
63 changed files with 10363 additions and 522 deletions
--- a/src/BulletMultiThreaded/CMakeLists.txt
+++ b/src/BulletMultiThreaded/CMakeLists.txt
@@ -1,6 +1,6 @@
 INCLUDE_DIRECTORIES(
 	${BULLET_PHYSICS_SOURCE_DIR}/src
-	${BULLET_PHYSICS_SOURCE_DIR}/src/BulletMultiThreaded/vectormath/scalar/cpp
+	${VECTOR_MATH_INCLUDE}
 )

 ADD_LIBRARY(BulletMultiThreaded
@@ -37,6 +37,11 @@ ADD_LIBRARY(BulletMultiThreaded
 		SpuNarrowPhaseCollisionTask/Box.h
 		SpuNarrowPhaseCollisionTask/boxBoxDistance.cpp
 		SpuNarrowPhaseCollisionTask/boxBoxDistance.h
+#		SPURS_PEGatherScatterTask/SpuPEGatherScatterTask.cpp
+#		SPURS_PEGatherScatterTask/SpuPEGatherScatterTask.h
+#		SpuPEGatherScatterTaskProcess.cpp
+#		SpuPEGatherScatterTaskProcess.h
+		
 		SpuNarrowPhaseCollisionTask/SpuContactResult.cpp
 		SpuNarrowPhaseCollisionTask/SpuContactResult.h
 		SpuNarrowPhaseCollisionTask/SpuMinkowskiPenetrationDepthSolver.cpp
--- a/src/BulletMultiThreaded/HeapManager.h
+++ b/src/BulletMultiThreaded/HeapManager.h
@@ -0,0 +1,116 @@
+/*
+   Copyright (C) 2009 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+
+*/
+
+#ifndef __HEAP_MANAGER_H__
+#define __HEAP_MANAGER_H__
+
+#ifdef __SPU__
+	#define HEAP_STACK_SIZE 32
+#else
+	#define HEAP_STACK_SIZE 64
+#endif
+
+#define MIN_ALLOC_SIZE 16
+
+
+class HeapManager
+{
+private:
+	ATTRIBUTE_ALIGNED16(unsigned char *mHeap);
+	ATTRIBUTE_ALIGNED16(unsigned int mHeapBytes);
+	ATTRIBUTE_ALIGNED16(unsigned char *mPoolStack[HEAP_STACK_SIZE]);
+	ATTRIBUTE_ALIGNED16(unsigned int mCurStack);
+	
+public:
+	enum {ALIGN16,ALIGN128};
+
+	HeapManager(unsigned char *buf,int bytes)
+	{
+		mHeap = buf;
+		mHeapBytes = bytes;
+		clear();
+	}
+	
+	~HeapManager()
+	{
+	}
+	
+	int getAllocated()
+	{
+		return (int)(mPoolStack[mCurStack]-mHeap);
+	}
+	
+	int getRest()
+	{
+		return mHeapBytes-getAllocated();
+	}
+
+	void *allocate(size_t bytes,int alignment = ALIGN16)
+	{
+		if(bytes <= 0) bytes = MIN_ALLOC_SIZE;
+		btAssert(mCurStack < (HEAP_STACK_SIZE-1));
+
+		
+#if defined(_WIN64) || defined(__LP64__) || defined(__x86_64__)
+		unsigned long long p = (unsigned long long )mPoolStack[mCurStack];
+		if(alignment == ALIGN128) {
+			p = ((p+127) & 0xffffffffffffff80);
+			bytes = (bytes+127) & 0xffffffffffffff80;
+		}
+		else {
+			bytes = (bytes+15) & 0xfffffffffffffff0;
+		}
+
+		btAssert(bytes <=(mHeapBytes-(p-(unsigned long long )mHeap)) );
+		
+#else
+		unsigned long p = (unsigned long )mPoolStack[mCurStack];
+		if(alignment == ALIGN128) {
+			p = ((p+127) & 0xffffff80);
+			bytes = (bytes+127) & 0xffffff80;
+		}
+		else {
+			bytes = (bytes+15) & 0xfffffff0;
+		}
+		btAssert(bytes <=(mHeapBytes-(p-(unsigned long)mHeap)) );
+#endif
+		unsigned char * bla = (unsigned char *)(p + bytes);
+		mPoolStack[++mCurStack] = bla;
+		return (void*)p;
+	}
+
+	void deallocate(void *p)
+	{
+		(void) p;
+		mCurStack--;
+	}
+	
+	void clear()
+	{
+		mPoolStack[0] = mHeap;
+		mCurStack = 0;
+	}
+
+//	void printStack()
+//	{
+//		for(unsigned int i=0;i<=mCurStack;i++) {
+//			PRINTF("memStack %2d 0x%x\n",i,(uint32_t)mPoolStack[i]);
+//		}
+//	}
+
+};
+
+#endif
--- a/src/BulletMultiThreaded/MiniCL.cpp
+++ b/src/BulletMultiThreaded/MiniCL.cpp
@@ -25,8 +25,9 @@ subject to the following restrictions:
 #include "MiniCLTaskScheduler.h"
 #include "MiniCLTask/MiniCLTask.h"
 #include "LinearMath/btMinMax.h"
+#include <stdio.h>

-//#define DEBUG_MINICL_KERNELS 1
+#define DEBUG_MINICL_KERNELS 1

 static char* spPlatformID = "MiniCL, SCEA";

@@ -316,7 +317,7 @@ static void* localBufMalloc(int size)
 	if((sLocalBufUsed + size16) > LOCAL_BUF_SIZE)
 	{ // reset
 		spLocalBufCurr = sLocalMemBuf;
-		while((int)spLocalBufCurr & 0x0F) spLocalBufCurr++; // align to 16 bytes
+		while((unsigned long)spLocalBufCurr & 0x0F) spLocalBufCurr++; // align to 16 bytes
 		sLocalBufUsed = 0;
 	}
 	void* ret = spLocalBufCurr;
@@ -339,8 +340,8 @@ CL_API_ENTRY cl_int CL_API_CALL clSetKernelArg(cl_kernel    clKernel ,
 		printf("error: clSetKernelArg arg_index (%d) exceeds %d\n",arg_index,MINI_CL_MAX_ARG);
 	} else
 	{
-//		if (arg_size>=MINICL_MAX_ARGLENGTH)
-		if (arg_size != MINICL_MAX_ARGLENGTH)
+		if (arg_size>MINICL_MAX_ARGLENGTH)
+		//if (arg_size != MINICL_MAX_ARGLENGTH)
 		{
 			printf("error: clSetKernelArg argdata too large: %d (maximum is %d)\n",arg_size,MINICL_MAX_ARGLENGTH);
 		} 
@@ -559,12 +560,12 @@ clGetKernelWorkGroupInfo(cl_kernel                   kernel ,
                         size_t *                   /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0
 {
 	if((wgi == CL_KERNEL_WORK_GROUP_SIZE)
-	 &&(sz == sizeof(int))
+	 &&(sz == sizeof(size_t))
 	 &&(ptr != NULL))
 	{
 		MiniCLKernel* miniCLKernel = (MiniCLKernel*)kernel;
 		MiniCLTaskScheduler* scheduler = miniCLKernel->m_scheduler;
-		*((int*)ptr) = scheduler->getMaxNumOutstandingTasks();
+		*((size_t*)ptr) = scheduler->getMaxNumOutstandingTasks();
 		return CL_SUCCESS;
 	}
 	else
--- a/src/BulletMultiThreaded/PlatformDefinitions.h
+++ b/src/BulletMultiThreaded/PlatformDefinitions.h
@@ -2,6 +2,18 @@
 #define TYPE_DEFINITIONS_H

 ///This file provides some platform/compiler checks for common definitions
+#include "LinearMath/btScalar.h"
+#include "LinearMath/btMinMax.h"
+
+#include <vectormath_aos.h>
+typedef Vectormath::Aos::Vector3    vmVector3;
+typedef Vectormath::Aos::Quat       vmQuat;
+typedef Vectormath::Aos::Matrix3    vmMatrix3;
+typedef Vectormath::Aos::Transform3 vmTransform3;
+typedef Vectormath::Aos::Point3		vmPoint3;
+
+
+

 #ifdef _WIN32

@@ -19,9 +31,11 @@ typedef union

 		typedef unsigned char     uint8_t;
 #ifndef __PHYSICS_COMMON_H__
+#ifndef __PFX_COMMON_H__
 #ifndef __BT_SKIP_UINT64_H
 		typedef unsigned long int uint64_t;
 #endif //__BT_SKIP_UINT64_H
+#endif //__PFX_COMMON_H__
 		typedef unsigned int      uint32_t;
 #endif //__PHYSICS_COMMON_H__
 		typedef unsigned short    uint16_t;
@@ -54,26 +68,22 @@ typedef union
 #include <stdio.h>		
 #define spu_printf printf	
 #define DWORD unsigned int
-		
 			typedef union
 			{
 			  unsigned long long ull;
 			  unsigned int ui[2];
 			  void *p;
 			} addr64;
-		
-		
-#else
-
-#include <stdio.h>		
-#define spu_printf printf	
-
 #endif // USE_LIBSPE2
-	
+
 #endif	//__CELLOS_LV2__
 	
 #endif

+#ifdef __SPU__
+#include <stdio.h>		
+#define printf spu_printf
+#endif

 /* Included here because we need uint*_t typedefs */
 #include "PpuAddressSpace.h"
--- a/src/BulletMultiThreaded/PpuAddressSpace.h
+++ b/src/BulletMultiThreaded/PpuAddressSpace.h
@@ -8,8 +8,9 @@
 #pragma warning (disable: 4312)
 #endif //_WIN32

-#if defined(_WIN64) || defined(__LP64__) || defined(__x86_64__) || defined(USE_ADDR64)
-typedef uint64_t ppu_address_t;
+
+#if defined(_WIN64) || defined(__LP64__) || defined(__x86_64__)
+typedef unsigned __int64 ppu_address_t;
 #else

 typedef uint32_t ppu_address_t;
--- a/src/BulletMultiThreaded/SequentialThreadSupport.cpp
+++ b/src/BulletMultiThreaded/SequentialThreadSupport.cpp
@@ -91,3 +91,79 @@ void SequentialThreadSupport::setNumTasks(int numTasks)
 {
 	printf("SequentialThreadSupport::setNumTasks(%d) is not implemented and has no effect\n",numTasks);
 }
+
+
+
+
+class btDummyBarrier : public btBarrier
+{
+private:
+		
+public:
+	btDummyBarrier()
+	{
+	}
+	
+	virtual ~btDummyBarrier()
+	{
+	}
+	
+	void sync()
+	{
+	}
+	
+	virtual void setMaxCount(int n) {}
+	virtual int  getMaxCount() {return 1;}
+};
+
+class btDummyCriticalSection : public btCriticalSection
+{
+	
+public:
+	btDummyCriticalSection()
+	{
+	}
+	
+	virtual ~btDummyCriticalSection()
+	{
+	}
+	
+	unsigned int getSharedParam(int i)
+	{
+		btAssert(i>=0&&i<31);
+		return mCommonBuff[i+1];
+	}
+	
+	void setSharedParam(int i,unsigned int p)
+	{
+		btAssert(i>=0&&i<31);
+		mCommonBuff[i+1] = p;
+	}
+	
+	void lock()
+	{
+		mCommonBuff[0] = 1;
+	}
+	
+	void unlock()
+	{
+		mCommonBuff[0] = 0;
+	}
+};
+
+
+
+
+btBarrier*	SequentialThreadSupport::createBarrier()
+{
+	return new btDummyBarrier();
+}
+
+btCriticalSection* SequentialThreadSupport::createCriticalSection()
+{
+	return new btDummyCriticalSection();
+	
+}
+
+
+
--- a/src/BulletMultiThreaded/SequentialThreadSupport.h
+++ b/src/BulletMultiThreaded/SequentialThreadSupport.h
@@ -85,10 +85,10 @@ public:
 	{
 		return 1;
 	}
-	virtual btBarrier*	createBarrier() { return 0;}
-
-	virtual btCriticalSection* createCriticalSection()  { return 0;};
+	virtual btBarrier*	createBarrier();

+	virtual btCriticalSection* createCriticalSection();
+	

 };

--- a/src/BulletMultiThreaded/SpuFakeDma.cpp
+++ b/src/BulletMultiThreaded/SpuFakeDma.cpp
@@ -174,6 +174,9 @@ int	cellDmaGet(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid,
 {
 	char* mainMem = (char*)ea;
 	char* localStore = (char*)ls;
+
+//	printf("mainMem=%x, localStore=%x",mainMem,localStore);
+
 #ifdef USE_MEMCPY
 	memcpy(localStore,mainMem,size);
 #else
@@ -182,6 +185,7 @@ int	cellDmaGet(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid,
 		localStore[i] = mainMem[i];
 	}	
 #endif //#ifdef USE_MEMCPY
+//	printf(" finished\n");
 	return 0;
 }

--- a/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/Box.h
+++ b/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/Box.h
@@ -25,15 +25,11 @@ subject to the following restrictions:
 #include <math.h>

 ///only use a system-wide vectormath_aos.h on CELLOS_LV2 or if USE_SYSTEM_VECTORMATH
-#if defined(__CELLOS_LV2__) || defined (USE_SYSTEM_VECTORMATH)
-#include <vectormath_aos.h>
-#else
-#include "BulletMultiThreaded/vectormath/scalar/cpp/vectormath_aos.h"
-#endif
+#include "vectormath_aos.h"
+#include "../PlatformDefinitions.h"



-using namespace Vectormath::Aos;

 enum FeatureType { F, E, V };

@@ -44,21 +40,21 @@ enum FeatureType { F, E, V };
 class Box
 {
 public:
-	Vector3 half;
+	vmVector3 mHalf;

 	inline Box()
 	{}
-	inline Box(PE_REF(Vector3) half_);
+	inline Box(PE_REF(vmVector3) half_);
 	inline Box(float hx, float hy, float hz);

-	inline void Set(PE_REF(Vector3) half_);
+	inline void Set(PE_REF(vmVector3) half_);
 	inline void Set(float hx, float hy, float hz);

-	inline Vector3 GetAABB(const Matrix3& rotation) const;
+	inline vmVector3 GetAABB(const vmMatrix3& rotation) const;
 };

 inline
-Box::Box(PE_REF(Vector3) half_)
+Box::Box(PE_REF(vmVector3) half_)
 {
 	Set(half_);
 }
@@ -71,23 +67,23 @@ Box::Box(float hx, float hy, float hz)

 inline
 void
-Box::Set(PE_REF(Vector3) half_)
+Box::Set(PE_REF(vmVector3) half_)
 {
-	half = half_;
+	mHalf = half_;
 }

 inline
 void
 Box::Set(float hx, float hy, float hz)
 {
-	half = Vector3(hx, hy, hz);
+	mHalf = vmVector3(hx, hy, hz);
 }

 inline
-Vector3
-Box::GetAABB(const Matrix3& rotation) const
+vmVector3
+Box::GetAABB(const vmMatrix3& rotation) const
 {
-	return absPerElem(rotation) * half;
+	return absPerElem(rotation) * mHalf;
 }

 //-------------------------------------------------------------------------------------------------
@@ -100,7 +96,7 @@ class BoxPoint
 public:
 	BoxPoint() : localPoint(0.0f) {}

-	Point3      localPoint;
+	vmPoint3      localPoint;
 	FeatureType featureType;
 	int         featureIdx;

--- a/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuContactResult.cpp
+++ b/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuContactResult.cpp
@@ -22,7 +22,7 @@ subject to the following restrictions:
 #include <stdio.h>
 #define spu_printf printf
 #endif
-#endif DEBUG_SPU_COLLISION_DETECTION
+#endif //DEBUG_SPU_COLLISION_DETECTION

 SpuContactResult::SpuContactResult()
 {
--- a/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.cpp
+++ b/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.cpp
@@ -52,7 +52,7 @@ subject to the following restrictions:
 #ifdef __SPU__
 ///Software caching from the IBM Cell SDK, it reduces 25% SPU time for our test cases
 #ifndef USE_LIBSPE2
-#define USE_SOFTWARE_CACHE 1
+//#define USE_SOFTWARE_CACHE 1
 #endif
 #endif //__SPU__

@@ -1216,7 +1216,7 @@ void	processCollisionTask(void* userPtr, void* lsMemPtr)
 #endif											
 										)
 										{
-//#define USE_PE_BOX_BOX 1
+#define USE_PE_BOX_BOX 1
 #ifdef USE_PE_BOX_BOX
 											{

@@ -1225,38 +1225,64 @@ void	processCollisionTask(void* userPtr, void* lsMemPtr)
 												btScalar margin1 = lsMem.getlocalCollisionAlgorithm()->getCollisionMargin1();
 												btVector3 shapeDim0 = lsMem.getlocalCollisionAlgorithm()->getShapeDimensions0()+btVector3(margin0,margin0,margin0);
 												btVector3 shapeDim1 = lsMem.getlocalCollisionAlgorithm()->getShapeDimensions1()+btVector3(margin1,margin1,margin1);
+/*
+												//Box boxA(shapeDim0.getX(),shapeDim0.getY(),shapeDim0.getZ());
+												vmVector3 vmPos0 = getVmVector3(collisionPairInput.m_worldTransform0.getOrigin());
+												vmVector3 vmPos1 = getVmVector3(collisionPairInput.m_worldTransform1.getOrigin());
+												vmMatrix3 vmMatrix0 = getVmMatrix3(collisionPairInput.m_worldTransform0.getBasis());
+												vmMatrix3 vmMatrix1 = getVmMatrix3(collisionPairInput.m_worldTransform1.getBasis());

-												Box boxA(shapeDim0.getX(),shapeDim0.getY(),shapeDim0.getZ());
-												Vector3 vmPos0 = getVmVector3(collisionPairInput.m_worldTransform0.getOrigin());
-												Vector3 vmPos1 = getVmVector3(collisionPairInput.m_worldTransform1.getOrigin());
-												Matrix3 vmMatrix0 = getVmMatrix3(collisionPairInput.m_worldTransform0.getBasis());
-												Matrix3 vmMatrix1 = getVmMatrix3(collisionPairInput.m_worldTransform1.getBasis());
-
-												Transform3 transformA(vmMatrix0,vmPos0);
+												vmTransform3 transformA(vmMatrix0,vmPos0);
 												Box boxB(shapeDim1.getX(),shapeDim1.getY(),shapeDim1.getZ());
-												Transform3 transformB(vmMatrix1,vmPos1);
+												vmTransform3 transformB(vmMatrix1,vmPos1);
 												BoxPoint resultClosestBoxPointA;
 												BoxPoint resultClosestBoxPointB;
-												Vector3 resultNormal;
+												vmVector3 resultNormal;
+												*/
+
 #ifdef USE_SEPDISTANCE_UTIL
 												float distanceThreshold = FLT_MAX
 #else
-												float distanceThreshold = 0.f;
+												//float distanceThreshold = 0.f;
 #endif


-												distance = boxBoxDistance(resultNormal,resultClosestBoxPointA,resultClosestBoxPointB,  boxA, transformA, boxB,transformB,distanceThreshold);
+												vmVector3 n;
+												Box boxA;
+												vmVector3 hA(shapeDim0.getX(),shapeDim0.getY(),shapeDim0.getZ());
+												vmVector3 hB(shapeDim1.getX(),shapeDim1.getY(),shapeDim1.getZ());
+												boxA.mHalf= hA;
+												vmTransform3 trA;
+												trA.setTranslation(getVmVector3(collisionPairInput.m_worldTransform0.getOrigin()));
+												trA.setUpper3x3(getVmMatrix3(collisionPairInput.m_worldTransform0.getBasis()));
+												Box boxB;
+												boxB.mHalf = hB;
+												vmTransform3 trB;
+												trB.setTranslation(getVmVector3(collisionPairInput.m_worldTransform1.getOrigin()));
+												trB.setUpper3x3(getVmMatrix3(collisionPairInput.m_worldTransform1.getBasis()));
 												
-												normalInB = -getBtVector3(resultNormal);
+												float distanceThreshold = spuManifold->getContactBreakingThreshold();//0.001f;

-												if(distance < spuManifold->getContactBreakingThreshold())
+
+												BoxPoint ptA,ptB;
+												float dist = boxBoxDistance(n, ptA, ptB,
+														   boxA, trA, boxB,	   trB,
+															distanceThreshold );
+
+
+//												float distance = boxBoxDistance(resultNormal,resultClosestBoxPointA,resultClosestBoxPointB,  boxA, transformA, boxB,transformB,distanceThreshold);
+												
+												normalInB = -getBtVector3(n);//resultNormal);
+
+												//if(dist < distanceThreshold)//spuManifold->getContactBreakingThreshold())
+												if(dist < spuManifold->getContactBreakingThreshold())
 												{
-													btVector3 pointOnB = collisionPairInput.m_worldTransform1(getBtVector3(resultClosestBoxPointB.localPoint));
+													btVector3 pointOnB = collisionPairInput.m_worldTransform1(getBtVector3(ptB.localPoint));

 													spuContacts.addContactPoint(
 														normalInB,
 														pointOnB,
-														distance);
+														dist);
 												}
 											} 
 #else									
--- a/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/boxBoxDistance.cpp
+++ b/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/boxBoxDistance.cpp
@@ -15,7 +15,11 @@ subject to the following restrictions:
 */


-#include "Box.h"
+//#include "PfxContactBoxBox.h"
+
+#include <math.h>
+#include "../PlatformDefinitions.h"
+#include "boxBoxDistance.h"

 static inline float sqr( float a )
 {
@@ -114,18 +118,18 @@ VertexBFaceATest(
 	bool & inVoronoi,
 	float & t0,
 	float & t1,
-	const Vector3 & hA,
-	PE_REF(Vector3) faceOffsetAB,
-	PE_REF(Vector3) faceOffsetBA,
-	const Matrix3 & matrixAB,
-	const Matrix3 & matrixBA,
-	PE_REF(Vector3) signsB,
-	PE_REF(Vector3) scalesB )
+	const vmVector3 & hA,
+	PE_REF(vmVector3) faceOffsetAB,
+	PE_REF(vmVector3) faceOffsetBA,
+	const vmMatrix3 & matrixAB,
+	const vmMatrix3 & matrixBA,
+	PE_REF(vmVector3) signsB,
+	PE_REF(vmVector3) scalesB )
 {
 	// compute a corner of box B in A's coordinate system

-	Vector3 corner =
-		Vector3( faceOffsetAB + matrixAB.getCol0() * scalesB.getX() + matrixAB.getCol1() * scalesB.getY() );
+	vmVector3 corner =
+		vmVector3( faceOffsetAB + matrixAB.getCol0() * scalesB.getX() + matrixAB.getCol1() * scalesB.getY() );

 	// compute the parameters of the point on A, closest to this corner

@@ -144,8 +148,8 @@ VertexBFaceATest(
 	// do the Voronoi test: already know the point on B is in the Voronoi region of the
 	// point on A, check the reverse.

-	Vector3 facePointB =
-		Vector3( mulPerElem( faceOffsetBA + matrixBA.getCol0() * t0 + matrixBA.getCol1() * t1 - scalesB, signsB ) );
+	vmVector3 facePointB =
+		vmVector3( mulPerElem( faceOffsetBA + matrixBA.getCol0() * t0 + matrixBA.getCol1() * t1 - scalesB, signsB ) );

 	inVoronoi = ( ( facePointB[0] >= voronoiTol * facePointB[2] ) &&
 				  ( facePointB[1] >= voronoiTol * facePointB[0] ) &&
@@ -169,17 +173,17 @@ void
 VertexBFaceATests(
 	bool & done,
 	float & minDistSqr,
-	Point3 & localPointA,
-	Point3 & localPointB,
+	vmPoint3 & localPointA,
+	vmPoint3 & localPointB,
 	FeatureType & featureA,
 	FeatureType & featureB,
-	const Vector3 & hA,
-	PE_REF(Vector3) faceOffsetAB,
-	PE_REF(Vector3) faceOffsetBA,
-	const Matrix3 & matrixAB,
-	const Matrix3 & matrixBA,
-	PE_REF(Vector3) signsB,
-	PE_REF(Vector3) scalesB,
+	const vmVector3 & hA,
+	PE_REF(vmVector3) faceOffsetAB,
+	PE_REF(vmVector3) faceOffsetBA,
+	const vmMatrix3 & matrixAB,
+	const vmMatrix3 & matrixBA,
+	PE_REF(vmVector3) signsB,
+	PE_REF(vmVector3) scalesB,
 	bool first )
 {
 		
@@ -247,16 +251,16 @@ VertexAFaceBTest(
 	bool & inVoronoi,
 	float & t0,
 	float & t1,
-	const Vector3 & hB,
-	PE_REF(Vector3) faceOffsetAB,
-	PE_REF(Vector3) faceOffsetBA,
-	const Matrix3 & matrixAB,
-	const Matrix3 & matrixBA,
-	PE_REF(Vector3) signsA,
-	PE_REF(Vector3) scalesA )
+	const vmVector3 & hB,
+	PE_REF(vmVector3) faceOffsetAB,
+	PE_REF(vmVector3) faceOffsetBA,
+	const vmMatrix3 & matrixAB,
+	const vmMatrix3 & matrixBA,
+	PE_REF(vmVector3) signsA,
+	PE_REF(vmVector3) scalesA )
 {
-	Vector3 corner =
-		Vector3( faceOffsetBA + matrixBA.getCol0() * scalesA.getX() + matrixBA.getCol1() * scalesA.getY() );
+	vmVector3 corner =
+		vmVector3( faceOffsetBA + matrixBA.getCol0() * scalesA.getX() + matrixBA.getCol1() * scalesA.getY() );

 	t0 = corner[0];
 	t1 = corner[1];
@@ -270,8 +274,8 @@ VertexAFaceBTest(
 	else if ( t1 < -hB[1] )
 		t1 = -hB[1];

-	Vector3 facePointA =
-		Vector3( mulPerElem( faceOffsetAB + matrixAB.getCol0() * t0 + matrixAB.getCol1() * t1 - scalesA, signsA ) );
+	vmVector3 facePointA =
+		vmVector3( mulPerElem( faceOffsetAB + matrixAB.getCol0() * t0 + matrixAB.getCol1() * t1 - scalesA, signsA ) );

 	inVoronoi = ( ( facePointA[0] >= voronoiTol * facePointA[2] ) &&
 				  ( facePointA[1] >= voronoiTol * facePointA[0] ) &&
@@ -295,17 +299,17 @@ void
 VertexAFaceBTests(
 	bool & done,
 	float & minDistSqr,
-	Point3 & localPointA,
-	Point3 & localPointB,
+	vmPoint3 & localPointA,
+	vmPoint3 & localPointB,
 	FeatureType & featureA,
 	FeatureType & featureB,
-	const Vector3 & hB,
-	PE_REF(Vector3) faceOffsetAB,
-	PE_REF(Vector3) faceOffsetBA,
-	const Matrix3 & matrixAB,
-	const Matrix3 & matrixBA,
-	PE_REF(Vector3) signsA,
-	PE_REF(Vector3) scalesA,
+	const vmVector3 & hB,
+	PE_REF(vmVector3) faceOffsetAB,
+	PE_REF(vmVector3) faceOffsetBA,
+	const vmMatrix3 & matrixAB,
+	const vmMatrix3 & matrixBA,
+	PE_REF(vmVector3) signsA,
+	PE_REF(vmVector3) scalesA,
 	bool first )
 {
 	float t0, t1;
@@ -363,7 +367,7 @@ VertexAFaceBTests(
 }

 //-------------------------------------------------------------------------------------------------
-// EdgeEdgeTest:
+// CustomEdgeEdgeTest:
 //
 // tests whether a pair of edges are the closest features
 //
@@ -374,10 +378,10 @@ VertexAFaceBTests(
 // the dimension of the face normal is 2
 //-------------------------------------------------------------------------------------------------

-#define EdgeEdgeTest( ac, ac_letter, ad, ad_letter, bc, bc_letter, bd, bd_letter )              \
+#define CustomEdgeEdgeTest( ac, ac_letter, ad, ad_letter, bc, bc_letter, bd, bd_letter )              \
 {                                                                                               \
-   Vector3 edgeOffsetAB;                                                                          \
-   Vector3 edgeOffsetBA;                                                                          \
+   vmVector3 edgeOffsetAB;                                                                          \
+   vmVector3 edgeOffsetBA;                                                                          \
                                                                                                \
   edgeOffsetAB = faceOffsetAB + matrixAB.getCol##bc() * scalesB.get##bc_letter();            \
   edgeOffsetAB.set##ac_letter( edgeOffsetAB.get##ac_letter() - scalesA.get##ac_letter() );  \
@@ -421,8 +425,8 @@ VertexAFaceBTests(
      else if ( tA > hA[ad] ) tA = hA[ad];                                                      \
   }                                                                                            \
                                                                                                \
-   Vector3 edgeOffAB = Vector3( mulPerElem( edgeOffsetAB + matrixAB.getCol##bd() * tB, signsA ) );\
-   Vector3 edgeOffBA = Vector3( mulPerElem( edgeOffsetBA + matrixBA.getCol##ad() * tA, signsB ) );\
+   vmVector3 edgeOffAB = vmVector3( mulPerElem( edgeOffsetAB + matrixAB.getCol##bd() * tB, signsA ) );\
+   vmVector3 edgeOffBA = vmVector3( mulPerElem( edgeOffsetBA + matrixBA.getCol##ad() * tA, signsB ) );\
                                                                                                \
   inVoronoi = ( edgeOffAB[ac] >= voronoiTol * edgeOffAB[2] ) &&                                \
               ( edgeOffAB[2] >= voronoiTol * edgeOffAB[ac] ) &&                                \
@@ -436,79 +440,79 @@ VertexAFaceBTests(
 }

 float
-EdgeEdgeTest_0101(
+CustomEdgeEdgeTest_0101(
 	bool & inVoronoi,
 	float & tA,
 	float & tB,
-	const Vector3 & hA,
-	const Vector3 & hB,
-	PE_REF(Vector3) faceOffsetAB,
-	PE_REF(Vector3) faceOffsetBA,
-	const Matrix3 & matrixAB,
-	const Matrix3 & matrixBA,
-	PE_REF(Vector3) signsA,
-	PE_REF(Vector3) signsB,
-	PE_REF(Vector3) scalesA,
-	PE_REF(Vector3) scalesB )
+	const vmVector3 & hA,
+	const vmVector3 & hB,
+	PE_REF(vmVector3) faceOffsetAB,
+	PE_REF(vmVector3) faceOffsetBA,
+	const vmMatrix3 & matrixAB,
+	const vmMatrix3 & matrixBA,
+	PE_REF(vmVector3) signsA,
+	PE_REF(vmVector3) signsB,
+	PE_REF(vmVector3) scalesA,
+	PE_REF(vmVector3) scalesB )
 {
-	EdgeEdgeTest( 0, X, 1, Y, 0, X, 1, Y );
+	CustomEdgeEdgeTest( 0, X, 1, Y, 0, X, 1, Y );
 }

 float
-EdgeEdgeTest_0110(
+CustomEdgeEdgeTest_0110(
 	bool & inVoronoi,
 	float & tA,
 	float & tB,
-	const Vector3 & hA,
-	const Vector3 & hB,
-	PE_REF(Vector3) faceOffsetAB,
-	PE_REF(Vector3) faceOffsetBA,
-	const Matrix3 & matrixAB,
-	const Matrix3 & matrixBA,
-	PE_REF(Vector3) signsA,
-	PE_REF(Vector3) signsB,
-	PE_REF(Vector3) scalesA,
-	PE_REF(Vector3) scalesB )
+	const vmVector3 & hA,
+	const vmVector3 & hB,
+	PE_REF(vmVector3) faceOffsetAB,
+	PE_REF(vmVector3) faceOffsetBA,
+	const vmMatrix3 & matrixAB,
+	const vmMatrix3 & matrixBA,
+	PE_REF(vmVector3) signsA,
+	PE_REF(vmVector3) signsB,
+	PE_REF(vmVector3) scalesA,
+	PE_REF(vmVector3) scalesB )
 {
-	EdgeEdgeTest( 0, X, 1, Y, 1, Y, 0, X );
+	CustomEdgeEdgeTest( 0, X, 1, Y, 1, Y, 0, X );
 }

 float
-EdgeEdgeTest_1001(
+CustomEdgeEdgeTest_1001(
 	bool & inVoronoi,
 	float & tA,
 	float & tB,
-	const Vector3 & hA,
-	const Vector3 & hB,
-	PE_REF(Vector3) faceOffsetAB,
-	PE_REF(Vector3) faceOffsetBA,
-	const Matrix3 & matrixAB,
-	const Matrix3 & matrixBA,
-	PE_REF(Vector3) signsA,
-	PE_REF(Vector3) signsB,
-	PE_REF(Vector3) scalesA,
-	PE_REF(Vector3) scalesB )
+	const vmVector3 & hA,
+	const vmVector3 & hB,
+	PE_REF(vmVector3) faceOffsetAB,
+	PE_REF(vmVector3) faceOffsetBA,
+	const vmMatrix3 & matrixAB,
+	const vmMatrix3 & matrixBA,
+	PE_REF(vmVector3) signsA,
+	PE_REF(vmVector3) signsB,
+	PE_REF(vmVector3) scalesA,
+	PE_REF(vmVector3) scalesB )
 {
-	EdgeEdgeTest( 1, Y, 0, X, 0, X, 1, Y );
+	CustomEdgeEdgeTest( 1, Y, 0, X, 0, X, 1, Y );
 }

 float
-EdgeEdgeTest_1010(
+CustomEdgeEdgeTest_1010(
 	bool & inVoronoi,
 	float & tA,
 	float & tB,
-	const Vector3 & hA,
-	const Vector3 & hB,
-	PE_REF(Vector3) faceOffsetAB,
-	PE_REF(Vector3) faceOffsetBA,
-	const Matrix3 & matrixAB,
-	const Matrix3 & matrixBA,
-	PE_REF(Vector3) signsA,
-	PE_REF(Vector3) signsB,
-	PE_REF(Vector3) scalesA,
-	PE_REF(Vector3) scalesB )
+	const vmVector3 & hA,
+	const vmVector3 & hB,
+	PE_REF(vmVector3) faceOffsetAB,
+	PE_REF(vmVector3) faceOffsetBA,
+	const vmMatrix3 & matrixAB,
+	const vmMatrix3 & matrixBA,
+	PE_REF(vmVector3) signsA,
+	PE_REF(vmVector3) signsB,
+	PE_REF(vmVector3) scalesA,
+	PE_REF(vmVector3) scalesB )
 {
-	EdgeEdgeTest( 1, Y, 0, X, 1, Y, 0, X );
+	CustomEdgeEdgeTest( 1, Y, 0, X, 1, Y, 0, X );
 }

 #define EdgeEdge_SetNewMin( ac_letter, ad_letter, bc_letter, bd_letter )   \
@@ -528,22 +532,22 @@ void
 EdgeEdgeTests(
 	bool & done,
 	float & minDistSqr,
-	Point3 & localPointA,
-	Point3 & localPointB,
+	vmPoint3 & localPointA,
+	vmPoint3 & localPointB,
 	int & otherFaceDimA,
 	int & otherFaceDimB,
 	FeatureType & featureA,
 	FeatureType & featureB,
-	const Vector3 & hA,
-	const Vector3 & hB,
-	PE_REF(Vector3) faceOffsetAB,
-	PE_REF(Vector3) faceOffsetBA,
-	const Matrix3 & matrixAB,
-	const Matrix3 & matrixBA,
-	PE_REF(Vector3) signsA,
-	PE_REF(Vector3) signsB,
-	PE_REF(Vector3) scalesA,
-	PE_REF(Vector3) scalesB,
+	const vmVector3 & hA,
+	const vmVector3 & hB,
+	PE_REF(vmVector3) faceOffsetAB,
+	PE_REF(vmVector3) faceOffsetBA,
+	const vmMatrix3 & matrixAB,
+	const vmMatrix3 & matrixBA,
+	PE_REF(vmVector3) signsA,
+	PE_REF(vmVector3) signsB,
+	PE_REF(vmVector3) scalesA,
+	PE_REF(vmVector3) scalesB,
 	bool first )
 {

@@ -555,7 +559,7 @@ EdgeEdgeTests(
 	testOtherFaceDimA = 0;
 	testOtherFaceDimB = 0;

-	distSqr = EdgeEdgeTest_0101( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
+	distSqr = CustomEdgeEdgeTest_0101( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
 								 matrixAB, matrixBA, signsA, signsB, scalesA, scalesB );

 	if ( first ) {
@@ -572,7 +576,7 @@ EdgeEdgeTests(
 	signsA.setX( -signsA.getX() );
 	scalesA.setX( -scalesA.getX() );

-	distSqr = EdgeEdgeTest_0101( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
+	distSqr = CustomEdgeEdgeTest_0101( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
 								 matrixAB, matrixBA, signsA, signsB, scalesA, scalesB );

 	if ( distSqr < minDistSqr ) {
@@ -585,7 +589,7 @@ EdgeEdgeTests(
 	signsB.setX( -signsB.getX() );
 	scalesB.setX( -scalesB.getX() );

-	distSqr = EdgeEdgeTest_0101( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
+	distSqr = CustomEdgeEdgeTest_0101( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
 								 matrixAB, matrixBA, signsA, signsB, scalesA, scalesB );

 	if ( distSqr < minDistSqr ) {
@@ -598,7 +602,7 @@ EdgeEdgeTests(
 	signsA.setX( -signsA.getX() );
 	scalesA.setX( -scalesA.getX() );

-	distSqr = EdgeEdgeTest_0101( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
+	distSqr = CustomEdgeEdgeTest_0101( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
 								 matrixAB, matrixBA, signsA, signsB, scalesA, scalesB );

 	if ( distSqr < minDistSqr ) {
@@ -613,7 +617,7 @@ EdgeEdgeTests(
 	signsB.setX( -signsB.getX() );
 	scalesB.setX( -scalesB.getX() );

-	distSqr = EdgeEdgeTest_1001( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
+	distSqr = CustomEdgeEdgeTest_1001( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
 								 matrixAB, matrixBA, signsA, signsB, scalesA, scalesB );

 	if ( distSqr < minDistSqr ) {
@@ -626,7 +630,7 @@ EdgeEdgeTests(
 	signsA.setY( -signsA.getY() );
 	scalesA.setY( -scalesA.getY() );

-	distSqr = EdgeEdgeTest_1001( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
+	distSqr = CustomEdgeEdgeTest_1001( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
 								 matrixAB, matrixBA, signsA, signsB, scalesA, scalesB );

 	if ( distSqr < minDistSqr ) {
@@ -639,7 +643,7 @@ EdgeEdgeTests(
 	signsB.setX( -signsB.getX() );
 	scalesB.setX( -scalesB.getX() );

-	distSqr = EdgeEdgeTest_1001( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
+	distSqr = CustomEdgeEdgeTest_1001( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
 								 matrixAB, matrixBA, signsA, signsB, scalesA, scalesB );

 	if ( distSqr < minDistSqr ) {
@@ -652,7 +656,7 @@ EdgeEdgeTests(
 	signsA.setY( -signsA.getY() );
 	scalesA.setY( -scalesA.getY() );

-	distSqr = EdgeEdgeTest_1001( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
+	distSqr = CustomEdgeEdgeTest_1001( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
 								 matrixAB, matrixBA, signsA, signsB, scalesA, scalesB );

 	if ( distSqr < minDistSqr ) {
@@ -667,7 +671,7 @@ EdgeEdgeTests(
 	signsB.setX( -signsB.getX() );
 	scalesB.setX( -scalesB.getX() );

-	distSqr = EdgeEdgeTest_0110( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
+	distSqr = CustomEdgeEdgeTest_0110( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
 								 matrixAB, matrixBA, signsA, signsB, scalesA, scalesB );

 	if ( distSqr < minDistSqr ) {
@@ -680,7 +684,7 @@ EdgeEdgeTests(
 	signsA.setX( -signsA.getX() );
 	scalesA.setX( -scalesA.getX() );

-	distSqr = EdgeEdgeTest_0110( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
+	distSqr = CustomEdgeEdgeTest_0110( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
 								 matrixAB, matrixBA, signsA, signsB, scalesA, scalesB );

 	if ( distSqr < minDistSqr ) {
@@ -693,7 +697,7 @@ EdgeEdgeTests(
 	signsB.setY( -signsB.getY() );
 	scalesB.setY( -scalesB.getY() );

-	distSqr = EdgeEdgeTest_0110( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
+	distSqr = CustomEdgeEdgeTest_0110( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
 								 matrixAB, matrixBA, signsA, signsB, scalesA, scalesB );

 	if ( distSqr < minDistSqr ) {
@@ -706,7 +710,7 @@ EdgeEdgeTests(
 	signsA.setX( -signsA.getX() );
 	scalesA.setX( -scalesA.getX() );

-	distSqr = EdgeEdgeTest_0110( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
+	distSqr = CustomEdgeEdgeTest_0110( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
 								 matrixAB, matrixBA, signsA, signsB, scalesA, scalesB );

 	if ( distSqr < minDistSqr ) {
@@ -721,7 +725,7 @@ EdgeEdgeTests(
 	signsB.setY( -signsB.getY() );
 	scalesB.setY( -scalesB.getY() );

-	distSqr = EdgeEdgeTest_1010( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
+	distSqr = CustomEdgeEdgeTest_1010( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
 								 matrixAB, matrixBA, signsA, signsB, scalesA, scalesB );

 	if ( distSqr < minDistSqr ) {
@@ -734,7 +738,7 @@ EdgeEdgeTests(
 	signsA.setY( -signsA.getY() );
 	scalesA.setY( -scalesA.getY() );

-	distSqr = EdgeEdgeTest_1010( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
+	distSqr = CustomEdgeEdgeTest_1010( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
 								 matrixAB, matrixBA, signsA, signsB, scalesA, scalesB );

 	if ( distSqr < minDistSqr ) {
@@ -747,7 +751,7 @@ EdgeEdgeTests(
 	signsB.setY( -signsB.getY() );
 	scalesB.setY( -scalesB.getY() );

-	distSqr = EdgeEdgeTest_1010( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
+	distSqr = CustomEdgeEdgeTest_1010( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
 								 matrixAB, matrixBA, signsA, signsB, scalesA, scalesB );

 	if ( distSqr < minDistSqr ) {
@@ -760,7 +764,7 @@ EdgeEdgeTests(
 	signsA.setY( -signsA.getY() );
 	scalesA.setY( -scalesA.getY() );

-	distSqr = EdgeEdgeTest_1010( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
+	distSqr = CustomEdgeEdgeTest_1010( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
 								 matrixAB, matrixBA, signsA, signsB, scalesA, scalesB );

 	if ( distSqr < minDistSqr ) {
@@ -768,27 +772,25 @@ EdgeEdgeTests(
 	}
 }

+
 float
-boxBoxDistance(
-	Vector3& normal,
-	BoxPoint& boxPointA,
-	BoxPoint& boxPointB,
-	PE_REF(Box) boxA, const Transform3& transformA,
-	PE_REF(Box) boxB, const Transform3& transformB,
-	float distanceThreshold )
+boxBoxDistance(vmVector3& normal, BoxPoint& boxPointA, BoxPoint& boxPointB,
+			   PE_REF(Box) boxA, const vmTransform3 & transformA, PE_REF(Box) boxB,
+			   const vmTransform3 & transformB,
+			   float distanceThreshold)
 {
-	Matrix3 identity;
-	identity = Matrix3::identity();
-	Vector3 ident[3];
+	vmMatrix3 identity;
+	identity = vmMatrix3::identity();
+	vmVector3 ident[3];
 	ident[0] = identity.getCol0();
 	ident[1] = identity.getCol1();
 	ident[2] = identity.getCol2();

 	// get relative transformations

-	Transform3 transformAB, transformBA;
-	Matrix3 matrixAB, matrixBA;
-	Vector3 offsetAB, offsetBA;
+	vmTransform3 transformAB, transformBA;
+	vmMatrix3 matrixAB, matrixBA;
+	vmVector3 offsetAB, offsetBA;

 	transformAB = orthoInverse(transformA) * transformB;
 	transformBA = orthoInverse(transformAB);
@@ -798,25 +800,25 @@ boxBoxDistance(
 	matrixBA = transformBA.getUpper3x3();
 	offsetBA = transformBA.getTranslation();

-	Matrix3 absMatrixAB = absPerElem(matrixAB);
-	Matrix3 absMatrixBA = absPerElem(matrixBA);
+	vmMatrix3 absMatrixAB = absPerElem(matrixAB);
+	vmMatrix3 absMatrixBA = absPerElem(matrixBA);

 	// find separating axis with largest gap between projections

 	BoxSepAxisType axisType;
-	Vector3 axisA(0.0f), axisB(0.0f);
+	vmVector3 axisA(0.0f), axisB(0.0f);
 	float gap, maxGap;
 	int faceDimA = 0, faceDimB = 0, edgeDimA = 0, edgeDimB = 0;

 	// face axes

-	Vector3  gapsA   = absPerElem(offsetAB) - boxA.half - absMatrixAB * boxB.half;
+	vmVector3  gapsA   = absPerElem(offsetAB) - boxA.mHalf - absMatrixAB * boxB.mHalf;

 	AaxisTest(0,X,true);
 	AaxisTest(1,Y,false);
 	AaxisTest(2,Z,false);

-	Vector3  gapsB   = absPerElem(offsetBA) - boxB.half - absMatrixBA * boxA.half;
+	vmVector3  gapsB   = absPerElem(offsetBA) - boxB.mHalf - absMatrixBA * boxA.mHalf;

 	BaxisTest(0,X);
 	BaxisTest(1,Y);
@@ -825,10 +827,10 @@ boxBoxDistance(
 	// cross product axes

 	// ŠO<C5A0>Ï‚ª‚O‚Ì‚Æ‚«‚Ì‘Î<E28098>ô
-	absMatrixAB += Matrix3(1.0e-5f);
-	absMatrixBA += Matrix3(1.0e-5f);
+	absMatrixAB += vmMatrix3(1.0e-5f);
+	absMatrixBA += vmMatrix3(1.0e-5f);

-	Matrix3 lsqrs, projOffset, projAhalf, projBhalf;
+	vmMatrix3 lsqrs, projOffset, projAhalf, projBhalf;

 	lsqrs.setCol0( mulPerElem( matrixBA.getCol2(), matrixBA.getCol2() ) +
 				   mulPerElem( matrixBA.getCol1(), matrixBA.getCol1() ) );
@@ -841,15 +843,15 @@ boxBoxDistance(
 	projOffset.setCol1(matrixBA.getCol2() * offsetAB.getX() - matrixBA.getCol0() * offsetAB.getZ());
 	projOffset.setCol2(matrixBA.getCol0() * offsetAB.getY() - matrixBA.getCol1() * offsetAB.getX());

-	projAhalf.setCol0(absMatrixBA.getCol1() * boxA.half.getZ() + absMatrixBA.getCol2() * boxA.half.getY());
-	projAhalf.setCol1(absMatrixBA.getCol2() * boxA.half.getX() + absMatrixBA.getCol0() * boxA.half.getZ());
-	projAhalf.setCol2(absMatrixBA.getCol0() * boxA.half.getY() + absMatrixBA.getCol1() * boxA.half.getX());
+	projAhalf.setCol0(absMatrixBA.getCol1() * boxA.mHalf.getZ() + absMatrixBA.getCol2() * boxA.mHalf.getY());
+	projAhalf.setCol1(absMatrixBA.getCol2() * boxA.mHalf.getX() + absMatrixBA.getCol0() * boxA.mHalf.getZ());
+	projAhalf.setCol2(absMatrixBA.getCol0() * boxA.mHalf.getY() + absMatrixBA.getCol1() * boxA.mHalf.getX());

-	projBhalf.setCol0(absMatrixAB.getCol1() * boxB.half.getZ() + absMatrixAB.getCol2() * boxB.half.getY());
-	projBhalf.setCol1(absMatrixAB.getCol2() * boxB.half.getX() + absMatrixAB.getCol0() * boxB.half.getZ());
-	projBhalf.setCol2(absMatrixAB.getCol0() * boxB.half.getY() + absMatrixAB.getCol1() * boxB.half.getX());
+	projBhalf.setCol0(absMatrixAB.getCol1() * boxB.mHalf.getZ() + absMatrixAB.getCol2() * boxB.mHalf.getY());
+	projBhalf.setCol1(absMatrixAB.getCol2() * boxB.mHalf.getX() + absMatrixAB.getCol0() * boxB.mHalf.getZ());
+	projBhalf.setCol2(absMatrixAB.getCol0() * boxB.mHalf.getY() + absMatrixAB.getCol1() * boxB.mHalf.getX());

-	Matrix3 gapsAxB = absPerElem(projOffset) - projAhalf - transpose(projBhalf);
+	vmMatrix3 gapsAxB = absPerElem(projOffset) - projAhalf - transpose(projBhalf);

 	CrossAxisTest(0,0,X);
 	CrossAxisTest(0,1,Y);
@@ -872,7 +874,7 @@ boxBoxDistance(
 			axisA = -axisA;
 		axisB = matrixBA * -axisA;

-		Vector3 absAxisB = Vector3(absPerElem(axisB));
+		vmVector3 absAxisB = vmVector3(absPerElem(axisB));

 		if ( ( absAxisB[0] > absAxisB[1] ) && ( absAxisB[0] > absAxisB[2] ) )
 			faceDimB = 0;
@@ -885,7 +887,7 @@ boxBoxDistance(
 			axisB = -axisB;
 		axisA = matrixAB * -axisB;

-		Vector3 absAxisA = Vector3(absPerElem(axisA));
+		vmVector3 absAxisA = vmVector3(absPerElem(axisA));

 		if ( ( absAxisA[0] > absAxisA[1] ) && ( absAxisA[0] > absAxisA[2] ) )
 			faceDimA = 0;
@@ -900,8 +902,8 @@ boxBoxDistance(
 			axisA = -axisA;
 		axisB = matrixBA * -axisA;

-		Vector3 absAxisA = Vector3(absPerElem(axisA));
-		Vector3 absAxisB = Vector3(absPerElem(axisB));
+		vmVector3 absAxisA = vmVector3(absPerElem(axisA));
+		vmVector3 absAxisB = vmVector3(absPerElem(axisB));

 		dimA[1] = edgeDimA;
 		dimB[1] = edgeDimB;
@@ -966,7 +968,7 @@ boxBoxDistance(
 		dimB[1] = (faceDimB+2)%3;
 	}

-	Matrix3 aperm_col, bperm_col;
+	vmMatrix3 aperm_col, bperm_col;

 	aperm_col.setCol0(ident[dimA[0]]);
 	aperm_col.setCol1(ident[dimA[1]]);
@@ -976,32 +978,32 @@ boxBoxDistance(
 	bperm_col.setCol1(ident[dimB[1]]);
 	bperm_col.setCol2(ident[dimB[2]]);

-	Matrix3 aperm_row, bperm_row;
+	vmMatrix3 aperm_row, bperm_row;

 	aperm_row = transpose(aperm_col);
 	bperm_row = transpose(bperm_col);

 	// permute all box parameters to be in the face coordinate systems

-	Matrix3 matrixAB_perm = aperm_row * matrixAB * bperm_col;
-	Matrix3 matrixBA_perm = transpose(matrixAB_perm);
+	vmMatrix3 matrixAB_perm = aperm_row * matrixAB * bperm_col;
+	vmMatrix3 matrixBA_perm = transpose(matrixAB_perm);

-	Vector3 offsetAB_perm, offsetBA_perm;
+	vmVector3 offsetAB_perm, offsetBA_perm;

 	offsetAB_perm = aperm_row * offsetAB;
 	offsetBA_perm = bperm_row * offsetBA;

-	Vector3 halfA_perm, halfB_perm;
+	vmVector3 halfA_perm, halfB_perm;

-	halfA_perm = aperm_row * boxA.half;
-	halfB_perm = bperm_row * boxB.half;
+	halfA_perm = aperm_row * boxA.mHalf;
+	halfB_perm = bperm_row * boxB.mHalf;

 	// compute the vector between the centers of each face, in each face's coordinate frame

-	Vector3 signsA_perm, signsB_perm, scalesA_perm, scalesB_perm, faceOffsetAB_perm, faceOffsetBA_perm;
+	vmVector3 signsA_perm, signsB_perm, scalesA_perm, scalesB_perm, faceOffsetAB_perm, faceOffsetBA_perm;

-	signsA_perm = copySignPerElem(Vector3(1.0f),aperm_row * axisA);
-	signsB_perm = copySignPerElem(Vector3(1.0f),bperm_row * axisB);
+	signsA_perm = copySignPerElem(vmVector3(1.0f),aperm_row * axisA);
+	signsB_perm = copySignPerElem(vmVector3(1.0f),bperm_row * axisB);
 	scalesA_perm = mulPerElem( signsA_perm, halfA_perm );
 	scalesB_perm = mulPerElem( signsB_perm, halfB_perm );

@@ -1031,11 +1033,11 @@ boxBoxDistance(
 	// if for some reason no case passes the Voronoi test, the features with the minimum
 	// distance are returned.

-	Point3 localPointA_perm, localPointB_perm;
+	vmPoint3 localPointA_perm, localPointB_perm;
 	float minDistSqr;
 	bool done;

-	Vector3 hA_perm( halfA_perm ), hB_perm( halfB_perm );
+	vmVector3 hA_perm( halfA_perm ), hB_perm( halfB_perm );

 	localPointA_perm.setZ( scalesA_perm.getZ() );
 	localPointB_perm.setZ( scalesB_perm.getZ() );
@@ -1109,9 +1111,11 @@ boxBoxDistance(

 	// convert local points from face-local to box-local coordinate system

-	boxPointA.localPoint = Point3( aperm_col * Vector3( localPointA_perm ) );
-	boxPointB.localPoint = Point3( bperm_col * Vector3( localPointB_perm ) );
+	
+	boxPointA.localPoint = vmPoint3( aperm_col * vmVector3( localPointA_perm )) ;
+	boxPointB.localPoint = vmPoint3( bperm_col * vmVector3( localPointB_perm )) ;

+#if 0
 	// find which features of the boxes are involved.
 	// the only feature pairs which occur in this function are VF, FV, and EE, even though the
 	// closest points might actually lie on sub-features, as in a VF contact might be used for
@@ -1144,6 +1148,7 @@ boxBoxDistance(
 	} else {
 		boxPointB.setVertexFeature( sB[0], sB[1], sB[2] );
 	}
+#endif

 	normal = transformA * axisA;

--- a/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/boxBoxDistance.h
+++ b/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/boxBoxDistance.h
@@ -21,7 +21,6 @@ subject to the following restrictions:

 #include "Box.h"

-using namespace Vectormath::Aos;

 //---------------------------------------------------------------------------
 // boxBoxDistance:
@@ -38,7 +37,7 @@ using namespace Vectormath::Aos;
 //    positive or negative distance between two boxes.
 //
 // args:
-//    Vector3& normal: set to a unit contact normal pointing from box A to box B.
+//    vmVector3& normal: set to a unit contact normal pointing from box A to box B.
 //
 //    BoxPoint& boxPointA, BoxPoint& boxPointB:
 //       set to a closest point or point of penetration on each box.
@@ -46,7 +45,7 @@ using namespace Vectormath::Aos;
 //    Box boxA, Box boxB:
 //       boxes, represented as 3 half-widths
 //
-//    const Transform3& transformA, const Transform3& transformB:
+//    const vmTransform3& transformA, const vmTransform3& transformB:
 //       box transformations, in world coordinates
 //
 //    float distanceThreshold:
@@ -58,9 +57,9 @@ using namespace Vectormath::Aos;
 //---------------------------------------------------------------------------

 float
-boxBoxDistance(Vector3& normal, BoxPoint& boxPointA, BoxPoint& boxPointB,
-			   PE_REF(Box) boxA, const Transform3 & transformA, PE_REF(Box) boxB,
-			   const Transform3 & transformB,
+boxBoxDistance(vmVector3& normal, BoxPoint& boxPointA, BoxPoint& boxPointB,
+			   PE_REF(Box) boxA, const vmTransform3 & transformA, PE_REF(Box) boxB,
+			   const vmTransform3 & transformB,
 			   float distanceThreshold = FLT_MAX );

 #endif /* __BOXBOXDISTANCE_H__ */
--- a/src/BulletMultiThreaded/TrbDynBody.h
+++ b/src/BulletMultiThreaded/TrbDynBody.h
@@ -0,0 +1,78 @@
+/*
+   Copyright (C) 2009 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+
+*/
+
+#ifndef __T_RB_DYN_BODY_H__
+#define __T_RB_DYN_BODY_H__
+
+#include <vectormath_aos.h>
+using namespace Vectormath::Aos;
+
+#include "TrbStateVec.h"
+
+class CollObject;
+
+class TrbDynBody
+{
+public:
+	TrbDynBody()
+	{
+		fMass   = 0.0f;
+		fCollObject = NULL;
+		fElasticity = 0.2f;
+		fFriction = 0.8f;
+	}
+
+	// Get methods
+	float          getMass() const {return fMass;};
+	float          getElasticity() const {return fElasticity;}
+	float          getFriction() const {return fFriction;}
+	CollObject*    getCollObject() const {return fCollObject;}
+	const Matrix3 &getBodyInertia() const {return fIBody;}
+	const Matrix3 &getBodyInertiaInv() const {return fIBodyInv;}
+	float          getMassInv() const {return fMassInv;}
+
+	// Set methods
+	void           setMass(float mass) {fMass=mass;fMassInv=mass>0.0f?1.0f/mass:0.0f;}
+	void           setBodyInertia(const Matrix3 bodyInertia) {fIBody = bodyInertia;fIBodyInv = inverse(bodyInertia);}
+	void           setElasticity(float elasticity) {fElasticity = elasticity;}
+	void           setFriction(float friction) {fFriction = friction;}
+	void           setCollObject(CollObject *collObj) {fCollObject = collObj;}
+	
+	void           setBodyInertiaInv(const Matrix3 bodyInertiaInv) 
+	{
+		fIBody = inverse(bodyInertiaInv);
+		fIBodyInv = bodyInertiaInv;
+	}
+	void           setMassInv(float invMass) {
+		fMass= invMass>0.0f ? 1.0f/invMass :0.0f;
+		fMassInv=invMass;
+	}
+
+
+private:
+	// Rigid Body constants
+	float          fMass;        // Rigid Body mass
+	float          fMassInv;     // Inverse of mass
+	Matrix3        fIBody;       // Inertia matrix in body's coords
+	Matrix3        fIBodyInv;    // Inertia matrix inverse in body's coords
+	float          fElasticity;  // Coefficient of restitution
+	float          fFriction;    // Coefficient of friction
+
+public:
+	CollObject*    fCollObject;  // Collision object corresponding the RB
+} __attribute__ ((aligned(16)));
+
+#endif /* __T_RB_DYN_BODY_H__ */
--- a/src/BulletMultiThreaded/TrbStateVec.h
+++ b/src/BulletMultiThreaded/TrbStateVec.h
@@ -0,0 +1,334 @@
+/*
+   Copyright (C) 2009 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+
+*/
+
+#ifndef __TRBSTATEVEC_H__
+#define __TRBSTATEVEC_H__
+
+#include <stdlib.h>
+#include <vectormath_aos.h>
+
+
+#include "PlatformDefinitions.h"
+
+
+static inline vmVector3 read_Vector3(const float* p)
+{
+	vmVector3 v;
+	loadXYZ(v, p);
+	return v;
+}
+
+static inline vmQuat read_Quat(const float* p)
+{
+	vmQuat vq;
+	loadXYZW(vq, p);
+	return vq;
+}
+
+static inline void store_Vector3(const vmVector3 &src, float* p)
+{
+	vmVector3 v = src;
+	storeXYZ(v, p);
+}
+
+static inline void store_Quat(const vmQuat &src, float* p)
+{
+	vmQuat vq = src;
+	storeXYZW(vq, p);
+}
+
+// Motion Type
+enum {
+	PfxMotionTypeFixed = 0,
+	PfxMotionTypeActive,
+	PfxMotionTypeKeyframe,
+	PfxMotionTypeOneWay,
+	PfxMotionTypeTrigger,
+	PfxMotionTypeCount
+};
+
+#define PFX_MOTION_MASK_DYNAMIC 0x0a // Active,OneWay
+#define PFX_MOTION_MASK_STATIC  0x95 // Fixed,Keyframe,Trigger,Sleeping
+#define PFX_MOTION_MASK_SLEEP   0x0e // Can sleep
+#define PFX_MOTION_MASK_TYPE    0x7f
+
+//
+// Rigid Body state
+//
+
+#ifdef __CELLOS_LV2__
+ATTRIBUTE_ALIGNED128(class) TrbState
+#else
+ATTRIBUTE_ALIGNED16(class) TrbState
+#endif
+
+{
+public:
+	TrbState()
+	{
+		setMotionType(PfxMotionTypeActive);
+		contactFilterSelf=contactFilterTarget=0xffffffff;
+		deleted = 0;
+		mSleeping = 0;
+		useSleep = 1;
+		trbBodyIdx=0;
+		mSleepCount=0;
+		useCcd = 0;
+		useContactCallback = 0;
+		useSleepCallback = 0;
+		linearDamping = 1.0f;
+		angularDamping = 0.99f;
+	}
+
+	TrbState(const uint8_t m, const vmVector3& x, const vmQuat& q, const vmVector3& v, const vmVector3& omega );
+	
+	uint16_t	mSleepCount;
+	uint8_t		mMotionType;
+	uint8_t		deleted            : 1;
+	uint8_t		mSleeping           : 1;
+	uint8_t		useSleep           : 1;
+	uint8_t		useCcd		       : 1;
+	uint8_t		useContactCallback : 1;
+	uint8_t		useSleepCallback   : 1;
+
+	uint16_t	trbBodyIdx;
+	uint32_t	contactFilterSelf;
+	uint32_t	contactFilterTarget;
+
+	float		center[3];		// AABB center(World)
+	float		half[3];		// AABB half(World)
+
+	float		linearDamping;
+	float		angularDamping;
+	
+	float		deltaLinearVelocity[3];
+	float		deltaAngularVelocity[3];
+
+	float     fX[3];				// position
+	float     fQ[4];				// orientation
+	float     fV[3];				// velocity
+	float     fOmega[3];			// angular velocity
+
+	inline void setZero();      // Zeroes out the elements
+	inline void setIdentity();  // Sets the rotation to identity and zeroes out the other elements
+
+	bool		isDeleted() const {return deleted==1;}
+
+	uint16_t	getRigidBodyId() const {return trbBodyIdx;}
+	void		setRigidBodyId(uint16_t i) {trbBodyIdx = i;}
+
+
+	uint32_t	getContactFilterSelf() const {return contactFilterSelf;}
+	void		setContactFilterSelf(uint32_t filter) {contactFilterSelf = filter;}
+
+	uint32_t	getContactFilterTarget() const {return contactFilterTarget;}
+	void		setContactFilterTarget(uint32_t filter) {contactFilterTarget = filter;}
+
+	float getLinearDamping() const {return linearDamping;}
+	float getAngularDamping() const {return angularDamping;}
+
+	void setLinearDamping(float damping) {linearDamping=damping;}
+	void setAngularDamping(float damping) {angularDamping=damping;}
+
+
+	uint8_t		getMotionType() const {return mMotionType;}
+	void		setMotionType(uint8_t t) {mMotionType = t;mSleeping=0;mSleepCount=0;}
+
+	uint8_t		getMotionMask() const {return (1<<mMotionType)|(mSleeping<<7);}
+
+	bool		isAsleep() const {return mSleeping==1;}
+	bool		isAwake() const {return mSleeping==0;}
+
+	void		wakeup() {mSleeping=0;mSleepCount=0;}
+	void		sleep() {if(useSleep) {mSleeping=1;mSleepCount=0;}}
+
+	uint8_t		getUseSleep() const {return useSleep;}
+	void		setUseSleep(uint8_t b) {useSleep=b;}
+
+	uint8_t		getUseCcd() const {return useCcd;}
+	void		setUseCcd(uint8_t b) {useCcd=b;}
+
+	uint8_t		getUseContactCallback() const {return useContactCallback;}
+	void		setUseContactCallback(uint8_t b) {useContactCallback=b;}
+
+	uint8_t		getUseSleepCallback() const {return useSleepCallback;}
+	void		setUseSleepCallback(uint8_t b) {useSleepCallback=b;}
+
+	void	 	incrementSleepCount() {mSleepCount++;}
+	void		resetSleepCount() {mSleepCount=0;}
+	uint16_t	getSleepCount() const {return mSleepCount;}
+
+	vmVector3 getPosition() const {return read_Vector3(fX);}
+	vmQuat    getOrientation() const {return read_Quat(fQ);}
+	vmVector3 getLinearVelocity() const {return read_Vector3(fV);}
+	vmVector3 getAngularVelocity() const {return read_Vector3(fOmega);}
+	vmVector3 getDeltaLinearVelocity() const {return read_Vector3(deltaLinearVelocity);}
+	vmVector3 getDeltaAngularVelocity() const {return read_Vector3(deltaAngularVelocity);}
+
+	void setPosition(const vmVector3 &pos) {store_Vector3(pos, fX);}
+	void setLinearVelocity(const vmVector3 &vel) {store_Vector3(vel, fV);}
+	void setAngularVelocity(const vmVector3 &vel) {store_Vector3(vel, fOmega);}
+	void setDeltaLinearVelocity(const vmVector3 &vel) {store_Vector3(vel, deltaLinearVelocity);}
+	void setDeltaAngularVelocity(const vmVector3 &vel) {store_Vector3(vel, deltaAngularVelocity);}
+	void setOrientation(const vmQuat &rot) {store_Quat(rot, fQ);}
+
+	inline void setAuxils(const vmVector3 &centerLocal,const vmVector3 &halfLocal);
+	inline void	setAuxilsCcd(const vmVector3 &centerLocal,const vmVector3 &halfLocal,float timeStep);
+	inline	void reset();
+};
+
+inline
+TrbState::TrbState(const uint8_t m, const vmVector3& x, const vmQuat& q, const vmVector3& v, const vmVector3& omega)
+{
+	setMotionType(m);
+	fX[0] = x[0];
+	fX[1] = x[1];
+	fX[2] = x[2];
+	fQ[0] = q[0];
+	fQ[1] = q[1];
+	fQ[2] = q[2];
+	fQ[3] = q[3];
+	fV[0] = v[0];
+	fV[1] = v[1];
+	fV[2] = v[2];
+	fOmega[0] = omega[0];
+	fOmega[1] = omega[1];
+	fOmega[2] = omega[2];
+	contactFilterSelf=contactFilterTarget=0xffff;
+	trbBodyIdx=0;
+	mSleeping = 0;
+	deleted = 0;
+	useSleep = 1;
+	useCcd = 0;
+	useContactCallback = 0;
+	useSleepCallback = 0;
+	mSleepCount=0;
+	linearDamping = 1.0f;
+	angularDamping = 0.99f;
+}
+
+inline void
+TrbState::setIdentity()
+{
+	fX[0] = 0.0f;
+	fX[1] = 0.0f;
+	fX[2] = 0.0f;
+	fQ[0] = 0.0f;
+	fQ[1] = 0.0f;
+	fQ[2] = 0.0f;
+	fQ[3] = 1.0f;
+	fV[0] = 0.0f;
+	fV[1] = 0.0f;
+	fV[2] = 0.0f;
+	fOmega[0] = 0.0f;
+	fOmega[1] = 0.0f;
+	fOmega[2] = 0.0f;
+}
+
+inline void
+TrbState::setZero()
+{
+	fX[0] = 0.0f;
+	fX[1] = 0.0f;
+	fX[2] = 0.0f;
+	fQ[0] = 0.0f;
+	fQ[1] = 0.0f;
+	fQ[2] = 0.0f;
+	fQ[3] = 0.0f;
+	fV[0] = 0.0f;
+	fV[1] = 0.0f;
+	fV[2] = 0.0f;
+	fOmega[0] = 0.0f;
+	fOmega[1] = 0.0f;
+	fOmega[2] = 0.0f;
+}
+
+inline void
+TrbState::setAuxils(const vmVector3 &centerLocal,const vmVector3 &halfLocal)
+{
+	vmVector3 centerW = getPosition() + rotate(getOrientation(),centerLocal);
+	vmVector3 halfW = absPerElem(vmMatrix3(getOrientation())) * halfLocal;
+	center[0] = centerW[0];
+	center[1] = centerW[1];
+	center[2] = centerW[2];
+	half[0] = halfW[0];
+	half[1] = halfW[1];
+	half[2] = halfW[2];
+}
+
+inline void
+TrbState::setAuxilsCcd(const vmVector3 &centerLocal,const vmVector3 &halfLocal,float timeStep)
+{
+	vmVector3 centerW = getPosition() + rotate(getOrientation(),centerLocal);
+	vmVector3 halfW = absPerElem(vmMatrix3(getOrientation())) * halfLocal;
+
+	vmVector3 diffvec = getLinearVelocity()*timeStep;
+
+	vmVector3 newCenter = centerW + diffvec;
+	vmVector3 aabbMin = minPerElem(newCenter - halfW,centerW - halfW);
+	vmVector3 aabbMax = maxPerElem(newCenter + halfW,centerW + halfW);
+	
+	centerW = 0.5f * (aabbMin + aabbMax);
+	halfW =0.5f * (aabbMax - aabbMin);
+
+	center[0] = centerW[0];
+	center[1] = centerW[1];
+	center[2] = centerW[2];
+
+	half[0] = halfW[0];
+	half[1] = halfW[1];
+	half[2] = halfW[2];
+}
+
+inline
+void TrbState::reset()
+{
+#if 0
+	mSleepCount = 0;
+	mMotionType = PfxMotionTypeActive;
+	mDeleted = 0;
+	mSleeping = 0;
+	mUseSleep = 1;
+	mUseCcd = 0;
+	mUseContactCallback = 0;
+	mUseSleepCallback = 0;
+	mRigidBodyId = 0;
+	mContactFilterSelf = 0xffffffff;
+	mContactFilterTarget = 0xffffffff;
+	mLinearDamping = 1.0f;
+	mAngularDamping = 0.99f;
+	mPosition = vmVector3(0.0f);
+	mOrientation = vmQuat::identity();
+	mLinearVelocity = vmVector3(0.0f);
+	mAngularVelocity = vmVector3(0.0f);
+#endif
+
+	setMotionType(PfxMotionTypeActive);
+	contactFilterSelf=contactFilterTarget=0xffffffff;
+	deleted = 0;
+	mSleeping = 0;
+	useSleep = 1;
+	trbBodyIdx=0;
+	mSleepCount=0;
+	useCcd = 0;
+	useContactCallback = 0;
+	useSleepCallback = 0;
+	linearDamping = 1.0f;
+	angularDamping = 0.99f;
+}
+
+#endif /* __TRBSTATEVEC_H__ */
+
--- a/src/BulletMultiThreaded/btParallelConstraintSolver.cpp
+++ b/src/BulletMultiThreaded/btParallelConstraintSolver.cpp
--- a/src/BulletMultiThreaded/btParallelConstraintSolver.h
+++ b/src/BulletMultiThreaded/btParallelConstraintSolver.h
@@ -18,22 +18,238 @@ subject to the following restrictions:
 #define __BT_PARALLEL_CONSTRAINT_SOLVER_H

 #include "BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.h"
-									      
+
+
+
+
+#include "LinearMath/btScalar.h"
+#include "PlatformDefinitions.h"
+
+
+#define PFX_MAX_SOLVER_PHASES 64
+#define PFX_MAX_SOLVER_BATCHES 16
+#define PFX_MAX_SOLVER_PAIRS  128
+#define PFX_MIN_SOLVER_PAIRS  16
+
+#ifdef __CELLOS_LV2__
+ATTRIBUTE_ALIGNED128(struct) PfxParallelBatch {
+#else
+ATTRIBUTE_ALIGNED16(struct) PfxParallelBatch {
+#endif
+	uint16_t pairIndices[PFX_MAX_SOLVER_PAIRS];
+};
+
+#ifdef __CELLOS_LV2__
+ATTRIBUTE_ALIGNED128(struct) PfxParallelGroup {
+#else
+ATTRIBUTE_ALIGNED16(struct) PfxParallelGroup {
+#endif
+	uint16_t numPhases;
+	uint16_t numBatches[PFX_MAX_SOLVER_PHASES];
+	uint16_t numPairs[PFX_MAX_SOLVER_PHASES*PFX_MAX_SOLVER_BATCHES];
+};
+
+
+
+ATTRIBUTE_ALIGNED16(struct) PfxSortData16 {
+	union {
+		uint8_t   i8data[16];
+		uint16_t  i16data[8];
+		uint32_t  i32data[4];
+#ifdef __SPU__
+		vec_uint4 vdata;
+#endif
+	};
+
+#ifdef __SPU__
+	void set8(int elem,uint8_t data)   {vdata=(vec_uint4)spu_insert(data,(vec_uchar16)vdata,elem);}
+	void set16(int elem,uint16_t data) {vdata=(vec_uint4)spu_insert(data,(vec_ushort8)vdata,elem);}
+	void set32(int elem,uint32_t data) {vdata=(vec_uint4)spu_insert(data,(vec_uint4)vdata,elem);}
+	uint8_t get8(int elem)   const {return spu_extract((vec_uchar16)vdata,elem);}
+	uint16_t get16(int elem) const {return spu_extract((vec_ushort8)vdata,elem);}
+	uint32_t get32(int elem) const {return spu_extract((vec_uint4)vdata,elem);}
+#else
+	void set8(int elem,uint8_t data)   {i8data[elem] = data;}
+	void set16(int elem,uint16_t data) {i16data[elem] = data;}
+	void set32(int elem,uint32_t data) {i32data[elem] = data;}
+	uint8_t get8(int elem)   const {return i8data[elem];}
+	uint16_t get16(int elem) const {return i16data[elem];}
+	uint32_t get32(int elem) const {return i32data[elem];}
+#endif
+};
+
+typedef PfxSortData16 PfxConstraintPair;
+
+
+//J	PfxBroadphasePair‚Æ‹¤’Ê
+
+SIMD_FORCE_INLINE void pfxSetConstraintId(PfxConstraintPair &pair,uint32_t i)	{pair.set32(2,i);}
+SIMD_FORCE_INLINE void pfxSetNumConstraints(PfxConstraintPair &pair,uint8_t n)	{pair.set8(7,n);}
+
+SIMD_FORCE_INLINE uint32_t pfxGetConstraintId1(const PfxConstraintPair &pair)	{return pair.get32(2);}
+SIMD_FORCE_INLINE uint8_t  pfxGetNumConstraints(const PfxConstraintPair &pair)	{return pair.get8(7);}
+
+typedef PfxSortData16 PfxBroadphasePair;
+
+SIMD_FORCE_INLINE void pfxSetRigidBodyIdA(PfxBroadphasePair &pair,uint16_t i)	{pair.set16(0,i);}
+SIMD_FORCE_INLINE void pfxSetRigidBodyIdB(PfxBroadphasePair &pair,uint16_t i)	{pair.set16(1,i);}
+SIMD_FORCE_INLINE void pfxSetMotionMaskA(PfxBroadphasePair &pair,uint8_t i)		{pair.set8(4,i);}
+SIMD_FORCE_INLINE void pfxSetMotionMaskB(PfxBroadphasePair &pair,uint8_t i)		{pair.set8(5,i);}
+SIMD_FORCE_INLINE void pfxSetBroadphaseFlag(PfxBroadphasePair &pair,uint8_t f)	{pair.set8(6,(pair.get8(6)&0xf0)|(f&0x0f));}
+SIMD_FORCE_INLINE void pfxSetActive(PfxBroadphasePair &pair,bool b)			{pair.set8(6,(pair.get8(6)&0x0f)|((b?1:0)<<4));}
+SIMD_FORCE_INLINE void pfxSetContactId(PfxBroadphasePair &pair,uint32_t i)		{pair.set32(2,i);}
+
+SIMD_FORCE_INLINE uint16_t pfxGetRigidBodyIdA(const PfxBroadphasePair &pair)	{return pair.get16(0);}
+SIMD_FORCE_INLINE uint16_t pfxGetRigidBodyIdB(const PfxBroadphasePair &pair)	{return pair.get16(1);}
+SIMD_FORCE_INLINE uint8_t  pfxGetMotionMaskA(const PfxBroadphasePair &pair)		{return pair.get8(4);}
+SIMD_FORCE_INLINE uint8_t  pfxGetMotionMaskB(const PfxBroadphasePair &pair)		{return pair.get8(5);}
+SIMD_FORCE_INLINE uint8_t  pfxGetBroadphaseFlag(const PfxBroadphasePair &pair)	{return pair.get8(6)&0x0f;}
+SIMD_FORCE_INLINE bool     pfxGetActive(const PfxBroadphasePair &pair)			{return (pair.get8(6)>>4)!=0;}
+SIMD_FORCE_INLINE uint32_t pfxGetContactId1(const PfxBroadphasePair &pair)		{return pair.get32(2);}
+
+
+
+#if defined(__PPU__) || defined (__SPU__)
+ATTRIBUTE_ALIGNED128(struct) PfxSolverBody {
+#else
+ATTRIBUTE_ALIGNED16(struct) PfxSolverBody {
+#endif
+	vmVector3 mDeltaLinearVelocity;
+	vmVector3 mDeltaAngularVelocity;
+	vmMatrix3 mInertiaInv;
+	vmQuat    mOrientation;
+	float   mMassInv;
+	float   friction;
+	float   restitution;
+	float   unused;
+	float   unused2;
+	float   unused3;
+	float   unused4;
+	float   unused5;
+};
+
+
+#ifdef __PPU__
+#include "SpuDispatch/BulletPE2ConstraintSolverSpursSupport.h"
+#endif
+
+
+
+static SIMD_FORCE_INLINE vmVector3 btReadVector3(const float* p)
+{
+	vmVector3 v;
+	loadXYZ(v, p);
+	return v;
+}
+
+static SIMD_FORCE_INLINE vmQuat btReadQuat(const float* p)
+{
+	vmQuat vq;
+	loadXYZW(vq, p);
+	return vq;
+}
+
+static SIMD_FORCE_INLINE void btStoreVector3(const vmVector3 &src, float* p)
+{
+	vmVector3 v = src;
+	storeXYZ(v, p);
+}
+
+
+
+
+class btPersistentManifold;
+
+enum {
+	PFX_CONSTRAINT_SOLVER_CMD_SETUP_SOLVER_BODIES,
+	PFX_CONSTRAINT_SOLVER_CMD_SETUP_CONTACT_CONSTRAINTS,
+	PFX_CONSTRAINT_SOLVER_CMD_SETUP_JOINT_CONSTRAINTS,
+	PFX_CONSTRAINT_SOLVER_CMD_SOLVE_CONSTRAINTS,
+	PFX_CONSTRAINT_SOLVER_CMD_POST_SOLVER
+};
+
+
+struct PfxSetupContactConstraintsIO {
+	PfxConstraintPair *offsetContactPairs;
+	uint32_t numContactPairs1;
+	class TrbState *offsetRigStates;
+	struct PfxSolverBody *offsetSolverBodies;
+	uint32_t numRigidBodies;
+	float separateBias;
+	float timeStep;
+	class btCriticalSection* criticalSection;
+};
+
+
+
+struct PfxSolveConstraintsIO {
+	PfxParallelGroup *contactParallelGroup;
+	PfxParallelBatch *contactParallelBatches;
+	PfxConstraintPair *contactPairs;
+	uint32_t numContactPairs;
+	btPersistentManifold *offsetContactManifolds;
+	PfxParallelGroup *jointParallelGroup;
+	PfxParallelBatch *jointParallelBatches;
+	PfxConstraintPair *jointPairs;
+	uint32_t numJointPairs;
+	TrbState *offsetRigStates;
+	PfxSolverBody *offsetSolverBodies;
+	uint32_t numRigidBodies;
+	uint32_t iteration;
+
+	uint32_t	taskId;
+	
+	class btBarrier* barrier;
+
+};
+
+struct PfxPostSolverIO {
+	TrbState *states;
+	PfxSolverBody *solverBodies;
+	uint32_t numRigidBodies;
+};
+
+ATTRIBUTE_ALIGNED16(struct) btConstraintSolverIO {
+	uint8_t cmd;
+	union {
+		PfxSetupContactConstraintsIO setupContactConstraints;
+		PfxSolveConstraintsIO solveConstraints;
+		PfxPostSolverIO postSolver;
+	};
+	
+	//SPU only
+	uint32_t barrierAddr2;
+	uint32_t criticalsectionAddr2;
+	uint32_t maxTasks1;
+};
+
+
+
+
+void	SolverThreadFunc(void* userPtr,void* lsMemory);
+void*	SolverlsMemoryFunc();
+///The btParallelConstraintSolver performs computations on constraint rows in parallel
+///Using the cross-platform threading it supports Windows, Linux, Mac OSX and PlayStation 3 Cell SPUs
 class btParallelConstraintSolver : public btSequentialImpulseConstraintSolver
 {
+	
 protected:
+	struct btParallelSolverMemoryCache*	m_memoryCache;
+
+	class btThreadSupportInterface*	m_solverThreadSupport;
+
+	struct btConstraintSolverIO* m_solverIO;
+	class btBarrier*			m_barrier;
+	class btCriticalSection*	m_criticalSection;
+

 public:

-	btParallelConstraintSolver();
+	btParallelConstraintSolver(class btThreadSupportInterface* solverThreadSupport);
 	
 	virtual ~btParallelConstraintSolver();

-	//virtual btScalar solveGroup(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifold,int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& info, btIDebugDraw* debugDrawer, btStackAlloc* stackAlloc,btDispatcher* dispatcher);
-	
-	btScalar solveGroupCacheFriendlySetup(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer,btStackAlloc* stackAlloc);
-
-
+	virtual btScalar solveGroup(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifold,int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& info, btIDebugDraw* debugDrawer, btStackAlloc* stackAlloc,btDispatcher* dispatcher);

 };

--- a/src/BulletMultiThreaded/vectormath/scalar/cpp/boolInVec.h
+++ b/src/BulletMultiThreaded/vectormath/scalar/cpp/boolInVec.h
--- a/src/BulletMultiThreaded/vectormath/scalar/cpp/floatInVec.h
+++ b/src/BulletMultiThreaded/vectormath/scalar/cpp/floatInVec.h
--- a/src/BulletMultiThreaded/vectormath/scalar/cpp/mat_aos.h
+++ b/src/BulletMultiThreaded/vectormath/scalar/cpp/mat_aos.h
--- a/src/BulletMultiThreaded/vectormath/scalar/cpp/quat_aos.h
+++ b/src/BulletMultiThreaded/vectormath/scalar/cpp/quat_aos.h
--- a/src/BulletMultiThreaded/vectormath/scalar/cpp/vec_aos.h
+++ b/src/BulletMultiThreaded/vectormath/scalar/cpp/vec_aos.h
--- a/src/BulletMultiThreaded/vectormath/scalar/cpp/vectormath_aos.h
+++ b/src/BulletMultiThreaded/vectormath/scalar/cpp/vectormath_aos.h
--- a/src/BulletMultiThreaded/vectormath/sse/boolInVec.h
+++ b/src/BulletMultiThreaded/vectormath/sse/boolInVec.h
@@ -0,0 +1,247 @@
+/*
+   Copyright (C) 2006, 2007 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+   Redistribution and use in source and binary forms,
+   with or without modification, are permitted provided that the
+   following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the Sony Computer Entertainment Inc nor the names
+      of its contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _BOOLINVEC_H
+#define _BOOLINVEC_H
+
+#include <math.h>
+
+namespace Vectormath {
+
+class floatInVec;
+
+//--------------------------------------------------------------------------------------------------
+// boolInVec class
+//
+
+class boolInVec
+{
+    private:
+        __m128 mData;
+
+        inline boolInVec(__m128 vec);
+    public:
+        inline boolInVec() {}
+
+        // matches standard type conversions
+        //
+        inline boolInVec(const floatInVec &vec);
+
+        // explicit cast from bool
+        //
+        explicit inline boolInVec(bool scalar);
+
+#ifdef _VECTORMATH_NO_SCALAR_CAST
+        // explicit cast to bool
+        // 
+        inline bool getAsBool() const;
+#else
+        // implicit cast to bool
+        // 
+        inline operator bool() const;
+#endif
+        
+        // get vector data
+        // bool value is splatted across all word slots of vector as 0 (false) or -1 (true)
+        //
+        inline __m128 get128() const;
+
+        // operators
+        //
+        inline const boolInVec operator ! () const;
+        inline boolInVec& operator = (const boolInVec &vec);
+        inline boolInVec& operator &= (const boolInVec &vec);
+        inline boolInVec& operator ^= (const boolInVec &vec);
+        inline boolInVec& operator |= (const boolInVec &vec);
+
+        // friend functions
+        //
+        friend inline const boolInVec operator == (const boolInVec &vec0, const boolInVec &vec1);
+        friend inline const boolInVec operator != (const boolInVec &vec0, const boolInVec &vec1);
+        friend inline const boolInVec operator < (const floatInVec &vec0, const floatInVec &vec1);
+        friend inline const boolInVec operator <= (const floatInVec &vec0, const floatInVec &vec1);
+        friend inline const boolInVec operator > (const floatInVec &vec0, const floatInVec &vec1);
+        friend inline const boolInVec operator >= (const floatInVec &vec0, const floatInVec &vec1);
+        friend inline const boolInVec operator == (const floatInVec &vec0, const floatInVec &vec1);
+        friend inline const boolInVec operator != (const floatInVec &vec0, const floatInVec &vec1);
+        friend inline const boolInVec operator & (const boolInVec &vec0, const boolInVec &vec1);
+        friend inline const boolInVec operator ^ (const boolInVec &vec0, const boolInVec &vec1);
+        friend inline const boolInVec operator | (const boolInVec &vec0, const boolInVec &vec1);
+        friend inline const boolInVec select(const boolInVec &vec0, const boolInVec &vec1, const boolInVec &select_vec1);
+};
+
+//--------------------------------------------------------------------------------------------------
+// boolInVec functions
+//
+
+// operators
+//
+inline const boolInVec operator == (const boolInVec &vec0, const boolInVec &vec1);
+inline const boolInVec operator != (const boolInVec &vec0, const boolInVec &vec1);
+inline const boolInVec operator & (const boolInVec &vec0, const boolInVec &vec1);
+inline const boolInVec operator ^ (const boolInVec &vec0, const boolInVec &vec1);
+inline const boolInVec operator | (const boolInVec &vec0, const boolInVec &vec1);
+
+// select between vec0 and vec1 using boolInVec.
+// false selects vec0, true selects vec1
+//
+inline const boolInVec select(const boolInVec &vec0, const boolInVec &vec1, const boolInVec &select_vec1);
+
+} // namespace Vectormath
+
+//--------------------------------------------------------------------------------------------------
+// boolInVec implementation
+//
+
+#include "floatInVec.h"
+
+namespace Vectormath {
+
+inline
+boolInVec::boolInVec(__m128 vec)
+{
+    mData = vec;
+}
+
+inline
+boolInVec::boolInVec(const floatInVec &vec)
+{
+    *this = (vec != floatInVec(0.0f));
+}
+
+inline
+boolInVec::boolInVec(bool scalar)
+{
+    unsigned int mask = -(int)scalar;
+	mData = _mm_set1_ps(*(float *)&mask); // TODO: Union
+}
+
+#ifdef _VECTORMATH_NO_SCALAR_CAST
+inline
+bool
+boolInVec::getAsBool() const
+#else
+inline
+boolInVec::operator bool() const
+#endif
+{
+	return *(bool *)&mData;
+}
+
+inline
+__m128
+boolInVec::get128() const
+{
+    return mData;
+}
+
+inline
+const boolInVec
+boolInVec::operator ! () const
+{
+    return boolInVec(_mm_andnot_ps(mData, _mm_cmpneq_ps(_mm_setzero_ps(),_mm_setzero_ps())));
+}
+
+inline
+boolInVec&
+boolInVec::operator = (const boolInVec &vec)
+{
+    mData = vec.mData;
+    return *this;
+}
+
+inline
+boolInVec&
+boolInVec::operator &= (const boolInVec &vec)
+{
+    *this = *this & vec;
+    return *this;
+}
+
+inline
+boolInVec&
+boolInVec::operator ^= (const boolInVec &vec)
+{
+    *this = *this ^ vec;
+    return *this;
+}
+
+inline
+boolInVec&
+boolInVec::operator |= (const boolInVec &vec)
+{
+    *this = *this | vec;
+    return *this;
+}
+
+inline
+const boolInVec
+operator == (const boolInVec &vec0, const boolInVec &vec1)
+{
+	return boolInVec(_mm_cmpeq_ps(vec0.get128(), vec1.get128()));
+}
+
+inline
+const boolInVec
+operator != (const boolInVec &vec0, const boolInVec &vec1)
+{
+	return boolInVec(_mm_cmpneq_ps(vec0.get128(), vec1.get128()));
+}
+    
+inline
+const boolInVec
+operator & (const boolInVec &vec0, const boolInVec &vec1)
+{
+	return boolInVec(_mm_and_ps(vec0.get128(), vec1.get128()));
+}
+
+inline
+const boolInVec
+operator | (const boolInVec &vec0, const boolInVec &vec1)
+{
+	return boolInVec(_mm_or_ps(vec0.get128(), vec1.get128()));
+}
+
+inline
+const boolInVec
+operator ^ (const boolInVec &vec0, const boolInVec &vec1)
+{
+	return boolInVec(_mm_xor_ps(vec0.get128(), vec1.get128()));
+}
+
+inline
+const boolInVec
+select(const boolInVec &vec0, const boolInVec &vec1, const boolInVec &select_vec1)
+{
+	return boolInVec(vec_sel(vec0.get128(), vec1.get128(), select_vec1.get128()));
+}
+ 
+} // namespace Vectormath
+
+#endif // boolInVec_h
--- a/src/BulletMultiThreaded/vectormath/sse/floatInVec.h
+++ b/src/BulletMultiThreaded/vectormath/sse/floatInVec.h
@@ -0,0 +1,340 @@
+/*
+   Copyright (C) 2006, 2007 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+   Redistribution and use in source and binary forms,
+   with or without modification, are permitted provided that the
+   following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the Sony Computer Entertainment Inc nor the names
+      of its contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _FLOATINVEC_H
+#define _FLOATINVEC_H
+
+#include <math.h>
+#include <xmmintrin.h>
+
+namespace Vectormath {
+
+class boolInVec;
+
+//--------------------------------------------------------------------------------------------------
+// floatInVec class
+//
+
+class floatInVec
+{
+    private:
+        __m128 mData;
+
+    public:
+        inline floatInVec(__m128 vec);
+
+        inline floatInVec() {}
+
+        // matches standard type conversions
+        //
+        inline floatInVec(const boolInVec &vec);
+
+        // construct from a slot of __m128
+        //
+        inline floatInVec(__m128 vec, int slot);
+        
+        // explicit cast from float
+        //
+        explicit inline floatInVec(float scalar);
+
+#ifdef _VECTORMATH_NO_SCALAR_CAST
+        // explicit cast to float
+        // 
+        inline float getAsFloat() const;
+#else
+        // implicit cast to float
+        //
+        inline operator float() const;
+#endif
+
+        // get vector data
+        // float value is splatted across all word slots of vector
+        //
+        inline __m128 get128() const;
+
+        // operators
+        // 
+        inline const floatInVec operator ++ (int);
+        inline const floatInVec operator -- (int);
+        inline floatInVec& operator ++ ();
+        inline floatInVec& operator -- ();
+        inline const floatInVec operator - () const;
+        inline floatInVec& operator = (const floatInVec &vec);
+        inline floatInVec& operator *= (const floatInVec &vec);
+        inline floatInVec& operator /= (const floatInVec &vec);
+        inline floatInVec& operator += (const floatInVec &vec);
+        inline floatInVec& operator -= (const floatInVec &vec);
+
+        // friend functions
+        //
+        friend inline const floatInVec operator * (const floatInVec &vec0, const floatInVec &vec1);
+        friend inline const floatInVec operator / (const floatInVec &vec0, const floatInVec &vec1);
+        friend inline const floatInVec operator + (const floatInVec &vec0, const floatInVec &vec1);
+        friend inline const floatInVec operator - (const floatInVec &vec0, const floatInVec &vec1);
+        friend inline const floatInVec select(const floatInVec &vec0, const floatInVec &vec1, boolInVec select_vec1);
+};
+
+//--------------------------------------------------------------------------------------------------
+// floatInVec functions
+//
+
+// operators
+// 
+inline const floatInVec operator * (const floatInVec &vec0, const floatInVec &vec1);
+inline const floatInVec operator / (const floatInVec &vec0, const floatInVec &vec1);
+inline const floatInVec operator + (const floatInVec &vec0, const floatInVec &vec1);
+inline const floatInVec operator - (const floatInVec &vec0, const floatInVec &vec1);
+inline const boolInVec operator < (const floatInVec &vec0, const floatInVec &vec1);
+inline const boolInVec operator <= (const floatInVec &vec0, const floatInVec &vec1);
+inline const boolInVec operator > (const floatInVec &vec0, const floatInVec &vec1);
+inline const boolInVec operator >= (const floatInVec &vec0, const floatInVec &vec1);
+inline const boolInVec operator == (const floatInVec &vec0, const floatInVec &vec1);
+inline const boolInVec operator != (const floatInVec &vec0, const floatInVec &vec1);
+
+// select between vec0 and vec1 using boolInVec.
+// false selects vec0, true selects vec1
+//
+inline const floatInVec select(const floatInVec &vec0, const floatInVec &vec1, const boolInVec &select_vec1);
+
+} // namespace Vectormath
+
+//--------------------------------------------------------------------------------------------------
+// floatInVec implementation
+//
+
+#include "boolInVec.h"
+
+namespace Vectormath {
+
+inline
+floatInVec::floatInVec(__m128 vec)
+{
+    mData = vec;
+}
+
+inline
+floatInVec::floatInVec(const boolInVec &vec)
+{
+	mData = vec_sel(_mm_setzero_ps(), _mm_set1_ps(1.0f), vec.get128());
+}
+
+inline
+floatInVec::floatInVec(__m128 vec, int slot)
+{
+	SSEFloat v;
+	v.m128 = vec;
+	mData = _mm_set1_ps(v.f[slot]);
+}
+
+inline
+floatInVec::floatInVec(float scalar)
+{
+	mData = _mm_set1_ps(scalar);
+}
+
+#ifdef _VECTORMATH_NO_SCALAR_CAST
+inline
+float
+floatInVec::getAsFloat() const
+#else
+inline
+floatInVec::operator float() const
+#endif
+{
+    return *((float *)&mData);
+}
+
+inline
+__m128
+floatInVec::get128() const
+{
+    return mData;
+}
+
+inline
+const floatInVec
+floatInVec::operator ++ (int)
+{
+    __m128 olddata = mData;
+    operator ++();
+    return floatInVec(olddata);
+}
+
+inline
+const floatInVec
+floatInVec::operator -- (int)
+{
+    __m128 olddata = mData;
+    operator --();
+    return floatInVec(olddata);
+}
+
+inline
+floatInVec&
+floatInVec::operator ++ ()
+{
+    *this += floatInVec(_mm_set1_ps(1.0f));
+    return *this;
+}
+
+inline
+floatInVec&
+floatInVec::operator -- ()
+{
+    *this -= floatInVec(_mm_set1_ps(1.0f));
+    return *this;
+}
+
+inline
+const floatInVec
+floatInVec::operator - () const
+{
+    return floatInVec(_mm_sub_ps(_mm_setzero_ps(), mData));
+}
+
+inline
+floatInVec&
+floatInVec::operator = (const floatInVec &vec)
+{
+    mData = vec.mData;
+    return *this;
+}
+
+inline
+floatInVec&
+floatInVec::operator *= (const floatInVec &vec)
+{
+    *this = *this * vec;
+    return *this;
+}
+
+inline
+floatInVec&
+floatInVec::operator /= (const floatInVec &vec)
+{
+    *this = *this / vec;
+    return *this;
+}
+
+inline
+floatInVec&
+floatInVec::operator += (const floatInVec &vec)
+{
+    *this = *this + vec;
+    return *this;
+}
+
+inline
+floatInVec&
+floatInVec::operator -= (const floatInVec &vec)
+{
+    *this = *this - vec;
+    return *this;
+}
+
+inline
+const floatInVec
+operator * (const floatInVec &vec0, const floatInVec &vec1)
+{
+    return floatInVec(_mm_mul_ps(vec0.get128(), vec1.get128()));
+}
+
+inline
+const floatInVec
+operator / (const floatInVec &num, const floatInVec &den)
+{
+    return floatInVec(_mm_div_ps(num.get128(), den.get128()));
+}
+
+inline
+const floatInVec
+operator + (const floatInVec &vec0, const floatInVec &vec1)
+{
+    return floatInVec(_mm_add_ps(vec0.get128(), vec1.get128()));
+}
+
+inline
+const floatInVec
+operator - (const floatInVec &vec0, const floatInVec &vec1)
+{
+    return floatInVec(_mm_sub_ps(vec0.get128(), vec1.get128()));
+}
+
+inline
+const boolInVec
+operator < (const floatInVec &vec0, const floatInVec &vec1)
+{
+    return boolInVec(_mm_cmpgt_ps(vec1.get128(), vec0.get128()));
+}
+
+inline
+const boolInVec
+operator <= (const floatInVec &vec0, const floatInVec &vec1)
+{
+    return boolInVec(_mm_cmpge_ps(vec1.get128(), vec0.get128()));
+}
+
+inline
+const boolInVec
+operator > (const floatInVec &vec0, const floatInVec &vec1)
+{
+    return boolInVec(_mm_cmpgt_ps(vec0.get128(), vec1.get128()));
+}
+
+inline
+const boolInVec
+operator >= (const floatInVec &vec0, const floatInVec &vec1)
+{
+    return boolInVec(_mm_cmpge_ps(vec0.get128(), vec1.get128()));
+}
+
+inline
+const boolInVec
+operator == (const floatInVec &vec0, const floatInVec &vec1)
+{
+    return boolInVec(_mm_cmpeq_ps(vec0.get128(), vec1.get128()));
+}
+
+inline
+const boolInVec
+operator != (const floatInVec &vec0, const floatInVec &vec1)
+{
+    return boolInVec(_mm_cmpneq_ps(vec0.get128(), vec1.get128()));
+}
+    
+inline
+const floatInVec
+select(const floatInVec &vec0, const floatInVec &vec1, const boolInVec &select_vec1)
+{
+    return floatInVec(vec_sel(vec0.get128(), vec1.get128(), select_vec1.get128()));
+}
+
+} // namespace Vectormath
+
+#endif // floatInVec_h
--- a/src/BulletMultiThreaded/vectormath/sse/mat_aos.h
+++ b/src/BulletMultiThreaded/vectormath/sse/mat_aos.h
--- a/src/BulletMultiThreaded/vectormath/sse/quat_aos.h
+++ b/src/BulletMultiThreaded/vectormath/sse/quat_aos.h
@@ -0,0 +1,579 @@
+/*
+   Copyright (C) 2006, 2007 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+   Redistribution and use in source and binary forms,
+   with or without modification, are permitted provided that the
+   following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the Sony Computer Entertainment Inc nor the names
+      of its contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
+*/
+
+
+#ifndef _VECTORMATH_QUAT_AOS_CPP_H
+#define _VECTORMATH_QUAT_AOS_CPP_H
+
+//-----------------------------------------------------------------------------
+// Definitions
+
+#ifndef _VECTORMATH_INTERNAL_FUNCTIONS
+#define _VECTORMATH_INTERNAL_FUNCTIONS
+
+#endif
+
+namespace Vectormath {
+namespace Aos {
+
+__forceinline void Quat::set128(vec_float4 vec)
+{
+    mVec128 = vec;
+}
+
+__forceinline Quat::Quat( const floatInVec &_x, const floatInVec &_y, const floatInVec &_z, const floatInVec &_w )
+{
+	mVec128 = _mm_unpacklo_ps(
+		_mm_unpacklo_ps( _x.get128(), _z.get128() ),
+		_mm_unpacklo_ps( _y.get128(), _w.get128() ) );
+}
+
+__forceinline Quat::Quat( const Vector3 &xyz, float _w )
+{
+    mVec128 = xyz.get128();
+    _vmathVfSetElement(mVec128, _w, 3);
+}
+
+
+
+__forceinline  Quat::Quat(const Quat& quat)
+{
+	mVec128 = quat.get128();
+}
+
+__forceinline Quat::Quat( float _x, float _y, float _z, float _w )
+{
+	mVec128 = _mm_setr_ps(_x, _y, _z, _w);
+}
+
+
+
+
+
+__forceinline Quat::Quat( const Vector3 &xyz, const floatInVec &_w )
+{
+    mVec128 = xyz.get128();
+    mVec128 = _vmathVfInsert(mVec128, _w.get128(), 3);
+}
+
+__forceinline Quat::Quat( const Vector4 &vec )
+{
+    mVec128 = vec.get128();
+}
+
+__forceinline Quat::Quat( float scalar )
+{
+    mVec128 = floatInVec(scalar).get128();
+}
+
+__forceinline Quat::Quat( const floatInVec &scalar )
+{
+    mVec128 = scalar.get128();
+}
+
+__forceinline Quat::Quat( __m128 vf4 )
+{
+    mVec128 = vf4;
+}
+
+__forceinline const Quat Quat::identity( )
+{
+    return Quat( _VECTORMATH_UNIT_0001 );
+}
+
+__forceinline const Quat lerp( float t, const Quat &quat0, const Quat &quat1 )
+{
+    return lerp( floatInVec(t), quat0, quat1 );
+}
+
+__forceinline const Quat lerp( const floatInVec &t, const Quat &quat0, const Quat &quat1 )
+{
+    return ( quat0 + ( ( quat1 - quat0 ) * t ) );
+}
+
+__forceinline const Quat slerp( float t, const Quat &unitQuat0, const Quat &unitQuat1 )
+{
+    return slerp( floatInVec(t), unitQuat0, unitQuat1 );
+}
+
+__forceinline const Quat slerp( const floatInVec &t, const Quat &unitQuat0, const Quat &unitQuat1 )
+{
+    Quat start;
+    vec_float4 scales, scale0, scale1, cosAngle, angle, tttt, oneMinusT, angles, sines;
+    __m128 selectMask;
+    cosAngle = _vmathVfDot4( unitQuat0.get128(), unitQuat1.get128() );
+    selectMask = (__m128)vec_cmpgt( _mm_setzero_ps(), cosAngle );
+    cosAngle = vec_sel( cosAngle, negatef4( cosAngle ), selectMask );
+    start = Quat( vec_sel( unitQuat0.get128(), negatef4( unitQuat0.get128() ), selectMask ) );
+    selectMask = (__m128)vec_cmpgt( _mm_set1_ps(_VECTORMATH_SLERP_TOL), cosAngle );
+    angle = acosf4( cosAngle );
+    tttt = t.get128();
+    oneMinusT = vec_sub( _mm_set1_ps(1.0f), tttt );
+    angles = vec_mergeh( _mm_set1_ps(1.0f), tttt );
+    angles = vec_mergeh( angles, oneMinusT );
+    angles = vec_madd( angles, angle, _mm_setzero_ps() );
+    sines = sinf4( angles );
+    scales = _mm_div_ps( sines, vec_splat( sines, 0 ) );
+    scale0 = vec_sel( oneMinusT, vec_splat( scales, 1 ), selectMask );
+    scale1 = vec_sel( tttt, vec_splat( scales, 2 ), selectMask );
+    return Quat( vec_madd( start.get128(), scale0, vec_mul( unitQuat1.get128(), scale1 ) ) );
+}
+
+__forceinline const Quat squad( float t, const Quat &unitQuat0, const Quat &unitQuat1, const Quat &unitQuat2, const Quat &unitQuat3 )
+{
+    return squad( floatInVec(t), unitQuat0, unitQuat1, unitQuat2, unitQuat3 );
+}
+
+__forceinline const Quat squad( const floatInVec &t, const Quat &unitQuat0, const Quat &unitQuat1, const Quat &unitQuat2, const Quat &unitQuat3 )
+{
+    return slerp( ( ( floatInVec(2.0f) * t ) * ( floatInVec(1.0f) - t ) ), slerp( t, unitQuat0, unitQuat3 ), slerp( t, unitQuat1, unitQuat2 ) );
+}
+
+__forceinline __m128 Quat::get128( ) const
+{
+    return mVec128;
+}
+
+__forceinline Quat & Quat::operator =( const Quat &quat )
+{
+    mVec128 = quat.mVec128;
+    return *this;
+}
+
+__forceinline Quat & Quat::setXYZ( const Vector3 &vec )
+{
+	__declspec(align(16)) unsigned int sw[4] = {0, 0, 0, 0xffffffff};
+	mVec128 = vec_sel( vec.get128(), mVec128, sw );
+    return *this;
+}
+
+__forceinline const Vector3 Quat::getXYZ( ) const
+{
+    return Vector3( mVec128 );
+}
+
+__forceinline Quat & Quat::setX( float _x )
+{
+    _vmathVfSetElement(mVec128, _x, 0);
+    return *this;
+}
+
+__forceinline Quat & Quat::setX( const floatInVec &_x )
+{
+    mVec128 = _vmathVfInsert(mVec128, _x.get128(), 0);
+    return *this;
+}
+
+__forceinline const floatInVec Quat::getX( ) const
+{
+    return floatInVec( mVec128, 0 );
+}
+
+__forceinline Quat & Quat::setY( float _y )
+{
+    _vmathVfSetElement(mVec128, _y, 1);
+    return *this;
+}
+
+__forceinline Quat & Quat::setY( const floatInVec &_y )
+{
+    mVec128 = _vmathVfInsert(mVec128, _y.get128(), 1);
+    return *this;
+}
+
+__forceinline const floatInVec Quat::getY( ) const
+{
+    return floatInVec( mVec128, 1 );
+}
+
+__forceinline Quat & Quat::setZ( float _z )
+{
+    _vmathVfSetElement(mVec128, _z, 2);
+    return *this;
+}
+
+__forceinline Quat & Quat::setZ( const floatInVec &_z )
+{
+    mVec128 = _vmathVfInsert(mVec128, _z.get128(), 2);
+    return *this;
+}
+
+__forceinline const floatInVec Quat::getZ( ) const
+{
+    return floatInVec( mVec128, 2 );
+}
+
+__forceinline Quat & Quat::setW( float _w )
+{
+    _vmathVfSetElement(mVec128, _w, 3);
+    return *this;
+}
+
+__forceinline Quat & Quat::setW( const floatInVec &_w )
+{
+    mVec128 = _vmathVfInsert(mVec128, _w.get128(), 3);
+    return *this;
+}
+
+__forceinline const floatInVec Quat::getW( ) const
+{
+    return floatInVec( mVec128, 3 );
+}
+
+__forceinline Quat & Quat::setElem( int idx, float value )
+{
+    _vmathVfSetElement(mVec128, value, idx);
+    return *this;
+}
+
+__forceinline Quat & Quat::setElem( int idx, const floatInVec &value )
+{
+    mVec128 = _vmathVfInsert(mVec128, value.get128(), idx);
+    return *this;
+}
+
+__forceinline const floatInVec Quat::getElem( int idx ) const
+{
+    return floatInVec( mVec128, idx );
+}
+
+__forceinline VecIdx Quat::operator []( int idx )
+{
+    return VecIdx( mVec128, idx );
+}
+
+__forceinline const floatInVec Quat::operator []( int idx ) const
+{
+    return floatInVec( mVec128, idx );
+}
+
+__forceinline const Quat Quat::operator +( const Quat &quat ) const
+{
+    return Quat( _mm_add_ps( mVec128, quat.mVec128 ) );
+}
+
+
+__forceinline const Quat Quat::operator -( const Quat &quat ) const
+{
+    return Quat( _mm_sub_ps( mVec128, quat.mVec128 ) );
+}
+
+__forceinline const Quat Quat::operator *( float scalar ) const
+{
+    return *this * floatInVec(scalar);
+}
+
+__forceinline const Quat Quat::operator *( const floatInVec &scalar ) const
+{
+    return Quat( _mm_mul_ps( mVec128, scalar.get128() ) );
+}
+
+__forceinline Quat & Quat::operator +=( const Quat &quat )
+{
+    *this = *this + quat;
+    return *this;
+}
+
+__forceinline Quat & Quat::operator -=( const Quat &quat )
+{
+    *this = *this - quat;
+    return *this;
+}
+
+__forceinline Quat & Quat::operator *=( float scalar )
+{
+    *this = *this * scalar;
+    return *this;
+}
+
+__forceinline Quat & Quat::operator *=( const floatInVec &scalar )
+{
+    *this = *this * scalar;
+    return *this;
+}
+
+__forceinline const Quat Quat::operator /( float scalar ) const
+{
+    return *this / floatInVec(scalar);
+}
+
+__forceinline const Quat Quat::operator /( const floatInVec &scalar ) const
+{
+    return Quat( _mm_div_ps( mVec128, scalar.get128() ) );
+}
+
+__forceinline Quat & Quat::operator /=( float scalar )
+{
+    *this = *this / scalar;
+    return *this;
+}
+
+__forceinline Quat & Quat::operator /=( const floatInVec &scalar )
+{
+    *this = *this / scalar;
+    return *this;
+}
+
+__forceinline const Quat Quat::operator -( ) const
+{
+	return Quat(_mm_sub_ps( _mm_setzero_ps(), mVec128 ) );
+}
+
+__forceinline const Quat operator *( float scalar, const Quat &quat )
+{
+    return floatInVec(scalar) * quat;
+}
+
+__forceinline const Quat operator *( const floatInVec &scalar, const Quat &quat )
+{
+    return quat * scalar;
+}
+
+__forceinline const floatInVec dot( const Quat &quat0, const Quat &quat1 )
+{
+    return floatInVec( _vmathVfDot4( quat0.get128(), quat1.get128() ), 0 );
+}
+
+__forceinline const floatInVec norm( const Quat &quat )
+{
+    return floatInVec(  _vmathVfDot4( quat.get128(), quat.get128() ), 0 );
+}
+
+__forceinline const floatInVec length( const Quat &quat )
+{
+    return floatInVec(  _mm_sqrt_ps(_vmathVfDot4( quat.get128(), quat.get128() )), 0 );
+}
+
+__forceinline const Quat normalize( const Quat &quat )
+{
+	vec_float4 dot =_vmathVfDot4( quat.get128(), quat.get128());
+    return Quat( _mm_mul_ps( quat.get128(), newtonrapson_rsqrt4( dot ) ) );
+}
+
+
+__forceinline const Quat Quat::rotation( const Vector3 &unitVec0, const Vector3 &unitVec1 )
+{
+    Vector3 crossVec;
+    __m128 cosAngle, cosAngleX2Plus2, recipCosHalfAngleX2, cosHalfAngleX2, res;
+    cosAngle = _vmathVfDot3( unitVec0.get128(), unitVec1.get128() );
+    cosAngleX2Plus2 = vec_madd( cosAngle, _mm_set1_ps(2.0f), _mm_set1_ps(2.0f) );
+    recipCosHalfAngleX2 = _mm_rsqrt_ps( cosAngleX2Plus2 );
+    cosHalfAngleX2 = vec_mul( recipCosHalfAngleX2, cosAngleX2Plus2 );
+    crossVec = cross( unitVec0, unitVec1 );
+    res = vec_mul( crossVec.get128(), recipCosHalfAngleX2 );
+	__declspec(align(16)) unsigned int sw[4] = {0, 0, 0, 0xffffffff};
+    res = vec_sel( res, vec_mul( cosHalfAngleX2, _mm_set1_ps(0.5f) ), sw );
+    return Quat( res );
+}
+
+__forceinline const Quat Quat::rotation( float radians, const Vector3 &unitVec )
+{
+    return rotation( floatInVec(radians), unitVec );
+}
+
+__forceinline const Quat Quat::rotation( const floatInVec &radians, const Vector3 &unitVec )
+{
+    __m128 s, c, angle, res;
+    angle = vec_mul( radians.get128(), _mm_set1_ps(0.5f) );
+    sincosf4( angle, &s, &c );
+	__declspec(align(16)) unsigned int sw[4] = {0, 0, 0, 0xffffffff};
+    res = vec_sel( vec_mul( unitVec.get128(), s ), c, sw );
+    return Quat( res );
+}
+
+__forceinline const Quat Quat::rotationX( float radians )
+{
+    return rotationX( floatInVec(radians) );
+}
+
+__forceinline const Quat Quat::rotationX( const floatInVec &radians )
+{
+    __m128 s, c, angle, res;
+    angle = vec_mul( radians.get128(), _mm_set1_ps(0.5f) );
+    sincosf4( angle, &s, &c );
+	__declspec(align(16)) unsigned int xsw[4] = {0xffffffff, 0, 0, 0};
+	__declspec(align(16)) unsigned int wsw[4] = {0, 0, 0, 0xffffffff};
+    res = vec_sel( _mm_setzero_ps(), s, xsw );
+    res = vec_sel( res, c, wsw );
+    return Quat( res );
+}
+
+__forceinline const Quat Quat::rotationY( float radians )
+{
+    return rotationY( floatInVec(radians) );
+}
+
+__forceinline const Quat Quat::rotationY( const floatInVec &radians )
+{
+    __m128 s, c, angle, res;
+    angle = vec_mul( radians.get128(), _mm_set1_ps(0.5f) );
+    sincosf4( angle, &s, &c );
+	__declspec(align(16)) unsigned int ysw[4] = {0, 0xffffffff, 0, 0};
+	__declspec(align(16)) unsigned int wsw[4] = {0, 0, 0, 0xffffffff};
+    res = vec_sel( _mm_setzero_ps(), s, ysw );
+    res = vec_sel( res, c, wsw );
+    return Quat( res );
+}
+
+__forceinline const Quat Quat::rotationZ( float radians )
+{
+    return rotationZ( floatInVec(radians) );
+}
+
+__forceinline const Quat Quat::rotationZ( const floatInVec &radians )
+{
+    __m128 s, c, angle, res;
+    angle = vec_mul( radians.get128(), _mm_set1_ps(0.5f) );
+    sincosf4( angle, &s, &c );
+	__declspec(align(16)) unsigned int zsw[4] = {0, 0, 0xffffffff, 0};
+	__declspec(align(16)) unsigned int wsw[4] = {0, 0, 0, 0xffffffff};
+    res = vec_sel( _mm_setzero_ps(), s, zsw );
+    res = vec_sel( res, c, wsw );
+    return Quat( res );
+}
+
+__forceinline const Quat Quat::operator *( const Quat &quat ) const
+{
+    __m128 ldata, rdata, qv, tmp0, tmp1, tmp2, tmp3;
+    __m128 product, l_wxyz, r_wxyz, xy, qw;
+    ldata = mVec128;
+    rdata = quat.mVec128;
+    tmp0 = _mm_shuffle_ps( ldata, ldata, _MM_SHUFFLE(3,0,2,1) );
+    tmp1 = _mm_shuffle_ps( rdata, rdata, _MM_SHUFFLE(3,1,0,2) );
+    tmp2 = _mm_shuffle_ps( ldata, ldata, _MM_SHUFFLE(3,1,0,2) );
+    tmp3 = _mm_shuffle_ps( rdata, rdata, _MM_SHUFFLE(3,0,2,1) );
+    qv = vec_mul( vec_splat( ldata, 3 ), rdata );
+    qv = vec_madd( vec_splat( rdata, 3 ), ldata, qv );
+    qv = vec_madd( tmp0, tmp1, qv );
+    qv = vec_nmsub( tmp2, tmp3, qv );
+    product = vec_mul( ldata, rdata );
+    l_wxyz = vec_sld( ldata, ldata, 12 );
+    r_wxyz = vec_sld( rdata, rdata, 12 );
+    qw = vec_nmsub( l_wxyz, r_wxyz, product );
+    xy = vec_madd( l_wxyz, r_wxyz, product );
+    qw = vec_sub( qw, vec_sld( xy, xy, 8 ) );
+	__declspec(align(16)) unsigned int sw[4] = {0, 0, 0, 0xffffffff};
+    return Quat( vec_sel( qv, qw, sw ) );
+}
+
+__forceinline Quat & Quat::operator *=( const Quat &quat )
+{
+    *this = *this * quat;
+    return *this;
+}
+
+__forceinline const Vector3 rotate( const Quat &quat, const Vector3 &vec )
+{    __m128 qdata, vdata, product, tmp0, tmp1, tmp2, tmp3, wwww, qv, qw, res;
+    qdata = quat.get128();
+    vdata = vec.get128();
+    tmp0 = _mm_shuffle_ps( qdata, qdata, _MM_SHUFFLE(3,0,2,1) );
+    tmp1 = _mm_shuffle_ps( vdata, vdata, _MM_SHUFFLE(3,1,0,2) );
+    tmp2 = _mm_shuffle_ps( qdata, qdata, _MM_SHUFFLE(3,1,0,2) );
+    tmp3 = _mm_shuffle_ps( vdata, vdata, _MM_SHUFFLE(3,0,2,1) );
+    wwww = vec_splat( qdata, 3 );
+    qv = vec_mul( wwww, vdata );
+    qv = vec_madd( tmp0, tmp1, qv );
+    qv = vec_nmsub( tmp2, tmp3, qv );
+    product = vec_mul( qdata, vdata );
+    qw = vec_madd( vec_sld( qdata, qdata, 4 ), vec_sld( vdata, vdata, 4 ), product );
+    qw = vec_add( vec_sld( product, product, 8 ), qw );
+    tmp1 = _mm_shuffle_ps( qv, qv, _MM_SHUFFLE(3,1,0,2) );
+    tmp3 = _mm_shuffle_ps( qv, qv, _MM_SHUFFLE(3,0,2,1) );
+    res = vec_mul( vec_splat( qw, 0 ), qdata );
+    res = vec_madd( wwww, qv, res );
+    res = vec_madd( tmp0, tmp1, res );
+    res = vec_nmsub( tmp2, tmp3, res );
+    return Vector3( res );
+}
+
+__forceinline const Quat conj( const Quat &quat )
+{
+	__declspec(align(16)) unsigned int sw[4] = {0x80000000,0x80000000,0x80000000,0};
+    return Quat( vec_xor( quat.get128(), _mm_load_ps((float *)sw) ) );
+}
+
+__forceinline const Quat select( const Quat &quat0, const Quat &quat1, bool select1 )
+{
+    return select( quat0, quat1, boolInVec(select1) );
+}
+
+//__forceinline const Quat select( const Quat &quat0, const Quat &quat1, const boolInVec &select1 )
+//{
+//    return Quat( vec_sel( quat0.get128(), quat1.get128(), select1.get128() ) );
+//}
+
+__forceinline void loadXYZW(Quat& quat, const float* fptr)
+{
+#ifdef USE_SSE2_LDDQU
+	quat = Quat(	SSEFloat(_mm_lddqu_si128((const __m128i*)((float*)(fptr)))).m128		);
+#else
+	SSEFloat fl;
+	fl.f[0] = fptr[0];
+	fl.f[1] = fptr[1];
+	fl.f[2] = fptr[2];
+	fl.f[3] = fptr[3];
+    quat = Quat(	fl.m128);
+#endif
+    
+
+}
+
+__forceinline void storeXYZW(const Quat& quat, float* fptr)
+{
+	fptr[0] = quat.getX();
+	fptr[1] = quat.getY();
+	fptr[2] = quat.getZ();
+	fptr[3] = quat.getW();
+//    _mm_storeu_ps((float*)quat.get128(),fptr);
+}
+
+
+
+#ifdef _VECTORMATH_DEBUG
+
+__forceinline void print( const Quat &quat )
+{
+    union { __m128 v; float s[4]; } tmp;
+    tmp.v = quat.get128();
+    printf( "( %f %f %f %f )\n", tmp.s[0], tmp.s[1], tmp.s[2], tmp.s[3] );
+}
+
+__forceinline void print( const Quat &quat, const char * name )
+{
+    union { __m128 v; float s[4]; } tmp;
+    tmp.v = quat.get128();
+    printf( "%s: ( %f %f %f %f )\n", name, tmp.s[0], tmp.s[1], tmp.s[2], tmp.s[3] );
+}
+
+#endif
+
+} // namespace Aos
+} // namespace Vectormath
+
+#endif
--- a/src/BulletMultiThreaded/vectormath/sse/vec_aos.h
+++ b/src/BulletMultiThreaded/vectormath/sse/vec_aos.h
--- a/src/BulletMultiThreaded/vectormath/sse/vecidx_aos.h
+++ b/src/BulletMultiThreaded/vectormath/sse/vecidx_aos.h
@@ -0,0 +1,80 @@
+/*
+   Copyright (C) 2006, 2007 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+   Redistribution and use in source and binary forms,
+   with or without modification, are permitted provided that the
+   following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the Sony Computer Entertainment Inc nor the names
+      of its contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _VECTORMATH_VECIDX_AOS_H
+#define _VECTORMATH_VECIDX_AOS_H
+
+
+#include "floatInVec.h"
+
+namespace Vectormath {
+namespace Aos {
+
+//-----------------------------------------------------------------------------
+// VecIdx 
+// Used in setting elements of Vector3, Vector4, Point3, or Quat with the 
+// subscripting operator.
+//
+
+__declspec(align(16)) class VecIdx
+{
+private:
+   __m128 &ref;
+   int i;
+public:
+    inline VecIdx( __m128& vec, int idx ): ref(vec) { i = idx; }
+
+    // implicitly casts to float unless _VECTORMATH_NO_SCALAR_CAST defined
+    // in which case, implicitly casts to floatInVec, and one must call
+    // getAsFloat to convert to float.
+    //
+#ifdef _VECTORMATH_NO_SCALAR_CAST
+    inline operator floatInVec() const;
+    inline float getAsFloat() const;
+#else
+    inline operator float() const;
+#endif
+
+    inline float operator =( float scalar );
+    inline floatInVec operator =( const floatInVec &scalar );
+    inline floatInVec operator =( const VecIdx& scalar );
+    inline floatInVec operator *=( float scalar );
+    inline floatInVec operator *=( const floatInVec &scalar );
+    inline floatInVec operator /=( float scalar );
+    inline floatInVec operator /=( const floatInVec &scalar );
+    inline floatInVec operator +=( float scalar );
+    inline floatInVec operator +=( const floatInVec &scalar );
+    inline floatInVec operator -=( float scalar );
+    inline floatInVec operator -=( const floatInVec &scalar );
+};
+
+} // namespace Aos
+} // namespace Vectormath
+
+#endif
--- a/src/BulletMultiThreaded/vectormath/sse/vectormath_aos.h
+++ b/src/BulletMultiThreaded/vectormath/sse/vectormath_aos.h
--- a/src/BulletMultiThreaded/vectormath2bullet.h
+++ b/src/BulletMultiThreaded/vectormath2bullet.h
@@ -31,12 +31,7 @@
 #define AOS_VECTORMATH_BULLET_CONVERT_H


-///only use a system-wide vectormath_aos.h on CELLOS_LV2 or if USE_SYSTEM_VECTORMATH
-#if defined(__CELLOS_LV2__) || defined (USE_SYSTEM_VECTORMATH)
-#include <vectormath_aos.h>
-#else
-#include "BulletMultiThreaded/vectormath/scalar/cpp/vectormath_aos.h"
-#endif
+#include "vectormath_aos.h"

 #include "LinearMath/btVector3.h"
 #include "LinearMath/btQuaternion.h"