diff --git a/ChangeLog.txt b/ChangeLog.txt
index 73b944af6..42d49faa3 100644
--- a/ChangeLog.txt
+++ b/ChangeLog.txt
@@ -1,6 +1,15 @@
 Bullet Continuous Collision Detection and Physics Library
 Primary author and maintainer: Erwin Coumans
 
+2007 Sept 9
+	- Added serialization for BVH/btBvhTriangleMeshShape, including endian swapping. See ConcaveDemo for an example.
+	Thanks to Phil Knight for the contribution.
+	- Fixed issues related to stack allocator/compound collision algorithm
+	Thanks Proctoid, http://www.bulletphysics.com/Bullet/phpBB3/viewtopic.php?f=18&t=1460
+	- Increased some default memory pool settings, and added a fallback for the constraints solver to use heap memory
+	- Removed accidental testing code in btScalar.h related to operator new.
+	- Enabled btAxisSweep3 and bt32BitAxisSweep3 to be linked in at the same time, using a template
+	
 2007 Sept 7
 	- Replaced several dynamic memory allocations by stack allocation and pool allocations
 	- Added branch-free quantized aabb bounding box overlap check, works better on Playstation 3 and XBox 360
diff --git a/Demos/CcdPhysicsDemo/CcdPhysicsDemo.cpp b/Demos/CcdPhysicsDemo/CcdPhysicsDemo.cpp
index c169725be..3939b0c31 100644
--- a/Demos/CcdPhysicsDemo/CcdPhysicsDemo.cpp
+++ b/Demos/CcdPhysicsDemo/CcdPhysicsDemo.cpp
@@ -14,8 +14,8 @@ subject to the following restrictions:
 */
 
 //enable just one, DO_BENCHMARK_PYRAMIDS or DO_WALL
-//#define DO_BENCHMARK_PYRAMIDS 1
-#define DO_WALL 1
+#define DO_BENCHMARK_PYRAMIDS 1
+//#define DO_WALL 1
 
 //Note: some of those settings need 'DO_WALL' demo
 //#define USE_KINEMATIC_GROUND 1
@@ -401,7 +401,10 @@ int maxNumOutstandingTasks = 4;//number of maximum outstanding tasks
 	btVector3 worldAabbMax(1000,1000,1000);
 
 	btBroadphaseInterface* broadphase = new btAxisSweep3(worldAabbMin,worldAabbMax,maxProxies);
-//	btOverlappingPairCache* broadphase = new btSimpleBroadphase;
+/// For large worlds, or more objects than the 16-bit btAxisSweep3 supports (its maxHandles must stay below 32767), use the bt32BitAxisSweep3 broadphase
+//	btBroadphaseInterface* broadphase = new bt32BitAxisSweep3(worldAabbMin,worldAabbMax,maxProxies);
+/// When trying to debug broadphase issues, try to use the btSimpleBroadphase
+//	btBroadphaseInterface* broadphase = new btSimpleBroadphase;
 	
 #ifdef REGISTER_CUSTOM_COLLISION_ALGORITHM
 	dispatcher->registerCollisionCreateFunc(SPHERE_SHAPE_PROXYTYPE,SPHERE_SHAPE_PROXYTYPE,new btSphereSphereCollisionAlgorithm::CreateFunc);
@@ -576,8 +579,8 @@ int maxNumOutstandingTasks = 4;//number of maximum outstanding tasks
 
 	localCreateRigidBody(0.f,trans,shapePtr[shapeIndex[0]]);
 
-	int numWalls = 10;
-	int wallHeight = 10;
+	int numWalls = 15;
+	int wallHeight = 15;
 	float wallDistance = 3;
 
 
diff --git a/Demos/ConcaveDemo/ConcavePhysicsDemo.cpp b/Demos/ConcaveDemo/ConcavePhysicsDemo.cpp
index adbc353c0..12139112b 100644
--- a/Demos/ConcaveDemo/ConcavePhysicsDemo.cpp
+++ b/Demos/ConcaveDemo/ConcavePhysicsDemo.cpp
@@ -111,8 +111,8 @@ int main(int argc,char** argv)
 }
 
 
-	const int NUM_VERTS_X = 50;
-	const int NUM_VERTS_Y = 50;
+	const int NUM_VERTS_X = 30;
+	const int NUM_VERTS_Y = 30;
 	const int totalVerts = NUM_VERTS_X*NUM_VERTS_Y;
 
 void	ConcaveDemo::setVertexPositions(float waveheight, float offset)
@@ -191,8 +191,64 @@ void	ConcaveDemo::initPhysics()
 		totalVerts,(btScalar*) &gVertices[0].x(),vertStride);
 
 	bool useQuantizedAabbCompression = true;
-	trimeshShape  = new btBvhTriangleMeshShape(indexVertexArrays,useQuantizedAabbCompression);
 
+	///Serialization example: with SERIALIZE_TO_DISK defined the BVH is built and written to bvh.bin,
+	///otherwise a previously written bvh.bin is loaded and assigned to the shape.
+#define SERIALIZE_TO_DISK 1
+#ifdef SERIALIZE_TO_DISK
+	trimeshShape  = new btBvhTriangleMeshShape(indexVertexArrays,useQuantizedAabbCompression);
+	
+	///we can serialize the BVH data 
+	int numBytes = trimeshShape->getOptimizedBvh()->calculateSerializeBufferSize();
+	void* buffer = btAlignedAlloc(numBytes,16);
+	bool swapEndian = true;
+	trimeshShape->getOptimizedBvh()->serialize(buffer,numBytes,swapEndian);
+	FILE* file = fopen("bvh.bin","wb");
+	if (file)
+	{
+		fwrite(buffer,1,numBytes,file);
+		fclose(file);
+	} else
+	{
+		printf("Error: cannot open bvh.bin for writing\n");
+	}
+	///the serialized copy is only needed for writing the file, so release it again
+	btAlignedFree(buffer);
+
+#else
+
+	trimeshShape  = new btBvhTriangleMeshShape(indexVertexArrays,useQuantizedAabbCompression,false);
+
+	const char* fileName = "bvh.bin";
+
+	FILE* file = fopen(fileName,"rb");
+	int size=0;
+	btOptimizedBvh* bvh = 0;
+
+	///fopen can fail (e.g. bvh.bin was never written), so check it before using the handle
+	if (!file || fseek(file, 0, SEEK_END) || (size = ftell(file)) == EOF || fseek(file, 0, SEEK_SET)) {        /* File operations denied? ok, just close and return failure */
+		printf("Error: cannot get filesize from %s\n", fileName);
+		exit(1);
+	} else
+	{
+		///the file position was already rewound by the last fseek in the condition above
+		///NOTE(review): deSerializeInPlace presumably keeps using this buffer, so it is not freed here - confirm
+		void* buffer = btAlignedAlloc(size,16);
+		memset(buffer,0xcc,size);
+		int read = fread(buffer,1,size,file);
+		fclose(file);
+		if (read != size)
+		{
+			printf("Error: cannot read %s\n", fileName);
+			exit(1);
+		}
+		bool swapEndian = true;
+		bvh = btOptimizedBvh::deSerializeInPlace(buffer,size,swapEndian);
+	}
+
+	trimeshShape->setOptimizedBvh(bvh);
+
+#endif
 
 //	btCollisionShape* groundShape = new btBoxShape(btVector3(50,3,50));
 
diff --git a/Extras/BulletMultiThreaded/SpuSolverTask/SpuParallellSolverTask.h b/Extras/BulletMultiThreaded/SpuSolverTask/SpuParallellSolverTask.h
index 84a1480c7..62f81f602 100644
--- a/Extras/BulletMultiThreaded/SpuSolverTask/SpuParallellSolverTask.h
+++ b/Extras/BulletMultiThreaded/SpuSolverTask/SpuParallellSolverTask.h
@@ -108,7 +108,7 @@ ATTRIBUTE_ALIGNED16(struct) SpuSolverBody
 
 	btMatrix3x3			m_worldInvInertiaTensor;
 
-	float				m_invertedMass;
+	btScalar				m_invertedMass;
 };
 
 ATTRIBUTE_ALIGNED16(struct) SpuSolverInternalConstraint
@@ -116,13 +116,13 @@ ATTRIBUTE_ALIGNED16(struct) SpuSolverInternalConstraint
 	uint32_t			m_localOffsetBodyA;
 	uint32_t			m_localOffsetBodyB;
 
-	float				m_appliedImpulse;
-	float				m_appliedVelocityImpulse;
+	btScalar				m_appliedImpulse;
+	btScalar				m_appliedVelocityImpulse;
 
-	float				m_friction;
-	float				m_restitution;
-	float				m_jacDiagABInv;
-	float				m_penetration;
+	btScalar				m_friction;
+	btScalar				m_restitution;
+	btScalar				m_jacDiagABInv;
+	btScalar				m_penetration;
 
 	btVector3			m_normal;
 
diff --git a/src/BulletCollision/BroadphaseCollision/btAxisSweep3.cpp b/src/BulletCollision/BroadphaseCollision/btAxisSweep3.cpp
index 87696ea99..272b700b9 100644
--- a/src/BulletCollision/BroadphaseCollision/btAxisSweep3.cpp
+++ b/src/BulletCollision/BroadphaseCollision/btAxisSweep3.cpp
@@ -21,660 +21,18 @@
 
 #include <assert.h>
 
-#ifdef DEBUG_BROADPHASE
-#include <stdio.h>
-void btAxisSweep3::debugPrintAxis(int axis, bool checkCardinality)
+btAxisSweep3::btAxisSweep3(const btPoint3& worldAabbMin,const btPoint3& worldAabbMax, unsigned short int maxHandles, btOverlappingPairCache* pairCache)
+:btAxisSweep3Internal(worldAabbMin,worldAabbMax,0xfffe,0xffff,maxHandles,pairCache)
 {
-	int numEdges = m_pHandles[0].m_maxEdges[axis];
-	printf("SAP Axis %d, numEdges=%d\n",axis,numEdges);
-
-	int i;
-	for (i=0;i<numEdges+1;i++)
-	{
-		Edge* pEdge = m_pEdges[axis] + i;
-		Handle* pHandlePrev = getHandle(pEdge->m_handle);
-		int handleIndex = pEdge->IsMax()? pHandlePrev->m_maxEdges[axis] : pHandlePrev->m_minEdges[axis];
-		char beginOrEnd;
-		beginOrEnd=pEdge->IsMax()?'E':'B';
-		printf("	[%c,h=%d,p=%x,i=%d]\n",beginOrEnd,pEdge->m_handle,pEdge->m_pos,handleIndex);
-	}
-
-	if (checkCardinality)
-		assert(numEdges == m_numHandles*2+1);
-}
-#endif //DEBUG_BROADPHASE
-
-
-btBroadphaseProxy*	btAxisSweep3::createProxy(  const btVector3& aabbMin,  const btVector3& aabbMax,int shapeType,void* userPtr,short int collisionFilterGroup,short int collisionFilterMask)
-{
-		(void)shapeType;
-		BP_FP_INT_TYPE handleId = addHandle(aabbMin,aabbMax, userPtr,collisionFilterGroup,collisionFilterMask);
-		
-		Handle* handle = getHandle(handleId);
-				
-		return handle;
-}
-
-void	btAxisSweep3::destroyProxy(btBroadphaseProxy* proxy,btDispatcher* dispatcher)
-{
-	Handle* handle = static_cast<Handle*>(proxy);
-	removeHandle(handle->m_handleId,dispatcher);
-}
-
-void	btAxisSweep3::setAabb(btBroadphaseProxy* proxy,const btVector3& aabbMin,const btVector3& aabbMax)
-{
-	Handle* handle = static_cast<Handle*>(proxy);
-	updateHandle(handle->m_handleId,aabbMin,aabbMax);
-
-}
-
-
-
-
-
-
-btAxisSweep3::btAxisSweep3(const btPoint3& worldAabbMin,const btPoint3& worldAabbMax, int maxHandles, btOverlappingPairCache* pairCache)
-:m_invalidPair(0),
-m_pairCache(pairCache),
-m_ownsPairCache(false)
-{
-	if (!m_pairCache)
-	{
-		m_pairCache = new btOverlappingPairCache();
-		m_ownsPairCache = true;
-	}
-
-	//assert(bounds.HasVolume());
-
 	// 1 handle is reserved as sentinel
-	btAssert(maxHandles > 1 && maxHandles < BP_MAX_HANDLES);
-
-	// init bounds
-	m_worldAabbMin = worldAabbMin;
-	m_worldAabbMax = worldAabbMax;
-
-	btVector3 aabbSize = m_worldAabbMax - m_worldAabbMin;
-
-	BP_FP_INT_TYPE	maxInt = BP_HANDLE_SENTINEL;
-
-	m_quantize = btVector3(btScalar(maxInt),btScalar(maxInt),btScalar(maxInt)) / aabbSize;
-
-	// allocate handles buffer and put all handles on free list
-	m_pHandles = new Handle[maxHandles];
-	m_maxHandles = maxHandles;
-	m_numHandles = 0;
-
-	// handle 0 is reserved as the null index, and is also used as the sentinel
-	m_firstFreeHandle = 1;
-	{
-		for (BP_FP_INT_TYPE i = m_firstFreeHandle; i < maxHandles; i++)
-			m_pHandles[i].SetNextFree(i + 1);
-		m_pHandles[maxHandles - 1].SetNextFree(0);
-	}
-
-	{
-	// allocate edge buffers
-	for (int i = 0; i < 3; i++)
-		m_pEdges[i] = new Edge[maxHandles * 2];
-	}
-	//removed overlap management
-
-	// make boundary sentinels
-	
-	m_pHandles[0].m_clientObject = 0;
-
-	for (int axis = 0; axis < 3; axis++)
-	{
-		m_pHandles[0].m_minEdges[axis] = 0;
-		m_pHandles[0].m_maxEdges[axis] = 1;
-
-		m_pEdges[axis][0].m_pos = 0;
-		m_pEdges[axis][0].m_handle = 0;
-		m_pEdges[axis][1].m_pos = BP_HANDLE_SENTINEL;
-		m_pEdges[axis][1].m_handle = 0;
-#ifdef DEBUG_BROADPHASE
-		debugPrintAxis(axis);
-#endif //DEBUG_BROADPHASE
-
-	}
+	btAssert(maxHandles > 1 && maxHandles < 32767);
 
 }
 
-btAxisSweep3::~btAxisSweep3()
+
+bt32BitAxisSweep3::bt32BitAxisSweep3(const btPoint3& worldAabbMin,const btPoint3& worldAabbMax, unsigned int maxHandles , btOverlappingPairCache* pairCache )
+:btAxisSweep3Internal(worldAabbMin,worldAabbMax,0xfffffffe,0x7fffffff,maxHandles,pairCache)
 {
-	
-	for (int i = 2; i >= 0; i--)
-		delete[] m_pEdges[i];
-	delete[] m_pHandles;
-
-	if (m_ownsPairCache)
-	{
-		delete m_pairCache;
-	}
+	// 1 handle is reserved as sentinel
+	btAssert(maxHandles > 1 && maxHandles < 2147483647);
 }
-
-void btAxisSweep3::quantize(BP_FP_INT_TYPE* out, const btPoint3& point, int isMax) const
-{
-	btPoint3 clampedPoint(point);
-	
-
-
-	clampedPoint.setMax(m_worldAabbMin);
-	clampedPoint.setMin(m_worldAabbMax);
-
-	btVector3 v = (clampedPoint - m_worldAabbMin) * m_quantize;
-	out[0] = (BP_FP_INT_TYPE)(((BP_FP_INT_TYPE)v.getX() & BP_HANDLE_MASK) | isMax);
-	out[1] = (BP_FP_INT_TYPE)(((BP_FP_INT_TYPE)v.getY() & BP_HANDLE_MASK) | isMax);
-	out[2] = (BP_FP_INT_TYPE)(((BP_FP_INT_TYPE)v.getZ() & BP_HANDLE_MASK) | isMax);
-	
-}
-
-
-
-BP_FP_INT_TYPE btAxisSweep3::allocHandle()
-{
-	assert(m_firstFreeHandle);
-
-	BP_FP_INT_TYPE handle = m_firstFreeHandle;
-	m_firstFreeHandle = getHandle(handle)->GetNextFree();
-	m_numHandles++;
-
-	return handle;
-}
-
-void btAxisSweep3::freeHandle(BP_FP_INT_TYPE handle)
-{
-	assert(handle > 0 && handle < m_maxHandles);
-
-	getHandle(handle)->SetNextFree(m_firstFreeHandle);
-	m_firstFreeHandle = handle;
-
-	m_numHandles--;
-}
-
-
-
-BP_FP_INT_TYPE btAxisSweep3::addHandle(const btPoint3& aabbMin,const btPoint3& aabbMax, void* pOwner,short int collisionFilterGroup,short int collisionFilterMask)
-{
-	// quantize the bounds
-	BP_FP_INT_TYPE min[3], max[3];
-	quantize(min, aabbMin, 0);
-	quantize(max, aabbMax, 1);
-
-	// allocate a handle
-	BP_FP_INT_TYPE handle = allocHandle();
-	assert(handle!= 0xcdcd);
-
-	Handle* pHandle = getHandle(handle);
-	
-	pHandle->m_handleId = handle;
-	//pHandle->m_pOverlaps = 0;
-	pHandle->m_clientObject = pOwner;
-	pHandle->m_collisionFilterGroup = collisionFilterGroup;
-	pHandle->m_collisionFilterMask = collisionFilterMask;
-
-	// compute current limit of edge arrays
-	BP_FP_INT_TYPE limit = m_numHandles * 2;
-
-	
-	// insert new edges just inside the max boundary edge
-	for (BP_FP_INT_TYPE axis = 0; axis < 3; axis++)
-	{
-
-		m_pHandles[0].m_maxEdges[axis] += 2;
-
-		m_pEdges[axis][limit + 1] = m_pEdges[axis][limit - 1];
-
-		m_pEdges[axis][limit - 1].m_pos = min[axis];
-		m_pEdges[axis][limit - 1].m_handle = handle;
-
-		m_pEdges[axis][limit].m_pos = max[axis];
-		m_pEdges[axis][limit].m_handle = handle;
-
-		pHandle->m_minEdges[axis] = limit - 1;
-		pHandle->m_maxEdges[axis] = limit;
-	}
-
-	// now sort the new edges to their correct position
-	sortMinDown(0, pHandle->m_minEdges[0], false);
-	sortMaxDown(0, pHandle->m_maxEdges[0], false);
-	sortMinDown(1, pHandle->m_minEdges[1], false);
-	sortMaxDown(1, pHandle->m_maxEdges[1], false);
-	sortMinDown(2, pHandle->m_minEdges[2], true);
-	sortMaxDown(2, pHandle->m_maxEdges[2], true);
-
-
-	return handle;
-}
-
-
-void btAxisSweep3::removeHandle(BP_FP_INT_TYPE handle,btDispatcher* dispatcher)
-{
-
-	Handle* pHandle = getHandle(handle);
-
-	//explicitly remove the pairs containing the proxy
-	//we could do it also in the sortMinUp (passing true)
-	//todo: compare performance
-	m_pairCache->removeOverlappingPairsContainingProxy(pHandle,dispatcher);
-
-
-	// compute current limit of edge arrays
-	int limit = m_numHandles * 2;
-	
-	int axis;
-
-	for (axis = 0;axis<3;axis++)
-	{
-		m_pHandles[0].m_maxEdges[axis] -= 2;
-	}
-
-	// remove the edges by sorting them up to the end of the list
-	for ( axis = 0; axis < 3; axis++)
-	{
-		Edge* pEdges = m_pEdges[axis];
-		BP_FP_INT_TYPE max = pHandle->m_maxEdges[axis];
-		pEdges[max].m_pos = BP_HANDLE_SENTINEL;
-
-		sortMaxUp(axis,max,false);
-
-
-		BP_FP_INT_TYPE i = pHandle->m_minEdges[axis];
-		pEdges[i].m_pos = BP_HANDLE_SENTINEL;
-
-
-		sortMinUp(axis,i,false);
-
-		pEdges[limit-1].m_handle = 0;
-		pEdges[limit-1].m_pos = BP_HANDLE_SENTINEL;
-		
-#ifdef DEBUG_BROADPHASE
-			debugPrintAxis(axis,false);
-#endif //DEBUG_BROADPHASE
-
-
-	}
-
-
-	// free the handle
-	freeHandle(handle);
-
-	
-}
-
-extern int gOverlappingPairs;
-
-
-void	btAxisSweep3::calculateOverlappingPairs(btDispatcher* dispatcher)
-{
-	
-	if (m_ownsPairCache)
-	{
-		btBroadphasePairArray&	overlappingPairArray = m_pairCache->getOverlappingPairArray();
-
-		//perform a sort, to find duplicates and to sort 'invalid' pairs to the end
-		overlappingPairArray.heapSort(btBroadphasePairSortPredicate());
-
-		overlappingPairArray.resize(overlappingPairArray.size() - m_invalidPair);
-		m_invalidPair = 0;
-
-		
-		int i;
-
-		btBroadphasePair previousPair;
-		previousPair.m_pProxy0 = 0;
-		previousPair.m_pProxy1 = 0;
-		previousPair.m_algorithm = 0;
-		
-		
-		for (i=0;i<overlappingPairArray.size();i++)
-		{
-		
-			btBroadphasePair& pair = overlappingPairArray[i];
-
-			bool isDuplicate = (pair == previousPair);
-
-			previousPair = pair;
-
-			bool needsRemoval = false;
-
-			if (!isDuplicate)
-			{
-				bool hasOverlap = testAabbOverlap(pair.m_pProxy0,pair.m_pProxy1);
-
-				if (hasOverlap)
-				{
-					needsRemoval = false;//callback->processOverlap(pair);
-				} else
-				{
-					needsRemoval = true;
-				}
-			} else
-			{
-				//remove duplicate
-				needsRemoval = true;
-				//should have no algorithm
-				btAssert(!pair.m_algorithm);
-			}
-			
-			if (needsRemoval)
-			{
-				m_pairCache->cleanOverlappingPair(pair,dispatcher);
-
-		//		m_overlappingPairArray.swap(i,m_overlappingPairArray.size()-1);
-		//		m_overlappingPairArray.pop_back();
-				pair.m_pProxy0 = 0;
-				pair.m_pProxy1 = 0;
-				m_invalidPair++;
-				gOverlappingPairs--;
-			} 
-			
-		}
-
-	///if you don't like to skip the invalid pairs in the array, execute following code:
-	#define CLEAN_INVALID_PAIRS 1
-	#ifdef CLEAN_INVALID_PAIRS
-
-		//perform a sort, to sort 'invalid' pairs to the end
-		overlappingPairArray.heapSort(btBroadphasePairSortPredicate());
-
-		overlappingPairArray.resize(overlappingPairArray.size() - m_invalidPair);
-		m_invalidPair = 0;
-	#endif//CLEAN_INVALID_PAIRS
-		
-	}
-
-}
-
-
-
-bool btAxisSweep3::testAabbOverlap(btBroadphaseProxy* proxy0,btBroadphaseProxy* proxy1)
-{
-	const Handle* pHandleA = static_cast<Handle*>(proxy0);
-	const Handle* pHandleB = static_cast<Handle*>(proxy1);
-	
-	//optimization 1: check the array index (memory address), instead of the m_pos
-
-	for (int axis = 0; axis < 3; axis++)
-	{ 
-		if (pHandleA->m_maxEdges[axis] < pHandleB->m_minEdges[axis] || 
-			pHandleB->m_maxEdges[axis] < pHandleA->m_minEdges[axis]) 
-		{ 
-			return false; 
-		} 
-	} 
-	return true;
-}
-
-bool btAxisSweep3::testOverlap(int ignoreAxis,const Handle* pHandleA, const Handle* pHandleB)
-{
-	//optimization 1: check the array index (memory address), instead of the m_pos
-
-	for (int axis = 0; axis < 3; axis++)
-	{ 
-		if (axis != ignoreAxis)
-		{
-			if (pHandleA->m_maxEdges[axis] < pHandleB->m_minEdges[axis] || 
-				pHandleB->m_maxEdges[axis] < pHandleA->m_minEdges[axis]) 
-			{ 
-				return false; 
-			} 
-		}
-	} 
-
-	//optimization 2: only 2 axis need to be tested (conflicts with 'delayed removal' optimization)
-
-	/*for (int axis = 0; axis < 3; axis++)
-	{
-		if (m_pEdges[axis][pHandleA->m_maxEdges[axis]].m_pos < m_pEdges[axis][pHandleB->m_minEdges[axis]].m_pos ||
-			m_pEdges[axis][pHandleB->m_maxEdges[axis]].m_pos < m_pEdges[axis][pHandleA->m_minEdges[axis]].m_pos)
-		{
-			return false;
-		}
-	}
-	*/
-
-	return true;
-}
-
-void btAxisSweep3::updateHandle(BP_FP_INT_TYPE handle, const btPoint3& aabbMin,const btPoint3& aabbMax)
-{
-//	assert(bounds.IsFinite());
-	//assert(bounds.HasVolume());
-
-	Handle* pHandle = getHandle(handle);
-
-	// quantize the new bounds
-	BP_FP_INT_TYPE min[3], max[3];
-	quantize(min, aabbMin, 0);
-	quantize(max, aabbMax, 1);
-
-	// update changed edges
-	for (int axis = 0; axis < 3; axis++)
-	{
-		BP_FP_INT_TYPE emin = pHandle->m_minEdges[axis];
-		BP_FP_INT_TYPE emax = pHandle->m_maxEdges[axis];
-
-		int dmin = (int)min[axis] - (int)m_pEdges[axis][emin].m_pos;
-		int dmax = (int)max[axis] - (int)m_pEdges[axis][emax].m_pos;
-
-		m_pEdges[axis][emin].m_pos = min[axis];
-		m_pEdges[axis][emax].m_pos = max[axis];
-
-		// expand (only adds overlaps)
-		if (dmin < 0)
-			sortMinDown(axis, emin);
-
-		if (dmax > 0)
-			sortMaxUp(axis, emax);
-
-		// shrink (only removes overlaps)
-		if (dmin > 0)
-			sortMinUp(axis, emin);
-
-		if (dmax < 0)
-			sortMaxDown(axis, emax);
-
-#ifdef DEBUG_BROADPHASE
-	debugPrintAxis(axis);
-#endif //DEBUG_BROADPHASE
-	}
-
-	
-}
-
-
-
-
-// sorting a min edge downwards can only ever *add* overlaps
-void btAxisSweep3::sortMinDown(int axis, BP_FP_INT_TYPE edge, bool updateOverlaps)
-{
-
-	Edge* pEdge = m_pEdges[axis] + edge;
-	Edge* pPrev = pEdge - 1;
-	Handle* pHandleEdge = getHandle(pEdge->m_handle);
-
-	while (pEdge->m_pos < pPrev->m_pos)
-	{
-		Handle* pHandlePrev = getHandle(pPrev->m_handle);
-
-		if (pPrev->IsMax())
-		{
-			// if previous edge is a maximum check the bounds and add an overlap if necessary
-			if (updateOverlaps && testOverlap(axis,pHandleEdge, pHandlePrev))
-			{
-				m_pairCache->addOverlappingPair(pHandleEdge,pHandlePrev);
-
-				//AddOverlap(pEdge->m_handle, pPrev->m_handle);
-
-			}
-
-			// update edge reference in other handle
-			pHandlePrev->m_maxEdges[axis]++;
-		}
-		else
-			pHandlePrev->m_minEdges[axis]++;
-
-		pHandleEdge->m_minEdges[axis]--;
-
-		// swap the edges
-		Edge swap = *pEdge;
-		*pEdge = *pPrev;
-		*pPrev = swap;
-
-		// decrement
-		pEdge--;
-		pPrev--;
-	}
-
-#ifdef DEBUG_BROADPHASE
-	debugPrintAxis(axis);
-#endif //DEBUG_BROADPHASE
-
-}
-
-// sorting a min edge upwards can only ever *remove* overlaps
-void btAxisSweep3::sortMinUp(int axis, BP_FP_INT_TYPE edge, bool updateOverlaps)
-{
-	Edge* pEdge = m_pEdges[axis] + edge;
-	Edge* pNext = pEdge + 1;
-	Handle* pHandleEdge = getHandle(pEdge->m_handle);
-
-	while (pNext->m_handle && (pEdge->m_pos >= pNext->m_pos))
-	{
-		Handle* pHandleNext = getHandle(pNext->m_handle);
-
-		if (pNext->IsMax())
-		{
-			// if next edge is maximum remove any overlap between the two handles
-			if (updateOverlaps)
-			{
-				/*
-				Handle* handle0 = getHandle(pEdge->m_handle);
-				Handle* handle1 = getHandle(pNext->m_handle);
-				btBroadphasePair tmpPair(*handle0,*handle1);
-				removeOverlappingPair(tmpPair);
-				*/
-
-			}
-
-			// update edge reference in other handle
-			pHandleNext->m_maxEdges[axis]--;
-		}
-		else
-			pHandleNext->m_minEdges[axis]--;
-
-		pHandleEdge->m_minEdges[axis]++;
-
-		// swap the edges
-		Edge swap = *pEdge;
-		*pEdge = *pNext;
-		*pNext = swap;
-
-		// increment
-		pEdge++;
-		pNext++;
-	}
-
-
-}
-
-// sorting a max edge downwards can only ever *remove* overlaps
-void btAxisSweep3::sortMaxDown(int axis, BP_FP_INT_TYPE edge, bool updateOverlaps)
-{
-
-	Edge* pEdge = m_pEdges[axis] + edge;
-	Edge* pPrev = pEdge - 1;
-	Handle* pHandleEdge = getHandle(pEdge->m_handle);
-
-	while (pEdge->m_pos < pPrev->m_pos)
-	{
-		Handle* pHandlePrev = getHandle(pPrev->m_handle);
-
-		if (!pPrev->IsMax())
-		{
-			// if previous edge was a minimum remove any overlap between the two handles
-			if (updateOverlaps)
-			{
-				//this is done during the overlappingpairarray iteration/narrowphase collision
-				/*
-				Handle* handle0 = getHandle(pEdge->m_handle);
-				Handle* handle1 = getHandle(pPrev->m_handle);
-				btBroadphasePair* pair = findPair(handle0,handle1);
-				//assert(pair);
-
-				if (pair)
-				{
-					removeOverlappingPair(*pair);
-				}
-				*/
-
-			}
-
-			// update edge reference in other handle
-			pHandlePrev->m_minEdges[axis]++;;
-		}
-		else
-			pHandlePrev->m_maxEdges[axis]++;
-
-		pHandleEdge->m_maxEdges[axis]--;
-
-		// swap the edges
-		Edge swap = *pEdge;
-		*pEdge = *pPrev;
-		*pPrev = swap;
-
-		// decrement
-		pEdge--;
-		pPrev--;
-	}
-
-	
-#ifdef DEBUG_BROADPHASE
-	debugPrintAxis(axis);
-#endif //DEBUG_BROADPHASE
-
-}
-
-// sorting a max edge upwards can only ever *add* overlaps
-void btAxisSweep3::sortMaxUp(int axis, BP_FP_INT_TYPE edge, bool updateOverlaps)
-{
-	Edge* pEdge = m_pEdges[axis] + edge;
-	Edge* pNext = pEdge + 1;
-	Handle* pHandleEdge = getHandle(pEdge->m_handle);
-
-	while (pNext->m_handle && (pEdge->m_pos >= pNext->m_pos))
-	{
-		Handle* pHandleNext = getHandle(pNext->m_handle);
-
-		if (!pNext->IsMax())
-		{
-			// if next edge is a minimum check the bounds and add an overlap if necessary
-			if (updateOverlaps && testOverlap(axis, pHandleEdge, pHandleNext))
-			{
-				Handle* handle0 = getHandle(pEdge->m_handle);
-				Handle* handle1 = getHandle(pNext->m_handle);
-				m_pairCache->addOverlappingPair(handle0,handle1);
-			}
-
-			// update edge reference in other handle
-			pHandleNext->m_minEdges[axis]--;
-		}
-		else
-			pHandleNext->m_maxEdges[axis]--;
-
-		pHandleEdge->m_maxEdges[axis]++;
-
-		// swap the edges
-		Edge swap = *pEdge;
-		*pEdge = *pNext;
-		*pNext = swap;
-
-		// increment
-		pEdge++;
-		pNext++;
-	}
-	
-}
-
-
diff --git a/src/BulletCollision/BroadphaseCollision/btAxisSweep3.h b/src/BulletCollision/BroadphaseCollision/btAxisSweep3.h
index 6e5f85a76..40ec2090c 100644
--- a/src/BulletCollision/BroadphaseCollision/btAxisSweep3.h
+++ b/src/BulletCollision/BroadphaseCollision/btAxisSweep3.h
@@ -26,28 +26,17 @@
 #include "btBroadphaseProxy.h"
 
 
-//Enable BP_USE_FIXEDPOINT_INT_32 if you need more then 32767 objects
-//#define BP_USE_FIXEDPOINT_INT_32 1
-
-#ifdef BP_USE_FIXEDPOINT_INT_32
-	#define BP_FP_INT_TYPE unsigned int
-	#define BP_MAX_HANDLES 1500000 //arbitrary maximum number of handles
-	#define BP_HANDLE_SENTINEL 0x7fffffff
-	#define BP_HANDLE_MASK	0xfffffffe
-#else
-	#define BP_FP_INT_TYPE unsigned short int
-	#define BP_MAX_HANDLES 32767
-	#define BP_HANDLE_SENTINEL 0xffff
-	#define BP_HANDLE_MASK	0xfffe
-#endif //BP_USE_FIXEDPOINT_INT_32
-
 //#define DEBUG_BROADPHASE 1
 
-/// btAxisSweep3 is an efficient implementation of the 3d axis sweep and prune broadphase.
-/// It uses arrays rather then lists for storage of the 3 axis. Also it operates using integer coordinates instead of floats.
-/// The testOverlap check is optimized to check the array index, rather then the actual AABB coordinates/pos
-class btAxisSweep3 : public btBroadphaseInterface
+/// btAxisSweep3Internal is an internal template class that implements sweep and prune.
+/// Don't use this class directly, use btAxisSweep3 or bt32BitAxisSweep3 instead.
+template <typename BP_FP_INT_TYPE>
+class btAxisSweep3Internal : public btBroadphaseInterface
 {
+protected:
+
+	BP_FP_INT_TYPE	m_bpHandleMask;
+	BP_FP_INT_TYPE	m_handleSentinel;
 
 public:
 	
@@ -85,7 +74,7 @@ protected:
 	btVector3 m_quantize;						// scaling factor for quantization
 
 	BP_FP_INT_TYPE m_numHandles;						// number of active handles
-	int m_maxHandles;						// max number of handles
+	BP_FP_INT_TYPE m_maxHandles;						// max number of handles
 	Handle* m_pHandles;						// handles pool
 	BP_FP_INT_TYPE m_firstFreeHandle;		// free handles list
 
@@ -118,8 +107,11 @@ protected:
 	void sortMaxUp(int axis, BP_FP_INT_TYPE edge, bool updateOverlaps = true);
 
 public:
-	btAxisSweep3(const btPoint3& worldAabbMin,const btPoint3& worldAabbMax, int maxHandles = 16384, btOverlappingPairCache* pairCache=0);
-	virtual	~btAxisSweep3();
+
+	btAxisSweep3Internal(const btPoint3& worldAabbMin,const btPoint3& worldAabbMax, BP_FP_INT_TYPE handleMask, BP_FP_INT_TYPE handleSentinel, BP_FP_INT_TYPE maxHandles = 16384, btOverlappingPairCache* pairCache=0);
+
+	virtual	~btAxisSweep3Internal();
+
 
 	virtual void	calculateOverlappingPairs(btDispatcher* dispatcher);
 	
@@ -149,5 +141,708 @@ public:
 
 };
 
+////////////////////////////////////////////////////////////////////
+
+
+
+
+#ifdef DEBUG_BROADPHASE
+#include <stdio.h>
+/// Debug helper (only compiled with DEBUG_BROADPHASE): prints every edge stored for the given axis and, when checkCardinality is set, asserts the edge count against m_numHandles.
+template <typename BP_FP_INT_TYPE>
+void btAxisSweep3Internal<BP_FP_INT_TYPE>::debugPrintAxis(int axis, bool checkCardinality)
+{
+	int numEdges = m_pHandles[0].m_maxEdges[axis];	// handle 0 is the sentinel; its max edge index marks the last edge in use
+	printf("SAP Axis %d, numEdges=%d\n",axis,numEdges);
+
+	int i;
+	for (i=0;i<numEdges+1;i++)
+	{
+		Edge* pEdge = m_pEdges[axis] + i;
+		Handle* pHandlePrev = getHandle(pEdge->m_handle);
+		int handleIndex = pEdge->IsMax()? pHandlePrev->m_maxEdges[axis] : pHandlePrev->m_minEdges[axis];
+		char beginOrEnd;
+		beginOrEnd=pEdge->IsMax()?'E':'B';
+		printf("	[%c,h=%d,p=%x,i=%d]\n",beginOrEnd,pEdge->m_handle,pEdge->m_pos,handleIndex);
+	}
+
+	if (checkCardinality)
+		assert(numEdges == m_numHandles*2+1);
+}
+#endif //DEBUG_BROADPHASE
+/// Adds a handle for the given AABB through addHandle and returns that Handle as the broadphase proxy.
+template <typename BP_FP_INT_TYPE>
+btBroadphaseProxy*	btAxisSweep3Internal<BP_FP_INT_TYPE>::createProxy(  const btVector3& aabbMin,  const btVector3& aabbMax,int shapeType,void* userPtr,short int collisionFilterGroup,short int collisionFilterMask)
+{
+		(void)shapeType;	// shapeType is intentionally unused here
+		BP_FP_INT_TYPE handleId = addHandle(aabbMin,aabbMax, userPtr,collisionFilterGroup,collisionFilterMask);
+		
+		Handle* handle = getHandle(handleId);
+				
+		return handle;
+}
+
+
+/// Removes the handle behind the given proxy; proxy is static_cast to Handle, so it is expected to be one returned by createProxy.
+template <typename BP_FP_INT_TYPE>
+void	btAxisSweep3Internal<BP_FP_INT_TYPE>::destroyProxy(btBroadphaseProxy* proxy,btDispatcher* dispatcher)
+{
+	Handle* handle = static_cast<Handle*>(proxy);
+	removeHandle(handle->m_handleId,dispatcher);
+}
+/// Forwards the new AABB of the handle behind the given proxy to updateHandle.
+template <typename BP_FP_INT_TYPE>
+void	btAxisSweep3Internal<BP_FP_INT_TYPE>::setAabb(btBroadphaseProxy* proxy,const btVector3& aabbMin,const btVector3& aabbMax)
+{
+	Handle* handle = static_cast<Handle*>(proxy);
+	updateHandle(handle->m_handleId,aabbMin,aabbMax);
+
+}
+
+
+
+/// Creates (or adopts) the pair cache, derives the quantization scale from the world AABB, and sets up the handle free list and the per-axis edge arrays with their boundary sentinels.
+/// m_bpHandleMask and m_handleSentinel are the first declared members, so they lead the initializer list to match the actual initialization order.
+template <typename BP_FP_INT_TYPE>
+btAxisSweep3Internal<BP_FP_INT_TYPE>::btAxisSweep3Internal(const btPoint3& worldAabbMin,const btPoint3& worldAabbMax, BP_FP_INT_TYPE handleMask, BP_FP_INT_TYPE handleSentinel,BP_FP_INT_TYPE maxHandles, btOverlappingPairCache* pairCache )
+:m_bpHandleMask(handleMask),
+m_handleSentinel(handleSentinel),
+m_invalidPair(0),
+m_pairCache(pairCache),
+m_ownsPairCache(false)
+{
+	if (!m_pairCache)
+	{
+		m_pairCache = new btOverlappingPairCache();
+		m_ownsPairCache = true;
+	}
+
+	//assert(bounds.HasVolume());
+
+	// init bounds
+	m_worldAabbMin = worldAabbMin;
+	m_worldAabbMax = worldAabbMax;
+
+	btVector3 aabbSize = m_worldAabbMax - m_worldAabbMin;
+
+	BP_FP_INT_TYPE	maxInt = m_handleSentinel;
+
+	m_quantize = btVector3(btScalar(maxInt),btScalar(maxInt),btScalar(maxInt)) / aabbSize;
+
+	// allocate handles buffer and put all handles on free list
+	m_pHandles = new Handle[maxHandles];
+	m_maxHandles = maxHandles;
+	m_numHandles = 0;
+
+	// handle 0 is reserved as the null index, and is also used as the sentinel
+	m_firstFreeHandle = 1;
+	{
+		for (BP_FP_INT_TYPE i = m_firstFreeHandle; i < maxHandles; i++)
+			m_pHandles[i].SetNextFree(i + 1);
+		m_pHandles[maxHandles - 1].SetNextFree(0);
+	}
+
+	{
+	// allocate edge buffers
+	for (int i = 0; i < 3; i++)
+		m_pEdges[i] = new Edge[maxHandles * 2];
+	}
+	//removed overlap management
+
+	// make boundary sentinels
+	
+	m_pHandles[0].m_clientObject = 0;
+
+	for (int axis = 0; axis < 3; axis++)
+	{
+		m_pHandles[0].m_minEdges[axis] = 0;
+		m_pHandles[0].m_maxEdges[axis] = 1;
+
+		m_pEdges[axis][0].m_pos = 0;
+		m_pEdges[axis][0].m_handle = 0;
+		m_pEdges[axis][1].m_pos = m_handleSentinel;
+		m_pEdges[axis][1].m_handle = 0;
+#ifdef DEBUG_BROADPHASE
+		debugPrintAxis(axis);
+#endif //DEBUG_BROADPHASE
+
+	}
+
+}
+/// Frees the three edge arrays and the handle pool, plus the pair cache when this object allocated it itself (m_ownsPairCache).
+template <typename BP_FP_INT_TYPE>
+btAxisSweep3Internal<BP_FP_INT_TYPE>::~btAxisSweep3Internal()
+{
+	
+	for (int i = 2; i >= 0; i--)
+		delete[] m_pEdges[i];
+	delete[] m_pHandles;
+
+	if (m_ownsPairCache)
+	{
+		delete m_pairCache;
+	}
+}
+/// Converts a point into three integer coordinates: scales it by m_quantize relative to m_worldAabbMin, masks each value with m_bpHandleMask and ORs in isMax.
+template <typename BP_FP_INT_TYPE>
+void btAxisSweep3Internal<BP_FP_INT_TYPE>::quantize(BP_FP_INT_TYPE* out, const btPoint3& point, int isMax) const
+{
+	btPoint3 clampedPoint(point);
+	// restrict the point to the world bounds before scaling
+
+
+	clampedPoint.setMax(m_worldAabbMin);
+	clampedPoint.setMin(m_worldAabbMax);
+
+	btVector3 v = (clampedPoint - m_worldAabbMin) * m_quantize;
+	out[0] = (BP_FP_INT_TYPE)(((BP_FP_INT_TYPE)v.getX() & m_bpHandleMask) | isMax);
+	out[1] = (BP_FP_INT_TYPE)(((BP_FP_INT_TYPE)v.getY() & m_bpHandleMask) | isMax);
+	out[2] = (BP_FP_INT_TYPE)(((BP_FP_INT_TYPE)v.getZ() & m_bpHandleMask) | isMax);
+	
+}
+
+/// Takes the first handle off the free list, increments m_numHandles and returns the handle index.
+template <typename BP_FP_INT_TYPE>
+BP_FP_INT_TYPE btAxisSweep3Internal<BP_FP_INT_TYPE>::allocHandle()
+{
+	assert(m_firstFreeHandle);	// index 0 terminates the free list, so this fires when no free handle is left
+
+	BP_FP_INT_TYPE handle = m_firstFreeHandle;
+	m_firstFreeHandle = getHandle(handle)->GetNextFree();
+	m_numHandles++;
+
+	return handle;
+}
+/// Puts the handle back at the front of the free list and decrements m_numHandles.
+template <typename BP_FP_INT_TYPE>
+void btAxisSweep3Internal<BP_FP_INT_TYPE>::freeHandle(BP_FP_INT_TYPE handle)
+{
+	assert(handle > 0 && handle < m_maxHandles);	// index 0 is the reserved sentinel handle and must never be freed
+
+	getHandle(handle)->SetNextFree(m_firstFreeHandle);
+	m_firstFreeHandle = handle;
+
+	m_numHandles--;
+}
+/// Quantizes the AABB, allocates a handle, places its min/max edges just before the max sentinel edge on every axis and sorts them down into position.
+/// Only the sorts on axis 2 pass updateOverlaps=true; the sorts on axes 0 and 1 pass false. Returns the new handle index.
+template <typename BP_FP_INT_TYPE>
+BP_FP_INT_TYPE btAxisSweep3Internal<BP_FP_INT_TYPE>::addHandle(const btPoint3& aabbMin,const btPoint3& aabbMax, void* pOwner,short int collisionFilterGroup,short int collisionFilterMask)
+{
+	// quantize the bounds
+	BP_FP_INT_TYPE min[3], max[3];
+	quantize(min, aabbMin, 0);
+	quantize(max, aabbMax, 1);
+
+	// allocate a handle
+	BP_FP_INT_TYPE handle = allocHandle();
+	assert(handle!= 0xcdcd);
+
+	Handle* pHandle = getHandle(handle);
+	
+	pHandle->m_handleId = handle;
+	//pHandle->m_pOverlaps = 0;
+	pHandle->m_clientObject = pOwner;
+	pHandle->m_collisionFilterGroup = collisionFilterGroup;
+	pHandle->m_collisionFilterMask = collisionFilterMask;
+
+	// compute current limit of edge arrays
+	BP_FP_INT_TYPE limit = m_numHandles * 2;
+
+	
+	// insert new edges just inside the max boundary edge
+	for (BP_FP_INT_TYPE axis = 0; axis < 3; axis++)
+	{
+
+		m_pHandles[0].m_maxEdges[axis] += 2;
+
+		m_pEdges[axis][limit + 1] = m_pEdges[axis][limit - 1];
+
+		m_pEdges[axis][limit - 1].m_pos = min[axis];
+		m_pEdges[axis][limit - 1].m_handle = handle;
+
+		m_pEdges[axis][limit].m_pos = max[axis];
+		m_pEdges[axis][limit].m_handle = handle;
+
+		pHandle->m_minEdges[axis] = limit - 1;
+		pHandle->m_maxEdges[axis] = limit;
+	}
+
+	// now sort the new edges to their correct position
+	sortMinDown(0, pHandle->m_minEdges[0], false);
+	sortMaxDown(0, pHandle->m_maxEdges[0], false);
+	sortMinDown(1, pHandle->m_minEdges[1], false);
+	sortMaxDown(1, pHandle->m_maxEdges[1], false);
+	sortMinDown(2, pHandle->m_minEdges[2], true);
+	sortMaxDown(2, pHandle->m_maxEdges[2], true);
+
+
+	return handle;
+}
+
+/// Removes a handle: drops its pairs from the pair cache, gives both of its edges the sentinel position on every axis and sorts them up to the end of the edge list, then frees the handle.
+template <typename BP_FP_INT_TYPE>
+void btAxisSweep3Internal<BP_FP_INT_TYPE>::removeHandle(BP_FP_INT_TYPE handle,btDispatcher* dispatcher)
+{
+
+	Handle* pHandle = getHandle(handle);
+
+	//explicitly remove the pairs containing the proxy
+	//we could do it also in the sortMinUp (passing true)
+	//todo: compare performance
+	m_pairCache->removeOverlappingPairsContainingProxy(pHandle,dispatcher);
+
+
+	// compute current limit of edge arrays
+	int limit = m_numHandles * 2;
+	
+	int axis;
+	// the max boundary edge referenced by handle 0 moves down by two slots
+	for (axis = 0;axis<3;axis++)
+	{
+		m_pHandles[0].m_maxEdges[axis] -= 2;
+	}
+
+	// remove the edges by sorting them up to the end of the list
+	for ( axis = 0; axis < 3; axis++)
+	{
+		Edge* pEdges = m_pEdges[axis];
+		BP_FP_INT_TYPE max = pHandle->m_maxEdges[axis];
+		pEdges[max].m_pos = m_handleSentinel;
+		// overlaps are not updated while sorting; the pairs were already removed above
+		sortMaxUp(axis,max,false);
+
+
+		BP_FP_INT_TYPE i = pHandle->m_minEdges[axis];
+		pEdges[i].m_pos = m_handleSentinel;
+
+
+		sortMinUp(axis,i,false);
+		// slot limit-1 is reset to handle 0 at the sentinel position (the slot handle 0's max edge now refers to)
+		pEdges[limit-1].m_handle = 0;
+		pEdges[limit-1].m_pos = m_handleSentinel;
+		
+#ifdef DEBUG_BROADPHASE
+			debugPrintAxis(axis,false);
+#endif //DEBUG_BROADPHASE
+
+
+	}
+
+
+	// free the handle
+	freeHandle(handle);
+
+	
+}
+
+extern int gOverlappingPairs;
+
+/// Cleans up the overlapping-pair array; does nothing unless this broadphase owns its pair cache.
+/// Duplicate pairs and pairs whose handles no longer overlap are passed to
+/// cleanOverlappingPair(), marked invalid (proxies zeroed) and finally cut off the array.
+template <typename BP_FP_INT_TYPE>
+void	btAxisSweep3Internal<BP_FP_INT_TYPE>::calculateOverlappingPairs(btDispatcher* dispatcher)
+{
+	if (!m_ownsPairCache)
+	{
+		// a pair cache that is not owned by this broadphase is left untouched
+		return;
+	}
+
+	btBroadphasePairArray&	pairs = m_pairCache->getOverlappingPairArray();
+
+	// sorting puts duplicates next to each other and moves 'invalid' pairs to the end
+	pairs.heapSort(btBroadphasePairSortPredicate());
+
+	// drop the m_invalidPair entries that the sort moved to the end
+	pairs.resize(pairs.size() - m_invalidPair);
+	m_invalidPair = 0;
+
+
+	// the pair visited in the previous iteration; starts out with null proxies
+	btBroadphasePair lastSeen;
+	lastSeen.m_pProxy0 = 0;
+	lastSeen.m_pProxy1 = 0;
+	lastSeen.m_algorithm = 0;
+
+
+	for (int index = 0; index < pairs.size(); index++)
+	{
+
+		btBroadphasePair& current = pairs[index];
+
+		const bool repeated = (current == lastSeen);
+
+		lastSeen = current;
+
+		bool discard;
+
+		if (repeated)
+		{
+			//remove duplicate
+			discard = true;
+			//should have no algorithm
+			btAssert(!current.m_algorithm);
+		}
+		else
+		{
+			// keep the pair only while testAabbOverlap still reports an overlap
+			discard = !testAabbOverlap(current.m_pProxy0,current.m_pProxy1);
+		}
+
+		if (!discard)
+		{
+			continue;
+		}
+
+		m_pairCache->cleanOverlappingPair(current,dispatcher);
+
+		// the entry is not erased here: zeroing the proxies marks it invalid and
+		// m_invalidPair counts how many entries the resize below has to drop
+		current.m_pProxy0 = 0;
+		current.m_pProxy1 = 0;
+		m_invalidPair++;
+		gOverlappingPairs--;
+
+	}
+
+	///if you don't like to skip the invalid pairs in the array, execute following code:
+	#define CLEAN_INVALID_PAIRS 1
+	#ifdef CLEAN_INVALID_PAIRS
+
+	//perform a sort, to sort 'invalid' pairs to the end
+	pairs.heapSort(btBroadphasePairSortPredicate());
+
+	pairs.resize(pairs.size() - m_invalidPair);
+	m_invalidPair = 0;
+	#endif//CLEAN_INVALID_PAIRS
+
+}
+
+
+/// Reports whether two proxies overlap on all three axes, judged by where their
+/// edges sit in the sorted edge arrays.
+template <typename BP_FP_INT_TYPE>
+bool btAxisSweep3Internal<BP_FP_INT_TYPE>::testAabbOverlap(btBroadphaseProxy* proxy0,btBroadphaseProxy* proxy1)
+{
+	const Handle* first = static_cast<Handle*>(proxy0);
+	const Handle* second = static_cast<Handle*>(proxy1);
+
+	//optimization 1: check the array index (memory address), instead of the m_pos
+	bool overlapping = true;
+	for (int axis = 0; overlapping && axis < 3; axis++)
+	{
+		// on this axis each box has to start no later than the other one ends
+		overlapping = first->m_minEdges[axis] <= second->m_maxEdges[axis] &&
+			second->m_minEdges[axis] <= first->m_maxEdges[axis];
+	}
+	return overlapping;
+}
+
+/// Reports whether two handles overlap on every axis other than 'ignoreAxis'.
+template <typename BP_FP_INT_TYPE>
+bool btAxisSweep3Internal<BP_FP_INT_TYPE>::testOverlap(int ignoreAxis,const Handle* pHandleA, const Handle* pHandleB)
+{
+	//optimization 1: check the array index (memory address), instead of the m_pos
+	for (int axis = 0; axis < 3; axis++)
+	{
+		// the ignored axis takes no part in the test
+		if (axis == ignoreAxis)
+			continue;
+
+		const bool aEndsBeforeB = pHandleA->m_maxEdges[axis] < pHandleB->m_minEdges[axis];
+		const bool bEndsBeforeA = pHandleB->m_maxEdges[axis] < pHandleA->m_minEdges[axis];
+		if (aEndsBeforeB || bEndsBeforeA)
+			return false;
+	}
+
+	//optimization 2: only 2 axis need to be tested (conflicts with 'delayed removal' optimization)
+
+	/*for (int axis = 0; axis < 3; axis++)
+	{
+		if (m_pEdges[axis][pHandleA->m_maxEdges[axis]].m_pos < m_pEdges[axis][pHandleB->m_minEdges[axis]].m_pos ||
+			m_pEdges[axis][pHandleB->m_maxEdges[axis]].m_pos < m_pEdges[axis][pHandleA->m_minEdges[axis]].m_pos)
+		{
+			return false;
+		}
+	}
+	*/
+
+	return true;
+}
+/// Moves an existing handle to a new AABB: quantizes the new bounds, writes the new edge positions and re-sorts each edge in the direction it moved, one axis at a time.
+template <typename BP_FP_INT_TYPE>
+void btAxisSweep3Internal<BP_FP_INT_TYPE>::updateHandle(BP_FP_INT_TYPE handle, const btPoint3& aabbMin,const btPoint3& aabbMax)
+{
+//	assert(bounds.IsFinite());
+	//assert(bounds.HasVolume());
+
+	Handle* pHandle = getHandle(handle);
+
+	// quantize the new bounds
+	BP_FP_INT_TYPE min[3], max[3];
+	quantize(min, aabbMin, 0);
+	quantize(max, aabbMax, 1);
+
+	// update changed edges
+	for (int axis = 0; axis < 3; axis++)
+	{
+		BP_FP_INT_TYPE emin = pHandle->m_minEdges[axis];
+		BP_FP_INT_TYPE emax = pHandle->m_maxEdges[axis];
+		// signed change of each edge position (new quantized value minus stored m_pos)
+		int dmin = (int)min[axis] - (int)m_pEdges[axis][emin].m_pos;
+		int dmax = (int)max[axis] - (int)m_pEdges[axis][emax].m_pos;
+
+		m_pEdges[axis][emin].m_pos = min[axis];
+		m_pEdges[axis][emax].m_pos = max[axis];
+
+		// expand (only adds overlaps)
+		if (dmin < 0)
+			sortMinDown(axis, emin);
+
+		if (dmax > 0)
+			sortMaxUp(axis, emax);
+
+		// shrink (only removes overlaps)
+		if (dmin > 0)
+			sortMinUp(axis, emin);
+
+		if (dmax < 0)
+			sortMaxDown(axis, emax);
+
+#ifdef DEBUG_BROADPHASE
+	debugPrintAxis(axis);
+#endif //DEBUG_BROADPHASE
+	}
+
+	
+}
+
+
+
+
+// sorting a min edge downwards can only ever *add* overlaps
+template <typename BP_FP_INT_TYPE>
+void btAxisSweep3Internal<BP_FP_INT_TYPE>::sortMinDown(int axis, BP_FP_INT_TYPE edge, bool updateOverlaps)
+{
+	// 'edge' indexes m_pEdges[axis]; pPrev is the entry directly below it
+	Edge* pEdge = m_pEdges[axis] + edge;
+	Edge* pPrev = pEdge - 1;
+	Handle* pHandleEdge = getHandle(pEdge->m_handle);
+	// move down while the entry below has a larger position
+	while (pEdge->m_pos < pPrev->m_pos)
+	{
+		Handle* pHandlePrev = getHandle(pPrev->m_handle);
+
+		if (pPrev->IsMax())
+		{
+			// if previous edge is a maximum check the bounds and add an overlap if necessary
+			if (updateOverlaps && testOverlap(axis,pHandleEdge, pHandlePrev))
+			{
+				m_pairCache->addOverlappingPair(pHandleEdge,pHandlePrev);
+
+				//AddOverlap(pEdge->m_handle, pPrev->m_handle);
+
+			}
+
+			// update edge reference in other handle
+			pHandlePrev->m_maxEdges[axis]++;
+		}
+		else
+			pHandlePrev->m_minEdges[axis]++;
+		// the moving edge itself ends up one slot lower
+		pHandleEdge->m_minEdges[axis]--;
+
+		// swap the edges
+		Edge swap = *pEdge;
+		*pEdge = *pPrev;
+		*pPrev = swap;
+
+		// decrement
+		pEdge--;
+		pPrev--;
+	}
+
+#ifdef DEBUG_BROADPHASE
+	debugPrintAxis(axis);
+#endif //DEBUG_BROADPHASE
+
+}
+
+// sorting a min edge upwards can only ever *remove* overlaps
+template <typename BP_FP_INT_TYPE>
+void btAxisSweep3Internal<BP_FP_INT_TYPE>::sortMinUp(int axis, BP_FP_INT_TYPE edge, bool updateOverlaps)
+{
+	Edge* pEdge = m_pEdges[axis] + edge;
+	Edge* pNext = pEdge + 1;
+	Handle* pHandleEdge = getHandle(pEdge->m_handle);
+	// move up until an entry with a zero m_handle or a larger position is reached
+	while (pNext->m_handle && (pEdge->m_pos >= pNext->m_pos))
+	{
+		Handle* pHandleNext = getHandle(pNext->m_handle);
+
+		if (pNext->IsMax())
+		{
+			// if next edge is maximum remove any overlap between the two handles
+			if (updateOverlaps)
+			{
+				/*
+				Handle* handle0 = getHandle(pEdge->m_handle);
+				Handle* handle1 = getHandle(pNext->m_handle);
+				btBroadphasePair tmpPair(*handle0,*handle1);
+				removeOverlappingPair(tmpPair);
+				*/
+				// NOTE: the removal code above is commented out, so this branch does nothing
+			}
+
+			// update edge reference in other handle
+			pHandleNext->m_maxEdges[axis]--;
+		}
+		else
+			pHandleNext->m_minEdges[axis]--;
+		// the moving edge itself ends up one slot higher
+		pHandleEdge->m_minEdges[axis]++;
+
+		// swap the edges
+		Edge swap = *pEdge;
+		*pEdge = *pNext;
+		*pNext = swap;
+
+		// increment
+		pEdge++;
+		pNext++;
+	}
+
+
+}
+
+// sorting a max edge downwards can only ever *remove* overlaps
+template <typename BP_FP_INT_TYPE>
+void btAxisSweep3Internal<BP_FP_INT_TYPE>::sortMaxDown(int axis, BP_FP_INT_TYPE edge, bool updateOverlaps)
+{
+	// 'edge' indexes m_pEdges[axis]; pPrev is the entry directly below it
+	Edge* pEdge = m_pEdges[axis] + edge;
+	Edge* pPrev = pEdge - 1;
+	Handle* pHandleEdge = getHandle(pEdge->m_handle);
+	// move down while the entry below has a larger position
+	while (pEdge->m_pos < pPrev->m_pos)
+	{
+		Handle* pHandlePrev = getHandle(pPrev->m_handle);
+
+		if (!pPrev->IsMax())
+		{
+			// if previous edge was a minimum remove any overlap between the two handles
+			if (updateOverlaps)
+			{
+				//this is done during the overlappingpairarray iteration/narrowphase collision
+				/*
+				Handle* handle0 = getHandle(pEdge->m_handle);
+				Handle* handle1 = getHandle(pPrev->m_handle);
+				btBroadphasePair* pair = findPair(handle0,handle1);
+				//assert(pair);
+
+				if (pair)
+				{
+					removeOverlappingPair(*pair);
+				}
+				*/
+				// NOTE: the removal code above is commented out, so this branch does nothing
+			}
+
+			// update edge reference in other handle
+			pHandlePrev->m_minEdges[axis]++;
+		}
+		else
+			pHandlePrev->m_maxEdges[axis]++;
+		// the moving edge itself ends up one slot lower
+		pHandleEdge->m_maxEdges[axis]--;
+
+		// swap the edges
+		Edge swap = *pEdge;
+		*pEdge = *pPrev;
+		*pPrev = swap;
+
+		// decrement
+		pEdge--;
+		pPrev--;
+	}
+
+	
+#ifdef DEBUG_BROADPHASE
+	debugPrintAxis(axis);
+#endif //DEBUG_BROADPHASE
+
+}
+
+// sorting a max edge upwards can only ever *add* overlaps
+template <typename BP_FP_INT_TYPE>
+void btAxisSweep3Internal<BP_FP_INT_TYPE>::sortMaxUp(int axis, BP_FP_INT_TYPE edge, bool updateOverlaps)
+{
+	Edge* pEdge = m_pEdges[axis] + edge;
+	Edge* pNext = pEdge + 1;
+	Handle* pHandleEdge = getHandle(pEdge->m_handle);
+	// move up until an entry with a zero m_handle or a larger position is reached
+	while (pNext->m_handle && (pEdge->m_pos >= pNext->m_pos))
+	{
+		Handle* pHandleNext = getHandle(pNext->m_handle);
+
+		if (!pNext->IsMax())
+		{
+			// if next edge is a minimum check the bounds and add an overlap if necessary
+			if (updateOverlaps && testOverlap(axis, pHandleEdge, pHandleNext))
+			{
+				Handle* handle0 = getHandle(pEdge->m_handle);
+				Handle* handle1 = getHandle(pNext->m_handle);
+				m_pairCache->addOverlappingPair(handle0,handle1);
+			}
+
+			// update edge reference in other handle
+			pHandleNext->m_minEdges[axis]--;
+		}
+		else
+			pHandleNext->m_maxEdges[axis]--;
+		// the moving edge itself ends up one slot higher
+		pHandleEdge->m_maxEdges[axis]++;
+
+		// swap the edges
+		Edge swap = *pEdge;
+		*pEdge = *pNext;
+		*pNext = swap;
+
+		// increment
+		pEdge++;
+		pNext++;
+	}
+	
+}
+
+
+
+////////////////////////////////////////////////////////////////////
+
+
+/// btAxisSweep3 is an efficient implementation of the 3d axis sweep and prune broadphase.
+/// It uses arrays rather than lists for storage of the 3 axis. Also it operates using 16 bit integer coordinates instead of floats.
+/// For large worlds and many objects, use bt32BitAxisSweep3 instead. bt32BitAxisSweep3 has higher precision and allows more than 16384 objects at the cost of more memory and a bit of performance.
+class btAxisSweep3 : public btAxisSweep3Internal<unsigned short int>
+{
+public:
+	/// maxHandles defaults to 16384; pairCache defaults to 0
+	btAxisSweep3(const btPoint3& worldAabbMin,const btPoint3& worldAabbMax, unsigned short int maxHandles = 16384, btOverlappingPairCache* pairCache = 0);
+
+};
+
+/// bt32BitAxisSweep3 allows higher precision quantization and more objects compared to the btAxisSweep3 sweep and prune.
+/// This comes at the cost of more memory per handle, and a bit slower performance.
+/// It uses arrays rather than lists for storage of the 3 axis.
+class bt32BitAxisSweep3 : public btAxisSweep3Internal<unsigned int>
+{
+public:
+	/// maxHandles defaults to 1500000; pairCache defaults to 0
+	bt32BitAxisSweep3(const btPoint3& worldAabbMin,const btPoint3& worldAabbMax, unsigned int maxHandles = 1500000, btOverlappingPairCache* pairCache = 0);
+
+};
+
 #endif
 
diff --git a/src/BulletCollision/CollisionDispatch/btCompoundCollisionAlgorithm.cpp b/src/BulletCollision/CollisionDispatch/btCompoundCollisionAlgorithm.cpp
index a0f19d486..7c0c7a3b0 100644
--- a/src/BulletCollision/CollisionDispatch/btCompoundCollisionAlgorithm.cpp
+++ b/src/BulletCollision/CollisionDispatch/btCompoundCollisionAlgorithm.cpp
@@ -19,7 +19,8 @@ subject to the following restrictions:
 
 
 btCompoundCollisionAlgorithm::btCompoundCollisionAlgorithm( const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* body0,btCollisionObject* body1,bool isSwapped)
-:m_isSwapped(isSwapped)
+:btCollisionAlgorithm(ci),
+m_isSwapped(isSwapped)
 {
 	btCollisionObject* colObj = m_isSwapped? body1 : body0;
 	btCollisionObject* otherObj = m_isSwapped? body0 : body1;
diff --git a/src/BulletCollision/CollisionDispatch/btDefaultCollisionConfiguration.cpp b/src/BulletCollision/CollisionDispatch/btDefaultCollisionConfiguration.cpp
index bc5680652..17b62a4dc 100644
--- a/src/BulletCollision/CollisionDispatch/btDefaultCollisionConfiguration.cpp
+++ b/src/BulletCollision/CollisionDispatch/btDefaultCollisionConfiguration.cpp
@@ -24,10 +24,14 @@ subject to the following restrictions:
 #include "BulletCollision/CollisionDispatch/btSphereBoxCollisionAlgorithm.h"
 #include "BulletCollision/CollisionDispatch/btSphereTriangleCollisionAlgorithm.h"
 
+#define DEFAULT_MAX_OVERLAPPING_PAIRS 65535
+#define DEFAULT_STACK_ALLOCATOR_SIZE	(5*1024*1024)
+
+
 btDefaultCollisionConfiguration::btDefaultCollisionConfiguration()
-:m_persistentManifoldPoolSize(16384),
-m_stackAllocatorSize(2*1024*1024),
-m_collisionAlgorithmPoolSize(16384),
+:m_persistentManifoldPoolSize(DEFAULT_MAX_OVERLAPPING_PAIRS),
+m_stackAllocatorSize(DEFAULT_STACK_ALLOCATOR_SIZE),
+m_collisionAlgorithmPoolSize(DEFAULT_MAX_OVERLAPPING_PAIRS),
 m_collisionAlgorithmMaxElementSize(0)
 {
 
diff --git a/src/BulletCollision/CollisionShapes/btBvhTriangleMeshShape.cpp b/src/BulletCollision/CollisionShapes/btBvhTriangleMeshShape.cpp
index 8da554ef1..42530f2bf 100644
--- a/src/BulletCollision/CollisionShapes/btBvhTriangleMeshShape.cpp
+++ b/src/BulletCollision/CollisionShapes/btBvhTriangleMeshShape.cpp
@@ -21,29 +21,42 @@ subject to the following restrictions:
 
 ///Bvh Concave triangle mesh is a static-triangle mesh shape with Bounding Volume Hierarchy optimization.
 ///Uses an interface to access the triangles to allow for sharing graphics/physics triangles.
-btBvhTriangleMeshShape::btBvhTriangleMeshShape(btStridingMeshInterface* meshInterface, bool useQuantizedAabbCompression)
-:btTriangleMeshShape(meshInterface),m_useQuantizedAabbCompression(useQuantizedAabbCompression)
+btBvhTriangleMeshShape::btBvhTriangleMeshShape(btStridingMeshInterface* meshInterface, bool useQuantizedAabbCompression, bool buildBvh)
+:btTriangleMeshShape(meshInterface),m_useQuantizedAabbCompression(useQuantizedAabbCompression),
+m_bvh(0),
+m_ownsBvh(false)
 {
 	//construct bvh from meshInterface
 #ifndef DISABLE_BVH
 
-	m_bvh = new btOptimizedBvh();
 	btVector3 bvhAabbMin,bvhAabbMax;
 	meshInterface->calculateAabbBruteForce(bvhAabbMin,bvhAabbMax);
-	m_bvh->build(meshInterface,m_useQuantizedAabbCompression,bvhAabbMin,bvhAabbMax);
+	
+	if (buildBvh)
+	{
+		m_bvh = new btOptimizedBvh();
+		m_bvh->build(meshInterface,m_useQuantizedAabbCompression,bvhAabbMin,bvhAabbMax);
+		m_ownsBvh = true;
+	}
 
 #endif //DISABLE_BVH
 
 }
 
-btBvhTriangleMeshShape::btBvhTriangleMeshShape(btStridingMeshInterface* meshInterface, bool useQuantizedAabbCompression,const btVector3& bvhAabbMin,const btVector3& bvhAabbMax)
-:btTriangleMeshShape(meshInterface),m_useQuantizedAabbCompression(useQuantizedAabbCompression)
+btBvhTriangleMeshShape::btBvhTriangleMeshShape(btStridingMeshInterface* meshInterface, bool useQuantizedAabbCompression,const btVector3& bvhAabbMin,const btVector3& bvhAabbMax,bool buildBvh)
+:btTriangleMeshShape(meshInterface),m_useQuantizedAabbCompression(useQuantizedAabbCompression),
+m_bvh(0),
+m_ownsBvh(false)
 {
 	//construct bvh from meshInterface
 #ifndef DISABLE_BVH
 
-	m_bvh = new btOptimizedBvh();
-	m_bvh->build(meshInterface,m_useQuantizedAabbCompression,bvhAabbMin,bvhAabbMax);
+	if (buildBvh)
+	{
+		m_bvh = new btOptimizedBvh();
+		m_bvh->build(meshInterface,m_useQuantizedAabbCompression,bvhAabbMin,bvhAabbMax);
+		m_ownsBvh = true;
+	}
 
 #endif //DISABLE_BVH
 
@@ -67,7 +80,8 @@ void	btBvhTriangleMeshShape::refitTree()
 
 btBvhTriangleMeshShape::~btBvhTriangleMeshShape()
 {
-	delete m_bvh;
+	if (m_ownsBvh)
+		delete m_bvh;
 }
 
 //perform bvh tree traversal and report overlapping triangles to 'callback'
diff --git a/src/BulletCollision/CollisionShapes/btBvhTriangleMeshShape.h b/src/BulletCollision/CollisionShapes/btBvhTriangleMeshShape.h
index 4914d9f95..3394c965d 100644
--- a/src/BulletCollision/CollisionShapes/btBvhTriangleMeshShape.h
+++ b/src/BulletCollision/CollisionShapes/btBvhTriangleMeshShape.h
@@ -26,15 +26,16 @@ ATTRIBUTE_ALIGNED16(class) btBvhTriangleMeshShape : public btTriangleMeshShape
 
 	btOptimizedBvh*	m_bvh;
 	bool m_useQuantizedAabbCompression;
-	bool m_pad[12];////need padding due to alignment
+	bool	m_ownsBvh;
+	bool m_pad[11];////need padding due to alignment
 
 public:
 
-	btBvhTriangleMeshShape() :btTriangleMeshShape(0) {};
-	btBvhTriangleMeshShape(btStridingMeshInterface* meshInterface, bool useQuantizedAabbCompression);
+	btBvhTriangleMeshShape() :btTriangleMeshShape(0),m_bvh(0),m_ownsBvh(false) {};
+	btBvhTriangleMeshShape(btStridingMeshInterface* meshInterface, bool useQuantizedAabbCompression, bool buildBvh = true);
 
 	///optionally pass in a larger bvh aabb, used for quantization. This allows for deformations within this aabb
-	btBvhTriangleMeshShape(btStridingMeshInterface* meshInterface, bool useQuantizedAabbCompression,const btVector3& bvhAabbMin,const btVector3& bvhAabbMax);
+	btBvhTriangleMeshShape(btStridingMeshInterface* meshInterface, bool useQuantizedAabbCompression,const btVector3& bvhAabbMin,const btVector3& bvhAabbMax, bool buildBvh = true);
 	
 	virtual ~btBvhTriangleMeshShape();
 
@@ -65,6 +66,17 @@ public:
 	{
 		return m_bvh;
 	}
+
+
+	/// Attaches an externally owned bvh (m_ownsBvh stays false); the shape must not have one yet.
+	void	setOptimizedBvh(btOptimizedBvh* bvh)
+	{
+		btAssert(!m_bvh);
+		btAssert(!m_ownsBvh);
+		m_ownsBvh = false;
+		m_bvh = bvh;
+	}
+
 	bool	usesQuantizedAabbCompression() const
 	{
 		return	m_useQuantizedAabbCompression;
diff --git a/src/BulletCollision/CollisionShapes/btOptimizedBvh.cpp b/src/BulletCollision/CollisionShapes/btOptimizedBvh.cpp
index 172d664bc..afb033b7b 100644
--- a/src/BulletCollision/CollisionShapes/btOptimizedBvh.cpp
+++ b/src/BulletCollision/CollisionShapes/btOptimizedBvh.cpp
@@ -19,32 +19,11 @@ subject to the following restrictions:
 #include "LinearMath/btIDebugDraw.h"
 
 
-inline bool testQuantizedAabbAgainstQuantizedAabb2(unsigned short int* aabbMin1,unsigned short int* aabbMax1,const unsigned short int* aabbMin2,const unsigned short int* aabbMax2) 
-{
-	bool overlap = true;
-	overlap = (aabbMin1[0] > aabbMax2[0] || aabbMax1[0] < aabbMin2[0]) ? false : overlap;
-	overlap = (aabbMin1[2] > aabbMax2[2] || aabbMax1[2] < aabbMin2[2]) ? false : overlap;
-	overlap = (aabbMin1[1] > aabbMax2[1] || aabbMax1[1] < aabbMin2[1]) ? false : overlap;
-	return overlap;
-}
-
-
-
-///Branch-free version of quantized aabb versus quantized aabb
-inline unsigned testQuantizedAabbAgainstQuantizedAabb(unsigned short int* aabbMin1,unsigned short int* aabbMax1,const unsigned short int* aabbMin2,const unsigned short int* aabbMax2)
-{		
-	return btSelect((unsigned)((aabbMin1[0] <= aabbMax2[0]) & (aabbMax1[0] >= aabbMin2[0])
-		& (aabbMin1[2] <= aabbMax2[2]) & (aabbMax1[2] >= aabbMin2[2])
-		& (aabbMin1[1] <= aabbMax2[1]) & (aabbMax1[1] >= aabbMin2[1])),
-		1, 0);
-}
-
-
-
 btOptimizedBvh::btOptimizedBvh() : m_useQuantization(false), 
-					m_traversalMode(TRAVERSAL_STACKLESS_CACHE_FRIENDLY)
-//					m_traversalMode(TRAVERSAL_STACKLESS)
+					//m_traversalMode(TRAVERSAL_STACKLESS_CACHE_FRIENDLY)
+					m_traversalMode(TRAVERSAL_STACKLESS)
 					//m_traversalMode(TRAVERSAL_RECURSIVE)
+					,m_subtreeHeaderCount(0) //PCK: add this line
 { 
 
 }
@@ -132,6 +111,25 @@ void btOptimizedBvh::build(btStridingMeshInterface* triangles, bool useQuantized
 			aabbMin.setMin(triangle[2]);
 			aabbMax.setMax(triangle[2]);
 
+			//PCK: add these checks for zero dimensions of aabb
+			const btScalar MIN_AABB_DIMENSION = btScalar(0.002);
+			const btScalar MIN_AABB_HALF_DIMENSION = btScalar(0.001);
+			if (aabbMax.x() - aabbMin.x() < MIN_AABB_DIMENSION)
+			{
+				aabbMax.setX(aabbMax.x() + MIN_AABB_HALF_DIMENSION);
+				aabbMin.setX(aabbMin.x() - MIN_AABB_HALF_DIMENSION);
+			}
+			if (aabbMax.y() - aabbMin.y() < MIN_AABB_DIMENSION)
+			{
+				aabbMax.setY(aabbMax.y() + MIN_AABB_HALF_DIMENSION);
+				aabbMin.setY(aabbMin.y() - MIN_AABB_HALF_DIMENSION);
+			}
+			if (aabbMax.z() - aabbMin.z() < MIN_AABB_DIMENSION)
+			{
+				aabbMax.setZ(aabbMax.z() + MIN_AABB_HALF_DIMENSION);
+				aabbMin.setZ(aabbMin.z() - MIN_AABB_HALF_DIMENSION);
+			}
+
 			m_optimizedTree->quantizeWithClamp(&node.m_quantizedAabbMin[0],aabbMin);
 			m_optimizedTree->quantizeWithClamp(&node.m_quantizedAabbMax[0],aabbMax);
 
@@ -192,8 +190,12 @@ void btOptimizedBvh::build(btStridingMeshInterface* triangles, bool useQuantized
 		subtree.m_subtreeSize = m_quantizedContiguousNodes[0].isLeafNode() ? 1 : m_quantizedContiguousNodes[0].getEscapeIndex();
 	}
 
-	m_leafNodes.clear();
+	//PCK: update the copy of the size
+	m_subtreeHeaderCount = m_SubtreeHeaders.size();
+
+	//PCK: clear m_quantizedLeafNodes and m_leafNodes, they are temporary
 	m_quantizedLeafNodes.clear();
+	m_leafNodes.clear();
 }
 
 
@@ -225,8 +227,9 @@ void	btOptimizedBvh::refitPartial(btStridingMeshInterface* meshInterface,const b
 	{
 		btBvhSubtreeInfo& subtree = m_SubtreeHeaders[i];
 
-		unsigned int overlap = testQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,subtree.m_quantizedAabbMin,subtree.m_quantizedAabbMax);
-		if (overlap)
+		//PCK: unsigned instead of bool
+		unsigned overlap = testQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,subtree.m_quantizedAabbMin,subtree.m_quantizedAabbMax);
+		if (overlap != 0)
 		{
 			updateBvhNodes(meshInterface,subtree.m_rootNodeIndex,subtree.m_rootNodeIndex+subtree.m_subtreeSize,i);
 
@@ -503,6 +506,9 @@ void	btOptimizedBvh::updateSubtreeHeaders(int leftChildNodexIndex,int rightChild
 		subtree.m_rootNodeIndex = rightChildNodexIndex;
 		subtree.m_subtreeSize = rightSubTreeSize;
 	}
+
+	//PCK: update the copy of the size
+	m_subtreeHeaderCount = m_SubtreeHeaders.size();
 }
 
 
@@ -635,7 +641,9 @@ void	btOptimizedBvh::walkStacklessTree(btNodeOverlapCallback* nodeCallback,const
 	const btOptimizedBvhNode* rootNode = &m_contiguousNodes[0];
 	int escapeIndex, curIndex = 0;
 	int walkIterations = 0;
-	bool aabbOverlap, isLeafNode;
+	bool isLeafNode;
+	//PCK: unsigned instead of bool
+	unsigned aabbOverlap;
 
 	while (curIndex < m_curNodeIndex)
 	{
@@ -646,12 +654,14 @@ void	btOptimizedBvh::walkStacklessTree(btNodeOverlapCallback* nodeCallback,const
 		aabbOverlap = TestAabbAgainstAabb2(aabbMin,aabbMax,rootNode->m_aabbMinOrg,rootNode->m_aabbMaxOrg);
 		isLeafNode = rootNode->m_escapeIndex == -1;
 		
-		if (isLeafNode && aabbOverlap)
+		//PCK: unsigned instead of bool
+		if (isLeafNode && (aabbOverlap != 0))
 		{
 			nodeCallback->processNode(rootNode->m_subPart,rootNode->m_triangleIndex);
 		} 
 		
-		if (aabbOverlap || isLeafNode)
+		//PCK: unsigned instead of bool
+		if ((aabbOverlap != 0) || isLeafNode)
 		{
 			rootNode++;
 			curIndex++;
@@ -692,12 +702,16 @@ void btOptimizedBvh::walkRecursiveQuantizedTreeAgainstQueryAabb(const btQuantize
 {
 	btAssert(m_useQuantization);
 	
-	unsigned int aabbOverlap, isLeafNode;
+	bool isLeafNode;
+	//PCK: unsigned instead of bool
+	unsigned aabbOverlap;
 
+	//PCK: unsigned instead of bool
 	aabbOverlap = testQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,currentNode->m_quantizedAabbMin,currentNode->m_quantizedAabbMax);
 	isLeafNode = currentNode->isLeafNode();
 		
-	if (aabbOverlap)
+	//PCK: unsigned instead of bool
+	if (aabbOverlap != 0)
 	{
 		if (isLeafNode)
 		{
@@ -731,7 +745,9 @@ void	btOptimizedBvh::walkStacklessQuantizedTree(btNodeOverlapCallback* nodeCallb
 	const btQuantizedBvhNode* rootNode = &m_quantizedContiguousNodes[startNodeIndex];
 	int escapeIndex;
 	
-	unsigned int aabbOverlap, isLeafNode;
+	bool isLeafNode;
+	//PCK: unsigned instead of bool
+	unsigned aabbOverlap;
 
 	while (curIndex < endNodeIndex)
 	{
@@ -756,6 +772,7 @@ void	btOptimizedBvh::walkStacklessQuantizedTree(btNodeOverlapCallback* nodeCallb
 		assert (walkIterations < subTreeSize);
 
 		walkIterations++;
+		//PCK: unsigned instead of bool
 		aabbOverlap = testQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,rootNode->m_quantizedAabbMin,rootNode->m_quantizedAabbMax);
 		isLeafNode = rootNode->isLeafNode();
 		
@@ -764,7 +781,8 @@ void	btOptimizedBvh::walkStacklessQuantizedTree(btNodeOverlapCallback* nodeCallb
 			nodeCallback->processNode(0,rootNode->getTriangleIndex());
 		} 
 		
-		if (aabbOverlap || isLeafNode)
+		//PCK: unsigned instead of bool
+		if ((aabbOverlap != 0) || isLeafNode)
 		{
 			rootNode++;
 			curIndex++;
@@ -792,8 +810,9 @@ void	btOptimizedBvh::walkStacklessQuantizedTreeCacheFriendly(btNodeOverlapCallba
 	{
 		const btBvhSubtreeInfo& subtree = m_SubtreeHeaders[i];
 
-		unsigned int overlap = testQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,subtree.m_quantizedAabbMin,subtree.m_quantizedAabbMax);
-		if (overlap)
+		//PCK: unsigned instead of bool
+		unsigned overlap = testQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,subtree.m_quantizedAabbMin,subtree.m_quantizedAabbMax);
+		if (overlap != 0)
 		{
 			walkStacklessQuantizedTree(nodeCallback,quantizedQueryAabbMin,quantizedQueryAabbMax,
 				subtree.m_rootNodeIndex,
@@ -867,3 +886,305 @@ void	btOptimizedBvh::assignInternalNodeFromLeafNode(int internalNode,int leafNod
 		m_contiguousNodes[internalNode] = m_leafNodes[leafNodeIndex];
 	}
 }
+
+//PCK: include
+#include <new>
+
+//PCK: consts
+static const unsigned BVH_ALIGNMENT = 16;
+static const unsigned BVH_ALIGNMENT_MASK = BVH_ALIGNMENT-1;
+
+static const unsigned BVH_ALIGNMENT_BLOCKS = 2;
+
+
+
+
+unsigned btOptimizedBvh::calculateSerializeBufferSize()
+{
+	// The buffer holds the btOptimizedBvh object itself plus alignment padding, the subtree headers,
+	// and m_curNodeIndex nodes of whichever node type (quantized or not) this tree uses.
+	const unsigned bytesPerNode = m_useQuantization ? sizeof(btQuantizedBvhNode) : sizeof(btOptimizedBvhNode);
+	unsigned totalBytes = sizeof(btOptimizedBvh) + BVH_ALIGNMENT_BLOCKS * BVH_ALIGNMENT;
+	totalBytes += sizeof(btBvhSubtreeInfo) * m_subtreeHeaderCount;
+	totalBytes += m_curNodeIndex * bytesPerNode;
+	return totalBytes;
+}
+
+/// Writes a copy of this BVH into o_alignedDataBuffer: a btOptimizedBvh object is placement-constructed at the start
+/// of the buffer, followed by the node array (quantized or not) and then the subtree headers.
+/// The buffer must be non-NULL, 16-byte aligned and hold at least calculateSerializeBufferSize() bytes; otherwise the
+/// call asserts and returns false. With i_swapEndian set, the int, short and btVector3 fields are written byte-swapped.
+bool btOptimizedBvh::serialize(void *o_alignedDataBuffer, unsigned i_dataBufferSize, bool i_swapEndian)
+{
+	assert(m_subtreeHeaderCount == m_SubtreeHeaders.size());
+	m_subtreeHeaderCount = m_SubtreeHeaders.size();
+
+	if (i_dataBufferSize < calculateSerializeBufferSize() || o_alignedDataBuffer == NULL || (((unsigned)o_alignedDataBuffer & BVH_ALIGNMENT_MASK) != 0))
+	{
+		// Undersized, NULL or misaligned buffer. The mask test is parenthesized because != binds tighter than &.
+		btAssert(0);
+		return false;
+	}
+
+	btOptimizedBvh *targetBvh = (btOptimizedBvh *)o_alignedDataBuffer;
+
+	// construct the class so the virtual function table, etc will be set up
+	// Also, m_leafNodes and m_quantizedLeafNodes will be initialized to default values by the constructor
+	new (targetBvh) btOptimizedBvh;
+
+	if (i_swapEndian)
+	{
+		targetBvh->m_curNodeIndex = btSwapEndian(m_curNodeIndex);
+		btSwapVector3Endian(m_bvhAabbMin,targetBvh->m_bvhAabbMin);
+		btSwapVector3Endian(m_bvhAabbMax,targetBvh->m_bvhAabbMax);
+		btSwapVector3Endian(m_bvhQuantization,targetBvh->m_bvhQuantization);
+
+		targetBvh->m_traversalMode = (btTraversalMode)btSwapEndian(m_traversalMode);
+		targetBvh->m_subtreeHeaderCount = btSwapEndian(m_subtreeHeaderCount);
+	}
+	else
+	{
+		targetBvh->m_curNodeIndex = m_curNodeIndex;
+		targetBvh->m_bvhAabbMin = m_bvhAabbMin;
+		targetBvh->m_bvhAabbMax = m_bvhAabbMax;
+		targetBvh->m_bvhQuantization = m_bvhQuantization;
+		targetBvh->m_traversalMode = m_traversalMode;
+		targetBvh->m_subtreeHeaderCount = m_subtreeHeaderCount;
+	}
+
+	targetBvh->m_useQuantization = m_useQuantization;
+
+	unsigned char *nodeData = (unsigned char *)targetBvh;
+	nodeData += sizeof(btOptimizedBvh);
+	// NOTE(review): adding (address & mask) only lands on a 16-byte boundary when that remainder is 0 or 8; it does not round up in general - verify
+	unsigned sizeToAdd = (unsigned)nodeData & BVH_ALIGNMENT_MASK;
+	nodeData += sizeToAdd;
+	
+	int nodeCount = m_curNodeIndex;
+
+	if (m_useQuantization)
+	{
+		targetBvh->m_quantizedContiguousNodes.initializeFromBuffer(nodeData, nodeCount, nodeCount);
+
+		if (i_swapEndian)
+		{
+			for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++)
+			{
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[0] = btSwapEndian(m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[0]);
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[1] = btSwapEndian(m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[1]);
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[2] = btSwapEndian(m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[2]);
+
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[0] = btSwapEndian(m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[0]);
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[1] = btSwapEndian(m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[1]);
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[2] = btSwapEndian(m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[2]);
+
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_escapeIndexOrTriangleIndex = btSwapEndian(m_quantizedContiguousNodes[nodeIndex].m_escapeIndexOrTriangleIndex);
+			}
+		}
+		else
+		{
+			for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++)
+			{
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[0] = m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[0];
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[1] = m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[1];
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[2] = m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[2];
+
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[0] = m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[0];
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[1] = m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[1];
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[2] = m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[2];
+
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_escapeIndexOrTriangleIndex = m_quantizedContiguousNodes[nodeIndex].m_escapeIndexOrTriangleIndex;
+
+
+			}
+		}
+		nodeData += sizeof(btQuantizedBvhNode) * nodeCount;
+	}
+	else
+	{
+		targetBvh->m_contiguousNodes.initializeFromBuffer(nodeData, nodeCount, nodeCount);
+
+		if (i_swapEndian)
+		{
+			for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++)
+			{
+				btSwapVector3Endian(m_contiguousNodes[nodeIndex].m_aabbMinOrg, targetBvh->m_contiguousNodes[nodeIndex].m_aabbMinOrg);
+				btSwapVector3Endian(m_contiguousNodes[nodeIndex].m_aabbMaxOrg, targetBvh->m_contiguousNodes[nodeIndex].m_aabbMaxOrg);
+
+				targetBvh->m_contiguousNodes[nodeIndex].m_escapeIndex = btSwapEndian(m_contiguousNodes[nodeIndex].m_escapeIndex);
+				targetBvh->m_contiguousNodes[nodeIndex].m_subPart = btSwapEndian(m_contiguousNodes[nodeIndex].m_subPart);
+				targetBvh->m_contiguousNodes[nodeIndex].m_triangleIndex = btSwapEndian(m_contiguousNodes[nodeIndex].m_triangleIndex);
+			}
+		}
+		else
+		{
+			for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++)
+			{
+				targetBvh->m_contiguousNodes[nodeIndex].m_aabbMinOrg = m_contiguousNodes[nodeIndex].m_aabbMinOrg;
+				targetBvh->m_contiguousNodes[nodeIndex].m_aabbMaxOrg = m_contiguousNodes[nodeIndex].m_aabbMaxOrg;
+
+				targetBvh->m_contiguousNodes[nodeIndex].m_escapeIndex = m_contiguousNodes[nodeIndex].m_escapeIndex;
+				targetBvh->m_contiguousNodes[nodeIndex].m_subPart = m_contiguousNodes[nodeIndex].m_subPart;
+				targetBvh->m_contiguousNodes[nodeIndex].m_triangleIndex = m_contiguousNodes[nodeIndex].m_triangleIndex;
+			}
+		}
+		nodeData += sizeof(btOptimizedBvhNode) * nodeCount;
+	}
+
+	sizeToAdd = (unsigned)nodeData & BVH_ALIGNMENT_MASK;
+	nodeData += sizeToAdd;
+
+	// Now serialize the subtree headers
+	targetBvh->m_SubtreeHeaders.initializeFromBuffer(nodeData, m_subtreeHeaderCount, m_subtreeHeaderCount);
+	if (i_swapEndian)
+	{
+		for (int i = 0; i < m_subtreeHeaderCount; i++)
+		{
+			targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMin[0] = btSwapEndian(m_SubtreeHeaders[i].m_quantizedAabbMin[0]);
+			targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMin[1] = btSwapEndian(m_SubtreeHeaders[i].m_quantizedAabbMin[1]);
+			targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMin[2] = btSwapEndian(m_SubtreeHeaders[i].m_quantizedAabbMin[2]);
+
+			targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMax[0] = btSwapEndian(m_SubtreeHeaders[i].m_quantizedAabbMax[0]);
+			targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMax[1] = btSwapEndian(m_SubtreeHeaders[i].m_quantizedAabbMax[1]);
+			targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMax[2] = btSwapEndian(m_SubtreeHeaders[i].m_quantizedAabbMax[2]);
+
+			targetBvh->m_SubtreeHeaders[i].m_rootNodeIndex = btSwapEndian(m_SubtreeHeaders[i].m_rootNodeIndex);
+			targetBvh->m_SubtreeHeaders[i].m_subtreeSize = btSwapEndian(m_SubtreeHeaders[i].m_subtreeSize);
+		}
+	}
+	else
+	{
+		for (int i = 0; i < m_subtreeHeaderCount; i++)
+		{
+			targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMin[0] = (m_SubtreeHeaders[i].m_quantizedAabbMin[0]);
+			targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMin[1] = (m_SubtreeHeaders[i].m_quantizedAabbMin[1]);
+			targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMin[2] = (m_SubtreeHeaders[i].m_quantizedAabbMin[2]);
+
+			targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMax[0] = (m_SubtreeHeaders[i].m_quantizedAabbMax[0]);
+			targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMax[1] = (m_SubtreeHeaders[i].m_quantizedAabbMax[1]);
+			targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMax[2] = (m_SubtreeHeaders[i].m_quantizedAabbMax[2]);
+
+			targetBvh->m_SubtreeHeaders[i].m_rootNodeIndex = (m_SubtreeHeaders[i].m_rootNodeIndex);
+			targetBvh->m_SubtreeHeaders[i].m_subtreeSize = (m_SubtreeHeaders[i].m_subtreeSize);
+			targetBvh->m_SubtreeHeaders[i] = m_SubtreeHeaders[i];
+		}
+	}
+
+	nodeData += sizeof(btBvhSubtreeInfo) * m_subtreeHeaderCount;
+
+	return true;
+}
+
+/// Turns a buffer written by serialize() into a usable btOptimizedBvh in place: the node and subtree-header arrays are pointed at the data following the object.
+/// Returns NULL when the buffer is NULL, not 16-byte aligned, or smaller than the stored tree requires. Pass i_swapEndian=true to undo a byte-swapped serialize().
+btOptimizedBvh *btOptimizedBvh::deSerializeInPlace(void *i_alignedDataBuffer, unsigned i_dataBufferSize, bool i_swapEndian)
+{
+	if (i_alignedDataBuffer == NULL || (((unsigned)i_alignedDataBuffer & BVH_ALIGNMENT_MASK) != 0))
+	{
+		return NULL;
+	}
+	btOptimizedBvh *bvh = (btOptimizedBvh *)i_alignedDataBuffer;
+
+	if (i_swapEndian)
+	{
+		bvh->m_curNodeIndex = btSwapEndian(bvh->m_curNodeIndex);
+
+		btUnSwapVector3Endian(bvh->m_bvhAabbMin);
+		btUnSwapVector3Endian(bvh->m_bvhAabbMax);
+		btUnSwapVector3Endian(bvh->m_bvhQuantization);
+
+		bvh->m_traversalMode = (btTraversalMode)btSwapEndian(bvh->m_traversalMode);
+		bvh->m_subtreeHeaderCount = btSwapEndian(bvh->m_subtreeHeaderCount);
+	}
+
+	unsigned calculatedBufSize = bvh->calculateSerializeBufferSize();
+	btAssert(calculatedBufSize <= i_dataBufferSize);
+
+	if (calculatedBufSize > i_dataBufferSize)
+	{
+		return NULL;
+	}
+
+	unsigned char *nodeData = (unsigned char *)bvh;
+	nodeData += sizeof(btOptimizedBvh);
+	unsigned sizeToAdd = (unsigned)nodeData & BVH_ALIGNMENT_MASK;
+	nodeData += sizeToAdd;
+	int nodeCount = bvh->m_curNodeIndex;
+
+	// Must call placement new to fill in virtual function table, etc, but we don't want to overwrite most data, so call a special version of the constructor
+	// Also, m_leafNodes and m_quantizedLeafNodes will be initialized to default values by the constructor
+	new (bvh) btOptimizedBvh(*bvh, false);
+
+	if (bvh->m_useQuantization)
+	{
+		bvh->m_quantizedContiguousNodes.initializeFromBuffer(nodeData, nodeCount, nodeCount);
+
+		if (i_swapEndian)
+		{
+			for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++)
+			{
+				bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[0] = btSwapEndian(bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[0]);
+				bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[1] = btSwapEndian(bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[1]);
+				bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[2] = btSwapEndian(bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[2]);
+
+				bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[0] = btSwapEndian(bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[0]);
+				bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[1] = btSwapEndian(bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[1]);
+				bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[2] = btSwapEndian(bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[2]);
+
+				bvh->m_quantizedContiguousNodes[nodeIndex].m_escapeIndexOrTriangleIndex = btSwapEndian(bvh->m_quantizedContiguousNodes[nodeIndex].m_escapeIndexOrTriangleIndex);
+			}
+		}
+		nodeData += sizeof(btQuantizedBvhNode) * nodeCount;
+	}
+	else
+	{
+		bvh->m_contiguousNodes.initializeFromBuffer(nodeData, nodeCount, nodeCount);
+
+		if (i_swapEndian)
+		{
+			for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++)
+			{
+				btUnSwapVector3Endian(bvh->m_contiguousNodes[nodeIndex].m_aabbMinOrg);
+				btUnSwapVector3Endian(bvh->m_contiguousNodes[nodeIndex].m_aabbMaxOrg);
+				
+				bvh->m_contiguousNodes[nodeIndex].m_escapeIndex = btSwapEndian(bvh->m_contiguousNodes[nodeIndex].m_escapeIndex);
+				bvh->m_contiguousNodes[nodeIndex].m_subPart = btSwapEndian(bvh->m_contiguousNodes[nodeIndex].m_subPart);
+				bvh->m_contiguousNodes[nodeIndex].m_triangleIndex = btSwapEndian(bvh->m_contiguousNodes[nodeIndex].m_triangleIndex);
+			}
+		}
+		nodeData += sizeof(btOptimizedBvhNode) * nodeCount;
+	}
+
+	sizeToAdd = (unsigned)nodeData & BVH_ALIGNMENT_MASK;
+	nodeData += sizeToAdd;
+
+	// Now deserialize the subtree headers
+	bvh->m_SubtreeHeaders.initializeFromBuffer(nodeData, bvh->m_subtreeHeaderCount, bvh->m_subtreeHeaderCount);
+	if (i_swapEndian)
+	{
+		for (int i = 0; i < bvh->m_subtreeHeaderCount; i++)
+		{
+			bvh->m_SubtreeHeaders[i].m_quantizedAabbMin[0] = btSwapEndian(bvh->m_SubtreeHeaders[i].m_quantizedAabbMin[0]);
+			bvh->m_SubtreeHeaders[i].m_quantizedAabbMin[1] = btSwapEndian(bvh->m_SubtreeHeaders[i].m_quantizedAabbMin[1]);
+			bvh->m_SubtreeHeaders[i].m_quantizedAabbMin[2] = btSwapEndian(bvh->m_SubtreeHeaders[i].m_quantizedAabbMin[2]);
+
+			bvh->m_SubtreeHeaders[i].m_quantizedAabbMax[0] = btSwapEndian(bvh->m_SubtreeHeaders[i].m_quantizedAabbMax[0]);
+			bvh->m_SubtreeHeaders[i].m_quantizedAabbMax[1] = btSwapEndian(bvh->m_SubtreeHeaders[i].m_quantizedAabbMax[1]);
+			bvh->m_SubtreeHeaders[i].m_quantizedAabbMax[2] = btSwapEndian(bvh->m_SubtreeHeaders[i].m_quantizedAabbMax[2]);
+
+			bvh->m_SubtreeHeaders[i].m_rootNodeIndex = btSwapEndian(bvh->m_SubtreeHeaders[i].m_rootNodeIndex);
+			bvh->m_SubtreeHeaders[i].m_subtreeSize = btSwapEndian(bvh->m_SubtreeHeaders[i].m_subtreeSize);
+		}
+	}
+
+	return bvh;
+}
+
+// In-place deserialization constructor: copy-initializes only the three btVector3 members from 'self' (so their default constructor is not run); 'ownsMemory' is unused here
+btOptimizedBvh::btOptimizedBvh(btOptimizedBvh &self, bool ownsMemory) :
+m_bvhAabbMin(self.m_bvhAabbMin),
+m_bvhAabbMax(self.m_bvhAabbMax),
+m_bvhQuantization(self.m_bvhQuantization)
+{
+}
+
diff --git a/src/BulletCollision/CollisionShapes/btOptimizedBvh.h b/src/BulletCollision/CollisionShapes/btOptimizedBvh.h
index b069c7c5d..f85b3c37e 100644
--- a/src/BulletCollision/CollisionShapes/btOptimizedBvh.h
+++ b/src/BulletCollision/CollisionShapes/btOptimizedBvh.h
@@ -30,7 +30,6 @@ class btStridingMeshInterface;
 #define MAX_SUBTREE_SIZE_IN_BYTES  2048
 
 
-
 ///btQuantizedBvhNode is a compressed aabb node, 16 bytes.
 ///Node can be used for leafnode or internal node. Leafnodes can point to 32-bit triangle index (non-negative range).
 ATTRIBUTE_ALIGNED16	(struct) btQuantizedBvhNode
@@ -145,7 +144,6 @@ ATTRIBUTE_ALIGNED16(class) btOptimizedBvh
 	btVector3			m_bvhAabbMin;
 	btVector3			m_bvhAabbMax;
 	btVector3			m_bvhQuantization;
-
 public:
 	enum btTraversalMode
 	{
@@ -156,12 +154,12 @@ public:
 protected:
 
 	btTraversalMode	m_traversalMode;
-
 	
-
-
 	BvhSubtreeInfoArray		m_SubtreeHeaders;
 
+	//This is only used for serialization so we don't have to add serialization directly to btAlignedObjectArray
+	int m_subtreeHeaderCount;
+
 
 	///two versions, one for quantized and normal nodes. This allows code-reuse while maintaining readability (no template/macro!)
 	///this might be refactored into a virtual, it is usually not calculated at run-time
@@ -276,7 +274,26 @@ protected:
 	void	walkRecursiveQuantizedTreeAgainstQuantizedTree(const btQuantizedBvhNode* treeNodeA,const btQuantizedBvhNode* treeNodeB,btNodeOverlapCallback* nodeCallback) const;
 	
 
-
+#define USE_BANCHLESS 1
+#ifdef USE_BANCHLESS
+	//Branch-free overlap test: yields 1 when the boxes overlap in all three components, 0 otherwise. Replaces the block below; the 32 bit return instead of an 8 bit one improves performance (~3x on XBox 360)
+	inline unsigned testQuantizedAabbAgainstQuantizedAabb(unsigned short int* aabbMin1,unsigned short int* aabbMax1,const unsigned short int* aabbMin2,const unsigned short int* aabbMax2) const
+	{
+		const unsigned overlapOn0 = (unsigned)((aabbMin1[0] <= aabbMax2[0]) & (aabbMax1[0] >= aabbMin2[0]));
+		const unsigned overlapOn2 = (unsigned)((aabbMin1[2] <= aabbMax2[2]) & (aabbMax1[2] >= aabbMin2[2]));
+		const unsigned overlapOn1 = (unsigned)((aabbMin1[1] <= aabbMax2[1]) & (aabbMax1[1] >= aabbMin2[1]));
+		return btSelect(overlapOn0 & overlapOn2 & overlapOn1, 1, 0);
+	}
+#else
+	inline bool testQuantizedAabbAgainstQuantizedAabb(unsigned short int* aabbMin1,unsigned short int* aabbMax1,const unsigned short int* aabbMin2,const unsigned short int* aabbMax2) const	// variant compiled when USE_BANCHLESS is not defined
+	{
+		bool overlap = true;	// stays true only if no component below separates the two boxes
+		overlap = (aabbMin1[0] > aabbMax2[0] || aabbMax1[0] < aabbMin2[0]) ? false : overlap;	// component 0
+		overlap = (aabbMin1[2] > aabbMax2[2] || aabbMax1[2] < aabbMin2[2]) ? false : overlap;	// component 2
+		overlap = (aabbMin1[1] > aabbMax2[1] || aabbMax1[1] < aabbMin2[1]) ? false : overlap;	// component 1
+		return overlap;
+	}
+#endif //USE_BANCHLESS
 
 	void	updateSubtreeHeaders(int leftChildNodexIndex,int rightChildNodexIndex);
 
@@ -317,6 +334,26 @@ public:
 	{
 		return m_SubtreeHeaders;
 	}
+	
+	/////Calculate space needed to store BVH for serialization
+	unsigned calculateSerializeBufferSize();
+
+	/// Data buffer MUST be 16 byte aligned
+	bool serialize(void *o_alignedDataBuffer, unsigned i_dataBufferSize, bool i_swapEndian);
+
+	///deSerializeInPlace loads and initializes a BVH from a buffer in memory 'in place'
+	static btOptimizedBvh *deSerializeInPlace(void *i_alignedDataBuffer, unsigned i_dataBufferSize, bool i_swapEndian);
+
+	inline bool isQuantized()	// reports the m_useQuantization flag, which serialize() uses to pick the quantized node array
+	{
+		return m_useQuantization;
+	}
+
+private:
+	// Special "copy" constructor that allows for in-place deserialization
+	// Prevents btVector3's default constructor from being called, but doesn't initialize much else
+	// ownsMemory should most likely be false if deserializing, and if you are not, don't call this (it also changes the function signature, which we need)
+	btOptimizedBvh(btOptimizedBvh &other, bool ownsMemory);
 
 }
 ;
diff --git a/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.cpp b/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.cpp
index dd15c8deb..d8f5e3fae 100644
--- a/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.cpp
+++ b/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.cpp
@@ -27,8 +27,7 @@ subject to the following restrictions:
 #include <new>
 #include "LinearMath/btStackAlloc.h"
 #include "LinearMath/btQuickprof.h"
-#include "btSolverBody.h"
-#include "btSolverConstraint.h"
+
 #include "BulletCollision/BroadphaseCollision/btDispatcher.h"
 #include "LinearMath/btAlignedObjectArray.h"
 
@@ -392,19 +391,41 @@ btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendly(btCollisio
 
 
 	int tmpSolverBodyPoolSize = 0;
-	int mem1 = sizeof(btSolverBody) * numActiveBodies;
-	btSolverBody* tmpSolverBodyPool = (btSolverBody*) stackAlloc->allocate(sizeof(btSolverBody) * numActiveBodies*2);
+	int memNeeded = sizeof(btSolverBody) * numActiveBodies*2;
+	btSolverBody* tmpSolverBodyPool = 0;		
+	if (memNeeded < stackAlloc->getAvailableMemory())
+	{
+		tmpSolverBodyPool = (btSolverBody*) stackAlloc->allocate(memNeeded);
+	} else
+	{
+		m_solverBodyPool.resize(numActiveBodies*2);
+		tmpSolverBodyPool = &m_solverBodyPool[0];
+	}
 	int tmpSolverConstraintPoolSize = 0;
-	int mem2 = sizeof(btSolverConstraint)*numActiveManifolds;
+	int mem2Needed = sizeof(btSolverConstraint)*totalContacts;
 
-	btSolverConstraint* tmpSolverConstraintPool = (btSolverConstraint*) stackAlloc->allocate(sizeof(btSolverConstraint)*totalContacts);
+	btSolverConstraint* tmpSolverConstraintPool = 0;
+	if (mem2Needed < stackAlloc->getAvailableMemory())
+	{
+		tmpSolverConstraintPool = (btSolverConstraint*) stackAlloc->allocate(sizeof(btSolverConstraint)*totalContacts);
+	} else
+	{
+		m_solverConstraintPool.resize(totalContacts);
+		tmpSolverConstraintPool = &m_solverConstraintPool[0];
+	}
 
 	int tmpSolverFrictionConstraintPoolSize = 0;
-	btSolverConstraint*	tmpSolverFrictionConstraintPool = (btSolverConstraint*) stackAlloc->allocate(sizeof(btSolverConstraint)*totalContacts*2);
-
-	//int sizeofSB = sizeof(btSolverBody);
-	//int sizeofSC = sizeof(btSolverConstraint);
+	int mem3Needed = sizeof(btSolverConstraint)*totalContacts*2;
+	btSolverConstraint*	tmpSolverFrictionConstraintPool = 0;
 
+	if (mem3Needed < stackAlloc->getAvailableMemory())
+	{
+		tmpSolverFrictionConstraintPool	= (btSolverConstraint*) stackAlloc->allocate(mem3Needed);
+	} else
+	{
+		m_solverFrictionConstraintPool.resize(totalContacts*2);
+		tmpSolverFrictionConstraintPool = &m_solverFrictionConstraintPool[0];
+	}
 
 	//if (1)
 	{
@@ -654,9 +675,33 @@ btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendly(btCollisio
 	///use the stack allocator for temporarily memory
 	
 	int gOrderTmpConstraintPoolSize = numConstraintPool;
-	int*	gOrderTmpConstraintPool = (int*) stackAlloc->allocate(sizeof(int)*numConstraintPool);
+
+	int mem4Needed = sizeof(int)*numConstraintPool;
+
+	int*	gOrderTmpConstraintPool = 0;
+	if (mem4Needed < stackAlloc->getAvailableMemory())
+	{
+		gOrderTmpConstraintPool = (int*) stackAlloc->allocate(mem4Needed);
+	} else
+	{
+		m_constraintOrder.resize(numConstraintPool);
+		gOrderTmpConstraintPool =	&m_constraintOrder[0];
+	}
+
 	int gOrderFrictionConstraintPoolSize = numFrictionPool;
-	int*	gOrderFrictionConstraintPool = (int*) stackAlloc->allocate(sizeof(int)*numFrictionPool);
+	int mem5Needed = sizeof(int)*numFrictionPool;
+
+		int*	gOrderFrictionConstraintPool =0;
+
+	if (mem5Needed < stackAlloc->getAvailableMemory())
+	{
+		gOrderFrictionConstraintPool = (int*) stackAlloc->allocate(mem5Needed);
+	}
+	else
+	{
+		m_frictionConstraintOrder.resize(numFrictionPool);
+		gOrderFrictionConstraintPool = &m_frictionConstraintOrder[0];
+	}
 
 	{
 		int i;
diff --git a/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.h b/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.h
index 0efcff292..b6b0a2465 100644
--- a/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.h
+++ b/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.h
@@ -19,7 +19,9 @@ subject to the following restrictions:
 #include "btConstraintSolver.h"
 class btIDebugDraw;
 #include "btContactConstraint.h"
-	
+
+#include "btSolverBody.h"
+#include "btSolverConstraint.h"
 
 
 /// btSequentialImpulseConstraintSolver uses a Propagation Method and Sequentially applies impulses
@@ -29,6 +31,14 @@ class btIDebugDraw;
 class btSequentialImpulseConstraintSolver : public btConstraintSolver
 {
 
+	btAlignedObjectArray<btSolverBody> m_solverBodyPool;
+	btAlignedObjectArray<btSolverConstraint> m_solverConstraintPool;
+	btAlignedObjectArray<btSolverConstraint> m_solverFrictionConstraintPool;
+
+	btAlignedObjectArray<int>	m_constraintOrder;
+	btAlignedObjectArray<int>	m_frictionConstraintOrder;
+
+
 protected:
 	btScalar solve(btRigidBody* body0,btRigidBody* body1, btManifoldPoint& cp, const btContactSolverInfo& info,int iter,btIDebugDraw* debugDrawer);
 	btScalar solveFriction(btRigidBody* body0,btRigidBody* body1, btManifoldPoint& cp, const btContactSolverInfo& info,int iter,btIDebugDraw* debugDrawer);
diff --git a/src/BulletDynamics/ConstraintSolver/btSolverBody.h b/src/BulletDynamics/ConstraintSolver/btSolverBody.h
index 2dda865c8..8513f1ee3 100644
--- a/src/BulletDynamics/ConstraintSolver/btSolverBody.h
+++ b/src/BulletDynamics/ConstraintSolver/btSolverBody.h
@@ -19,10 +19,11 @@ subject to the following restrictions:
 class	btRigidBody;
 #include "LinearMath/btVector3.h"
 #include "LinearMath/btMatrix3x3.h"
+#include "BulletDynamics/Dynamics/btRigidBody.h"
 
 
 
-
+///btSolverBody is an internal datastructure for the constraint solver. Only necessary data is packed to increase cache coherence/performance.
 ATTRIBUTE_ALIGNED16 (struct)	btSolverBody
 {
 	btVector3		m_centerOfMassPosition;
diff --git a/src/LinearMath/btAlignedObjectArray.h b/src/LinearMath/btAlignedObjectArray.h
index ac86b5466..61a62a7ad 100644
--- a/src/LinearMath/btAlignedObjectArray.h
+++ b/src/LinearMath/btAlignedObjectArray.h
@@ -50,6 +50,8 @@ class btAlignedObjectArray
 	int					m_size;
 	int					m_capacity;
 	T*					m_data;
+	//PCK: added this line
+	bool				m_ownsMemory;
 
 	protected:
 		SIMD_FORCE_INLINE	int	allocSize(int size)
@@ -69,6 +71,8 @@ class btAlignedObjectArray
 
 		SIMD_FORCE_INLINE	void	init()
 		{
+			//PCK: added this line
+			m_ownsMemory = true;
 			m_data = 0;
 			m_size = 0;
 			m_capacity = 0;
@@ -92,7 +96,11 @@ class btAlignedObjectArray
 		SIMD_FORCE_INLINE	void	deallocate()
 		{
 			if(m_data)	{
-				m_allocator.deallocate(m_data);
+				//PCK: enclosed the deallocation in this block
+				if (m_ownsMemory)
+				{
+					m_allocator.deallocate(m_data);
+				}
 				m_data = 0;
 			}
 		}
@@ -223,6 +231,9 @@ class btAlignedObjectArray
 				destroy(0,size());
 
 				deallocate();
+				
+				//PCK: added this line
+				m_ownsMemory = true;
 
 				m_data = s;
 				
@@ -360,8 +371,16 @@ class btAlignedObjectArray
 		}
 	}
 
+	//PCK: adopt a caller-provided buffer as this array's storage; m_ownsMemory is cleared so deallocate() will not free it
+	void initializeFromBuffer(void *buffer, int size, int capacity)
+	{
+		clear();
+		m_capacity = capacity;
+		m_size = size;
+		m_data = static_cast<T*>(buffer);
+		m_ownsMemory = false;
+	}
+
 };
 
 #endif //BT_OBJECT_ARRAY__
-
-
diff --git a/src/LinearMath/btQuadWord.h b/src/LinearMath/btQuadWord.h
index 24751b1be..7ce97e525 100644
--- a/src/LinearMath/btQuadWord.h
+++ b/src/LinearMath/btQuadWord.h
@@ -22,10 +22,17 @@ subject to the following restrictions:
 class btQuadWordStorage
 {
 protected:
-	btScalar	m_x;
-	btScalar	m_y;
-	btScalar	m_z;
-	btScalar	m_unusedW;
+#ifdef BT_USE_DOUBLE_PRECISION
+	union { btScalar	m_x; unsigned long long int m_intx; unsigned char m_charx[8] ;};
+	union { btScalar	m_y; unsigned long long int m_inty; unsigned char m_chary[8] ; };
+	union { btScalar	m_z; unsigned long long int m_intz; unsigned char m_charz[8] ; };
+	union { btScalar	m_unusedW; unsigned long long int m_intw; unsigned char m_charw[8] ; };
+#else
+	union { btScalar	m_x; unsigned int m_intx; unsigned char m_charx[4] ;};
+	union { btScalar	m_y; unsigned int m_inty; unsigned char m_chary[4] ; };
+	union { btScalar	m_z; unsigned int m_intz; unsigned char m_charz[4] ; };
+	union { btScalar	m_unusedW; unsigned int m_intw; unsigned char m_charw[4] ; };
+#endif //BT_USE_DOUBLE_PRECISION
 };
 
 
@@ -63,6 +70,79 @@ class	btQuadWord : public btQuadWordStorage
 		SIMD_FORCE_INLINE	operator       btScalar *()       { return &m_x; }
 		SIMD_FORCE_INLINE	operator const btScalar *() const { return &m_x; }
 
+#ifdef BT_USE_DOUBLE_PRECISION
+		SIMD_FORCE_INLINE unsigned long long int getLongIntXValue() const
+		{
+			return m_intx;
+		}
+		SIMD_FORCE_INLINE unsigned long long int getLongIntYValue() const
+		{
+			return m_inty;
+		}
+		SIMD_FORCE_INLINE unsigned long long int getLongIntZValue() const
+		{
+			return m_intz;
+		}
+		SIMD_FORCE_INLINE unsigned long long int getLongIntWValue() const
+		{
+			return m_intw;
+		}
+		SIMD_FORCE_INLINE void 	setXValueByLongInt(unsigned long long int intval)
+		{
+			m_intx = intval;
+		}
+
+		SIMD_FORCE_INLINE void 	setYValueByLongInt(unsigned long long int intval)
+		{
+			m_inty = intval;
+		}
+
+		SIMD_FORCE_INLINE void 	setZValueByLongInt(unsigned long long int intval)
+		{
+			m_intz = intval;
+		}
+		SIMD_FORCE_INLINE void 	setWValueByLongInt(unsigned long long int intval)
+		{
+			m_intw = intval;	// raw bits of the fourth (W) component; previously this overwrote m_intz
+		}
+#else
+		SIMD_FORCE_INLINE unsigned int getIntXValue() const
+		{
+			return m_intx;
+		}
+		SIMD_FORCE_INLINE unsigned  int getIntYValue() const
+		{
+			return m_inty;
+		}
+		SIMD_FORCE_INLINE unsigned int getIntZValue() const
+		{
+			return m_intz;
+		}
+		SIMD_FORCE_INLINE unsigned int getIntWValue() const
+		{
+			return m_intw;
+		}
+		SIMD_FORCE_INLINE void 	setXValueByInt(unsigned int intval)
+		{
+			m_intx = intval;
+		}
+
+		SIMD_FORCE_INLINE void 	setYValueByInt(unsigned int intval)
+		{
+			m_inty = intval;
+		}
+
+		SIMD_FORCE_INLINE void 	setZValueByInt(unsigned int intval)
+		{
+			m_intz = intval;
+		}
+		SIMD_FORCE_INLINE void 	setWValueByInt(unsigned int intval)
+		{
+			m_intw = intval;
+		}
+
+#endif//BT_USE_DOUBLE_PRECISION
+
 		SIMD_FORCE_INLINE void 	setValue(const btScalar& x, const btScalar& y, const btScalar& z)
 		{
 			m_x=x;
diff --git a/src/LinearMath/btScalar.h b/src/LinearMath/btScalar.h
index 05df31945..2cc6ba5ec 100644
--- a/src/LinearMath/btScalar.h
+++ b/src/LinearMath/btScalar.h
@@ -24,49 +24,6 @@ subject to the following restrictions:
 #include <float.h>
 
 
-#ifdef WIN32
-
-//added __cdecl, thanks Jack
-
-// default new and delete overrides that guarantee 16 byte alignment and zero allocated memory
-void* __cdecl operator new(size_t sz) throw();
-void* __cdecl operator new[](size_t sz) throw();
-void __cdecl operator delete(void* m) throw();
-void __cdecl operator delete[](void* m) throw();
-
-#include <malloc.h>
-#include <stdio.h>
-#define BULLET_ALIGNED_NEW_AND_DELETE \
-\
-inline void* operator new(size_t sz) throw()	\
-{												\
-	printf("new %d\n",sz);						\
-	void* mem = _aligned_malloc(sz + 64, 16);	\
-	return mem;									\
-}												\
-												\
-inline void* operator new[](size_t sz) throw()	\
-{												\
-	printf("new[] %d\n",sz);					\
-	void* mem = _aligned_malloc(sz + 64, 16);	\
-	return mem;									\
-}												\
-												\
-inline void operator delete(void* m) throw()	\
-{					\
-printf("delete %x\n",m);						\
-	if (m == 0)									\
-		return;									\
-	_aligned_free(m);							\
-}												\
-												\
-inline void operator delete[](void* m) throw()	\
-{												\
-	printf("delete[] %x\n",m); \
-_aligned_free(m);							\
-}												\
-
-#endif
 
 #ifdef WIN32
 
@@ -227,6 +184,18 @@ SIMD_FORCE_INLINE btScalar btFsel(btScalar a, btScalar b, btScalar c)
 #define btFsels(a,b,c) (btScalar)btFsel(a,b,c)
 
 
+SIMD_FORCE_INLINE bool btMachineIsLittleEndian()
+{
+   // Store the value 1 in a multi-byte integer and look at the byte at its lowest address:
+   // on a little-endian machine that byte is the least significant one, i.e. it holds the 1.
+   const long int probe = 1;
+   const char *lowestByte = (const char *) &probe;
+
+   return lowestByte[0] == 1;
+}
+
+
+
 ///btSelect avoids branches, which makes performance much better for consoles like Playstation 3 and XBox 360
 ///Thanks Phil Knight. See also http://www.cellperformance.com/articles/2006/04/more_techniques_for_eliminatin_1.html
 SIMD_FORCE_INLINE unsigned btSelect(unsigned condition, unsigned valueIfConditionNonZero, unsigned valueIfConditionZero) 
@@ -255,4 +224,99 @@ SIMD_FORCE_INLINE float btSelect(unsigned condition, float valueIfConditionNonZe
 }
 
 
+//PCK: endian swapping functions. This overload reverses the four bytes of a 32 bit value.
+SIMD_FORCE_INLINE unsigned btSwapEndian(unsigned val)
+{
+	return ((val >> 24) & 0x000000ff) | ((val >> 8) & 0x0000ff00) | ((val << 8) & 0x00ff0000) | ((val << 24) & 0xff000000);
+}
+
+SIMD_FORCE_INLINE unsigned short btSwapEndian(unsigned short val)
+{
+	return (unsigned short)((val >> 8) | (val << 8));	// exchange the two bytes; the cast drops the bits shifted above 16
+}
+
+SIMD_FORCE_INLINE unsigned btSwapEndian(int val)	// signed convenience overload; the swapped bits come back as unsigned
+{
+	return btSwapEndian((unsigned)val);
+}
+
+SIMD_FORCE_INLINE unsigned short btSwapEndian(short val)	// signed convenience overload; the swapped bits come back as unsigned short
+{
+	return btSwapEndian((unsigned short) val);
+}
+
+///btSwapEndianFloat uses char pointers to swap the endianness
+///btSwapEndianFloat/btSwapEndianDouble will NOT return a float/double, because the machine might 'correct' invalid floating point values
+///Not all values of sign/exponent/mantissa are valid floating point numbers according to IEEE 754. 
+///When a floating point unit is faced with an invalid value, it may actually change the value, or worse, throw an exception. 
+///In most systems, running user mode code, you wouldn't get an exception, but instead the hardware/os/runtime will 'fix' the number for you. 
+///So instead of returning a float/double, we return an integer/long long integer
+SIMD_FORCE_INLINE unsigned int  btSwapEndianFloat(float d)
+{
+    // Reverse the four bytes of d; the result is returned as an integer bit pattern, not as a float.
+    unsigned int swappedBits;
+    unsigned char *out = (unsigned char *)&swappedBits;
+    const unsigned char *in = (const unsigned char *)&d;
+    for (int i = 0; i < 4; i++)
+    {
+        out[i] = in[3 - i];
+    }
+    return swappedBits;
+}
+
+// unswap using char pointers: reverses the four bytes of a and hands the result back as a float
+SIMD_FORCE_INLINE float btUnswapEndianFloat(unsigned int a) 
+{
+    float restored;
+    const unsigned char *in = (const unsigned char *)&a;
+    unsigned char *out = (unsigned char *)&restored;
+
+    for (int i = 0; i < 4; i++)
+    {
+        out[i] = in[3 - i];
+    }
+
+    return restored;
+}
+
+
+// swap using char pointers
+// Reverses the byte order of d (eight bytes are copied, so double is taken to be 8 bytes wide).
+// The swapped bits are returned as an integer bit pattern, not as a double.
+SIMD_FORCE_INLINE unsigned long long  btSwapEndianDouble(double d)
+{
+    unsigned long long swappedBits;
+    unsigned char *out = (unsigned char *)&swappedBits;
+    const unsigned char *in = (const unsigned char *)&d;
+
+    // out[0] takes in[7], out[1] takes in[6], ... out[7] takes in[0]
+    for (int i = 0; i < 8; i++)
+    {
+        out[i] = in[7 - i];
+    }
+
+
+    return swappedBits;
+}
+
+// unswap using char pointers
+// Reverses the byte order of a (eight bytes are copied, so double is taken to be 8 bytes wide)
+// and hands the result back as a double.
+SIMD_FORCE_INLINE double btUnswapEndianDouble(unsigned long long a) 
+{
+    double restored;
+    const unsigned char *in = (const unsigned char *)&a;
+    unsigned char *out = (unsigned char *)&restored;
+
+    // out[0] takes in[7], out[1] takes in[6], ... out[7] takes in[0]
+    for (int i = 0; i < 8; i++)
+    {
+        out[i] = in[7 - i];
+    }
+
+
+    return restored;
+}
+
+
 #endif //SIMD___SCALAR_H
diff --git a/src/LinearMath/btStackAlloc.h b/src/LinearMath/btStackAlloc.h
index 8a3eed67d..6c03998b4 100644
--- a/src/LinearMath/btStackAlloc.h
+++ b/src/LinearMath/btStackAlloc.h
@@ -55,6 +55,12 @@ public:
 		}
 		
 	}
+
+	int	getAvailableMemory() const	// remaining capacity: totalsize minus usedsize
+	{
+		return static_cast<int>(totalsize - usedsize);
+	}
+
 	unsigned char*			allocate(unsigned int size)
 	{
 		const unsigned int	nus(usedsize+size);
diff --git a/src/LinearMath/btVector3.h b/src/LinearMath/btVector3.h
index 2153dcea5..f85a20f49 100644
--- a/src/LinearMath/btVector3.h
+++ b/src/LinearMath/btVector3.h
@@ -403,4 +403,58 @@ public:
 
 };
 
+
+///btSwapVector3Endian byte-swaps every component of source into dest, useful for network and cross-platform serialization.
+///x, y, z and the fourth element source[3] are each swapped to an integer and handed to dest through its set*ValueByInt/LongInt setters.
+///Presumably dest is later restored with btUnSwapVector3Endian, which applies the reverse byte swap.
+SIMD_FORCE_INLINE void	btSwapVector3Endian(const btVector3& source, btVector3& dest)
+{
+#ifdef BT_USE_DOUBLE_PRECISION
+	unsigned long long int tmp = btSwapEndianDouble(source.getX());
+	dest.setXValueByLongInt(tmp);
+	tmp = btSwapEndianDouble(source.getY());
+	dest.setYValueByLongInt(tmp);
+	tmp = btSwapEndianDouble(source.getZ());
+	dest.setZValueByLongInt(tmp);
+	tmp = btSwapEndianDouble(source[3]);
+	dest.setWValueByLongInt(tmp);
+#else
+	unsigned int tmp = btSwapEndianFloat(source.getX());
+	dest.setXValueByInt(tmp);
+	tmp = btSwapEndianFloat(source.getY());
+	dest.setYValueByInt(tmp);
+	tmp = btSwapEndianFloat(source.getZ());
+	dest.setZValueByInt(tmp);
+	tmp = btSwapEndianFloat(source[3]);
+	dest.setWValueByInt(tmp);
+#endif //BT_USE_DOUBLE_PRECISION
+}
+///btUnSwapVector3Endian reverses the byte swap of a vector in place, useful for network and cross-platform serialization.
+///Each component's integer value is read through the get*Int/LongInt*Value accessors, byte-reversed back to a scalar,
+///and written into the same vector; the fourth element is assigned through vector[3].
+SIMD_FORCE_INLINE void	btUnSwapVector3Endian(btVector3& vector)
+{
+#ifdef BT_USE_DOUBLE_PRECISION
+	unsigned long long int tmp = vector.getLongIntXValue();
+	vector.setX( btUnswapEndianDouble(tmp));
+	tmp = vector.getLongIntYValue();
+	vector.setY( btUnswapEndianDouble(tmp));
+	tmp = vector.getLongIntZValue();
+	vector.setZ( btUnswapEndianDouble(tmp));
+	tmp = vector.getLongIntWValue();
+	vector[3] = btUnswapEndianDouble(tmp);
+#else
+	unsigned int tmp = vector.getIntXValue();
+	vector.setX( btUnswapEndianFloat(tmp));
+	tmp = vector.getIntYValue();
+	vector.setY( btUnswapEndianFloat(tmp));
+	tmp = vector.getIntZValue();
+	vector.setZ( btUnswapEndianFloat(tmp));
+	tmp = vector.getIntWValue();
+	vector[3] = btUnswapEndianFloat(tmp);
+
+#endif //BT_USE_DOUBLE_PRECISION
+
+}
+
 #endif //SIMD__VECTOR3_H