moved Extras/Gimpact to src/BulletCollision/Gimpact

moved Extras/BulletMultiThreaded to src/BulletMultiThreaded (build systems will be updated soon)
2008-10-10 19:48:36 +00:00
parent 6f6f88fa08
commit 512c0f167e
111 changed files with 10811 additions and 11799 deletions
--- a/src/BulletMultiThreaded/CMakeLists.txt
+++ b/src/BulletMultiThreaded/CMakeLists.txt
@@ -0,0 +1,66 @@
+# Header search paths: this library's own directory (now under src/, no longer
+# under Extras/) and the Bullet source root.
+INCLUDE_DIRECTORIES(
+${BULLET_PHYSICS_SOURCE_DIR}/src/BulletMultiThreaded/
+${BULLET_PHYSICS_SOURCE_DIR}/src
+)
+
+# Library target LibBulletMultiThreaded. Headers are listed next to their
+# sources; all paths are relative to this directory.
+ADD_LIBRARY(LibBulletMultiThreaded
+		PlatformDefinitions.h
+		SpuFakeDma.cpp
+		SpuFakeDma.h
+		SpuSync.h
+		SpuDoubleBuffer.h
+		SpuLibspe2Support.cpp
+		SpuLibspe2Support.h
+		btThreadSupportInterface.cpp
+		btThreadSupportInterface.h
+		
+		# thread support back-ends and the sample task
+		Win32ThreadSupport.cpp
+		Win32ThreadSupport.h
+		PosixThreadSupport.cpp
+		PosixThreadSupport.h
+		SequentialThreadSupport.cpp
+		SequentialThreadSupport.h
+		SpuSampleTaskProcess.h
+		SpuSampleTaskProcess.cpp
+
+		# collision dispatch and the narrow-phase task sources
+		SpuCollisionObjectWrapper.cpp 
+		SpuCollisionObjectWrapper.h 
+		SpuCollisionTaskProcess.h
+		SpuCollisionTaskProcess.cpp
+		SpuGatheringCollisionDispatcher.h
+		SpuGatheringCollisionDispatcher.cpp
+		SpuContactManifoldCollisionAlgorithm.cpp
+		SpuContactManifoldCollisionAlgorithm.h
+		SpuNarrowPhaseCollisionTask/SpuContactResult.cpp
+		SpuNarrowPhaseCollisionTask/SpuContactResult.h
+		SpuNarrowPhaseCollisionTask/SpuMinkowskiPenetrationDepthSolver.cpp
+		SpuNarrowPhaseCollisionTask/SpuEpaPenetrationDepthSolver.cpp
+		SpuNarrowPhaseCollisionTask/SpuEpaPenetrationDepthSolver.h
+		SpuNarrowPhaseCollisionTask/SpuGjkEpa2.cpp
+		SpuNarrowPhaseCollisionTask/SpuGjkEpa2.h
+		SpuNarrowPhaseCollisionTask/SpuMinkowskiPenetrationDepthSolver.h
+		SpuNarrowPhaseCollisionTask/SpuConvexPenetrationDepthSolver.h
+		SpuNarrowPhaseCollisionTask/SpuPreferredPenetrationDirections.h
+		SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.cpp
+		SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.h
+		SpuNarrowPhaseCollisionTask/SpuVoronoiSimplexSolver.cpp
+		SpuNarrowPhaseCollisionTask/SpuVoronoiSimplexSolver.h
+		SpuNarrowPhaseCollisionTask/SpuGjkPairDetector.cpp
+		SpuNarrowPhaseCollisionTask/SpuGjkPairDetector.h
+		SpuNarrowPhaseCollisionTask/SpuCollisionShapes.cpp
+		SpuNarrowPhaseCollisionTask/SpuCollisionShapes.h
+
+		# parallel solver
+		SpuParallelSolver.cpp
+		SpuParallelSolver.h
+		SpuSolverTask/SpuParallellSolverTask.cpp
+		SpuSolverTask/SpuParallellSolverTask.h
+
+		# batch raycasting
+		SpuBatchRaycaster.cpp
+		SpuBatchRaycaster.h
+		SpuRaycastTaskProcess.cpp
+		SpuRaycastTaskProcess.h
+		SpuRaycastTask/SpuRaycastTask.cpp
+		SpuRaycastTask/SpuRaycastTask.h
+		SpuRaycastTask/SpuSubSimplexConvexCast.cpp
+		SpuRaycastTask/SpuSubSimplexConvexCast.h
+)
--- a/src/BulletMultiThreaded/Jamfile
+++ b/src/BulletMultiThreaded/Jamfile
@@ -0,0 +1,12 @@
+SubDir TOP src BulletMultiThreaded ;
+
+#IncludeDir src/BulletMultiThreaded ;
+
+# Build the library from the sources in this directory and in the narrow-phase
+# and solver task subdirectories. The header pattern is *.h (it used to read */.h).
+Library bulletmultithreaded : [ Wildcard . : *.h *.cpp ] [ Wildcard SpuNarrowPhaseCollisionTask : *.h *.cpp  ] [ Wildcard SpuSolverTask : *.h *.cpp  ] : noinstall ;
+CFlags bulletmultithreaded : [ FIncludes $(TOP)/src/BulletMultiThreaded ] ;
+LibDepends bulletmultithreaded :  ;
+
+   MsvcIncDirs bulletmultithreaded : 
+	"../../src/BulletMultiThreaded"  ;
+
+InstallHeader [ Wildcard *.h ] : bulletmultithreaded ;
--- a/src/BulletMultiThreaded/Makefile.original
+++ b/src/BulletMultiThreaded/Makefile.original
@@ -0,0 +1,177 @@
+# Word size of the PPU build (32 or 64); selects -m32/-m64, the PPU compiler
+# and whether USE_ADDR64 is defined further down.
+__ARCH_BITS__ := 32
+
+# define macros
+NARROWPHASEDIR=./SpuNarrowPhaseCollisionTask
+SPU_TASKFILE=$(NARROWPHASEDIR)/SpuGatheringCollisionTask
+
+# Expands to "3.0" when /opt/cell exists, otherwise to nothing.
+IBM_CELLSDK_VERSION := $(shell if [ -d /opt/cell ]; then echo "3.0"; fi)
+
+# Pick the SDK root and sysroot. The assignments are indented with spaces only:
+# a tab-led line is treated as recipe text as soon as any rule precedes it.
+ifeq ("$(IBM_CELLSDK_VERSION)","3.0")
+  CELL_TOP ?= /opt/cell/sdk
+  CELL_SYSROOT := /opt/cell/sysroot
+else
+  CELL_TOP ?= /opt/ibm/cell-sdk/prototype
+  CELL_SYSROOT := $(CELL_TOP)/sysroot
+endif
+
+
+# Prefix put in front of the SPU and PPU compiler commands below.
+USE_CCACHE=ccache
+RM=rm -f 
+# Object files go to OUTDIR, archives to LIBOUTDIR; the relative paths are
+# resolved against the directory make is run from.
+OUTDIR=./out
+DEBUGFLAG=-DNDEBUG
+LIBOUTDIR=../../lib/ibmsdk
+COLLISIONDIR=../../src/BulletCollision
+MATHDIR=../../src/LinearMath
+ARCHITECTUREFLAG=-m$(__ARCH_BITS__)
+# SPU preprocessor defines; the 64-bit build additionally defines USE_ADDR64.
+ifeq "$(__ARCH_BITS__)" "64"
+  SPU_DEFFLAGS= -DUSE_LIBSPE2 -D__SPU__ -DUSE_ADDR64
+else
+  SPU_DEFFLAGS= -DUSE_LIBSPE2 -D__SPU__
+endif
+SPU_GCC=$(USE_CCACHE) /usr/bin/spu-gcc
+SPU_INCLUDEDIR= -I. -I$(CELL_SYSROOT)/usr/spu/include -I../../src -I$(NARROWPHASEDIR)
+#SPU_CFLAGS= $(DEBUGFLAG) -W -Wall -Winline -Os -c -include spu_intrinsics.h -include stdbool.h
+SPU_CFLAGS= $(DEBUGFLAG) -W -Wall -Winline -O3 -mbranch-hints -fomit-frame-pointer -ftree-vectorize -finline-functions -ftree-vect-loop-version -ftree-loop-optimize -ffast-math -fno-rtti -fno-exceptions -c -include spu_intrinsics.h -include stdbool.h
+
+SPU_LFLAGS= -Wl,-N
+SPU_LIBRARIES=-lstdc++
+SPU_EMBED=/usr/bin/ppu-embedspu
+SPU_AR=/usr/bin/ar
+# Second argument handed to $(SPU_EMBED) in the spu-embed rule.
+SYMBOLNAME=spu_program
+
+# PPU compiler and defines: ppu-gcc with USE_ADDR64 for 64 bit, ppu32-gcc otherwise.
+ifeq "$(__ARCH_BITS__)" "64"
+  PPU_DEFFLAGS= -DUSE_LIBSPE2 -DUSE_ADDR64
+  PPU_GCC=$(USE_CCACHE) /usr/bin/ppu-gcc
+else
+  PPU_DEFFLAGS= -DUSE_LIBSPE2
+  PPU_GCC=$(USE_CCACHE) /usr/bin/ppu32-gcc
+endif
+
+PPU_CFLAGS= $(ARCHITECTUREFLAG) $(DEBUGFLAG) -W -Wall -Winline -O3 -c -mabi=altivec -maltivec -include altivec.h -include stdbool.h
+PPU_INCLUDEDIR= -I. -I$(CELL_SYSROOT)/usr/include -I../../src -I$(NARROWPHASEDIR)
+# NOTE(review): PPU_LFLAGS and PPU_LIBRARIES are not referenced by any rule visible in this file.
+PPU_LFLAGS= $(ARCHITECTUREFLAG) -Wl,-m,elf$(__ARCH_BITS__)ppc
+PPU_LIBRARIES= -lstdc++ -lsupc++ -lgcc -lgcov -lspe2 -lpthread -L../../lib/ibmsdk -lbulletcollision -lbulletdynamics -lbulletmath -L$(CELL_SYSROOT)/usr/lib$(__ARCH_BITS__) -R$(CELL_SYSROOT)/usr/lib
+PPU_AR=/usr/bin/ar
+
+# Common prerequisite of every compile rule (and the default goal): creates the
+# object and library output directories if they are missing, then prints the
+# usage hint. mkdir -p leaves existing directories and their contents untouched.
+MakeOut :
+	@mkdir -p $(OUTDIR) $(LIBOUTDIR)
+	@echo "usage: make spu, make ppu, make all, or make clean"
+# SPU
+# Each rule below compiles one source file with the SPU compiler into
+# $(OUTDIR)/<target>.o. The targets are plain names rather than the object
+# files they produce, so every invocation recompiles.
+SpuTaskFile : MakeOut
+	$(SPU_GCC) $(SPU_DEFFLAGS) $(SPU_CFLAGS) $(SPU_INCLUDEDIR) -o $(OUTDIR)/SpuTaskFile.o $(SPU_TASKFILE).cpp
+
+SpuFakeDma : MakeOut
+	$(SPU_GCC) $(SPU_DEFFLAGS) $(SPU_CFLAGS) $(SPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $@.cpp
+
+# SPU build of a source that the PPU section compiles too; the _spu suffix keeps the objects apart.
+SpuContactManifoldCollisionAlgorithm_spu : MakeOut
+	$(SPU_GCC) $(SPU_DEFFLAGS) $(SPU_CFLAGS) $(SPU_INCLUDEDIR) -o $(OUTDIR)/$@.o SpuContactManifoldCollisionAlgorithm.cpp
+
+SpuCollisionShapes : MakeOut
+	$(SPU_GCC) $(SPU_DEFFLAGS) $(SPU_CFLAGS) $(SPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $(NARROWPHASEDIR)/$@.cpp
+
+SpuContactResult : MakeOut
+	$(SPU_GCC) $(SPU_DEFFLAGS) $(SPU_CFLAGS) $(SPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $(NARROWPHASEDIR)/$@.cpp
+
+#SpuGatheringCollisionTask : MakeOut
+#	$(SPU_GCC) $(SPU_DEFFLAGS) $(SPU_CFLAGS) $(SPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $(NARROWPHASEDIR)/$@.cpp
+
+SpuGjkPairDetector: MakeOut
+	$(SPU_GCC) $(SPU_DEFFLAGS) $(SPU_CFLAGS) $(SPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $(NARROWPHASEDIR)/$@.cpp
+
+SpuMinkowskiPenetrationDepthSolver : MakeOut
+	$(SPU_GCC) $(SPU_DEFFLAGS) $(SPU_CFLAGS) $(SPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $(NARROWPHASEDIR)/$@.cpp
+
+SpuVoronoiSimplexSolver : MakeOut
+	$(SPU_GCC) $(SPU_DEFFLAGS) $(SPU_CFLAGS) $(SPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $(NARROWPHASEDIR)/$@.cpp
+
+#SpuLibspe2Support_spu : MakeOut
+#	$(SPU_GCC) $(SPU_DEFFLAGS) $(SPU_CFLAGS) $(SPU_INCLUDEDIR) -o $(OUTDIR)/$@.o SpuLibspe2Support.cpp
+
+## SPU-Bullet
+# Bullet core sources (from $(COLLISIONDIR) and $(MATHDIR)) compiled with the
+# SPU compiler; each target name doubles as the source file's base name.
+btPersistentManifold : MakeOut
+	$(SPU_GCC) $(SPU_DEFFLAGS) $(SPU_CFLAGS) $(SPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $(COLLISIONDIR)/NarrowPhaseCollision/$@.cpp
+
+btOptimizedBvh : MakeOut
+	$(SPU_GCC) $(SPU_DEFFLAGS) $(SPU_CFLAGS) $(SPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $(COLLISIONDIR)/CollisionShapes/$@.cpp
+
+btCollisionObject : MakeOut
+	$(SPU_GCC) $(SPU_DEFFLAGS) $(SPU_CFLAGS) $(SPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $(COLLISIONDIR)/CollisionDispatch/$@.cpp
+
+btTriangleCallback : MakeOut
+	$(SPU_GCC) $(SPU_DEFFLAGS) $(SPU_CFLAGS) $(SPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $(COLLISIONDIR)/CollisionShapes/$@.cpp
+
+btTriangleIndexVertexArray : MakeOut
+	$(SPU_GCC) $(SPU_DEFFLAGS) $(SPU_CFLAGS) $(SPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $(COLLISIONDIR)/CollisionShapes/$@.cpp
+
+btStridingMeshInterface : MakeOut
+	$(SPU_GCC) $(SPU_DEFFLAGS) $(SPU_CFLAGS) $(SPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $(COLLISIONDIR)/CollisionShapes/$@.cpp
+
+btAlignedAllocator : MakeOut
+	$(SPU_GCC) $(SPU_DEFFLAGS) $(SPU_CFLAGS) $(SPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $(MATHDIR)/$@.cpp
+
+
+# PPU
+# Each rule compiles <target>.cpp from this directory with the PPU compiler
+# into $(OUTDIR)/<target>.o.
+SpuGatheringCollisionDispatcher : MakeOut
+	$(PPU_GCC) $(PPU_DEFFLAGS) $(PPU_CFLAGS) $(PPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $@.cpp
+
+SpuLibspe2Support: MakeOut
+	$(PPU_GCC) $(PPU_DEFFLAGS) $(PPU_CFLAGS) $(PPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $@.cpp
+
+btThreadSupportInterface: MakeOut
+	$(PPU_GCC) $(PPU_DEFFLAGS) $(PPU_CFLAGS) $(PPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $@.cpp
+
+SpuCollisionTaskProcess : MakeOut
+	$(PPU_GCC) $(PPU_DEFFLAGS) $(PPU_CFLAGS) $(PPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $@.cpp
+
+SpuContactManifoldCollisionAlgorithm : MakeOut
+	$(PPU_GCC) $(PPU_DEFFLAGS) $(PPU_CFLAGS) $(PPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $@.cpp
+	
+SpuSampleTaskProcess : MakeOut
+	$(PPU_GCC) $(PPU_DEFFLAGS) $(PPU_CFLAGS) $(PPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $@.cpp
+	
+
+
+# Link the SPU objects into $(OUTDIR)/spuCollision.elf.
+# NOTE(review): btOptimizedBvh and btCollisionObject are prerequisites, so they
+# get compiled, but their objects are not on the link line - confirm whether
+# that is intentional.
+spu : SpuFakeDma SpuContactManifoldCollisionAlgorithm_spu SpuContactResult SpuTaskFile \
+      SpuGjkPairDetector SpuMinkowskiPenetrationDepthSolver SpuVoronoiSimplexSolver SpuCollisionShapes \
+      btPersistentManifold btOptimizedBvh btCollisionObject btTriangleCallback btTriangleIndexVertexArray \
+      btStridingMeshInterface btAlignedAllocator
+	$(SPU_GCC) -o $(OUTDIR)/spuCollision.elf \
+                            $(OUTDIR)/SpuTaskFile.o \
+                            $(OUTDIR)/SpuFakeDma.o \
+                            $(OUTDIR)/SpuContactManifoldCollisionAlgorithm_spu.o \
+                            $(OUTDIR)/SpuContactResult.o \
+                            $(OUTDIR)/SpuCollisionShapes.o \
+                            $(OUTDIR)/SpuGjkPairDetector.o \
+                            $(OUTDIR)/SpuMinkowskiPenetrationDepthSolver.o \
+                            $(OUTDIR)/SpuVoronoiSimplexSolver.o \
+                            $(OUTDIR)/btPersistentManifold.o \
+                            $(OUTDIR)/btTriangleCallback.o \
+                            $(OUTDIR)/btTriangleIndexVertexArray.o \
+                            $(OUTDIR)/btStridingMeshInterface.o \
+                            $(OUTDIR)/btAlignedAllocator.o \
+                            $(SPU_LFLAGS) $(SPU_LIBRARIES)
+
+# Wrap the SPU ELF image in a PPU object and store it in libspu.a.
+# ar uses 'r' (replace) instead of 'q' (quick append): with 'q' every re-run
+# appended another copy of the member, leaving stale code first in the archive.
+spu-embed : spu
+	$(SPU_EMBED) $(ARCHITECTUREFLAG) $(SYMBOLNAME) $(OUTDIR)/spuCollision.elf $(OUTDIR)/$@.o
+	$(SPU_AR) -rcs $(LIBOUTDIR)/libspu.a $(OUTDIR)/$@.o
+
+
+
+# Archive the PPU objects into bulletmultithreaded.a.
+# ar uses 'r' (replace) instead of 'q' (quick append) so that rebuilding
+# updates the existing members instead of appending duplicates.
+ppu : SpuGatheringCollisionDispatcher SpuCollisionTaskProcess btThreadSupportInterface \
+      SpuLibspe2Support SpuContactManifoldCollisionAlgorithm SpuSampleTaskProcess
+	$(PPU_AR) -rcs $(LIBOUTDIR)/bulletmultithreaded.a \
+                            $(OUTDIR)/SpuCollisionTaskProcess.o \
+                            $(OUTDIR)/SpuSampleTaskProcess.o \
+                            $(OUTDIR)/SpuGatheringCollisionDispatcher.o \
+                            $(OUTDIR)/SpuLibspe2Support.o \
+                            $(OUTDIR)/btThreadSupportInterface.o \
+                            $(OUTDIR)/SpuContactManifoldCollisionAlgorithm.o
+
+# all and clean are commands, not files: run them even if a file of the same
+# name exists in this directory.
+.PHONY : all clean
+
+all : spu-embed ppu 
+
+# Remove every generated object plus both archives in a single rm invocation.
+clean:
+	$(RM) $(OUTDIR)/* $(LIBOUTDIR)/libspu.a $(LIBOUTDIR)/bulletmultithreaded.a
+
+
+
+
--- a/src/BulletMultiThreaded/PlatformDefinitions.h
+++ b/src/BulletMultiThreaded/PlatformDefinitions.h
@@ -0,0 +1,82 @@
+#ifndef TYPE_DEFINITIONS_H
+#define TYPE_DEFINITIONS_H
+
+///This file provides some platform/compiler checks for common definitions:
+///fixed-width integer typedefs, the addr64 union, spu_printf and the macro
+///that selects the threading back-end (USE_WIN32_THREADING or USE_PTHREADS).
+
+#ifdef WIN32
+
+// overlays an unsigned int and a pointer
+typedef union
+{
+  unsigned int u;
+  void *p;
+} addr64;
+
+#define USE_WIN32_THREADING 1
+
+		#if defined(__MINGW32__) || defined(__CYGWIN__) || (defined (_MSC_VER) && _MSC_VER < 1300)
+		#else
+		#endif //__MINGW32__
+
+		typedef unsigned char     uint8_t;
+#ifndef __PHYSICS_COMMON_H__
+		// NOTE(review): 'unsigned long' is 32 bits wide on Windows, so this
+		// uint64_t is not actually 64 bits there - confirm before relying on USE_ADDR64.
+		typedef unsigned long int uint64_t;
+		typedef unsigned int      uint32_t;
+#endif //__PHYSICS_COMMON_H__
+		typedef unsigned short    uint16_t;
+
+		#include <malloc.h>
+		// memalign is mapped onto plain malloc, so the alignment argument is ignored.
+		// The macro has no trailing semicolon so that it can be used inside expressions.
+		#define memalign(alignment, size) malloc(size)
+			
+#include <string.h> //memcpy
+
+		
+
+		#include <stdio.h>		
+		#define spu_printf printf
+		
+#else
+		#include <stdint.h>
+		#include <stdlib.h>
+		#include <string.h> //for memcpy
+
+#if defined	(__CELLOS_LV2__)
+	// Playstation 3 Cell SDK
+#include <spu_printf.h>
+		
+#else
+	// posix system
+
+#define USE_PTHREADS    (1)
+
+#ifdef USE_LIBSPE2
+#include <stdio.h>		
+#define spu_printf printf	
+#define DWORD unsigned int
+		
+			// overlays a 64-bit integer, two 32-bit halves and a pointer
+			typedef union
+			{
+			  unsigned long long ull;
+			  unsigned int ui[2];
+			  void *p;
+			} addr64;
+		
+		
+#else
+
+#include <stdio.h>		
+#define spu_printf printf	
+
+#endif // USE_LIBSPE2
+	
+#endif	//__CELLOS_LV2__
+	
+#endif
+
+
+/* Included here because we need uint*_t typedefs */
+#include "PpuAddressSpace.h"
+
+#endif //TYPE_DEFINITIONS_H
+
+
+
--- a/src/BulletMultiThreaded/PosixThreadSupport.cpp
+++ b/src/BulletMultiThreaded/PosixThreadSupport.cpp
@@ -0,0 +1,211 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include <stdio.h>
+#include "PosixThreadSupport.h"
+
+
+#ifdef USE_PTHREADS
+
+#include "SpuCollisionTaskProcess.h"
+#include "SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.h"
+
+#define checkPThreadFunction(returnValue) \
+    if(0 != returnValue) { \
+        printf("PThread problem at line %i in file %s: %i\n", __LINE__, __FILE__, returnValue); \
+    }
+
+// The number of threads should be equal to the number of available cores
+// Todo: each worker should be linked to a single core (the Win32 version would use SetThreadIdealProcessor).
+
+// PosixThreadSupport runs the user task function on pthread worker threads.
+// The constructor creates the workers right away via startThreads().
+PosixThreadSupport::PosixThreadSupport(ThreadConstructionInfo& threadConstructionInfo)
+{
+	startThreads(threadConstructionInfo);
+}
+
+// The destructor shuts the workers down via stopSPU().
+PosixThreadSupport::~PosixThreadSupport()
+{
+	stopSPU();
+}
+
+// Posted by a worker each time it finishes a task (or exits); waitForResponse() waits on it.
+static sem_t mainSemaphore;
+
+// Entry point of every worker thread. 'argument' is the worker's btSpuStatus slot.
+// The worker sleeps on its startSemaphore, runs the user function on m_userPtr when
+// woken, and posts mainSemaphore when done. A wake-up with a null m_userPtr makes it exit.
+static void *threadFunction(void *argument) 
+{
+
+	PosixThreadSupport::btSpuStatus* status = (PosixThreadSupport::btSpuStatus*)argument;
+
+	
+	while (1)
+	{
+            // block until sendRequest() posts this worker's semaphore
+            checkPThreadFunction(sem_wait(&status->startSemaphore));
+		
+		void* userPtr = status->m_userPtr;
+
+		if (userPtr)
+		{
+			btAssert(status->m_status);
+			status->m_userThreadFunc(userPtr,status->m_lsMemory);
+			// 2 is the value waitForResponse() scans for
+			status->m_status = 2;
+			checkPThreadFunction(sem_post(&mainSemaphore));
+
+            // counter printed by stopSPU(); note it is bumped after the post above
+            status->threadUsed++;
+		} else {
+			//exit Thread
+			status->m_status = 3;
+			checkPThreadFunction(sem_post(&mainSemaphore));
+			printf("Thread with taskId %i exiting\n",status->m_taskId);
+			break;
+		}
+		
+	}
+
+	printf("Thread TERMINATED\n");
+	return 0;
+
+}
+
+///send messages to SPUs
+///Hands a task to one worker thread and wakes it.
+///uiCommand:   only CMD_GATHER_AND_PROCESS_PAIRLIST is implemented, anything else asserts.
+///uiArgument0: stored as the worker's m_userPtr and passed to the user thread function.
+///taskId:      index of the worker slot in m_activeSpuStatus.
+void PosixThreadSupport::sendRequest(uint32_t uiCommand, uint32_t uiArgument0, uint32_t taskId)
+{
+	///	gMidphaseSPU.sendRequest(CMD_GATHER_AND_PROCESS_PAIRLIST, (uint32_t) &taskDesc);
+	
+	///we should spawn an SPU task here, and in 'waitForResponse' it should wait for response of the (one of) the first tasks that finished
+
+	switch (uiCommand)
+	{
+	case 	CMD_GATHER_AND_PROCESS_PAIRLIST:
+		{
+			// Check the index before it is used. taskId is unsigned, so only the
+			// upper bound can be violated.
+			btAssert(taskId < m_activeSpuStatus.size());
+			btSpuStatus&	spuStatus = m_activeSpuStatus[taskId];
+
+			spuStatus.m_commandId = uiCommand;
+			spuStatus.m_status = 1;
+			// NOTE(review): a 32-bit argument cannot carry a full pointer on 64-bit targets - confirm callers.
+			spuStatus.m_userPtr = (void*)uiArgument0;
+
+			// fire event to start new task
+			checkPThreadFunction(sem_post(&spuStatus.startSemaphore));
+			break;
+		}
+	default:
+		{
+			///not implemented
+			btAssert(0);
+			break;
+		}
+	}
+}
+
+
+///check for messages from SPUs
+///Blocks until a worker posts mainSemaphore, then reports the first worker whose
+///status is 2 (finished). On return *puiArgument0 holds that worker's task id and
+///*puiArgument1 its status, which has just been reset to 0.
+///If no worker is marked finished, the outputs are left untouched.
+void PosixThreadSupport::waitForResponse(unsigned int *puiArgument0, unsigned int *puiArgument1)
+{
+	///We should wait for (one of) the first tasks to finish (or other SPU messages), and report its response
+	
+	///A possible response can be 'yes, SPU handled it', or 'no, please do a PPU fallback'
+
+	btAssert(m_activeSpuStatus.size());
+
+	// wait for any of the threads to finish
+	checkPThreadFunction(sem_wait(&mainSemaphore));
+
+	// get at least one thread which has finished
+	// (signed index: -1 means "none found", which an unsigned index cannot express)
+	int last = -1;
+
+	for(size_t t=0; t < m_activeSpuStatus.size(); ++t) {
+		if(2 == m_activeSpuStatus[t].m_status) {
+			last = (int)t;
+			break;
+		}
+	}
+
+	// need to find an active spu; checked before the array is indexed
+	btAssert(last >= 0);
+	if (last < 0)
+	{
+		return;
+	}
+
+	btSpuStatus& spuStatus = m_activeSpuStatus[last];
+
+	btAssert(spuStatus.m_status > 1);
+	spuStatus.m_status = 0;
+
+	*puiArgument0 = spuStatus.m_taskId;
+	*puiArgument1 = spuStatus.m_status;
+}
+
+
+
+///Creates m_numThreads worker threads, one btSpuStatus slot each.
+///Every slot is completely filled in before its thread is created, so the new
+///thread never sees a half-initialised slot.
+///NOTE(review): m_threadStackSize of the construction info is not used here.
+void PosixThreadSupport::startThreads(ThreadConstructionInfo& threadConstructionInfo)
+{
+	printf("%s creating %i threads.\n", __FUNCTION__, threadConstructionInfo.m_numThreads);
+	m_activeSpuStatus.resize(threadConstructionInfo.m_numThreads);
+
+	checkPThreadFunction(sem_init(&mainSemaphore, 0, 0));
+
+	for (int i=0;i < threadConstructionInfo.m_numThreads;i++)
+	{
+		printf("starting thread %d\n",i);
+
+		btSpuStatus&	spuStatus = m_activeSpuStatus[i];
+
+		spuStatus.m_userPtr=0;
+
+		spuStatus.m_taskId = i;
+		spuStatus.m_commandId = 0;
+		spuStatus.m_status = 0;
+		spuStatus.m_lsMemory = threadConstructionInfo.m_lsMemoryFunc();
+		spuStatus.m_userThreadFunc = threadConstructionInfo.m_userThreadFunc;
+		spuStatus.threadUsed = 0;
+
+		// the semaphore has to exist before the thread that waits on it is started
+		checkPThreadFunction(sem_init(&spuStatus.startSemaphore, 0, 0));
+		checkPThreadFunction(pthread_create(&spuStatus.thread, NULL, &threadFunction, (void*)&spuStatus));
+
+		printf("started thread %d \n",i);
+	}
+}
+
+// No-op in this implementation: the worker threads are created by startThreads().
+void PosixThreadSupport::startSPU()
+{
+}
+
+
+///tell the task scheduler we are done with the SPU tasks
+///Prints the usage counter of every worker, cancels and joins its thread, and
+///only then destroys its semaphore; finally destroys mainSemaphore and clears the slots.
+void PosixThreadSupport::stopSPU()
+{
+	for(size_t t=0; t < m_activeSpuStatus.size(); ++t) {
+		btSpuStatus&	spuStatus = m_activeSpuStatus[t];
+		// cast/specifier match the argument types (size_t index, unsigned long counter)
+		printf("%s: Thread %i used: %lu\n", __FUNCTION__, (int)t, spuStatus.threadUsed);
+
+		// Stop the thread before touching its semaphore: destroying a semaphore
+		// a thread is still blocked on is undefined. sem_wait is a cancellation
+		// point, so an idle worker ends here and the join reclaims it.
+		checkPThreadFunction(pthread_cancel(spuStatus.thread));
+		checkPThreadFunction(pthread_join(spuStatus.thread, NULL));
+		checkPThreadFunction(sem_destroy(&spuStatus.startSemaphore));
+	}
+	checkPThreadFunction(sem_destroy(&mainSemaphore));
+
+	m_activeSpuStatus.clear();
+}
+
+#endif // USE_PTHREADS
+
--- a/src/BulletMultiThreaded/PosixThreadSupport.h
+++ b/src/BulletMultiThreaded/PosixThreadSupport.h
@@ -0,0 +1,118 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#include "LinearMath/btScalar.h"
+#include "PlatformDefinitions.h"
+
+#ifdef USE_PTHREADS  //platform specific defines are defined in PlatformDefinitions.h
+#include <pthread.h>
+#include <semaphore.h>
+
+#ifndef POSIX_THREAD_SUPPORT_H
+#define POSIX_THREAD_SUPPORT_H
+
+#include "LinearMath/btAlignedObjectArray.h"
+
+#include "btThreadSupportInterface.h"
+
+
+// Task function executed by a worker: receives the task pointer and the worker's local-store memory.
+typedef void (*PosixThreadFunc)(void* userPtr,void* lsMemory);
+// Called once per worker by startThreads() to obtain that worker's m_lsMemory.
+typedef void* (*PosixlsMemorySetupFunc)();
+
+// PosixThreadSupport implements btThreadSupportInterface on top of pthreads and POSIX semaphores.
+class PosixThreadSupport : public btThreadSupportInterface 
+{
+public:
+    // NOTE(review): not referenced by the visible code; m_status below is assigned
+    // plain integers 0..3 in PosixThreadSupport.cpp instead.
+    typedef enum sStatus {
+        STATUS_BUSY,
+        STATUS_READY,
+        STATUS_FINISHED
+    } Status;
+
+	// placeholder, until libspe2 support is there
+	// Per-worker bookkeeping; one entry per thread in m_activeSpuStatus.
+	struct	btSpuStatus
+	{
+		uint32_t	m_taskId;
+		uint32_t	m_commandId;
+		uint32_t	m_status;
+
+		PosixThreadFunc	m_userThreadFunc;
+		void*	m_userPtr; //for taskDesc etc
+		void*	m_lsMemory; //initialized using PosixLocalStoreMemorySetupFunc
+
+                pthread_t thread;
+                sem_t startSemaphore; // posted by sendRequest() to wake the worker
+
+        unsigned long threadUsed; // number of tasks this worker has run
+	};
+private:
+
+	btAlignedObjectArray<btSpuStatus>	m_activeSpuStatus;
+public:
+	///Setup and initialize SPU/CELL/Libspe2
+
+	
+
+	// Parameters for creating the worker threads.
+	struct	ThreadConstructionInfo
+	{
+		ThreadConstructionInfo(char* uniqueName,
+									PosixThreadFunc userThreadFunc,
+									PosixlsMemorySetupFunc	lsMemoryFunc,
+									int numThreads=1,
+									int threadStackSize=65535
+									)
+									:m_uniqueName(uniqueName),
+									m_userThreadFunc(userThreadFunc),
+									m_lsMemoryFunc(lsMemoryFunc),
+									m_numThreads(numThreads),
+									m_threadStackSize(threadStackSize)
+		{
+
+		}
+
+		char*					m_uniqueName;
+		PosixThreadFunc			m_userThreadFunc;
+		PosixlsMemorySetupFunc	m_lsMemoryFunc;
+		int						m_numThreads;
+		int						m_threadStackSize; // stored only; startThreads() in PosixThreadSupport.cpp does not read it
+
+	};
+
+	// Creates the worker threads immediately (calls startThreads).
+	PosixThreadSupport(ThreadConstructionInfo& threadConstructionInfo);
+
+///Shuts the worker threads down (calls stopSPU)
+	virtual	~PosixThreadSupport();
+
+	void	startThreads(ThreadConstructionInfo&	threadInfo);
+
+
+///send messages to SPUs
+///(the implementation uses uiArgument1 as the index of the worker to wake)
+	virtual	void sendRequest(uint32_t uiCommand, uint32_t uiArgument0, uint32_t uiArgument1);
+
+///check for messages from SPUs
+	virtual	void waitForResponse(unsigned int *puiArgument0, unsigned int *puiArgument1);
+
+///start the spus (can be called at the beginning of each frame, to make sure that the right SPU program is loaded)
+	virtual	void startSPU();
+
+///tell the task scheduler we are done with the SPU tasks
+	virtual	void stopSPU();
+
+};
+
+#endif // POSIX_THREAD_SUPPORT_H
+
+#endif // USE_PTHREADS
--- a/src/BulletMultiThreaded/PpuAddressSpace.h
+++ b/src/BulletMultiThreaded/PpuAddressSpace.h
@@ -0,0 +1,11 @@
+#ifndef __PPU_ADDRESS_SPACE_H
+#define __PPU_ADDRESS_SPACE_H
+
+// ppu_address_t is a 64-bit integer when USE_ADDR64 is defined and a 32-bit one otherwise.
+// This header relies on the includer to provide uint64_t / uint32_t.
+#ifdef USE_ADDR64
+typedef uint64_t ppu_address_t;
+#else
+typedef uint32_t ppu_address_t;
+#endif
+
+#endif
+
--- a/src/BulletMultiThreaded/SequentialThreadSupport.cpp
+++ b/src/BulletMultiThreaded/SequentialThreadSupport.cpp
@@ -0,0 +1,89 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "SequentialThreadSupport.h"
+
+
+#include "SpuCollisionTaskProcess.h"
+#include "SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.h"
+
+///Sets up the single task slot via startThreads(); no thread is created.
+SequentialThreadSupport::SequentialThreadSupport(SequentialThreadConstructionInfo& threadConstructionInfo)
+{
+	startThreads(threadConstructionInfo);
+}
+
+///Releases the task slot via stopSPU()
+SequentialThreadSupport::~SequentialThreadSupport()
+{
+	stopSPU();
+}
+
+#include <stdio.h>
+
+///send messages to SPUs
+///Executes the task right away on the calling thread, always using slot 0
+///(taskId is not consulted). Commands other than
+///CMD_GATHER_AND_PROCESS_PAIRLIST trigger the "Not implemented" assert.
+void SequentialThreadSupport::sendRequest(uint32_t uiCommand, uint32_t uiArgument0, uint32_t taskId)
+{
+	if (uiCommand == CMD_GATHER_AND_PROCESS_PAIRLIST)
+	{
+		btSpuStatus& onlySlot = m_activeSpuStatus[0];
+		onlySlot.m_userPtr = (void*)uiArgument0;
+		onlySlot.m_userThreadFunc(onlySlot.m_userPtr, onlySlot.m_lsMemory);
+		return;
+	}
+
+	///not implemented
+	btAssert(0 && "Not implemented");
+}
+
+///check for messages from SPUs
+///Never blocks: copies the task id and status of slot 0 into the two outputs.
+void SequentialThreadSupport::waitForResponse(unsigned int *puiArgument0, unsigned int *puiArgument1)
+{
+	btAssert(m_activeSpuStatus.size());
+
+	btSpuStatus* onlySlot = &m_activeSpuStatus[0];
+	*puiArgument0 = onlySlot->m_taskId;
+	*puiArgument1 = onlySlot->m_status;
+}
+
+///Prepares the single task slot: zeroes its bookkeeping fields, stores the user
+///task function and asks m_lsMemoryFunc for the local-store memory. No thread is started.
+void SequentialThreadSupport::startThreads(SequentialThreadConstructionInfo& threadConstructionInfo)
+{
+	m_activeSpuStatus.resize(1);
+	printf("STS: Not starting any threads\n");
+
+	btSpuStatus* slot = &m_activeSpuStatus[0];
+	slot->m_taskId = 0;
+	slot->m_commandId = 0;
+	slot->m_status = 0;
+	slot->m_userPtr = 0;
+	slot->m_userThreadFunc = threadConstructionInfo.m_userThreadFunc;
+	slot->m_lsMemory = threadConstructionInfo.m_lsMemoryFunc();
+
+	printf("STS: Created local store at %p for task %s\n", slot->m_lsMemory, threadConstructionInfo.m_uniqueName);
+}
+
+// No-op: there is nothing to start in the sequential implementation.
+void SequentialThreadSupport::startSPU()
+{
+}
+
+// Drops the task slot created by startThreads().
+void SequentialThreadSupport::stopSPU()
+{
+	m_activeSpuStatus.clear();
+}
+
--- a/src/BulletMultiThreaded/SequentialThreadSupport.h
+++ b/src/BulletMultiThreaded/SequentialThreadSupport.h
@@ -0,0 +1,84 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "LinearMath/btScalar.h"
+#include "PlatformDefinitions.h"
+
+
+#ifndef SEQUENTIAL_THREAD_SUPPORT_H
+#define SEQUENTIAL_THREAD_SUPPORT_H
+
+#include "LinearMath/btAlignedObjectArray.h"
+
+#include "btThreadSupportInterface.h"
+
+// Task function: receives the task pointer and the local-store memory of the slot.
+typedef void (*SequentialThreadFunc)(void* userPtr,void* lsMemory);
+// Called once by startThreads() to obtain the slot's m_lsMemory.
+typedef void* (*SequentiallsMemorySetupFunc)();
+
+
+
+///SequentialThreadSupport is a portable non-parallel implementation of the btThreadSupportInterface
+class SequentialThreadSupport : public btThreadSupportInterface 
+{
+public:
+	// Bookkeeping for the task slot; the implementation only ever uses entry 0.
+	struct	btSpuStatus
+	{
+		uint32_t	m_taskId;
+		uint32_t	m_commandId;
+		uint32_t	m_status;
+
+		SequentialThreadFunc	m_userThreadFunc;
+
+		void*	m_userPtr; //for taskDesc etc
+		void*	m_lsMemory; //initialized using SequentiallsMemorySetupFunc
+	};
+private:
+	btAlignedObjectArray<btSpuStatus>	m_activeSpuStatus;
+	// NOTE(review): not referenced by the visible implementation in SequentialThreadSupport.cpp.
+	btAlignedObjectArray<void*>			m_completeHandles;	
+public:
+	// Parameters for setting up the task slot.
+	struct	SequentialThreadConstructionInfo
+	{
+		SequentialThreadConstructionInfo (char* uniqueName,
+									SequentialThreadFunc userThreadFunc,
+									SequentiallsMemorySetupFunc	lsMemoryFunc
+									)
+									:m_uniqueName(uniqueName),
+									m_userThreadFunc(userThreadFunc),
+									m_lsMemoryFunc(lsMemoryFunc)
+		{
+
+		}
+
+		char*						m_uniqueName;
+		SequentialThreadFunc		m_userThreadFunc;
+		SequentiallsMemorySetupFunc	m_lsMemoryFunc;
+	};
+
+	SequentialThreadSupport(SequentialThreadConstructionInfo& threadConstructionInfo);
+	virtual	~SequentialThreadSupport();
+	void	startThreads(SequentialThreadConstructionInfo&	threadInfo);
+///send messages to SPUs
+///(the implementation runs the task synchronously inside this call)
+	virtual	void sendRequest(uint32_t uiCommand, uint32_t uiArgument0, uint32_t uiArgument1);
+///check for messages from SPUs
+	virtual	void waitForResponse(unsigned int *puiArgument0, unsigned int *puiArgument1);
+///start the spus (can be called at the beginning of each frame, to make sure that the right SPU program is loaded)
+	virtual	void startSPU();
+///tell the task scheduler we are done with the SPU tasks
+	virtual	void stopSPU();
+
+};
+
+#endif //SEQUENTIAL_THREAD_SUPPORT_H
+
--- a/src/BulletMultiThreaded/SpuBatchRaycaster.cpp
+++ b/src/BulletMultiThreaded/SpuBatchRaycaster.cpp
@@ -0,0 +1,151 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include <new>
+#include "BulletCollision/CollisionShapes/btCollisionShape.h"
+#include "LinearMath/btAlignedAllocator.h"
+#include "SpuBatchRaycaster.h"
+
+///Creates a raycaster with no collision objects registered and its own SpuRaycastTaskProcess
+///bound to threadInterface, allowing up to maxNumOutstandingTasks tasks.
+SpuBatchRaycaster::SpuBatchRaycaster (class	btThreadSupportInterface* threadInterface, int maxNumOutstandingTasks)
+	: castUponObjectWrappers(NULL),
+	numCastUponObjectWrappers(0),
+	m_spuRaycastTaskProcess(NULL),
+	m_threadInterface(threadInterface)
+{
+	// the task process is created last, once the thread interface member is set
+	m_spuRaycastTaskProcess = new SpuRaycastTaskProcess(m_threadInterface,maxNumOutstandingTasks); // FIXME non constant
+}
+
+///Releases everything this raycaster owns: the task process created in the constructor
+///and the wrapper array allocated by setCollisionObjects / setCollisionObjectsSkipPE.
+SpuBatchRaycaster::~SpuBatchRaycaster ()
+{
+	// the task process is allocated with new in the constructor and was never released
+	delete m_spuRaycastTaskProcess;
+	m_spuRaycastTaskProcess = NULL;
+
+	if (castUponObjectWrappers)
+	{
+		btAlignedFree (castUponObjectWrappers);
+		castUponObjectWrappers = NULL;
+	}
+}
+
+///Rebuilds the wrapper array so that it holds one SpuCollisionObjectWrapper for each of the
+///first numCastUponObjects entries of castUponObjects. Any previous array is freed first.
+void
+SpuBatchRaycaster::setCollisionObjects (btCollisionObjectArray& castUponObjects, int numCastUponObjects)
+{
+	// drop the wrappers left over from an earlier call
+	if (castUponObjectWrappers != NULL)
+	{
+		btAlignedFree (castUponObjectWrappers);
+		castUponObjectWrappers = NULL;
+	}
+
+	// 16-byte aligned storage, one wrapper per object
+	castUponObjectWrappers = (SpuCollisionObjectWrapper*)btAlignedAlloc (sizeof(SpuCollisionObjectWrapper) * numCastUponObjects,16);
+	numCastUponObjectWrappers = numCastUponObjects;
+
+	SpuCollisionObjectWrapper* wrapperOut = castUponObjectWrappers;
+	for (int objectIndex = 0; objectIndex < numCastUponObjectWrappers; ++objectIndex, ++wrapperOut)
+	{
+		*wrapperOut = SpuCollisionObjectWrapper(castUponObjects[objectIndex]);
+	}
+}
+
+///Same as setCollisionObjects, except that objects whose shape type is box, sphere or capsule
+///are left out of the wrapper array. Two passes: count the remaining shapes, then fill.
+void
+SpuBatchRaycaster::setCollisionObjectsSkipPE (btCollisionObjectArray& castUponObjects, int numCastUponObjects)
+{
+	if (castUponObjectWrappers != NULL)
+	{
+		btAlignedFree (castUponObjectWrappers);
+		castUponObjectWrappers = NULL;
+	}
+
+	// pass 1: count the shapes that are kept
+	int numNonPEShapes = 0;
+	for (int objectIndex = 0; objectIndex < numCastUponObjects; ++objectIndex)
+	{
+		switch (castUponObjects[objectIndex]->getCollisionShape()->getShapeType ())
+		{
+		case BOX_SHAPE_PROXYTYPE:
+		case SPHERE_SHAPE_PROXYTYPE:
+		case CAPSULE_SHAPE_PROXYTYPE:
+			// skipped shape types
+			break;
+		default:
+			numNonPEShapes++;
+			break;
+		}
+	}
+
+	castUponObjectWrappers = (SpuCollisionObjectWrapper*)btAlignedAlloc (sizeof(SpuCollisionObjectWrapper) * numNonPEShapes,16);
+	numCastUponObjectWrappers = numNonPEShapes;
+
+	// pass 2: wrap the kept objects, packed contiguously
+	int writeIndex = 0;
+	for (int objectIndex = 0; objectIndex < numCastUponObjects; ++objectIndex)
+	{
+		switch (castUponObjects[objectIndex]->getCollisionShape()->getShapeType ())
+		{
+		case BOX_SHAPE_PROXYTYPE:
+		case SPHERE_SHAPE_PROXYTYPE:
+		case CAPSULE_SHAPE_PROXYTYPE:
+			break;
+		default:
+			castUponObjectWrappers[writeIndex] = SpuCollisionObjectWrapper(castUponObjects[objectIndex]);
+			writeIndex++;
+			break;
+		}
+	}
+
+//	printf("Number of shapes bullet is casting against: %d\n", numNonPEShapes);
+	btAssert (writeIndex == numNonPEShapes);
+}
+
+///Queues one ray. An output record (seeded with hitFraction and a +Y hit normal) and the
+///matching work unit are appended to their arrays at the same index.
+void
+SpuBatchRaycaster::addRay (const btVector3& rayFrom, const btVector3& rayTo, const btScalar hitFraction)
+{
+	// result slot for this ray
+	SpuRaycastTaskWorkUnitOut result;
+	result.hitFraction = hitFraction;
+	result.hitNormal = btVector3(0.0, 1.0, 0.0);
+	rayBatchOutput.push_back (result);
+
+	// ray description; its output pointer is assigned later in performBatchRaycast
+	SpuRaycastTaskWorkUnit ray;
+	ray.rayFrom = rayFrom;
+	ray.rayTo = rayTo;
+	rayBatch.push_back (ray);
+}
+
+///Removes all queued rays together with their output records.
+void
+SpuBatchRaycaster::clearRays ()
+{
+	rayBatchOutput.clear ();
+	rayBatch.clear ();
+}
+
+///Submits every queued ray to the task process and flushes it.
+///Results are written into rayBatchOutput, read back through operator[].
+void
+SpuBatchRaycaster::performBatchRaycast ()
+{
+	m_spuRaycastTaskProcess->initialize2 (castUponObjectWrappers, numCastUponObjectWrappers);
+
+	const int numQueuedRays = rayBatch.size();
+	for (int rayIndex = 0; rayIndex < numQueuedRays; ++rayIndex)
+	{
+		SpuRaycastTaskWorkUnit& ray = rayBatch[rayIndex];
+		ray.output = &rayBatchOutput[rayIndex]; // assign output memory location
+		m_spuRaycastTaskProcess->addWorkToTask(ray);
+	}
+
+	m_spuRaycastTaskProcess->flush2 ();
+}
+
+///Returns the output record of the i-th ray added with addRay (no bounds check is done here).
+const SpuRaycastTaskWorkUnitOut&
+SpuBatchRaycaster::operator [] (int i) const
+{
+	return rayBatchOutput[i];
+}
+
+///Returns the number of output records, i.e. the number of rays added since the last clearRays.
+int
+SpuBatchRaycaster::getNumRays () const
+{
+	return rayBatchOutput.size();
+}
--- a/src/BulletMultiThreaded/SpuBatchRaycaster.h
+++ b/src/BulletMultiThreaded/SpuBatchRaycaster.h
@@ -0,0 +1,49 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef SPU_BATCH_RAYCASTER_H
+#define SPU_BATCH_RAYCASTER_H
+
+#include "LinearMath/btAlignedObjectArray.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObject.h"
+#include "SpuRaycastTaskProcess.h"
+#include "SpuRaycastTask/SpuRaycastTask.h"
+#include "SpuCollisionObjectWrapper.h"
+
+/* FIXME:
+ * Need to decide how callbacks are performed...
+ */
+///SpuBatchRaycaster collects rays (addRay), casts them in one batch against a set of
+///wrapped collision objects (performBatchRaycast) and exposes the per-ray results (operator[]).
+class SpuBatchRaycaster 
+{
+protected:
+	///aligned array of wrappers, rebuilt by setCollisionObjects / setCollisionObjectsSkipPE
+	SpuCollisionObjectWrapper* castUponObjectWrappers;
+	int numCastUponObjectWrappers;
+	///queued rays and their results; both arrays are kept at the same length
+	btAlignedObjectArray<SpuRaycastTaskWorkUnit> rayBatch;
+	btAlignedObjectArray<SpuRaycastTaskWorkUnitOut> rayBatchOutput;
+	///created in the constructor
+	SpuRaycastTaskProcess* m_spuRaycastTaskProcess;
+	class	btThreadSupportInterface*	m_threadInterface;
+public:
+	SpuBatchRaycaster (class btThreadSupportInterface* threadInterface, int maxNumOutstandingTasks);
+	~SpuBatchRaycaster ();
+	///wrap the first numCastUponObjects objects of the array
+	void setCollisionObjects (btCollisionObjectArray& castUponObjects, int numCastUponObjects);
+	///same, but skips objects with box, sphere or capsule shapes
+	void setCollisionObjectsSkipPE (btCollisionObjectArray& castUponObjects, int numCastUponObjects);
+	///queue a ray; hitFraction seeds the output record
+	void addRay (const btVector3& rayFrom, const btVector3& rayTo, const btScalar hitFraction = 1.0);
+	void clearRays ();
+	void performBatchRaycast ();
+	///result of the i-th queued ray
+	const SpuRaycastTaskWorkUnitOut& operator [] (int i) const;
+	int getNumRays () const;
+};
+
+#endif
--- a/src/BulletMultiThreaded/SpuCollisionObjectWrapper.cpp
+++ b/src/BulletMultiThreaded/SpuCollisionObjectWrapper.cpp
@@ -0,0 +1,48 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "SpuCollisionObjectWrapper.h"
+#include "BulletCollision/CollisionShapes/btCollisionShape.h"
+
+///Default constructor: intentionally empty, so all members are left uninitialized.
+SpuCollisionObjectWrapper::SpuCollisionObjectWrapper ()
+{
+}
+
+#ifndef __SPU__
+///Captures the object's address together with the shape type and margin of its collision shape.
+SpuCollisionObjectWrapper::SpuCollisionObjectWrapper (const btCollisionObject* collisionObject)
+{
+	const btCollisionShape* shape = collisionObject->getCollisionShape();
+
+	m_collisionObjectPtr = (ppu_address_t)collisionObject;
+	m_shapeType = shape->getShapeType ();
+	m_margin = shape->getMargin ();
+}
+#endif
+
+///Returns the shape type stored at construction.
+int
+SpuCollisionObjectWrapper::getShapeType () const
+{
+	return m_shapeType;
+}
+
+///Returns the collision shape margin stored at construction.
+float
+SpuCollisionObjectWrapper::getCollisionMargin () const
+{
+	return m_margin;
+}
+
+///Returns the address of the wrapped btCollisionObject as a ppu_address_t.
+ppu_address_t
+SpuCollisionObjectWrapper::getCollisionObjectPtr () const
+{
+	return m_collisionObjectPtr;
+}
--- a/src/BulletMultiThreaded/SpuCollisionObjectWrapper.h
+++ b/src/BulletMultiThreaded/SpuCollisionObjectWrapper.h
@@ -0,0 +1,35 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "PlatformDefinitions.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObject.h"
+
+ATTRIBUTE_ALIGNED16(class) SpuCollisionObjectWrapper
+{
+protected:
+	int m_shapeType;
+	float m_margin;
+	ppu_address_t m_collisionObjectPtr;
+
+public:
+	SpuCollisionObjectWrapper ();
+
+	SpuCollisionObjectWrapper (const btCollisionObject* collisionObject);
+
+	int           getShapeType () const;
+	float         getCollisionMargin () const;
+	ppu_address_t getCollisionObjectPtr () const;
+};
+
--- a/src/BulletMultiThreaded/SpuCollisionTaskProcess.cpp
+++ b/src/BulletMultiThreaded/SpuCollisionTaskProcess.cpp
@@ -0,0 +1,304 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+//#define DEBUG_SPU_TASK_SCHEDULING 1
+
+
+//class OptimizedBvhNode;
+
+#include "SpuCollisionTaskProcess.h"
+
+
+
+
+
+
+///Sets up bookkeeping for maxNumOutstandingTasks task slots (all idle) and starts the SPU side
+///through the thread interface. The work-unit buffers themselves are allocated in initialize2.
+SpuCollisionTaskProcess::SpuCollisionTaskProcess(class	btThreadSupportInterface*	threadInterface, unsigned int	maxNumOutstandingTasks)
+:m_threadInterface(threadInterface),
+m_maxNumOutstandingTasks(maxNumOutstandingTasks)
+{
+	m_workUnitTaskBuffers = (unsigned char *)0;
+	m_taskBusy.resize(m_maxNumOutstandingTasks);
+	m_spuGatherTaskDesc.resize(m_maxNumOutstandingTasks);
+
+	// unsigned index: avoids the signed/unsigned comparison against m_maxNumOutstandingTasks
+	for (unsigned int i = 0; i < m_maxNumOutstandingTasks; i++)
+	{
+		m_taskBusy[i] = false;
+	}
+	m_numBusyTasks = 0;
+	m_currentTask = 0;
+	m_currentPage = 0;
+	m_currentPageEntry = 0;
+	// give m_useEpa a defined value in case a task is issued before initialize2 sets it
+	m_useEpa = false;
+
+#ifdef DEBUG_SpuCollisionTaskProcess
+	m_initialized = false;
+#endif
+
+	m_threadInterface->startSPU();
+
+	//printf("sizeof vec_float4: %d\n", sizeof(vec_float4));
+	// sizeof yields size_t; cast so the argument matches the %d conversion
+	printf("sizeof SpuGatherAndProcessWorkUnitInput: %d\n", (int)sizeof(SpuGatherAndProcessWorkUnitInput));
+
+}
+
+///Frees the work-unit buffers (if they were allocated) and stops the SPU side.
+SpuCollisionTaskProcess::~SpuCollisionTaskProcess()
+{
+	unsigned char* const taskBuffers = m_workUnitTaskBuffers;
+	if (taskBuffers)
+	{
+		btAlignedFree(taskBuffers);
+		m_workUnitTaskBuffers = 0;
+	}
+
+	m_threadInterface->stopSPU();
+}
+
+
+
+///Resets the task bookkeeping for a new batch and records the useEpa flag.
+///The work-unit buffers are allocated on the first call and reused afterwards.
+void SpuCollisionTaskProcess::initialize2(bool useEpa)
+{
+
+#ifdef DEBUG_SPU_TASK_SCHEDULING
+	printf("SpuCollisionTaskProcess::initialize()\n");
+#endif //DEBUG_SPU_TASK_SCHEDULING
+	if (m_workUnitTaskBuffers == 0)
+	{
+		// one 128-byte aligned block covering every task slot
+		m_workUnitTaskBuffers = (unsigned char *)btAlignedAlloc(MIDPHASE_WORKUNIT_TASK_SIZE*m_maxNumOutstandingTasks, 128);
+	}
+
+	// every slot is idle again
+	for (int slot = 0; slot < m_maxNumOutstandingTasks; ++slot)
+	{
+		m_taskBusy[slot] = false;
+	}
+
+	m_useEpa = useEpa;
+	m_numBusyTasks = 0;
+	m_currentTask = 0;
+	m_currentPage = 0;
+	m_currentPageEntry = 0;
+
+#ifdef DEBUG_SpuCollisionTaskProcess
+	m_initialized = true;
+	assert(MIDPHASE_NUM_WORKUNITS_PER_TASK*sizeof(SpuGatherAndProcessWorkUnitInput) <= MIDPHASE_WORKUNIT_TASK_SIZE);
+#endif
+}
+
+
+///Marks the current task slot busy, fills in its task description and sends it to the thread
+///interface. If every slot is busy afterwards, blocks until one response arrives and frees
+///the slot named in that response.
+void SpuCollisionTaskProcess::issueTask2()
+{
+
+#ifdef DEBUG_SPU_TASK_SCHEDULING
+	printf("SpuCollisionTaskProcess::issueTask (m_currentTask= %d\n)", m_currentTask);
+#endif //DEBUG_SPU_TASK_SCHEDULING
+
+	m_taskBusy[m_currentTask] = true;
+	m_numBusyTasks++;
+
+
+	SpuGatherAndProcessPairsTaskDesc& taskDesc = m_spuGatherTaskDesc[m_currentTask];
+	taskDesc.m_useEpa = m_useEpa;
+
+	{
+		// send task description in event message
+		// no error checking here...
+		// but, currently, event queue can be no larger than NUM_WORKUNIT_TASKS.
+	
+		taskDesc.inPtr = reinterpret_cast<uint64_t>(MIDPHASE_TASK_PTR(m_currentTask));
+	
+		taskDesc.taskId = m_currentTask;
+		taskDesc.numPages = m_currentPage+1;
+		taskDesc.numOnLastPage = m_currentPageEntry;
+	}
+
+
+	// NOTE(review): the descriptor address is passed through a uint32_t argument, as the
+	// sendRequest signature requires; this only round-trips where pointers fit in 32 bits.
+	m_threadInterface->sendRequest(CMD_GATHER_AND_PROCESS_PAIRLIST, (uint32_t) &taskDesc,m_currentTask);
+
+	// if all tasks busy, wait for spu event to clear the task.
+	
+
+	if (m_numBusyTasks >= m_maxNumOutstandingTasks)
+	{
+		// start from an invalid id so a failed search is detectable (taskId is unsigned,
+		// so the previous "taskId>=0" check could never fail)
+		unsigned int taskId = (unsigned int)-1;
+		unsigned int outputSize = 0;
+
+		// first busy slot; the value is handed to waitForResponse, which writes back
+		// the id of the task that actually completed
+		for (unsigned int i=0;i<m_maxNumOutstandingTasks;i++)
+		{
+			if (m_taskBusy[i])
+			{
+				taskId = i;
+				break;
+			}
+		}
+
+		btAssert(taskId < m_maxNumOutstandingTasks);
+
+		m_threadInterface->waitForResponse(&taskId, &outputSize);
+
+//		printf("issueTask taskId %d completed, numBusy=%d\n",taskId,m_numBusyTasks);
+
+		//printf("PPU: after issue, received event: %u %d\n", taskId, outputSize);
+
+		//postProcess(taskId, outputSize);
+
+		m_taskBusy[taskId] = false;
+
+		m_numBusyTasks--;
+	}
+	
+}
+
+///Appends one work unit (pair array address plus start/end index) to the current task buffer.
+///When the current page is full the next page is used; when the last page is full the task is
+///issued first and a free task slot becomes the current one.
+void SpuCollisionTaskProcess::addWorkToTask(void* pairArrayPtr,int startIndex,int endIndex)
+{
+#ifdef DEBUG_SPU_TASK_SCHEDULING
+	printf("#");
+#endif //DEBUG_SPU_TASK_SCHEDULING
+	
+#ifdef DEBUG_SpuCollisionTaskProcess
+	assert(m_initialized);
+	assert(m_workUnitTaskBuffers);
+
+#endif
+
+	// batching is hard-wired on, so the "!batch" branch further down is never taken
+	bool batch = true;
+
+	if (batch)
+	{
+		// make room first: advance to the next page/task if the current page is full
+		if (m_currentPageEntry == MIDPHASE_NUM_WORKUNITS_PER_PAGE)
+		{
+			if (m_currentPage == MIDPHASE_NUM_WORKUNIT_PAGES-1)
+			{
+				// task buffer is full, issue current task.
+				// if all task buffers busy, this waits until SPU is done.
+				issueTask2();
+
+				// find new task buffer
+				for (unsigned int i = 0; i < m_maxNumOutstandingTasks; i++)
+				{
+					if (!m_taskBusy[i])
+					{
+						m_currentTask = i;
+						//init the task data
+
+						break;
+					}
+				}
+
+				m_currentPage = 0;
+			}
+			else
+			{
+				m_currentPage++;
+			}
+
+			m_currentPageEntry = 0;
+		}
+	}
+
+	{
+
+
+
+		// write the work unit into the slot addressed by (task, page, entry)
+		SpuGatherAndProcessWorkUnitInput &wuInput = 
+			*(reinterpret_cast<SpuGatherAndProcessWorkUnitInput*>
+			(MIDPHASE_ENTRY_PTR(m_currentTask, m_currentPage, m_currentPageEntry)));
+		
+		wuInput.m_pairArrayPtr = reinterpret_cast<uint64_t>(pairArrayPtr);
+		wuInput.m_startIndex = startIndex;
+		wuInput.m_endIndex = endIndex;
+
+		
+	
+		m_currentPageEntry++;
+
+		// non-batched mode: issue a task for every single work unit
+		if (!batch)
+		{
+			issueTask2();
+
+			// find new task buffer
+			for (unsigned int i = 0; i < m_maxNumOutstandingTasks; i++)
+			{
+				if (!m_taskBusy[i])
+				{
+					m_currentTask = i;
+					//init the task data
+
+					break;
+				}
+			}
+
+			m_currentPage = 0;
+			m_currentPageEntry =0;
+		}
+	}
+}
+
+
+///Issues the partially filled current task (if it holds any work) and then blocks until every
+///outstanding task has reported back through the thread interface.
+void 
+SpuCollisionTaskProcess::flush2()
+{
+#ifdef DEBUG_SPU_TASK_SCHEDULING
+	printf("\nSpuCollisionTaskProcess::flush()\n");
+#endif //DEBUG_SPU_TASK_SCHEDULING
+	
+	// if there's a partially filled task buffer, submit that task
+	if (m_currentPage > 0 || m_currentPageEntry > 0)
+	{
+		issueTask2();
+	}
+
+
+	// all tasks are issued, wait for all tasks to be complete
+	while(m_numBusyTasks > 0)
+	{
+		// Consolidating SPU code
+		// invalid id until a busy slot is found (taskId is unsigned, so the previous
+		// "taskId>=0" check could never fail)
+		unsigned int taskId = (unsigned int)-1;
+		unsigned int outputSize = 0;
+
+		for (unsigned int i=0;i<m_maxNumOutstandingTasks;i++)
+		{
+			if (m_taskBusy[i])
+			{
+				taskId = i;
+				break;
+			}
+		}
+
+		btAssert(taskId < m_maxNumOutstandingTasks);
+
+		{
+			// SPURS support.
+			// waitForResponse writes back the id of the task that completed
+			m_threadInterface->waitForResponse(&taskId, &outputSize);
+		}
+//		 printf("flush2 taskId %d completed, numBusy =%d \n",taskId,m_numBusyTasks);
+		//printf("PPU: flushing, received event: %u %d\n", taskId, outputSize);
+
+		//postProcess(taskId, outputSize);
+
+		m_taskBusy[taskId] = false;
+
+		m_numBusyTasks--;
+	}
+
+
+}
--- a/src/BulletMultiThreaded/SpuCollisionTaskProcess.h
+++ b/src/BulletMultiThreaded/SpuCollisionTaskProcess.h
@@ -0,0 +1,154 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef SPU_COLLISION_TASK_PROCESS_H
+#define SPU_COLLISION_TASK_PROCESS_H
+
+#include <assert.h>
+
+#include <LinearMath/btScalar.h>
+
+#include "PlatformDefinitions.h"
+#include "LinearMath/btAlignedObjectArray.h"
+#include "SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.h" // for definitions processCollisionTask and createCollisionLocalStoreMemory
+
+#include "btThreadSupportInterface.h"
+
+
+//#include "SPUAssert.h"
+#include <string.h>
+
+
+#include "BulletCollision/CollisionDispatch/btCollisionObject.h"
+#include "BulletCollision/CollisionShapes/btCollisionShape.h"
+#include "BulletCollision/CollisionShapes/btConvexShape.h"
+
+#include <LinearMath/btAlignedAllocator.h>
+
+#include <stdio.h>
+
+
+#define DEBUG_SpuCollisionTaskProcess 1
+
+
+#define CMD_GATHER_AND_PROCESS_PAIRLIST	1
+
+class btCollisionObject;
+class btPersistentManifold;
+class btDispatcher;
+
+
+/////Task Description for SPU collision detection
+//struct SpuGatherAndProcessPairsTaskDesc
+//{
+//	uint64_t	inPtr;//m_pairArrayPtr;
+//	//mutex variable
+//	uint32_t	m_someMutexVariableInMainMemory;
+//
+//	uint64_t	m_dispatcher;
+//
+//	uint32_t	numOnLastPage;
+//
+//	uint16_t numPages;
+//	uint16_t taskId;
+//
+//	struct	CollisionTask_LocalStoreMemory*	m_lsMemory; 
+//}
+//
+//#if  defined(__CELLOS_LV2__) || defined(USE_LIBSPE2)
+//__attribute__ ((aligned (16)))
+//#endif
+//;
+
+
+///SpuGatherAndProcessWorkUnitInput is one work unit written into a task buffer by
+///SpuCollisionTaskProcess::addWorkToTask, to be processed by the SPU.
+struct SpuGatherAndProcessWorkUnitInput
+{
+	///address of the pair array, stored as a 64-bit integer
+	uint64_t m_pairArrayPtr;
+	///index range within the pair array (NOTE(review): whether m_endIndex is inclusive is decided by the consumer, not shown here)
+	int		m_startIndex;
+	int		m_endIndex;
+};
+
+
+
+
+/// SpuCollisionTaskProcess handles SPU processing of collision pairs.
+/// Maintains a set of task buffers.
+/// When the task is full, the task is issued for SPUs to process.  Contact output goes into btPersistentManifold
+/// associated with each task.
+/// When PPU issues a task, it will look for completed task buffers
+/// PPU will do postprocessing, dependent on workunit output (not likely)
+class SpuCollisionTaskProcess
+{
+
+  // raw storage for all task buffers, allocated in initialize2 and addressed through the MIDPHASE_*_PTR macros
+  unsigned char  *m_workUnitTaskBuffers;
+
+
+	// track task buffers that are being used, and total busy tasks
+	btAlignedObjectArray<bool>	m_taskBusy;
+	// one task description per task slot, filled in by issueTask2
+	btAlignedObjectArray<SpuGatherAndProcessPairsTaskDesc>	m_spuGatherTaskDesc;
+
+	class	btThreadSupportInterface*	m_threadInterface;
+
+	unsigned int	m_maxNumOutstandingTasks;
+
+	unsigned int   m_numBusyTasks;
+
+	// the current task and the current entry to insert a new work unit
+	unsigned int   m_currentTask;
+	unsigned int   m_currentPage;
+	unsigned int   m_currentPageEntry;
+
+	// copied into each task description when a task is issued
+	bool m_useEpa;
+
+#ifdef DEBUG_SpuCollisionTaskProcess
+	bool m_initialized;
+#endif
+	// sends the current task to the thread interface; waits for a response when all slots are busy
+	void issueTask2();
+	//void postProcess(unsigned int taskId, int outputSize);
+
+public:
+	SpuCollisionTaskProcess(btThreadSupportInterface*	threadInterface, unsigned int maxNumOutstandingTasks);
+	
+	~SpuCollisionTaskProcess();
+	
+	///call initialize2 in the beginning of the frame, before addWorkToTask
+	void initialize2(bool useEpa = false);
+
+	///batch up additional work to a current task for SPU processing. When batch is full, it issues the task.
+	void addWorkToTask(void* pairArrayPtr,int startIndex,int endIndex);
+
+	///call flush to submit potential outstanding work to SPUs and wait for all involved SPUs to be finished
+	void flush2();
+};
+
+
+
+// Address helpers for the task buffers. They expand inside SpuCollisionTaskProcess member
+// functions and rely on the member m_workUnitTaskBuffers being in scope.
+// Every macro argument is parenthesized so that expression arguments (e.g. "a+1") expand correctly.
+#define MIDPHASE_TASK_PTR(task) (&m_workUnitTaskBuffers[0] + MIDPHASE_WORKUNIT_TASK_SIZE*(task))
+#define MIDPHASE_ENTRY_PTR(task,page,entry) (MIDPHASE_TASK_PTR(task) + MIDPHASE_WORKUNIT_PAGE_SIZE*(page) + sizeof(SpuGatherAndProcessWorkUnitInput)*(entry))
+// NOTE(review): the two macros below reference m_contactOutputBuffers / m_complexShapeBuffers and
+// size constants that are not declared in this header; they look unused here - confirm before relying on them.
+#define MIDPHASE_OUTPUT_PTR(task) (&m_contactOutputBuffers[0] + MIDPHASE_MAX_CONTACT_BUFFER_SIZE*(task))
+#define MIDPHASE_TREENODES_PTR(task) (&m_complexShapeBuffers[0] + MIDPHASE_COMPLEX_SHAPE_BUFFER_SIZE*(task))
+
+
+// size in bytes of one page of work units
+#define MIDPHASE_WORKUNIT_PAGE_SIZE (16)
+
+#define MIDPHASE_NUM_WORKUNIT_PAGES 1
+// size in bytes of one task buffer (all of its pages)
+#define MIDPHASE_WORKUNIT_TASK_SIZE (MIDPHASE_WORKUNIT_PAGE_SIZE*MIDPHASE_NUM_WORKUNIT_PAGES)
+#define MIDPHASE_NUM_WORKUNITS_PER_PAGE (MIDPHASE_WORKUNIT_PAGE_SIZE / sizeof(SpuGatherAndProcessWorkUnitInput))
+#define MIDPHASE_NUM_WORKUNITS_PER_TASK (MIDPHASE_NUM_WORKUNITS_PER_PAGE*MIDPHASE_NUM_WORKUNIT_PAGES)
+
+
+#endif // SPU_COLLISION_TASK_PROCESS_H
+
--- a/src/BulletMultiThreaded/SpuContactManifoldCollisionAlgorithm.cpp
+++ b/src/BulletMultiThreaded/SpuContactManifoldCollisionAlgorithm.cpp
@@ -0,0 +1,58 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "SpuContactManifoldCollisionAlgorithm.h"
+#include "BulletCollision/CollisionDispatch/btCollisionDispatcher.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObject.h"
+#include "BulletCollision/CollisionShapes/btCollisionShape.h"
+
+
+///Default constructor: no manifold is attached; shape types and margins are left uninitialized.
+SpuContactManifoldCollisionAlgorithm::SpuContactManifoldCollisionAlgorithm()
+:m_manifoldPtr(0)
+{
+	
+}
+
+
+///Not implemented for this algorithm: calling it trips btAssert(0) and does nothing else.
+void SpuContactManifoldCollisionAlgorithm::processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
+{
+	btAssert(0);
+}
+
+///Not implemented for this algorithm: trips btAssert(0) and returns 1.
+btScalar SpuContactManifoldCollisionAlgorithm::calculateTimeOfImpact(btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
+{
+	btAssert(0);
+	return 1.f;
+}
+
+#ifndef __SPU__
+///Obtains a new manifold for the body pair from the dispatcher and caches the shape type and
+///collision margin of each body's collision shape.
+SpuContactManifoldCollisionAlgorithm::SpuContactManifoldCollisionAlgorithm(const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* body0,btCollisionObject* body1)
+:btCollisionAlgorithm(ci)
+{
+	m_manifoldPtr = m_dispatcher->getNewManifold(body0,body1);
+
+	const btCollisionShape* shape0 = body0->getCollisionShape();
+	m_shapeType0 = shape0->getShapeType();
+	const btCollisionShape* shape1 = body1->getCollisionShape();
+	m_shapeType1 = shape1->getShapeType();
+
+	m_collisionMargin0 = shape0->getMargin();
+	m_collisionMargin1 = shape1->getMargin();
+}
+#endif //__SPU__
+
+
+///Hands the manifold, if one was obtained, back to the dispatcher.
+SpuContactManifoldCollisionAlgorithm::~SpuContactManifoldCollisionAlgorithm()
+{
+	if (m_manifoldPtr != 0)
+	{
+		m_dispatcher->releaseManifold(m_manifoldPtr);
+	}
+}
--- a/src/BulletMultiThreaded/SpuContactManifoldCollisionAlgorithm.h
+++ b/src/BulletMultiThreaded/SpuContactManifoldCollisionAlgorithm.h
@@ -0,0 +1,89 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef SPU_CONTACTMANIFOLD_COLLISION_ALGORITHM_H
+#define SPU_CONTACTMANIFOLD_COLLISION_ALGORITHM_H
+
+#include "BulletCollision/BroadphaseCollision/btCollisionAlgorithm.h"
+#include "BulletCollision/BroadphaseCollision/btBroadphaseProxy.h"
+#include "BulletCollision/CollisionDispatch/btCollisionCreateFunc.h"
+#include "BulletCollision/BroadphaseCollision/btDispatcher.h"
+
+class btPersistentManifold;
+
+/// SpuContactManifoldCollisionAlgorithm  provides contact manifold and should be processed on SPU.
+/// It caches the shape types and collision margins of both bodies; processCollision and
+/// calculateTimeOfImpact are not meant to be called on it (they assert).
+ATTRIBUTE_ALIGNED16(class) SpuContactManifoldCollisionAlgorithm : public btCollisionAlgorithm
+{
+
+	// manifold obtained from the dispatcher in the pair constructor, released in the destructor
+	btPersistentManifold*	m_manifoldPtr;
+	int		m_shapeType0;
+	int		m_shapeType1;
+	float	m_collisionMargin0;
+	float	m_collisionMargin1;
+
+	
+public:
+	
+	virtual void processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
+
+	virtual btScalar calculateTimeOfImpact(btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
+
+	SpuContactManifoldCollisionAlgorithm();
+
+	SpuContactManifoldCollisionAlgorithm(const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* body0,btCollisionObject* body1);
+
+	virtual ~SpuContactManifoldCollisionAlgorithm();
+
+	///appends the manifold to manifoldArray, if this algorithm has one
+	virtual	void	getAllContactManifolds(btManifoldArray&	manifoldArray)
+	{
+		if (m_manifoldPtr)
+			manifoldArray.push_back(m_manifoldPtr);
+	}
+
+	///may be 0 when the object was default-constructed
+	btPersistentManifold*	getContactManifoldPtr()
+	{
+		return m_manifoldPtr;
+	}
+
+	///shape type of body0's collision shape, as cached at construction
+	int		getShapeType0() const
+	{
+		return m_shapeType0;
+	}
+
+	///shape type of body1's collision shape, as cached at construction
+	int		getShapeType1() const
+	{
+		return m_shapeType1;
+	}
+	///margin of body0's collision shape, as cached at construction
+	float	getCollisionMargin0() const
+	{
+		return m_collisionMargin0;
+	}
+	///margin of body1's collision shape, as cached at construction
+	float	getCollisionMargin1() const
+	{
+		return m_collisionMargin1;
+	}
+
+	///factory: constructs the algorithm in memory obtained from the dispatcher
+	struct CreateFunc :public 	btCollisionAlgorithmCreateFunc
+	{
+		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, btCollisionObject* body0,btCollisionObject* body1)
+		{
+			// placement-new into dispatcher-owned storage
+			void* mem = ci.m_dispatcher1->allocateCollisionAlgorithm(sizeof(SpuContactManifoldCollisionAlgorithm));
+			return new(mem) SpuContactManifoldCollisionAlgorithm(ci,body0,body1);
+		}
+	};
+
+};
+
+#endif //SPU_CONTACTMANIFOLD_COLLISION_ALGORITHM_H
--- a/src/BulletMultiThreaded/SpuDoubleBuffer.h
+++ b/src/BulletMultiThreaded/SpuDoubleBuffer.h
@@ -0,0 +1,107 @@
+#ifndef DOUBLE_BUFFER_H
+#define DOUBLE_BUFFER_H
+
+#include "SpuFakeDma.h"
+#include <LinearMath/btScalar.h>
+
+
+///DoubleBuffer
+///Holds two arrays of `size` elements of T plus front/back pointers to them.
+///A DMA transfer can be started on the back buffer; swapBuffers waits for it and exchanges the two.
+template<class T, int size>
+class DoubleBuffer
+{
+#if defined(__SPU__) || defined(USE_LIBSPE2)
+	ATTRIBUTE_ALIGNED128( T m_buffer0[size] ) ;
+	ATTRIBUTE_ALIGNED128( T m_buffer1[size] ) ;
+#else
+	T m_buffer0[size];
+	T m_buffer1[size];
+#endif
+	
+	T *m_frontBuffer;
+	T *m_backBuffer;
+
+	// tag of the last transfer started on the back buffer; only read while m_dmaPending is true
+	unsigned int m_dmaTag;
+	bool m_dmaPending;
+public:
+	///true between a backBufferDmaGet/Put call and the next swapBuffers
+	bool	isPending() const { return m_dmaPending;}
+	DoubleBuffer();
+
+	///resets to: no transfer pending, front = first buffer, back = second buffer
+	void init ();
+
+	// dma get and put commands
+	void backBufferDmaGet(uint64_t ea, unsigned int numBytes, unsigned int tag);
+	void backBufferDmaPut(uint64_t ea, unsigned int numBytes, unsigned int tag);
+
+	// gets pointer to a buffer
+	T *getFront();
+	T *getBack();
+
+	// if back buffer dma was started, wait for it to complete
+	// then move back to front and vice versa
+	T *swapBuffers();
+};
+
+///Constructs the buffer pair in its initial state by delegating to init().
+template<class T, int size>
+DoubleBuffer<T,size>::DoubleBuffer()
+{
+	init ();
+}
+
+///Resets the state: first buffer in front, second buffer in back, no transfer pending.
+template<class T, int size>
+void DoubleBuffer<T,size>::init()
+{
+	m_frontBuffer = m_buffer0;
+	m_backBuffer = m_buffer1;
+	m_dmaPending = false;
+}
+
+///Starts a large DMA get of numBytes from ea into the back buffer and records the tag
+///so that swapBuffers can wait on it.
+template<class T, int size>
+void
+DoubleBuffer<T,size>::backBufferDmaGet(uint64_t ea, unsigned int numBytes, unsigned int tag)
+{
+	m_dmaTag = tag;
+	m_dmaPending = true;
+	cellDmaLargeGet(m_backBuffer, ea, numBytes, tag, 0, 0);
+}
+
+///Starts a large DMA put of numBytes from the back buffer to ea and records the tag
+///so that swapBuffers can wait on it.
+template<class T, int size>
+void
+DoubleBuffer<T,size>::backBufferDmaPut(uint64_t ea, unsigned int numBytes, unsigned int tag)
+{
+	m_dmaTag = tag;
+	m_dmaPending = true;
+	cellDmaLargePut(m_backBuffer, ea, numBytes, tag, 0, 0);
+}
+
+///Returns the current front buffer.
+template<class T, int size>
+T *
+DoubleBuffer<T,size>::getFront()
+{
+	return m_frontBuffer;
+}
+
+///Returns the current back buffer (the target of backBufferDmaGet / source of backBufferDmaPut).
+template<class T, int size>
+T *
+DoubleBuffer<T,size>::getBack()
+{
+	return m_backBuffer;
+}
+
+///Waits for a pending back-buffer transfer (if any), exchanges front and back,
+///and returns the new front buffer.
+template<class T, int size>
+T *
+DoubleBuffer<T,size>::swapBuffers()
+{
+	// the transfer into/out of the back buffer must finish before it becomes the front buffer
+	if (m_dmaPending)
+	{
+		cellDmaWaitTagStatusAll(1<<m_dmaTag);
+		m_dmaPending = false;
+	}
+
+	T* const newFront = m_backBuffer;
+	m_backBuffer = m_frontBuffer;
+	m_frontBuffer = newFront;
+	return newFront;
+}
+
+#endif
--- a/src/BulletMultiThreaded/SpuFakeDma.cpp
+++ b/src/BulletMultiThreaded/SpuFakeDma.cpp
@@ -0,0 +1,195 @@
+
+#include "SpuFakeDma.h"
+#include <LinearMath/btScalar.h> //for btAssert
+//Disabling memcpy sometimes helps debugging DMA
+
+#define USE_MEMCPY 1
+#ifdef USE_MEMCPY
+
+#endif
+
+
+///cellDmaLargeGetReadOnly returns a pointer through which 'size' bytes at main-memory address 'ea' can be read.
+///With __CELLOS_LV2__ or USE_LIBSPE2 the data is fetched into 'ls' with cellDmaLargeGet and 'ls' is returned
+///(the transfer is only started here, it is not waited for).
+///Otherwise no copy is made: 'ea' itself is returned as a pointer and 'ls' is left untouched.
+void*	cellDmaLargeGetReadOnly(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid)
+{
+
+#if defined (__CELLOS_LV2__) || defined (USE_LIBSPE2)
+	cellDmaLargeGet(ls,ea,size,tag,tid,rid);
+	return ls;
+#else
+	//convert through size_t (not uint32_t) so the upper half of a 64-bit host address is not cut off
+	return (void*)(size_t)ea;
+#endif
+}
+
+///cellDmaSmallGetReadOnly returns a pointer through which 'size' bytes at main-memory address 'ea' can be read.
+///With __SPU__ or USE_LIBSPE2 the data is fetched into 'ls' with mfc_get (tid and rid are passed as 0) and 'ls'
+///is returned; the transfer is only started here, it is not waited for.
+///Otherwise no copy is made: 'ea' itself is returned as a pointer and 'ls' is left untouched.
+void*	cellDmaSmallGetReadOnly(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid)
+{
+#if defined (__SPU__) || defined (USE_LIBSPE2)
+	mfc_get(ls,ea,size,tag,0,0);
+	return ls;
+#else
+	//convert through size_t (not uint32_t) so the upper half of a 64-bit host address is not cut off
+	return (void*)(size_t)ea;
+#endif
+}
+
+
+
+
+///cellDmaGetReadOnly returns a pointer through which 'size' bytes at main-memory address 'ea' can be read.
+///With __SPU__ or USE_LIBSPE2 the data is fetched into 'ls' with cellDmaGet and 'ls' is returned
+///(the transfer is only started here, it is not waited for).
+///Otherwise no copy is made: 'ea' itself is returned as a pointer and 'ls' is left untouched.
+void*	cellDmaGetReadOnly(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid)
+{
+#if defined (__SPU__) || defined (USE_LIBSPE2)
+	cellDmaGet(ls,ea,size,tag,tid,rid);
+	return ls;
+#else
+	//convert through size_t (not uint32_t) so the upper half of a 64-bit host address is not cut off
+	return (void*)(size_t)ea;
+#endif
+}
+
+
+///stallingUnalignedDmaSmallGet copies 'size' bytes (size must be < 32) from the possibly unaligned
+///main-memory address 'ea' to 'ls' and does not return before the data has arrived.
+///It should not be frequently used, only for small data.
+///The data is first fetched into an aligned staging buffer, at the same offset within a 16-byte line
+///as 'ea', and is then copied byte by byte to 'ls'. DMA_TAG(1) is used internally. Always returns 0.
+int stallingUnalignedDmaSmallGet(void *ls, uint64_t ea, uint32_t size)
+{
+	
+	btAssert(size<32);
+	
+	///the staging offset can be up to 15 and size up to 31, so up to 46 bytes of the buffer are touched:
+	///32 bytes would overflow, 48 (the next multiple of 16) is enough
+	ATTRIBUTE_ALIGNED16(char	tmpBuffer[48]);
+
+	char* localStore = (char*)ls;
+	uint32_t i;
+	
+
+	///make sure last 4 bits are the same, for cellDmaSmallGet
+	uint32_t last4BitsOffset = ea & 0x0f;
+	char* tmpTarget = tmpBuffer + last4BitsOffset;
+	
+#if defined (__SPU__) || defined (USE_LIBSPE2)
+	
+	int remainingSize = size;
+
+//#define FORCE_cellDmaUnalignedGet 1
+#ifdef FORCE_cellDmaUnalignedGet
+	cellDmaUnalignedGet(tmpTarget,ea,size,DMA_TAG(1),0,0);
+#else
+	char* remainingTmpTarget = tmpTarget;
+	uint64_t remainingEa = ea;
+
+	///split the request into transfers of 16, 8, 4, 2 or 1 bytes, largest first
+	while (remainingSize)
+	{
+		switch (remainingSize)
+		{
+		case 1:
+		case 2:
+		case 4:
+		case 8:
+		case 16:
+			{
+				///the rest fits in a single transfer
+				mfc_get(remainingTmpTarget,remainingEa,remainingSize,DMA_TAG(1),0,0);
+				remainingSize=0;
+				break;
+			}
+		default:
+			{
+				//spu_printf("unaligned DMA with non-natural size:%d\n",remainingSize);
+				///remainingSize is at least 3 here, so one of the branches below always applies
+				int actualSize = 0;
+
+				if (remainingSize > 16)
+					actualSize = 16;
+				else if (remainingSize >8)
+					actualSize=8;
+				else if (remainingSize >4)
+					actualSize=4;
+				else if (remainingSize >2)
+					actualSize=2;
+				mfc_get(remainingTmpTarget,remainingEa,actualSize,DMA_TAG(1),0,0);
+				remainingSize-=actualSize;
+				remainingTmpTarget+=actualSize;
+				remainingEa += actualSize;
+			}
+		}
+	}
+#endif//FORCE_cellDmaUnalignedGet
+
+#else
+	///no DMA hardware: 'ea' is used as an ordinary pointer and copied into the staging buffer
+	char* mainMem = (char*)ea;
+#ifdef USE_MEMCPY
+		memcpy(tmpTarget,mainMem,size);
+#else
+		for ( i=0;i<size;i++)
+		{
+			tmpTarget[i] = mainMem[i];
+		}
+#endif //USE_MEMCPY
+
+#endif
+
+	///this is the 'stalling' part: wait for everything issued on DMA_TAG(1)
+	cellDmaWaitTagStatusAll(DMA_MASK(1));
+
+	//this is slowish, perhaps memcpy on SPU is smarter?
+	for (i=0; btLikely( i<size );i++)
+	{
+		localStore[i] = tmpTarget[i];
+	}
+
+	return 0;
+}
+
+#if defined (__SPU__) || defined (USE_LIBSPE2)
+#else
+
+///cellDmaLargeGet replacement for platforms without DMA: copies 'size' bytes from the
+///address 'ea' to 'ls' right away. tag, tid and rid are ignored. Always returns 0.
+int	cellDmaLargeGet(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid)
+{
+	char* dst = (char*)ls;
+	char* src = (char*)ea;
+
+#ifndef USE_MEMCPY
+	//byte-wise copy, used when memcpy is disabled for debugging
+	for (uint32_t n=0;n!=size;++n)
+		dst[n] = src[n];
+#else
+	memcpy(dst,src,size);
+#endif
+	return 0;
+}
+
+///cellDmaGet replacement for platforms without DMA: copies 'size' bytes from the
+///address 'ea' to 'ls' right away. tag, tid and rid are ignored. Always returns 0.
+int	cellDmaGet(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid)
+{
+	char* dst = (char*)ls;
+	char* src = (char*)ea;
+
+#ifndef USE_MEMCPY
+	//byte-wise copy, used when memcpy is disabled for debugging
+	for (uint32_t n=0;n!=size;++n)
+		dst[n] = src[n];
+#else
+	memcpy(dst,src,size);
+#endif
+	return 0;
+}
+
+///cellDmaLargePut replacement for platforms without DMA: copies 'size' bytes from 'ls'
+///to the address 'ea' right away. tag, tid and rid are ignored. Always returns 0.
+int cellDmaLargePut(const void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid)
+{
+	const char* src = (const char*)ls;
+	char* dst = (char*)ea;
+
+#ifndef USE_MEMCPY
+	//byte-wise copy, used when memcpy is disabled for debugging
+	for (uint32_t n=0;n!=size;++n)
+		dst[n] = src[n];
+#else
+	memcpy(dst,src,size);
+#endif
+
+	return 0;
+}
+
+
+
+///cellDmaWaitTagStatusAll replacement for platforms without DMA: does nothing.
+///The copy functions above finish their memcpy/loop before returning, so there is nothing to wait for.
+void	cellDmaWaitTagStatusAll(int ignore)
+{
+
+}
+
+#endif
--- a/src/BulletMultiThreaded/SpuFakeDma.h
+++ b/src/BulletMultiThreaded/SpuFakeDma.h
@@ -0,0 +1,121 @@
+
+#ifndef FAKE_DMA_H
+#define FAKE_DMA_H
+
+
+#include "PlatformDefinitions.h"
+#include "LinearMath/btScalar.h"
+
+
+#ifdef __SPU__
+
+#ifndef USE_LIBSPE2
+
+#include <cell/dma.h>
+#include <stdint.h>
+
+// DMA_TAG maps a transfer index to the tag value (index + 1),
+// DMA_MASK builds the matching one-bit tag mask (1 << tag).
+#define DMA_TAG(xfer) (xfer + 1)
+#define DMA_MASK(xfer) (1 << DMA_TAG(xfer))
+
+#else // USE_LIBSPE2
+
+// same tag/mask mapping as in the branch without USE_LIBSPE2
+#define DMA_TAG(xfer) (xfer + 1)
+#define DMA_MASK(xfer) (1 << DMA_TAG(xfer))
+		
+#include <spu_mfcio.h>		
+		
+#define DEBUG_DMA		
+#ifdef DEBUG_DMA
+#define dUASSERT(a,b) if (!(a)) { printf(b);}
+#define uintsize ppu_address_t
+		
+// Checking variants of cellDmaLargeGet / cellDmaGet / cellDmaLargePut / cellDmaSmallGet (DEBUG_DMA).
+// Each macro first tests the request: LS and EA must share the same offset within 16 bytes, the size
+// must be 1, 2, 4, 8 or a multiple of 16 and below 16384, and neither address may be null. When a test
+// fails, the matching messages and a GET/PUT line with file, line, addresses and size are printed.
+// The mfc_get/mfc_put that follows the closing brace is issued in every case, also after a failed check.
+// NOTE(review): each macro expands to an 'if' block followed by a separate statement and its arguments
+// are not parenthesized, so only use them as a full statement inside braces and with simple arguments.
+#define cellDmaLargeGet(ls, ea, size, tag, tid, rid) if (  (((uintsize)ls%16) != ((uintsize)ea%16)) || ((((uintsize)ea%16) || ((uintsize)ls%16)) && (( ((uintsize)ls%16) != ((uintsize)size%16) ) || ( ((uintsize)ea%16) != ((uintsize)size%16) ) ) ) || ( ((uintsize)size%16) && ((uintsize)size!=1) && ((uintsize)size!=2) && ((uintsize)size!=4) && ((uintsize)size!=8) ) || (size >= 16384) || !(uintsize)ls || !(uintsize)ea) { \
+															dUASSERT( (((uintsize)ea % 16) == 0) || (size < 16), "XDR Address not aligned: "); \
+															dUASSERT( (((uintsize)ls % 16) == 0) || (size < 16), "LS Address not aligned: "); \
+															dUASSERT( ((((uintsize)ls % size) == 0) && (((uintsize)ea % size) == 0))  || (size > 16), "Not naturally aligned: "); \
+															dUASSERT((size == 1) || (size == 2) || (size == 4) || (size == 8) || ((size % 16) == 0), "size not a multiple of 16byte: "); \
+															dUASSERT(size < 16384, "size too big: "); \
+															dUASSERT( ((uintsize)ea%16)==((uintsize)ls%16), "wrong Quadword alignment of LS and EA: "); \
+	    													dUASSERT(ea != 0, "Nullpointer EA: "); dUASSERT(ls != 0, "Nullpointer LS: ");\
+															printf("GET %s:%d from: 0x%x, to: 0x%x - %d bytes\n", __FILE__, __LINE__, (unsigned int)ea,(unsigned int)ls,(unsigned int)size);\
+															} \
+															mfc_get(ls, ea, size, tag, tid, rid)
+#define cellDmaGet(ls, ea, size, tag, tid, rid) if (  (((uintsize)ls%16) != ((uintsize)ea%16)) || ((((uintsize)ea%16) || ((uintsize)ls%16)) && (( ((uintsize)ls%16) != ((uintsize)size%16) ) || ( ((uintsize)ea%16) != ((uintsize)size%16) ) ) ) || ( ((uintsize)size%16) && ((uintsize)size!=1) && ((uintsize)size!=2) && ((uintsize)size!=4) && ((uintsize)size!=8) ) || (size >= 16384) || !(uintsize)ls || !(uintsize)ea) { \
+														dUASSERT( (((uintsize)ea % 16) == 0) || (size < 16), "XDR Address not aligned: "); \
+														dUASSERT( (((uintsize)ls % 16) == 0) || (size < 16), "LS Address not aligned: "); \
+														dUASSERT( ((((uintsize)ls % size) == 0) && (((uintsize)ea % size) == 0))  || (size > 16), "Not naturally aligned: "); \
+														dUASSERT((size == 1) || (size == 2) || (size == 4) || (size == 8) || ((size % 16) == 0), "size not a multiple of 16byte: "); \
+    													dUASSERT(size < 16384, "size too big: "); \
+														dUASSERT( ((uintsize)ea%16)==((uintsize)ls%16), "wrong Quadword alignment of LS and EA: "); \
+    													dUASSERT(ea != 0, "Nullpointer EA: "); dUASSERT(ls != 0, "Nullpointer LS: ");\
+    													printf("GET %s:%d from: 0x%x, to: 0x%x - %d bytes\n", __FILE__, __LINE__, (unsigned int)ea,(unsigned int)ls,(unsigned int)size);\
+														} \
+														mfc_get(ls, ea, size, tag, tid, rid)
+#define cellDmaLargePut(ls, ea, size, tag, tid, rid) if (  (((uintsize)ls%16) != ((uintsize)ea%16)) || ((((uintsize)ea%16) || ((uintsize)ls%16)) && (( ((uintsize)ls%16) != ((uintsize)size%16) ) || ( ((uintsize)ea%16) != ((uintsize)size%16) ) ) ) || ( ((uintsize)size%16) && ((uintsize)size!=1) && ((uintsize)size!=2) && ((uintsize)size!=4) && ((uintsize)size!=8) ) || (size >= 16384) || !(uintsize)ls || !(uintsize)ea) { \
+															dUASSERT( (((uintsize)ea % 16) == 0) || (size < 16), "XDR Address not aligned: "); \
+															dUASSERT( (((uintsize)ls % 16) == 0) || (size < 16), "LS Address not aligned: "); \
+															dUASSERT( ((((uintsize)ls % size) == 0) && (((uintsize)ea % size) == 0))  || (size > 16), "Not naturally aligned: "); \
+															dUASSERT((size == 1) || (size == 2) || (size == 4) || (size == 8) || ((size % 16) == 0), "size not a multiple of 16byte: "); \
+        													dUASSERT(size < 16384, "size too big: "); \
+															dUASSERT( ((uintsize)ea%16)==((uintsize)ls%16), "wrong Quadword alignment of LS and EA: "); \
+        													dUASSERT(ea != 0, "Nullpointer EA: "); dUASSERT(ls != 0, "Nullpointer LS: ");\
+    														printf("PUT %s:%d from: 0x%x, to: 0x%x - %d bytes\n", __FILE__, __LINE__, (unsigned int)ls,(unsigned int)ea,(unsigned int)size); \
+															} \
+															mfc_put(ls, ea, size, tag, tid, rid)
+#define cellDmaSmallGet(ls, ea, size, tag, tid, rid) if (  (((uintsize)ls%16) != ((uintsize)ea%16)) || ((((uintsize)ea%16) || ((uintsize)ls%16)) && (( ((uintsize)ls%16) != ((uintsize)size%16) ) || ( ((uintsize)ea%16) != ((uintsize)size%16) ) ) ) || ( ((uintsize)size%16) && ((uintsize)size!=1) && ((uintsize)size!=2) && ((uintsize)size!=4) && ((uintsize)size!=8) ) || (size >= 16384) || !(uintsize)ls || !(uintsize)ea) { \
+																dUASSERT( (((uintsize)ea % 16) == 0) || (size < 16), "XDR Address not aligned: "); \
+																dUASSERT( (((uintsize)ls % 16) == 0) || (size < 16), "LS Address not aligned: "); \
+																dUASSERT( ((((uintsize)ls % size) == 0) && (((uintsize)ea % size) == 0))  || (size > 16), "Not naturally aligned: "); \
+    															dUASSERT((size == 1) || (size == 2) || (size == 4) || (size == 8) || ((size % 16) == 0), "size not a multiple of 16byte: "); \
+    															dUASSERT(size < 16384, "size too big: "); \
+    															dUASSERT( ((uintsize)ea%16)==((uintsize)ls%16), "wrong Quadword alignment of LS and EA: "); \
+    	    													dUASSERT(ea != 0, "Nullpointer EA: "); dUASSERT(ls != 0, "Nullpointer LS: ");\
+    															printf("GET %s:%d from: 0x%x, to: 0x%x - %d bytes\n", __FILE__, __LINE__, (unsigned int)ea,(unsigned int)ls,(unsigned int)size);\
+																} \
+																mfc_get(ls, ea, size, tag, tid, rid)
+// Selects the tag mask and then waits until all transfers of the selected tag groups are done.
+// Written as one comma expression so that the macro stays a single statement: with the former
+// "a ; b" form an unbraced 'if (x) cellDmaWaitTagStatusAll(m);' guarded only the mask write.
+#define cellDmaWaitTagStatusAll(ignore) (mfc_write_tag_mask(ignore), mfc_read_tag_status_all())
+
+#else
+// Plain libspe2 mappings (no DEBUG_DMA checks): every cellDma get/put forwards directly to mfc_get/mfc_put.
+#define cellDmaLargeGet(ls, ea, size, tag, tid, rid) mfc_get(ls, ea, size, tag, tid, rid)
+#define cellDmaGet(ls, ea, size, tag, tid, rid) mfc_get(ls, ea, size, tag, tid, rid)
+#define cellDmaLargePut(ls, ea, size, tag, tid, rid) mfc_put(ls, ea, size, tag, tid, rid)
+#define cellDmaSmallGet(ls, ea, size, tag, tid, rid) mfc_get(ls, ea, size, tag, tid, rid)
+// Selects the tag mask and then waits until all transfers of the selected tag groups are done.
+// Written as one comma expression so that the macro stays a single statement: with the former
+// "a ; b" form an unbraced 'if (x) cellDmaWaitTagStatusAll(m);' guarded only the mask write.
+#define cellDmaWaitTagStatusAll(ignore) (mfc_write_tag_mask(ignore), mfc_read_tag_status_all())
+#endif // DEBUG_DMA
+
+		
+		
+		
+		
+		
+		
+		
+#endif // USE_LIBSPE2
+#else // !__SPU__
+//Simulate DMA using memcpy or direct access on non-CELL platforms that don't have DMAs and SPUs (Win32, Mac, Linux etc)
+//Potential to add networked simulation using this interface
+
+// Without SPUs tags and masks carry no meaning: both macros pass their argument through unchanged.
+#define DMA_TAG(a) (a)
+#define DMA_MASK(a) (a)
+
+		/// cellDmaLargeGet Win32 replacements for Cell DMA to allow simulating most of the SPU code (just memcpy)
+		int	cellDmaLargeGet(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid);
+		/// cellDmaGet Win32 replacements for Cell DMA to allow simulating most of the SPU code (just memcpy)
+		int	cellDmaGet(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid);
+		/// cellDmaLargePut Win32 replacements for Cell DMA to allow simulating most of the SPU code (just memcpy)
+		int cellDmaLargePut(const void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid);
+		/// cellDmaWaitTagStatusAll Win32 replacements for Cell DMA to allow simulating most of the SPU code (just memcpy)
+		void	cellDmaWaitTagStatusAll(int ignore);
+
+
+#endif // !__SPU__
+
+///stallingUnalignedDmaSmallGet internally uses DMA_TAG(1)
+///copies 'size' bytes from the (possibly unaligned) address 'ea' to 'ls' and waits for completion; returns 0
+int	stallingUnalignedDmaSmallGet(void *ls, uint64_t ea, uint32_t size);
+
+
+///The ...ReadOnly variants return the pointer the caller has to read the data from:
+///'ls' when a DMA into 'ls' was started, otherwise 'ea' itself converted to a pointer (no copy is made).
+void*	cellDmaLargeGetReadOnly(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid);
+void*	cellDmaGetReadOnly(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid);
+void*	cellDmaSmallGetReadOnly(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid);
+
+
+#endif //FAKE_DMA_H
--- a/src/BulletMultiThreaded/SpuGatheringCollisionDispatcher.cpp
+++ b/src/BulletMultiThreaded/SpuGatheringCollisionDispatcher.cpp
@@ -0,0 +1,216 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "SpuGatheringCollisionDispatcher.h"
+#include "SpuCollisionTaskProcess.h"
+
+
+#include "BulletCollision/BroadphaseCollision/btOverlappingPairCache.h"
+#include "BulletCollision/CollisionDispatch/btEmptyCollisionAlgorithm.h"
+#include "SpuContactManifoldCollisionAlgorithm.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObject.h"
+#include "BulletCollision/CollisionShapes/btCollisionShape.h"
+
+
+
+
+///Stores the thread interface and the maximum number of outstanding tasks and forwards the collision
+///configuration to btCollisionDispatcher. The SPU task process is not created here:
+///m_spuCollisionTaskProcess starts out as 0 and is created by dispatchAllCollisionPairs when needed.
+SpuGatheringCollisionDispatcher::SpuGatheringCollisionDispatcher(class	btThreadSupportInterface*	threadInterface, unsigned int	maxNumOutstandingTasks,btCollisionConfiguration* collisionConfiguration)
+:btCollisionDispatcher(collisionConfiguration),
+m_spuCollisionTaskProcess(0),
+m_threadInterface(threadInterface),
+m_maxNumOutstandingTasks(maxNumOutstandingTasks)
+{
+	
+}
+
+
+///returns true when proxyType is one of the shape types that may be dispatched to the SPU
+static bool	spuSupportsShapeType(int proxyType)
+{
+	//CONE_SHAPE_PROXYTYPE is deliberately not part of this list
+	static const int supportedTypes[] =
+	{
+		BOX_SHAPE_PROXYTYPE,
+		TRIANGLE_SHAPE_PROXYTYPE,
+		SPHERE_SHAPE_PROXYTYPE,
+		CAPSULE_SHAPE_PROXYTYPE,
+		CYLINDER_SHAPE_PROXYTYPE,
+		TRIANGLE_MESH_SHAPE_PROXYTYPE,
+		CONVEX_HULL_SHAPE_PROXYTYPE,
+		COMPOUND_SHAPE_PROXYTYPE
+	};
+	const int numSupportedTypes = sizeof(supportedTypes)/sizeof(supportedTypes[0]);
+
+	for (int t=0;t<numSupportedTypes;t++)
+	{
+		if (supportedTypes[t] == proxyType)
+			return true;
+	}
+	return false;
+}
+
+///a pair can be dispatched to the SPU only when the shape types of both objects are supported
+bool	SpuGatheringCollisionDispatcher::supportsDispatchPairOnSpu(int proxyType0,int proxyType1)
+{
+	return spuSupportsShapeType(proxyType0) && spuSupportsShapeType(proxyType1);
+}
+
+
+
+///releases the SPU task process, if dispatchAllCollisionPairs ever created one
+SpuGatheringCollisionDispatcher::~SpuGatheringCollisionDispatcher()
+{
+	//deleting a null pointer is a no-op, so no explicit check is needed
+	delete m_spuCollisionTaskProcess;
+}
+
+#include "stdio.h"
+
+
+
+///btSpuCollisionPairCallback is invoked for every overlapping pair and prepares the pair for dispatching:
+///it tags the pair through m_userInfo and, when the pair has no algorithm yet, creates one.
+///m_userInfo values written here: 1 = pair was visited, 2 = pair uses the SPU contact manifold algorithm,
+///3 = pair uses the regular algorithm returned by findAlgorithm (PPU fallback).
+class btSpuCollisionPairCallback : public btOverlapCallback
+{
+	const btDispatcherInfo& m_dispatchInfo;
+	SpuGatheringCollisionDispatcher*	m_dispatcher;
+
+public:
+
+	btSpuCollisionPairCallback(const btDispatcherInfo& dispatchInfo, SpuGatheringCollisionDispatcher*	dispatcher)
+	:m_dispatchInfo(dispatchInfo),
+	m_dispatcher(dispatcher)
+	{
+	}
+
+	///always returns false
+	virtual bool	processOverlap(btBroadphasePair& collisionPair)
+	{
+		//only support discrete collision detection for now, we could fallback on PPU/unoptimized version for TOI/CCD
+		btAssert(m_dispatchInfo.m_dispatchFunc == btDispatcherInfo::DISPATCH_DISCRETE);
+
+		///userInfo is used to determine if the SPU has to handle this case or not (skip PPU tasks)
+		if (!collisionPair.m_userInfo)
+			collisionPair.m_userInfo = (void*) 1;
+
+		//a pair that already owns an algorithm needs no further preparation
+		if (collisionPair.m_algorithm)
+			return false;
+
+		btCollisionObject* colObj0 = (btCollisionObject*)collisionPair.m_pProxy0->m_clientObject;
+		btCollisionObject* colObj1 = (btCollisionObject*)collisionPair.m_pProxy1->m_clientObject;
+
+		btCollisionAlgorithmConstructionInfo ci;
+		ci.m_dispatcher1 = m_dispatcher;
+		ci.m_manifold = 0;
+
+		if (!m_dispatcher->needsCollision(colObj0,colObj1))
+			return false;
+
+		const int	proxyType0 = colObj0->getCollisionShape()->getShapeType();
+		const int	proxyType1 = colObj1->getCollisionShape()->getShapeType();
+
+		if (!m_dispatcher->supportsDispatchPairOnSpu(proxyType0,proxyType1))
+		{
+			//unsupported shape combination: use the regular algorithm and mark the pair for the PPU
+			collisionPair.m_algorithm = m_dispatcher->findAlgorithm(colObj0,colObj1);
+			collisionPair.m_userInfo = (void*)3;
+			return false;
+		}
+
+		//supported on SPU: construct the SPU algorithm in memory provided by the dispatcher
+		int so = sizeof(SpuContactManifoldCollisionAlgorithm);
+		void* mem = m_dispatcher->allocateCollisionAlgorithm(so);
+		collisionPair.m_algorithm = new(mem) SpuContactManifoldCollisionAlgorithm(ci,colObj0,colObj1);
+		collisionPair.m_userInfo = (void*) 2;
+		return false;
+	}
+};
+
+///Dispatches all overlapping pairs. With dispatchInfo.m_enableSPU set, the pairs are prepared by
+///btSpuCollisionPairCallback, handed to the SPU task process in batches of SPU_BATCHSIZE_BROADPHASE_PAIRS,
+///the pairs marked for PPU fallback (m_userInfo == 3) are processed here, and finally the outstanding
+///SPU work is flushed. Without m_enableSPU the btCollisionDispatcher implementation is used.
+void	SpuGatheringCollisionDispatcher::dispatchAllCollisionPairs(btOverlappingPairCache* pairCache,const btDispatcherInfo& dispatchInfo, btDispatcher* dispatcher) 
+{
+	if (!dispatchInfo.m_enableSPU)
+	{
+		///PPU fallback
+		///!Need to make sure to clear all 'algorithms' when switching between SPU and PPU
+		btCollisionDispatcher::dispatchAllCollisionPairs(pairCache,dispatchInfo,dispatcher);
+		return;
+	}
+
+	//the task process is created on first use
+	if (!m_spuCollisionTaskProcess)
+		m_spuCollisionTaskProcess = new SpuCollisionTaskProcess(m_threadInterface,m_maxNumOutstandingTasks);
+
+	m_spuCollisionTaskProcess->initialize2(dispatchInfo.m_useEpa);
+
+	///modified version of btCollisionDispatcher::dispatchAllCollisionPairs:
+	{
+		btSpuCollisionPairCallback	collisionCallback(dispatchInfo,this);
+		pairCache->processAllOverlappingPairs(&collisionCallback,dispatcher);
+	}
+
+	const int numTotalPairs = pairCache->getNumOverlappingPairs();
+	btBroadphasePair* pairPtr = pairCache->getOverlappingPairArrayPtr();
+
+	//hand the pair array to the task process in batches
+	//Performance Hint: tweak SPU_BATCHSIZE_BROADPHASE_PAIRS during benchmarking
+	for (int batchStart=0; batchStart<numTotalPairs; )
+	{
+		int batchEnd = batchStart + SPU_BATCHSIZE_BROADPHASE_PAIRS;
+		if (batchEnd > numTotalPairs)
+			batchEnd = numTotalPairs;
+		m_spuCollisionTaskProcess->addWorkToTask(pairPtr,batchStart,batchEnd);
+		batchStart = batchEnd;
+	}
+
+	//handle PPU fallback pairs
+	for (int pairIndex=0; pairIndex<numTotalPairs; pairIndex++)
+	{
+		btBroadphasePair& collisionPair = pairPtr[pairIndex];
+
+		if (collisionPair.m_userInfo != (void*)3)
+			continue;
+		if (!collisionPair.m_algorithm)
+			continue;
+
+		btCollisionObject* colObj0 = (btCollisionObject*)collisionPair.m_pProxy0->m_clientObject;
+		btCollisionObject* colObj1 = (btCollisionObject*)collisionPair.m_pProxy1->m_clientObject;
+
+		if (!dispatcher->needsCollision(colObj0,colObj1))
+			continue;
+
+		btManifoldResult contactPointResult(colObj0,colObj1);
+
+		if (dispatchInfo.m_dispatchFunc == btDispatcherInfo::DISPATCH_DISCRETE)
+		{
+			//discrete collision detection query
+			collisionPair.m_algorithm->processCollision(colObj0,colObj1,dispatchInfo,&contactPointResult);
+		}
+		else
+		{
+			//continuous collision detection query, time of impact (toi): keep the smallest one
+			btScalar toi = collisionPair.m_algorithm->calculateTimeOfImpact(colObj0,colObj1,dispatchInfo,&contactPointResult);
+			if (dispatchInfo.m_timeOfImpact > toi)
+				dispatchInfo.m_timeOfImpact = toi;
+		}
+	}
+
+	//make sure all SPU work is done
+	m_spuCollisionTaskProcess->flush2();
+}
+
--- a/src/BulletMultiThreaded/SpuGatheringCollisionDispatcher.h
+++ b/src/BulletMultiThreaded/SpuGatheringCollisionDispatcher.h
@@ -0,0 +1,64 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+#ifndef SPU_GATHERING_COLLISION__DISPATCHER_H
+#define SPU_GATHERING_COLLISION__DISPATCHER_H
+
+#include "BulletCollision/CollisionDispatch/btCollisionDispatcher.h"
+
+
+///Tuning value to optimized SPU utilization 
+///Too small value means Task overhead is large compared to computation (too fine granularity)
+///Too big value might render some SPUs are idle, while a few other SPUs are doing all work.
+#define SPU_BATCHSIZE_BROADPHASE_PAIRS 16
+//#define SPU_BATCHSIZE_BROADPHASE_PAIRS 256
+
+
+class SpuCollisionTaskProcess;
+
+///SpuGatheringCollisionDispatcher can use SPU to gather and calculate collision detection
+///Time of Impact, Closest Points and Penetration Depth.
+class SpuGatheringCollisionDispatcher : public btCollisionDispatcher
+{
+	
+	///owned by this dispatcher: created on the first SPU dispatch, deleted in the destructor; 0 until then
+	SpuCollisionTaskProcess*	m_spuCollisionTaskProcess;
+	
+protected:
+
+	///thread interface handed to the SpuCollisionTaskProcess when it is created
+	class	btThreadSupportInterface*	m_threadInterface;
+
+	///task limit handed to the SpuCollisionTaskProcess when it is created
+	unsigned int	m_maxNumOutstandingTasks;
+	
+
+public:
+
+	//can be used by SPU collision algorithms	
+	//returns 0 as long as no SPU dispatch has taken place
+	SpuCollisionTaskProcess*	getSpuCollisionTaskProcess()
+	{
+			return m_spuCollisionTaskProcess;
+	}
+	
+	SpuGatheringCollisionDispatcher (class	btThreadSupportInterface*	threadInterface, unsigned int	maxNumOutstandingTasks,btCollisionConfiguration* collisionConfiguration);
+	
+	virtual ~SpuGatheringCollisionDispatcher();
+
+	///true when both shape types are in the list of types that may be dispatched to the SPU
+	bool	supportsDispatchPairOnSpu(int proxyType0,int proxyType1);
+
+	///uses the SPU path when dispatchInfo.m_enableSPU is set, otherwise the btCollisionDispatcher implementation
+	virtual void	dispatchAllCollisionPairs(btOverlappingPairCache* pairCache,const btDispatcherInfo& dispatchInfo,btDispatcher* dispatcher) ;
+
+};
+
+
+
+#endif //SPU_GATHERING_COLLISION__DISPATCHER_H
--- a/src/BulletMultiThreaded/SpuIntegrationTask/readme.txt
+++ b/src/BulletMultiThreaded/SpuIntegrationTask/readme.txt
@@ -0,0 +1 @@
+Empty placeholder for future Libspe2 SPU task
--- a/src/BulletMultiThreaded/SpuLibspe2Support.cpp
+++ b/src/BulletMultiThreaded/SpuLibspe2Support.cpp
@@ -0,0 +1,257 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifdef USE_LIBSPE2
+
+#include "SpuLibspe2Support.h"
+
+
+
+
+//SpuLibspe2Support helps to initialize/shutdown libspe2, start/stop SPU tasks and communication
+///Remembers the SPE program and the number of threads to use; the requested number is
+///limited to the number of physical SPEs reported by libspe2. Nothing is started here.
+SpuLibspe2Support::SpuLibspe2Support(spe_program_handle_t *speprog, int numThreads)
+{
+	const int physicalSpes = spe_cpu_info_get(SPE_COUNT_PHYSICAL_SPES, -1);
+
+	this->program = speprog;
+	this->numThreads = (numThreads > physicalSpes) ? physicalSpes : numThreads;
+}
+
+///cleanup/shutdown Libspe2
+///all the work is done by stopSPU(), which is called unconditionally
+SpuLibspe2Support::~SpuLibspe2Support()
+{
+	
+	stopSPU();
+}
+
+
+
+///send messages to SPUs
+///Marks the SPU selected by the task description as occupied, attaches the task description to its
+///status entry and posts Spu_Mailbox_Event_Task to the inbound mailbox of that SPU.
+///uiCommand: CMD_SAMPLE_TASK_COMMAND or CMD_GATHER_AND_PROCESS_PAIRLIST; any other value asserts and is ignored.
+///uiArgument0: address of the task description (SpuSampleTaskDesc or SpuGatherAndProcessPairsTaskDesc).
+///uiArgument1: not used.
+void SpuLibspe2Support::sendRequest(uint32_t uiCommand, uint32_t uiArgument0, uint32_t uiArgument1)
+{
+	spe_context_ptr_t context;
+	
+	switch (uiCommand)
+	{
+	case CMD_SAMPLE_TASK_COMMAND:
+	{
+		//get taskdescription
+		SpuSampleTaskDesc* taskDesc = (SpuSampleTaskDesc*) uiArgument0;
+
+		btAssert(taskDesc->m_taskId<m_activeSpuStatus.size());
+
+		//get status of SPU on which task should run
+		btSpuStatus&	spuStatus = m_activeSpuStatus[taskDesc->m_taskId];
+
+		//set data for spuStatus
+		spuStatus.m_commandId = uiCommand;
+		spuStatus.m_status = Spu_Status_Occupied; //set SPU as "occupied"
+		spuStatus.m_taskDesc.p = taskDesc; 
+		
+		//get context
+		context = data[taskDesc->m_taskId].context;
+		
+		
+		taskDesc->m_mainMemoryPtr = reinterpret_cast<uint64_t> (spuStatus.m_lsMemory.p);
+		
+
+		break;
+	}
+	case CMD_GATHER_AND_PROCESS_PAIRLIST:
+		{
+			//get taskdescription
+			SpuGatherAndProcessPairsTaskDesc* taskDesc = (SpuGatherAndProcessPairsTaskDesc*) uiArgument0;
+
+			btAssert(taskDesc->taskId<m_activeSpuStatus.size());
+
+			//get status of SPU on which task should run
+			btSpuStatus&	spuStatus = m_activeSpuStatus[taskDesc->taskId];
+
+			//set data for spuStatus
+			spuStatus.m_commandId = uiCommand;
+			spuStatus.m_status = Spu_Status_Occupied; //set SPU as "occupied"
+			spuStatus.m_taskDesc.p = taskDesc; 
+			
+			//get context
+			context = data[taskDesc->taskId].context;
+			
+			
+			taskDesc->m_lsMemory = (CollisionTask_LocalStoreMemory*)spuStatus.m_lsMemory.p;
+			
+			break;
+		}
+	default:
+		{
+			///not implemented
+			btAssert(0);
+			//no SPU was selected, so 'context' is not set: leave instead of writing to an
+			//uninitialized context when the assert is compiled out
+			return;
+		}
+
+	}
+
+	
+	//write taskdescription in mailbox
+	//NOTE(review): the write is non-blocking and its result is not checked - confirm that the mailbox
+	//can never be full here, otherwise the event is lost while the SPU stays marked as occupied
+	unsigned int event = Spu_Mailbox_Event_Task;
+	spe_in_mbox_write(context, &event, 1, SPE_MBOX_ANY_NONBLOCKING);
+
+}
+
+///check for messages from SPUs
+///Scans m_activeSpuStatus until an entry with status Spu_Status_Free is found, yielding the CPU between
+///scans, and then reports that entry: *puiArgument0 receives its task id, *puiArgument1 its status.
+void SpuLibspe2Support::waitForResponse(unsigned int *puiArgument0, unsigned int *puiArgument1)
+{
+	///We should wait for (one of) the first tasks to finish (or other SPU messages), and report its response
+	///A possible response can be 'yes, SPU handled it', or 'no, please do a PPU fallback'
+
+	btAssert(m_activeSpuStatus.size());
+
+	int freeIndex = -1;
+
+	for (;;)
+	{
+		//take the first entry that is marked as free
+		for (int spu=0; spu<m_activeSpuStatus.size() && freeIndex<0; spu++)
+		{
+			if (m_activeSpuStatus[spu].m_status == Spu_Status_Free)
+				freeIndex = spu;
+		}
+
+		if (freeIndex >= 0)
+			break;
+
+		//nothing free yet: give other threads a chance to run, then scan again
+		sched_yield();
+	}
+
+	///need to find an active spu
+	btAssert(freeIndex>=0);
+
+	const btSpuStatus& spuStatus = m_activeSpuStatus[freeIndex];
+
+	*puiArgument0 = spuStatus.m_taskId;
+	*puiArgument1 = spuStatus.m_status;
+}
+
+
+///public entry point for starting the SPUs: simply forwards to internal_startSPU()
+void SpuLibspe2Support::startSPU()
+{
+	this->internal_startSPU();
+}
+
+
+
+///start the spus group (can be called at the beginning of each frame, to make sure that the right SPU program is loaded)
+///For every slot without a context: creates the SPE context, loads the program into it, initializes the
+///status entry and starts the thread that runs ppu_pthread_function. Any failure ends the process with exit(1).
+///Afterwards waits until no started SPU is in state Spu_Status_Startup any more.
+void SpuLibspe2Support::internal_startSPU()
+{
+	m_activeSpuStatus.resize(numThreads);
+
+	for (int spu=0; spu < numThreads; spu++)
+	{
+		//slots that already have a context are left alone
+		if (data[spu].context != NULL)
+			continue;
+
+		/* Create context */
+		data[spu].context = spe_context_create(0, NULL);
+		if (data[spu].context == NULL)
+		{
+			perror ("Failed creating context");
+			exit(1);
+		}
+
+		/* Load program into context */
+		if (spe_program_load(data[spu].context, this->program))
+		{
+			perror ("Failed loading program");
+			exit(1);
+		}
+
+		btSpuStatus& status = m_activeSpuStatus[spu];
+		status.m_status = Spu_Status_Startup;
+		status.m_taskId = spu;
+		status.m_commandId = 0;
+		status.m_lsMemory.p = NULL;
+
+		//the status entry is handed to the thread as its argument
+		data[spu].entry = SPE_DEFAULT_ENTRY;
+		data[spu].flags = 0;
+		data[spu].argp.p = &status;
+		data[spu].envp.p = NULL;
+
+		/* Create thread for each SPE context */
+		if (pthread_create(&data[spu].pthread, NULL, &ppu_pthread_function, &(data[spu]) ))
+		{
+			perror ("Failed creating thread");
+			exit(1);
+		}
+	}
+
+	for (int spu=0; spu < numThreads; spu++)
+	{
+		if (data[spu].context == NULL)
+			continue;
+
+		// wait for spu to set up
+		while (m_activeSpuStatus[spu].m_status == Spu_Status_Startup)
+			sched_yield();
+
+		printf("Spu %d is ready\n", spu);
+	}
+}
+
+///tell the task scheduler we are done with the SPU tasks
+///Sends Spu_Mailbox_Event_Shutdown to every SPU that has a context and joins its thread, closes the
+///SPE program, destroys the contexts and clears the status array. Slots without a context are skipped,
+///and destroyed contexts are reset to NULL, so SPUs that were never started (or were already stopped)
+///are neither signalled, joined nor destroyed a second time.
+void SpuLibspe2Support::stopSPU()
+{
+	int i;
+
+	// ask all running SPU threads to shut down and wait for them to finish 
+	for ( i = 0; i < this->numThreads; i++ ) 
+	{ 
+		spe_context_ptr_t context = data[i].context;
+
+		// same check as in the destroy loop below: without a context there is
+		// no mailbox to write to and no thread to join
+		if (context == NULL)
+			continue;
+
+		unsigned int event = Spu_Mailbox_Event_Shutdown;
+		spe_in_mbox_write(context, &event, 1, SPE_MBOX_ALL_BLOCKING);
+		pthread_join (data[i].pthread, NULL); 
+	} 
+
+	// close SPE program 
+	// NOTE(review): this runs on every call; confirm that closing the same program handle twice is harmless
+	spe_image_close(program); 
+
+	// destroy SPE contexts 
+	for ( i = 0; i < this->numThreads; i++ ) 
+	{ 
+		if(data[i].context != NULL)
+		{
+			spe_context_destroy (data[i].context);
+			// mark the slot as unused so that internal_startSPU creates a fresh context
+			// and a second stopSPU (e.g. from the destructor) skips it
+			data[i].context = NULL;
+		}
+	} 
+	
+	m_activeSpuStatus.clear();
+	
+}
+
+
+
+#endif //USE_LIBSPE2
+
--- a/src/BulletMultiThreaded/SpuLibspe2Support.h
+++ b/src/BulletMultiThreaded/SpuLibspe2Support.h
@@ -0,0 +1,173 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#ifndef SPU_LIBSPE2_SUPPORT_H
+#define SPU_LIBSPE2_SUPPORT_H
+
+#include <LinearMath/btScalar.h> //for uint32_t etc.
+
+#ifdef USE_LIBSPE2
+
+#include <stdlib.h>
+#include <stdio.h>
+//#include "SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.h"
+#include "PlatformDefinitions.h"
+
+
+//extern struct SpuGatherAndProcessPairsTaskDesc;
+
+//Event codes written into an SPE's inbound mailbox
+//(stopSPU sends Spu_Mailbox_Event_Shutdown through spe_in_mbox_write).
+enum
+{
+	Spu_Mailbox_Event_Nothing = 0,
+	Spu_Mailbox_Event_Task = 1,
+	Spu_Mailbox_Event_Shutdown = 2,
+	
+	//presumably only present to force a 32-bit wide enum -- TODO confirm
+	Spu_Mailbox_Event_ForceDword = 0xFFFFFFFF
+	
+};
+
+//Values stored in btSpuStatus::m_status.
+//The PPU side sets Spu_Status_Startup before launching an SPE thread and then
+//spins until the status no longer equals Spu_Status_Startup.
+enum
+{
+	Spu_Status_Free = 0,
+	Spu_Status_Occupied = 1,
+	Spu_Status_Startup = 2,
+	
+	//presumably only present to force a 32-bit wide enum -- TODO confirm
+	Spu_Status_ForceDword = 0xFFFFFFFF
+	
+};
+
+
+//Per-SPU status record; the address of one record is handed to the SPE program as
+//its argp when the context is run. The structure is aligned to 128 bytes.
+struct btSpuStatus
+{
+	//set to the SPU slot index at startup
+	uint32_t	m_taskId;
+	//set to 0 at startup
+	uint32_t	m_commandId;
+	//one of the Spu_Status_* values
+	uint32_t	m_status;
+
+	//NOTE(review): presumably the address of the task description handed to the SPU -- not used in the visible code, confirm
+	addr64 m_taskDesc;
+	//set to NULL at startup; NOTE(review): presumably filled in with a local-store address later -- confirm
+	addr64 m_lsMemory;
+	
+}
+__attribute__ ((aligned (128)))
+;
+
+
+
+#ifndef __SPU__
+
+#include "LinearMath/btAlignedObjectArray.h"
+#include "SpuCollisionTaskProcess.h"
+#include "SpuSampleTaskProcess.h"
+#include "btThreadSupportInterface.h"
+#include <libspe2.h>
+#include <pthread.h>
+#include <sched.h>
+
+#define MAX_SPUS 4 
+
+//Everything a PPU thread needs to run one SPE context; a pointer to this
+//structure is the argument of ppu_pthread_function.
+typedef struct ppu_pthread_data 
+{
+	//SPE context to run (NULL is treated as "no context" by the start/stop code)
+	spe_context_ptr_t context;
+	//PPU thread that blocks inside spe_context_run for this context
+	pthread_t pthread;
+	//entry point passed to spe_context_run (initialised to SPE_DEFAULT_ENTRY)
+	unsigned int entry;
+	//flags passed to spe_context_run (initialised to 0)
+	unsigned int flags;
+	//argument pointer for the SPE program (points at the slot's btSpuStatus)
+	addr64 argp;
+	//environment pointer for the SPE program (initialised to NULL)
+	addr64 envp;
+	//filled in by spe_context_run with the reason the SPE program stopped
+	spe_stop_info_t stopinfo;
+} ppu_pthread_data_t;
+
+
+///PPU thread entry point: runs one SPE context until its program stops.
+///@param arg pointer to the ppu_pthread_data_t describing the context to run
+///@return never returns normally; the thread ends through pthread_exit(NULL)
+///The whole process is terminated with exit(1) when spe_context_run itself fails,
+///when the SPE program stops for any reason other than SPE_EXIT, or when it exits
+///with a non-zero exit code.
+static void *ppu_pthread_function(void *arg)
+{
+    ppu_pthread_data_t * datap = (ppu_pthread_data_t *)arg;
+
+    //blocks until the SPE program stops; a negative result means the call itself failed,
+    //in which case errno describes the error and stopinfo must not be interpreted
+    if (spe_context_run(datap->context, &datap->entry, datap->flags, datap->argp.p, datap->envp.p, &datap->stopinfo) < 0)
+    {
+        perror("FAILED: spe_context_run");
+        exit(1);
+    }
+
+    //the conditions below do not set errno, so report them with fprintf instead of perror
+    if (datap->stopinfo.stop_reason != SPE_EXIT)
+    {
+        fprintf(stderr, "FAILED: SPE abnormally terminated\n");
+        exit(1);
+    }
+
+    if (datap->stopinfo.result.spe_exit_code != 0)
+    {
+        fprintf(stderr, "FAILED: SPE returned a non-zero exit status: %d\n", datap->stopinfo.result.spe_exit_code);
+        exit(1);
+    }
+
+    pthread_exit(NULL);
+}
+
+
+
+
+
+
+///SpuLibspe2Support helps to initialize/shutdown libspe2, start/stop SPU tasks and communication
+///It implements btThreadSupportInterface on top of libspe2 contexts, each driven by its own PPU pthread.
+class SpuLibspe2Support : public btThreadSupportInterface
+{
+
+	//one status record per SPU slot; each SPE program receives the address of its record as argp
+	btAlignedObjectArray<btSpuStatus>	m_activeSpuStatus;
+	
+public:
+	//Setup and initialize SPU/CELL/Libspe2
+	//speprog: handle of the SPE program to load, numThreads: requested number of SPU threads
+	SpuLibspe2Support(spe_program_handle_t *speprog,int numThreads);
+	
+	// SPE program handle ptr.
+	spe_program_handle_t *program;
+	
+	// SPE program data
+	// NOTE(review): fixed capacity of MAX_SPUS entries but indexed up to numThreads -- confirm numThreads is clamped to MAX_SPUS
+	ppu_pthread_data_t data[MAX_SPUS];
+	
+	//cleanup/shutdown Libspe2
+	~SpuLibspe2Support();
+
+	///send messages to SPUs
+	void sendRequest(uint32_t uiCommand, uint32_t uiArgument0, uint32_t uiArgument1=0);
+
+	//check for messages from SPUs
+	void waitForResponse(unsigned int *puiArgument0, unsigned int *puiArgument1);
+
+	//start the spus (can be called at the beginning of each frame, to make sure that the right SPU program is loaded)
+	virtual void startSPU();
+
+	//tell the task scheduler we are done with the SPU tasks
+	//(shuts the SPE programs down, joins their threads and destroys the contexts)
+	virtual void stopSPU();
+
+private:
+	
+	///start the spus (can be called at the beginning of each frame, to make sure that the right SPU program is loaded)
+	void internal_startSPU();
+	
+	
+	//number of SPU slots that the start/stop loops iterate over
+	int numThreads;
+
+};
+
+#endif // NOT __SPU__
+
+#endif //USE_LIBSPE2
+
+#endif //SPU_LIBSPE2_SUPPORT_H
+
+
+
+
--- a/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuCollisionShapes.cpp
+++ b/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuCollisionShapes.cpp
@@ -0,0 +1,487 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#include "SpuCollisionShapes.h"
+
+///Returns the support point of a convex shape in direction localDir, in the shape's
+///local frame and without applying a collision margin.
+///shapeType        one of the *_SHAPE_PROXYTYPE values; selects how 'shape' is interpreted
+///shape            shape data: a btConvexInternalShape/btCylinderShape/btCapsuleShape for the
+///                 respective types, an array of three btVector3 vertices for triangles; not
+///                 dereferenced for spheres and convex hulls
+///localDir         query direction in the shape's local frame
+///convexVertexData point list used for CONVEX_HULL_SHAPE_PROXYTYPE only (dereferenced without a NULL check)
+///Unsupported shape types yield (0,0,0).
+btPoint3 localGetSupportingVertexWithoutMargin(int shapeType, void* shape, const btVector3& localDir,struct	SpuConvexPolyhedronVertexData* convexVertexData)//, int *featureIndex)
+{
+    switch (shapeType)
+    {
+    case SPHERE_SHAPE_PROXYTYPE:
+        {
+            //a sphere is reduced to its centre here; presumably the radius is handled as margin by the caller -- TODO confirm
+            return btPoint3(0,0,0);
+        }
+	case BOX_SHAPE_PROXYTYPE:
+		{
+//			spu_printf("SPU: getSupport BOX_SHAPE_PROXYTYPE\n");
+			btConvexInternalShape* convexShape = (btConvexInternalShape*)shape;
+			const btVector3& halfExtents = convexShape->getImplicitShapeDimensions();
+			
+			//pick the corner whose per-axis sign matches the sign of the direction
+			return btPoint3(
+				localDir.getX() < 0.0f ? -halfExtents.x() : halfExtents.x(),
+							localDir.getY() < 0.0f ? -halfExtents.y() : halfExtents.y(),
+							localDir.getZ() < 0.0f ? -halfExtents.z() : halfExtents.z());
+		}
+
+	case TRIANGLE_SHAPE_PROXYTYPE:
+		{
+
+			//'shape' is read as three consecutive btVector3 vertices; the vertex with the largest dot product wins
+			btVector3 dir(localDir.getX(),localDir.getY(),localDir.getZ());
+			btVector3* vertices = (btVector3*)shape;
+			btVector3 dots(dir.dot(vertices[0]), dir.dot(vertices[1]), dir.dot(vertices[2]));
+	  		btVector3 sup = vertices[dots.maxAxis()];
+			return btPoint3(sup.getX(),sup.getY(),sup.getZ());
+			break;
+		}
+
+	case CYLINDER_SHAPE_PROXYTYPE:
+		{
+			btCylinderShape* cylShape = (btCylinderShape*)shape;
+
+			//mapping of halfextents/dimension onto radius/height depends on how cylinder local orientation is (upAxis)
+
+			btVector3 halfExtents = cylShape->getImplicitShapeDimensions();
+			btVector3 v(localDir.getX(),localDir.getY(),localDir.getZ());
+			
+			//YY is the index of the up axis, XX and ZZ are the two radial axes;
+			//the initial values equal the up-axis-0 mapping and stay in effect if the default branch is hit with asserts disabled
+			int cylinderUpAxis = cylShape->getUpAxis();
+			int XX(1),YY(0),ZZ(2);
+
+			switch (cylinderUpAxis)
+			{
+			case 0:
+				{
+					XX = 1;
+					YY = 0;
+					ZZ = 2;
+					break;
+				}
+			case 1:
+				{
+					XX = 0;
+					YY = 1;
+					ZZ = 2;
+				break;
+				}
+			case 2:
+				{
+					XX = 0;
+					YY = 2;
+					ZZ = 1;
+					break;
+				}
+			default:
+				btAssert(0);
+				//printf("SPU:localGetSupportingVertexWithoutMargin unknown Cylinder up-axis\n");
+			};
+
+			btScalar radius = halfExtents[XX];
+			btScalar halfHeight = halfExtents[cylinderUpAxis];
+
+			btVector3 tmp;
+			btScalar d ;
+
+			//s is the length of the direction projected onto the radial plane
+			btScalar s = btSqrt(v[XX] * v[XX] + v[ZZ] * v[ZZ]);
+			if (s != btScalar(0.0))
+			{
+				//scale the radial part of the direction onto the rim, pick the cap by the sign along the up axis
+				d = radius / s;  
+				tmp[XX] = v[XX] * d;
+				tmp[YY] = v[YY] < 0.0 ? -halfHeight : halfHeight;
+				tmp[ZZ] = v[ZZ] * d;
+				return btPoint3(tmp.getX(),tmp.getY(),tmp.getZ());
+			}
+			else
+			{
+				//direction is parallel to the up axis: any rim point is a valid support, use the one on the XX axis
+				tmp[XX] = radius;
+				tmp[YY] = v[YY] < 0.0 ? -halfHeight : halfHeight;
+				tmp[ZZ] = btScalar(0.0);
+				return btPoint3(tmp.getX(),tmp.getY(),tmp.getZ());
+			}
+		}
+
+	case CAPSULE_SHAPE_PROXYTYPE:
+	{
+		//spu_printf("SPU: todo: getSupport CAPSULE_SHAPE_PROXYTYPE\n");
+		btVector3 vec0(localDir.getX(),localDir.getY(),localDir.getZ());
+
+		btCapsuleShape* capsuleShape = (btCapsuleShape*)shape;
+		//note: halfExtents is fetched but not used below
+		btVector3 halfExtents = capsuleShape->getImplicitShapeDimensions();
+		btScalar halfHeight = capsuleShape->getHalfHeight();
+		int capsuleUpAxis = capsuleShape->getUpAxis();
+
+		btScalar radius = capsuleShape->getRadius();
+		btVector3 supVec(0,0,0);
+
+		btScalar maxDot(btScalar(-1e30));
+
+		//normalize the direction; a (near) zero direction falls back to the +X axis
+		btVector3 vec = vec0;
+		btScalar lenSqr = vec.length2();
+		if (lenSqr < btScalar(0.0001))
+		{
+			vec.setValue(1,0,0);
+		} else
+		{
+			btScalar rlen = btScalar(1.) / btSqrt(lenSqr );
+			vec *= rlen;
+		}
+		btVector3 vtx;
+		btScalar newDot;
+		//candidate 1: point on the sphere around the positive end of the capsule axis
+		{
+			btVector3 pos(0,0,0);
+			pos[capsuleUpAxis] = halfHeight;
+
+			vtx = pos +vec*(radius);
+			newDot = vec.dot(vtx);
+			if (newDot > maxDot)
+			{
+				maxDot = newDot;
+				supVec = vtx;
+			}
+		}
+		//candidate 2: point on the sphere around the negative end of the capsule axis
+		{
+			btVector3 pos(0,0,0);
+			pos[capsuleUpAxis] = -halfHeight;
+
+			vtx = pos +vec*(radius);
+			newDot = vec.dot(vtx);
+			if (newDot > maxDot)
+			{
+				maxDot = newDot;
+				supVec = vtx;
+			}
+		}
+		return btPoint3(supVec.getX(),supVec.getY(),supVec.getZ());
+		break;
+	};
+
+	case CONVEX_HULL_SHAPE_PROXYTYPE:
+		{
+			//spu_printf("SPU: todo: getSupport CONVEX_HULL_SHAPE_PROXYTYPE\n");
+
+		
+
+			//the hull vertices come from convexVertexData, not from 'shape'
+			btPoint3* points = 0;
+			int numPoints = 0;
+			points = convexVertexData->gConvexPoints;
+			numPoints = convexVertexData->gNumConvexPoints;
+
+		//	spu_printf("numPoints = %d\n",numPoints);
+
+			btVector3 supVec(btScalar(0.),btScalar(0.),btScalar(0.));
+			btScalar newDot,maxDot = btScalar(-1e30);
+
+			//normalize the direction; a (near) zero direction falls back to the +X axis
+			btVector3 vec0(localDir.getX(),localDir.getY(),localDir.getZ());
+			btVector3 vec = vec0;
+			btScalar lenSqr = vec.length2();
+			if (lenSqr < btScalar(0.0001))
+			{
+				vec.setValue(1,0,0);
+			} else
+			{
+				btScalar rlen = btScalar(1.) / btSqrt(lenSqr );
+				vec *= rlen;
+			}
+
+
+			//linear scan for the vertex with the largest projection; with zero points supVec stays (0,0,0)
+			for (int i=0;i<numPoints;i++)
+			{
+				btPoint3 vtx = points[i];// * m_localScaling;
+
+				newDot = vec.dot(vtx);
+				if (newDot > maxDot)
+				{
+					maxDot = newDot;
+					supVec = vtx;
+				}
+			}
+			return btPoint3(supVec.getX(),supVec.getY(),supVec.getZ());
+
+			break;
+		};
+
+    default:
+
+		//spu_printf("SPU:(type %i) missing support function\n",shapeType);
+
+		
+#if __ASSERT
+       // spu_printf("localGetSupportingVertexWithoutMargin() - Unsupported bound type: %d.\n", shapeType);
+#endif // __ASSERT
+        //unsupported shape type: silently report the origin
+        return btPoint3(0.f, 0.f, 0.f);
+    }
+}
+
+///Computes the world-space axis aligned bounding box of a convex shape.
+///aabbMin/aabbMax  outputs; left untouched for unsupported shape types
+///convexShape      local copy of the shape (used for dimensions and margin)
+///convexShapePtr   main-memory address of the shape; only used for convex hulls, which are re-fetched by DMA
+///shapeType        one of the *_SHAPE_PROXYTYPE values
+///xform            world transform of the shape (passed by value)
+void computeAabb (btVector3& aabbMin, btVector3& aabbMax, btConvexInternalShape* convexShape, ppu_address_t convexShapePtr, int shapeType, btTransform xform)
+{
+	//calculate the aabb, given the types...
+	switch (shapeType)
+	{
+	case CYLINDER_SHAPE_PROXYTYPE:
+		/* fall through */
+	case BOX_SHAPE_PROXYTYPE:
+	{
+		//half extents grown by the margin, projected onto the world axes through the absolute rotation matrix
+		float margin=convexShape->getMarginNV();
+		btVector3 halfExtents = convexShape->getImplicitShapeDimensions();
+		halfExtents += btVector3(margin,margin,margin);
+		btTransform& t = xform;
+		btMatrix3x3 abs_b = t.getBasis().absolute();  
+		btPoint3 center = t.getOrigin();
+		btVector3 extent = btVector3(abs_b[0].dot(halfExtents),abs_b[1].dot(halfExtents),abs_b[2].dot(halfExtents));
+		
+		aabbMin = center - extent;
+		aabbMax = center + extent;
+		break;
+	}
+	case CAPSULE_SHAPE_PROXYTYPE:
+	{
+		float margin=convexShape->getMarginNV();
+		btVector3 halfExtents = convexShape->getImplicitShapeDimensions();
+		//add the radius to y-axis to get full height
+		//NOTE(review): the radius is always read from component 0 and added to component 1, i.e. the up axis is taken to be Y -- confirm for other up axes
+		btScalar radius = halfExtents[0];
+		halfExtents[1] += radius;
+		halfExtents += btVector3(margin,margin,margin);
+#if 0
+		int capsuleUpAxis = convexShape->getUpAxis();
+		btScalar halfHeight = convexShape->getHalfHeight();
+		btScalar radius = convexShape->getRadius();
+		halfExtents[capsuleUpAxis] = radius + halfHeight;
+#endif
+		btTransform& t = xform;
+		btMatrix3x3 abs_b = t.getBasis().absolute();  
+		btPoint3 center = t.getOrigin();
+		btVector3 extent = btVector3(abs_b[0].dot(halfExtents),abs_b[1].dot(halfExtents),abs_b[2].dot(halfExtents));
+		
+		aabbMin = center - extent;
+		aabbMax = center + extent;
+		break;
+	}
+	case SPHERE_SHAPE_PROXYTYPE:
+	{
+		//the radius is stored in the x component of the implicit dimensions; 'margin' below is radius plus collision margin
+		float radius = convexShape->getImplicitShapeDimensions().getX();// * convexShape->getLocalScaling().getX();
+		float margin = radius + convexShape->getMarginNV();
+		btTransform& t = xform;
+		const btVector3& center = t.getOrigin();
+		btVector3 extent(margin,margin,margin);
+		aabbMin = center - extent;
+		aabbMax = center + extent;
+		break;
+	}
+	case CONVEX_HULL_SHAPE_PROXYTYPE:
+	{
+		//fetch the complete btConvexHullShape from convexShapePtr into a local buffer and wait for the transfer (tag 1)
+		ATTRIBUTE_ALIGNED16(char convexHullShape0[sizeof(btConvexHullShape)]);
+		cellDmaGet(&convexHullShape0, convexShapePtr  , sizeof(btConvexHullShape), DMA_TAG(1), 0, 0);
+		cellDmaWaitTagStatusAll(DMA_MASK(1));
+		btConvexHullShape* localPtr = (btConvexHullShape*)&convexHullShape0;
+		btTransform& t = xform;
+		btScalar margin = convexShape->getMarginNV();
+		localPtr->getNonvirtualAabb(t,aabbMin,aabbMax,margin);
+		//spu_printf("SPU convex aabbMin=%f,%f,%f=\n",aabbMin.getX(),aabbMin.getY(),aabbMin.getZ());
+		//spu_printf("SPU convex aabbMax=%f,%f,%f=\n",aabbMax.getX(),aabbMax.getY(),aabbMax.getZ());
+		break;
+	}
+	default:
+		{
+		//unsupported shape type: aabbMin/aabbMax are not written
+	//	spu_printf("SPU: unsupported shapetype %d in AABB calculation\n");
+		}
+	};
+}
+
+///Fetches the mesh interface and the optimized BVH referenced by triMeshShape into
+///bvhMeshShape and blocks until both transfers (DMA tags 1 and 2) have completed.
+void dmaBvhShapeData (bvhMeshShape_LocalStoreMemory* bvhMeshShape, btBvhTriangleMeshShape* triMeshShape)
+{
+	//transfer 1 (tag 1): the triangle mesh interface
+	const int meshInterfaceBytes = sizeof(btTriangleIndexVertexArray);
+	const ppu_address_t meshInterfaceAddress = reinterpret_cast<ppu_address_t>(triMeshShape->getMeshInterface());
+#ifdef __SPU__
+	cellDmaGet(&bvhMeshShape->gTriangleMeshInterfaceStorage, meshInterfaceAddress, meshInterfaceBytes, DMA_TAG(1), 0, 0);
+	bvhMeshShape->gTriangleMeshInterfacePtr = &bvhMeshShape->gTriangleMeshInterfaceStorage;
+#else
+	bvhMeshShape->gTriangleMeshInterfacePtr = (btTriangleIndexVertexArray*)cellDmaGetReadOnly(&bvhMeshShape->gTriangleMeshInterfaceStorage, meshInterfaceAddress, meshInterfaceBytes, DMA_TAG(1), 0, 0);
+#endif
+
+	//transfer 2 (tag 2): the BVH itself
+	const int bvhBytes = sizeof(btOptimizedBvh);
+	const ppu_address_t bvhAddress = reinterpret_cast<ppu_address_t>(triMeshShape->getOptimizedBvh());
+	cellDmaGet(&bvhMeshShape->gOptimizedBvh, bvhAddress, bvhBytes, DMA_TAG(2), 0, 0);
+
+	//a single wait covers both outstanding transfers
+	cellDmaWaitTagStatusAll(DMA_MASK(1) | DMA_MASK(2));
+}
+
+///Starts a DMA get of indexArray[index] into IndexMesh on the given tag; does not wait for completion.
+void dmaBvhIndexedMesh (btIndexedMesh* IndexMesh, IndexedMeshArray& indexArray, int index, uint32_t dmaTag)
+{
+	const ppu_address_t meshAddress = (ppu_address_t)&indexArray[index];
+	cellDmaGet(IndexMesh, meshAddress, sizeof(btIndexedMesh), DMA_TAG(dmaTag), 0, 0);
+}
+
+///Starts a DMA get of batchSize subtree headers from subTreePtr into subTreeHeaders on the given tag; does not wait for completion.
+void dmaBvhSubTreeHeaders (btBvhSubtreeInfo* subTreeHeaders, ppu_address_t subTreePtr, int batchSize, uint32_t dmaTag)
+{
+	const size_t headerBytes = batchSize * sizeof(btBvhSubtreeInfo);
+	cellDmaGet(subTreeHeaders, subTreePtr, headerBytes, DMA_TAG(dmaTag), 0, 0);
+}
+
+///Starts a DMA get of all quantized nodes of one subtree into 'nodes'; does not wait for completion.
+///nodes      destination buffer, must hold subtree.m_subtreeSize nodes
+///subtree    header naming the root node index and the node count of the subtree
+///nodeArray  node array in main memory that the subtree indexes into
+///dmaTag     DMA tag the transfer is issued on (previously ignored in favour of a hard-coded tag 2),
+///           so the caller can wait on the tag it actually passed in
+void dmaBvhSubTreeNodes (btQuantizedBvhNode* nodes, const btBvhSubtreeInfo& subtree, QuantizedNodeArray&	nodeArray, int dmaTag)
+{
+	cellDmaGet(nodes, reinterpret_cast<ppu_address_t>(&nodeArray[subtree.m_rootNodeIndex]) , subtree.m_subtreeSize* sizeof(btQuantizedBvhNode), DMA_TAG(dmaTag), 0, 0);
+}
+
+///Returns sizeof() of the concrete shape class that belongs to shapeType, i.e. the
+///number of bytes to transfer for such a shape. Asserts that the size is below MAX_SHAPE_SIZE.
+///Unsupported types assert and yield 0.
+///getShapeTypeSize could easily be optimized, but it is not likely a bottleneck
+int		getShapeTypeSize(int shapeType)
+{
+	int shapeSize = 0;
+
+	switch (shapeType)
+	{
+	case CYLINDER_SHAPE_PROXYTYPE:
+		shapeSize = sizeof(btCylinderShape);
+		break;
+	case BOX_SHAPE_PROXYTYPE:
+		shapeSize = sizeof(btBoxShape);
+		break;
+	case SPHERE_SHAPE_PROXYTYPE:
+		shapeSize = sizeof(btSphereShape);
+		break;
+	case TRIANGLE_MESH_SHAPE_PROXYTYPE:
+		shapeSize = sizeof(btBvhTriangleMeshShape);
+		break;
+	case CAPSULE_SHAPE_PROXYTYPE:
+		shapeSize = sizeof(btCapsuleShape);
+		break;
+	case CONVEX_HULL_SHAPE_PROXYTYPE:
+		shapeSize = sizeof(btConvexHullShape);
+		break;
+	case COMPOUND_SHAPE_PROXYTYPE:
+		shapeSize = sizeof(btCompoundShape);
+		break;
+	default:
+		//unsupported shapetype, please add here
+		btAssert(0);
+		return 0;
+	}
+
+	//every supported shape has to fit into a local-store shape slot
+	btAssert(shapeSize < MAX_SHAPE_SIZE);
+	return shapeSize;
+}
+
+///Starts a DMA get (tag 2) of the vertices of convexShapeSPU into
+///convexVertexData->g_convexPointBuffer and records the vertex count; does not wait for completion.
+///If the hull has more than MAX_NUM_SPU_CONVEX_POINTS vertices nothing is transferred
+///and the recorded count is 0, so code iterating over gNumConvexPoints can never run
+///past the end of the fixed-size buffer, even when asserts are compiled out.
+void dmaConvexVertexData (SpuConvexPolyhedronVertexData* convexVertexData, btConvexHullShape* convexShapeSPU)
+{
+	const int numPoints = convexShapeSPU->getNumPoints();
+	if (numPoints > MAX_NUM_SPU_CONVEX_POINTS)
+	{
+		btAssert(0);
+	//	spu_printf("SPU: Error: MAX_NUM_SPU_CONVEX_POINTS(%d) exceeded: %d\n",MAX_NUM_SPU_CONVEX_POINTS,numPoints);
+		//do not publish a count that is larger than the buffer
+		convexVertexData->gNumConvexPoints = 0;
+		return;
+	}
+	convexVertexData->gNumConvexPoints = numPoints;
+
+	int dmaSize = numPoints*sizeof(btPoint3);
+	ppu_address_t pointsPPU = (ppu_address_t) convexShapeSPU->getPoints();
+	cellDmaGet(&convexVertexData->g_convexPointBuffer[0], pointsPPU  , dmaSize, DMA_TAG(2), 0, 0);
+}
+
+///Starts a DMA get of one collision shape from collisionShapePtr into collisionShapeLocation.
+///The number of bytes is derived from shapeType through getShapeTypeSize; the transfer is not waited for here.
+void dmaCollisionShape (void* collisionShapeLocation, ppu_address_t collisionShapePtr, uint32_t dmaTag, int shapeType)
+{
+	const int shapeBytes = getShapeTypeSize(shapeType);
+	cellDmaGet(collisionShapeLocation, collisionShapePtr, shapeBytes, DMA_TAG(dmaTag), 0, 0);
+}
+
+///Starts a DMA get of the child list of spuCompoundShape into compoundShapeLocation->gSubshapes
+///on the given tag; does not wait for completion.
+///At most MAX_SPU_COMPOUND_SUBSHAPES entries are transferred: a compound with more children
+///asserts and is truncated instead of overflowing the fixed-size gSubshapes array.
+void dmaCompoundShapeInfo (CompoundShape_LocalStoreMemory* compoundShapeLocation, btCompoundShape* spuCompoundShape, uint32_t dmaTag)
+{
+	int childShapeCount = spuCompoundShape->getNumChildShapes();
+	if (childShapeCount > MAX_SPU_COMPOUND_SUBSHAPES)
+	{
+		btAssert(0);
+		//never write past the end of gSubshapes
+		childShapeCount = MAX_SPU_COMPOUND_SUBSHAPES;
+	}
+
+	int dmaSize = childShapeCount * sizeof(btCompoundShapeChild);
+	ppu_address_t dmaPpuAddress2 = (ppu_address_t)spuCompoundShape->getChildList();
+	cellDmaGet(&compoundShapeLocation->gSubshapes[0], dmaPpuAddress2, dmaSize, DMA_TAG(dmaTag), 0, 0);
+}
+
+///Starts one DMA get per child of spuCompoundShape, copying each child shape into
+///compoundShapeLocation->gSubshapeShape[i]; does not wait for completion.
+///Reads the child addresses and types from compoundShapeLocation->gSubshapes, so that array
+///has to be filled in before this call.
+///At most MAX_SPU_COMPOUND_SUBSHAPES children are processed: a compound with more children
+///asserts and is truncated instead of indexing past the fixed-size local arrays.
+void dmaCompoundSubShapes (CompoundShape_LocalStoreMemory* compoundShapeLocation, btCompoundShape* spuCompoundShape, uint32_t dmaTag)
+{
+	int childShapeCount = spuCompoundShape->getNumChildShapes();
+	if (childShapeCount > MAX_SPU_COMPOUND_SUBSHAPES)
+	{
+		btAssert(0);
+		//gSubshapes and gSubshapeShape only have MAX_SPU_COMPOUND_SUBSHAPES slots
+		childShapeCount = MAX_SPU_COMPOUND_SUBSHAPES;
+	}
+
+	// DMA all the subshapes
+	for (int i = 0; i < childShapeCount; ++i)
+	{
+		btCompoundShapeChild& childShape = compoundShapeLocation->gSubshapes[i];
+		dmaCollisionShape (&compoundShapeLocation->gSubshapeShape[i],(ppu_address_t)childShape.m_childShape, dmaTag, childShape.m_childShapeType);
+	}
+}
+
+
+///Walks a quantized BVH subtree stored as a flat node array without using a stack.
+///nodeCallback->processNode(0, triangleIndex) is invoked for every leaf whose quantized box
+///overlaps the quantized query box. rootNode points at the node with index startNodeIndex;
+///the walk stops once the running index reaches endNodeIndex.
+void	spuWalkStacklessQuantizedTree(btNodeOverlapCallback* nodeCallback,unsigned short int* quantizedQueryAabbMin,unsigned short int* quantizedQueryAabbMax,const btQuantizedBvhNode* rootNode,int startNodeIndex,int endNodeIndex)
+{
+	const int subTreeSize = endNodeIndex - startNodeIndex;
+	int walkIterations = 0;
+
+	for (int nodeIndex = startNodeIndex; nodeIndex < endNodeIndex; )
+	{
+		//catch bugs in tree data
+		assert (walkIterations < subTreeSize);
+		walkIterations++;
+
+		const unsigned int overlaps = spuTestQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,rootNode->m_quantizedAabbMin,rootNode->m_quantizedAabbMax);
+		const unsigned int leaf = rootNode->isLeafNode();
+
+		if (leaf && overlaps)
+		{
+			nodeCallback->processNode(0,rootNode->getTriangleIndex());
+		}
+
+		//leaves and overlapping internal nodes continue with the next node in memory;
+		//a non-overlapping internal node jumps over its whole subtree
+		const int step = (overlaps || leaf) ? 1 : rootNode->getEscapeIndex();
+		rootNode += step;
+		nodeIndex += step;
+	}
+}
--- a/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuCollisionShapes.h
+++ b/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuCollisionShapes.h
@@ -0,0 +1,102 @@
+#ifndef __SPU_COLLISION_SHAPES_H
+#define __SPU_COLLISION_SHAPES_H
+
+#include "../SpuDoubleBuffer.h"
+
+#include "BulletCollision/BroadphaseCollision/btBroadphaseProxy.h"
+#include "BulletCollision/CollisionShapes/btConvexInternalShape.h"
+#include "BulletCollision/CollisionShapes/btCylinderShape.h"
+
+#include "BulletCollision/CollisionShapes/btOptimizedBvh.h"
+#include "BulletCollision/CollisionShapes/btTriangleIndexVertexArray.h"
+#include "BulletCollision/CollisionShapes/btSphereShape.h"
+
+#include "BulletCollision/CollisionShapes/btCapsuleShape.h"
+
+#include "BulletCollision/CollisionShapes/btConvexShape.h"
+#include "BulletCollision/CollisionShapes/btBvhTriangleMeshShape.h"
+#include "BulletCollision/CollisionShapes/btConvexHullShape.h"
+#include "BulletCollision/CollisionShapes/btCompoundShape.h"
+
+#define MAX_NUM_SPU_CONVEX_POINTS 128
+
+///Local copy of the vertices of one convex hull shape.
+struct	SpuConvexPolyhedronVertexData
+{
+	//NOTE(review): not referenced by the code in SpuCollisionShapes.cpp; presumably identifies the shape the points belong to -- confirm
+	void*	gSpuConvexShapePtr;
+	//vertex array scanned by localGetSupportingVertexWithoutMargin;
+	//NOTE(review): not set by dmaConvexVertexData, presumably pointed at g_convexPointBuffer by the caller -- confirm
+	btPoint3* gConvexPoints;
+	//number of vertices, written by dmaConvexVertexData
+	int gNumConvexPoints;
+	//16-byte aligned DMA destination for up to MAX_NUM_SPU_CONVEX_POINTS vertices
+	ATTRIBUTE_ALIGNED16(btPoint3 g_convexPointBuffer[MAX_NUM_SPU_CONVEX_POINTS]);
+};
+
+#define MAX_SHAPE_SIZE 256
+
+///Raw, 16-byte aligned storage for one collision shape of at most MAX_SHAPE_SIZE bytes.
+struct CollisionShape_LocalStoreMemory
+{
+	ATTRIBUTE_ALIGNED16(char collisionShape[MAX_SHAPE_SIZE]);
+};
+
+///Local storage for the children of one compound shape.
+struct CompoundShape_LocalStoreMemory
+{
+	// Compound data
+	// capacity of both arrays below
+#define MAX_SPU_COMPOUND_SUBSHAPES 16
+	// child list, filled in by dmaCompoundShapeInfo
+	ATTRIBUTE_ALIGNED16(btCompoundShapeChild gSubshapes[MAX_SPU_COMPOUND_SUBSHAPES]);
+	// raw storage for each child's shape (MAX_SHAPE_SIZE bytes per child), filled in by dmaCompoundSubShapes
+	ATTRIBUTE_ALIGNED16(char gSubshapeShape[MAX_SPU_COMPOUND_SUBSHAPES][MAX_SHAPE_SIZE]);
+};
+
+///Local storage for the data needed to query one BVH triangle mesh shape.
+struct bvhMeshShape_LocalStoreMemory
+{
+	//ATTRIBUTE_ALIGNED16(btOptimizedBvh	gOptimizedBvh);
+	//raw bytes that receive a btOptimizedBvh by DMA (see dmaBvhShapeData); 16 bytes larger than the object itself
+	ATTRIBUTE_ALIGNED16(char gOptimizedBvh[sizeof(btOptimizedBvh)+16]);
+	//typed view of the raw gOptimizedBvh buffer
+	btOptimizedBvh*	getOptimizedBvh()
+	{
+		return (btOptimizedBvh*) gOptimizedBvh;
+	}
+
+	//DMA destination for the mesh interface (see dmaBvhShapeData)
+	ATTRIBUTE_ALIGNED16(btTriangleIndexVertexArray	gTriangleMeshInterfaceStorage);
+	//mesh interface to use: set by dmaBvhShapeData to the storage above on SPU, or to the result of cellDmaGetReadOnly otherwise
+	btTriangleIndexVertexArray*	gTriangleMeshInterfacePtr;
+	///only a single mesh part for now, we can add support for multiple parts, but quantized trees don't support this at the moment 
+	ATTRIBUTE_ALIGNED16(btIndexedMesh	gIndexMesh);
+	#define MAX_SPU_SUBTREE_HEADERS 32
+	//1024
+	//batch of subtree headers
+	ATTRIBUTE_ALIGNED16(btBvhSubtreeInfo	gSubtreeHeaders[MAX_SPU_SUBTREE_HEADERS]);
+	//nodes of one subtree; capacity is MAX_SUBTREE_SIZE_IN_BYTES worth of nodes
+	ATTRIBUTE_ALIGNED16(btQuantizedBvhNode	gSubtreeNodes[MAX_SUBTREE_SIZE_IN_BYTES/sizeof(btQuantizedBvhNode)]);
+};
+
+
+//support point of a convex shape in direction localDir (shape-local frame, no margin); (0,0,0) for unsupported types
+btPoint3 localGetSupportingVertexWithoutMargin(int shapeType, void* shape, const btVector3& localDir,struct	SpuConvexPolyhedronVertexData* convexVertexData);//, int *featureIndex)
+//world-space AABB of a convex shape; outputs are left untouched for unsupported types
+void computeAabb (btVector3& aabbMin, btVector3& aabbMax, btConvexInternalShape* convexShape, ppu_address_t convexShapePtr, int shapeType, btTransform xform);
+//fetches mesh interface and BVH of a triangle mesh shape and waits for both transfers
+void dmaBvhShapeData (bvhMeshShape_LocalStoreMemory* bvhMeshShape, btBvhTriangleMeshShape* triMeshShape);
+//the dmaBvh* functions below only start a transfer; waiting on the tag is left to the caller
+void dmaBvhIndexedMesh (btIndexedMesh* IndexMesh, IndexedMeshArray& indexArray, int index, uint32_t dmaTag);
+void dmaBvhSubTreeHeaders (btBvhSubtreeInfo* subTreeHeaders, ppu_address_t subTreePtr, int batchSize, uint32_t dmaTag);
+void dmaBvhSubTreeNodes (btQuantizedBvhNode* nodes, const btBvhSubtreeInfo& subtree, QuantizedNodeArray&	nodeArray, int dmaTag);
+
+//size in bytes of the shape class belonging to shapeType, 0 for unsupported types
+int  getShapeTypeSize(int shapeType);
+//the dma* functions below only start transfers; waiting on the tag is left to the caller
+void dmaConvexVertexData (SpuConvexPolyhedronVertexData* convexVertexData, btConvexHullShape* convexShapeSPU);
+void dmaCollisionShape (void* collisionShapeLocation, ppu_address_t collisionShapePtr, uint32_t dmaTag, int shapeType);
+void dmaCompoundShapeInfo (CompoundShape_LocalStoreMemory* compoundShapeLocation, btCompoundShape* spuCompoundShape, uint32_t dmaTag);
+void dmaCompoundSubShapes (CompoundShape_LocalStoreMemory* compoundShapeLocation, btCompoundShape* spuCompoundShape, uint32_t dmaTag);
+
+#define USE_BRANCHFREE_TEST 1
+///Overlap test between two quantized axis aligned boxes.
+///aabbMin1/aabbMax1 and aabbMin2/aabbMax2 each point at three unsigned short components.
+///Returns 1 when the boxes overlap on all three axes (touching counts as overlap), 0 otherwise.
+///Both variants take read-only pointers and are inline, so this header can be included
+///from several translation units whichever variant is selected.
+#ifdef USE_BRANCHFREE_TEST
+SIMD_FORCE_INLINE unsigned int spuTestQuantizedAabbAgainstQuantizedAabb(const unsigned short int* aabbMin1,const unsigned short int* aabbMax1,const unsigned short int* aabbMin2,const unsigned short int* aabbMax2)
+{		
+	//all six comparisons are combined with bitwise AND, so no branch is needed
+	return btSelect((unsigned)((aabbMin1[0] <= aabbMax2[0]) & (aabbMax1[0] >= aabbMin2[0])
+		& (aabbMin1[2] <= aabbMax2[2]) & (aabbMax1[2] >= aabbMin2[2])
+		& (aabbMin1[1] <= aabbMax2[1]) & (aabbMax1[1] >= aabbMin2[1])),
+		1, 0);
+}
+#else
+
+SIMD_FORCE_INLINE unsigned int spuTestQuantizedAabbAgainstQuantizedAabb(const unsigned short int* aabbMin1,const unsigned short int* aabbMax1,const unsigned short int* aabbMin2,const unsigned short int*  aabbMax2)
+{
+	//a separation on any single axis clears the result
+	unsigned int overlap = 1;
+	overlap = (aabbMin1[0] > aabbMax2[0] || aabbMax1[0] < aabbMin2[0]) ? 0 : overlap;
+	overlap = (aabbMin1[2] > aabbMax2[2] || aabbMax1[2] < aabbMin2[2]) ? 0 : overlap;
+	overlap = (aabbMin1[1] > aabbMax2[1] || aabbMax1[1] < aabbMin2[1]) ? 0 : overlap;
+	return overlap;
+}
+#endif
+
+void	spuWalkStacklessQuantizedTree(btNodeOverlapCallback* nodeCallback,unsigned short int* quantizedQueryAabbMin,unsigned short int* quantizedQueryAabbMax,const btQuantizedBvhNode* rootNode,int startNodeIndex,int endNodeIndex);
+
+#endif
--- a/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuContactResult.cpp
+++ b/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuContactResult.cpp
@@ -0,0 +1,227 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "SpuContactResult.h"
+
+//#define DEBUG_SPU_COLLISION_DETECTION 1
+
+
+///Starts without a manifold: no main-memory address, no local manifold, nothing to write back.
+SpuContactResult::SpuContactResult()
+{
+	m_RequiresWriteBack = false;
+	m_spuManifold = NULL;
+	m_manifoldAddress = 0;
+}
+
+ ///Swaps the manifold export double buffer once more on destruction.
+ ///Per the note in writeDoubleBufferedManifold, swapBuffers is what waits for the outstanding put.
+ SpuContactResult::~SpuContactResult()
+{
+	g_manifoldDmaExport.swapBuffers();
+}
+
+///Combined friction of two bodies: the product of both coefficients, clamped to [-10, 10].
+///User can override this material combiner by implementing gContactAddedCallback and setting body0->m_collisionFlags |= btCollisionObject::customMaterialCallback;
+inline btScalar	calculateCombinedFriction(btScalar friction0,btScalar friction1)
+{
+	const btScalar MAX_FRICTION  = btScalar(10.);
+	const btScalar product = friction0*friction1;
+
+	if (product < -MAX_FRICTION)
+	{
+		return -MAX_FRICTION;
+	}
+	return (product > MAX_FRICTION) ? MAX_FRICTION : product;
+}
+
+///Combined restitution of two bodies: the plain product of both coefficients.
+inline btScalar	calculateCombinedRestitution(btScalar restitution0,btScalar restitution1)
+{
+	const btScalar combined = restitution0*restitution1;
+	return combined;
+}
+
+
+
+///Binds this result object to one manifold and one pair of bodies.
+///Stores the local manifold and its main-memory address, both world transforms, the swap flag
+///and the friction/restitution values combined from the two bodies.
+void	SpuContactResult::setContactInfo(btPersistentManifold* spuManifold, ppu_address_t	manifoldAddress,const btTransform& worldTrans0,const btTransform& worldTrans1, btScalar restitution0,btScalar restitution1, btScalar friction0,btScalar friction1, bool isSwapped)
+{
+	//where the contacts go
+	m_spuManifold = spuManifold;
+	m_manifoldAddress = manifoldAddress;
+
+	//the pair being processed
+	m_rootWorldTransform0 = worldTrans0;
+	m_rootWorldTransform1 = worldTrans1;
+	m_isSwapped = isSwapped;
+
+	//material values applied to newly added contact points
+	m_combinedFriction = calculateCombinedFriction(friction0,friction1);
+	m_combinedRestitution = calculateCombinedRestitution(restitution0,restitution1);
+}
+
+ ///Intentionally empty: the part and index identifiers are ignored by this implementation.
+ void SpuContactResult::setShapeIdentifiers(int partId0,int index0,	int partId1,int index1)
+ {
+	
+ }
+	
+
+
+///Tries to add one contact point to manifoldPtr.
+///The point is rejected when depth exceeds the manifold's contact breaking threshold, and it
+///is left alone when the manifold already holds a matching cache entry.
+///return true if it requires a dma transfer back (i.e. a new point was added)
+bool ManifoldResultAddContactPoint(const btVector3& normalOnBInWorld,
+								   const btVector3& pointInWorld,
+								   float depth,
+								   btPersistentManifold* manifoldPtr,
+								   btTransform& transA,
+								   btTransform& transB,
+									btScalar	combinedFriction,
+									btScalar	combinedRestitution,
+								   bool isSwapped)
+{
+	float breakingThreshold = manifoldPtr->getContactBreakingThreshold();
+
+#ifdef DEBUG_SPU_COLLISION_DETECTION
+	spu_printf("SPU: contactTreshold %f\n",breakingThreshold);
+#endif //DEBUG_SPU_COLLISION_DETECTION
+
+	//too far apart to be kept as a contact
+	if (depth > manifoldPtr->getContactBreakingThreshold())
+	{
+		return false;
+	}
+
+	//provide inverses or just calculate?
+	btTransform invA = transA.inverse();
+	btTransform invB = transB.inverse();
+
+	//a swapped pair only flips the normal; the local points are computed the same way in both cases
+	btVector3 normal;
+	if (isSwapped)
+	{
+		normal = normalOnBInWorld * -1;
+	}
+	else
+	{
+		normal = normalOnBInWorld;
+	}
+
+	btVector3 pointA = pointInWorld + normal * depth;
+	btVector3 localA = invA(pointA );
+	btVector3 localB = invB(pointInWorld);
+
+	btManifoldPoint newPt(localA,localB,normal,depth);
+
+	if (manifoldPtr->getCacheEntry(newPt) >= 0)
+	{
+#ifdef DEBUG_SPU_COLLISION_DETECTION
+		spu_printf("SPU: same contact detected, nothing done\n");
+#endif //DEBUG_SPU_COLLISION_DETECTION
+		//the existing point is kept as it is, which also saves a DMA transfer
+		return false;
+	}
+
+	//potential TODO: SPU callbacks (immediate on the SPU or deferred) to let the user override friction/restitution
+	newPt.m_combinedFriction = combinedFriction;
+	newPt.m_combinedRestitution = combinedRestitution;
+	manifoldPtr->addManifoldPoint(newPt);
+	return true;
+}
+
+
+///Copies the local manifold into the front export buffer, swaps the buffers and starts a
+///DMA put (tag 9) of the new back buffer to the manifold's main-memory address.
+///lsManifold  manifold in local store that is written back
+///mmManifold  destination address in main memory, carried in a pointer
+void SpuContactResult::writeDoubleBufferedManifold(btPersistentManifold* lsManifold, btPersistentManifold* mmManifold)
+{
+    memcpy(g_manifoldDmaExport.getFront(),lsManifold,sizeof(btPersistentManifold));
+
+    g_manifoldDmaExport.swapBuffers();
+    //convert through uintptr_t: keeps every address bit where pointers are wider than 32 bits
+    //(the former uint32_t cast truncated them) and zero-extends 32-bit pointers
+    uint64_t mmAddr = (uint64_t)(uintptr_t)mmManifold;
+    g_manifoldDmaExport.backBufferDmaPut(mmAddr, sizeof(btPersistentManifold), DMA_TAG(9));
+	// Should there be any kind of wait here?  What if somebody tries to use this tag again?  What if we call this function again really soon?
+	//no, the swapBuffers does the wait
+}
+
+///Feeds one contact point into the local manifold configured by setContactInfo and
+///remembers whether the manifold has to be written back afterwards.
+void SpuContactResult::addContactPoint(const btVector3& normalOnBInWorld,const btPoint3& pointInWorld,float depth)
+{
+	//plain btVector3 copies of both inputs
+	btVector3	worldNormal(normalOnBInWorld.getX(),normalOnBInWorld.getY(),normalOnBInWorld.getZ());
+	btVector3	worldPoint(pointInWorld.getX(),pointInWorld.getY(),pointInWorld.getZ());
+
+	//process the contact point
+	const bool pointAdded = ManifoldResultAddContactPoint(worldNormal,
+		worldPoint,
+		depth,
+		m_spuManifold,
+		m_rootWorldTransform0,
+		m_rootWorldTransform1,
+		m_combinedFriction,
+		m_combinedRestitution,
+		m_isSwapped);
+
+	//the flag is only ever raised here, never cleared
+	if (pointAdded)
+	{
+		m_RequiresWriteBack = true;
+	}
+}
+
+///Finishes the current manifold: refreshes its contact points when it has any, writes it
+///back to main memory when needed, then detaches this object from the manifold.
+void SpuContactResult::flush()
+{
+	//a manifold that still holds contacts is refreshed and always written back
+	const bool hasContacts = m_spuManifold && m_spuManifold->getNumContacts();
+	if (hasContacts)
+	{
+		m_spuManifold->refreshContactPoints(m_rootWorldTransform0,m_rootWorldTransform1);
+		m_RequiresWriteBack = true;
+	}
+
+	if (m_RequiresWriteBack)
+	{
+#ifdef DEBUG_SPU_COLLISION_DETECTION
+		spu_printf("SPU: Start SpuContactResult::flush (Put) DMA\n");
+		spu_printf("Num contacts:%d\n", m_spuManifold->getNumContacts());
+		spu_printf("Manifold address: %llu\n", m_manifoldAddress);
+#endif //DEBUG_SPU_COLLISION_DETECTION
+		writeDoubleBufferedManifold(m_spuManifold, (btPersistentManifold*)m_manifoldAddress);
+#ifdef DEBUG_SPU_COLLISION_DETECTION
+		spu_printf("SPU: Finished (Put) DMA\n");
+#endif //DEBUG_SPU_COLLISION_DETECTION
+	}
+
+	//ready for the next setContactInfo
+	m_RequiresWriteBack = false;
+	m_spuManifold = NULL;
+}
+
+
--- a/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuContactResult.h
+++ b/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuContactResult.h
@@ -0,0 +1,113 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef SPU_CONTACT_RESULT2_H
+#define SPU_CONTACT_RESULT2_H
+
+
+#ifndef WIN32
+#include <stdint.h>
+#endif
+
+
+
+#include "../SpuDoubleBuffer.h"
+
+
+#include "LinearMath/btTransform.h"
+#include "LinearMath/btPoint3.h"
+
+
+#include "BulletCollision/NarrowPhaseCollision/btPersistentManifold.h"
+
+
+/// Plain data record describing one collision pair (two shapes, their transforms and settings).
+/// NOTE(review): this header only declares the fields; the per-field notes below are inferred
+/// from the field names and types and should be confirmed against the code that fills them in.
+struct SpuCollisionPairInput
+{
+	/// Addresses of the two collision shapes, stored as ppu_address_t.
+	ppu_address_t m_collisionShapes[2];
+	/// Pointers to the two collision shapes (presumably local copies - TODO confirm).
+	void*	m_spuCollisionShapes[2];
+
+	/// Address of the persistent manifold for this pair, stored as ppu_address_t.
+	ppu_address_t m_persistentManifoldPtr;
+	btVector3	m_primitiveDimensions0;
+	btVector3	m_primitiveDimensions1;
+	/// Shape type identifiers of object 0 and object 1.
+	int		m_shapeType0;
+	int		m_shapeType1;	
+	/// Collision margins of object 0 and object 1.
+	float	m_collisionMargin0;
+	float	m_collisionMargin1;
+
+	/// World transforms of object 0 and object 1.
+	btTransform	m_worldTransform0;
+	btTransform m_worldTransform1;
+	
+	bool	m_isSwapped;
+	/// Presumably selects the EPA penetration depth solver - TODO confirm.
+	bool    m_useEpa;
+};
+
+
+/// Input parameters for a closest-point query between two convex shapes.
+/// The default constructor sets the distance limit to 1e30 and clears the allocator and
+/// vertex-data pointers; the two transforms are left for the caller to fill in.
+struct SpuClosestPointInput
+{
+    SpuClosestPointInput()
+        :m_maximumDistanceSquared(float(1e30)),
+        m_stackAlloc(0)
+    {
+        // Previously left indeterminate; null them so a default-constructed input
+        // never carries garbage pointers.
+        m_convexVertexData[0] = 0;
+        m_convexVertexData[1] = 0;
+    }
+
+    /// Transforms of shape A and shape B.
+    btTransform m_transformA;
+    btTransform m_transformB;
+    /// Squared distance limit for the query (defaults to 1e30).
+    float	m_maximumDistanceSquared;
+    /// Optional stack allocator (defaults to null).
+    class	btStackAlloc* m_stackAlloc;
+    /// Per-shape convex vertex data pointers (default to null).
+	struct SpuConvexPolyhedronVertexData* m_convexVertexData[2];
+};
+
+///SpuContactResult exports the contact points using double-buffered DMA transfers, only when needed
+///So when an existing contact point is duplicated, no transfer/refresh is performed.
+///Typical use, as far as this header and SpuContactResult.cpp show: setContactInfo() attaches a manifold,
+///addContactPoint() is called for each contact, and flush() writes the manifold back if required and detaches it.
+class SpuContactResult
+{
+    /// World transforms passed to the contact-point add/refresh routines.
+    btTransform		m_rootWorldTransform0;
+	btTransform		m_rootWorldTransform1;
+	/// Destination address the manifold is written back to by flush().
+	ppu_address_t	m_manifoldAddress;
+
+    /// Manifold currently being filled; flush() resets it to NULL.
+    btPersistentManifold* m_spuManifold;
+	/// Set when the manifold must be written back; cleared by flush().
+	bool m_RequiresWriteBack;
+	btScalar	m_combinedFriction;
+	btScalar	m_combinedRestitution;
+	
+	bool m_isSwapped;
+
+	/// NOTE(review): despite the g_ prefix this is a per-instance member; presumably the staging
+	/// buffer used by writeDoubleBufferedManifold() - confirm in SpuContactResult.cpp.
+	DoubleBuffer<btPersistentManifold, 1> g_manifoldDmaExport;
+
+	public:
+		SpuContactResult();
+		virtual ~SpuContactResult();
+
+		/// Returns the manifold currently attached (may be NULL after flush()).
+		btPersistentManifold*	GetSpuManifold() const
+		{
+			return m_spuManifold;
+		}
+
+		virtual void setShapeIdentifiers(int partId0,int index0,	int partId1,int index1);
+
+		/// Attaches a manifold and the per-pair data used by addContactPoint()/flush().
+		void	setContactInfo(btPersistentManifold* spuManifold, ppu_address_t	manifoldAddress,const btTransform& worldTrans0,const btTransform& worldTrans1, btScalar restitution0,btScalar restitution1, btScalar friction0,btScalar friction01, bool isSwapped);
+
+
+        /// Writes lsManifold to the location given by mmManifold; called from flush().
+        void writeDoubleBufferedManifold(btPersistentManifold* lsManifold, btPersistentManifold* mmManifold);
+
+        /// Adds one contact point to the attached manifold and records whether a write-back is needed.
+        virtual void addContactPoint(const btVector3& normalOnBInWorld,const btPoint3& pointInWorld,float depth);
+
+		/// Refreshes and writes back the attached manifold when required, then detaches it.
+		void flush();
+};
+
+
+
+#endif //SPU_CONTACT_RESULT2_H
+
--- a/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuConvexPenetrationDepthSolver.h
+++ b/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuConvexPenetrationDepthSolver.h
@@ -0,0 +1,52 @@
+
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#ifndef CONVEX_PENETRATION_DEPTH_H
+#define CONVEX_PENETRATION_DEPTH_H
+
+
+
+class btStackAlloc;
+class btIDebugDraw;
+class SpuVoronoiSimplexSolver;
+
+#include <LinearMath/btTransform.h>
+#include <LinearMath/btPoint3.h>
+
+
+///SpuConvexPenetrationDepthSolver provides an interface for penetration depth calculation.
+///Shapes are passed as untyped pointers plus a shape-type id, a margin and optional convex vertex data.
+class SpuConvexPenetrationDepthSolver
+{
+public:	
+	
+	virtual ~SpuConvexPenetrationDepthSolver() {};
+	///Computes the penetration between convexA and convexB.
+	///pa and pb receive the witness points on A and B.
+	///The meaning of the return value and of the remaining arguments is defined by the implementing class.
+	virtual bool calcPenDepth( SpuVoronoiSimplexSolver& simplexSolver,
+	        void* convexA,void* convexB,int shapeTypeA, int shapeTypeB, float marginA, float marginB,
+            btTransform& transA,const btTransform& transB,
+			btVector3& v, btPoint3& pa, btPoint3& pb,
+			class btIDebugDraw* debugDraw,btStackAlloc* stackAlloc,
+			struct SpuConvexPolyhedronVertexData* convexVertexDataA,
+			struct SpuConvexPolyhedronVertexData* convexVertexDataB
+			) const = 0;
+
+
+};
+
+
+
+#endif //CONVEX_PENETRATION_DEPTH_H
+
--- a/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuEpaPenetrationDepthSolver.cpp
+++ b/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuEpaPenetrationDepthSolver.cpp
@@ -0,0 +1,37 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "SpuEpaPenetrationDepthSolver.h"
+#include "SpuVoronoiSimplexSolver.h"
+#include "SpuGjkPairDetector.h"
+#include "SpuContactResult.h"
+#include "SpuGjkEpa2.h"
+
+/// Penetration depth query implemented on top of SpuGjkEpaSolver2::Penetration.
+/// pa and pb always receive the solver's two witness points; the return value is the solver's result.
+/// simplexSolver, v, debugDraw and stackAlloc are accepted for interface compatibility but not used.
+bool SpuEpaPenetrationDepthSolver::calcPenDepth( SpuVoronoiSimplexSolver& simplexSolver,
+	        void* convexA,void* convexB,int shapeTypeA, int shapeTypeB, float marginA, float marginB,
+		btTransform& transA,const btTransform& transB,
+			btVector3& v, btPoint3& pa, btPoint3& pb,
+			class btIDebugDraw* debugDraw,btStackAlloc* stackAlloc,
+			struct SpuConvexPolyhedronVertexData* convexVertexDataA,
+			struct SpuConvexPolyhedronVertexData* convexVertexDataB
+			) const
+{
+	// The initial guess direction is fixed to the +X axis.
+	const btVector3 initialGuess(1.0f, 0.0f, 0.0f);
+
+	SpuGjkEpaSolver2::sResults solverOutput;
+	const bool penetrating = SpuGjkEpaSolver2::Penetration (convexA, convexVertexDataA, shapeTypeA, marginA, transA,
+								convexB, convexVertexDataB, shapeTypeB, marginB, transB,
+								initialGuess, solverOutput);
+
+	// Witness points are copied out regardless of the outcome.
+	pa = solverOutput.witnesses[0];
+	pb = solverOutput.witnesses[1];
+	return penetrating;
+}
--- a/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuEpaPenetrationDepthSolver.h
+++ b/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuEpaPenetrationDepthSolver.h
@@ -0,0 +1,47 @@
+
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef SPU_EPA_PENETRATION_DEPTH_SOLVER_H
+#define SPU_EPA_PENETRATION_DEPTH_SOLVER_H
+
+
+#include "SpuConvexPenetrationDepthSolver.h"
+
+class btStackAlloc;
+class btIDebugDraw;
+class SpuVoronoiSimplexSolver;
+
+///SpuEpaPenetrationDepthSolver implements SpuConvexPenetrationDepthSolver using the GJK/EPA solver.
+///calcPenDepth forwards the two shapes to SpuGjkEpaSolver2::Penetration and returns its witness points through pa/pb.
+class SpuEpaPenetrationDepthSolver : public SpuConvexPenetrationDepthSolver
+{
+public:
+
+	///See SpuConvexPenetrationDepthSolver::calcPenDepth; returns the result of SpuGjkEpaSolver2::Penetration.
+	virtual bool calcPenDepth( SpuVoronoiSimplexSolver& simplexSolver,
+	        void* convexA,void* convexB,int shapeTypeA, int shapeTypeB, float marginA, float marginB,
+            btTransform& transA,const btTransform& transB,
+			btVector3& v, btPoint3& pa, btPoint3& pb,
+			class btIDebugDraw* debugDraw,btStackAlloc* stackAlloc,
+			struct SpuConvexPolyhedronVertexData* convexVertexDataA,
+			struct SpuConvexPolyhedronVertexData* convexVertexDataB
+			) const;
+
+
+};
+
+
+#endif //SPU_EPA_PENETRATION_DEPTH_SOLVER_H
+
--- a/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.cpp
+++ b/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.cpp
--- a/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.h
+++ b/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.h
@@ -0,0 +1,140 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef SPU_GATHERING_COLLISION_TASK_H
+#define SPU_GATHERING_COLLISION_TASK_H
+
+#include "../PlatformDefinitions.h"
+//#define DEBUG_SPU_COLLISION_DETECTION 1
+
+
+///Task Description for SPU collision detection
+///The struct is declared 128-byte aligned when built for __CELLOS_LV2__ or USE_LIBSPE2.
+///In the libspe2 SPU main() below it is fetched by DMA and passed to processCollisionTask().
+struct SpuGatherAndProcessPairsTaskDesc 
+{
+	ppu_address_t	inPtr;//m_pairArrayPtr;
+	//mutex variable
+	uint32_t	m_someMutexVariableInMainMemory;
+
+	ppu_address_t	m_dispatcher;
+
+	//NOTE(review): numOnLastPage/numPages presumably describe a paged pair array - confirm in the task code
+	uint32_t	numOnLastPage;
+
+	uint16_t numPages;
+	uint16_t taskId;
+	bool m_useEpa;
+
+	//passed as the lsMemory argument of processCollisionTask() by the libspe2 SPU main()
+	struct	CollisionTask_LocalStoreMemory*	m_lsMemory; 
+}
+
+#if  defined(__CELLOS_LV2__) || defined(USE_LIBSPE2)
+__attribute__ ((aligned (128)))
+#endif
+;
+
+
+///Runs one collision task. The libspe2 SPU main() in this header passes a pointer to a
+///SpuGatherAndProcessPairsTaskDesc as userPtr and that descriptor's m_lsMemory as lsMemory.
+void	processCollisionTask(void* userPtr, void* lsMemory);
+
+///Creates the memory block later handed to processCollisionTask() as lsMemory
+///(the libspe2 SPU main() stores the returned pointer in its status block).
+void*	createCollisionLocalStoreMemory();
+
+
+#if defined(USE_LIBSPE2) && defined(__SPU__)
+#include "../SpuLibspe2Support.h"
+#include <spu_intrinsics.h>
+#include <spu_mfcio.h>
+#include <SpuFakeDma.h>
+
+//#define DEBUG_LIBSPE2_SPU_TASK
+
+
+
+///SPU program entry point, compiled only when both USE_LIBSPE2 and __SPU__ are defined.
+///argp holds the address of a btSpuStatus block. The function marks that block free, publishes the
+///local-store memory pointer in it, then loops on the inbound mailbox: a Spu_Mailbox_Event_Task message
+///fetches and runs a task, Spu_Mailbox_Event_Shutdown ends the loop, anything else is ignored.
+///After every message the status block is written back with Spu_Status_Free.
+int main(unsigned long long speid, addr64 argp, addr64 envp)
+{
+	printf("SPU: hello \n");
+	
+	ATTRIBUTE_ALIGNED128(btSpuStatus status);
+	ATTRIBUTE_ALIGNED16( SpuGatherAndProcessPairsTaskDesc taskDesc ) ;
+	unsigned int received_message = Spu_Mailbox_Event_Nothing;
+    bool shutdown = false;
+
+	// fetch the current status block; every DMA below uses tag 3 and is waited on immediately
+	cellDmaGet(&status, argp.ull, sizeof(btSpuStatus), DMA_TAG(3), 0, 0);
+	cellDmaWaitTagStatusAll(DMA_MASK(3));
+
+	// announce that this SPU is free and where its collision memory lives
+	status.m_status = Spu_Status_Free;
+	status.m_lsMemory.p = createCollisionLocalStoreMemory();
+
+	cellDmaLargePut(&status, argp.ull, sizeof(btSpuStatus), DMA_TAG(3), 0, 0);
+	cellDmaWaitTagStatusAll(DMA_MASK(3));
+	
+	
+	while ( btLikely( !shutdown ) )
+	{
+		
+		// read the next message from the inbound mailbox
+		received_message = spu_read_in_mbox();
+		
+		if( btLikely( received_message == Spu_Mailbox_Event_Task ))
+		{
+#ifdef DEBUG_LIBSPE2_SPU_TASK
+			printf("SPU: received Spu_Mailbox_Event_Task\n");
+#endif //DEBUG_LIBSPE2_SPU_TASK
+
+			// refresh the status
+			cellDmaGet(&status, argp.ull, sizeof(btSpuStatus), DMA_TAG(3), 0, 0);
+			cellDmaWaitTagStatusAll(DMA_MASK(3));
+		
+			btAssert(status.m_status==Spu_Status_Occupied);
+			
+			// pull in the task descriptor referenced by the refreshed status block
+			cellDmaGet(&taskDesc, status.m_taskDesc.p, sizeof(SpuGatherAndProcessPairsTaskDesc), DMA_TAG(3), 0, 0);
+			cellDmaWaitTagStatusAll(DMA_MASK(3));
+#ifdef DEBUG_LIBSPE2_SPU_TASK		
+			printf("SPU:processCollisionTask\n");	
+#endif //DEBUG_LIBSPE2_SPU_TASK
+			processCollisionTask((void*)&taskDesc, taskDesc.m_lsMemory);
+			
+#ifdef DEBUG_LIBSPE2_SPU_TASK
+			printf("SPU:finished processCollisionTask\n");
+#endif //DEBUG_LIBSPE2_SPU_TASK
+		}
+		else
+		{
+			// NOTE: this debug message is printed for every non-task message, not only for shutdown
+#ifdef DEBUG_LIBSPE2_SPU_TASK
+			printf("SPU: received ShutDown\n");
+#endif //DEBUG_LIBSPE2_SPU_TASK
+			if( btLikely( received_message == Spu_Mailbox_Event_Shutdown ) )
+			{
+				shutdown = true;
+			}
+			else
+			{
+				//printf("SPU - Sth. recieved\n");
+			}
+		}
+
+		// set to status free and wait for next task
+		// (this write-back also happens on the shutdown iteration)
+		status.m_status = Spu_Status_Free;
+		cellDmaLargePut(&status, argp.ull, sizeof(btSpuStatus), DMA_TAG(3), 0, 0);
+		cellDmaWaitTagStatusAll(DMA_MASK(3));		
+				
+		
+  	}
+
+	printf("SPU: shutdown\n");
+  	return 0;
+}
+#endif // USE_LIBSPE2
+
+
+#endif //SPU_GATHERING_COLLISION_TASK_H
+
+
--- a/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGjkEpa2.cpp
+++ b/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGjkEpa2.cpp
@@ -0,0 +1,850 @@
+#include "BulletCollision/CollisionShapes/btConvexInternalShape.h"
+#include "BulletCollision/CollisionShapes/btSphereShape.h"
+#include "SpuCollisionShapes.h"
+#include "SpuGjkEpa2.h"
+
+#if defined(DEBUG) || defined (_DEBUG)
+#include <stdio.h> //for debug printf
+#ifdef __SPU__
+#include <spu_printf.h>
+#define printf spu_printf
+#endif //__SPU__
+#endif
+
+namespace gjkepa2_spu_impl
+{
+
+// Config
+
+	/* GJK	*/ 
+#define GJK_MAX_ITERATIONS	128
+#define GJK_ACCURARY		((btScalar)0.0001)
+#define GJK_MIN_DISTANCE	((btScalar)0.0001)
+#define GJK_DUPLICATED_EPS	((btScalar)0.0001)
+#define GJK_SIMPLEX2_EPS	((btScalar)0.0)
+#define GJK_SIMPLEX3_EPS	((btScalar)0.0)
+#define GJK_SIMPLEX4_EPS	((btScalar)0.0)
+
+	/* EPA	*/ 
+#define EPA_MAX_VERTICES	64
+#define EPA_MAX_FACES		(EPA_MAX_VERTICES*2)
+#define EPA_MAX_ITERATIONS	255
+#define EPA_ACCURACY		((btScalar)0.0001)
+#define EPA_FALLBACK		(10*EPA_ACCURACY)
+#define EPA_PLANE_EPS		((btScalar)0.00001)
+#define EPA_INSIDE_EPS		((btScalar)0.01)
+
+
+// Shorthands
+typedef unsigned int	U;
+typedef unsigned char	U1;
+
+// Untyped description of one convex shape as consumed by MinkowskiDiff:
+// the values are passed straight to localGetSupportingVertexWithoutMargin, and
+// margin is added along the query direction afterwards.
+struct convexShape
+{
+	void* shape;								// opaque shape pointer
+	SpuConvexPolyhedronVertexData* convexData;	// optional vertex data for the shape
+	int shapeType;								// shape type id
+	float margin;								// collision margin
+};
+
+// MinkowskiDiff
+// Support mapping of the Minkowski difference (shape 0 minus shape 1), expressed in the frame of shape 0.
+// m_toshape1 and m_toshape0 are filled in by the namespace-level Initialize():
+//   m_toshape1 = basis(shape1)^T * basis(shape0)   (applied to query directions before sampling shape 1)
+//   m_toshape0 = transform(shape0)^-1 * transform(shape1)   (applied to support points of shape 1)
+struct	MinkowskiDiff
+	{
+	convexShape m_shapes[2];
+	btMatrix3x3				m_toshape1;
+	btTransform				m_toshape0;
+	// Unused: the only code that assigns it is compiled out below.
+	btVector3				(btConvexShape::*Ls)(const btVector3&) const;
+	// No-op: the body is disabled with #if 0, so margins are always applied by Support0/Support1.
+	void					EnableMargin(bool enable)
+		{
+#if 0
+		if(enable)
+			Ls=&btConvexShape::localGetSupportingVertex;
+			else
+			Ls=&btConvexShape::localGetSupportingVertexWithoutMargin;
+#endif
+		}	
+	// Support point of shape 0 in direction d: margin-free support point plus margin along normalized d.
+	// d must be non-zero because it is normalized here.
+	inline btVector3		Support0(const btVector3& d) const
+		{
+			btVector3 sp = localGetSupportingVertexWithoutMargin (m_shapes[0].shapeType, m_shapes[0].shape, d, m_shapes[0].convexData);
+			btVector3 ud = d;
+			ud.normalize();
+			sp += ud * m_shapes[0].margin;
+			return sp;
+	//	return(((m_shapes[0])->*(Ls))(d));
+		}
+	// Support point of shape 1 in direction d, returned in the frame of shape 0:
+	// d is rotated by m_toshape1, the point is sampled with margin, then mapped back with m_toshape0.
+	inline btVector3		Support1(const btVector3& d) const
+		{
+			btVector3 nd = m_toshape1*d;
+			btVector3 ud = nd;
+			ud.normalize ();
+			btVector3 sp = localGetSupportingVertexWithoutMargin (m_shapes[1].shapeType, m_shapes[1].shape, nd, m_shapes[1].convexData);
+			sp += ud * m_shapes[1].margin;
+			return m_toshape0 * sp;
+	//	return(m_toshape0*((m_shapes[1])->*(Ls))(m_toshape1*d));
+		}
+	// Support point of the Minkowski difference in direction d.
+	inline btVector3		Support(const btVector3& d) const
+		{
+			return(Support0(d)-Support1(-d));
+		}
+	// Support point of a single shape: index 0 selects shape 0, any other value selects shape 1.
+	btVector3				Support(const btVector3& d,U index) const
+		{
+
+		if(index)
+			return(Support1(d));
+			else
+			return(Support0(d));
+
+		}
+	};
+
+typedef	MinkowskiDiff	tShape;
+
+// GJK
+struct	GJK
+{
+/* Types		*/ 
+/* Support vertex: d is the normalized query direction, w the Minkowski-difference	*/ 
+/* support point found for it (both written by getsupport)						*/ 
+struct	sSV
+	{
+	btVector3	d,w;
+	};
+/* Simplex of up to four support vertices: c[i] are the vertices, p[i] their weights,	*/ 
+/* rank the number of entries currently in use										*/ 
+struct	sSimplex
+	{
+	sSV*		c[4];
+	btScalar	p[4];
+	U			rank;
+	};
+/* Solver state: Valid while iterating / on normal termination, Inside when the origin is	*/ 
+/* touching or enclosed, Failed after construction or when the iteration limit is reached	*/ 
+struct	eStatus	{ enum _ {
+	Valid,
+	Inside,
+	Failed		};};
+/* Fields		*/ 
+tShape			m_shape;
+btVector3		m_ray;
+btScalar		m_distance;
+sSimplex		m_simplices[2];
+sSV				m_store[4];
+sSV*			m_free[4];
+U				m_nfree;
+U				m_current;
+sSimplex*		m_simplex;
+eStatus::_		m_status;
+/* Methods		*/ 
+					/* Constructs a solver in the reset state (see Initialize)	*/ 
+					GJK()
+	{
+	Initialize();
+	}
+/* Resets the scalar solver state; the status starts out as Failed until Evaluate runs.	*/ 
+/* The simplex storage and m_simplex are set up by Evaluate, not here.					*/ 
+void				Initialize()
+	{
+	m_status	=	eStatus::Failed;
+	m_current	=	0;
+	m_nfree		=	0;
+	m_distance	=	0;
+	m_ray		=	btVector3(0,0,0);
+	}
+/* Runs GJK on the Minkowski difference described by shapearg, starting from 'guess'.		*/ 
+/* On return m_simplex points at the final simplex, m_ray holds the last computed point	*/ 
+/* and m_distance is set for the Valid and Inside outcomes. Returns m_status.				*/ 
+eStatus::_			Evaluate(const tShape& shapearg,const btVector3& guess)
+	{
+	U			iterations=0;
+	btScalar	sqdist=0;
+	btScalar	alpha=0;
+	/* Ring buffer of the four most recent support points, used to detect repeats	*/ 
+	btVector3	lastw[4];
+	U			clastw=0;
+	/* Initialize solver		*/ 
+	m_free[0]			=	&m_store[0];
+	m_free[1]			=	&m_store[1];
+	m_free[2]			=	&m_store[2];
+	m_free[3]			=	&m_store[3];
+	m_nfree				=	4;
+	m_current			=	0;
+	m_status			=	eStatus::Valid;
+	m_shape				=	shapearg;
+	m_distance			=	0;
+	/* Initialize simplex		*/ 
+	m_simplices[0].rank	=	0;
+	m_ray				=	guess;
+	const btScalar	sqrl=	m_ray.length2();
+	/* A zero guess falls back to the +X axis so the direction can be normalized	*/ 
+	appendvertice(m_simplices[0],sqrl>0?-m_ray:btVector3(1,0,0));
+	m_simplices[0].p[0]	=	1;
+	m_ray				=	m_simplices[0].c[0]->w;	
+	sqdist				=	sqrl;
+	lastw[0]			=
+	lastw[1]			=
+	lastw[2]			=
+	lastw[3]			=	m_ray;
+	/* Loop						*/ 
+	do	{
+		/* Two simplices are used alternately: cs is the current one, ns receives the reduced result	*/ 
+		const U		next=1-m_current;
+		sSimplex&	cs=m_simplices[m_current];
+		sSimplex&	ns=m_simplices[next];
+		/* Check zero							*/ 
+		const btScalar	rl=m_ray.length();
+		if(rl<GJK_MIN_DISTANCE)
+			{/* Touching or inside				*/ 
+			m_status=eStatus::Inside;
+			break;
+			}
+		/* Append new vertice in -'v' direction	*/ 
+		appendvertice(cs,-m_ray);
+		const btVector3&	w=cs.c[cs.rank-1]->w;
+		bool				found=false;
+		for(U i=0;i<4;++i)
+			{
+			if((w-lastw[i]).length2()<GJK_DUPLICATED_EPS)
+				{ found=true;break; }
+			}
+		if(found)
+			{/* Return old simplex				*/ 
+			removevertice(m_simplices[m_current]);
+			break;
+			}
+			else
+			{/* Update lastw					*/ 
+			lastw[clastw=(clastw+1)&3]=w;
+			}
+		/* Check for termination				*/ 
+		const btScalar	omega=dot(m_ray,w)/rl;
+		alpha=btMax(omega,alpha);
+		if(((rl-alpha)-(GJK_ACCURARY*rl))<=0)
+			{/* Return old simplex				*/ 
+			removevertice(m_simplices[m_current]);
+			break;
+			}		
+		/* Reduce simplex						*/ 
+		/* projectorigin returns the squared distance, the weights and a bit mask of the vertices to keep	*/ 
+		btScalar	weights[4];
+		U			mask=0;
+		switch(cs.rank)
+			{
+			case	2:	sqdist=projectorigin(	cs.c[0]->w,
+												cs.c[1]->w,
+												weights,mask);break;
+			case	3:	sqdist=projectorigin(	cs.c[0]->w,
+												cs.c[1]->w,
+												cs.c[2]->w,
+												weights,mask);break;
+			case	4:	sqdist=projectorigin(	cs.c[0]->w,
+												cs.c[1]->w,
+												cs.c[2]->w,
+												cs.c[3]->w,
+												weights,mask);break;
+			}
+		if(sqdist>=0)
+			{/* Valid	*/ 
+			ns.rank		=	0;
+			m_ray		=	btVector3(0,0,0);
+			m_current	=	next;
+			/* Keep the masked vertices (rebuilding m_ray from them) and recycle the others	*/ 
+			for(U i=0,ni=cs.rank;i<ni;++i)
+				{
+				if(mask&(1<<i))
+					{
+					ns.c[ns.rank]		=	cs.c[i];
+					ns.p[ns.rank++]		=	weights[i];
+					m_ray				+=	cs.c[i]->w*weights[i];
+					}
+					else
+					{
+					m_free[m_nfree++]	=	cs.c[i];
+					}
+				}
+			/* All four vertices kept: the origin lies inside the tetrahedron	*/ 
+			if(mask==15) m_status=eStatus::Inside;
+			}
+			else
+			{/* Return old simplex				*/ 
+			removevertice(m_simplices[m_current]);
+			break;
+			}
+		/* Give up with Failed once the iteration limit is reached	*/ 
+		m_status=((++iterations)<GJK_MAX_ITERATIONS)?m_status:eStatus::Failed;
+		} while(m_status==eStatus::Valid);
+	m_simplex=&m_simplices[m_current];
+	/* For Failed, m_distance keeps the 0 assigned above	*/ 
+	switch(m_status)
+		{
+		case	eStatus::Valid:		m_distance=m_ray.length();break;
+		case	eStatus::Inside:	m_distance=0;break;
+		}	
+	return(m_status);
+	}
+/* Tries to grow the current simplex (*m_simplex) into a non-degenerate tetrahedron by	*/ 
+/* sampling extra support points, recursing once per added vertex.							*/ 
+/* Returns true when the simplex has rank 4 with a non-zero determinant; on failure every	*/ 
+/* vertex added here has been removed again.												*/ 
+bool					EncloseOrigin()
+	{
+	switch(m_simplex->rank)
+		{
+		case	1:
+			{
+			/* Single point: try both directions along each coordinate axis	*/ 
+			for(U i=0;i<3;++i)
+				{
+				btVector3		axis=btVector3(0,0,0);
+				axis[i]=1;
+				appendvertice(*m_simplex, axis);
+				if(EncloseOrigin())	return(true);
+				removevertice(*m_simplex);
+				appendvertice(*m_simplex,-axis);
+				if(EncloseOrigin())	return(true);
+				removevertice(*m_simplex);
+				}
+			}
+		break;
+		case	2:
+			{
+			/* Segment: try directions perpendicular to it, built from each coordinate axis	*/ 
+			const btVector3	d=m_simplex->c[1]->w-m_simplex->c[0]->w;
+			for(U i=0;i<3;++i)
+				{
+				btVector3		axis=btVector3(0,0,0);
+				axis[i]=1;
+				const btVector3	p=cross(d,axis);
+				/* Only a non-zero cross product is a usable direction: getsupport normalizes	*/ 
+				/* it. The former test on dot(axis,d) let a zero vector through when d was		*/ 
+				/* parallel to the axis and rejected valid axes perpendicular to d.			*/ 
+				if(p.length2()>0)
+					{
+					appendvertice(*m_simplex, p);
+					if(EncloseOrigin())	return(true);
+					removevertice(*m_simplex);
+					appendvertice(*m_simplex,-p);
+					if(EncloseOrigin())	return(true);
+					removevertice(*m_simplex);
+					}
+				}
+			}
+		break;
+		case	3:
+			{
+			/* Triangle: try both directions along its normal	*/ 
+			const btVector3	n=cross(m_simplex->c[1]->w-m_simplex->c[0]->w,
+									m_simplex->c[2]->w-m_simplex->c[0]->w);
+			const btScalar	l=n.length();
+			if(l>0)
+				{
+				appendvertice(*m_simplex,n);
+				if(EncloseOrigin())	return(true);
+				removevertice(*m_simplex);
+				appendvertice(*m_simplex,-n);
+				if(EncloseOrigin())	return(true);
+				removevertice(*m_simplex);
+				}
+			}
+		break;
+		case	4:
+			{
+			/* Tetrahedron: accept it when it has non-zero volume	*/ 
+			if(btFabs(det(	m_simplex->c[0]->w-m_simplex->c[3]->w,
+							m_simplex->c[1]->w-m_simplex->c[3]->w,
+							m_simplex->c[2]->w-m_simplex->c[3]->w))>0)
+				return(true);
+			}
+		break;
+		}
+	return(false);
+	}
+/* Internals	*/ 
+/* Fills sv with the normalized direction and the Minkowski-difference support point	*/ 
+/* for it. d must have non-zero length.												*/ 
+void				getsupport(const btVector3& d,sSV& sv) const
+	{
+	const btScalar	len=d.length();
+	sv.d	=	d/len;
+	sv.w	=	m_shape.Support(sv.d);
+	}
+/* Drops the last vertex of the simplex and returns its storage to the free list	*/ 
+void				removevertice(sSimplex& simplex)
+	{
+	sSV* const	released=simplex.c[--simplex.rank];
+	m_free[m_nfree++]=released;
+	}
+/* Appends a new vertex with weight 0 to the simplex: takes a slot from the free list	*/ 
+/* and fills it with the support point in direction v								*/ 
+void				appendvertice(sSimplex& simplex,const btVector3& v)
+	{
+	const U		slot=simplex.rank++;
+	sSV* const	vertex=m_free[--m_nfree];
+	simplex.p[slot]=0;
+	simplex.c[slot]=vertex;
+	getsupport(v,*vertex);
+	}
+/* Determinant of the 3x3 matrix whose rows are a, b and c (scalar triple product)	*/ 
+static btScalar		det(const btVector3& a,const btVector3& b,const btVector3& c)
+	{
+	return(	a.y()*b.z()*c.x()+a.z()*b.x()*c.y()-
+			a.x()*b.z()*c.y()-a.y()*b.x()*c.z()+
+			a.x()*b.y()*c.z()-a.z()*b.y()*c.x());
+	}
+/* Projects the origin onto segment a-b.												*/ 
+/* Returns the squared distance to the closest point, writes its weights to w[0..1]	*/ 
+/* and sets m to a bit mask of the vertices used (bit 0 = a, bit 1 = b).				*/ 
+/* Returns -1 when the segment length is not above GJK_SIMPLEX2_EPS.					*/ 
+static btScalar		projectorigin(	const btVector3& a,
+									const btVector3& b,
+									btScalar* w,U& m)
+	{
+	const btVector3	d=b-a;
+	const btScalar	l=d.length2();
+	if(l>GJK_SIMPLEX2_EPS)
+		{
+		/* t is the parameter of the closest point along a + d*t	*/ 
+		const btScalar	t(l>0?-dot(a,d)/l:0);
+		if(t>=1)		{ w[0]=0;w[1]=1;m=2;return(b.length2()); }
+		else if(t<=0)	{ w[0]=1;w[1]=0;m=1;return(a.length2()); }
+		else			{ w[0]=1-(w[1]=t);m=3;return((a+d*t).length2()); }
+		}
+	return(-1);
+	}
+/* Projects the origin onto triangle a-b-c.											*/ 
+/* Returns the squared distance to the closest point, writes its weights to w[0..2]	*/ 
+/* and sets m to a bit mask of the vertices used (bit i = vertex i).					*/ 
+/* Returns -1 when the squared normal length is not above GJK_SIMPLEX3_EPS.			*/ 
+static btScalar		projectorigin(	const btVector3& a,
+									const btVector3& b,
+									const btVector3& c,
+									btScalar* w,U& m)
+	{
+	static const U		imd3[]={1,2,0};
+	const btVector3*	vt[]={&a,&b,&c};
+	const btVector3		dl[]={a-b,b-c,c-a};
+	const btVector3		n=cross(dl[0],dl[1]);
+	const btScalar		l=n.length2();
+	if(l>GJK_SIMPLEX3_EPS)
+		{
+		btScalar	mindist=-1;
+		btScalar	subw[2] = { btScalar(0.0f), btScalar(0.0f) };
+		U			subm;
+		/* Test each edge (i, imd3[i]); when the sign test passes, project onto that edge	*/ 
+		/* and keep the closest result												*/ 
+		for(U i=0;i<3;++i)
+			{
+			if(dot(*vt[i],cross(dl[i],n))>0)
+				{
+				const U			j=imd3[i];
+				const btScalar	subd(projectorigin(*vt[i],*vt[j],subw,subm));
+				if((mindist<0)||(subd<mindist))
+					{
+					mindist		=	subd;
+					/* Translate the edge-local mask into triangle vertex bits	*/ 
+					m			=	((subm&1)?1<<i:0)+((subm&2)?1<<j:0);
+					w[i]		=	subw[0];
+					w[j]		=	subw[1];
+					w[imd3[j]]	=	0;				
+					}
+				}
+			}
+		/* No edge was selected: the closest point is the projection onto the face	*/ 
+		if(mindist<0)
+			{
+			const btScalar	d=dot(a,n);	
+			const btScalar	s=btSqrt(l);
+			const btVector3	p=n*(d/l);
+			mindist	=	p.length2();
+			m		=	7;
+			w[0]	=	(cross(dl[1],b-p)).length()/s;
+			w[1]	=	(cross(dl[2],c-p)).length()/s;
+			w[2]	=	1-(w[0]+w[1]);
+			}
+		return(mindist);
+		}
+	return(-1);
+	}
+/* Projects the origin onto tetrahedron a-b-c-d.										*/ 
+/* Returns the squared distance to the closest point (0 when the origin is inside),	*/ 
+/* writes its weights to w[0..3] and sets m to a bit mask of the vertices used			*/ 
+/* (bit i = vertex i, 15 = all four).													*/ 
+/* Returns -1 when the orientation test 'ng' fails or the determinant magnitude is		*/ 
+/* not above GJK_SIMPLEX4_EPS.															*/ 
+static btScalar		projectorigin(	const btVector3& a,
+									const btVector3& b,
+									const btVector3& c,
+									const btVector3& d,
+									btScalar* w,U& m)
+	{
+	static const U		imd3[]={1,2,0};
+	const btVector3*	vt[]={&a,&b,&c,&d};
+	const btVector3		dl[]={a-d,b-d,c-d};
+	const btScalar		vl=det(dl[0],dl[1],dl[2]);
+	const bool			ng=(vl*dot(a,cross(b-c,a-b)))<=0;
+	if(ng&&(btFabs(vl)>GJK_SIMPLEX4_EPS))
+		{
+		btScalar	mindist=-1;
+		btScalar	subw[3];
+		U			subm;
+		/* Test the three faces that contain d; when the sign test passes, project onto	*/ 
+		/* that face and keep the closest result										*/ 
+		for(U i=0;i<3;++i)
+			{
+			const U			j=imd3[i];
+			const btScalar	s=vl*dot(d,cross(dl[i],dl[j]));
+			if(s>0)
+				{
+				const btScalar	subd=projectorigin(*vt[i],*vt[j],d,subw,subm);
+				if((mindist<0)||(subd<mindist))
+					{
+					mindist		=	subd;
+					/* Translate the face-local mask into tetrahedron vertex bits (d is bit 3)	*/ 
+					m			=	(subm&1?1<<i:0)+
+									(subm&2?1<<j:0)+
+									(subm&4?8:0);
+					w[i]		=	subw[0];
+					w[j]		=	subw[1];
+					w[imd3[j]]	=	0;
+					w[3]		=	subw[2];
+					}
+				}
+			}
+		/* No face was selected: the origin is inside, weights come from determinant ratios	*/ 
+		if(mindist<0)
+			{
+			mindist	=	0;
+			m		=	15;
+			w[0]	=	det(c,b,d)/vl;
+			w[1]	=	det(a,c,d)/vl;
+			w[2]	=	det(b,a,d)/vl;
+			w[3]	=	1-(w[0]+w[1]+w[2]);
+			}
+		return(mindist);
+		}
+	return(-1);
+	}
+};
+
+// EPA
+struct	EPA
+{
+/* Types		*/ 
+typedef	GJK::sSV	sSV;
+/* One triangular hull face (all fields are filled in by newface, bind and append)	*/ 
+struct	sFace
+	{
+	btVector3	n;		/* unit face normal										*/ 
+	btScalar	d;		/* dot(first vertex, n): plane offset along n				*/ 
+	btScalar	p;		/* edge-test value computed in newface, clamped to <= 0	*/ 
+	sSV*		c[3];	/* the three vertices										*/ 
+	sFace*		f[3];	/* neighbouring face across each edge						*/ 
+	sFace*		l[2];	/* list links: [0] previous, [1] next						*/ 
+	U1			e[3];	/* index of the shared edge inside the neighbouring face	*/ 
+	U1			pass;	/* marker of the expansion pass that last visited the face	*/ 
+	};
+/* Doubly linked list of faces: root is the head, count the number of entries	*/ 
+struct	sList
+	{
+	sFace*		root;
+	U			count;
+				sList() : root(0),count(0)	{}
+	};
+/* Bookkeeping for the ring of new faces created by expand():						*/ 
+/* cf is the most recently created face, ff the first one, nf how many were created	*/ 
+struct	sHorizon
+	{
+	sFace*		cf;
+	sFace*		ff;
+	U			nf;
+				sHorizon() : cf(0),ff(0),nf(0)	{}
+	};
+/* Outcome of the EPA run. In the code of this struct: Valid is the running state,	*/ 
+/* Degenerated / NonConvex / OutOfFaces are reported by newface, InvalidHull,			*/ 
+/* AccuraryReached and OutOfVertices by Evaluate's loop, FallBack by Evaluate's		*/ 
+/* fallback path and Failed is the state after Initialize.							*/ 
+struct	eStatus { enum _ {
+	Valid,
+	Touching,
+	Degenerated,
+	NonConvex,
+	InvalidHull,		
+	OutOfFaces,
+	OutOfVertices,
+	AccuraryReached,
+	FallBack,
+	Failed,		};};
+/* Fields		*/ 
+eStatus::_		m_status;
+GJK::sSimplex	m_result;
+btVector3		m_normal;
+btScalar		m_depth;
+sSV				m_sv_store[EPA_MAX_VERTICES];
+sFace			m_fc_store[EPA_MAX_FACES];
+U				m_nextsv;
+sList			m_hull;
+sList			m_stock;
+/* Methods		*/ 
+					/* Constructs a solver with all faces in the stock list (see Initialize)	*/ 
+					EPA()
+	{
+	Initialize();	
+	}
+/* Resets the result fields and pushes every face of m_fc_store onto the stock list,	*/ 
+/* last element first, so that m_fc_store[0] ends up at the head.						*/ 
+/* Intended to run once per object: it does not empty the lists first.				*/ 
+void				Initialize()
+	{
+	m_nextsv	=	0;
+	m_depth		=	0;
+	m_normal	=	btVector3(0,0,0);
+	m_status	=	eStatus::Failed;
+	for(U remaining=EPA_MAX_FACES;remaining>0;--remaining)
+		{
+		sFace* const	face=&m_fc_store[remaining-1];
+		append(m_stock,face);
+		}
+	}
+/* Runs EPA starting from the final simplex of a finished GJK run.						*/ 
+/* If the simplex can be grown into a tetrahedron, the hull is expanded until one of the	*/ 
+/* termination conditions sets m_status; m_normal, m_depth and m_result (a rank-3 simplex	*/ 
+/* with weights) then describe the selected face.											*/ 
+/* Otherwise the fallback path reports FallBack with m_normal = normalized -guess			*/ 
+/* (or +X when guess is zero), m_depth = 0 and a rank-1 result.							*/ 
+eStatus::_			Evaluate(GJK& gjk,const btVector3& guess)
+	{
+	GJK::sSimplex&	simplex=*gjk.m_simplex;
+	if((simplex.rank>1)&&gjk.EncloseOrigin())
+		{
+		/* Clean up				*/ 
+		while(m_hull.root)
+			{
+			sFace*	f(m_hull.root);
+			remove(m_hull,f);
+			append(m_stock,f);
+			}
+		m_status	=	eStatus::Valid;
+		m_nextsv	=	0;
+		/* Orient simplex		*/ 
+		if(gjk.det(	simplex.c[0]->w-simplex.c[3]->w,
+					simplex.c[1]->w-simplex.c[3]->w,
+					simplex.c[2]->w-simplex.c[3]->w)<0)
+			{
+			btSwap(simplex.c[0],simplex.c[1]);
+			btSwap(simplex.p[0],simplex.p[1]);
+			}
+		/* Build initial hull	*/ 
+		sFace*	tetra[]={newface(simplex.c[0],simplex.c[1],simplex.c[2],true),
+						newface(simplex.c[1],simplex.c[0],simplex.c[3],true),
+						newface(simplex.c[2],simplex.c[1],simplex.c[3],true),
+						newface(simplex.c[0],simplex.c[2],simplex.c[3],true)};
+		/* newface returns 0 for a degenerate face, in which case fewer than 4 faces are in the hull	*/ 
+		if(m_hull.count==4)
+			{
+			sFace*		best=findbest();
+			/* 'outer' is a copy of the face that is finally reported	*/ 
+			sFace		outer=*best;
+			U			pass=0;
+			U			iterations=0;
+			/* Connect the four faces along their shared edges	*/ 
+			bind(tetra[0],0,tetra[1],0);
+			bind(tetra[0],1,tetra[2],0);
+			bind(tetra[0],2,tetra[3],0);
+			bind(tetra[1],1,tetra[3],2);
+			bind(tetra[1],2,tetra[2],1);
+			bind(tetra[2],2,tetra[3],1);
+			m_status=eStatus::Valid;
+			for(;iterations<EPA_MAX_ITERATIONS;++iterations)
+				{
+				if(m_nextsv<EPA_MAX_VERTICES)
+					{	
+					sHorizon		horizon;
+					sSV*			w=&m_sv_store[m_nextsv++];
+					bool			valid=true;					
+					best->pass	=	(U1)(++pass);
+					/* Sample a new support point along the normal of the best face	*/ 
+					gjk.getsupport(best->n,*w);
+					const btScalar	wdist=dot(best->n,w->w)-best->d;
+					if(wdist>EPA_ACCURACY)
+						{
+						/* Expand the hull through each neighbour of the best face	*/ 
+						for(U j=0;(j<3)&&valid;++j)
+							{
+							valid&=expand(	pass,w,
+											best->f[j],best->e[j],
+											horizon);
+							}
+						if(valid&&(horizon.nf>=3))
+							{
+							/* Close the ring of new faces, retire the old best face	*/ 
+							bind(horizon.cf,1,horizon.ff,2);
+							remove(m_hull,best);
+							append(m_stock,best);
+							best=findbest();
+							if(best->p>=outer.p) outer=*best;
+							} else { m_status=eStatus::InvalidHull;break; }
+						} else { m_status=eStatus::AccuraryReached;break; }
+					} else { m_status=eStatus::OutOfVertices;break; }
+				}
+			/* Report the selected face: normal, depth and weights of its three vertices	*/ 
+			/* (sub-triangle areas around the projected point, normalized to sum to 1)		*/ 
+			const btVector3	projection=outer.n*outer.d;
+			m_normal	=	outer.n;
+			m_depth		=	outer.d;
+			m_result.rank	=	3;
+			m_result.c[0]	=	outer.c[0];
+			m_result.c[1]	=	outer.c[1];
+			m_result.c[2]	=	outer.c[2];
+			m_result.p[0]	=	cross(	outer.c[1]->w-projection,
+										outer.c[2]->w-projection).length();
+			m_result.p[1]	=	cross(	outer.c[2]->w-projection,
+										outer.c[0]->w-projection).length();
+			m_result.p[2]	=	cross(	outer.c[0]->w-projection,
+										outer.c[1]->w-projection).length();
+			const btScalar	sum=m_result.p[0]+m_result.p[1]+m_result.p[2];
+			m_result.p[0]	/=	sum;
+			m_result.p[1]	/=	sum;
+			m_result.p[2]	/=	sum;
+			return(m_status);
+			}
+		}
+	/* Fallback		*/ 
+	m_status	=	eStatus::FallBack;
+	m_normal	=	-guess;
+	const btScalar	nl=m_normal.length();
+	if(nl>0)
+		m_normal	=	m_normal/nl;
+		else
+		m_normal	=	btVector3(1,0,0);
+	m_depth	=	0;
+	m_result.rank=1;
+	m_result.c[0]=simplex.c[0];
+	m_result.p[0]=1;	
+	return(m_status);
+	}
+/* Takes a face from the stock list, adds it to the hull and initializes it from the	*/ 
+/* vertices a, b, c.																		*/ 
+/* Returns the face on success. Returns 0 and sets m_status when no stock face is left	*/ 
+/* (OutOfFaces), when the face is too small (Degenerated) or, unless 'forced' is set,	*/ 
+/* when its plane offset is below -EPA_PLANE_EPS (NonConvex); a rejected face is moved	*/ 
+/* back to the stock list.																*/ 
+sFace*				newface(sSV* a,sSV* b,sSV* c,bool forced)
+	{
+	sFace* const	face=m_stock.root;
+	if(!face)
+		{
+		/* The face pool is exhausted. The former code selected between OutOfVertices	*/ 
+		/* and OutOfFaces here, but the stock is known to be empty at this point, so	*/ 
+		/* OutOfFaces was the only reachable value.									*/ 
+		m_status=eStatus::OutOfFaces;
+		return(0);
+		}
+	remove(m_stock,face);
+	append(m_hull,face);
+	face->pass	=	0;
+	face->c[0]	=	a;
+	face->c[1]	=	b;
+	face->c[2]	=	c;
+	face->n		=	cross(b->w-a->w,c->w-a->w);
+	const btScalar	l=face->n.length();
+	/* v: the unnormalized normal is long enough for the face to be usable	*/ 
+	const bool		v=l>EPA_ACCURACY;
+	/* p: smallest of the three edge tests, scaled by the normal length when valid	*/ 
+	face->p		=	btMin(btMin(
+						dot(a->w,cross(face->n,a->w-b->w)),
+						dot(b->w,cross(face->n,b->w-c->w))),
+						dot(c->w,cross(face->n,c->w-a->w)))	/
+						(v?l:1);
+	face->p		=	face->p>=-EPA_INSIDE_EPS?0:face->p;
+	if(v)
+		{
+		face->d		=	dot(a->w,face->n)/l;
+		face->n		/=	l;
+		if(forced||(face->d>=-EPA_PLANE_EPS))
+			{
+			return(face);
+			} else m_status=eStatus::NonConvex;
+		} else m_status=eStatus::Degenerated;
+	/* Rejected: give the face back to the stock	*/ 
+	remove(m_hull,face);
+	append(m_stock,face);
+	return(0);
+	}
+/* Scans the hull list and returns the selected face: starting from the head, a face	*/ 
+/* replaces the current choice when its p is not smaller and its squared plane offset	*/ 
+/* is strictly smaller. The hull must not be empty.									*/ 
+sFace*				findbest()
+	{
+	sFace*		chosen=m_hull.root;
+	btScalar	chosenSqd=chosen->d*chosen->d;
+	btScalar	chosenP=chosen->p;
+	sFace*		candidate=chosen->l[1];
+	while(candidate)
+		{
+		const btScalar	candidateSqd=candidate->d*candidate->d;
+		if((candidate->p>=chosenP)&&(candidateSqd<chosenSqd))
+			{
+			chosen		=	candidate;
+			chosenSqd	=	candidateSqd;
+			chosenP		=	candidate->p;
+			}
+		candidate=candidate->l[1];
+		}
+	return(chosen);
+	}
+/* Recursive hull expansion towards the new vertex w, entering face f through its edge e.	*/ 
+/* A face already marked with 'pass' is not processed again (returns false).				*/ 
+/* If w lies below f's plane by more than EPA_PLANE_EPS, a new face is built on edge e	*/ 
+/* and linked into the horizon ring; otherwise f is marked, the expansion continues		*/ 
+/* through f's two other edges and f is moved back to the stock.							*/ 
+/* Returns true on success, false when a face could not be created or a branch failed.	*/ 
+bool				expand(U pass,sSV* w,sFace* f,U e,sHorizon& horizon)
+	{
+	/* Next and previous index modulo 3	*/ 
+	static const U	i1m3[]={1,2,0};
+	static const U	i2m3[]={2,0,1};
+	if(f->pass!=pass)
+		{
+		const U	e1=i1m3[e];
+		if((dot(f->n,w->w)-f->d)<-EPA_PLANE_EPS)
+			{
+			sFace*	nf=newface(f->c[e1],f->c[e],w,false);
+			if(nf)
+				{
+				bind(nf,0,f,e);
+				/* Chain the new face to the previous one, or remember it as the first	*/ 
+				if(horizon.cf) bind(horizon.cf,1,nf,2); else horizon.ff=nf;
+				horizon.cf=nf;
+				++horizon.nf;
+				return(true);
+				}
+			}
+			else
+			{
+			const U	e2=i2m3[e];
+			f->pass		=	(U1)pass;
+			if(	expand(pass,w,f->f[e1],f->e[e1],horizon)&&
+				expand(pass,w,f->f[e2],f->e[e2],horizon))
+				{
+				remove(m_hull,f);
+				append(m_stock,f);
+				return(true);
+				}
+			}
+		}
+	return(false);
+	}
+/* Links two faces as neighbours: edge ea of fa and edge eb of fb refer to each other	*/ 
+static inline void		bind(sFace* fa,U ea,sFace* fb,U eb)
+	{
+	/* fa's side of the link	*/ 
+	fa->e[ea]=(U1)eb;
+	fa->f[ea]=fb;
+	/* fb's side of the link	*/ 
+	fb->e[eb]=(U1)ea;
+	fb->f[eb]=fa;
+	}
+/* Inserts the face at the head of the list	*/ 
+static inline void		append(sList& list,sFace* face)
+	{
+	sFace* const	formerHead=list.root;
+	face->l[0]	=	0;
+	face->l[1]	=	formerHead;
+	if(formerHead)
+		{
+		formerHead->l[0]=face;
+		}
+	list.root	=	face;
+	++list.count;
+	}
+/* Unlinks the face from the list; the face's own links are left untouched	*/ 
+static inline void		remove(sList& list,sFace* face)
+	{
+	sFace* const	before=face->l[0];
+	sFace* const	after=face->l[1];
+	if(after)
+		{
+		after->l[0]=before;
+		}
+	if(before)
+		{
+		before->l[1]=after;
+		}
+	if(list.root==face)
+		{
+		list.root=after;
+		}
+	--list.count;
+	}
+};
+
+//
+// Prepares a query: resets 'results' and fills 'shape' with the two shape descriptions
+// and the relative transforms between them.
+//   results   : witnesses and normal are zeroed, status is set to Separated
+//   shape     : receives shape/type/margin/vertex data of A and B plus
+//               m_toshape1 = basis(wtrs1)^T * basis(wtrs0) and m_toshape0 = wtrs0^-1 * wtrs1
+//   withmargins : forwarded to tShape::EnableMargin, whose body is currently compiled out,
+//               so the value has no effect
+static void	Initialize(void* shapeA,
+						SpuConvexPolyhedronVertexData* convexDataA,
+						int shapeTypeA,
+						float marginA,
+						const btTransform& wtrs0,
+						void* shapeB,
+						SpuConvexPolyhedronVertexData* convexDataB,
+						int shapeTypeB,
+						float marginB,
+						const btTransform& wtrs1,	
+						SpuGjkEpaSolver2::sResults& results,
+						tShape& shape,
+						bool withmargins)
+{
+/* Results		*/ 
+/* normal is zeroed as well so that callers never read an indeterminate vector	*/ 
+results.witnesses[0]	=
+results.witnesses[1]	=
+results.normal			=	btVector3(0,0,0);
+results.status			=	SpuGjkEpaSolver2::sResults::Separated;
+/* Shape		*/ 
+shape.m_shapes[0].margin = marginA;
+shape.m_shapes[0].shape = shapeA;
+shape.m_shapes[0].shapeType = shapeTypeA;
+shape.m_shapes[0].convexData = convexDataA;
+shape.m_shapes[1].margin = marginB;
+shape.m_shapes[1].shape = shapeB;
+shape.m_shapes[1].shapeType = shapeTypeB;
+shape.m_shapes[1].convexData = convexDataB;
+shape.m_toshape1		=	wtrs1.getBasis().transposeTimes(wtrs0.getBasis());
+shape.m_toshape0		=	wtrs0.inverseTimes(wtrs1);
+shape.EnableMargin(withmargins);
+}
+
+}
+
+//
+// Api
+//
+
+using namespace	gjkepa2_spu_impl;
+
+//
+// Returns the combined size in bytes of one GJK and one EPA solver object.
+int			SpuGjkEpaSolver2::StackSizeRequirement()
+{
+return(int(sizeof(GJK)+sizeof(EPA)));
+}
+
+//
+// Penetration query between shape A (transform wtrs0) and shape B (transform wtrs1).
+// Runs GJK with -guess as the start direction and, when GJK reports Inside, EPA.
+// Returns true only when EPA did not report Failed; results.status is then Penetrating and
+// results.witnesses hold the two witness points transformed by wtrs0.
+// Returns false otherwise, with results.status set to GJK_Failed, EPA_Failed or left at
+// the Separated value assigned by Initialize.
+bool   SpuGjkEpaSolver2::Penetration(void* shapeA,
+									SpuConvexPolyhedronVertexData* convexDataA,
+									int shapeTypeA,
+									float marginA,
+									const btTransform& wtrs0,
+									void* shapeB,
+									SpuConvexPolyhedronVertexData* convexDataB,
+									int shapeTypeB,
+									float marginB,
+									const btTransform& wtrs1,
+									const btVector3&		guess,
+									sResults&				results)
+{
+tShape			minkowski;
+Initialize(shapeA, convexDataA, shapeTypeA, marginA, wtrs0, shapeB, convexDataB, shapeTypeB, marginB, wtrs1, results,minkowski,true);
+GJK				gjk;	
+const GJK::eStatus::_	gjkOutcome=gjk.Evaluate(minkowski,-guess);
+if(gjkOutcome==GJK::eStatus::Failed)
+	{
+	results.status=sResults::GJK_Failed;
+	return(false);
+	}
+if(gjkOutcome!=GJK::eStatus::Inside)
+	{
+	/* GJK ended with Valid: nothing to refine, status stays as initialized	*/ 
+	return(false);
+	}
+/* GJK reported Inside: refine with EPA	*/ 
+EPA				epa;
+const EPA::eStatus::_	epaOutcome=epa.Evaluate(gjk,-guess);
+if(epaOutcome==EPA::eStatus::Failed)
+	{
+	results.status=sResults::EPA_Failed;
+	return(false);
+	}
+/* Weighted sum of shape A's support points for the result simplex	*/ 
+btVector3		localWitness=btVector3(0,0,0);
+for(U k=0;k<epa.m_result.rank;++k)
+	{
+	localWitness+=minkowski.Support(epa.m_result.c[k]->d,0)*epa.m_result.p[k];
+	}
+results.status			=	sResults::Penetrating;
+results.witnesses[0]	=	wtrs0*localWitness;
+results.witnesses[1]	=	wtrs0*(localWitness-epa.m_normal*epa.m_depth);
+return(true);
+}
+
+/* Symbols cleanup		*/ 
+
+#undef GJK_MAX_ITERATIONS
+#undef GJK_ACCURARY
+#undef GJK_MIN_DISTANCE
+#undef GJK_DUPLICATED_EPS
+#undef GJK_SIMPLEX2_EPS
+#undef GJK_SIMPLEX3_EPS
+#undef GJK_SIMPLEX4_EPS
+
+#undef EPA_MAX_VERTICES
+#undef EPA_MAX_FACES
+#undef EPA_MAX_ITERATIONS
+#undef EPA_ACCURACY
+#undef EPA_FALLBACK
+#undef EPA_PLANE_EPS
+#undef EPA_INSIDE_EPS
--- a/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGjkEpa2.h
+++ b/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGjkEpa2.h
@@ -0,0 +1,38 @@
+#ifndef _68DA1F85_90B7_4bb0_A705_83B4040A75C6_
+#define _68DA1F85_90B7_4bb0_A705_83B4040A75C6_
+#include "BulletCollision/CollisionShapes/btConvexShape.h"
+
+struct	SpuConvexPolyhedronVertexData;	/* only used through pointers below; declared here so this header does not depend on include order */
+///btGjkEpaSolver contributed under zlib by Nathanael Presson
+struct	SpuGjkEpaSolver2
+{
+struct	sResults
+	{
+	enum eStatus
+		{
+		Separated,		/* Shapes do not penetrate												*/ 
+		Penetrating,	/* Shapes are penetrating												*/ 
+		GJK_Failed,		/* GJK phase fail, no big issue, shapes are probably just 'touching'	*/ 
+		EPA_Failed		/* EPA phase fail, bigger problem, need to save parameters, and debug	*/ 
+		}		status;
+	btVector3	witnesses[2];	/* both entries are written by Penetration() when it returns true	*/ 
+	btVector3	normal;
+	};
+/* Returns sizeof(GJK)+sizeof(EPA) of the implementation's solver objects, converted to int.	*/ 
+static int		StackSizeRequirement();
+/* True (status=Penetrating, witnesses filled) when EPA succeeds; otherwise false, with status set to GJK_Failed or EPA_Failed when that phase reports failure. */ 
+static bool		Penetration(void* shapeA,
+							SpuConvexPolyhedronVertexData* convexDataA,
+							int shapeTypeA,
+							float marginA,
+							const btTransform& xformA,
+							void* shapeB,
+							SpuConvexPolyhedronVertexData* convexDataB,
+							int shapeTypeB,
+							float marginB,
+							const btTransform& xformB,
+							const btVector3& guess,
+							sResults&	results);
+};
+
+#endif
--- a/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGjkPairDetector.cpp
+++ b/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGjkPairDetector.cpp
@@ -0,0 +1,311 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "SpuGjkPairDetector.h"
+#include "SpuConvexPenetrationDepthSolver.h"
+#include "SpuCollisionShapes.h"
+
+
+
+#if defined(DEBUG) || defined (_DEBUG)
+#include <stdio.h> //for debug printf
+#ifdef __SPU__
+#include <spu_printf.h>
+#define printf spu_printf
+#endif //__SPU__
+#endif
+
+//relative tolerance of the "are we getting any closer" test in getClosestPoints; must be above the machine epsilon
+#define REL_ERROR2 btScalar(1.0e-6)
+//NOTE(review): both counters below are plain ints bumped with ++ and no locking in this file - confirm that is acceptable
+//temp globals, to improve GJK/EPA/penetration calculations
+int gSpuNumDeepPenetrationChecks = 0;	//incremented before each calcPenDepth call made by getClosestPoints
+int gSpuNumGjkChecks = 0;	//incremented once per getClosestPoints call
+
+
+
+//Stores the shape pointers, shape types, margins and solvers as given; margins are used by default (m_ignoreMargin=false).
+SpuGjkPairDetector::SpuGjkPairDetector(void* objectA,void* objectB,int shapeTypeA, int shapeTypeB, float marginA,float marginB,SpuVoronoiSimplexSolver* simplexSolver, const SpuConvexPenetrationDepthSolver*	penetrationDepthSolver)
+:m_cachedSeparatingAxis(float(0.),float(0.),float(1.)),
+m_penetrationDepthSolver(penetrationDepthSolver),
+m_simplexSolver(simplexSolver),
+m_minkowskiA(objectA), m_minkowskiB(objectB),
+m_shapeTypeA(shapeTypeA), m_shapeTypeB(shapeTypeB),
+m_marginA(marginA), m_marginB(marginB),
+m_ignoreMargin(false),
+m_lastUsedMethod(-1),
+m_curIter(0),				//public debug counter: was left uninitialized until the first getClosestPoints call
+m_degenerateSimplex(0),	//public debug code: same, now readable right after construction
+m_catchDegeneracies(1)
+{
+}
+//GJK closest-point query for the two stored shapes; adds at most one contact point (normal, point on B, distance) to output.
+void SpuGjkPairDetector::getClosestPoints(const SpuClosestPointInput& input,SpuContactResult& output)
+{
+	btScalar distance=btScalar(0.);
+	btVector3	normalInB(btScalar(0.),btScalar(0.),btScalar(0.));
+	btVector3 pointOnA,pointOnB;
+	btTransform	localTransA = input.m_transformA;
+	btTransform localTransB = input.m_transformB;
+	btVector3 positionOffset = (localTransA.getOrigin() + localTransB.getOrigin()) * btScalar(0.5);	//midpoint of the two origins
+	localTransA.getOrigin() -= positionOffset;	//both local transforms are re-centred on that midpoint;
+	localTransB.getOrigin() -= positionOffset;	//the offset is added back when the contact is reported
+
+	btScalar marginA = m_marginA;
+	btScalar marginB = m_marginB;
+
+	gSpuNumGjkChecks++;
+
+	//for CCD we don't use margins
+	if (m_ignoreMargin)
+	{
+		marginA = btScalar(0.);
+		marginB = btScalar(0.);
+	}
+
+	m_curIter = 0;
+	int gGjkMaxIter = 1000;//this is to catch invalid input, perhaps check for #NaN?
+	m_cachedSeparatingAxis.setValue(0,1,0);	//always restarts from (0,1,0), whatever axis was stored before
+
+	bool isValid = false;	//set once a usable contact (normal, points, distance) has been computed
+	bool checkSimplex = false;	//set by the loop exits after which closest points are read from the simplex
+	bool checkPenetration = true;	//cleared only by the early "they don't overlap" exit
+	m_degenerateSimplex = 0;	//debug code recording which degenerate exit was taken (0 = none)
+
+	m_lastUsedMethod = -1;	//debug code: 1/2 set by the simplex branch, 3/4/5 by the penetration solver branch
+
+	{
+		btScalar squaredDistance = SIMD_INFINITY;
+		btScalar delta = btScalar(0.);
+		
+		btScalar margin = marginA + marginB;
+		
+		
+
+		m_simplexSolver->reset();
+		//GJK iteration: every exit below is a break; the flags above record which follow-up step is allowed
+		for ( ; ; )
+		//while (true)
+		{
+
+			btVector3 seperatingAxisInA = (-m_cachedSeparatingAxis)* input.m_transformA.getBasis();
+			btVector3 seperatingAxisInB = m_cachedSeparatingAxis* input.m_transformB.getBasis();
+
+//			btVector3 pInA = m_minkowskiA->localGetSupportingVertexWithoutMargin(seperatingAxisInA);
+//			btVector3 qInB = m_minkowskiB->localGetSupportingVertexWithoutMargin(seperatingAxisInB);
+
+			btVector3 pInA  = localGetSupportingVertexWithoutMargin(m_shapeTypeA, m_minkowskiA, seperatingAxisInA,input.m_convexVertexData[0]);//, &featureIndexA);
+			btVector3 qInB  = localGetSupportingVertexWithoutMargin(m_shapeTypeB, m_minkowskiB, seperatingAxisInB,input.m_convexVertexData[1]);//, &featureIndexB);
+
+			//support points are mapped with the re-centred transforms; w is their difference
+			btPoint3  pWorld = localTransA(pInA);	
+			btPoint3  qWorld = localTransB(qInB);
+			
+			btVector3 w	= pWorld - qWorld;
+			delta = m_cachedSeparatingAxis.dot(w);
+
+			// potential exit, they don't overlap
+			if ((delta > btScalar(0.0)) && (delta * delta > squaredDistance * input.m_maximumDistanceSquared)) 
+			{
+				checkPenetration = false;
+				break;
+			}
+
+			//exit 0: the new point is already in the simplex, or we didn't come any closer
+			if (m_simplexSolver->inSimplex(w))
+			{
+				m_degenerateSimplex = 1;
+				checkSimplex = true;
+				break;
+			}
+			// are we getting any closer ?
+			btScalar f0 = squaredDistance - delta;
+			btScalar f1 = squaredDistance * REL_ERROR2;
+
+			if (f0 <= f1)
+			{
+				if (f0 <= btScalar(0.))
+				{
+					m_degenerateSimplex = 2;
+				}
+				checkSimplex = true;
+				break;
+			}
+			//add current vertex to simplex
+			m_simplexSolver->addVertex(w, pWorld, qWorld);
+
+			//calculate the closest point to the origin (update vector v)
+			if (!m_simplexSolver->closest(m_cachedSeparatingAxis))
+			{
+				m_degenerateSimplex = 3;
+				checkSimplex = true;
+				break;
+			}
+
+			btScalar previousSquaredDistance = squaredDistance;
+			squaredDistance = m_cachedSeparatingAxis.length2();
+			
+			//redundant m_simplexSolver->compute_points(pointOnA, pointOnB);
+
+			//are we getting any closer ?
+			if (previousSquaredDistance - squaredDistance <= SIMD_EPSILON * previousSquaredDistance) 
+			{ 
+				m_simplexSolver->backup_closest(m_cachedSeparatingAxis);
+				checkSimplex = true;
+				break;
+			}
+
+			  //degeneracy, this is typically due to invalid/uninitialized worldtransforms for a btCollisionObject   
+              if (m_curIter++ > gGjkMaxIter)   
+              {   
+                      #if defined(DEBUG) || defined (_DEBUG)   
+
+                              printf("SpuGjkPairDetector maxIter exceeded:%i\n",m_curIter);   
+                              printf("sepAxis=(%f,%f,%f), squaredDistance = %f, shapeTypeA=%i,shapeTypeB=%i\n",   
+                              m_cachedSeparatingAxis.getX(),   
+                              m_cachedSeparatingAxis.getY(),   
+                              m_cachedSeparatingAxis.getZ(),   
+                              squaredDistance,   
+							  m_shapeTypeA,   
+                              m_shapeTypeB);
+
+                      #endif   
+                      break;   //checkSimplex stays false here, so only the penetration solver branch below can produce a contact
+
+              } 
+
+			//a full simplex ends the loop without setting checkSimplex
+			bool check = (!m_simplexSolver->fullSimplex());
+			//bool check = (!m_simplexSolver->fullSimplex() && squaredDistance > SIMD_EPSILON * m_simplexSolver->maxVertex());
+
+			if (!check)
+			{
+				//do we need this backup_closest here ?
+				m_simplexSolver->backup_closest(m_cachedSeparatingAxis);
+				break;
+			}
+		}
+
+		if (checkSimplex)
+		{
+			m_simplexSolver->compute_points(pointOnA, pointOnB);
+			normalInB = pointOnA-pointOnB;
+			btScalar lenSqr = m_cachedSeparatingAxis.length2();
+			//valid normal
+			if (lenSqr < 0.0001)
+			{
+				m_degenerateSimplex = 5;
+			} 
+			if (lenSqr > SIMD_EPSILON*SIMD_EPSILON)
+			{
+				btScalar rlen = btScalar(1.) / btSqrt(lenSqr );
+				normalInB *= rlen; //normalize (NOTE(review): rlen comes from m_cachedSeparatingAxis, not from normalInB itself - confirm they coincide)
+				btScalar s = btSqrt(squaredDistance);
+			
+				btAssert(s > btScalar(0.0));
+				pointOnA -= m_cachedSeparatingAxis * (marginA / s);	//both points are moved along the axis by their margin
+				pointOnB += m_cachedSeparatingAxis * (marginB / s);
+				distance = ((btScalar(1.)/rlen) - margin);	//axis length minus both margins
+				isValid = true;
+				
+				m_lastUsedMethod = 1;
+			} else
+			{
+				m_lastUsedMethod = 2;
+			}
+		}
+
+		bool catchDegeneratePenetrationCase = 
+			(m_catchDegeneracies && m_penetrationDepthSolver && m_degenerateSimplex && ((distance+margin) < 0.01));
+
+		//if (checkPenetration && !isValid)
+		if (checkPenetration && (!isValid || catchDegeneratePenetrationCase ))
+		{
+			//penetration case
+		
+			//if there is no way to handle penetrations, bail out
+			if (m_penetrationDepthSolver)
+			{
+				// Penetration depth case.
+				btVector3 tmpPointOnA,tmpPointOnB;
+				
+				gSpuNumDeepPenetrationChecks++;
+
+				bool isValid2 = m_penetrationDepthSolver->calcPenDepth( 
+					*m_simplexSolver, 
+					m_minkowskiA,m_minkowskiB,
+                    m_shapeTypeA, m_shapeTypeB,
+                    marginA, marginB,
+					localTransA,localTransB,
+					m_cachedSeparatingAxis, tmpPointOnA, tmpPointOnB,
+					0,input.m_stackAlloc,input.m_convexVertexData[0], input.m_convexVertexData[1]
+					);
+
+				if (isValid2)
+				{
+					btVector3 tmpNormalInB = tmpPointOnB-tmpPointOnA;
+					btScalar lenSqr = tmpNormalInB.length2();
+					if (lenSqr > (SIMD_EPSILON*SIMD_EPSILON))
+					{
+						tmpNormalInB /= btSqrt(lenSqr);
+						btScalar distance2 = -(tmpPointOnA-tmpPointOnB).length();
+						//only replace valid penetrations when the result is deeper (check)
+						if (!isValid || (distance2 < distance))
+						{
+							distance = distance2;
+							pointOnA = tmpPointOnA;
+							pointOnB = tmpPointOnB;
+							normalInB = tmpNormalInB;
+							isValid = true;
+							m_lastUsedMethod = 3;
+						} else
+						{
+							//the simplex result is kept: the solver's answer is not deeper
+						}
+					} else
+					{
+						//isValid = false;
+						m_lastUsedMethod = 4;
+					}
+				} else
+				{
+					m_lastUsedMethod = 5;
+				}
+				
+			}
+		}
+	}
+
+	if (isValid)
+	{
+#ifdef __SPU__
+		//spu_printf("distance\n");
+#endif //__SPU__
+
+		//report the point on B with the midpoint offset added back; pointOnA is not passed on
+		output.addContactPoint(
+			normalInB,
+			pointOnB+positionOffset,
+			distance);
+		//printf("gjk add:%f",distance);
+	}
+
+
+}
+
+
+
+
+
--- a/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGjkPairDetector.h
+++ b/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGjkPairDetector.h
@@ -0,0 +1,93 @@
+
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+
+
+#ifndef SPU_GJK_PAIR_DETECTOR_H
+#define SPU_GJK_PAIR_DETECTOR_H
+
+
+
+#include "SpuContactResult.h"
+
+
+#include "SpuVoronoiSimplexSolver.h"
+class SpuConvexPenetrationDepthSolver;
+
+/// SpuGjkPairDetector uses GJK on two shapes held as untyped (void*) pointers plus a shape-type code; see getClosestPoints.
+class SpuGjkPairDetector 
+{
+	//query state: cached separating axis, the two solvers, and per-shape pointer / type / margin
+
+	btVector3	m_cachedSeparatingAxis;
+	const SpuConvexPenetrationDepthSolver*	m_penetrationDepthSolver;
+	SpuVoronoiSimplexSolver* m_simplexSolver;
+	void* m_minkowskiA;
+	void* m_minkowskiB;
+    int m_shapeTypeA;
+    int m_shapeTypeB;
+    float m_marginA;
+    float m_marginB;
+	bool		m_ignoreMargin;
+	
+
+public:
+
+	//some debugging to fix degeneracy problems
+	int			m_lastUsedMethod;
+	int			m_curIter;
+	int			m_degenerateSimplex;
+	int			m_catchDegeneracies;
+
+	//objectA/objectB and both solvers are stored as pointers (nothing is copied); parameter names match the definition
+	SpuGjkPairDetector(void* objectA,void* objectB,int shapeTypeA, int shapeTypeB, float marginA, float marginB, SpuVoronoiSimplexSolver* simplexSolver, const SpuConvexPenetrationDepthSolver*	penetrationDepthSolver);
+	virtual ~SpuGjkPairDetector() {}
+	//adds at most one contact point to output
+	virtual void	getClosestPoints(const SpuClosestPointInput& input,SpuContactResult& output);
+
+	void setMinkowskiA(void* minkA)
+	{
+		m_minkowskiA = minkA;
+	}
+
+	void setMinkowskiB(void* minkB)
+	{
+		m_minkowskiB = minkB;
+	}
+	//NOTE(review): getClosestPoints in the .cpp resets the axis to (0,1,0) on entry - confirm this setter is still meant to have an effect
+	void setCachedSeperatingAxis(const btVector3& seperatingAxis)
+	{
+		m_cachedSeparatingAxis = seperatingAxis;
+	}
+	//takes a pointer-to-const, matching the const member it is stored in (non-const pointers still convert)
+	void	setPenetrationDepthSolver(const SpuConvexPenetrationDepthSolver*	penetrationDepthSolver)
+	{
+		m_penetrationDepthSolver = penetrationDepthSolver;
+	}
+
+	///don't use setIgnoreMargin, it's for Bullet's internal use
+	void	setIgnoreMargin(bool ignoreMargin)
+	{
+		m_ignoreMargin = ignoreMargin;
+	}
+
+
+};
+
+
+
+#endif //SPU_GJK_PAIR_DETECTOR_H
--- a/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuLocalSupport.h
+++ b/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuLocalSupport.h
@@ -0,0 +1,19 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+
+
+
--- a/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuMinkowskiPenetrationDepthSolver.cpp
+++ b/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuMinkowskiPenetrationDepthSolver.cpp
@@ -0,0 +1,347 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "SpuMinkowskiPenetrationDepthSolver.h"
+#include "SpuVoronoiSimplexSolver.h"
+#include "SpuGjkPairDetector.h"
+#include "SpuContactResult.h"
+#include "SpuPreferredPenetrationDirections.h"
+
+
+#include "SpuCollisionShapes.h"
+
+#define NUM_UNITSPHERE_POINTS 42 //number of fixed sample directions spelled out in the initializer below
+static btVector3	sPenetrationDirections[NUM_UNITSPHERE_POINTS+MAX_PREFERRED_PENETRATION_DIRECTIONS*2] = //the extra MAX_PREFERRED_PENETRATION_DIRECTIONS*2 elements have no initializer here; NOTE(review): non-const file-scope array
+{
+btVector3(btScalar(0.000000) , btScalar(-0.000000),btScalar(-1.000000)),
+btVector3(btScalar(0.723608) , btScalar(-0.525725),btScalar(-0.447219)),
+btVector3(btScalar(-0.276388) , btScalar(-0.850649),btScalar(-0.447219)),
+btVector3(btScalar(-0.894426) , btScalar(-0.000000),btScalar(-0.447216)),
+btVector3(btScalar(-0.276388) , btScalar(0.850649),btScalar(-0.447220)),
+btVector3(btScalar(0.723608) , btScalar(0.525725),btScalar(-0.447219)),
+btVector3(btScalar(0.276388) , btScalar(-0.850649),btScalar(0.447220)),
+btVector3(btScalar(-0.723608) , btScalar(-0.525725),btScalar(0.447219)),
+btVector3(btScalar(-0.723608) , btScalar(0.525725),btScalar(0.447219)),
+btVector3(btScalar(0.276388) , btScalar(0.850649),btScalar(0.447219)),
+btVector3(btScalar(0.894426) , btScalar(0.000000),btScalar(0.447216)),
+btVector3(btScalar(-0.000000) , btScalar(0.000000),btScalar(1.000000)),
+btVector3(btScalar(0.425323) , btScalar(-0.309011),btScalar(-0.850654)),
+btVector3(btScalar(-0.162456) , btScalar(-0.499995),btScalar(-0.850654)),
+btVector3(btScalar(0.262869) , btScalar(-0.809012),btScalar(-0.525738)),
+btVector3(btScalar(0.425323) , btScalar(0.309011),btScalar(-0.850654)),
+btVector3(btScalar(0.850648) , btScalar(-0.000000),btScalar(-0.525736)),
+btVector3(btScalar(-0.525730) , btScalar(-0.000000),btScalar(-0.850652)),
+btVector3(btScalar(-0.688190) , btScalar(-0.499997),btScalar(-0.525736)),
+btVector3(btScalar(-0.162456) , btScalar(0.499995),btScalar(-0.850654)),
+btVector3(btScalar(-0.688190) , btScalar(0.499997),btScalar(-0.525736)),
+btVector3(btScalar(0.262869) , btScalar(0.809012),btScalar(-0.525738)),
+btVector3(btScalar(0.951058) , btScalar(0.309013),btScalar(0.000000)),
+btVector3(btScalar(0.951058) , btScalar(-0.309013),btScalar(0.000000)),
+btVector3(btScalar(0.587786) , btScalar(-0.809017),btScalar(0.000000)),
+btVector3(btScalar(0.000000) , btScalar(-1.000000),btScalar(0.000000)),
+btVector3(btScalar(-0.587786) , btScalar(-0.809017),btScalar(0.000000)),
+btVector3(btScalar(-0.951058) , btScalar(-0.309013),btScalar(-0.000000)),
+btVector3(btScalar(-0.951058) , btScalar(0.309013),btScalar(-0.000000)),
+btVector3(btScalar(-0.587786) , btScalar(0.809017),btScalar(-0.000000)),
+btVector3(btScalar(-0.000000) , btScalar(1.000000),btScalar(-0.000000)),
+btVector3(btScalar(0.587786) , btScalar(0.809017),btScalar(-0.000000)),
+btVector3(btScalar(0.688190) , btScalar(-0.499997),btScalar(0.525736)),
+btVector3(btScalar(-0.262869) , btScalar(-0.809012),btScalar(0.525738)),
+btVector3(btScalar(-0.850648) , btScalar(0.000000),btScalar(0.525736)),
+btVector3(btScalar(-0.262869) , btScalar(0.809012),btScalar(0.525738)),
+btVector3(btScalar(0.688190) , btScalar(0.499997),btScalar(0.525736)),
+btVector3(btScalar(0.525730) , btScalar(0.000000),btScalar(0.850652)),
+btVector3(btScalar(0.162456) , btScalar(-0.499995),btScalar(0.850654)),
+btVector3(btScalar(-0.425323) , btScalar(-0.309011),btScalar(0.850654)),
+btVector3(btScalar(-0.425323) , btScalar(0.309011),btScalar(0.850654)),
+btVector3(btScalar(0.162456) , btScalar(0.499995),btScalar(0.850654))
+};
+
+bool SpuMinkowskiPenetrationDepthSolver::calcPenDepth( SpuVoronoiSimplexSolver& simplexSolver,
+	        void* convexA,void* convexB,int shapeTypeA, int shapeTypeB, float marginA, float marginB,
+            btTransform& transA,const btTransform& transB,
+			btVector3& v, btPoint3& pa, btPoint3& pb,
+			class btIDebugDraw* debugDraw,btStackAlloc* stackAlloc,
+			struct SpuConvexPolyhedronVertexData* convexVertexDataA,
+			struct SpuConvexPolyhedronVertexData* convexVertexDataB
+			) const
+{
+	//Brute-force estimate: sample the fixed and preferred directions with the support mappings, displace A along the best one, then run GJK for the witness points.
+	(void)stackAlloc;
+	(void)v;
+	//Returns true with pa/pb written when the follow-up GJK query reported a contact; false when the smallest sampled projection is negative or GJK reported nothing.
+
+	struct btIntermediateResult : public SpuContactResult
+	{
+		//captures the single contact handed to addContactPoint
+		btIntermediateResult():m_depth(btScalar(0.)),m_hasResult(false)
+		{
+		}
+		//NOTE(review): the detector calls addContactPoint through SpuContactResult& - confirm the base declares it virtual
+		btVector3 m_normalOnBInWorld;
+		btVector3 m_pointInWorld;
+		btScalar m_depth;
+		bool	m_hasResult;
+
+		virtual void setShapeIdentifiers(int partId0,int index0,	int partId1,int index1)
+		{
+			(void)partId0;
+			(void)index0;
+			(void)partId1;
+			(void)index1;
+		}
+		void addContactPoint(const btVector3& normalOnBInWorld,const btVector3& pointInWorld,btScalar depth)
+		{
+			m_normalOnBInWorld = normalOnBInWorld;
+			m_pointInWorld = pointInWorld;
+			m_depth = depth;
+			m_hasResult = true;
+		}
+	};
+
+	//just take fixed number of orientation, and sample the penetration depth in that direction
+	btScalar minProj = btScalar(1e30);
+	btVector3 minNorm;
+	btVector3 minVertex;
+	btVector3 minA,minB;
+	btVector3 seperatingAxisInA,seperatingAxisInB;
+	btVector3 pInA,qInB,pWorld,qWorld,w;
+
+//#define USE_BATCHED_SUPPORT 1 //NOTE(review): the disabled branch below calls members on void* convexA/convexB and still writes into sPenetrationDirections
+#ifdef USE_BATCHED_SUPPORT
+
+	btVector3	supportVerticesABatch[NUM_UNITSPHERE_POINTS+MAX_PREFERRED_PENETRATION_DIRECTIONS*2];
+	btVector3	supportVerticesBBatch[NUM_UNITSPHERE_POINTS+MAX_PREFERRED_PENETRATION_DIRECTIONS*2];
+	btVector3	seperatingAxisInABatch[NUM_UNITSPHERE_POINTS+MAX_PREFERRED_PENETRATION_DIRECTIONS*2];
+	btVector3	seperatingAxisInBBatch[NUM_UNITSPHERE_POINTS+MAX_PREFERRED_PENETRATION_DIRECTIONS*2];
+	int i;
+
+	int numSampleDirections = NUM_UNITSPHERE_POINTS;
+
+	for (i=0;i<numSampleDirections;i++)
+	{
+		const btVector3& norm = sPenetrationDirections[i];
+		seperatingAxisInABatch[i] =  (-norm) * transA.getBasis() ;
+		seperatingAxisInBBatch[i] =  norm   * transB.getBasis() ;
+	}
+
+	{
+		int numPDA = convexA->getNumPreferredPenetrationDirections();
+		if (numPDA)
+		{
+			for (int i=0;i<numPDA;i++)
+			{
+				btVector3 norm;
+				convexA->getPreferredPenetrationDirection(i,norm);
+				norm  = transA.getBasis() * norm;
+				sPenetrationDirections[numSampleDirections] = norm;
+				seperatingAxisInABatch[numSampleDirections] = (-norm) * transA.getBasis();
+				seperatingAxisInBBatch[numSampleDirections] = norm * transB.getBasis();
+				numSampleDirections++;
+			}
+		}
+	}
+
+	{
+		int numPDB = convexB->getNumPreferredPenetrationDirections();
+		if (numPDB)
+		{
+			for (int i=0;i<numPDB;i++)
+			{
+				btVector3 norm;
+				convexB->getPreferredPenetrationDirection(i,norm);
+				norm  = transB.getBasis() * norm;
+				sPenetrationDirections[numSampleDirections] = norm;
+				seperatingAxisInABatch[numSampleDirections] = (-norm) * transA.getBasis();
+				seperatingAxisInBBatch[numSampleDirections] = norm * transB.getBasis();
+				numSampleDirections++;
+			}
+		}
+	}
+
+
+
+	convexA->batchedUnitVectorGetSupportingVertexWithoutMargin(seperatingAxisInABatch,supportVerticesABatch,numSampleDirections);
+	convexB->batchedUnitVectorGetSupportingVertexWithoutMargin(seperatingAxisInBBatch,supportVerticesBBatch,numSampleDirections);
+
+	for (i=0;i<numSampleDirections;i++)
+	{
+		const btVector3& norm = sPenetrationDirections[i];
+		seperatingAxisInA = seperatingAxisInABatch[i];
+		seperatingAxisInB = seperatingAxisInBBatch[i];
+
+		pInA = supportVerticesABatch[i];
+		qInB = supportVerticesBBatch[i];
+
+		pWorld = transA(pInA);	
+		qWorld = transB(qInB);
+		w	= qWorld - pWorld;
+		btScalar delta = norm.dot(w);
+		//find smallest delta
+		if (delta < minProj)
+		{
+			minProj = delta;
+			minNorm = norm;
+			minA = pWorld;
+			minB = qWorld;
+		}
+	}	
+#else
+
+	int numSampleDirections = NUM_UNITSPHERE_POINTS;
+	btVector3 preferredDirections[MAX_PREFERRED_PENETRATION_DIRECTIONS*2];	//per-call storage: this const method no longer writes into the shared static table
+///this is necessary, otherwise the normal is not correct, and sphere will rotate forever on a sloped triangle mesh
+#define DO_PREFERRED_DIRECTIONS 1
+#ifdef DO_PREFERRED_DIRECTIONS
+	{
+		int numPDA = spuGetNumPreferredPenetrationDirections(shapeTypeA,convexA);
+		if (numPDA)
+		{
+			for (int i=0;i<numPDA;i++)
+			{
+				btVector3 norm;
+				spuGetPreferredPenetrationDirection(shapeTypeA,convexA,i,norm);
+				norm  = transA.getBasis() * norm;
+				preferredDirections[numSampleDirections-NUM_UNITSPHERE_POINTS] = norm;
+				numSampleDirections++;
+			}
+		}
+	}
+
+	{
+		int numPDB = spuGetNumPreferredPenetrationDirections(shapeTypeB,convexB);
+		if (numPDB)
+		{
+			for (int i=0;i<numPDB;i++)
+			{
+				btVector3 norm;
+				spuGetPreferredPenetrationDirection(shapeTypeB,convexB,i,norm);
+				norm  = transB.getBasis() * norm;
+				preferredDirections[numSampleDirections-NUM_UNITSPHERE_POINTS] = norm;
+				numSampleDirections++;
+			}
+		}
+	}
+#endif //DO_PREFERRED_DIRECTIONS
+
+	for (int i=0;i<numSampleDirections;i++)
+	{
+		const btVector3& norm = (i < NUM_UNITSPHERE_POINTS) ? sPenetrationDirections[i] : preferredDirections[i-NUM_UNITSPHERE_POINTS];	//fixed samples first, then this call's preferred directions
+		seperatingAxisInA = (-norm)* transA.getBasis();
+		seperatingAxisInB = norm* transB.getBasis();
+
+		pInA = localGetSupportingVertexWithoutMargin(shapeTypeA, convexA, seperatingAxisInA,convexVertexDataA);//, NULL);
+		qInB = localGetSupportingVertexWithoutMargin(shapeTypeB, convexB, seperatingAxisInB,convexVertexDataB);//, NULL);
+
+	//	pInA = convexA->localGetSupportingVertexWithoutMargin(seperatingAxisInA);
+	//	qInB = convexB->localGetSupportingVertexWithoutMargin(seperatingAxisInB);
+
+		pWorld = transA(pInA);	
+		qWorld = transB(qInB);
+		w	= qWorld - pWorld;
+		btScalar delta = norm.dot(w);
+		//find smallest delta
+		if (delta < minProj)
+		{
+			minProj = delta;
+			minNorm = norm;
+			minA = pWorld;
+			minB = qWorld;
+		}
+	}
+#endif //USE_BATCHED_SUPPORT
+
+	//add the margins
+
+	minA += minNorm*marginA;
+	minB -= minNorm*marginB;
+	//no penetration
+	if (minProj < btScalar(0.))
+		return false;
+
+	minProj += (marginA + marginB) + btScalar(1.00);	//displacement distance: sampled projection plus both margins plus a fixed 1.0
+
+
+
+
+
+//#define DEBUG_DRAW 1
+#ifdef DEBUG_DRAW
+	if (debugDraw)
+	{
+		btVector3 color(0,1,0);
+		debugDraw->drawLine(minA,minB,color);
+		color = btVector3 (1,1,1);
+		btVector3 vec = minB-minA;
+		btScalar prj2 = minNorm.dot(vec);
+		debugDraw->drawLine(minA,minA+(minNorm*minProj),color);
+
+	}
+#endif //DEBUG_DRAW
+
+	
+
+	SpuGjkPairDetector gjkdet(convexA,convexB,shapeTypeA,shapeTypeB,marginA,marginB,&simplexSolver,0);
+
+	btScalar offsetDist = minProj;
+	btVector3 offset = minNorm * offsetDist;
+	
+
+	SpuClosestPointInput input;
+	input.m_convexVertexData[0] = convexVertexDataA;
+	input.m_convexVertexData[1] = convexVertexDataB;
+	btVector3 newOrg = transA.getOrigin() + offset;
+
+	btTransform displacedTrans = transA;
+	displacedTrans.setOrigin(newOrg);
+
+	input.m_transformA = displacedTrans;
+	input.m_transformB = transB;
+	input.m_maximumDistanceSquared = btScalar(1e30);//minProj;
+	
+	btIntermediateResult res;
+	gjkdet.getClosestPoints(input,res);
+
+	//res.m_depth is only read below once m_hasResult confirms that the detector reported a contact
+
+
+	//the penetration depth is over-estimated, relax it
+	btScalar penetration_relaxation= btScalar(1.);
+	minNorm*=penetration_relaxation;
+
+	if (res.m_hasResult)
+	{
+		const btScalar correctedMinNorm = minProj - res.m_depth;
+		pa = res.m_pointInWorld - minNorm * correctedMinNorm;
+		pb = res.m_pointInWorld;
+		
+#ifdef DEBUG_DRAW
+		if (debugDraw)
+		{
+			btVector3 color(1,0,0);
+			debugDraw->drawLine(pa,pb,color);
+		}
+#endif//DEBUG_DRAW
+
+
+	} else {
+		// could not seperate shapes
+		btAssert (false);
+	}
+	return res.m_hasResult;
+}
+
+
+
--- a/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuMinkowskiPenetrationDepthSolver.h
+++ b/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuMinkowskiPenetrationDepthSolver.h
@@ -0,0 +1,47 @@
+
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef SPU_MINKOWSKI_PENETRATION_DEPTH_SOLVER_H
+#define SPU_MINKOWSKI_PENETRATION_DEPTH_SOLVER_H
+//the guard carries the SPU_ prefix (as SPU_GJK_PAIR_DETECTOR_H does) so it cannot clash with an un-prefixed MINKOWSKI_PENETRATION_DEPTH_SOLVER_H guard
+
+#include "SpuConvexPenetrationDepthSolver.h"
+
+class btStackAlloc;
+class btIDebugDraw;
+class SpuVoronoiSimplexSolver;
+
+///MinkowskiPenetrationDepthSolver implements bruteforce penetration depth estimation.
+///Implementation is based on sampling the depth using support mapping, and using GJK step to get the witness points.
+class SpuMinkowskiPenetrationDepthSolver : public SpuConvexPenetrationDepthSolver
+{
+public:
+	///Returns true and writes the witness points pa/pb when a penetration result was found; v and stackAlloc are not used by the implementation.
+	virtual bool calcPenDepth( SpuVoronoiSimplexSolver& simplexSolver,
+	        void* convexA,void* convexB,int shapeTypeA, int shapeTypeB, float marginA, float marginB,
+            btTransform& transA,const btTransform& transB,
+			btVector3& v, btPoint3& pa, btPoint3& pb,
+			class btIDebugDraw* debugDraw,btStackAlloc* stackAlloc,
+			struct SpuConvexPolyhedronVertexData* convexVertexDataA,
+			struct SpuConvexPolyhedronVertexData* convexVertexDataB
+			) const;
+
+
+};
+
+
+#endif //SPU_MINKOWSKI_PENETRATION_DEPTH_SOLVER_H
+
--- a/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuPreferredPenetrationDirections.h
+++ b/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuPreferredPenetrationDirections.h
@@ -0,0 +1,70 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef _SPU_PREFERRED_PENETRATION_DIRECTIONS_H
+#define _SPU_PREFERRED_PENETRATION_DIRECTIONS_H
+
+
+#include "BulletCollision/BroadphaseCollision/btBroadphaseProxy.h"
+
+///Number of preferred penetration directions for a shape type: 2 for triangles, 0 otherwise. inline because it is defined in a header.
+inline int		spuGetNumPreferredPenetrationDirections(int shapeType, void* shape)
+{
+	switch (shapeType)
+    {
+		case TRIANGLE_SHAPE_PROXYTYPE:
+		{
+			//spu_printf("2\n");
+			return 2;
+		}
+		default:
+			{
+#if __ASSERT
+        spu_printf("spuGetNumPreferredPenetrationDirections() - Unsupported bound type: %d.\n", shapeType);
+#endif // __ASSERT
+			}
+	}
+
+	return 0;	
+}	
+
+///Writes preferred direction number index into penetrationVector. inline because it is defined in a header.
+inline void	spuGetPreferredPenetrationDirection(int shapeType, void* shape, int index, btVector3& penetrationVector)
+{
+	//triangle case: shape is read as three consecutive btVector3 vertices; index 0 gives the normalized cross product, any other index its negation
+	switch (shapeType)
+    {
+		case TRIANGLE_SHAPE_PROXYTYPE:
+		{
+			btVector3* vertices = (btVector3*)shape;
+			///calcNormal
+			penetrationVector = (vertices[1]-vertices[0]).cross(vertices[2]-vertices[0]);
+			penetrationVector.normalize();
+			if (index)
+				penetrationVector *= btScalar(-1.);
+			break;
+		}
+		default:
+			{
+					//penetrationVector is left untouched for every other shape type
+#if __ASSERT
+        spu_printf("spuGetPreferredPenetrationDirection() - Unsupported bound type: %d.\n", shapeType);
+#endif // __ASSERT
+			}
+	}
+		
+}
+
+#endif //_SPU_PREFERRED_PENETRATION_DIRECTIONS_H
--- a/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuVoronoiSimplexSolver.cpp
+++ b/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuVoronoiSimplexSolver.cpp
@@ -0,0 +1,606 @@
+
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+	
+	Elsevier CDROM license agreements grants nonexclusive license to use the software
+	for any purpose, commercial or non-commercial as long as the following credit is included
+	identifying the original source of the software:
+
+	Parts of the source are "from the book Real-Time Collision Detection by
+	Christer Ericson, published by Morgan Kaufmann Publishers,
+	(c) 2005 Elsevier Inc."
+		
+*/
+
+
+#include "SpuVoronoiSimplexSolver.h"
+#include <assert.h>
+#include <stdio.h>
+
+#define VERTA  0
+#define VERTB  1
+#define VERTC  2
+#define VERTD  3
+
+#define CATCH_DEGENERATE_TETRAHEDRON 1
+/// Drops the vertex stored at 'index' by moving the last stored vertex into its slot
+/// and shrinking the vertex count by one (vertex order is not preserved).
+void	SpuVoronoiSimplexSolver::removeVertex(int index)
+{
+	assert(m_numVertices>0);
+
+	// After the decrement, 'last' is the slot of the vertex that fills the hole.
+	const int last = --m_numVertices;
+
+	m_simplexVectorW[index] = m_simplexVectorW[last];
+	m_simplexPointsP[index] = m_simplexPointsP[last];
+	m_simplexPointsQ[index] = m_simplexPointsQ[last];
+}
+
+/// Removes every vertex whose usage flag is cleared in 'usedVerts'.
+/// Slots are visited from D down to A; the vertex count is re-read before each step
+/// because removeVertex() shrinks it.
+void	SpuVoronoiSimplexSolver::reduceVertices (const SpuUsageBitfield& usedVerts)
+{
+	if (!usedVerts.usedVertexD)
+	{
+		if (m_numVertices >= 4)
+			removeVertex(3);
+	}
+
+	if (!usedVerts.usedVertexC)
+	{
+		if (m_numVertices >= 3)
+			removeVertex(2);
+	}
+
+	if (!usedVerts.usedVertexB)
+	{
+		if (m_numVertices >= 2)
+			removeVertex(1);
+	}
+
+	if (!usedVerts.usedVertexA)
+	{
+		if (m_numVertices >= 1)
+			removeVertex(0);
+	}
+}
+
+
+
+
+
+/// Empties the simplex and invalidates all cached results.
+/// m_lastW is set to (1e30, 1e30, 1e30).
+void SpuVoronoiSimplexSolver::reset()
+{
+	const btScalar farAway = btScalar(1e30);
+
+	m_numVertices = 0;
+	m_lastW = btVector3(farAway,farAway,farAway);
+
+	m_cachedBC.reset();
+	m_cachedValidClosest = false;
+	m_needsUpdate = true;
+}
+
+
+
+/// Appends a vertex to the simplex: 'w' goes into m_simplexVectorW, 'p' and 'q' into
+/// m_simplexPointsP / m_simplexPointsQ. Also remembers 'w' as m_lastW and marks the
+/// cached closest-point data as stale. No bounds check is performed on the vertex count.
+void SpuVoronoiSimplexSolver::addVertex(const btVector3& w, const btPoint3& p, const btPoint3& q)
+{
+	m_needsUpdate = true;
+	m_lastW = w;
+
+	const int slot = m_numVertices;
+	m_simplexVectorW[slot] = w;
+	m_simplexPointsP[slot] = p;
+	m_simplexPointsQ[slot] = q;
+
+	m_numVertices = slot + 1;
+}
+
+/// Recomputes the cached closest-point data if the simplex changed since the last call.
+///
+/// When m_needsUpdate is set, the point of the current simplex (built from the vectors in
+/// m_simplexVectorW) closest to the origin is computed, and the following are refreshed:
+///   - m_cachedBC  : barycentric coordinates and used-vertex flags
+///   - m_cachedP1 / m_cachedP2 : the P and Q points blended with those coordinates
+///   - m_cachedV   : m_cachedP1 - m_cachedP2
+///   - m_cachedValidClosest
+/// Vertices that do not contribute to the closest feature are removed from the simplex.
+/// When m_needsUpdate is clear, nothing is recomputed.
+///
+/// @return m_cachedValidClosest
+bool	SpuVoronoiSimplexSolver::updateClosestVectorAndPoints()
+{
+	if (m_needsUpdate)
+	{
+		m_cachedBC.reset();
+
+		m_needsUpdate = false;
+
+		switch (numVertices())
+		{
+		case 0:
+			m_cachedValidClosest = false;
+			break;
+		case 1:
+			{
+				// A single point: it is the closest feature by definition.
+				m_cachedP1 = m_simplexPointsP[0];
+				m_cachedP2 = m_simplexPointsQ[0];
+				m_cachedV = m_cachedP1-m_cachedP2; //== m_simplexVectorW[0]
+				m_cachedBC.reset();
+				m_cachedBC.setBarycentricCoordinates(btScalar(1.),btScalar(0.),btScalar(0.),btScalar(0.));
+				m_cachedValidClosest = m_cachedBC.isValid();
+				break;
+			}
+		case 2:
+			{
+				//closest point origin from line segment
+				const btVector3& from = m_simplexVectorW[0];
+				const btVector3& to = m_simplexVectorW[1];
+
+				btVector3 p (btScalar(0.),btScalar(0.),btScalar(0.));
+				btVector3 diff = p - from;
+				btVector3 v = to - from;
+				btScalar t = v.dot(diff);
+
+				if (t > 0) {
+					btScalar dotVV = v.dot(v);
+					if (t < dotVV) {
+						// Projection falls strictly inside the segment: both ends are used.
+						t /= dotVV;
+						diff -= t*v;
+						m_cachedBC.m_usedVertices.usedVertexA = true;
+						m_cachedBC.m_usedVertices.usedVertexB = true;
+					} else {
+						t = 1;
+						diff -= v;
+						//reduce to 1 point
+						m_cachedBC.m_usedVertices.usedVertexB = true;
+					}
+				} else
+				{
+					t = 0;
+					//reduce to 1 point
+					m_cachedBC.m_usedVertices.usedVertexA = true;
+				}
+				m_cachedBC.setBarycentricCoordinates(1-t,t);
+
+				// The blend must happen before reduceVertices(), which may overwrite slot 0.
+				m_cachedP1 = m_simplexPointsP[0] + t * (m_simplexPointsP[1] - m_simplexPointsP[0]);
+				m_cachedP2 = m_simplexPointsQ[0] + t * (m_simplexPointsQ[1] - m_simplexPointsQ[0]);
+				m_cachedV = m_cachedP1 - m_cachedP2;
+
+				reduceVertices(m_cachedBC.m_usedVertices);
+
+				m_cachedValidClosest = m_cachedBC.isValid();
+				break;
+			}
+		case 3:
+			{
+				//closest point origin from triangle
+				btVector3 p (btScalar(0.),btScalar(0.),btScalar(0.));
+
+				const btVector3& a = m_simplexVectorW[0];
+				const btVector3& b = m_simplexVectorW[1];
+				const btVector3& c = m_simplexVectorW[2];
+
+				closestPtPointTriangle(p,a,b,c,m_cachedBC);
+
+				// Only slots 0..2 hold valid vertices here. Slot 3 is stale or never written,
+				// and although its barycentric weight is 0, multiplying a NaN/Inf left in that
+				// slot by 0 would still poison the result, so it is not read.
+				m_cachedP1 = m_simplexPointsP[0] * m_cachedBC.m_barycentricCoords[0] +
+					m_simplexPointsP[1] * m_cachedBC.m_barycentricCoords[1] +
+					m_simplexPointsP[2] * m_cachedBC.m_barycentricCoords[2];
+
+				m_cachedP2 = m_simplexPointsQ[0] * m_cachedBC.m_barycentricCoords[0] +
+					m_simplexPointsQ[1] * m_cachedBC.m_barycentricCoords[1] +
+					m_simplexPointsQ[2] * m_cachedBC.m_barycentricCoords[2];
+
+				m_cachedV = m_cachedP1-m_cachedP2;
+
+				reduceVertices (m_cachedBC.m_usedVertices);
+				m_cachedValidClosest =  m_cachedBC.isValid();
+
+				break;
+			}
+		case 4:
+			{
+				//closest point origin from tetrahedron
+				btVector3 p (btScalar(0.),btScalar(0.),btScalar(0.));
+
+				const btVector3& a = m_simplexVectorW[0];
+				const btVector3& b = m_simplexVectorW[1];
+				const btVector3& c = m_simplexVectorW[2];
+				const btVector3& d = m_simplexVectorW[3];
+
+				bool hasSeperation = closestPtPointTetrahedron(p,a,b,c,d,m_cachedBC);
+
+				if (hasSeperation)
+				{
+					m_cachedP1 = m_simplexPointsP[0] * m_cachedBC.m_barycentricCoords[0] +
+						m_simplexPointsP[1] * m_cachedBC.m_barycentricCoords[1] +
+						m_simplexPointsP[2] * m_cachedBC.m_barycentricCoords[2] +
+						m_simplexPointsP[3] * m_cachedBC.m_barycentricCoords[3];
+
+					m_cachedP2 = m_simplexPointsQ[0] * m_cachedBC.m_barycentricCoords[0] +
+						m_simplexPointsQ[1] * m_cachedBC.m_barycentricCoords[1] +
+						m_simplexPointsQ[2] * m_cachedBC.m_barycentricCoords[2] +
+						m_simplexPointsQ[3] * m_cachedBC.m_barycentricCoords[3];
+
+					m_cachedV = m_cachedP1-m_cachedP2;
+					reduceVertices (m_cachedBC.m_usedVertices);
+				} else
+				{
+//					printf("sub distance got penetration\n");
+
+					if (m_cachedBC.m_degenerate)
+					{
+						m_cachedValidClosest = false;
+					} else
+					{
+						m_cachedValidClosest = true;
+						//degenerate case == false, penetration = true + zero
+						m_cachedV.setValue(btScalar(0.),btScalar(0.),btScalar(0.));
+					}
+					break;
+				}
+
+				m_cachedValidClosest = m_cachedBC.isValid();
+				break;
+			}
+		default:
+			{
+				m_cachedValidClosest = false;
+			}
+		};
+	}
+
+	return m_cachedValidClosest;
+
+}
+
+/// Refreshes the cached closest-point data and copies the cached separating vector
+/// (m_cachedV) into 'v'. Returns the validity flag reported by
+/// updateClosestVectorAndPoints(); 'v' is written regardless of that flag.
+bool SpuVoronoiSimplexSolver::closest(btVector3& v)
+{
+	const bool valid = updateClosestVectorAndPoints();
+	v = m_cachedV;
+	return valid;
+}
+
+
+
+/// Returns the largest squared length among the stored simplex vectors
+/// (m_simplexVectorW), or 0 when the simplex is empty.
+btScalar SpuVoronoiSimplexSolver::maxVertex()
+{
+	btScalar largestLen2 = btScalar(0.);
+	const int count = numVertices();
+
+	for (int idx = 0; idx < count; ++idx)
+	{
+		const btScalar len2 = m_simplexVectorW[idx].length2();
+		if (largestLen2 < len2)
+		{
+			largestLen2 = len2;
+		}
+	}
+
+	return largestLen2;
+}
+
+
+
+/// Copies the current simplex into the caller's buffers: P points into pBuf, Q points
+/// into qBuf and W vectors into yBuf. Each buffer must have room for numVertices()
+/// entries. Returns the number of vertices copied.
+int SpuVoronoiSimplexSolver::getSimplex(btPoint3 *pBuf, btPoint3 *qBuf, btVector3 *yBuf) const
+{
+	const int count = numVertices();
+
+	for (int idx = 0; idx < count; ++idx)
+	{
+		pBuf[idx] = m_simplexPointsP[idx];
+		qBuf[idx] = m_simplexPointsQ[idx];
+		yBuf[idx] = m_simplexVectorW[idx];
+	}
+
+	return count;
+}
+
+
+
+
+/// Reports whether 'w' compares equal to one of the stored simplex vectors or to the
+/// most recently added vector (m_lastW, which may already have been removed from the
+/// simplex by a reduction step).
+bool SpuVoronoiSimplexSolver::inSimplex(const btVector3& w)
+{
+	const int count = numVertices();
+
+	//w is in the current (reduced) simplex
+	for (int idx = 0; idx < count; ++idx)
+	{
+		if (m_simplexVectorW[idx] == w)
+		{
+			return true;
+		}
+	}
+
+	//check in case lastW is already removed
+	return (w == m_lastW);
+}
+
+/// Copies the cached separating vector (m_cachedV) into 'v' without recomputing it;
+/// unlike closest(), updateClosestVectorAndPoints() is not called here.
+void SpuVoronoiSimplexSolver::backup_closest(btVector3& v) 
+{
+	v = m_cachedV;
+}
+
+
+/// True when the simplex currently holds no vertices.
+bool SpuVoronoiSimplexSolver::emptySimplex() const 
+{
+	const bool hasNoVertices = (m_numVertices == 0);
+	return hasNoVertices;
+}
+
+/// Refreshes the cached closest-point data and returns the cached closest points:
+/// p1 receives m_cachedP1 and p2 receives m_cachedP2. The validity flag returned by
+/// updateClosestVectorAndPoints() is deliberately ignored here.
+void SpuVoronoiSimplexSolver::compute_points(btPoint3& p1, btPoint3& p2) 
+{
+	(void)updateClosestVectorAndPoints();
+
+	p2 = m_cachedP2;
+	p1 = m_cachedP1;
+}
+
+
+
+
+/// Computes the point of triangle (a,b,c) closest to point p.
+///
+/// The regions are tested in a fixed order: vertex A, vertex B, edge AB, vertex C,
+/// edge AC, edge BC, and finally the face interior. For the region that matches, 'result' receives:
+///   - m_closestPointOnSimplex : the closest point
+///   - m_usedVertices          : flags for the triangle vertices that support that point
+///   - barycentric coordinates (the fourth coordinate is left at its default of 0)
+///
+/// @return always true (every code path returns true)
+bool	SpuVoronoiSimplexSolver::closestPtPointTriangle(const btPoint3& p, const btPoint3& a, const btPoint3& b, const btPoint3& c,SpuSubSimplexClosestResult& result)
+{
+	// Start from "no vertex used"; each branch below sets only the flags it needs.
+	result.m_usedVertices.reset();
+
+    // Check if P in vertex region outside A
+    btVector3 ab = b - a;
+    btVector3 ac = c - a;
+    btVector3 ap = p - a;
+    btScalar d1 = ab.dot(ap);
+    btScalar d2 = ac.dot(ap);
+    if (d1 <= btScalar(0.0) && d2 <= btScalar(0.0)) 
+	{
+		result.m_closestPointOnSimplex = a;
+		result.m_usedVertices.usedVertexA = true;
+		result.setBarycentricCoordinates(1,0,0);
+		return true;// a; // barycentric coordinates (1,0,0)
+	}
+
+    // Check if P in vertex region outside B
+    btVector3 bp = p - b;
+    btScalar d3 = ab.dot(bp);
+    btScalar d4 = ac.dot(bp);
+    if (d3 >= btScalar(0.0) && d4 <= d3) 
+	{
+		result.m_closestPointOnSimplex = b;
+		result.m_usedVertices.usedVertexB = true;
+		result.setBarycentricCoordinates(0,1,0);
+
+		return true; // b; // barycentric coordinates (0,1,0)
+	}
+    // Check if P in edge region of AB, if so return projection of P onto AB
+    btScalar vc = d1*d4 - d3*d2;
+    if (vc <= btScalar(0.0) && d1 >= btScalar(0.0) && d3 <= btScalar(0.0)) {
+        btScalar v = d1 / (d1 - d3);
+		result.m_closestPointOnSimplex = a + v * ab;
+		result.m_usedVertices.usedVertexA = true;
+		result.m_usedVertices.usedVertexB = true;
+		result.setBarycentricCoordinates(1-v,v,0);
+		return true;
+        //return a + v * ab; // barycentric coordinates (1-v,v,0)
+    }
+
+    // Check if P in vertex region outside C
+    btVector3 cp = p - c;
+    btScalar d5 = ab.dot(cp);
+    btScalar d6 = ac.dot(cp);
+    if (d6 >= btScalar(0.0) && d5 <= d6) 
+	{
+		result.m_closestPointOnSimplex = c;
+		result.m_usedVertices.usedVertexC = true;
+		result.setBarycentricCoordinates(0,0,1);
+		return true;//c; // barycentric coordinates (0,0,1)
+	}
+
+    // Check if P in edge region of AC, if so return projection of P onto AC
+    btScalar vb = d5*d2 - d1*d6;
+    if (vb <= btScalar(0.0) && d2 >= btScalar(0.0) && d6 <= btScalar(0.0)) {
+        btScalar w = d2 / (d2 - d6);
+		result.m_closestPointOnSimplex = a + w * ac;
+		result.m_usedVertices.usedVertexA = true;
+		result.m_usedVertices.usedVertexC = true;
+		result.setBarycentricCoordinates(1-w,0,w);
+		return true;
+        //return a + w * ac; // barycentric coordinates (1-w,0,w)
+    }
+
+    // Check if P in edge region of BC, if so return projection of P onto BC
+    btScalar va = d3*d6 - d5*d4;
+    if (va <= btScalar(0.0) && (d4 - d3) >= btScalar(0.0) && (d5 - d6) >= btScalar(0.0)) {
+        btScalar w = (d4 - d3) / ((d4 - d3) + (d5 - d6));
+		
+		result.m_closestPointOnSimplex = b + w * (c - b);
+		result.m_usedVertices.usedVertexB = true;
+		result.m_usedVertices.usedVertexC = true;
+		result.setBarycentricCoordinates(0,1-w,w);
+		return true;		
+       // return b + w * (c - b); // barycentric coordinates (0,1-w,w)
+    }
+
+    // P inside face region. Compute Q through its barycentric coordinates (u,v,w)
+    // Despite its name, 'denom' holds the reciprocal 1/(va+vb+vc).
+    // NOTE(review): there is no guard for va+vb+vc == 0 - confirm a degenerate triangle cannot reach this point.
+    btScalar denom = btScalar(1.0) / (va + vb + vc);
+    btScalar v = vb * denom;
+    btScalar w = vc * denom;
+    
+	result.m_closestPointOnSimplex = a + ab * v + ac * w;
+	result.m_usedVertices.usedVertexA = true;
+	result.m_usedVertices.usedVertexB = true;
+	result.m_usedVertices.usedVertexC = true;
+	result.setBarycentricCoordinates(1-v-w,v,w);
+	
+	return true;
+//	return a + ab * v + ac * w; // = u*a + v*b + w*c, u = va * denom = btScalar(1.0) - v - w
+
+}
+
+
+
+
+
+/// Tests whether points p and d lie on opposite sides of the plane through a, b, c.
+///
+/// @return 1 if p and d are on opposite sides, 0 if they are not, and (only when
+///         CATCH_DEGENERATE_TETRAHEDRON is defined) -1 if d is so close to the plane
+///         that the tetrahedron a,b,c,d is treated as degenerate.
+int SpuVoronoiSimplexSolver::pointOutsideOfPlane(const btPoint3& p, const btPoint3& a, const btPoint3& b, const btPoint3& c, const btPoint3& d)
+{
+	const btVector3 planeNormal = (b-a).cross(c-a);
+
+	const btScalar sideOfP = (p - a).dot(planeNormal); // [AP AB AC]
+	const btScalar sideOfD = (d - a).dot(planeNormal); // [AD AB AC]
+
+#ifdef CATCH_DEGENERATE_TETRAHEDRON
+	// The tolerance depends on the scalar precision the library was built with.
+#ifdef BT_USE_DOUBLE_PRECISION
+	const btScalar degenerateTolerance = btScalar(1e-8);
+#else
+	const btScalar degenerateTolerance = btScalar(1e-4);
+#endif
+	if (sideOfD * sideOfD < (degenerateTolerance * degenerateTolerance))
+	{
+		// affine dependent/degenerate
+		return -1;
+	}
+#endif
+
+	// Points on opposite sides if expression signs are opposite
+	return (sideOfP * sideOfD < btScalar(0.)) ? 1 : 0;
+}
+
+
+/// Computes the point of tetrahedron (a,b,c,d) closest to point p.
+///
+/// p is first classified against the four face planes via pointOutsideOfPlane():
+///   - if any test reports a degenerate tetrahedron, finalResult.m_degenerate is set and
+///     false is returned;
+///   - if p is outside none of the faces (p is inside the tetrahedron), false is returned
+///     with all four vertices flagged as used and m_closestPointOnSimplex == p;
+///   - otherwise every face that p lies outside of is tested with closestPtPointTriangle()
+///     and the nearest result is kept. The triangle's vertex flags and barycentric
+///     coordinates are remapped to the tetrahedron's A,B,C,D ordering.
+///
+/// @return true when p is outside the tetrahedron (a closest face feature was stored in
+///         finalResult), false for the inside and degenerate cases described above.
+bool	SpuVoronoiSimplexSolver::closestPtPointTetrahedron(const btPoint3& p, const btPoint3& a, const btPoint3& b, const btPoint3& c, const btPoint3& d, SpuSubSimplexClosestResult& finalResult)
+{
+	SpuSubSimplexClosestResult tempResult;
+
+	// Start out assuming point inside all halfspaces, so closest to itself
+	finalResult.m_closestPointOnSimplex = p;
+	finalResult.m_usedVertices.reset();
+	finalResult.m_usedVertices.usedVertexA = true;
+	finalResult.m_usedVertices.usedVertexB = true;
+	finalResult.m_usedVertices.usedVertexC = true;
+	finalResult.m_usedVertices.usedVertexD = true;
+
+	int pointOutsideABC = pointOutsideOfPlane(p, a, b, c, d);
+	int pointOutsideACD = pointOutsideOfPlane(p, a, c, d, b);
+	int pointOutsideADB = pointOutsideOfPlane(p, a, d, b, c);
+	int pointOutsideBDC = pointOutsideOfPlane(p, b, d, c, a);
+
+	// A negative value is pointOutsideOfPlane()'s "degenerate tetrahedron" marker.
+	if (pointOutsideABC < 0 || pointOutsideACD < 0 || pointOutsideADB < 0 || pointOutsideBDC < 0)
+	{
+		finalResult.m_degenerate = true;
+		return false;
+	}
+
+	// Inside all four halfspaces: p is contained in the tetrahedron.
+	if (!pointOutsideABC && !pointOutsideACD && !pointOutsideADB && !pointOutsideBDC)
+	{
+		return false;
+	}
+
+	btScalar bestSqDist = FLT_MAX;
+
+	// If point outside face abc then compute closest point on abc
+	if (pointOutsideABC)
+	{
+		closestPtPointTriangle(p, a, b, c,tempResult);
+		btPoint3 q = tempResult.m_closestPointOnSimplex;
+
+		btScalar sqDist = (q - p).dot( q - p);
+		// Update best closest point if (squared) distance is less than current best
+		if (sqDist < bestSqDist)
+		{
+			bestSqDist = sqDist;
+			finalResult.m_closestPointOnSimplex = q;
+			// Triangle (A,B,C) maps onto tetrahedron vertices (A,B,C).
+			finalResult.m_usedVertices.reset();
+			finalResult.m_usedVertices.usedVertexA = tempResult.m_usedVertices.usedVertexA;
+			finalResult.m_usedVertices.usedVertexB = tempResult.m_usedVertices.usedVertexB;
+			finalResult.m_usedVertices.usedVertexC = tempResult.m_usedVertices.usedVertexC;
+			finalResult.setBarycentricCoordinates(
+					tempResult.m_barycentricCoords[VERTA],
+					tempResult.m_barycentricCoords[VERTB],
+					tempResult.m_barycentricCoords[VERTC],
+					0
+			);
+		}
+	}
+
+	// Repeat test for face acd
+	if (pointOutsideACD)
+	{
+		closestPtPointTriangle(p, a, c, d,tempResult);
+		btPoint3 q = tempResult.m_closestPointOnSimplex;
+
+		btScalar sqDist = (q - p).dot( q - p);
+		if (sqDist < bestSqDist)
+		{
+			bestSqDist = sqDist;
+			finalResult.m_closestPointOnSimplex = q;
+			// Triangle (A,B,C) maps onto tetrahedron vertices (A,C,D).
+			finalResult.m_usedVertices.reset();
+			finalResult.m_usedVertices.usedVertexA = tempResult.m_usedVertices.usedVertexA;
+			finalResult.m_usedVertices.usedVertexC = tempResult.m_usedVertices.usedVertexB;
+			finalResult.m_usedVertices.usedVertexD = tempResult.m_usedVertices.usedVertexC;
+			finalResult.setBarycentricCoordinates(
+					tempResult.m_barycentricCoords[VERTA],
+					0,
+					tempResult.m_barycentricCoords[VERTB],
+					tempResult.m_barycentricCoords[VERTC]
+			);
+		}
+	}
+
+	// Repeat test for face adb
+	if (pointOutsideADB)
+	{
+		closestPtPointTriangle(p, a, d, b,tempResult);
+		btPoint3 q = tempResult.m_closestPointOnSimplex;
+
+		btScalar sqDist = (q - p).dot( q - p);
+		if (sqDist < bestSqDist)
+		{
+			bestSqDist = sqDist;
+			finalResult.m_closestPointOnSimplex = q;
+			// Triangle (A,B,C) maps onto tetrahedron vertices (A,D,B).
+			finalResult.m_usedVertices.reset();
+			finalResult.m_usedVertices.usedVertexA = tempResult.m_usedVertices.usedVertexA;
+			finalResult.m_usedVertices.usedVertexD = tempResult.m_usedVertices.usedVertexB;
+			finalResult.m_usedVertices.usedVertexB = tempResult.m_usedVertices.usedVertexC;
+			finalResult.setBarycentricCoordinates(
+					tempResult.m_barycentricCoords[VERTA],
+					tempResult.m_barycentricCoords[VERTC],
+					0,
+					tempResult.m_barycentricCoords[VERTB]
+			);
+		}
+	}
+
+	// Repeat test for face bdc
+	if (pointOutsideBDC)
+	{
+		closestPtPointTriangle(p, b, d, c,tempResult);
+		btPoint3 q = tempResult.m_closestPointOnSimplex;
+
+		btScalar sqDist = (q - p).dot( q - p);
+		if (sqDist < bestSqDist)
+		{
+			bestSqDist = sqDist;
+			finalResult.m_closestPointOnSimplex = q;
+			// Triangle (A,B,C) maps onto tetrahedron vertices (B,D,C).
+			finalResult.m_usedVertices.reset();
+			finalResult.m_usedVertices.usedVertexB = tempResult.m_usedVertices.usedVertexA;
+			finalResult.m_usedVertices.usedVertexD = tempResult.m_usedVertices.usedVertexB;
+			finalResult.m_usedVertices.usedVertexC = tempResult.m_usedVertices.usedVertexC;
+
+			finalResult.setBarycentricCoordinates(
+					0,
+					tempResult.m_barycentricCoords[VERTA],
+					tempResult.m_barycentricCoords[VERTC],
+					tempResult.m_barycentricCoords[VERTB]
+			);
+		}
+	}
+
+	// The original ended with an "all four vertices still used" check whose branch also
+	// returned true; that redundant branch has been folded into this single return.
+	return true;
+}
+
+
--- a/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuVoronoiSimplexSolver.h
+++ b/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuVoronoiSimplexSolver.h
@@ -0,0 +1,156 @@
+
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+
+#ifndef SPUVoronoiSimplexSolver_H
+#define SPUVoronoiSimplexSolver_H
+
+#include <LinearMath/btTransform.h>
+#include <LinearMath/btPoint3.h>
+
+#define VORONOI_SIMPLEX_MAX_VERTS 5
+
+/// One-bit flags recording which of the simplex vertices A..D are in use.
+/// The field order below is kept exactly as declared originally, since it fixes the bit layout.
+struct SpuUsageBitfield{
+	/// Constructs with every vertex flag cleared.
+	SpuUsageBitfield()
+	{
+		reset();
+	}
+
+	/// Clears the four vertex flags; the 'unused' padding bits are not touched.
+	void reset()
+	{
+		usedVertexA = usedVertexB = usedVertexC = usedVertexD = false;
+	}
+
+	unsigned short usedVertexA	: 1;
+	unsigned short usedVertexB	: 1;
+	unsigned short usedVertexC	: 1;
+	unsigned short usedVertexD	: 1;
+	// Padding bits; never written by this struct.
+	unsigned short unused1		: 1;
+	unsigned short unused2		: 1;
+	unsigned short unused3		: 1;
+	unsigned short unused4		: 1;
+};
+
+
+/// Result of a closest-point query on a sub-simplex: the closest point itself, which
+/// simplex vertices support it, and its barycentric coordinates.
+struct	SpuSubSimplexClosestResult
+{
+	btVector3	m_closestPointOnSimplex;
+	// One flag per simplex vertex (A..D); a set flag means the vertex is used.
+	SpuUsageBitfield	m_usedVertices;
+	float	m_barycentricCoords[4];
+	bool m_degenerate;
+
+	/// Clears the degenerate flag, the vertex-usage flags and all barycentric coordinates.
+	/// m_closestPointOnSimplex is left unchanged.
+	void	reset()
+	{
+		m_usedVertices.reset();
+		setBarycentricCoordinates();
+		m_degenerate = false;
+	}
+
+	/// True when none of the four barycentric coordinates is negative
+	/// (a NaN coordinate makes the result invalid as well).
+	bool	isValid()
+	{
+		for (int coord = 0; coord < 4; ++coord)
+		{
+			if (!(m_barycentricCoords[coord] >= float(0.)))
+			{
+				return false;
+			}
+		}
+		return true;
+	}
+
+	/// Stores the four barycentric coordinates; omitted ones default to 0.
+	void	setBarycentricCoordinates(float a=float(0.),float b=float(0.),float c=float(0.),float d=float(0.))
+	{
+		const float coords[4] = {a, b, c, d};
+		for (int coord = 0; coord < 4; ++coord)
+		{
+			m_barycentricCoords[coord] = coords[coord];
+		}
+	}
+
+};
+
+/// SpuVoronoiSimplexSolver is an implementation of the closest point distance algorithm from a 1-4 points simplex to the origin.
+/// Can be used with GJK, as an alternative to Johnson distance algorithm.
+///
+/// Vertices are added with addVertex(); the closest-point data is recomputed lazily by
+/// updateClosestVectorAndPoints() and cached in the m_cached* members.
+/// All data members are public.
+class SpuVoronoiSimplexSolver
+{
+public:
+
+	// Number of vertices currently stored in the arrays below.
+	int	m_numVertices;
+
+	// Per-vertex data as passed to addVertex(w, p, q): W holds w, P holds p, Q holds q.
+	btVector3	m_simplexVectorW[VORONOI_SIMPLEX_MAX_VERTS];
+	btVector3	m_simplexPointsP[VORONOI_SIMPLEX_MAX_VERTS];
+	btVector3	m_simplexPointsQ[VORONOI_SIMPLEX_MAX_VERTS];
+
+	// NOTE(review): not referenced by the solver methods visible alongside this header - confirm whether still needed.
+	int m_VertexIndexA[VORONOI_SIMPLEX_MAX_VERTS];
+	int m_VertexIndexB[VORONOI_SIMPLEX_MAX_VERTS];
+
+	// Cached output of updateClosestVectorAndPoints(): the blended P and Q points and
+	// their difference m_cachedV = m_cachedP1 - m_cachedP2.
+	btVector3	m_cachedP1;
+	btVector3	m_cachedP2;
+	btVector3	m_cachedV;
+	// The w most recently passed to addVertex(); reset() sets it to (1e30,1e30,1e30).
+	btVector3	m_lastW;
+	// Validity flag returned by updateClosestVectorAndPoints().
+	bool		m_cachedValidClosest;
+
+	// Barycentric coordinates / used-vertex flags of the last closest-point computation.
+	SpuSubSimplexClosestResult m_cachedBC;
+
+	// Set by addVertex() and reset(); cleared when the cache is recomputed.
+	bool	m_needsUpdate;
+
+	// Removes the vertex at 'index' by moving the last vertex into its slot.
+	void	removeVertex(int index);
+	// Removes all vertices whose flag in 'usedVerts' is cleared.
+	void	reduceVertices (const SpuUsageBitfield& usedVerts);
+	// Recomputes the cached closest-point data if needed; returns m_cachedValidClosest.
+	bool	updateClosestVectorAndPoints();
+
+	// Closest point on tetrahedron (a,b,c,d) to p; returns false if p is inside or the tetrahedron is degenerate.
+	bool	closestPtPointTetrahedron(const btVector3& p, const btVector3& a, const btVector3& b, const btVector3& c, const btVector3& d, SpuSubSimplexClosestResult& finalResult);
+	// 1 if p and d are on opposite sides of plane (a,b,c), 0 if not, -1 if degenerate.
+	int		pointOutsideOfPlane(const btVector3& p, const btVector3& a, const btVector3& b, const btVector3& c, const btVector3& d);
+	// Closest point on triangle (a,b,c) to p; always returns true.
+	bool	closestPtPointTriangle(const btVector3& p, const btVector3& a, const btVector3& b, const btVector3& c,SpuSubSimplexClosestResult& result);
+
+	// NOTE(review): declared here, but no definition is visible in the accompanying .cpp - confirm it exists.
+	int RemoveDegenerateIndices (const int *inArray, int numIndices, int *outArray) const;
+
+public:
+
+	// Empties the simplex and invalidates the cache.
+	void reset();
+
+	// Appends a vertex; no bounds check is performed.
+	void addVertex(const btVector3& w, const btPoint3& p, const btPoint3& q);
+
+
+	// Updates the cache, writes m_cachedV to v, and returns the validity flag.
+	bool closest(btVector3& v);
+
+	// Largest squared length among the stored W vectors (0 if empty).
+	btScalar maxVertex();
+
+	// True when the simplex holds exactly four vertices.
+	bool fullSimplex() const
+	{
+		return (m_numVertices == 4);
+	}
+
+	// Copies the stored vertices into the given buffers; returns the vertex count.
+	int getSimplex(btVector3 *pBuf, btVector3 *qBuf, btVector3 *yBuf) const;
+
+	// True if w equals a stored W vector or the last added w.
+	bool inSimplex(const btVector3& w);
+
+	// Writes m_cachedV to v without recomputing.
+	void backup_closest(btVector3& v) ;
+
+	// True when no vertices are stored.
+	bool emptySimplex() const ;
+
+	// Updates the cache and returns the cached closest points.
+	void compute_points(btVector3& p1, btVector3& p2) ;
+
+	// Number of vertices currently stored.
+	int numVertices() const 
+	{
+		return m_numVertices;
+	}
+};
+
+
+
+#endif //SpuVoronoiSimplexSolver
--- a/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/readme.txt
+++ b/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/readme.txt
@@ -0,0 +1 @@
+Empty placeholder for future Libspe2 SPU task
--- a/src/BulletMultiThreaded/SpuParallelSolver.cpp
+++ b/src/BulletMultiThreaded/SpuParallelSolver.cpp
@@ -0,0 +1,604 @@
+/*
+Bullet Continuous Collision Detection and Physics Library - Parallel solver
+Copyright (c) 2007 Starbreeze Studios
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+
+Written by: Marten Svanfeldt
+*/
+
+#include "SpuParallelSolver.h"
+
+//#include "SpuFakeDma.h"
+#include "SpuSync.h"
+
+#include "LinearMath/btVector3.h"
+#include "BulletCollision/NarrowPhaseCollision/btPersistentManifold.h"
+#include "BulletDynamics/Dynamics/btRigidBody.h"
+#include "BulletDynamics/ConstraintSolver/btContactSolverInfo.h"
+#include "LinearMath/btMinMax.h"
+#include "BulletCollision/CollisionShapes/btCollisionShape.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObject.h"
+#include "BulletDynamics/ConstraintSolver/btTypedConstraint.h"
+#include "LinearMath/btQuickprof.h"
+
+#include "SpuSolverTask/SpuParallellSolverTask.h"
+
+#include <stdio.h>
+
+// Work-partitioning constants for the parallel solver.
+// PARALLEL_SOLVER_BODIES_PER_TASK is used by allSolved() as the body batch size per task.
+// PARALLEL_SOLVER_CELLS_PER_TASK is one eighth of the hash cell count (SPU_HASH_NUMCELLS >> 3).
+enum
+{
+	PARALLEL_SOLVER_BODIES_PER_TASK = 64,
+	PARALLEL_SOLVER_CELLS_PER_TASK = SPU_HASH_NUMCELLS >> 3
+};
+
+
+//-- Hash handling
+// Marks hash cells i and j as mutually dependent by setting bit j in row i and bit i
+// in row j of the dependency matrix (each row is a bitset stored in 32-bit words).
+// The mask is built from an unsigned literal: shifting a signed 1 into bit 31 is not
+// well-defined in older C++ standards.
+static void recordDependency(SpuSolverHash* hash, unsigned int i, unsigned int j)
+{
+	hash->m_dependencyMatrix[i][j >> 5] |= (1u << (j & 31));
+	hash->m_dependencyMatrix[j][i >> 5] |= (1u << (i & 31));
+}
+
+
+// Clear the given hash.
+// The whole structure is zeroed, then:
+//  - every cell is marked as depending on itself (bit i of row i in the dependency matrix);
+//  - the top SPU_HASH_NUMUNUSEDBITS bits of the last word of m_currentMask[0] are set,
+//    flagging the bit positions that do not correspond to a real cell.
+// Bit masks are built from an unsigned literal: shifting a signed 1 into bit 31 is not
+// well-defined in older C++ standards.
+static void clearHash (SpuSolverHash* hash)
+{
+	memset(hash, 0, sizeof(SpuSolverHash));
+
+	// Setup basic dependency
+	for (int cell = 0; cell < SPU_HASH_NUMCELLS; ++cell)
+	{
+		hash->m_dependencyMatrix[cell][cell >> 5] |= (1u << (cell & 31));
+	}
+
+	// Set some ones to "unused cells"
+	for (int bit = SPU_HASH_WORDWIDTH-SPU_HASH_NUMUNUSEDBITS; bit < SPU_HASH_WORDWIDTH; ++bit)
+	{
+		hash->m_currentMask[0][SPU_HASH_NUMCELLDWORDS-1] |= (1u << bit);
+	}
+}
+/*
+static bool getDependency(SpuSolverHash* hash, unsigned int i, unsigned int j)
+{
+	return (hash->m_dependencyMatrix[i][j >> 5] & (1 << (j & 31))) != 0;
+}
+*/
+
+
+// Maps a collision object to a hash cell index: the object's world-space origin is
+// divided by SPU_HASH_PHYSSIZE, floored per axis to integer grid coordinates, and the
+// grid coordinates are passed to spuGetHashCellIndex().
+// The origin is read through a const reference (no copy), and the stray ';' that
+// followed the function body (an empty declaration) has been dropped.
+static unsigned int getObjectIndex (btCollisionObject* object)
+{
+	const btVector3& center = object->getWorldTransform().getOrigin();
+	const int cx = (int)floorf(center.x() / SPU_HASH_PHYSSIZE);
+	const int cy = (int)floorf(center.y() / SPU_HASH_PHYSSIZE);
+	const int cz = (int)floorf(center.z() / SPU_HASH_PHYSSIZE);
+
+	return spuGetHashCellIndex(cx, cy, cz);
+}
+
+
+
+
+
+// Creates the solver: the contact counter starts at zero, the task scheduler is built
+// from the given thread interface and task limit, and a freshly cleared solver hash is
+// allocated on the heap (released in the destructor).
+btParallelSequentialImpulseSolver::btParallelSequentialImpulseSolver (btThreadSupportInterface* threadIf, int maxOutstandingTasks)
+	: m_numberOfContacts(0),
+	  m_taskScheduler (threadIf, maxOutstandingTasks)
+{
+	SpuSolverHash* freshHash = new SpuSolverHash;
+	clearHash(freshHash);
+	m_solverHash = freshHash;
+}
+
+// Releases the solver hash allocated in the constructor.
+btParallelSequentialImpulseSolver::~btParallelSequentialImpulseSolver ()
+{
+	delete m_solverHash;
+}
+
+
+// Pre-sizes the body and manifold lists for the upcoming solve so that the push_back
+// calls in solveGroup() do not have to grow them. The constraint list is not reserved here.
+void btParallelSequentialImpulseSolver::prepareSolve(int numBodies, int numManifolds)
+{
+	m_allObjects.reserve(numBodies);
+	m_sortedManifolds.reserve(numManifolds);
+}
+
+// Collects one group of bodies, contact manifolds and constraints for later solving.
+//
+// Nothing is solved here. Each manifold and each supported constraint is assigned to the
+// hash cell of its "owner" body (body 0/A if its island tag is >= 0, otherwise body 1/B),
+// per-cell counters are updated, and a cell dependency is recorded when both bodies have
+// an island tag >= 0. All bodies are appended to m_allObjects.
+// info, debugDrawer, stackAlloc and dispatcher are not used by this function.
+//
+// Always returns 0.
+btScalar btParallelSequentialImpulseSolver::solveGroup(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifold,int numManifolds,btTypedConstraint** constraints,int numConstraints, const btContactSolverInfo& info,class btIDebugDraw* debugDrawer, btStackAlloc* stackAlloc,btDispatcher* dispatcher)
+{
+	BT_PROFILE("parallel_solveGroup");
+
+	if (numManifolds == 0 && numConstraints == 0)
+	{
+		return 0;
+	}
+
+///refresh contact points is not needed anymore, it has been moved into the processCollision detection part.
+#ifdef FORCE_REFESH_CONTACT_MANIFOLDS
+	for (int refreshIdx = 0; refreshIdx < numManifolds; ++refreshIdx)
+	{
+		btPersistentManifold* refreshed = manifold[refreshIdx];
+		btRigidBody* bodyA = (btRigidBody*)refreshed->getBody0();
+		btRigidBody* bodyB = (btRigidBody*)refreshed->getBody1();
+
+		refreshed->refreshContactPoints(bodyA->getCenterOfMassTransform(),bodyB->getCenterOfMassTransform());
+	}
+#endif //FORCE_REFESH_CONTACT_MANIFOLDS
+
+	// Record and mark the manifolds to the cells
+	for (int manifoldIdx = 0; manifoldIdx < numManifolds; ++manifoldIdx)
+	{
+		btPersistentManifold* contactManifold = manifold[manifoldIdx];
+
+		btRigidBody* bodyA = (btRigidBody*)contactManifold->getBody0();
+		btRigidBody* bodyB = (btRigidBody*)contactManifold->getBody1();
+
+		// The owner is body A when it has a valid island tag, body B otherwise.
+		const bool bodyAHasIsland = (bodyA->getIslandTag() >= 0);
+		btCollisionObject* ownerObject = bodyAHasIsland ? bodyA : bodyB;
+		btCollisionObject* otherObject = bodyAHasIsland ? bodyB : bodyA;
+
+		// Save the cell
+		unsigned int ownerCellIdx = getObjectIndex(ownerObject);
+		ManifoldCellHolder holder = {ownerCellIdx, contactManifold};
+		m_sortedManifolds.push_back(holder);
+		m_solverHash->m_Hash[ownerCellIdx].m_numManifolds++;
+
+		// Record dependency
+		if (bodyAHasIsland && bodyB->getIslandTag() >= 0)
+		{
+			unsigned int otherCellIdx = getObjectIndex(otherObject);
+			recordDependency(m_solverHash, ownerCellIdx, otherCellIdx);
+		}
+
+		// Save statistics
+		const int contactCount = contactManifold->getNumContacts();
+		m_solverHash->m_Hash[ownerCellIdx].m_numContacts += contactCount;
+		m_numberOfContacts += contactCount;
+	}
+
+	// Record and mark constraints to the cells
+	for (int constraintIdx = 0; constraintIdx < numConstraints; ++constraintIdx)
+	{
+		btTypedConstraint* joint = constraints[constraintIdx];
+
+		// Constraint types the parallel solver cannot handle are skipped entirely.
+		if (!constraintTypeSupported(joint->getConstraintType()))
+		{
+			continue;
+		}
+
+		btRigidBody* bodyA = &joint->getRigidBodyA();
+		btRigidBody* bodyB = &joint->getRigidBodyB();
+
+		// The owner is body A when it has a valid island tag, body B otherwise.
+		const bool bodyAHasIsland = (bodyA->getIslandTag() >= 0);
+		btCollisionObject* ownerObject = bodyAHasIsland ? bodyA : bodyB;
+		btCollisionObject* otherObject = bodyAHasIsland ? bodyB : bodyA;
+
+		// Save the cell
+		unsigned int ownerCellIdx = getObjectIndex(ownerObject);
+		ConstraintCellHolder holder = {ownerCellIdx, joint->getConstraintType(), joint};
+		m_sortedConstraints.push_back(holder);
+		m_solverHash->m_Hash[ownerCellIdx].m_numConstraints++;
+
+		// Record dependency
+		if (bodyA && bodyB && bodyAHasIsland && bodyB->getIslandTag() >= 0)
+		{
+			unsigned int otherCellIdx = getObjectIndex(otherObject);
+			recordDependency(m_solverHash, ownerCellIdx, otherCellIdx);
+		}
+	}
+
+	// Save all RBs
+	for (int bodyIdx = 0; bodyIdx < numBodies; ++bodyIdx)
+	{
+		btRigidBody* rigidBody = btRigidBody::upcast(bodies[bodyIdx]);
+		m_allObjects.push_back(rigidBody);
+	}
+
+	return 0;
+}
+
+// Strict-weak-ordering functor for cell holder records: orders two holders by ascending
+// m_hashCellIndex. Used as the comparison for quickSort() on the holder arrays.
+template<typename T>
+class CellHolderPredicate
+{
+public:
+	SIMD_FORCE_INLINE bool operator() ( const T& first, const T& second )
+	{
+		const bool firstComesEarlier = (first.m_hashCellIndex < second.m_hashCellIndex);
+		return firstComesEarlier;
+	}
+};
+
+
+/*static void printDependencyMatrix(SpuSolverHash* hash)
+{
+	for (int r = 0; r < SPU_HASH_NUMCELLS; ++r)
+	{
+		for (int c = 0; c < SPU_HASH_NUMCELLS; ++c)
+		{
+			if (getDependency(hash, r, c))
+			{
+				printf("1");
+			}
+			else
+			{
+				printf("0");
+			}
+		}
+
+		printf("\n");
+	}
+	printf("\n");
+	fflush(stdout);
+}
+*/
+
+// Solver caches
+// File-scope arrays that allSolved() resizes and then uses as scratch storage for solver
+// bodies, body offsets, internal (contact) constraints and joint constraints.
+// They are not declared static, so they have external linkage, and being globals they
+// are shared by every solver instance in the process.
+btAlignedObjectArray<SpuSolverBody> solverBodyPool_persist;
+btAlignedObjectArray<uint32_t> solverBodyOffsetList_persist;
+btAlignedObjectArray<SpuSolverInternalConstraint> solverInternalConstraintPool_persist;
+btAlignedObjectArray<SpuSolverConstraint> solverConstraintPool_persist;
+
+
+/// Solves everything accumulated in m_sortedManifolds, m_sortedConstraints and
+/// m_allObjects by issuing tasks through m_taskScheduler, then clears that
+/// accumulated state.
+///
+/// Phases, each ended by flushTasks():
+///   1. CMD_SOLVER_SETUP_BODIES     - batches of PARALLEL_SOLVER_BODIES_PER_TASK bodies
+///   2. CMD_SOLVER_MANIFOLD_SETUP   - batches of PARALLEL_SOLVER_CELLS_PER_TASK hash cells
+///   3. CMD_SOLVER_SOLVE_ITERATE    - info.m_numIterations rounds, one task per scheduler slot
+///   4. CMD_SOLVER_COPYBACK_BODIES  - same batching as phase 1
+///
+/// @param info         solver settings; m_numIterations is read here and the
+///                     whole struct is copied into each manifold-setup task
+/// @param debugDrawer  not used by this function
+/// @param stackAlloc   not used by this function
+void btParallelSequentialImpulseSolver::allSolved (const btContactSolverInfo& info,class btIDebugDraw* debugDrawer, btStackAlloc* stackAlloc)
+{
+	BT_PROFILE("parallel_allSolved");
+
+	// Nothing to solve: just drop whatever was accumulated and leave.
+	if (!m_numberOfContacts && !m_sortedConstraints.size())
+	{
+		m_sortedManifolds.clear();
+		m_sortedConstraints.clear();
+		m_allObjects.clear();
+		clearHash(m_solverHash);
+		return;
+	}
+
+
+	//printDependencyMatrix(m_solverHash);
+
+	// Sort the manifolds list by hash cell index
+	int numManifolds = m_sortedManifolds.size();
+	m_sortedManifolds.quickSort(CellHolderPredicate<ManifoldCellHolder>());
+
+	// Sort the constraint list by hash cell index
+	int numConstraints = m_sortedConstraints.size();
+	m_sortedConstraints.quickSort(CellHolderPredicate<ConstraintCellHolder>());
+
+
+	// The body list is used in its accumulated order (no sort is performed)
+	int numBodies = m_allObjects.size();
+
+	// Reassign the hash offsets: walk every cell once and hand out contiguous
+	// ranges in the manifold / internal-constraint / constraint / body-offset
+	// lists. A bit is set in emptyCellMask for each cell that has neither
+	// contacts nor constraints.
+	uint32_t emptyCellMask[SPU_HASH_NUMCELLDWORDS] = {0};
+	int numBodyOffsets = 0;
+	{
+		int manifoldRunner = 0;
+		int bodyOffsetRunner = 0;
+		int internalConstraintRunner = 0;
+		int constraintRunner = 0;
+
+		for (int i = 0; i < SPU_HASH_NUMCELLS; ++i)
+		{
+			bool empty = true;
+
+			SpuSolverHashCell& hashCell = m_solverHash->m_Hash[i];
+			hashCell.m_solverBodyOffsetListOffset = bodyOffsetRunner;
+
+			if (hashCell.m_numManifolds)
+			{
+				hashCell.m_manifoldListOffset = manifoldRunner;
+				manifoldRunner += hashCell.m_numManifolds;
+
+				// two body-offset entries are reserved per manifold
+				bodyOffsetRunner += hashCell.m_numManifolds*2;
+			}
+			if (hashCell.m_numContacts)
+			{
+				// three internal constraint rows are reserved per contact
+				hashCell.m_internalConstraintListOffset = internalConstraintRunner*3;
+				internalConstraintRunner += hashCell.m_numContacts;
+				empty = false;
+			}
+
+			if (hashCell.m_numConstraints)
+			{
+				hashCell.m_constraintListOffset = constraintRunner;
+				constraintRunner += hashCell.m_numConstraints;
+
+				// two body-offset entries are reserved per constraint
+				bodyOffsetRunner += hashCell.m_numConstraints*2;
+
+				empty = false;
+			}
+
+
+			// Use an unsigned one so that bit 31 is set without shifting
+			// into the sign bit of a signed int.
+			emptyCellMask[i >> 5] |= (empty ? (uint32_t(1) << (i&31)) : uint32_t(0));
+			// Align the bodyOffsetRunner to a whole number of 4 for right alignment in the list
+			bodyOffsetRunner = (bodyOffsetRunner+3)&~0x3;
+		}
+
+		numBodyOffsets = bodyOffsetRunner;
+	}
+
+	// Setup rigid bodies
+	// Size the file-scope scratch pools for this solve
+	solverBodyPool_persist.resize(numBodies + numManifolds + numConstraints);
+	SpuSolverBody* solverBodyPool = &solverBodyPool_persist[0];
+
+	solverBodyOffsetList_persist.resize(numBodyOffsets);
+	uint32_t* solverBodyOffsetList = &solverBodyOffsetList_persist[0];
+
+	solverInternalConstraintPool_persist.resize(m_numberOfContacts*3);
+	SpuSolverInternalConstraint* solverInternalConstraintPool = &solverInternalConstraintPool_persist[0];
+
+	solverConstraintPool_persist.resize(numConstraints);
+	SpuSolverConstraint* solverConstraintPool = &solverConstraintPool_persist[0];
+
+	// Setup all the moving rigid bodies
+	{
+		BT_PROFILE("setup moving rigidbodies");
+
+		int bodiesPerTask = PARALLEL_SOLVER_BODIES_PER_TASK;
+		int bodiesToSchedule = numBodies;
+		int startBody = 0;
+
+		while (bodiesToSchedule > 0)
+		{
+			// Schedule a batch of bodies
+			int numBodiesInTask = bodiesToSchedule > bodiesPerTask ? bodiesPerTask : bodiesToSchedule;
+
+			SpuSolverTaskDesc* desc = m_taskScheduler.getTask();
+
+			desc->m_solverCommand = CMD_SOLVER_SETUP_BODIES;
+			desc->m_solverData.m_solverHash = m_solverHash;
+			desc->m_solverData.m_solverBodyList = solverBodyPool;
+
+			desc->m_commandData.m_bodySetup.m_startBody = startBody;
+			desc->m_commandData.m_bodySetup.m_numBodies = numBodiesInTask;
+			desc->m_commandData.m_bodySetup.m_rbList = &m_allObjects[0];
+
+			m_taskScheduler.issueTask();
+			bodiesToSchedule -= numBodiesInTask;
+			startBody += numBodiesInTask;
+		}
+
+		m_taskScheduler.flushTasks();
+	}
+
+	// Manifold setup
+	{
+		int cellsPerTask = PARALLEL_SOLVER_CELLS_PER_TASK;
+		int cellsToSchedule = SPU_HASH_NUMCELLS;
+		int startCell = 0;
+
+		while (cellsToSchedule > 0)
+		{
+			// Schedule a batch of hash cells
+			int numCellsInTask = cellsToSchedule > cellsPerTask ? cellsPerTask : cellsToSchedule;
+
+			SpuSolverTaskDesc* desc = m_taskScheduler.getTask();
+
+			desc->m_solverCommand = CMD_SOLVER_MANIFOLD_SETUP;
+			desc->m_solverData.m_solverHash = m_solverHash;
+			desc->m_solverData.m_solverBodyList = solverBodyPool;
+			desc->m_solverData.m_solverBodyOffsetList = solverBodyOffsetList;
+			desc->m_solverData.m_solverInternalConstraintList = solverInternalConstraintPool;
+			desc->m_solverData.m_solverConstraintList = solverConstraintPool;
+
+			desc->m_commandData.m_manifoldSetup.m_startCell = startCell;
+			desc->m_commandData.m_manifoldSetup.m_numCells = numCellsInTask;
+			desc->m_commandData.m_manifoldSetup.m_numBodies = numBodies;
+			desc->m_commandData.m_manifoldSetup.m_numManifolds = numManifolds;
+			desc->m_commandData.m_manifoldSetup.m_manifoldHolders = &m_sortedManifolds[0];
+			desc->m_commandData.m_manifoldSetup.m_constraintHolders = &m_sortedConstraints[0];
+			desc->m_commandData.m_manifoldSetup.m_solverInfo = info;
+
+			m_taskScheduler.issueTask();
+			cellsToSchedule -= numCellsInTask;
+			startCell += numCellsInTask;
+		}
+		m_taskScheduler.flushTasks();
+	}
+
+	{
+		BT_PROFILE("parallel_solve_iterations");
+
+		// One 128-byte aligned spin variable is allocated for the whole
+		// iteration phase and handed to every iterate task.
+		btSpinlock::SpinVariable* spinVar = (btSpinlock::SpinVariable*)btAlignedAlloc(sizeof(btSpinlock::SpinVariable), 128);
+		for (int iter = 0; iter < info.m_numIterations; ++iter)
+		{
+			btSpinlock lock (spinVar);
+			lock.Init();
+
+			// Clear the "processed cells" part of the hash: start each
+			// iteration from the mask that only has the empty cells set
+			memcpy(m_solverHash->m_currentMask[0], emptyCellMask, sizeof(uint32_t)*SPU_HASH_NUMCELLDWORDS);
+
+			// Issue one iterate task per scheduler slot
+			for (int task = 0; task < m_taskScheduler.getMaxOutstandingTasks(); ++task)
+			{
+				SpuSolverTaskDesc* desc = m_taskScheduler.getTask();
+				desc->m_solverCommand = CMD_SOLVER_SOLVE_ITERATE;
+
+				desc->m_solverData.m_solverHash = m_solverHash;
+				desc->m_solverData.m_solverBodyList = solverBodyPool;
+				desc->m_solverData.m_solverBodyOffsetList = solverBodyOffsetList;
+				desc->m_solverData.m_solverInternalConstraintList = solverInternalConstraintPool;
+				desc->m_solverData.m_solverConstraintList = solverConstraintPool;
+
+				desc->m_commandData.m_iterate.m_spinLockVar = spinVar;
+
+				m_taskScheduler.issueTask();
+			}
+			m_taskScheduler.flushTasks();
+		}
+		btAlignedFree((void*)spinVar);
+	}
+
+	// Write back velocity
+	{
+		int bodiesPerTask = PARALLEL_SOLVER_BODIES_PER_TASK;
+		int bodiesToSchedule = numBodies;
+		int startBody = 0;
+
+		while (bodiesToSchedule > 0)
+		{
+			// Schedule a batch of bodies
+			int numBodiesInTask = bodiesToSchedule > bodiesPerTask ? bodiesPerTask : bodiesToSchedule;
+
+			SpuSolverTaskDesc* desc = m_taskScheduler.getTask();
+
+			desc->m_solverCommand = CMD_SOLVER_COPYBACK_BODIES;
+			desc->m_solverData.m_solverHash = m_solverHash;
+			desc->m_solverData.m_solverBodyList = solverBodyPool;
+
+			desc->m_commandData.m_bodyCopyback.m_startBody = startBody;
+			desc->m_commandData.m_bodyCopyback.m_numBodies = numBodiesInTask;
+			desc->m_commandData.m_bodyCopyback.m_rbList = &m_allObjects[0];
+
+			m_taskScheduler.issueTask();
+			bodiesToSchedule -= numBodiesInTask;
+			startBody += numBodiesInTask;
+		}
+
+		m_taskScheduler.flushTasks();
+	}
+
+
+	// Clean up the accumulated per-step state
+	m_sortedManifolds.resize(0);
+	m_sortedConstraints.resize(0);
+	m_allObjects.resize(0);
+	clearHash(m_solverHash);
+
+
+	m_numberOfContacts = 0;
+}
+
+/// Discards all accumulated solver state: the sorted manifold and constraint
+/// holders, the body list, the contact counter, the solver hash, and the
+/// file-scope scratch pools.
+void btParallelSequentialImpulseSolver::reset()
+{
+	m_sortedManifolds.clear();
+	// Constraint holders are accumulated alongside the manifolds and are
+	// cleared together with them in allSolved(); clear them here as well so
+	// no stale constraints survive a reset.
+	m_sortedConstraints.clear();
+	m_allObjects.clear();
+	m_numberOfContacts = 0;
+	clearHash(m_solverHash);
+
+	solverBodyPool_persist.clear();
+	solverBodyOffsetList_persist.clear();
+	solverConstraintPool_persist.clear();
+	solverInternalConstraintPool_persist.clear();
+}
+
+
+/// Builds a scheduler on top of the given thread interface.
+/// The number of task slots is maxOutstandingTasks capped at SPU_MAX_SPUS;
+/// one descriptor and one busy flag are allocated per slot, and the thread
+/// interface is started via startSPU().
+SolverTaskScheduler::SolverTaskScheduler(btThreadSupportInterface* threadIf, int maxOutstandingTasks)
+	: m_threadInterface(threadIf),
+	  m_maxNumOutstandingTasks(maxOutstandingTasks <= SPU_MAX_SPUS ? maxOutstandingTasks : SPU_MAX_SPUS),
+	  m_currentTask(0),
+	  m_numBusyTasks(0)
+{
+	const int slotCount = m_maxNumOutstandingTasks;
+
+	m_taskDescriptors.resize(slotCount);
+	m_taskBusy.resize(slotCount);
+
+	m_threadInterface->startSPU();
+}
+
+
+/// Stops the thread interface that the constructor started.
+/// The interface object itself is not deleted here.
+SolverTaskScheduler::~SolverTaskScheduler()
+{
+	m_threadInterface->stopSPU();
+}
+
+/// Returns a zero-filled task descriptor for the caller to fill in.
+/// If the current slot is still busy, another free slot is selected; if every
+/// slot is busy, this waits for a response from the thread interface and
+/// reuses the slot that completed. The returned descriptor has m_taskId set
+/// to the selected slot, which also becomes m_currentTask.
+SpuSolverTaskDesc* SolverTaskScheduler::getTask()
+{
+	if (m_taskBusy[m_currentTask])
+	{
+		// Look for the first slot that is not in flight.
+		int freeSlot = -1;
+		for (int slot = 0; slot < m_maxNumOutstandingTasks; ++slot)
+		{
+			if (m_taskBusy[slot])
+				continue;
+
+			freeSlot = slot;
+			break;
+		}
+
+		if (freeSlot < 0)
+		{
+			// Every slot is in flight: wait for one to complete.
+			unsigned int finishedTask;
+			unsigned int responseSize;
+
+			// Seed the id with the first busy slot before waiting.
+			int probe = 0;
+			while (probe < m_maxNumOutstandingTasks && !m_taskBusy[probe])
+				++probe;
+			if (probe < m_maxNumOutstandingTasks)
+				finishedTask = probe;
+
+			m_threadInterface->waitForResponse(&finishedTask, &responseSize);
+
+			m_taskBusy[finishedTask] = false;
+			m_numBusyTasks--;
+
+			freeSlot = finishedTask;
+		}
+
+		m_currentTask = freeSlot;
+	}
+
+	// Hand out the descriptor of the selected slot, wiped clean.
+	SpuSolverTaskDesc* descriptor = &m_taskDescriptors[m_currentTask];
+	memset(descriptor, 0, sizeof(SpuSolverTaskDesc));
+	descriptor->m_taskId = m_currentTask;
+
+	return descriptor;
+}
+
+/// Marks the current slot as busy and sends its descriptor to the thread
+/// interface. Call after filling in the descriptor returned by getTask().
+void SolverTaskScheduler::issueTask()
+{
+	m_taskBusy[m_currentTask] = true;
+	m_numBusyTasks++;
+
+	SpuSolverTaskDesc& desc = m_taskDescriptors[m_currentTask];
+
+	// Pass the descriptor address as a ppu_address_t rather than a uint32_t,
+	// so it is not truncated where pointers are wider than 32 bits.
+	// NOTE(review): assumes ppu_address_t is in scope in this file and that
+	// sendRequest's address argument is at least that wide - confirm against
+	// btThreadSupportInterface.h / PlatformDefinitions.h.
+	m_threadInterface->sendRequest(1, (ppu_address_t)&desc, m_currentTask);
+}
+
+/// Waits until no issued task remains busy. Each pass waits for one response
+/// from the thread interface and releases the slot it reports.
+void SolverTaskScheduler::flushTasks()
+{
+	while (m_numBusyTasks != 0)
+	{
+		unsigned int finishedTask;
+		unsigned int responseSize;
+
+		// Seed the id with the first slot that is still busy.
+		int probe = 0;
+		while (probe < m_maxNumOutstandingTasks && !m_taskBusy[probe])
+			++probe;
+		if (probe < m_maxNumOutstandingTasks)
+			finishedTask = probe;
+
+		m_threadInterface->waitForResponse(&finishedTask, &responseSize);
+
+		m_taskBusy[finishedTask] = false;
+		m_numBusyTasks--;
+	}
+}
--- a/src/BulletMultiThreaded/SpuParallelSolver.h
+++ b/src/BulletMultiThreaded/SpuParallelSolver.h
@@ -0,0 +1,75 @@
+/*
+Bullet Continuous Collision Detection and Physics Library - Parallel solver
+Copyright (c) 2007 Starbreeze Studios
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+
+Written by: Marten Svanfeldt
+*/
+
+#ifndef SPU_PARALLELSOLVER_H
+#define SPU_PARALLELSOLVER_H
+
+#include "BulletDynamics/ConstraintSolver/btConstraintSolver.h"
+#include "btThreadSupportInterface.h"
+#include "LinearMath/btAlignedObjectArray.h"
+
+/// Hands out solver task descriptors and tracks which of them are in flight
+/// on a btThreadSupportInterface. Usage: getTask() to obtain a descriptor,
+/// fill it in, issueTask() to send it, and flushTasks() to wait for all
+/// issued tasks to finish.
+class SolverTaskScheduler
+{
+protected:
+	// Thread interface the tasks are sent to; started in the constructor and
+	// stopped in the destructor.
+	class	btThreadSupportInterface*	m_threadInterface;
+	// Number of task slots (the constructor caps it at SPU_MAX_SPUS)
+	int						m_maxNumOutstandingTasks;
+
+	// Slot whose descriptor was last returned by getTask()
+	unsigned int						m_currentTask;
+	// Number of slots issued and not yet reported complete
+	unsigned int						m_numBusyTasks;
+
+	// One descriptor and one busy flag per slot
+	btAlignedObjectArray<struct SpuSolverTaskDesc>	m_taskDescriptors;
+	btAlignedObjectArray<bool>						m_taskBusy;
+
+public:
+	SolverTaskScheduler (btThreadSupportInterface* threadIf, int maxOutstandingTasks);
+	~SolverTaskScheduler ();
+
+	// Returns a zeroed descriptor for the next task, waiting for a slot if all are busy
+	struct SpuSolverTaskDesc* getTask ();
+
+	// Sends the descriptor last returned by getTask()
+	void issueTask();
+	// Waits until no issued task is busy any more
+	void flushTasks();
+
+	// Number of task slots managed by this scheduler
+	int getMaxOutstandingTasks()
+	{
+		return m_maxNumOutstandingTasks;
+	}
+};
+
+/// Constraint solver that accumulates manifolds, constraints and bodies and
+/// solves them in allSolved() by dispatching tasks through a
+/// SolverTaskScheduler.
+class btParallelSequentialImpulseSolver : public btConstraintSolver
+{
+protected:
+
+	// Hash of solver cells; its per-cell offsets are reassigned in allSolved()
+	struct SpuSolverHash*						m_solverHash;
+	// Manifold / constraint holders, sorted by hash cell index in allSolved()
+	btAlignedObjectArray<struct ManifoldCellHolder>				m_sortedManifolds;
+	btAlignedObjectArray<struct ConstraintCellHolder>				m_sortedConstraints;
+	// Rigid bodies handed to the body setup and copy-back tasks
+	btAlignedObjectArray<class btRigidBody*>	m_allObjects;
+
+	// Contact count; sizes the internal constraint pool (3 entries per contact)
+	int											m_numberOfContacts;	
+
+	// Issues the solver tasks on the thread interface passed to the constructor
+	SolverTaskScheduler							m_taskScheduler;
+
+public:
+	btParallelSequentialImpulseSolver (btThreadSupportInterface* threadIf, int maxOutstandingTasks);
+	virtual ~btParallelSequentialImpulseSolver();
+
+	virtual void prepareSolve (int numBodies, int numManifolds);
+	virtual btScalar solveGroup(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifold,int numManifolds,btTypedConstraint** constraints,int numConstraints, const btContactSolverInfo& info,class btIDebugDraw* debugDrawer, btStackAlloc* stackAlloc,btDispatcher* dispatcher);
+	// Runs the actual parallel solve over everything accumulated, then clears it
+	virtual void allSolved (const btContactSolverInfo& info,class btIDebugDraw* debugDrawer, btStackAlloc* stackAlloc);
+	// Drops all accumulated state and the scratch pools
+	virtual void reset ();
+};
+
+#endif
--- a/src/BulletMultiThreaded/SpuRaycastTask/SpuRaycastTask.cpp
+++ b/src/BulletMultiThreaded/SpuRaycastTask/SpuRaycastTask.cpp
@@ -0,0 +1,786 @@
+
+
+#include "../PlatformDefinitions.h"
+#include "SpuRaycastTask.h"
+#include "../SpuCollisionObjectWrapper.h"
+#include "../SpuNarrowPhaseCollisionTask/SpuCollisionShapes.h"
+#include "SpuSubSimplexConvexCast.h"
+#include "LinearMath/btAabbUtil2.h"
+
+
+/* Future optimization strategies: 
+1. BBOX prune before loading shape data
+2. Could reduce number of dmas for ray output data to a single read and write.
+   By sharing the temporary work unit output structures across objects.
+3. The reason SpuRaycastNodeCallback1 is slower is because the triangle data isn't
+   being cached across calls. Fix that by doing the final ray pruning inside the callback.
+*/
+
+/* Future work:
+1. support first hit, closest hit, etc rather than just closest hit.
+2. support compound objects
+*/
+
+#define CALLBACK_ALL
+
+/// Scratch storage used by the raycast task: buffers that collision objects,
+/// their wrappers and shape data are DMA'd into before being examined.
+struct RaycastTask_LocalStoreMemory
+{
+	// Raw, 16-byte aligned byte buffer large enough for one btCollisionObject
+	// (plus 16 spare bytes); it is filled by DMA, never constructed.
+	ATTRIBUTE_ALIGNED16(char gColObj [sizeof(btCollisionObject)+16]);
+	// Views the raw buffer as a btCollisionObject.
+	btCollisionObject* getColObj()
+	{
+		return (btCollisionObject*) gColObj;
+	}
+
+	// Local copy of the collision object wrapper
+	ATTRIBUTE_ALIGNED16(SpuCollisionObjectWrapper gCollisionObjectWrapper);
+	SpuCollisionObjectWrapper* getCollisionObjectWrapper ()
+	{
+		return &gCollisionObjectWrapper;
+	}
+
+	// Local copy of the collision shape
+	CollisionShape_LocalStoreMemory gCollisionShape;
+	// Triangle vertex indices; the node callbacks write entries 0..2
+	ATTRIBUTE_ALIGNED16(int	spuIndices[16]);
+
+	// Shape-specific scratch data; bvhShapeData is used by the triangle-mesh raycast path
+	bvhMeshShape_LocalStoreMemory bvhShapeData;
+	SpuConvexPolyhedronVertexData convexVertexData;
+	CompoundShape_LocalStoreMemory compoundShapeData;
+};
+
+/// Returns the RaycastTask_LocalStoreMemory block the raycast task works in.
+/// On __CELLOS_LV2__ (without WIN32) this is a single statically allocated,
+/// 16-byte aligned instance. On every other platform a new instance is
+/// allocated on the heap for each call, so the function is defined everywhere
+/// instead of only on WIN32. Nothing in this part of the file deletes the
+/// heap instance.
+#if defined(__CELLOS_LV2__) && !defined(WIN32)
+ATTRIBUTE_ALIGNED16(RaycastTask_LocalStoreMemory gLocalStoreMemory);
+void* createRaycastLocalStoreMemory()
+{
+	return &gLocalStoreMemory;
+}
+#else
+void* createRaycastLocalStoreMemory()
+{
+	return new RaycastTask_LocalStoreMemory;
+}
+#endif
+
+/// Pulls one collision object into local memory and summarises it.
+/// Fetches the wrapper at objectWrapper, then the btCollisionObject it points
+/// to, then the shape data, waiting on each DMA before using its result.
+/// Fills gatheredObjectData with the world transform, collision margin, shape
+/// type, the shape's original address, a pointer to the local shape copy and
+/// the primitive dimensions (implicit shape dimensions for convex shapes,
+/// (1,1,1) otherwise).
+void GatherCollisionObjectAndShapeData (RaycastGatheredObjectData* gatheredObjectData, RaycastTask_LocalStoreMemory* lsMemPtr, ppu_address_t objectWrapper)
+{
+	/* Step 1: wrapper -> local store */
+	{
+		int transferBytes = sizeof(SpuCollisionObjectWrapper);
+		ppu_address_t sourceEa = objectWrapper;
+		cellDmaGet(&lsMemPtr->gCollisionObjectWrapper, sourceEa, transferBytes, DMA_TAG(1), 0, 0);
+		cellDmaWaitTagStatusAll(DMA_MASK(1));
+	}
+
+	/* Step 2: collision object -> local store (address comes from the wrapper) */
+	{
+		int transferBytes = sizeof(btCollisionObject);
+		ppu_address_t sourceEa = lsMemPtr->getCollisionObjectWrapper()->getCollisionObjectPtr();
+		cellDmaGet(&lsMemPtr->gColObj, sourceEa, transferBytes, DMA_TAG(2), 0, 0);
+		cellDmaWaitTagStatusAll(DMA_MASK(2));
+	}
+
+	/* Step 3: record what we now know about the object and its shape */
+	gatheredObjectData->m_worldTransform = lsMemPtr->getColObj()->getWorldTransform();
+	gatheredObjectData->m_collisionMargin = lsMemPtr->getCollisionObjectWrapper()->getCollisionMargin ();
+	gatheredObjectData->m_shapeType = lsMemPtr->getCollisionObjectWrapper()->getShapeType ();
+	gatheredObjectData->m_collisionShape = (ppu_address_t)lsMemPtr->getColObj()->getCollisionShape();
+	gatheredObjectData->m_spuCollisionShape = (void*)&lsMemPtr->gCollisionShape.collisionShape;
+
+	/* Step 4: shape data -> local store */
+	dmaCollisionShape (gatheredObjectData->m_spuCollisionShape, gatheredObjectData->m_collisionShape, 1, gatheredObjectData->m_shapeType);
+	cellDmaWaitTagStatusAll(DMA_MASK(1));
+
+	/* Step 5: primitive dimensions depend on whether the shape is convex */
+	if (!btBroadphaseProxy::isConvex (gatheredObjectData->m_shapeType))
+	{
+		gatheredObjectData->m_primitiveDimensions = btVector3(1.0, 1.0, 1.0);
+	}
+	else
+	{
+		btConvexInternalShape* localConvex = (btConvexInternalShape*)gatheredObjectData->m_spuCollisionShape;
+		gatheredObjectData->m_primitiveDimensions = localConvex->getImplicitShapeDimensions ();
+	}
+
+}
+
+/// Issues a DMA get of one SpuRaycastTaskWorkUnitOut from rayOutputAddr into
+/// *rayOutput using DMA_TAG(dmaTag). This function does not wait on the tag
+/// itself; presumably the caller waits on DMA_MASK(dmaTag) before reading
+/// *rayOutput - confirm at the call sites.
+void dmaLoadRayOutput (ppu_address_t rayOutputAddr, SpuRaycastTaskWorkUnitOut* rayOutput, uint32_t dmaTag)
+{
+	cellDmaGet(rayOutput, rayOutputAddr, sizeof(*rayOutput), DMA_TAG(dmaTag), 0, 0);
+}
+
+/// Issues a DMA put of *rayOutput to rayOutputAddr using DMA_TAG(dmaTag).
+/// This function does not wait on the tag itself; presumably the caller waits
+/// on DMA_MASK(dmaTag) before reusing *rayOutput - confirm at the call sites.
+void dmaStoreRayOutput (ppu_address_t rayOutputAddr, const SpuRaycastTaskWorkUnitOut* rayOutput, uint32_t dmaTag)
+{
+	cellDmaLargePut (rayOutput, rayOutputAddr, sizeof(*rayOutput), DMA_TAG(dmaTag), 0, 0);
+}
+
+#if 0
+SIMD_FORCE_INLINE void small_cache_read(void* buffer, ppu_address_t ea, size_t size)
+{
+#if USE_SOFTWARE_CACHE
+	// Check for alignment requirements. We need to make sure the entire request fits within one cache line,
+	// so the first and last bytes should fall on the same cache line
+	btAssert((ea & ~SPE_CACHELINE_MASK) == ((ea + size - 1) & ~SPE_CACHELINE_MASK));
+
+	void* ls = spe_cache_read(ea);
+	memcpy(buffer, ls, size);
+#else
+	stallingUnalignedDmaSmallGet(buffer,ea,size);
+#endif
+}
+#endif
+
+/// Reads three small items of `size` bytes each (size must be below 16) from
+/// the addresses ea0/ea1/ea2 into ls0/ls1/ls2.
+/// All three gets are issued on DMA_TAG(1) and waited for together; the data
+/// is then copied byte by byte from the temporary buffers to the destinations.
+void small_cache_read_triple(	void* ls0, ppu_address_t ea0,
+												void* ls1, ppu_address_t ea1,
+												void* ls2, ppu_address_t ea2,
+												size_t size)
+{
+		btAssert(size<16);
+		// 32-byte buffers: an offset of at most 15 plus fewer than 16 bytes always fits
+		ATTRIBUTE_ALIGNED16(char	tmpBuffer0[32]);
+		ATTRIBUTE_ALIGNED16(char	tmpBuffer1[32]);
+		ATTRIBUTE_ALIGNED16(char	tmpBuffer2[32]);
+
+		uint32_t i;
+		
+
+		///make sure last 4 bits are the same, for cellDmaSmallGet
+		// i.e. the target inside the temporary buffer gets the same low four
+		// address bits as the source address
+		char* localStore0 = (char*)ls0;
+		uint32_t last4BitsOffset = ea0 & 0x0f;
+		char* tmpTarget0 = tmpBuffer0 + last4BitsOffset;
+		// the pointer returned by the get is what is read from afterwards
+		tmpTarget0 = (char*)cellDmaSmallGetReadOnly(tmpTarget0,ea0,size,DMA_TAG(1),0,0);
+
+
+		char* localStore1 = (char*)ls1;
+		last4BitsOffset = ea1 & 0x0f;
+		char* tmpTarget1 = tmpBuffer1 + last4BitsOffset;
+		tmpTarget1 = (char*)cellDmaSmallGetReadOnly(tmpTarget1,ea1,size,DMA_TAG(1),0,0);
+		
+		char* localStore2 = (char*)ls2;
+		last4BitsOffset = ea2 & 0x0f;
+		char* tmpTarget2 = tmpBuffer2 + last4BitsOffset;
+		tmpTarget2 = (char*)cellDmaSmallGetReadOnly(tmpTarget2,ea2,size,DMA_TAG(1),0,0);
+		
+		
+		// one wait covers all three gets, since they share tag 1
+		cellDmaWaitTagStatusAll( DMA_MASK(1) );
+
+		//this is slowish, perhaps memcpy on SPU is smarter?
+		for (i=0; btLikely( i<size );i++)
+		{
+			localStore0[i] = tmpTarget0[i];
+			localStore1[i] = tmpTarget1[i];
+			localStore2[i] = tmpTarget2[i];
+		}
+}
+
+void performRaycastAgainstConvex (RaycastGatheredObjectData* gatheredObjectData, const SpuRaycastTaskWorkUnit& workUnit, SpuRaycastTaskWorkUnitOut* workUnitOut, RaycastTask_LocalStoreMemory* lsMemPtr);
+
+/// BVH leaf callback that tests ONE ray against the visited triangle.
+/// The ray is the work unit selected with setWorkUnit(); the closest hit
+/// found so far is kept in the matching entry of the output array.
+class spuRaycastNodeCallback1 : public btNodeOverlapCallback
+{
+	RaycastGatheredObjectData* m_gatheredObjectData;
+	const SpuRaycastTaskWorkUnit* m_workUnits;
+	SpuRaycastTaskWorkUnitOut* m_workUnitsOut;
+	int m_workUnit;	// index of the work unit tested by processNode
+	RaycastTask_LocalStoreMemory* m_lsMemPtr;
+
+	ATTRIBUTE_ALIGNED16(btVector3	spuTriangleVertices[3]);
+	ATTRIBUTE_ALIGNED16(btScalar	spuUnscaledVertex[4]);
+public:
+	spuRaycastNodeCallback1(RaycastGatheredObjectData* gatheredObjectData,const SpuRaycastTaskWorkUnit* workUnits, SpuRaycastTaskWorkUnitOut* workUnitsOut, RaycastTask_LocalStoreMemory* lsMemPtr)
+		: m_gatheredObjectData(gatheredObjectData),
+		  m_workUnits(workUnits),
+		  m_workUnitsOut(workUnitsOut),
+		  m_workUnit(0),
+		  m_lsMemPtr (lsMemPtr)
+	{
+	}
+
+	/// Selects which work unit (ray) the next processNode calls test.
+	void setWorkUnit (int workUnit) { m_workUnit = workUnit; }
+
+	/// Fetches the triangle `triangleIndex` of the indexed mesh held in
+	/// m_lsMemPtr->bvhShapeData, scales it by the mesh scaling and casts the
+	/// selected ray against it. subPart is not used.
+	virtual void processNode(int subPart, int triangleIndex)
+	{
+		///Create a triangle on the stack, call process collision, with GJK
+		///DMA the vertices, can benefit from software caching
+
+		// ugly solution to support both 16bit and 32bit indices
+		if (m_lsMemPtr->bvhShapeData.gIndexMesh.m_indexType == PHY_SHORT)
+		{
+			// 16-bit indices are read as UNSIGNED: a signed short would
+			// sign-extend any index >= 32768 into a negative vertex offset.
+			unsigned short int* indexBasePtr = (unsigned short int*)(m_lsMemPtr->bvhShapeData.gIndexMesh.m_triangleIndexBase+triangleIndex*m_lsMemPtr->bvhShapeData.gIndexMesh.m_triangleIndexStride);
+			ATTRIBUTE_ALIGNED16(unsigned short int tmpIndices[3]);
+
+			small_cache_read_triple(&tmpIndices[0],(ppu_address_t)&indexBasePtr[0],
+									&tmpIndices[1],(ppu_address_t)&indexBasePtr[1],
+									&tmpIndices[2],(ppu_address_t)&indexBasePtr[2],
+									sizeof(unsigned short int));
+
+			m_lsMemPtr->spuIndices[0] = int(tmpIndices[0]);
+			m_lsMemPtr->spuIndices[1] = int(tmpIndices[1]);
+			m_lsMemPtr->spuIndices[2] = int(tmpIndices[2]);
+		} else
+		{
+			int* indexBasePtr = (int*)(m_lsMemPtr->bvhShapeData.gIndexMesh.m_triangleIndexBase+triangleIndex*m_lsMemPtr->bvhShapeData.gIndexMesh.m_triangleIndexStride);
+
+			small_cache_read_triple(&m_lsMemPtr->spuIndices[0],(ppu_address_t)&indexBasePtr[0],
+								&m_lsMemPtr->spuIndices[1],(ppu_address_t)&indexBasePtr[1],
+								&m_lsMemPtr->spuIndices[2],(ppu_address_t)&indexBasePtr[2],
+								sizeof(int));
+		}
+
+		const btVector3& meshScaling = m_lsMemPtr->bvhShapeData.gTriangleMeshInterfacePtr->getScaling();
+
+		for (int j=2;btLikely( j>=0 );j--)
+		{
+			int graphicsindex = m_lsMemPtr->spuIndices[j];
+
+			btScalar* graphicsbasePtr = (btScalar*)(m_lsMemPtr->bvhShapeData.gIndexMesh.m_vertexBase+graphicsindex*m_lsMemPtr->bvhShapeData.gIndexMesh.m_vertexStride);
+
+			///handle un-aligned vertices...
+
+			//another DMA for each vertex
+			small_cache_read_triple(&spuUnscaledVertex[0],(ppu_address_t)&graphicsbasePtr[0],
+									&spuUnscaledVertex[1],(ppu_address_t)&graphicsbasePtr[1],
+									&spuUnscaledVertex[2],(ppu_address_t)&graphicsbasePtr[2],
+									sizeof(btScalar));
+
+			spuTriangleVertices[j] = btVector3(
+				spuUnscaledVertex[0]*meshScaling.getX(),
+				spuUnscaledVertex[1]*meshScaling.getY(),
+				spuUnscaledVertex[2]*meshScaling.getZ());
+		}
+
+		// Present the triangle as a convex shape that reuses the mesh
+		// object's gathered data (transform, margin, ...).
+		RaycastGatheredObjectData triangleGatheredObjectData (*m_gatheredObjectData);
+		triangleGatheredObjectData.m_shapeType = TRIANGLE_SHAPE_PROXYTYPE;
+		triangleGatheredObjectData.m_spuCollisionShape = &spuTriangleVertices[0];
+
+		SpuRaycastTaskWorkUnitOut out;
+		out.hitFraction = 1.0;
+		performRaycastAgainstConvex (&triangleGatheredObjectData, m_workUnits[m_workUnit], &out, m_lsMemPtr);
+		/* XXX: For now only take the closest hit */
+		if (out.hitFraction < m_workUnitsOut[m_workUnit].hitFraction)
+		{
+			m_workUnitsOut[m_workUnit].hitFraction = out.hitFraction;
+			m_workUnitsOut[m_workUnit].hitNormal = out.hitNormal;
+		}
+	}
+
+};
+
+/// BVH leaf callback that tests EVERY ray of the batch against the visited
+/// triangle. For each of the m_numWorkUnits work units, the closest hit found
+/// so far is kept in the matching entry of the output array.
+class spuRaycastNodeCallback : public btNodeOverlapCallback
+{
+	RaycastGatheredObjectData* m_gatheredObjectData;
+	const SpuRaycastTaskWorkUnit* m_workUnits;
+	SpuRaycastTaskWorkUnitOut* m_workUnitsOut;
+	int m_numWorkUnits;	// number of entries in m_workUnits / m_workUnitsOut
+	RaycastTask_LocalStoreMemory* m_lsMemPtr;
+
+	ATTRIBUTE_ALIGNED16(btVector3	spuTriangleVertices[3]);
+	ATTRIBUTE_ALIGNED16(btScalar	spuUnscaledVertex[4]);
+public:
+	spuRaycastNodeCallback(RaycastGatheredObjectData* gatheredObjectData,const SpuRaycastTaskWorkUnit* workUnits, SpuRaycastTaskWorkUnitOut* workUnitsOut, int numWorkUnits, RaycastTask_LocalStoreMemory* lsMemPtr)
+		: m_gatheredObjectData(gatheredObjectData),
+		  m_workUnits(workUnits),
+		  m_workUnitsOut(workUnitsOut),
+		  m_numWorkUnits(numWorkUnits),
+		  m_lsMemPtr (lsMemPtr)
+	{
+	}
+
+	/// Fetches the triangle `triangleIndex` of the indexed mesh held in
+	/// m_lsMemPtr->bvhShapeData, scales it by the mesh scaling and casts all
+	/// rays of the batch against it. subPart is not used.
+	virtual void processNode(int subPart, int triangleIndex)
+	{
+		///Create a triangle on the stack, call process collision, with GJK
+		///DMA the vertices, can benefit from software caching
+
+		// ugly solution to support both 16bit and 32bit indices
+		if (m_lsMemPtr->bvhShapeData.gIndexMesh.m_indexType == PHY_SHORT)
+		{
+			// 16-bit indices are read as UNSIGNED: a signed short would
+			// sign-extend any index >= 32768 into a negative vertex offset.
+			unsigned short int* indexBasePtr = (unsigned short int*)(m_lsMemPtr->bvhShapeData.gIndexMesh.m_triangleIndexBase+triangleIndex*m_lsMemPtr->bvhShapeData.gIndexMesh.m_triangleIndexStride);
+			ATTRIBUTE_ALIGNED16(unsigned short int tmpIndices[3]);
+
+			small_cache_read_triple(&tmpIndices[0],(ppu_address_t)&indexBasePtr[0],
+									&tmpIndices[1],(ppu_address_t)&indexBasePtr[1],
+									&tmpIndices[2],(ppu_address_t)&indexBasePtr[2],
+									sizeof(unsigned short int));
+
+			m_lsMemPtr->spuIndices[0] = int(tmpIndices[0]);
+			m_lsMemPtr->spuIndices[1] = int(tmpIndices[1]);
+			m_lsMemPtr->spuIndices[2] = int(tmpIndices[2]);
+		} else
+		{
+			int* indexBasePtr = (int*)(m_lsMemPtr->bvhShapeData.gIndexMesh.m_triangleIndexBase+triangleIndex*m_lsMemPtr->bvhShapeData.gIndexMesh.m_triangleIndexStride);
+
+			small_cache_read_triple(&m_lsMemPtr->spuIndices[0],(ppu_address_t)&indexBasePtr[0],
+								&m_lsMemPtr->spuIndices[1],(ppu_address_t)&indexBasePtr[1],
+								&m_lsMemPtr->spuIndices[2],(ppu_address_t)&indexBasePtr[2],
+								sizeof(int));
+		}
+
+		const btVector3& meshScaling = m_lsMemPtr->bvhShapeData.gTriangleMeshInterfacePtr->getScaling();
+
+		for (int j=2;btLikely( j>=0 );j--)
+		{
+			int graphicsindex = m_lsMemPtr->spuIndices[j];
+
+			btScalar* graphicsbasePtr = (btScalar*)(m_lsMemPtr->bvhShapeData.gIndexMesh.m_vertexBase+graphicsindex*m_lsMemPtr->bvhShapeData.gIndexMesh.m_vertexStride);
+
+			///handle un-aligned vertices...
+
+			//another DMA for each vertex
+			small_cache_read_triple(&spuUnscaledVertex[0],(ppu_address_t)&graphicsbasePtr[0],
+									&spuUnscaledVertex[1],(ppu_address_t)&graphicsbasePtr[1],
+									&spuUnscaledVertex[2],(ppu_address_t)&graphicsbasePtr[2],
+									sizeof(btScalar));
+
+			spuTriangleVertices[j] = btVector3(
+				spuUnscaledVertex[0]*meshScaling.getX(),
+				spuUnscaledVertex[1]*meshScaling.getY(),
+				spuUnscaledVertex[2]*meshScaling.getZ());
+		}
+
+		// Present the triangle as a convex shape that reuses the mesh
+		// object's gathered data (transform, margin, ...).
+		RaycastGatheredObjectData triangleGatheredObjectData (*m_gatheredObjectData);
+		triangleGatheredObjectData.m_shapeType = TRIANGLE_SHAPE_PROXYTYPE;
+		triangleGatheredObjectData.m_spuCollisionShape = &spuTriangleVertices[0];
+
+		for (int i = 0; i < m_numWorkUnits; i++)
+		{
+			SpuRaycastTaskWorkUnitOut out;
+			out.hitFraction = 1.0;
+			performRaycastAgainstConvex (&triangleGatheredObjectData, m_workUnits[i], &out, m_lsMemPtr);
+			/* XXX: For now only take the closest hit */
+			if (out.hitFraction < m_workUnitsOut[i].hitFraction)
+			{
+				m_workUnitsOut[i].hitFraction = out.hitFraction;
+				m_workUnitsOut[i].hitNormal = out.hitNormal;
+			}
+		}
+	}
+
+};
+
+
+/// Walks the quantized BVH nodes [startNodeIndex, endNodeIndex) without a
+/// stack, testing each node against a batch of rays.
+/// rayFrom / rayTo hold numWorkUnits rays; quantizedQueryAabbMin / Max hold
+/// three unsigned shorts per ray (indexed as 3*i). rootNode points at the
+/// first node to visit. With CALLBACK_ALL defined (as it is in this file),
+/// nodeCallback->processNode is called once for every leaf whose box is hit
+/// by at least one ray; otherwise it is called per ray and nodeCallback must
+/// be a spuRaycastNodeCallback1.
+/// NOTE(review): the per-ray arrays below are sized
+/// SPU_RAYCAST_WORK_UNITS_PER_TASK, so numWorkUnits presumably must not
+/// exceed that - no check is made here.
+void	spuWalkStacklessQuantizedTreeAgainstRays(RaycastTask_LocalStoreMemory* lsMemPtr, 
+						 btNodeOverlapCallback* nodeCallback,
+						 const btVector3* rayFrom,
+						 const btVector3* rayTo,
+						 int numWorkUnits,
+						 unsigned short int* quantizedQueryAabbMin,
+						 unsigned short int* quantizedQueryAabbMax,
+						 const btQuantizedBvhNode* rootNode,
+						 int startNodeIndex,int endNodeIndex)
+{
+	int curIndex = startNodeIndex;
+	int walkIterations = 0;
+	int subTreeSize = endNodeIndex - startNodeIndex;
+
+	int escapeIndex;
+
+	unsigned int boxBoxOverlap, rayBoxOverlap, anyRayBoxOverlap;
+	unsigned int isLeafNode;
+
+#define RAYAABB2
+#ifdef RAYAABB2
+	// Precompute, per ray, the normalized direction's reciprocal, the sign of
+	// each direction component and the ray length (lambda_max) for btRayAabb2.
+	unsigned int sign[SPU_RAYCAST_WORK_UNITS_PER_TASK][3];
+	btVector3 rayInvDirection[SPU_RAYCAST_WORK_UNITS_PER_TASK];
+	btScalar lambda_max[SPU_RAYCAST_WORK_UNITS_PER_TASK];
+	for (int i = 0; i < numWorkUnits; i++)
+	{
+		btVector3 rayDirection = (rayTo[i]-rayFrom[i]);
+		rayDirection.normalize ();
+		lambda_max[i] = rayDirection.dot(rayTo[i]-rayFrom[i]);
+		rayInvDirection[i][0] = btScalar(1.0) / rayDirection[0];
+		rayInvDirection[i][1] = btScalar(1.0) / rayDirection[1];
+		rayInvDirection[i][2] = btScalar(1.0) / rayDirection[2];
+		sign[i][0] = rayDirection[0] < 0.0;
+		sign[i][1] = rayDirection[1] < 0.0;
+		sign[i][2] = rayDirection[2] < 0.0;
+	}
+#endif
+
+	while (curIndex < endNodeIndex)
+	{
+		//catch bugs in tree data
+		assert (walkIterations < subTreeSize);
+
+		walkIterations++;
+
+		isLeafNode = rootNode->isLeafNode();
+
+		anyRayBoxOverlap = 0;
+
+		for (int i = 0; i < numWorkUnits; i++)
+		{
+			// cheap quantized box-vs-box rejection before the exact ray test
+			unsigned short int* quamin = (quantizedQueryAabbMin + 3 * i);
+			unsigned short int* quamax = (quantizedQueryAabbMax + 3 * i);
+			boxBoxOverlap = spuTestQuantizedAabbAgainstQuantizedAabb(quamin,quamax,rootNode->m_quantizedAabbMin,rootNode->m_quantizedAabbMax);
+			if (!boxBoxOverlap)
+				continue;
+
+			rayBoxOverlap = 0;
+			btScalar param = 1.0;
+			btVector3 normal;
+			btVector3 bounds[2];
+			bounds[0] = lsMemPtr->bvhShapeData.getOptimizedBvh()->unQuantize(rootNode->m_quantizedAabbMin);
+			bounds[1] = lsMemPtr->bvhShapeData.getOptimizedBvh()->unQuantize(rootNode->m_quantizedAabbMax);
+#ifdef RAYAABB2
+			rayBoxOverlap = btRayAabb2 (rayFrom[i], rayInvDirection[i], sign[i], bounds, param, 0.0, lambda_max[i]);
+#else
+			rayBoxOverlap = btRayAabb(rayFrom[i], rayTo[i], bounds[0], bounds[1], param, normal);
+#endif
+
+#ifndef CALLBACK_ALL
+			anyRayBoxOverlap = rayBoxOverlap || anyRayBoxOverlap;
+			/* If we have any ray vs. box overlap and this isn't a leaf node
+			   we know that we need to dig deeper
+			*/
+			if (!isLeafNode && anyRayBoxOverlap)
+				break;
+
+			if (isLeafNode && rayBoxOverlap)
+			{
+				spuRaycastNodeCallback1* callback = (spuRaycastNodeCallback1*)nodeCallback;
+				callback->setWorkUnit (i);
+				nodeCallback->processNode (0, rootNode->getTriangleIndex());
+			}
+#else
+			/* If we have any ray vs. box overlap and this isn't a leaf node
+			   we know that we need to dig deeper
+			*/
+			// note: this branch stops at the first overlapping ray for leaf nodes too
+			if (rayBoxOverlap)
+			{
+				anyRayBoxOverlap = 1;
+				break;
+			}
+#endif
+		}
+
+#ifdef CALLBACK_ALL
+		if (isLeafNode && anyRayBoxOverlap)
+		{
+			nodeCallback->processNode (0, rootNode->getTriangleIndex());
+		}
+#endif
+
+		// Step to the next node in the array after a leaf or an overlapping
+		// node; otherwise jump ahead by the node's escape index.
+		if (anyRayBoxOverlap || isLeafNode)
+		{
+			rootNode++;
+			curIndex++;
+		} else
+		{
+			escapeIndex = rootNode->getEscapeIndex();
+			rootNode += escapeIndex;
+			curIndex += escapeIndex;
+		}
+	}
+
+}
+
+
+/* Cast a batch of rays against a concave BVH triangle-mesh shape.
+ *
+ * Each ray is transformed into the mesh's local space and the quantized AABB
+ * of the ray segment is tested against the subtree headers of the optimized
+ * BVH. Every subtree overlapped by at least one ray AABB is DMA'd into local
+ * store and walked with spuWalkStacklessQuantizedTreeAgainstRays, which
+ * reports through the node callback constructed below.
+ *
+ * gatheredObjectData  shape pointer and world transform of the object
+ * workUnits           numWorkUnits input rays
+ * workUnitsOut        result slots handed to the node callback
+ * numWorkUnits        number of rays; must not exceed
+ *                     SPU_RAYCAST_WORK_UNITS_PER_TASK, the size of the
+ *                     per-ray scratch arrays
+ * lsMemPtr            local-store scratch memory (BVH shape data, headers, nodes)
+ *
+ * Nothing is cast unless the BVH has subtrees and the mesh interface holds
+ * exactly one indexed mesh.
+ */
+void performRaycastAgainstConcave (RaycastGatheredObjectData* gatheredObjectData, const SpuRaycastTaskWorkUnit* workUnits, SpuRaycastTaskWorkUnitOut* workUnitsOut, int numWorkUnits, RaycastTask_LocalStoreMemory* lsMemPtr)
+{
+	/* the per-ray arrays below are sized for one full task */
+	btAssert(numWorkUnits <= SPU_RAYCAST_WORK_UNITS_PER_TASK);
+
+	btBvhTriangleMeshShape*	trimeshShape = (btBvhTriangleMeshShape*)gatheredObjectData->m_spuCollisionShape;
+
+	//need the mesh interface, for access to triangle vertices
+	dmaBvhShapeData (&(lsMemPtr->bvhShapeData), trimeshShape);
+
+	unsigned short int quantizedQueryAabbMin[SPU_RAYCAST_WORK_UNITS_PER_TASK][3];
+	unsigned short int quantizedQueryAabbMax[SPU_RAYCAST_WORK_UNITS_PER_TASK][3];
+	btVector3 rayFromInTriangleSpace[SPU_RAYCAST_WORK_UNITS_PER_TASK];
+	btVector3 rayToInTriangleSpace[SPU_RAYCAST_WORK_UNITS_PER_TASK];
+
+	/* Calculate the AABB for the ray in the triangle mesh shape */
+	btTransform rayInTriangleSpace;
+	rayInTriangleSpace = gatheredObjectData->m_worldTransform.inverse();
+
+	for (int i = 0; i < numWorkUnits; i++)
+	{
+		btVector3 aabbMin;
+		btVector3 aabbMax;
+
+		rayFromInTriangleSpace[i] = rayInTriangleSpace(workUnits[i].rayFrom);
+		rayToInTriangleSpace[i] = rayInTriangleSpace(workUnits[i].rayTo);
+
+		/* component-wise min/max of the two end points bounds the segment */
+		aabbMin = rayFromInTriangleSpace[i];
+		aabbMin.setMin (rayToInTriangleSpace[i]);
+		aabbMax = rayFromInTriangleSpace[i];
+		aabbMax.setMax (rayToInTriangleSpace[i]);
+
+		lsMemPtr->bvhShapeData.getOptimizedBvh()->quantizeWithClamp(quantizedQueryAabbMin[i],aabbMin,0);
+		lsMemPtr->bvhShapeData.getOptimizedBvh()->quantizeWithClamp(quantizedQueryAabbMax[i],aabbMax,1);
+	}
+
+	QuantizedNodeArray&	nodeArray = lsMemPtr->bvhShapeData.getOptimizedBvh()->getQuantizedNodeArray();
+	//spu_printf("SPU: numNodes = %d\n",nodeArray.size());
+
+	BvhSubtreeInfoArray& subTrees = lsMemPtr->bvhShapeData.getOptimizedBvh()->getSubtreeInfoArray();	
+
+#ifdef CALLBACK_ALL
+	spuRaycastNodeCallback nodeCallback (gatheredObjectData, workUnits, workUnitsOut, numWorkUnits, lsMemPtr);
+#else
+	spuRaycastNodeCallback1 nodeCallback (gatheredObjectData, workUnits, workUnitsOut, lsMemPtr);
+#endif
+	
+	IndexedMeshArray&	indexArray = lsMemPtr->bvhShapeData.gTriangleMeshInterfacePtr->getIndexedMeshArray();
+
+	//spu_printf("SPU:indexArray.size() = %d\n",indexArray.size());
+	//	spu_printf("SPU: numSubTrees = %d\n",subTrees.size());
+	/* only meshes with subtree headers and a single indexed mesh are handled */
+	if (subTrees.size() && indexArray.size() == 1)
+	{
+		///DMA in the index info
+		dmaBvhIndexedMesh (&lsMemPtr->bvhShapeData.gIndexMesh, indexArray, 0 /* index into indexArray */, 1 /* dmaTag */);
+		cellDmaWaitTagStatusAll(DMA_MASK(1));
+		
+		/* process the subtree headers in batches of at most MAX_SPU_SUBTREE_HEADERS */
+		int numBatch = subTrees.size();
+		for (int i=0;i<numBatch;)
+		{
+// BEN: TODO - can reorder DMA transfers for less stall
+			int remaining = numBatch - i;
+			int nextBatch = remaining < MAX_SPU_SUBTREE_HEADERS ? remaining : MAX_SPU_SUBTREE_HEADERS;
+			
+			dmaBvhSubTreeHeaders (&lsMemPtr->bvhShapeData.gSubtreeHeaders[0], (ppu_address_t)(&subTrees[i]), nextBatch, 1);
+			cellDmaWaitTagStatusAll(DMA_MASK(1));
+			
+
+			//			spu_printf("nextBatch = %d\n",nextBatch);
+
+			
+			for (int j=0;j<nextBatch;j++)
+			{
+				const btBvhSubtreeInfo& subtree = lsMemPtr->bvhShapeData.gSubtreeHeaders[j];
+				
+				/* a subtree is only worth fetching if at least one ray AABB touches it;
+				   starting from 0 means an empty ray batch fetches nothing */
+				unsigned int overlap = 0;
+				for (int boxId = 0; boxId < numWorkUnits; boxId++)
+				{
+					overlap = spuTestQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin[boxId],quantizedQueryAabbMax[boxId],subtree.m_quantizedAabbMin,subtree.m_quantizedAabbMax);
+					if (overlap)
+						break;
+				}
+
+				if (overlap)
+				{
+					btAssert(subtree.m_subtreeSize);
+
+					//dma the actual nodes of this subtree
+					dmaBvhSubTreeNodes (&lsMemPtr->bvhShapeData.gSubtreeNodes[0], subtree, nodeArray, 2);
+
+					cellDmaWaitTagStatusAll(DMA_MASK(2));
+
+					/* Walk this subtree */
+					spuWalkStacklessQuantizedTreeAgainstRays(lsMemPtr,
+									        &nodeCallback,
+									        &rayFromInTriangleSpace[0],
+										&rayToInTriangleSpace[0],
+										numWorkUnits,
+										&quantizedQueryAabbMin[0][0],&quantizedQueryAabbMax[0][0],
+										&lsMemPtr->bvhShapeData.gSubtreeNodes[0], 0, subtree.m_subtreeSize);
+				}
+				//				spu_printf("subtreeSize = %d\n",gSubtreeHeaders[j].m_subtreeSize);
+			}
+
+			i+=nextBatch;
+		}
+
+		//pre-fetch first tree, then loop and double buffer
+	}
+	
+}
+
+void performRaycastAgainstCompound (RaycastGatheredObjectData* gatheredObjectData, const SpuRaycastTaskWorkUnit& workUnit, SpuRaycastTaskWorkUnitOut* workUnitOut, RaycastTask_LocalStoreMemory* lsMemPtr) /* Placeholder for ray vs. compound shape casts: the body is empty, so no parameter is read and workUnitOut is left untouched. */
+{
+	//XXX Not implemented yet. Disabled diagnostic: spu_printf ("Currently no support for ray. vs compound objects. Support coming soon.\n");
+}
+
+/* Cast a single ray against a convex shape.
+ *
+ * For convex hulls the hull object and its vertex data are first DMA'd into
+ * local store. The ray is then cast with SpuSubsimplexRayCast, treating the
+ * object as static (the same world transform is used as its start and end
+ * pose). workUnitOut is written only when the caster reports a hit.
+ */
+void
+performRaycastAgainstConvex (RaycastGatheredObjectData* gatheredObjectData, const SpuRaycastTaskWorkUnit& workUnit, SpuRaycastTaskWorkUnitOut* workUnitOut, RaycastTask_LocalStoreMemory* lsMemPtr)
+{
+	/* Local copy of the hull object, needed to locate its vertex data */
+	/* XXX: We might be loading the shape twice */
+	ATTRIBUTE_ALIGNED16(char hullStorage[sizeof(btConvexHullShape)]);
+	if (gatheredObjectData->m_shapeType == CONVEX_HULL_SHAPE_PROXYTYPE)
+	{
+		ppu_address_t hullEa = gatheredObjectData->m_collisionShape;
+		int hullBytes = sizeof(btConvexHullShape);
+
+		cellDmaGet(&hullStorage, hullEa, hullBytes, DMA_TAG(1), 0, 0);
+		cellDmaWaitTagStatusAll(DMA_MASK(1));
+
+		dmaConvexVertexData (&lsMemPtr->convexVertexData, (btConvexHullShape*)&hullStorage);
+		cellDmaWaitTagStatusAll(DMA_MASK(2)); // dmaConvexVertexData uses dma channel 2!
+
+		lsMemPtr->convexVertexData.gSpuConvexShapePtr = gatheredObjectData->m_spuCollisionShape;
+		lsMemPtr->convexVertexData.gConvexPoints = &lsMemPtr->convexVertexData.g_convexPointBuffer[0];
+	}
+
+	/* The ray is described by two pure translations: its start and its end point */
+	btTransform castFrom, castTo;
+	castFrom.setIdentity ();
+	castTo.setIdentity ();
+	castFrom.setOrigin (workUnit.rayFrom);
+	castTo.setOrigin (workUnit.rayTo);
+
+	SpuVoronoiSimplexSolver simplexSolver;
+	SpuCastResult castResult;
+	SpuSubsimplexRayCast caster (gatheredObjectData->m_spuCollisionShape, &lsMemPtr->convexVertexData, gatheredObjectData->m_shapeType, gatheredObjectData->m_collisionMargin, &simplexSolver);
+
+	/* A miss leaves the caller's output untouched */
+	if (!caster.calcTimeOfImpact (castFrom, castTo, gatheredObjectData->m_worldTransform, gatheredObjectData->m_worldTransform, castResult))
+		return;
+
+	workUnitOut->hitFraction = castResult.m_fraction;
+	workUnitOut->hitNormal = castResult.m_normal;
+}
+
+/* Merge a candidate hit into a ray's result record in main memory.
+ *
+ * A candidate whose hitFraction is still the 1.0 seed value means "no hit" and
+ * is ignored. Otherwise the current result is loaded, and written back only
+ * when the candidate is closer (only the closest hit is supported for now).
+ */
+static void storeRayHitIfCloser (const SpuRaycastTaskWorkUnit& workUnit, const SpuRaycastTaskWorkUnitOut& candidate)
+{
+	if (candidate.hitFraction == 1.0)
+		return;
+
+	ATTRIBUTE_ALIGNED16(SpuRaycastTaskWorkUnitOut workUnitOut);
+	dmaLoadRayOutput ((ppu_address_t)workUnit.output, &workUnitOut, 1);
+	cellDmaWaitTagStatusAll(DMA_MASK(1));
+
+	/* XXX Only support taking the closest hit for now */
+	if (candidate.hitFraction < workUnitOut.hitFraction)
+	{
+		workUnitOut.hitFraction = candidate.hitFraction;
+		workUnitOut.hitNormal = candidate.hitNormal;
+
+		/* write ray cast data back */
+		dmaStoreRayOutput ((ppu_address_t)workUnit.output, &workUnitOut, 1);
+		cellDmaWaitTagStatusAll(DMA_MASK(1));
+	}
+}
+
+/* Entry point of the raycast task.
+ *
+ * userPtr   points to the SpuRaycastTaskDesc describing the rays and objects
+ * lsMemory  points to the task's RaycastTask_LocalStoreMemory scratch area
+ *
+ * Every collision object wrapper in the task is gathered in turn and all rays
+ * of the task are cast against it, dispatching on the shape type (concave,
+ * convex or compound). Hits that are closer than the result already stored for
+ * a ray are written back to that ray's output record.
+ */
+void	processRaycastTask(void* userPtr, void* lsMemory)
+{
+	RaycastTask_LocalStoreMemory* localMemory = (RaycastTask_LocalStoreMemory*)lsMemory;
+
+	SpuRaycastTaskDesc* taskDescPtr = (SpuRaycastTaskDesc*)userPtr;
+	SpuRaycastTaskDesc& taskDesc = *taskDescPtr;
+
+	SpuCollisionObjectWrapper* cows = (SpuCollisionObjectWrapper*)taskDesc.spuCollisionObjectsWrappers;
+
+	//spu_printf("in processRaycastTask %d\n", taskDesc.numSpuCollisionObjectWrappers);
+	/* for each object */
+	RaycastGatheredObjectData gatheredObjectData;
+	for (unsigned int objectId = 0; objectId < taskDesc.numSpuCollisionObjectWrappers; objectId++)
+	{
+		//spu_printf("%d / %d\n", objectId, taskDesc.numSpuCollisionObjectWrappers);
+		
+		/* load initial collision shape */
+		GatherCollisionObjectAndShapeData (&gatheredObjectData, localMemory, (ppu_address_t)&cows[objectId]);
+
+		if (btBroadphaseProxy::isConcave (gatheredObjectData.m_shapeType))
+		{
+			/* the concave path casts the whole batch at once into temporaries */
+			SpuRaycastTaskWorkUnitOut tWorkUnitsOut[SPU_RAYCAST_WORK_UNITS_PER_TASK];
+			for (unsigned int rayId = 0; rayId < taskDesc.numWorkUnits; rayId++)
+			{
+				tWorkUnitsOut[rayId].hitFraction = 1.0;
+			}
+
+			performRaycastAgainstConcave (&gatheredObjectData, &taskDesc.workUnits[0], &tWorkUnitsOut[0], taskDesc.numWorkUnits, localMemory);
+
+			for (unsigned int rayId = 0; rayId < taskDesc.numWorkUnits; rayId++)
+			{
+				storeRayHitIfCloser (taskDesc.workUnits[rayId], tWorkUnitsOut[rayId]);
+			}
+		} else if (btBroadphaseProxy::isConvex (gatheredObjectData.m_shapeType)) {
+
+			btVector3 objectBoxMin, objectBoxMax;
+			computeAabb (objectBoxMin, objectBoxMax, (btConvexInternalShape*)gatheredObjectData.m_spuCollisionShape, gatheredObjectData.m_collisionShape, gatheredObjectData.m_shapeType, gatheredObjectData.m_worldTransform);
+			for (unsigned int rayId = 0; rayId < taskDesc.numWorkUnits; rayId++)
+			{
+				const SpuRaycastTaskWorkUnit& workUnit = taskDesc.workUnits[rayId];
+			
+				/* cheap rejection: skip the full cast when the ray misses the object's AABB */
+				btScalar ignored_param = 1.0;
+				btVector3 ignored_normal;
+				if (btRayAabb(workUnit.rayFrom, workUnit.rayTo, objectBoxMin, objectBoxMax, ignored_param, ignored_normal))
+				{
+					SpuRaycastTaskWorkUnitOut tWorkUnitOut;
+					tWorkUnitOut.hitFraction = 1.0;
+
+					performRaycastAgainstConvex (&gatheredObjectData, workUnit, &tWorkUnitOut, localMemory);
+					storeRayHitIfCloser (workUnit, tWorkUnitOut);
+				}
+			}
+
+		} else if (btBroadphaseProxy::isCompound (gatheredObjectData.m_shapeType)) {
+			for (unsigned int rayId = 0; rayId < taskDesc.numWorkUnits; rayId++)
+			{
+				const SpuRaycastTaskWorkUnit& workUnit = taskDesc.workUnits[rayId];
+				SpuRaycastTaskWorkUnitOut tWorkUnitOut;
+				tWorkUnitOut.hitFraction = 1.0;
+
+				performRaycastAgainstCompound (&gatheredObjectData, workUnit, &tWorkUnitOut, localMemory);
+				storeRayHitIfCloser (workUnit, tWorkUnitOut);
+			}
+		}
+	}
+}
--- a/src/BulletMultiThreaded/SpuRaycastTask/SpuRaycastTask.h
+++ b/src/BulletMultiThreaded/SpuRaycastTask/SpuRaycastTask.h
@@ -0,0 +1,50 @@
+#ifndef __SPU_RAYCAST_TASK_H
+#define __SPU_RAYCAST_TASK_H
+
+#include "BulletCollision/CollisionDispatch/btCollisionObject.h"
+#include "BulletCollision/CollisionDispatch/btCollisionWorld.h"
+#include "LinearMath/btVector3.h"
+#include "../PlatformDefinitions.h"
+
+ATTRIBUTE_ALIGNED16(struct) RaycastGatheredObjectData /* Per-object shape data the raycast task gathers before casting its rays against that object. */
+{
+	ppu_address_t m_collisionShape; /* address of the collision shape in main (PPU) memory; used as a DMA source */
+	void* m_spuCollisionShape; /* shape pointer as used by the task; the raycast routines cast it to the concrete shape type */
+	btVector3	m_primitiveDimensions; /* NOTE(review): presumably the implicit shape dimensions; not read by the raycast routines - confirm */
+	int		m_shapeType; /* shape type id; selects the concave / convex / compound raycast path */
+	float	m_collisionMargin; /* collision margin handed to the convex ray caster */
+	btTransform	m_worldTransform; /* world transform of the object */
+};
+
+
+ATTRIBUTE_ALIGNED16(struct) SpuRaycastTaskWorkUnitOut /* Result record of one ray; the task loads it, keeps the closest hit and stores it back. */
+{
+	btVector3 hitNormal; /* out: normal reported by the cast that produced hitFraction */
+	btScalar hitFraction; /* out: fraction along the ray of the closest hit; the task only overwrites it with a smaller value */
+	btCollisionWorld::LocalShapeInfo shapeInfo; /* out: NOTE(review): not written by processRaycastTask itself - confirm who fills it */
+};
+
+/* One ray to cast: a segment from rayFrom to rayTo plus the address of the record that receives the result */
+ATTRIBUTE_ALIGNED16(struct) SpuRaycastTaskWorkUnit
+{
+	btVector3 rayFrom; /* in: start point of the ray */
+	btVector3 rayTo; /* in: end point of the ray */
+	SpuRaycastTaskWorkUnitOut* output; /* out: result record; the task treats this pointer as a main-memory (ppu_address_t) DMA address */
+};
+
+#define SPU_RAYCAST_WORK_UNITS_PER_TASK 16 /* maximum number of rays batched into one task descriptor */
+
+ATTRIBUTE_ALIGNED128(struct) SpuRaycastTaskDesc /* Description of one raycast task: a batch of rays and the set of objects to cast them against. */
+{
+	SpuRaycastTaskWorkUnit workUnits[SPU_RAYCAST_WORK_UNITS_PER_TASK]; /* the rays; only the first numWorkUnits entries are valid */
+	unsigned int numWorkUnits; /* number of valid entries in workUnits */
+	void* spuCollisionObjectsWrappers; /* array of SpuCollisionObjectWrapper; the task casts it to that type and indexes it per object */
+	unsigned int numSpuCollisionObjectWrappers; /* number of entries in spuCollisionObjectsWrappers */
+	int taskId; /* index of the task slot that issued this descriptor */
+};
+
+
+void	processRaycastTask (void* userPtr, void* lsMemory); /* task entry point: userPtr is a SpuRaycastTaskDesc*, lsMemory the task's local-store scratch memory */
+void*	createRaycastLocalStoreMemory (); /* NOTE(review): presumably returns the scratch memory passed to processRaycastTask as lsMemory - confirm against its definition */
+
+#endif
--- a/src/BulletMultiThreaded/SpuRaycastTask/SpuSubSimplexConvexCast.cpp
+++ b/src/BulletMultiThreaded/SpuRaycastTask/SpuSubSimplexConvexCast.cpp
@@ -0,0 +1,152 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "SpuSubSimplexConvexCast.h"
+
+
+#include "BulletCollision/CollisionShapes/btConvexShape.h"
+#include "BulletCollision/CollisionShapes/btMinkowskiSumShape.h"
+#include "BulletCollision/NarrowPhaseCollision/btSimplexSolverInterface.h"
+
+
+/* Remember the description of shape B (pointer, vertex data, type id, margin)
+ * and the simplex solver that calcTimeOfImpact will use. Nothing is computed here.
+ */
+SpuSubsimplexRayCast::SpuSubsimplexRayCast (void* shapeB, SpuConvexPolyhedronVertexData* convexDataB, int shapeTypeB, float marginB, SpuVoronoiSimplexSolver* simplexSolver)
+	: m_simplexSolver(simplexSolver)
+	, m_shapeB(shapeB)
+	, m_convexDataB(convexDataB)
+	, m_shapeTypeB(shapeTypeB)
+	, m_marginB(marginB)
+{
+}
+
+///Typically the conservative advancement reaches solution in a few iterations, clip it to MAX_ITERATIONS (32, or 64 with BT_USE_DOUBLE_PRECISION) for degenerate cases.
+///See discussion about this here http://continuousphysics.com/Bullet/phpBB2/viewtopic.php?t=565
+#ifdef BT_USE_DOUBLE_PRECISION
+#define MAX_ITERATIONS 64
+#else
+#define MAX_ITERATIONS 32
+#endif
+
+/* Support mapping used by the ray cast, i.e. the support point of
+ * MSUM(Pellet, ConvexShape).
+ *
+ * The ray side is a tiny "pellet": the point 0.0001 along the negated unit
+ * axis, transformed by xformRay. The shape side is the support vertex of
+ * shape B for the axis rotated into B's frame, transformed by xformB and
+ * pushed out by marginB along the unit axis.
+ *
+ * Outputs: supVertexRay (pellet point), supVertexB (shape point) and
+ * w = supVertexRay - supVertexB.
+ */
+void supportPoints (const btTransform xformRay,
+		    const btTransform xformB,
+		    const int shapeType,
+		    const void* shape,
+		    SpuConvexPolyhedronVertexData* convexVertexData,
+		    const btScalar marginB,
+		    const btVector3& seperatingAxis,
+		    btVector3& w,
+		    btVector3& supVertexRay,
+		    btVector3& supVertexB)
+{
+	/* unit-length copy of the axis, shared by the pellet offset and the margin */
+	btVector3 unitAxis = seperatingAxis;
+	unitAxis.normalize();
+
+	/* shape side: support vertex without margin, then add the margin */
+	btVector3 axisInShapeFrame = seperatingAxis * xformB.getBasis();
+	btVector3 shapePoint = xformB(localGetSupportingVertexWithoutMargin(shapeType, (void*)shape, axisInShapeFrame, convexVertexData));
+	shapePoint += unitAxis * marginB;
+
+	/* ray side: the pellet */
+	btVector3 pelletPoint = xformRay(0.0001 * -unitAxis);
+
+	/* outputs are written last, after every input has been read */
+	w = pelletPoint - shapePoint;
+	supVertexRay = pelletPoint;
+	supVertexB = shapePoint;
+}
+
+/* GJK-based ray cast of the ray (fromRay -> toRay) against shape B moving
+ * from fromB to toB. Only the origins of the transforms are interpolated.
+ *
+ * Returns false when the advance passes the end of the ray (lambda > 1) or
+ * when the ray is not moving towards the shape along the current separating
+ * direction. Otherwise returns true with
+ *   result.m_fraction = lambda, the fraction along the ray that was reached
+ *   result.m_normal   = the (unnormalized) separating vector of the last
+ *                       advance, or zero if the ray never had to advance.
+ *
+ * The loop ends when the squared distance drops to epsilon or below, or
+ * after MAX_ITERATIONS iterations.
+ */
+bool	SpuSubsimplexRayCast::calcTimeOfImpact(const btTransform& fromRay,
+											   const btTransform& toRay,
+											   const btTransform& fromB,
+											   const btTransform& toB,
+											   SpuCastResult& result)
+{
+	m_simplexSolver->reset();
+
+	btVector3 linVelRay, linVelB;
+	linVelRay = toRay.getOrigin() - fromRay.getOrigin();
+	linVelB = toB.getOrigin() - fromB.getOrigin ();
+
+	btScalar lambda = btScalar(0.);
+	
+	btTransform interpolatedTransRay = fromRay;
+	btTransform interpolatedTransB = fromB;
+
+	/* relative motion of the ray with respect to shape B */
+	btVector3 r = (linVelRay-linVelB);
+	btVector3 supVertexRay;
+	btVector3 supVertexB;
+	btVector3 v;
+	supportPoints (fromRay, fromB, m_shapeTypeB, m_shapeB, m_convexDataB, m_marginB, r, v, supVertexRay, supVertexB);
+
+	btVector3 n;
+	n.setValue(btScalar(0.), btScalar(0.), btScalar(0.));
+	int maxIter = MAX_ITERATIONS;
+
+	btScalar dist2 = v.length2();
+
+	/* same tolerance for single and double precision */
+	btScalar epsilon = btScalar(0.0001);
+	btVector3 w;
+	btScalar VdotR;
+	
+	while ( (dist2 > epsilon) && maxIter--)
+	{
+		supportPoints (interpolatedTransRay, interpolatedTransB, m_shapeTypeB, m_shapeB, m_convexDataB, m_marginB, v, w, supVertexRay, supVertexB);
+
+		btScalar VdotW = v.dot(w);
+
+		/* advanced beyond the end of the ray: no hit */
+		if (lambda > btScalar(1.0))
+		{
+			return false;
+		}
+
+		if ( VdotW > btScalar(0.))
+		{
+			VdotR = v.dot(r);
+
+			/* not moving towards the shape along v: no hit */
+			if (VdotR >= -(SIMD_EPSILON*SIMD_EPSILON))
+				return false;
+			else
+			{
+				/* advance along the ray and move both transforms to the new fraction */
+				lambda = lambda - VdotW / VdotR;
+				interpolatedTransRay.getOrigin().setInterpolate3(fromRay.getOrigin(), toRay.getOrigin(), lambda);
+				interpolatedTransB.getOrigin().setInterpolate3(fromB.getOrigin(), toB.getOrigin(), lambda);
+				n = v;
+			}
+		} 
+		m_simplexSolver->addVertex(w, supVertexRay, supVertexB);
+		if (m_simplexSolver->closest(v))
+		{
+			dist2 = v.length2();
+			//printf("V=%f , %f, %f\n",v[0],v[1],v[2]);
+			//printf("DIST2=%f\n",dist2);
+			//printf("numverts = %i\n",m_simplexSolver->numVertices());
+		} else
+		{
+			/* solver could not produce a closest point: stop iterating */
+			dist2 = btScalar(0.);
+		} 
+	}
+
+	result.m_fraction = lambda;
+	result.m_normal = n;
+	btVector3 hitRay, hitB;
+	m_simplexSolver->compute_points (hitRay, hitB);
+	/* TODO: We could output hit point here (hitB) */
+	return true;
+}
--- a/src/BulletMultiThreaded/SpuRaycastTask/SpuSubSimplexConvexCast.h
+++ b/src/BulletMultiThreaded/SpuRaycastTask/SpuSubSimplexConvexCast.h
@@ -0,0 +1,60 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#ifndef SPU_SUBSIMPLEX_RAY_CAST_H
+#define SPU_SUBSIMPLEX_RAY_CAST_H
+
+#include "../SpuNarrowPhaseCollisionTask/SpuVoronoiSimplexSolver.h"
+#include "../SpuNarrowPhaseCollisionTask/SpuCollisionShapes.h"
+#include "SpuRaycastTask.h"
+
+class btConvexShape;
+
+struct SpuCastResult /* Output of SpuSubsimplexRayCast::calcTimeOfImpact. */
+{
+	float m_fraction; /* fraction along the ray reached by the cast (the final lambda) */
+	btVector3 m_normal; /* separating vector at the last advance; not normalized by the caster */
+};
+
+/// SpuSubsimplexRayCast follows btSubsimplexConvexCast, which implements Gino van den Bergen's paper
+///"Ray Casting against General Convex Objects with Application to Continuous Collision Detection"
+/// GJK based Ray Cast, optimized version
+/// Objects should not start in overlap, otherwise results are not defined.
+class SpuSubsimplexRayCast
+{
+	SpuVoronoiSimplexSolver* m_simplexSolver; // simplex solver; reset at the start of every calcTimeOfImpact call
+	void* m_shapeB; // the convex shape the ray is cast against
+	SpuConvexPolyhedronVertexData* m_convexDataB; // vertex data handed to the support-vertex lookup for shape B
+	int m_shapeTypeB; // shape type id of shape B
+	float m_marginB; // margin added to shape B's support points
+
+public:
+	SpuSubsimplexRayCast (void* shapeB, SpuConvexPolyhedronVertexData* convexDataB, int shapeTypeB, float marginB,
+						  SpuVoronoiSimplexSolver* simplexSolver);
+
+	//virtual ~btSubsimplexConvexCast();
+
+	///calcTimeOfImpact calculates the time of impact+normal for the linear cast (sweep) of the ray (fromRay to toRay) against shape B (moving from fromB to toB); returns false when there is no hit.
+	///Precondition is that the objects should not penetrate/overlap at the start of the interval. Overlap can be tested using btGjkPairDetector.
+	bool calcTimeOfImpact(const btTransform& fromRay,
+						  const btTransform& toRay,
+						  const btTransform& fromB,
+						  const btTransform& toB,
+						  SpuCastResult& result);
+
+};
+
+#endif //SPU_SUBSIMPLEX_RAY_CAST_H
--- a/src/BulletMultiThreaded/SpuRaycastTaskProcess.cpp
+++ b/src/BulletMultiThreaded/SpuRaycastTaskProcess.cpp
@@ -0,0 +1,189 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "SpuRaycastTaskProcess.h"
+
+
+/* Set up maxNumOutstandingTasks task slots, all idle, and start the workers
+ * through the thread interface.
+ *
+ * threadInterface         used to start/stop workers and exchange requests
+ * maxNumOutstandingTasks  number of task descriptors that may be in flight
+ *
+ * The collision object wrapper array is supplied later through initialize2;
+ * until then it is recorded as empty so that no task can be issued with
+ * indeterminate values.
+ */
+SpuRaycastTaskProcess::SpuRaycastTaskProcess(class	btThreadSupportInterface*	threadInterface,  int	maxNumOutstandingTasks)
+:m_threadInterface(threadInterface),
+m_maxNumOutstandingTasks(maxNumOutstandingTasks)
+{
+	m_workUnitTaskBuffers = (unsigned char *)0;
+	m_taskBusy.resize(m_maxNumOutstandingTasks);
+	m_spuRaycastTaskDesc.resize(m_maxNumOutstandingTasks);
+
+	for (int i = 0; i < m_maxNumOutstandingTasks; i++)
+	{
+		m_taskBusy[i] = false;
+	}
+	m_numBusyTasks = 0;
+	m_currentTask = 0;
+	m_currentWorkUnitInTask = 0;
+
+	/* no objects known until initialize2 is called */
+	m_numSpuCollisionObjectWrappers = 0;
+	m_spuCollisionObjectWrappers = 0;
+
+	m_threadInterface->startSPU();
+
+	//printf("sizeof vec_float4: %d\n", sizeof(vec_float4));
+	//printf("sizeof SpuGatherAndProcessWorkUnitInput: %d\n", sizeof(SpuGatherAndProcessWorkUnitInput));
+
+}
+
+/* Release the work-unit buffer (if one is held) and stop the workers. */
+SpuRaycastTaskProcess::~SpuRaycastTaskProcess()
+{
+	if (m_workUnitTaskBuffers)
+	{
+		btAlignedFree(m_workUnitTaskBuffers);
+		m_workUnitTaskBuffers = 0;
+	}
+
+	m_threadInterface->stopSPU();
+}
+
+
+
+/* Record the collision object wrappers that subsequent tasks will cast
+ * against and reset the task bookkeeping: every slot idle, slot 0 current,
+ * no work units queued.
+ */
+void SpuRaycastTaskProcess::initialize2(void* spuCollisionObjectsWrappers, int numSpuCollisionObjectWrappers)
+{
+	m_numSpuCollisionObjectWrappers = numSpuCollisionObjectWrappers;
+	m_spuCollisionObjectWrappers = spuCollisionObjectsWrappers;
+
+	/* mark every task slot as free */
+	int slot = 0;
+	while (slot < m_maxNumOutstandingTasks)
+	{
+		m_taskBusy[slot] = false;
+		++slot;
+	}
+
+	m_currentWorkUnitInTask = 0;
+	m_currentTask = 0;
+	m_numBusyTasks = 0;
+
+#ifdef DEBUG_SpuRaycastTaskProcess
+	m_initialized = true;
+#endif
+}
+
+
+/* Send the current task descriptor to a worker. If that leaves every slot
+ * busy, block until one task reports completion and free its slot.
+ */
+void SpuRaycastTaskProcess::issueTask2()
+{
+	/* claim the current slot and hand its descriptor over */
+	SpuRaycastTaskDesc& desc = m_spuRaycastTaskDesc[m_currentTask];
+	desc.taskId = m_currentTask;
+	m_taskBusy[m_currentTask] = true;
+	++m_numBusyTasks;
+
+	m_threadInterface->sendRequest(1, (uint32_t) &desc,m_currentTask);
+	//printf("send thread requested for task %d\n", m_currentTask);
+
+	/* a slot is still free: nothing to wait for */
+	if (m_numBusyTasks < m_maxNumOutstandingTasks)
+	{
+		//printf("Sent request, not enough busy tasks\n");
+		return;
+	}
+
+	/* all tasks busy, wait for spu event to clear the task. */
+	unsigned int taskId;
+	unsigned int outputSize;
+
+	/* seed taskId with the first busy slot before waiting */
+	for (int slot = 0; slot < m_maxNumOutstandingTasks; ++slot)
+	{
+		if (m_taskBusy[slot])
+		{
+			taskId = slot;
+			break;
+		}
+	}
+	m_threadInterface->waitForResponse(&taskId, &outputSize);
+
+	//printf("PPU: after issue, received event: %u %d\n", taskId, outputSize);
+
+	m_taskBusy[taskId] = false;
+	--m_numBusyTasks;
+}
+
+/* Append one ray to the current task descriptor. When the descriptor holds
+ * SPU_RAYCAST_WORK_UNITS_PER_TASK rays it is completed, issued, and the first
+ * free task slot becomes the current one.
+ */
+void SpuRaycastTaskProcess::addWorkToTask(SpuRaycastTaskWorkUnit& workunit)
+{
+	SpuRaycastTaskDesc& desc = m_spuRaycastTaskDesc[m_currentTask];
+	desc.workUnits[m_currentWorkUnitInTask++] = workunit;
+
+	/* descriptor not full yet: keep batching */
+	if (m_currentWorkUnitInTask != SPU_RAYCAST_WORK_UNITS_PER_TASK)
+		return;
+
+	desc.numWorkUnits = m_currentWorkUnitInTask;
+	desc.numSpuCollisionObjectWrappers = m_numSpuCollisionObjectWrappers;
+	desc.spuCollisionObjectsWrappers = m_spuCollisionObjectWrappers;
+	//printf("Task buffer full, issuing\n");
+	issueTask2 ();
+	//printf("Returned from issueTask2()\n");
+	m_currentWorkUnitInTask = 0;
+
+	/* find new task buffer: the first slot that is not busy */
+	int slot = 0;
+	while (slot < m_maxNumOutstandingTasks && m_taskBusy[slot])
+	{
+		++slot;
+	}
+	if (slot < m_maxNumOutstandingTasks)
+	{
+		m_currentTask = slot;
+	}
+	//printf("next task = %d\n", m_currentTask);
+}
+
+
+/* Issue a partially filled task descriptor, if any, then block until every
+ * outstanding task has reported completion.
+ */
+void 
+SpuRaycastTaskProcess::flush2()
+{
+#ifdef DEBUG_SPU_TASK_SCHEDULING
+	printf("\nSpuRaycastTaskProcess::flush()\n");
+#endif //DEBUG_SPU_TASK_SCHEDULING
+
+	/* submit the rays that did not fill a whole task */
+	//printf("Flushing... %d remaining\n", m_currentWorkUnitInTask);
+	if (m_currentWorkUnitInTask > 0)
+	{
+		SpuRaycastTaskDesc& desc = m_spuRaycastTaskDesc[m_currentTask];
+		desc.numWorkUnits = m_currentWorkUnitInTask;
+		desc.numSpuCollisionObjectWrappers = m_numSpuCollisionObjectWrappers;
+		desc.spuCollisionObjectsWrappers = m_spuCollisionObjectWrappers;
+		issueTask2();
+		m_currentWorkUnitInTask = 0;
+	}
+
+	/* all tasks are issued, wait for all tasks to be complete */
+	while (m_numBusyTasks > 0)
+	{
+		unsigned int taskId;
+		unsigned int outputSize;
+
+		/* seed taskId with the first busy slot before waiting */
+		for (int slot = 0; slot < m_maxNumOutstandingTasks; ++slot)
+		{
+			if (m_taskBusy[slot])
+			{
+				taskId = slot;
+				break;
+			}
+		}
+
+		//printf("Busy tasks... %d\n", m_numBusyTasks);
+
+		// SPURS support.
+		m_threadInterface->waitForResponse(&taskId, &outputSize);
+
+		//printf("PPU: flushing, received event: %u %d\n", taskId, outputSize);
+
+		//postProcess(taskId, outputSize);
+
+		m_taskBusy[taskId] = false;
+		--m_numBusyTasks;
+	}
+}
--- a/src/BulletMultiThreaded/SpuRaycastTaskProcess.h
+++ b/src/BulletMultiThreaded/SpuRaycastTaskProcess.h
@@ -0,0 +1,72 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef SPU_RAY_TASK_PROCESS_H
+#define SPU_RAY_TASK_PROCESS_H
+
+#include <assert.h>
+#include <string.h>
+
+#include <LinearMath/btScalar.h>
+#include "BulletCollision/CollisionDispatch/btCollisionObject.h"
+#include <LinearMath/btAlignedAllocator.h>
+
+#include "PlatformDefinitions.h"
+#include "LinearMath/btAlignedObjectArray.h"
+#include "SpuRaycastTask/SpuRaycastTask.h"
+
+#include "btThreadSupportInterface.h"
+
+/// SpuRaycastTaskProcess handles SPU processing of raycast requests: rays are batched into task descriptors, issued through the thread interface, and flush2 waits for completion
+class SpuRaycastTaskProcess
+{
+	unsigned char  *m_workUnitTaskBuffers; // set to null by the constructor; freed with btAlignedFree in the destructor if non-null
+
+	// track task buffers that are being used, and total busy tasks
+	btAlignedObjectArray<bool>	m_taskBusy; // one flag per task slot, true while the task is in flight
+	btAlignedObjectArray<SpuRaycastTaskDesc>	m_spuRaycastTaskDesc; // one task descriptor per slot
+
+	btThreadSupportInterface*	m_threadInterface; // used to start/stop workers, send requests and wait for responses
+
+	int	m_maxNumOutstandingTasks; // number of task slots
+
+	int	m_numBusyTasks; // number of slots currently in flight
+
+	// the current task and the current entry to insert a new work unit
+	int m_currentTask;
+	int m_currentWorkUnitInTask;
+	int m_numSpuCollisionObjectWrappers; // number of entries in m_spuCollisionObjectWrappers, set by initialize2
+	void* m_spuCollisionObjectWrappers; // collision object wrapper array copied into every issued descriptor, set by initialize2
+	void issueTask2(); // sends the current descriptor; blocks for one response when every slot is busy
+	//void postProcess(unsigned int taskId, int outputSize);
+
+public:
+	SpuRaycastTaskProcess(btThreadSupportInterface*	threadInterface, int maxNumOutstandingTasks);
+	
+	~SpuRaycastTaskProcess();
+	
+	/// call initialize2 (typically in the beginning of the frame) before addWorkToTask; it stores the collision object wrappers and resets the task bookkeeping
+	void initialize2(void* spuCollisionObjectsWrappers, int numSpuCollisionObjectWrappers);
+
+	/// batch up additional work to a current task for SPU processing. When batch is full, it issues the task.
+	void addWorkToTask(struct SpuRaycastTaskWorkUnit&);
+
+	/// call flush to submit potential outstanding work to SPUs and wait for all involved SPUs to be finished
+	void flush2();
+};
+
+
+#endif // SPU_RAY_TASK_PROCESS_H
+
--- a/src/BulletMultiThreaded/SpuSampleTask/SpuSampleTask.cpp
+++ b/src/BulletMultiThreaded/SpuSampleTask/SpuSampleTask.cpp
@@ -0,0 +1,214 @@
+/*
+Bullet Continuous Collision Detection and Physics Library, Copyright (c) 2007 Erwin Coumans
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+
+*/
+
+
+#include "SpuSampleTask.h"
+#include "BulletDynamics/Dynamics/btRigidBody.h"
+#include "../PlatformDefinitions.h"
+#include "../SpuFakeDma.h"
+#include "LinearMath/btMinMax.h"
+
+#ifdef __SPU__
+#include <spu_printf.h>
+#else
+#include <stdio.h>
+#define spu_printf printf
+#endif
+
+#define MAX_NUM_BODIES 8192
+
+/// Scratch memory used by processSampleTask; an instance is obtained from createSampleLocalStoreMemory.
+struct SampleTask_LocalStoreMemory
+{
+	// landing buffer for one btRigidBody fetched by DMA (16 bytes are reserved beyond sizeof(btRigidBody))
+	ATTRIBUTE_ALIGNED16(char gLocalRigidBody [sizeof(btRigidBody)+16]);
+	// local copy of the main-memory array of body pointers; holds at most MAX_NUM_BODIES entries
+	ATTRIBUTE_ALIGNED16(void* gPointerArray[MAX_NUM_BODIES]);
+
+};
+
+
+
+
+//-- MAIN METHOD
+/// Copies the main-memory array of rigid body pointers for one batch into localMemory->gPointerArray.
+/// Returns true when the array was fetched and holds at least one entry; returns false when the batch
+/// is empty/negative or exceeds MAX_NUM_BODIES (the latter is reported through spu_printf).
+static bool sampleTaskFetchBodyPointers(SampleTask_LocalStoreMemory* localMemory, uint64_t ppuArrayAddress, int batchSize)
+{
+	if (batchSize>MAX_NUM_BODIES)
+	{
+		spu_printf("SPU Error: exceed number of bodies, see MAX_NUM_BODIES in SpuSampleTask.cpp\n");
+		return false;
+	}
+	// nothing to fetch; this also keeps a negative count from turning into a negative DMA size
+	if (batchSize<=0)
+	{
+		return false;
+	}
+
+	int dmaArraySize = batchSize*sizeof(void*);
+
+	//	spu_printf("array location is at %llx, batchSize = %d, DMA size = %d\n",ppuArrayAddress,batchSize,dmaArraySize);
+
+	// transfers of 16 bytes or more use the large get, smaller ones the stalling unaligned small get
+	if (dmaArraySize>=16)
+	{
+		cellDmaLargeGet((void*)&localMemory->gPointerArray[0], ppuArrayAddress  , dmaArraySize, DMA_TAG(1), 0, 0);
+		cellDmaWaitTagStatusAll(DMA_MASK(1));
+	} else
+	{
+		stallingUnalignedDmaSmallGet((void*)&localMemory->gPointerArray[0], ppuArrayAddress  , dmaArraySize);
+	}
+	return true;
+}
+
+/// Fetches one btRigidBody from main memory into localMemory->gLocalRigidBody, waits for the
+/// transfer and returns the local copy. The result is never null: it is the address of the local buffer.
+static btRigidBody* sampleTaskFetchBody(SampleTask_LocalStoreMemory* localMemory, uint64_t ppuRigidBodyAddress)
+{
+	void* localPtr = &localMemory->gLocalRigidBody[0];
+	int dmaBodySize = sizeof(btRigidBody);
+
+	cellDmaGet(localPtr, ppuRigidBodyAddress  , dmaBodySize, DMA_TAG(1), 0, 0);
+	cellDmaWaitTagStatusAll(DMA_MASK(1));
+
+	return (btRigidBody*) localPtr;
+}
+
+/// Writes the local copy of a rigid body back to its main-memory address and waits for the transfer.
+static void sampleTaskStoreBody(btRigidBody* body, uint64_t ppuRigidBodyAddress)
+{
+	int dmaBodySize = sizeof(btRigidBody);
+
+	cellDmaLargePut((void*)body, ppuRigidBodyAddress  , dmaBodySize, DMA_TAG(1), 0, 0);
+	cellDmaWaitTagStatusAll(DMA_MASK(1));
+}
+
+/// Executes one sample task.
+/// userPtr  - points to the SpuSampleTaskDesc describing the command, the main-memory address of an
+///            array of body pointers (m_mainMemoryPtr) and the number of entries (m_sampleValue).
+/// lsMemory - points to the SampleTask_LocalStoreMemory scratch area.
+/// Each body is processed one at a time: DMA in, update, DMA back out if it was modified.
+/// Unknown commands are ignored.
+void processSampleTask(void* userPtr, void* lsMemory)
+{
+	//	BT_PROFILE("processSampleTask");
+
+	SampleTask_LocalStoreMemory* localMemory = (SampleTask_LocalStoreMemory*)lsMemory;
+	SpuSampleTaskDesc& taskDesc = *(SpuSampleTaskDesc*)userPtr;
+
+	// both commands use a fixed step; the task descriptor carries no time step
+	const float timeStep = 1.f/60.f;
+
+	// Use the 64-bit effective address as stored in the descriptor. Round-tripping it through a
+	// local pointer truncates it where pointers are narrower than 64 bits, and widening that
+	// pointer again is implementation-defined (GCC sign-extends).
+	const uint64_t ppuArrayAddress = taskDesc.m_mainMemoryPtr;
+	const int batchSize = taskDesc.m_sampleValue;
+
+	switch (taskDesc.m_sampleCommand)
+	{
+	case CMD_SAMPLE_INTEGRATE_BODIES:
+		{
+			if (!sampleTaskFetchBodyPointers(localMemory, ppuArrayAddress, batchSize))
+			{
+				break;
+			}
+
+			btTransform predictedTrans;
+
+			for ( int i=0;i<batchSize;i++)
+			{
+				// NOTE(review): widening a narrower local pointer here is implementation-defined; confirm the
+				// producer of the pointer array and this conversion agree on the upper address bits.
+				uint64_t ppuRigidBodyAddress = reinterpret_cast<uint64_t>(localMemory->gPointerArray[i]);
+
+				btRigidBody* body = sampleTaskFetchBody(localMemory, ppuRigidBodyAddress);
+
+				if (body->isActive() && (!body->isStaticOrKinematicObject()))
+				{
+					body->predictIntegratedTransform(timeStep, predictedTrans);
+					body->proceedToTransform( predictedTrans);
+
+					// only bodies that were modified are written back
+					sampleTaskStoreBody(body, ppuRigidBodyAddress);
+				}
+			}
+			break;
+		}
+
+	case CMD_SAMPLE_PREDICT_MOTION_BODIES:
+		{
+			if (!sampleTaskFetchBodyPointers(localMemory, ppuArrayAddress, batchSize))
+			{
+				break;
+			}
+
+			for ( int i=0;i<batchSize;i++)
+			{
+				// NOTE(review): same pointer-widening caveat as in CMD_SAMPLE_INTEGRATE_BODIES.
+				uint64_t ppuRigidBodyAddress = reinterpret_cast<uint64_t>(localMemory->gPointerArray[i]);
+
+				btRigidBody* body = sampleTaskFetchBody(localMemory, ppuRigidBodyAddress);
+
+				if ((!body->isStaticOrKinematicObject()) && body->isActive())
+				{
+					body->integrateVelocities( timeStep);
+					//damping
+					body->applyDamping(timeStep);
+
+					body->predictIntegratedTransform(timeStep,body->getInterpolationWorldTransform());
+
+					sampleTaskStoreBody(body, ppuRigidBodyAddress);
+				}
+			}
+			break;
+		}
+
+	default:
+		{
+			// unknown command: nothing to do
+			break;
+		}
+	}
+}
+
+
+#if defined(__CELLOS_LV2__) || defined (LIBSPE2)
+
+// On these builds the scratch memory is one statically allocated, 16-byte aligned instance.
+ATTRIBUTE_ALIGNED16(SampleTask_LocalStoreMemory	gLocalStoreMemory);
+
+/// Returns the address of the static gLocalStoreMemory instance.
+void* createSampleLocalStoreMemory()
+{
+	return &gLocalStoreMemory;
+}
+#else
+/// Returns a newly heap-allocated SampleTask_LocalStoreMemory.
+/// NOTE(review): nothing in this file releases the allocation; presumably the caller owns it - confirm.
+void* createSampleLocalStoreMemory()
+{
+	return new SampleTask_LocalStoreMemory;
+}
+
+#endif
--- a/src/BulletMultiThreaded/SpuSampleTask/SpuSampleTask.h
+++ b/src/BulletMultiThreaded/SpuSampleTask/SpuSampleTask.h
@@ -0,0 +1,54 @@
+/*
+Bullet Continuous Collision Detection and Physics Library, Copyright (c) 2007 Erwin Coumans
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+
+*/
+
+#ifndef SPU_SAMPLE_TASK_H
+#define SPU_SAMPLE_TASK_H
+
+#include "../PlatformDefinitions.h"
+#include "LinearMath/btScalar.h"
+#include "LinearMath/btVector3.h"
+#include "LinearMath/btMatrix3x3.h"
+
+#include "LinearMath/btAlignedAllocator.h"
+
+
+/// Commands understood by processSampleTask (stored in SpuSampleTaskDesc::m_sampleCommand).
+enum
+{
+	// predict the integrated transform of each active, non-static/kinematic body and apply it
+	CMD_SAMPLE_INTEGRATE_BODIES = 1,
+	// integrate velocities, apply damping and predict the interpolation transform of each such body
+	CMD_SAMPLE_PREDICT_MOTION_BODIES
+};
+
+
+
+/// Description of one sample task, handed to processSampleTask through its userPtr argument.
+ATTRIBUTE_ALIGNED16(struct) SpuSampleTaskDesc
+{
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+
+	// one of the CMD_SAMPLE_* values
+	uint32_t						m_sampleCommand;
+	// bookkeeping id used to recognize finished tasks
+	uint32_t						m_taskId;
+
+	// main-memory address; processSampleTask reads it as an array of body pointers
+	uint64_t 	m_mainMemoryPtr;
+	// processSampleTask uses this as the number of entries in that array
+	int			m_sampleValue;
+	
+
+};
+
+
+/// Runs the task described by userPtr (a SpuSampleTaskDesc*) using lsMemory as scratch memory.
+void	processSampleTask(void* userPtr, void* lsMemory);
+/// Returns the scratch memory block that processSampleTask expects as lsMemory.
+void*	createSampleLocalStoreMemory();
+
+
+#endif //SPU_SAMPLE_TASK_H
+
--- a/src/BulletMultiThreaded/SpuSampleTask/readme.txt
+++ b/src/BulletMultiThreaded/SpuSampleTask/readme.txt
@@ -0,0 +1 @@
+Empty placeholder for future Libspe2 SPU task
--- a/src/BulletMultiThreaded/SpuSampleTaskProcess.cpp
+++ b/src/BulletMultiThreaded/SpuSampleTaskProcess.cpp
@@ -0,0 +1,222 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+//#define __CELLOS_LV2__ 1
+
+#define USE_SAMPLE_PROCESS 1
+#ifdef USE_SAMPLE_PROCESS
+
+
+#include "SpuSampleTaskProcess.h"
+#include <stdio.h>
+
+#ifdef __SPU__
+
+
+
+/// Placeholder SPU task body: ignores both arguments and only prints a greeting.
+void	SampleThreadFunc(void* userPtr,void* lsMemory)
+{
+	// neither argument is used by this placeholder
+	(void)userPtr;
+	(void)lsMemory;
+
+	printf("hello world\n");
+}
+
+
+/// Placeholder local-store factory: no memory is created, the result is always a null pointer.
+void*	SamplelsMemoryFunc()
+{
+	void* noLocalStore = 0;
+	return noLocalStore;
+}
+
+
+#else
+
+
+#include "btThreadSupportInterface.h"
+
+//#	include "SPUAssert.h"
+#include <string.h>
+
+
+
+// Symbol with C linkage defined outside this file.
+// NOTE(review): presumably the embedded SPU program image; it is not referenced in the code visible here - confirm.
+extern "C" {
+	extern char SPU_SAMPLE_ELF_SYMBOL[];
+};
+
+
+
+
+
+/// Stores the thread interface and the task limit, sizes the busy-flag and descriptor arrays to
+/// maxNumOutstandingTasks, marks every slot as free and calls startSPU() on the thread interface.
+/// m_initialized stays false until initialize() is called.
+SpuSampleTaskProcess::SpuSampleTaskProcess(btThreadSupportInterface*	threadInterface, unsigned int maxNumOutstandingTasks)
+:m_numBusyTasks(0),
+m_currentTask(0),
+m_initialized(false),
+m_threadInterface(threadInterface),
+m_maxNumOutstandingTasks(maxNumOutstandingTasks)
+{
+	m_taskBusy.resize(m_maxNumOutstandingTasks);
+	m_spuSampleTaskDesc.resize(m_maxNumOutstandingTasks);
+
+	// unsigned index to match the unsigned bound (same form as the slot search in issueTask)
+	for (unsigned int i = 0; i < m_maxNumOutstandingTasks; i++)
+	{
+		m_taskBusy[i] = false;
+	}
+
+	m_threadInterface->startSPU();
+}
+
+/// Calls stopSPU() on the thread interface; the interface object itself is not deleted here.
+SpuSampleTaskProcess::~SpuSampleTaskProcess()
+{
+	m_threadInterface->stopSPU();
+}
+
+
+
+/// Resets the scheduling state: every slot is marked free, the busy count and the current slot
+/// are reset to 0, and m_initialized is set.
+void	SpuSampleTaskProcess::initialize()
+{
+#ifdef DEBUG_SPU_TASK_SCHEDULING
+	printf("SpuSampleTaskProcess::initialize()\n");
+#endif //DEBUG_SPU_TASK_SCHEDULING
+
+	// unsigned index to match the unsigned bound
+	for (unsigned int i = 0; i < m_maxNumOutstandingTasks; i++)
+	{
+		m_taskBusy[i] = false;
+	}
+	m_numBusyTasks = 0;
+	m_currentTask = 0;
+	m_initialized = true;
+
+}
+
+
+/// Fills the descriptor of the current slot with the given pointer/value/command and sends it
+/// through the thread interface. If that leaves every slot busy, blocks in waitForResponse until
+/// one task reports back and frees that slot. Finally selects the first free slot as the next
+/// current task.
+void SpuSampleTaskProcess::issueTask(void* sampleMainMemPtr,int sampleValue,int sampleCommand)
+{
+
+#ifdef DEBUG_SPU_TASK_SCHEDULING
+	// m_currentTask is unsigned, hence %u
+	printf("SpuSampleTaskProcess::issueTask (m_currentTask= %u)\n", m_currentTask);
+#endif //DEBUG_SPU_TASK_SCHEDULING
+
+	m_taskBusy[m_currentTask] = true;
+	m_numBusyTasks++;
+
+	SpuSampleTaskDesc& taskDesc = m_spuSampleTaskDesc[m_currentTask];
+	{
+		// send task description in event message
+		// no error checking here...
+		// but, currently, event queue can be no larger than NUM_WORKUNIT_TASKS.
+	
+		taskDesc.m_mainMemoryPtr = reinterpret_cast<uint64_t>(sampleMainMemPtr);
+		taskDesc.m_sampleValue = sampleValue;
+		taskDesc.m_sampleCommand = sampleCommand;
+
+		//some bookkeeping to recognize finished tasks
+		taskDesc.m_taskId = m_currentTask;
+	}
+
+
+	// NOTE(review): the descriptor address is narrowed to 32 bits, which loses bits where pointers
+	// are wider; confirm the argument type of sendRequest.
+	// NOTE(review): the command sent is the literal 1, not CMD_SAMPLE_TASK_COMMAND; confirm which
+	// value the thread back end expects.
+	m_threadInterface->sendRequest(1, (uint32_t) &taskDesc, m_currentTask);
+
+	// if all tasks busy, wait for spu event to clear the task.
+	
+	if (m_numBusyTasks >= m_maxNumOutstandingTasks)
+	{
+		// initialized so the values are defined even if waitForResponse leaves them untouched
+		unsigned int taskId = 0;
+		unsigned int outputSize = 0;
+
+		// preset taskId to the first busy slot before waiting
+		for (unsigned int i = 0; i < m_maxNumOutstandingTasks; i++)
+		{
+			if (m_taskBusy[i])
+			{
+				taskId = i;
+				break;
+			}
+		}
+		m_threadInterface->waitForResponse(&taskId, &outputSize);
+
+		//printf("PPU: after issue, received event: %u %d\n", taskId, outputSize);
+
+		postProcess(taskId, outputSize);
+
+		m_taskBusy[taskId] = false;
+
+		m_numBusyTasks--;
+	}
+
+	// find new task buffer
+	for (unsigned int i = 0; i < m_maxNumOutstandingTasks; i++)
+	{
+		if (!m_taskBusy[i])
+		{
+			m_currentTask = i;
+			break;
+		}
+	}
+}
+
+
+///Optional PPU-side post processing hook, called once for each finished task.
+///The sample implementation is intentionally empty: both arguments are ignored.
+void SpuSampleTaskProcess::postProcess(int taskId, int outputSize)
+{
+}
+
+
+/// Waits until no task is busy any more: for every outstanding task it blocks in
+/// waitForResponse, runs postProcess for the reported task id and frees that slot.
+void SpuSampleTaskProcess::flush()
+{
+#ifdef DEBUG_SPU_TASK_SCHEDULING
+	printf("\nSpuSampleTaskProcess::flush()\n");
+#endif //DEBUG_SPU_TASK_SCHEDULING
+
+	// all tasks are issued, wait for all tasks to be complete
+	while(m_numBusyTasks > 0)
+	{
+		// initialized so the values are defined even if waitForResponse leaves them untouched
+		unsigned int taskId = 0;
+		unsigned int outputSize = 0;
+
+		// preset taskId to the first busy slot before waiting
+		for (unsigned int i = 0; i < m_maxNumOutstandingTasks; i++)
+		{
+			if (m_taskBusy[i])
+			{
+				taskId = i;
+				break;
+			}
+		}
+
+		m_threadInterface->waitForResponse(&taskId, &outputSize);
+
+		//printf("PPU: flushing, received event: %u %d\n", taskId, outputSize);
+
+		postProcess(taskId, outputSize);
+
+		m_taskBusy[taskId] = false;
+
+		m_numBusyTasks--;
+	}
+
+
+}
+
+#endif
+
+
+#endif //USE_SAMPLE_PROCESS
--- a/src/BulletMultiThreaded/SpuSampleTaskProcess.h
+++ b/src/BulletMultiThreaded/SpuSampleTaskProcess.h
@@ -0,0 +1,153 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef SPU_SAMPLE_TASK_PROCESS_H
+#define SPU_SAMPLE_TASK_PROCESS_H
+
+#include <assert.h>
+
+
+#include "PlatformDefinitions.h"
+
+#include <stdlib.h>
+
+#include "LinearMath/btAlignedObjectArray.h"
+
+
+#include "SpuSampleTask/SpuSampleTask.h"
+
+
+//just add your commands here, try to keep them globally unique for debugging purposes
+#define CMD_SAMPLE_TASK_COMMAND 10
+
+
+
+/// SpuSampleTaskProcess issues sample tasks (see SpuSampleTask.h) to a btThreadSupportInterface.
+/// When every task slot is busy, issueTask waits for one task to complete before returning.
+/// postProcess is called for each completed task (the sample implementation does nothing).
+class SpuSampleTaskProcess
+{
+	// track task buffers that are being used, and total busy tasks
+	btAlignedObjectArray<bool>	m_taskBusy;
+	btAlignedObjectArray<SpuSampleTaskDesc>m_spuSampleTaskDesc;
+	
+	unsigned int   m_numBusyTasks;
+
+	// index of the slot the next issueTask call will use
+	unsigned int   m_currentTask;
+
+	// set to false by the constructor and to true by initialize()
+	bool m_initialized;
+
+	// per-task hook invoked after a task has reported completion
+	void postProcess(int taskId, int outputSize);
+	
+	// not owned: the destructor only calls stopSPU() on it
+	class	btThreadSupportInterface*	m_threadInterface;
+
+	// number of task slots
+	unsigned int	m_maxNumOutstandingTasks;
+
+
+
+public:
+	/// sizes the task slots and calls startSPU() on threadInterface
+	SpuSampleTaskProcess(btThreadSupportInterface*	threadInterface, unsigned int maxNumOutstandingTasks);
+	
+	~SpuSampleTaskProcess();
+	
+	///call initialize in the beginning of the frame, before issueTask
+	void initialize();
+
+	///send one task; sampleMainMemPtr, sampleValue and sampleCommand are copied into the task descriptor
+	void issueTask(void* sampleMainMemPtr,int sampleValue,int sampleCommand);
+
+	///call flush to wait for all outstanding tasks to be finished
+	void flush();
+};
+
+
+#if defined(USE_LIBSPE2) && defined(__SPU__)
+////////////////////MAIN/////////////////////////////
+#include "../SpuLibspe2Support.h"
+#include <spu_intrinsics.h>
+#include <spu_mfcio.h>
+#include <SpuFakeDma.h>
+
+void * SamplelsMemoryFunc();
+void SampleThreadFunc(void* userPtr,void* lsMemory);
+
+//#define DEBUG_LIBSPE2_MAINLOOP
+
+/// SPU-side entry point, compiled only when both USE_LIBSPE2 and __SPU__ are defined.
+/// argp holds the main-memory address of a btSpuStatus block. The function publishes
+/// Spu_Status_Free plus the local-store pointer into that block, then loops on the inbound
+/// mailbox until a shutdown event arrives.
+int main(unsigned long long speid, addr64 argp, addr64 envp)
+{
+	printf("SPU is up \n");
+	
+	ATTRIBUTE_ALIGNED128(btSpuStatus status);
+	ATTRIBUTE_ALIGNED16( SpuSampleTaskDesc taskDesc ) ;
+	unsigned int received_message = Spu_Mailbox_Event_Nothing;
+        bool shutdown = false;
+
+	// fetch the status block from main memory
+	cellDmaGet(&status, argp.ull, sizeof(btSpuStatus), DMA_TAG(3), 0, 0);
+	cellDmaWaitTagStatusAll(DMA_MASK(3));
+
+	status.m_status = Spu_Status_Free;
+	status.m_lsMemory.p = SamplelsMemoryFunc();
+
+	// write the updated status block back
+	cellDmaLargePut(&status, argp.ull, sizeof(btSpuStatus), DMA_TAG(3), 0, 0);
+	cellDmaWaitTagStatusAll(DMA_MASK(3));
+	
+	
+	while (!shutdown)
+	{
+		received_message = spu_read_in_mbox();
+		
+
+		
+		switch(received_message)
+		{
+		case Spu_Mailbox_Event_Shutdown:
+			shutdown = true;
+			break; 
+		case Spu_Mailbox_Event_Task:
+			// refresh the status
+#ifdef DEBUG_LIBSPE2_MAINLOOP
+			printf("SPU recieved Task \n");
+#endif //DEBUG_LIBSPE2_MAINLOOP
+			cellDmaGet(&status, argp.ull, sizeof(btSpuStatus), DMA_TAG(3), 0, 0);
+			cellDmaWaitTagStatusAll(DMA_MASK(3));
+		
+			btAssert(status.m_status==Spu_Status_Occupied);
+			
+			// fetch the task descriptor that the status block points to
+			cellDmaGet(&taskDesc, status.m_taskDesc.p, sizeof(SpuSampleTaskDesc), DMA_TAG(3), 0, 0);
+			cellDmaWaitTagStatusAll(DMA_MASK(3));
+			
+			// NOTE(review): the second argument is taskDesc.m_mainMemoryPtr although the parameter is
+			// named lsMemory; confirm whether status.m_lsMemory.p was intended.
+			SampleThreadFunc((void*)&taskDesc, reinterpret_cast<void*> (taskDesc.m_mainMemoryPtr) );
+			break;
+		case Spu_Mailbox_Event_Nothing:
+		default:
+			break;
+		}
+
+		// set to status free and wait for next task
+		// (this write-back also happens after a shutdown event, before the loop exits)
+		status.m_status = Spu_Status_Free;
+		cellDmaLargePut(&status, argp.ull, sizeof(btSpuStatus), DMA_TAG(3), 0, 0);
+		cellDmaWaitTagStatusAll(DMA_MASK(3));		
+				
+		
+  	}
+  	return 0;
+}
+//////////////////////////////////////////////////////
+#endif
+
+
+
+#endif // SPU_SAMPLE_TASK_PROCESS_H
+
--- a/src/BulletMultiThreaded/SpuSolverTask/SpuParallellSolverTask.cpp
+++ b/src/BulletMultiThreaded/SpuSolverTask/SpuParallellSolverTask.cpp
--- a/src/BulletMultiThreaded/SpuSolverTask/SpuParallellSolverTask.h
+++ b/src/BulletMultiThreaded/SpuSolverTask/SpuParallellSolverTask.h
@@ -0,0 +1,279 @@
+/*
+Bullet Continuous Collision Detection and Physics Library - Parallel solver
+Copyright (c) 2007 Starbreeze Studios
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+
+Written by: Marten Svanfeldt
+*/
+
+#ifndef SPU_PARALLELSOLVERTASK_H
+#define SPU_PARALLELSOLVERTASK_H
+
+#include "../PlatformDefinitions.h"
+#include "LinearMath/btScalar.h"
+#include "LinearMath/btVector3.h"
+#include "LinearMath/btMatrix3x3.h"
+#include "BulletDynamics/ConstraintSolver/btContactSolverInfo.h"
+#include "../SpuSync.h"
+#include "BulletDynamics/ConstraintSolver/btTypedConstraint.h"
+#include "LinearMath/btAlignedAllocator.h"
+
+
+/// Pairs a persistent manifold pointer with a hash cell index.
+ATTRIBUTE_ALIGNED16(struct) ManifoldCellHolder
+{
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+
+	// hash cell index (presumably a value produced by spuGetHashCellIndex - confirm against the solver code)
+	uint32_t					m_hashCellIndex;		
+	class btPersistentManifold*	m_manifold;
+};
+
+/// Pairs a typed constraint pointer and its type with a hash cell index.
+ATTRIBUTE_ALIGNED16(struct) ConstraintCellHolder
+{
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+
+	// hash cell index (presumably a value produced by spuGetHashCellIndex - confirm against the solver code)
+	uint32_t					m_hashCellIndex;		
+	// constraint type stored as a plain integer (presumably a btTypedConstraintType value - confirm)
+	uint32_t					m_constraintType;
+	class btTypedConstraint*	m_constraint;
+};
+
+/// Compile-time sizes of the solver hash.
+enum
+{
+	// number of hash cells; spuGetHashCellIndex masks with SPU_HASH_NUMCELLS-1, so this must stay a power of two
+	SPU_HASH_NUMCELLS = 128,
+	// bits per mask word
+	SPU_HASH_WORDWIDTH = sizeof(uint32_t)*8,
+	// number of mask words needed to hold one bit per cell (rounded up)
+	SPU_HASH_NUMCELLDWORDS = ((SPU_HASH_NUMCELLS + SPU_HASH_WORDWIDTH - 1) / SPU_HASH_WORDWIDTH),
+	// bits in the last mask word that do not correspond to a cell
+	SPU_HASH_NUMUNUSEDBITS = (SPU_HASH_NUMCELLDWORDS * SPU_HASH_WORDWIDTH) - SPU_HASH_NUMCELLS, 
+	SPU_HASH_PHYSSIZE = 4, //TODO: MAKE CONFIGURABLE
+
+	SPU_MAX_BODIES_PER_CELL = 1024,
+
+	SPU_MAX_SPUS = 6
+};
+
+/// Command identifiers for the parallel solver task (stored in SpuSolverTaskDesc::m_solverCommand).
+enum
+{
+	CMD_SOLVER_SETUP_BODIES = 1,
+	CMD_SOLVER_MANIFOLD_SETUP = 2,
+	CMD_SOLVER_CONSTRAINT_SETUP = 3,
+	CMD_SOLVER_SOLVE_ITERATE = 4,
+	CMD_SOLVER_COPYBACK_BODIES = 5
+};
+
+/// One cell of the solver hash: four (count, list offset) pairs stored as 16-bit values.
+/// NOTE(review): the offsets presumably index the lists referenced by SpuSolverDataDesc - confirm against the solver code.
+struct SpuSolverHashCell
+{
+	uint16_t						m_numLocalBodies;
+	uint16_t						m_solverBodyOffsetListOffset;
+
+	uint16_t						m_numManifolds;
+	uint16_t						m_manifoldListOffset;
+
+	uint16_t						m_numContacts;
+	uint16_t						m_internalConstraintListOffset;
+
+	uint16_t						m_numConstraints;
+	uint16_t						m_constraintListOffset;
+};
+
+// Shared data structures
+/// Solver hash: per-cell bit masks plus the cells themselves, each array 16-byte aligned.
+struct SpuSolverHash
+{
+	// Dependency matrix: one row of SPU_HASH_NUMCELLDWORDS mask words per cell
+	ATTRIBUTE_ALIGNED16(uint32_t m_dependencyMatrix[SPU_HASH_NUMCELLS][SPU_HASH_NUMCELLDWORDS]);
+	// one mask row per SPU plus one extra row (SPU_MAX_SPUS+1 rows)
+	ATTRIBUTE_ALIGNED16(uint32_t m_currentMask[SPU_MAX_SPUS+1][SPU_HASH_NUMCELLDWORDS]);
+
+	// The hash itself
+	ATTRIBUTE_ALIGNED16(SpuSolverHashCell m_Hash[SPU_HASH_NUMCELLS]);
+
+	// Hash meta-data	
+};
+
+/// Multiplicative hash: multiplies k by the constant 2654435769 using unsigned (wrap-around) arithmetic.
+inline unsigned int spuHash(unsigned int k)  { return k*2654435769u; }
+/// Maps integer cell coordinates to a hash cell index in [0, SPU_HASH_NUMCELLS).
+/// The coordinates are combined as x ^ spuHash(y ^ spuHash(z)) and the result is masked with
+/// SPU_HASH_NUMCELLS-1, which relies on SPU_HASH_NUMCELLS being a power of two.
+inline unsigned int spuGetHashCellIndex(int x, int y, int z)
+{
+	//int n = 0x8da6b343 * x + 0xd8163841 * y + 0xcb1ab31f * z;
+
+	// stay in unsigned arithmetic: storing the unsigned hash in a signed int is an
+	// implementation-defined conversion that was immediately cast back anyway
+	const unsigned int n = (unsigned int)x ^ spuHash((unsigned int)y ^ spuHash((unsigned int)z));
+
+	return n & (SPU_HASH_NUMCELLS-1);
+}
+
+
+/// Per-body data used by the parallel solver: velocities, world-space inverse inertia tensor,
+/// angular factor and inverted mass.
+ATTRIBUTE_ALIGNED16(struct) SpuSolverBody
+{
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+
+	btVector3			m_linearVelocity;
+	btVector3			m_angularVelocity;
+
+	btMatrix3x3			m_worldInvInertiaTensor;
+	btScalar            m_angularFactor;
+	btScalar				m_invertedMass;
+};
+
+/// Solver row data for an internal constraint between two bodies.
+/// NOTE(review): presumably one contact constraint per manifold point (cf. SpuSolverHashCell::m_numContacts) - confirm.
+ATTRIBUTE_ALIGNED16(struct) SpuSolverInternalConstraint
+{
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+
+	// offsets identifying the two bodies (presumably into the local body list - confirm)
+	uint32_t			m_localOffsetBodyA;
+	uint32_t			m_localOffsetBodyB;
+
+	btScalar				m_appliedImpulse;
+
+	btScalar				m_friction;
+	btScalar				m_restitution;
+	btScalar				m_jacDiagABInv;
+	btScalar				m_penetration;
+
+	btVector3			m_normal;
+
+	btVector3			m_relpos1CrossNormal;
+	btVector3			m_relpos2CrossNormal;
+	btVector3			m_angularComponentA;
+	btVector3			m_angularComponentB;
+};
+
+
+/// Solver data for one typed (joint) constraint: body offsets, type, per-axis flags, the linear
+/// part shared by all constraint types, and a union holding the hinge- or conetwist-specific part.
+ATTRIBUTE_ALIGNED16(struct) SpuSolverConstraint
+{
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+
+	uint16_t			m_localOffsetBodyA;
+	uint16_t			m_localOffsetBodyB;
+
+	uint16_t			m_constraintType;
+	// one-bit flags: linear part enabled, plus six limit bits and six motor bits
+	struct 
+	{
+		uint16_t		m_useLinear : 1;
+		
+		uint16_t		m_limit1	: 1;
+		uint16_t		m_limit2	: 1;
+		uint16_t		m_limit3	: 1;
+		uint16_t		m_limit4	: 1;
+		uint16_t		m_limit5	: 1;
+		uint16_t		m_limit6	: 1;
+
+		uint16_t		m_motor1	: 1;
+		uint16_t		m_motor2	: 1;
+		uint16_t		m_motor3	: 1;
+		uint16_t		m_motor4	: 1;
+		uint16_t		m_motor5	: 1;
+		uint16_t		m_motor6	: 1;
+	}					m_flags;
+
+	// Linear parts, used by all constraints
+	btQuadWordStorage	m_relPos1;
+	btQuadWordStorage	m_relPos2;
+	btQuadWordStorage	m_jacdiagABInv;		//Jacobian inverse multiplied by gamma (damping) for each axis
+	btQuadWordStorage	m_linearBias;		//depth*tau/(dt*gamma) along each axis
+
+	// Joint-specific parts (only one member of the union is meaningful for a given constraint)
+	union
+	{
+		struct 
+		{
+			btQuadWordStorage	m_frameAinW[3];
+			btQuadWordStorage	m_frameBinW[3];
+
+			// For angular
+			btQuadWordStorage	m_angJacdiagABInv;		//1/j 
+			btQuadWordStorage	m_angularBias;			//error/dt, in x/y.		limit error*bias factor / (dt * relaxation factor) in z
+			
+			// For limit
+			float				m_limitAccumulatedImpulse;
+			float				m_limitJacFactor;		//limitSign*relaxation factor
+
+			// For motor
+			float				m_motorVelocity;
+			float				m_motorImpulse;
+		} hinge;
+		
+		struct  
+		{
+			btQuadWordStorage	m_swingAxis;
+			btQuadWordStorage	m_twistAxis;
+
+			float				m_swingError;
+			float				m_swingJacInv;
+			float				m_swingLimitImpulse;
+
+			float				m_twistError;
+			float				m_twistJacInv;
+			float				m_twistLimitImpulse;
+		} conetwist;
+	};
+};
+
+
+/// Bundle of pointers to the solver hash and to the body, internal-constraint, constraint and
+/// body-offset lists. The struct only stores the pointers; it declares no ownership.
+ATTRIBUTE_ALIGNED16(struct) SpuSolverDataDesc
+{
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+
+	SpuSolverHash*					m_solverHash;
+	SpuSolverBody*					m_solverBodyList;
+	SpuSolverInternalConstraint*	m_solverInternalConstraintList;
+	SpuSolverConstraint*			m_solverConstraintList;
+	uint32_t*						m_solverBodyOffsetList;
+};
+
+
+/// Description of one solver task: the command, a task id, the shared solver data pointers and
+/// a union with the command-specific arguments.
+ATTRIBUTE_ALIGNED16(struct) SpuSolverTaskDesc
+{
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+
+	// one of the CMD_SOLVER_* values
+	uint32_t						m_solverCommand;
+	uint32_t						m_taskId;
+	SpuSolverDataDesc				m_solverData;
+
+	// command specific data
+	union
+	{
+		// Body setup (the same layout is also exposed as m_bodyCopyback)
+		struct 
+		{
+			uint32_t				m_startBody;
+			uint32_t				m_numBodies;
+
+			class btRigidBody**		m_rbList;
+		} m_bodySetup, m_bodyCopyback;
+
+		// Manifold setup (presumably also used for CMD_SOLVER_CONSTRAINT_SETUP, given m_constraintHolders - confirm)
+		struct 
+		{
+			uint32_t				m_startCell;
+			uint32_t				m_numCells;
+
+			uint32_t				m_numBodies;
+			uint32_t				m_numManifolds;
+
+			ManifoldCellHolder*		m_manifoldHolders;
+			ConstraintCellHolder*	m_constraintHolders;
+			btContactSolverInfoData	m_solverInfo;
+		} m_manifoldSetup;
+
+		// Solve iteration: pointer to the spin lock variable
+		struct  
+		{
+			btSpinlock::SpinVariable*	m_spinLockVar;
+		} m_iterate;
+	}								m_commandData;
+};
+
+/// Solver task entry point; defined outside this header.
+/// NOTE(review): userPtr is presumably a SpuSolverTaskDesc* and lsMemory the block returned by createSolverLocalStoreMemory - confirm.
+void	processSolverTask(void* userPtr, void* lsMemory);
+/// Returns the memory block to pass as lsMemory; defined outside this header.
+void*	createSolverLocalStoreMemory();
+
+// Helper
+/// Returns true for the four constraint types accepted here (point-to-point, hinge, cone-twist, D6)
+/// and false for every other type.
+inline bool constraintTypeSupported(btTypedConstraintType type)
+{
+	switch (type)
+	{
+	case POINT2POINT_CONSTRAINT_TYPE:
+	case HINGE_CONSTRAINT_TYPE:
+	case CONETWIST_CONSTRAINT_TYPE:
+	case D6_CONSTRAINT_TYPE:
+		return true;
+	default:
+		return false;
+	}
+}
+
+#endif
--- a/src/BulletMultiThreaded/SpuSync.h
+++ b/src/BulletMultiThreaded/SpuSync.h
@@ -0,0 +1,146 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2007 Starbreeze Studios
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+
+Written by: Marten Svanfeldt
+*/
+
+#ifndef SPU_SYNC_H
+#define	SPU_SYNC_H
+
+
+#include "PlatformDefinitions.h"
+
+
+#if defined(WIN32)
+
+#define WIN32_LEAN_AND_MEAN
+#ifdef _XBOX
+#include <Xtl.h>
+#else
+#include <Windows.h>
+#endif
+
+/// Win32 variant of btSpinlock: a thin wrapper around a caller-provided CRITICAL_SECTION.
+/// The object stores only the pointer; Init must be called before Lock/Unlock.
+/// NOTE(review): no DeleteCriticalSection call is visible in this class - confirm the owner of the variable releases it.
+class btSpinlock
+{
+public:
+	//typedef volatile LONG SpinVariable;
+	typedef CRITICAL_SECTION SpinVariable;
+
+	btSpinlock (SpinVariable* var)
+		: spinVariable (var)
+	{}
+
+	/// initializes the critical section the lock points to
+	void Init ()
+	{
+		//*spinVariable = 0;
+		InitializeCriticalSection(spinVariable);
+	}
+
+	/// enters the critical section
+	void Lock ()
+	{
+		EnterCriticalSection(spinVariable);
+	}
+
+	/// leaves the critical section
+	void Unlock ()
+	{
+		LeaveCriticalSection(spinVariable);
+	}
+
+private:
+	SpinVariable* spinVariable;
+};
+
+
+#elif defined (__CELLOS_LV2__)
+
+//#include <cell/atomic.h>
+#include <cell/sync/mutex.h>
+
+/// __CELLOS_LV2__ variant of btSpinlock, built on CellSyncMutex.
+/// Init only does work in non-SPU builds; Lock and Unlock only do work in SPU builds
+/// (they compile to empty functions otherwise).
+class btSpinlock
+{
+public:
+	typedef CellSyncMutex SpinVariable;
+
+	btSpinlock (SpinVariable* var)
+		: spinVariable (var)
+	{}
+
+	/// initializes the mutex (non-SPU builds only)
+	void Init ()
+	{
+#ifndef __SPU__
+		//*spinVariable = 1;
+		cellSyncMutexInitialize(spinVariable);
+#endif
+	}
+
+
+
+	/// locks the mutex, passing its address as a 64-bit value (SPU builds only)
+	void Lock ()
+	{
+#ifdef __SPU__
+		// lock semaphore
+		/*while (cellAtomicTestAndDecr32(atomic_buf, (uint64_t)spinVariable) == 0) 
+		{
+
+		};*/
+		cellSyncMutexLock((uint64_t)spinVariable);
+#endif
+	}
+
+	/// unlocks the mutex (SPU builds only)
+	void Unlock ()
+	{
+#ifdef __SPU__
+		//cellAtomicIncr32(atomic_buf, (uint64_t)spinVariable);
+		cellSyncMutexUnlock((uint64_t)spinVariable);
+#endif 
+	}
+
+
+private:
+	SpinVariable*	spinVariable;
+	// only referenced by the commented-out cellAtomic* code above
+	ATTRIBUTE_ALIGNED128(uint32_t		atomic_buf[32]);
+};
+
+#else
+//create a dummy implementation (without any locking) useful for serial processing
+/// Fallback btSpinlock: Init, Lock and Unlock are empty, so it provides NO mutual exclusion.
+class btSpinlock
+{
+public:
+	typedef int  SpinVariable;
+
+	btSpinlock (SpinVariable* var)
+		: spinVariable (var)
+	{}
+
+	/// no-op
+	void Init ()
+	{
+	}
+
+	/// no-op
+	void Lock ()
+	{
+	}
+
+	/// no-op
+	void Unlock ()
+	{
+	}
+
+private:
+	// stored but never read by this variant
+	SpinVariable* spinVariable;
+};
+
+
+#endif
+
+
+#endif
--- a/src/BulletMultiThreaded/Win32ThreadSupport.cpp
+++ b/src/BulletMultiThreaded/Win32ThreadSupport.cpp
@@ -0,0 +1,259 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "Win32ThreadSupport.h"
+
+#ifdef USE_WIN32_THREADING
+
+#include <windows.h>
+
+#include "SpuCollisionTaskProcess.h"
+
+#include "SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.h"
+
+
+
+///The number of threads should be equal to the number of available cores
+///Todo: each worker should be linked to a single core, using SetThreadIdealProcessor.
+
+///Win32ThreadSupport runs user tasks on Win32 worker threads.
+///Constructing it immediately starts the worker threads described by threadConstructionInfo.
+Win32ThreadSupport::Win32ThreadSupport(const Win32ThreadConstructionInfo & threadConstructionInfo)
+{
+	this->startThreads(threadConstructionInfo);
+}
+
+///Stops the worker threads and releases their handles by calling stopSPU.
+Win32ThreadSupport::~Win32ThreadSupport()
+{
+	//inside a destructor the call binds to this class' own stopSPU, so spell that out
+	Win32ThreadSupport::stopSPU();
+}
+
+
+
+
+#include <stdio.h>
+
+///Entry point of every worker thread.
+///lpParam is the Win32ThreadSupport::btSpuStatus slot that belongs to this thread.
+///The thread sleeps on the slot's start event. Each time it is woken it either
+/// - runs m_userThreadFunc(m_userPtr, m_lsMemory), sets m_status to 2 and signals the complete event, or
+/// - when m_userPtr is null, sets m_status to 3, signals the complete event and leaves the loop.
+///Always returns 0.
+DWORD WINAPI Thread_no_1( LPVOID lpParam ) 
+{
+
+	Win32ThreadSupport::btSpuStatus* status = (Win32ThreadSupport::btSpuStatus*)lpParam;
+
+	while (1)
+	{
+		WaitForSingleObject(status->m_eventStartHandle,INFINITE);
+
+		void* userPtr = status->m_userPtr;
+
+		if (!userPtr)
+		{
+			//exit Thread
+			//Report BEFORE signalling completion: as soon as the complete event is set, the owner
+			//(see stopSPU) is free to close the handles and release the memory 'status' points to,
+			//so 'status' must not be touched after SetEvent.
+			//%p is used because the thread handle is a pointer, not an int.
+			printf("Thread with taskId %u with handle %p exiting\n",(unsigned int)status->m_taskId, status->m_threadHandle);
+			status->m_status = 3;
+			SetEvent(status->m_eventCompletetHandle);
+			break;
+		}
+
+		//a task must have been marked as issued (non-zero status) before the thread is woken for it
+		btAssert(status->m_status);
+		status->m_userThreadFunc(userPtr,status->m_lsMemory);
+		status->m_status = 2;
+		SetEvent(status->m_eventCompletetHandle);
+	}
+
+	printf("Thread TERMINATED\n");
+	return 0;
+
+}
+
+///Hand a task to one worker thread and wake it up.
+///uiCommand:   only CMD_GATHER_AND_PROCESS_PAIRLIST is handled, any other value triggers btAssert(0)
+///uiArgument0: cast to void* and passed to the user thread function as its user pointer.
+///             It must not be 0: the worker thread treats a null user pointer as the request to exit.
+///taskId:      index of the worker slot in m_activeSpuStatus that receives the task
+///The call does not wait for the task; completion is reported through waitForResponse.
+void Win32ThreadSupport::sendRequest(uint32_t uiCommand, uint32_t uiArgument0, uint32_t taskId)
+{
+	switch (uiCommand)
+	{
+	case 	CMD_GATHER_AND_PROCESS_PAIRLIST:
+		{
+
+//#define SINGLE_THREADED 1
+#ifdef SINGLE_THREADED
+
+			//debugging path: run the task inline on the calling thread, always using slot 0
+			btSpuStatus&	spuStatus = m_activeSpuStatus[0];
+			spuStatus.m_userPtr=(void*)uiArgument0;
+			spuStatus.m_userThreadFunc(spuStatus.m_userPtr,spuStatus.m_lsMemory);
+#else
+
+			//validate the slot index before it is used to index the array
+			//(taskId is unsigned, so only the upper bound needs checking)
+			btAssert(taskId<(uint32_t)m_activeSpuStatus.size());
+			btSpuStatus&	spuStatus = m_activeSpuStatus[taskId];
+
+			spuStatus.m_commandId = uiCommand;
+			spuStatus.m_status = 1;
+			//NOTE(review): a 32-bit argument cannot carry a full pointer on 64-bit targets - confirm this path is 32-bit only
+			spuStatus.m_userPtr = (void*)uiArgument0;
+
+			///fire event to start new task
+			SetEvent(spuStatus.m_eventStartHandle);
+
+#endif //SINGLE_THREADED
+
+			break;
+		}
+	default:
+		{
+			///not implemented
+			btAssert(0);
+			break;
+		}
+
+	};
+
+
+}
+
+
+///Block until one of the worker threads signals its completion event, then report which one.
+///puiArgument0: receives the m_taskId of the slot whose event was signalled
+///puiArgument1: receives that slot's m_status
+///NOTE(review): m_status is reset to 0 before it is copied out, so in the threaded build
+///*puiArgument1 is always 0 - confirm that callers do not expect the completion code here.
+void Win32ThreadSupport::waitForResponse(unsigned int *puiArgument0, unsigned int *puiArgument1)
+{
+	btAssert(m_activeSpuStatus.size());
+
+	int last = -1;
+#ifndef SINGLE_THREADED
+	//WaitForMultipleObjects cannot wait on more than MAXIMUM_WAIT_OBJECTS handles at once
+	btAssert(m_completeHandles.size() <= MAXIMUM_WAIT_OBJECTS);
+
+	DWORD res = WaitForMultipleObjects(m_completeHandles.size(), &m_completeHandles[0], FALSE, INFINITE);
+	btAssert(res != WAIT_FAILED);
+	last = res - WAIT_OBJECT_0;
+
+	//make sure the wait result maps to a valid slot before it is used as an index
+	btAssert(last>=0);
+	btAssert(last<m_activeSpuStatus.size());
+
+	btSpuStatus& spuStatus = m_activeSpuStatus[last];
+	btAssert(spuStatus.m_threadHandle);
+	btAssert(spuStatus.m_eventCompletetHandle);
+
+	//the worker sets the status to 2 (task done) or 3 (exiting) before signalling the event
+	btAssert(spuStatus.m_status > 1);
+	//mark the slot as idle again
+	spuStatus.m_status = 0;
+
+#else
+	last=0;
+	btSpuStatus& spuStatus = m_activeSpuStatus[last];
+#endif //SINGLE_THREADED
+
+	*puiArgument0 = spuStatus.m_taskId;
+	*puiArgument1 = spuStatus.m_status;
+
+
+}
+
+
+
+///Formats "<prefix><uniqueName><index>" into dest, which holds destSize bytes.
+///Returns dest, or 0 (with dest set to an empty string) when the name does not fit,
+///so the caller can fall back to an unnamed event instead of overflowing or truncating.
+static const char* btBuildEventName(char* dest, int destSize, const char* prefix, const char* uniqueName, int index)
+{
+	//_snprintf reports a negative value, or the full count without terminator, when the output does not fit
+	int written = _snprintf(dest,destSize,"%s%s%d",prefix,uniqueName,index);
+	if (written < 0 || written >= destSize)
+	{
+		dest[0] = 0;
+		return 0;
+	}
+	return dest;
+}
+
+///Create threadConstructionInfo.m_numThreads worker threads.
+///Each worker gets its own status slot, a start event and a complete event (both auto-reset,
+///initially unsignalled) and runs Thread_no_1, which blocks on the start event until
+///sendRequest or stopSPU wakes it up.
+///NOTE(review): the events are created with a name derived from m_uniqueName; named Win32 events are
+///shared between processes using the same name - confirm that this is intended.
+void Win32ThreadSupport::startThreads(const Win32ThreadConstructionInfo& threadConstructionInfo)
+{
+
+	//size both arrays once up front: every worker keeps a pointer to its own slot
+	m_activeSpuStatus.resize(threadConstructionInfo.m_numThreads);
+	m_completeHandles.resize(threadConstructionInfo.m_numThreads);
+
+	for (int i=0;i<threadConstructionInfo.m_numThreads;i++)
+	{
+		printf("starting thread %d\n",i);
+
+		btSpuStatus&	spuStatus = m_activeSpuStatus[i];
+
+		//fill in the whole slot before the thread that reads it exists
+		spuStatus.m_taskId = i;
+		spuStatus.m_commandId = 0;
+		spuStatus.m_status = 0;
+		spuStatus.m_userPtr = 0;
+		spuStatus.m_threadHandle = 0;
+		spuStatus.m_lsMemory = threadConstructionInfo.m_lsMemoryFunc();
+		spuStatus.m_userThreadFunc = threadConstructionInfo.m_userThreadFunc;
+
+		//the name buffers are fixed size: a unique name that does not fit yields an unnamed event
+		//instead of a buffer overflow (the events are only ever used through their handles here)
+		const char* startName = btBuildEventName(spuStatus.m_eventStartHandleName,(int)sizeof(spuStatus.m_eventStartHandleName),"eventStart",threadConstructionInfo.m_uniqueName,i);
+		spuStatus.m_eventStartHandle = CreateEvent(0,false,false,startName);
+		btAssert(spuStatus.m_eventStartHandle);
+
+		const char* completeName = btBuildEventName(spuStatus.m_eventCompletetHandleName,(int)sizeof(spuStatus.m_eventCompletetHandleName),"eventComplete",threadConstructionInfo.m_uniqueName,i);
+		spuStatus.m_eventCompletetHandle = CreateEvent(0,false,false,completeName);
+		btAssert(spuStatus.m_eventCompletetHandle);
+
+		m_completeHandles[i] = spuStatus.m_eventCompletetHandle;
+
+		LPSECURITY_ATTRIBUTES lpThreadAttributes=NULL;
+		SIZE_T dwStackSize=threadConstructionInfo.m_threadStackSize;
+		LPTHREAD_START_ROUTINE lpStartAddress=&Thread_no_1;
+		LPVOID lpParameter=&spuStatus;
+		DWORD dwCreationFlags=0;
+		LPDWORD lpThreadId=0;
+
+		HANDLE handle = CreateThread(lpThreadAttributes,dwStackSize,lpStartAddress,lpParameter,	dwCreationFlags,lpThreadId);
+		btAssert(handle);
+		spuStatus.m_threadHandle = handle;
+
+		SetThreadPriority(handle,THREAD_PRIORITY_HIGHEST);
+		//SetThreadPriority(handle,THREAD_PRIORITY_TIME_CRITICAL);
+
+		//pin worker i to processor i; the mask is a DWORD_PTR, so shift in that type and
+		//skip indices that have no bit in the mask
+		if (i < (int)(sizeof(DWORD_PTR)*8))
+		{
+			SetThreadAffinityMask(handle, ((DWORD_PTR)1)<<i);
+		}
+
+		//%p is used because the thread handle is a pointer, not an int
+		printf("started thread %d with threadHandle %p\n",i,handle);
+		
+	}
+
+}
+
+///Part of the btThreadSupportInterface contract; this implementation has nothing to do here,
+///the worker threads are already created by startThreads.
+void Win32ThreadSupport::startSPU()
+{
+	//intentionally empty
+}
+
+
+///Shut down every worker thread and release its Win32 handles.
+///For each slot: wait for a task that is still outstanding, wake the worker with a null user
+///pointer (its signal to exit), wait for the acknowledgement and for the thread itself to end,
+///then close the two event handles and the thread handle. Finally both arrays are cleared, so
+///calling this a second time is a no-op.
+void Win32ThreadSupport::stopSPU()
+{
+	int i;
+	for (i=0;i<m_activeSpuStatus.size();i++)
+	{
+		btSpuStatus& spuStatus = m_activeSpuStatus[i];
+		if (spuStatus.m_status>0)
+		{
+			//a task was issued and its completion has not been collected yet
+			WaitForSingleObject(spuStatus.m_eventCompletetHandle, INFINITE);
+		}
+		
+
+		//a null user pointer tells the worker to leave its loop
+		spuStatus.m_userPtr = 0;
+		SetEvent(spuStatus.m_eventStartHandle);
+		WaitForSingleObject(spuStatus.m_eventCompletetHandle, INFINITE);
+
+		//the worker may still be running after it signalled completion; wait until the thread has
+		//really ended before its status slot and handles are destroyed underneath it
+		if (spuStatus.m_threadHandle)
+		{
+			WaitForSingleObject(spuStatus.m_threadHandle, INFINITE);
+		}
+
+		CloseHandle(spuStatus.m_eventCompletetHandle);
+		CloseHandle(spuStatus.m_eventStartHandle);
+		CloseHandle(spuStatus.m_threadHandle);
+	}
+
+	m_activeSpuStatus.clear();
+	m_completeHandles.clear();
+
+}
+
+#endif //USE_WIN32_THREADING
--- a/src/BulletMultiThreaded/Win32ThreadSupport.h
+++ b/src/BulletMultiThreaded/Win32ThreadSupport.h
@@ -0,0 +1,121 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "LinearMath/btScalar.h"
+#include "PlatformDefinitions.h"
+
+#ifdef USE_WIN32_THREADING  //platform specific defines are defined in PlatformDefinitions.h
+
+#ifndef WIN32_THREAD_SUPPORT_H
+#define WIN32_THREAD_SUPPORT_H
+
+#include "LinearMath/btAlignedObjectArray.h"
+
+#include "btThreadSupportInterface.h"
+
+
+typedef void (*Win32ThreadFunc)(void* userPtr,void* lsMemory);
+typedef void* (*Win32lsMemorySetupFunc)();
+
+
+
+
+
+
+///Win32ThreadSupport implements btThreadSupportInterface on top of Win32 threads and events.
+///Each worker thread owns one btSpuStatus slot; tasks are handed over with sendRequest and
+///collected with waitForResponse.
+class Win32ThreadSupport : public btThreadSupportInterface 
+{
+public:
+	///per-worker bookkeeping (the 'Spu' naming is shared with the other thread support back-ends)
+	struct	btSpuStatus
+	{
+		uint32_t	m_taskId;		//index of this slot
+		uint32_t	m_commandId;	//command passed to the last sendRequest
+		uint32_t	m_status;		//0 idle, 1 task issued, 2 task finished, 3 thread exiting
+
+		Win32ThreadFunc	m_userThreadFunc;
+		void*	m_userPtr; //for taskDesc etc
+		void*	m_lsMemory; //initialized using Win32lsMemorySetupFunc
+
+		void*	m_threadHandle; //this one is calling 'Win32ThreadFunc'
+
+		void*	m_eventStartHandle;	//signalled to wake the worker
+		char	m_eventStartHandleName[32];
+
+		void*	m_eventCompletetHandle;	//signalled by the worker when it is done
+		char	m_eventCompletetHandleName[32];
+		
+
+	};
+private:
+
+	btAlignedObjectArray<btSpuStatus>	m_activeSpuStatus;
+	///complete-event handle of every slot, in slot order, for WaitForMultipleObjects
+	btAlignedObjectArray<void*>			m_completeHandles;
+	
+public:
+	///Describes the worker threads to create.
+
+	struct	Win32ThreadConstructionInfo
+	{
+		///uniqueName:      used to build the names of the per-thread events (const, so string literals can be passed)
+		///userThreadFunc:  function every worker runs for each task
+		///lsMemoryFunc:    called once per worker; its result is passed to userThreadFunc as lsMemory
+		///numThreads:      number of worker threads
+		///threadStackSize: stack size handed to CreateThread
+		Win32ThreadConstructionInfo(const char* uniqueName,
+									Win32ThreadFunc userThreadFunc,
+									Win32lsMemorySetupFunc	lsMemoryFunc,
+									int numThreads=1,
+									int threadStackSize=65535
+									)
+									:m_uniqueName(uniqueName),
+									m_userThreadFunc(userThreadFunc),
+									m_lsMemoryFunc(lsMemoryFunc),
+									m_numThreads(numThreads),
+									m_threadStackSize(threadStackSize)
+		{
+
+		}
+
+		const char*				m_uniqueName;
+		Win32ThreadFunc			m_userThreadFunc;
+		Win32lsMemorySetupFunc	m_lsMemoryFunc;
+		int						m_numThreads;
+		int						m_threadStackSize;
+
+	};
+
+
+
+	///starts the worker threads right away (see startThreads)
+	Win32ThreadSupport(const Win32ThreadConstructionInfo& threadConstructionInfo);
+
+///stops the worker threads
+	virtual	~Win32ThreadSupport();
+
+	void	startThreads(const Win32ThreadConstructionInfo&	threadInfo);
+
+
+///hand a task to a worker: uiArgument0 is the user pointer for the task, uiArgument1 the worker/task index
+	virtual	void sendRequest(uint32_t uiCommand, uint32_t uiArgument0, uint32_t uiArgument1);
+
+///wait until a worker has finished; puiArgument0 receives its task id, puiArgument1 its status
+	virtual	void waitForResponse(unsigned int *puiArgument0, unsigned int *puiArgument1);
+
+///part of the interface; nothing to do for Win32 threads
+	virtual	void startSPU();
+
+///tell the task scheduler we are done with the SPU tasks
+	virtual	void stopSPU();
+
+};
+
+#endif //WIN32_THREAD_SUPPORT_H
+
+#endif //USE_WIN32_THREADING
--- a/src/BulletMultiThreaded/btThreadSupportInterface.cpp
+++ b/src/BulletMultiThreaded/btThreadSupportInterface.cpp
@@ -0,0 +1,22 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "btThreadSupportInterface.h"
+
+///Out-of-line definition of the virtual destructor; the interface has nothing to release.
+btThreadSupportInterface::~btThreadSupportInterface() {}
+
--- a/src/BulletMultiThreaded/btThreadSupportInterface.h
+++ b/src/BulletMultiThreaded/btThreadSupportInterface.h
@@ -0,0 +1,43 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef THREAD_SUPPORT_INTERFACE_H
+#define THREAD_SUPPORT_INTERFACE_H
+
+
+//#include <LinearMath/btScalar.h> //for uint32_t etc.
+#include "PlatformDefinitions.h"
+
+///Abstract interface for the thread support back-ends: every operation except the
+///destructor is pure virtual and has to be provided by the platform implementation.
+class btThreadSupportInterface
+{
+public:
+	virtual ~btThreadSupportInterface();
+
+	///send messages to SPUs
+	virtual void sendRequest(uint32_t uiCommand, uint32_t uiArgument0, uint32_t uiArgument1) = 0;
+
+	///check for messages from SPUs
+	virtual void waitForResponse(unsigned int *puiArgument0, unsigned int *puiArgument1) = 0;
+
+	///start the spus (can be called at the beginning of each frame, to make sure that the right SPU program is loaded)
+	virtual void startSPU() = 0;
+
+	///tell the task scheduler we are done with the SPU tasks
+	virtual void stopSPU() = 0;
+};
+
+#endif //THREAD_SUPPORT_INTERFACE_H
+
				`@@ -0,0 +1 @@`
				`Empty placeholder for future Libspe2 SPU task`