Added libspe2 parallel Cell SPE support contribution by IBM Germany 'Extreme Blue' project:

Thanks to Benjamin Hoeferlin, Minh Cuong Tran, Martina Huellmann, Frederick Roth.
This commit is contained in:
ejcoumans
2007-09-26 23:35:47 +00:00
parent b2b2ea71c6
commit 0ff4444118
3 changed files with 334 additions and 248 deletions

View File

@@ -1,106 +1,107 @@
#ifndef DOUBLE_BUFFER_H #ifndef DOUBLE_BUFFER_H
#define DOUBLE_BUFFER_H #define DOUBLE_BUFFER_H
#include "SpuFakeDma.h" #include "SpuFakeDma.h"
#include <LinearMath/btScalar.h>
/// DoubleBuffer holds two arrays of `size` elements of T (m_buffer0 and m_buffer1) and
/// exposes them through a front pointer and a back pointer.
/// DMA gets/puts are issued against the back buffer; swapBuffers() waits for a pending
/// transfer (if any) and then exchanges the two pointers.
/// On the Cell builds the arrays are declared with a 128-byte alignment attribute.
///DoubleBuffer
template<class T, int size> ///DoubleBuffer
class DoubleBuffer template<class T, int size>
{ class DoubleBuffer
#ifdef __CELLOS_LV2__ {
T m_buffer0[size] __attribute__ ((aligned (128))); #if defined(__CELLOS_LV2__) || defined(USE_LIBSPE2)
T m_buffer1[size] __attribute__ ((aligned (128))); ATTRIBUTE_ALIGNED128( T m_buffer0[size] ) ;
#else ATTRIBUTE_ALIGNED128( T m_buffer1[size] ) ;
T m_buffer0[size]; #else
T m_buffer1[size]; T m_buffer0[size];
#endif T m_buffer1[size];
#endif
T *m_frontBuffer;
T *m_backBuffer; T *m_frontBuffer;
T *m_backBuffer;
unsigned int m_dmaTag;
bool m_dmaPending; unsigned int m_dmaTag;
public: bool m_dmaPending;
bool isPending() const { return m_dmaPending;} public:
DoubleBuffer(); bool isPending() const { return m_dmaPending;}
DoubleBuffer();
void init ();
void init ();
// dma get and put commands
void backBufferDmaGet(uint64_t ea, unsigned int numBytes, unsigned int tag); // dma get and put commands
void backBufferDmaPut(uint64_t ea, unsigned int numBytes, unsigned int tag); void backBufferDmaGet(uint64_t ea, unsigned int numBytes, unsigned int tag);
void backBufferDmaPut(uint64_t ea, unsigned int numBytes, unsigned int tag);
// gets pointer to a buffer
T *getFront(); // gets pointer to a buffer
T *getBack(); T *getFront();
T *getBack();
// if back buffer dma was started, wait for it to complete
// then move back to front and vice versa // if back buffer dma was started, wait for it to complete
T *swapBuffers(); // then move back to front and vice versa
}; T *swapBuffers();
};
/// Constructor: sets up the buffer pointers and pending flag by calling init().
template<class T, int size>
DoubleBuffer<T,size>::DoubleBuffer() template<class T, int size>
{ DoubleBuffer<T,size>::DoubleBuffer()
init (); {
} init ();
}
/// Resets the object: clears the pending-DMA flag, points the front buffer at m_buffer0
/// and the back buffer at m_buffer1. m_dmaTag is not assigned here.
template<class T, int size>
void DoubleBuffer<T,size>::init() template<class T, int size>
{ void DoubleBuffer<T,size>::init()
this->m_dmaPending = false; {
this->m_frontBuffer = &this->m_buffer0[0]; this->m_dmaPending = false;
this->m_backBuffer = &this->m_buffer1[0]; this->m_frontBuffer = &this->m_buffer0[0];
} this->m_backBuffer = &this->m_buffer1[0];
}
/// Starts a get into the back buffer: marks a transfer as pending, remembers `tag` in
/// m_dmaTag, and calls cellDmaLargeGet(m_backBuffer, ea, numBytes, tag, 0, 0).
/// No check that numBytes fits into the back buffer is made here.
template<class T, int size>
void template<class T, int size>
DoubleBuffer<T,size>::backBufferDmaGet(uint64_t ea, unsigned int numBytes, unsigned int tag) void
{ DoubleBuffer<T,size>::backBufferDmaGet(uint64_t ea, unsigned int numBytes, unsigned int tag)
m_dmaPending = true; {
m_dmaTag = tag; m_dmaPending = true;
cellDmaLargeGet(m_backBuffer, ea, numBytes, tag, 0, 0); m_dmaTag = tag;
} cellDmaLargeGet(m_backBuffer, ea, numBytes, tag, 0, 0);
}
/// Starts a put from the back buffer: marks a transfer as pending, remembers `tag` in
/// m_dmaTag, and calls cellDmaLargePut(m_backBuffer, ea, numBytes, tag, 0, 0).
/// No check of numBytes against the buffer size is made here.
template<class T, int size>
void template<class T, int size>
DoubleBuffer<T,size>::backBufferDmaPut(uint64_t ea, unsigned int numBytes, unsigned int tag) void
{ DoubleBuffer<T,size>::backBufferDmaPut(uint64_t ea, unsigned int numBytes, unsigned int tag)
m_dmaPending = true; {
m_dmaTag = tag; m_dmaPending = true;
cellDmaLargePut(m_backBuffer, ea, numBytes, tag, 0, 0); m_dmaTag = tag;
} cellDmaLargePut(m_backBuffer, ea, numBytes, tag, 0, 0);
}
/// Returns the current front-buffer pointer; does not wait for or alter any pending DMA.
template<class T, int size>
T * template<class T, int size>
DoubleBuffer<T,size>::getFront() T *
{ DoubleBuffer<T,size>::getFront()
return m_frontBuffer; {
} return m_frontBuffer;
}
/// Returns the current back-buffer pointer; does not wait for or alter any pending DMA.
template<class T, int size>
T * template<class T, int size>
DoubleBuffer<T,size>::getBack() T *
{ DoubleBuffer<T,size>::getBack()
return m_backBuffer; {
} return m_backBuffer;
}
/// If a back-buffer transfer is pending, waits on the mask (1<<m_dmaTag) via
/// cellDmaWaitTagStatusAll and clears the pending flag; then exchanges the front and
/// back pointers and returns the new front buffer (the former back buffer).
/// The mask is built directly from m_dmaTag rather than through DMA_MASK.
template<class T, int size>
T * template<class T, int size>
DoubleBuffer<T,size>::swapBuffers() T *
{ DoubleBuffer<T,size>::swapBuffers()
if (m_dmaPending) {
{ if (m_dmaPending)
cellDmaWaitTagStatusAll(1<<m_dmaTag); {
m_dmaPending = false; cellDmaWaitTagStatusAll(1<<m_dmaTag);
} m_dmaPending = false;
}
T *tmp = m_backBuffer;
m_backBuffer = m_frontBuffer; T *tmp = m_backBuffer;
m_frontBuffer = tmp; m_backBuffer = m_frontBuffer;
m_frontBuffer = tmp;
return m_frontBuffer;
} return m_frontBuffer;
}
#endif
#endif

View File

@@ -1,106 +1,106 @@
#include "SpuFakeDma.h" #include "SpuFakeDma.h"
#include <LinearMath/btScalar.h> //for btAssert #include <LinearMath/btScalar.h> //for btAssert
//Disabling memcpy sometimes helps debugging DMA //Disabling memcpy sometimes helps debugging DMA
#define USE_MEMCPY 1 #define USE_MEMCPY 1
/// Copies `size` bytes (asserted to be < 16) from effective address `ea` to `ls`.
/// The data is first fetched into a 32-byte scratch buffer (declared with
/// ATTRIBUTE_ALIGNED16) at the same low-4-bit offset as `ea`: under WIN32 by memcpy or a
/// byte loop, otherwise by cellDmaSmallGet with DMA_TAG(1). The function then waits on
/// DMA_MASK(1) and copies the bytes one at a time into `ls`. Always returns 0.
///this unalignedDma should not be frequently used, only for small data. It handles alignment and performs check on size (<16 bytes) ///this unalignedDma should not be frequently used, only for small data. It handles alignment and performs check on size (<16 bytes)
int stallingUnalignedDmaSmallGet(void *ls, uint64_t ea, uint32_t size) int stallingUnalignedDmaSmallGet(void *ls, uint64_t ea, uint32_t size)
{ {
btAssert(size<16); btAssert(size<16);
ATTRIBUTE_ALIGNED16(char tmpBuffer[32]); ATTRIBUTE_ALIGNED16(char tmpBuffer[32]);
char* mainMem = (char*)ea; char* mainMem = (char*)ea;
char* localStore = (char*)ls; char* localStore = (char*)ls;
uint32_t i; uint32_t i;
///make sure last 4 bits are the same, for cellDmaSmallGet ///make sure last 4 bits are the same, for cellDmaSmallGet
uint32_t last4BitsOffset = ea & 0x0f; uint32_t last4BitsOffset = ea & 0x0f;
char* tmpTarget = tmpBuffer + last4BitsOffset; char* tmpTarget = tmpBuffer + last4BitsOffset;
#ifdef WIN32 #ifdef WIN32
#ifdef USE_MEMCPY #ifdef USE_MEMCPY
memcpy(tmpTarget,mainMem,size); memcpy(tmpTarget,mainMem,size);
#else #else
for ( i=0;i<size;i++) for ( i=0;i<size;i++)
{ {
tmpTarget[i] = mainMem[i]; tmpTarget[i] = mainMem[i];
} }
#endif #endif //USE_MEMCPY
#else #else
cellDmaSmallGet(tmpTarget,ea,size,DMA_TAG(1),0,0); cellDmaSmallGet(tmpTarget,ea,size,DMA_TAG(1),0,0);
//copy into final destination //copy into final destination
#endif #endif //WIN32
cellDmaWaitTagStatusAll(DMA_MASK(1)); cellDmaWaitTagStatusAll(DMA_MASK(1));
//this is slowish, perhaps memcpy on SPU is smarter? //this is slowish, perhaps memcpy on SPU is smarter?
for (i=0;i<size;i++) for (i=0; btLikely( i<size );i++)
{ {
localStore[i] = tmpTarget[i]; localStore[i] = tmpTarget[i];
} }
return 0; return 0;
} }
#ifdef WIN32 #ifdef WIN32
/// WIN32 stand-in for cellDmaLargeGet: treats `ea` as an ordinary pointer and copies
/// `size` bytes from it to `ls` (memcpy when USE_MEMCPY is defined, else a byte loop).
/// tag, tid and rid are not used. Always returns 0.
int cellDmaLargeGet(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid) int cellDmaLargeGet(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid)
{ {
char* mainMem = (char*)ea; char* mainMem = (char*)ea;
char* localStore = (char*)ls; char* localStore = (char*)ls;
#ifdef USE_MEMCPY #ifdef USE_MEMCPY
memcpy(localStore,mainMem,size); memcpy(localStore,mainMem,size);
#else #else
for (uint32_t i=0;i<size;i++) for (uint32_t i=0;i<size;i++)
{ {
localStore[i] = mainMem[i]; localStore[i] = mainMem[i];
} }
#endif #endif
return 0; return 0;
} }
/// WIN32 stand-in for cellDmaGet: same body as cellDmaLargeGet — copies `size` bytes
/// from the address held in `ea` to `ls`. tag, tid and rid are not used. Always returns 0.
int cellDmaGet(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid) int cellDmaGet(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid)
{ {
char* mainMem = (char*)ea; char* mainMem = (char*)ea;
char* localStore = (char*)ls; char* localStore = (char*)ls;
#ifdef USE_MEMCPY #ifdef USE_MEMCPY
memcpy(localStore,mainMem,size); memcpy(localStore,mainMem,size);
#else #else
for (uint32_t i=0;i<size;i++) for (uint32_t i=0;i<size;i++)
{ {
localStore[i] = mainMem[i]; localStore[i] = mainMem[i];
} }
#endif //#ifdef USE_MEMCPY #endif //#ifdef USE_MEMCPY
return 0; return 0;
} }
/// WIN32 stand-in for cellDmaLargePut: copies `size` bytes from `ls` to the address held
/// in `ea` (memcpy when USE_MEMCPY is defined, else a byte loop).
/// tag, tid and rid are not used. Always returns 0.
int cellDmaLargePut(const void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid) int cellDmaLargePut(const void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid)
{ {
char* mainMem = (char*)ea; char* mainMem = (char*)ea;
const char* localStore = (const char*)ls; const char* localStore = (const char*)ls;
#ifdef USE_MEMCPY #ifdef USE_MEMCPY
memcpy(mainMem,localStore,size); memcpy(mainMem,localStore,size);
#else #else
for (uint32_t i=0;i<size;i++) for (uint32_t i=0;i<size;i++)
{ {
mainMem[i] = localStore[i]; mainMem[i] = localStore[i];
} }
#endif //#ifdef USE_MEMCPY #endif //#ifdef USE_MEMCPY
return 0; return 0;
} }
/// WIN32 stand-in for cellDmaWaitTagStatusAll: intentionally empty, since the WIN32
/// copy functions in this file finish before they return. The argument is ignored.
void cellDmaWaitTagStatusAll(int ignore) void cellDmaWaitTagStatusAll(int ignore)
{ {
} }
#endif //WIN32 #endif //WIN32

View File

@@ -1,36 +1,121 @@
#ifndef FAKE_DMA_H #ifndef FAKE_DMA_H
#define FAKE_DMA_H #define FAKE_DMA_H
#include "PlatformDefinitions.h" #include "PlatformDefinitions.h"
#ifdef __CELLOS_LV2__ #ifdef __CELLOS_LV2__
#include <cell/dma.h> #include <cell/dma.h>
#include <stdint.h> #include <stdint.h>
// Tag helpers for the __CELLOS_LV2__ build: DMA_TAG adds one to the transfer index and
// DMA_MASK shifts 1 left by that tag.
// NOTE(review): xfer is not parenthesized in DMA_TAG, so an argument containing an operator
// of lower precedence than + (e.g. a|b) would expand unexpectedly — confirm callers pass simple values.
#define DMA_TAG(xfer) (xfer + 1) #define DMA_TAG(xfer) (xfer + 1)
#define DMA_MASK(xfer) (1 << DMA_TAG(xfer)) #define DMA_MASK(xfer) (1 << DMA_TAG(xfer))
#elif defined (WIN32)
#else
// WIN32 branch: DMA_TAG and DMA_MASK are identity macros here, and the cellDma* entry
// points are declared as ordinary functions taking the same argument list as the Cell calls.
#define DMA_TAG(a) (a) #ifdef WIN32
#define DMA_MASK(a) (a)
#define DMA_TAG(a) (a)
#define DMA_MASK(a) (a)
/// cellDmaLargeGet Win32 replacements for Cell DMA to allow simulating most of the SPU code (just memcpy)
int cellDmaLargeGet(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid);
int cellDmaGet(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid); /// cellDmaLargeGet Win32 replacements for Cell DMA to allow simulating most of the SPU code (just memcpy)
/// cellDmaLargePut Win32 replacements for Cell DMA to allow simulating most of the SPU code (just memcpy) int cellDmaLargeGet(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid);
int cellDmaLargePut(const void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid); int cellDmaGet(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid);
/// cellDmaWaitTagStatusAll Win32 replacements for Cell DMA to allow simulating most of the SPU code (just memcpy) /// cellDmaLargePut Win32 replacements for Cell DMA to allow simulating most of the SPU code (just memcpy)
void cellDmaWaitTagStatusAll(int ignore); int cellDmaLargePut(const void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid);
#endif //WIN32 /// cellDmaWaitTagStatusAll Win32 replacements for Cell DMA to allow simulating most of the SPU code (just memcpy)
void cellDmaWaitTagStatusAll(int ignore);
///stallingUnalignedDmaSmallGet internally uses DMA_TAG(1)
int stallingUnalignedDmaSmallGet(void *ls, uint64_t ea, uint32_t size);
#elif defined(USE_LIBSPE2)
#endif //FAKE_DMA_H
/// Tag helpers for the libspe2 build.
/// DMA_TAG(xfer) is the transfer index plus one; DMA_MASK(xfer) is the single-bit mask
/// (1 << tag) for that tag, suitable for cellDmaWaitTagStatusAll.
/// The argument is parenthesized so that expressions such as DMA_TAG(a | b) expand as intended.
#define DMA_TAG(xfer) ((xfer) + 1)
#define DMA_MASK(xfer) (1 << DMA_TAG(xfer))
#include <spu_mfcio.h>
/// libspe2 mapping of the cellDma* entry points onto the MFC calls mfc_get / mfc_put.
/// Define DEBUG_DMA (uncomment the line below) to have every request validated before it
/// is issued; a request that looks invalid is reported with printf and then issued anyway.
//#define DEBUG_DMA
#ifdef DEBUG_DMA
#include <stdio.h> /* printf, used by the diagnostics below */

/// Prints message b when condition a is false. Wrapped in do/while(0) so it is a single
/// statement, and the message is passed as data rather than as the format string.
#define dUASSERT(a,b) do { if (!(a)) { printf("%s", (b)); } } while (0)

/// Integer type wide enough to hold an address for the checks below.
#ifdef USE_ADDR64
#define uintsize unsigned long long
#else
#define uintsize unsigned int
#endif

/// Largest size accepted by the check below for a single request, in bytes (16 KB, inclusive).
#define SPU_DMA_MAX_TRANSFER_BYTES 16384

/// Shared validation for all debug DMA wrappers.
///  opName            string literal, "GET" or "PUT", used as the prefix of the report line
///  ls, ea, size      the request being validated
///  fromAddr, toAddr  the addresses printed as source and destination in the report line
/// The gate is the same as before, except that exactly 16384 bytes is now accepted and
/// the natural-alignment test no longer takes a modulo by a zero size.
#define SPU_DMA_DEBUG_CHECK(opName, ls, ea, size, fromAddr, toAddr) \
	do { \
		const uintsize dmaDbgLs_ = (uintsize)(ls); \
		const uintsize dmaDbgEa_ = (uintsize)(ea); \
		const uintsize dmaDbgSize_ = (uintsize)(size); \
		const uintsize dmaDbgLsOff_ = dmaDbgLs_ % 16; \
		const uintsize dmaDbgEaOff_ = dmaDbgEa_ % 16; \
		const uintsize dmaDbgSizeOff_ = dmaDbgSize_ % 16; \
		if ( (dmaDbgLsOff_ != dmaDbgEaOff_) \
			|| ( (dmaDbgEaOff_ || dmaDbgLsOff_) && ( (dmaDbgLsOff_ != dmaDbgSizeOff_) || (dmaDbgEaOff_ != dmaDbgSizeOff_) ) ) \
			|| ( dmaDbgSizeOff_ && (dmaDbgSize_ != 1) && (dmaDbgSize_ != 2) && (dmaDbgSize_ != 4) && (dmaDbgSize_ != 8) ) \
			|| (dmaDbgSize_ > SPU_DMA_MAX_TRANSFER_BYTES) || !dmaDbgLs_ || !dmaDbgEa_ ) { \
			dUASSERT( (dmaDbgEaOff_ == 0) || (dmaDbgSize_ < 16), "XDR Address not aligned: "); \
			dUASSERT( (dmaDbgLsOff_ == 0) || (dmaDbgSize_ < 16), "LS Address not aligned: "); \
			dUASSERT( (dmaDbgSize_ == 0) || (dmaDbgSize_ > 16) || ( ((dmaDbgLs_ % dmaDbgSize_) == 0) && ((dmaDbgEa_ % dmaDbgSize_) == 0) ), "Not naturally aligned: "); \
			dUASSERT( (dmaDbgSize_ == 1) || (dmaDbgSize_ == 2) || (dmaDbgSize_ == 4) || (dmaDbgSize_ == 8) || (dmaDbgSizeOff_ == 0), "size not a multiple of 16byte: "); \
			dUASSERT( dmaDbgSize_ <= SPU_DMA_MAX_TRANSFER_BYTES, "size too big: "); \
			dUASSERT( dmaDbgEaOff_ == dmaDbgLsOff_, "wrong Quadword alignment of LS and EA: "); \
			dUASSERT( dmaDbgEa_ != 0, "Nullpointer EA: "); \
			dUASSERT( dmaDbgLs_ != 0, "Nullpointer LS: "); \
			printf(opName " %s:%d from: 0x%x, to: 0x%x - %d bytes\n", __FILE__, __LINE__, (unsigned int)(fromAddr), (unsigned int)(toAddr), (unsigned int)(size)); \
		} \
	} while (0)

/// Debug wrappers: validate, then issue the transfer. Each expands to one statement, so
/// they behave correctly inside an unbraced if/else.
#define cellDmaLargeGet(ls, ea, size, tag, tid, rid) \
	do { SPU_DMA_DEBUG_CHECK("GET", ls, ea, size, ea, ls); mfc_get(ls, ea, size, tag, tid, rid); } while (0)
#define cellDmaGet(ls, ea, size, tag, tid, rid) \
	do { SPU_DMA_DEBUG_CHECK("GET", ls, ea, size, ea, ls); mfc_get(ls, ea, size, tag, tid, rid); } while (0)
#define cellDmaLargePut(ls, ea, size, tag, tid, rid) \
	do { SPU_DMA_DEBUG_CHECK("PUT", ls, ea, size, ls, ea); mfc_put(ls, ea, size, tag, tid, rid); } while (0)
#define cellDmaSmallGet(ls, ea, size, tag, tid, rid) \
	do { SPU_DMA_DEBUG_CHECK("GET", ls, ea, size, ea, ls); mfc_get(ls, ea, size, tag, tid, rid); } while (0)
#else
/// Release mapping: each cellDma* call forwards its arguments unchanged to mfc_get / mfc_put.
#define cellDmaLargeGet(ls, ea, size, tag, tid, rid) mfc_get(ls, ea, size, tag, tid, rid)
#define cellDmaGet(ls, ea, size, tag, tid, rid) mfc_get(ls, ea, size, tag, tid, rid)
#define cellDmaLargePut(ls, ea, size, tag, tid, rid) mfc_put(ls, ea, size, tag, tid, rid)
#define cellDmaSmallGet(ls, ea, size, tag, tid, rid) mfc_get(ls, ea, size, tag, tid, rid)
#endif // DEBUG_DMA

/// Writes the tag mask, then reads the tag status via mfc_read_tag_status_all().
/// Here the argument is the tag mask (the name `ignore` mirrors the Win32 stub).
/// Both calls are wrapped in do/while(0) so the macro is a single statement; previously
/// the status read escaped an unbraced `if`.
#define cellDmaWaitTagStatusAll(ignore) do { mfc_write_tag_mask(ignore); mfc_read_tag_status_all(); } while (0)
#endif // WIN32
#endif //__CELLOS_LV2__
/// Get of a small block (size is asserted to be below 16 bytes in the definition) from
/// effective address ea into ls, with no alignment requirement on the caller.
/// The definition waits on DMA_MASK(1) before copying the data out, and returns 0.
///stallingUnalignedDmaSmallGet internally uses DMA_TAG(1)
int stallingUnalignedDmaSmallGet(void *ls, uint64_t ea, uint32_t size);
#endif //FAKE_DMA_H