Added libspe2 parallel Cell SPE support contribution by IBM Germany 'Extreme Blue' project:

Thanks to Benjamin Hoeferlin, Minh Cuong Tran, Martina Huellmann, Frederick Roth.
This commit is contained in:
ejcoumans
2007-09-26 23:35:47 +00:00
parent b2b2ea71c6
commit 0ff4444118
3 changed files with 334 additions and 248 deletions

View File

@@ -1,106 +1,107 @@
#ifndef DOUBLE_BUFFER_H
#define DOUBLE_BUFFER_H
#include "SpuFakeDma.h"
///DoubleBuffer keeps two arrays of `size` elements of type T and two pointers that
///designate one of them as the "front" buffer and the other as the "back" buffer.
///DMA get/put commands are issued on the back buffer; swapBuffers() waits for a
///transfer started through this class and then exchanges the two roles.
template<class T, int size>
class DoubleBuffer
{
// On __CELLOS_LV2__ both arrays are declared with 128-byte alignment.
#ifdef __CELLOS_LV2__
T m_buffer0[size] __attribute__ ((aligned (128)));
T m_buffer1[size] __attribute__ ((aligned (128)));
#else
T m_buffer0[size];
T m_buffer1[size];
#endif
// Each points at the first element of m_buffer0 or m_buffer1.
T *m_frontBuffer;
T *m_backBuffer;
// Tag given to the most recent backBufferDmaGet/backBufferDmaPut call;
// swapBuffers() reads it while m_dmaPending is true.
unsigned int m_dmaTag;
// Set by backBufferDmaGet/backBufferDmaPut, cleared by init() and swapBuffers().
bool m_dmaPending;
public:
// true if a get/put was issued since the last init() or swapBuffers()
bool isPending() const { return m_dmaPending;}
// calls init()
DoubleBuffer();
// clears the pending flag; front = m_buffer0, back = m_buffer1
void init ();
// dma get and put commands, both operate on the current back buffer
void backBufferDmaGet(uint64_t ea, unsigned int numBytes, unsigned int tag);
void backBufferDmaPut(uint64_t ea, unsigned int numBytes, unsigned int tag);
// gets pointer to a buffer
T *getFront();
T *getBack();
// if back buffer dma was started, wait for it to complete
// then move back to front and vice versa; returns the new front buffer
T *swapBuffers();
};
///Constructs the double buffer in its initial state by delegating to init().
template <class T, int size>
DoubleBuffer<T, size>::DoubleBuffer()
{
	this->init();
}
///Resets the object to its initial state: no DMA pending, tag 0, m_buffer0 as the
///front buffer and m_buffer1 as the back buffer.
///It does not wait for a transfer that was issued earlier.
template <class T, int size>
void DoubleBuffer<T, size>::init()
{
	m_dmaPending = false;
	// Give the tag a defined value so that every member is initialised, even
	// before the first backBufferDmaGet/backBufferDmaPut call.
	m_dmaTag = 0;
	m_frontBuffer = &m_buffer0[0];
	m_backBuffer = &m_buffer1[0];
}
///Issues cellDmaLargeGet to fetch numBytes from ea into the current back buffer
///and remembers the tag so that swapBuffers() can wait on it.
///@param ea       source address passed to cellDmaLargeGet
///@param numBytes number of bytes to transfer
///@param tag      DMA tag used for the transfer
template <class T, int size>
void DoubleBuffer<T, size>::backBufferDmaGet(uint64_t ea, unsigned int numBytes, unsigned int tag)
{
	// Record the bookkeeping first, then hand the request to the DMA layer.
	m_dmaTag = tag;
	m_dmaPending = true;

	cellDmaLargeGet(m_backBuffer, ea, numBytes, tag, 0, 0);
}
///Issues cellDmaLargePut to write numBytes from the current back buffer to ea
///and remembers the tag so that swapBuffers() can wait on it.
///@param ea       destination address passed to cellDmaLargePut
///@param numBytes number of bytes to transfer
///@param tag      DMA tag used for the transfer
template <class T, int size>
void DoubleBuffer<T, size>::backBufferDmaPut(uint64_t ea, unsigned int numBytes, unsigned int tag)
{
	// Record the bookkeeping first, then hand the request to the DMA layer.
	m_dmaTag = tag;
	m_dmaPending = true;

	cellDmaLargePut(m_backBuffer, ea, numBytes, tag, 0, 0);
}
///Returns the pointer currently designated as the front buffer.
template <class T, int size>
T* DoubleBuffer<T, size>::getFront()
{
	T* const front = m_frontBuffer;
	return front;
}
///Returns the pointer currently designated as the back buffer
///(the one the DMA get/put commands operate on).
template <class T, int size>
T* DoubleBuffer<T, size>::getBack()
{
	T* const back = m_backBuffer;
	return back;
}
///If a back-buffer DMA was started, waits for its tag to complete; then exchanges
///the front and back buffers.
///@return the new front buffer (the buffer that was the back buffer before the call)
template <class T, int size>
T* DoubleBuffer<T, size>::swapBuffers()
{
	if (m_dmaPending)
	{
		// The tag mask is built from an unsigned literal: with a signed 1,
		// tag 31 would shift into the sign bit of int.
		cellDmaWaitTagStatusAll(1u << m_dmaTag);
		m_dmaPending = false;
	}

	T* const previousFront = m_frontBuffer;
	m_frontBuffer = m_backBuffer;
	m_backBuffer = previousFront;
	return m_frontBuffer;
}
#endif
#ifndef DOUBLE_BUFFER_H
#define DOUBLE_BUFFER_H
#include "SpuFakeDma.h"
#include <LinearMath/btScalar.h>
///DoubleBuffer keeps two arrays of `size` elements of type T and two pointers that
///designate one of them as the "front" buffer and the other as the "back" buffer.
///DMA get/put commands are issued on the back buffer; swapBuffers() waits for a
///transfer started through this class and then exchanges the two roles.
template<class T, int size>
class DoubleBuffer
{
// On __CELLOS_LV2__ and USE_LIBSPE2 builds the arrays are wrapped in ATTRIBUTE_ALIGNED128.
// NOTE(review): presumably this requests 128-byte alignment and comes from btScalar.h - confirm.
#if defined(__CELLOS_LV2__) || defined(USE_LIBSPE2)
ATTRIBUTE_ALIGNED128( T m_buffer0[size] ) ;
ATTRIBUTE_ALIGNED128( T m_buffer1[size] ) ;
#else
T m_buffer0[size];
T m_buffer1[size];
#endif
// Each points at the first element of m_buffer0 or m_buffer1.
T *m_frontBuffer;
T *m_backBuffer;
// Tag given to the most recent backBufferDmaGet/backBufferDmaPut call;
// swapBuffers() reads it while m_dmaPending is true.
unsigned int m_dmaTag;
// Set by backBufferDmaGet/backBufferDmaPut, cleared by init() and swapBuffers().
bool m_dmaPending;
public:
// true if a get/put was issued since the last init() or swapBuffers()
bool isPending() const { return m_dmaPending;}
// calls init()
DoubleBuffer();
// clears the pending flag; front = m_buffer0, back = m_buffer1
void init ();
// dma get and put commands, both operate on the current back buffer
void backBufferDmaGet(uint64_t ea, unsigned int numBytes, unsigned int tag);
void backBufferDmaPut(uint64_t ea, unsigned int numBytes, unsigned int tag);
// gets pointer to a buffer
T *getFront();
T *getBack();
// if back buffer dma was started, wait for it to complete
// then move back to front and vice versa; returns the new front buffer
T *swapBuffers();
};
///Constructs the double buffer in its initial state by delegating to init().
template <class T, int size>
DoubleBuffer<T, size>::DoubleBuffer()
{
	this->init();
}
///Resets the object to its initial state: no DMA pending, tag 0, m_buffer0 as the
///front buffer and m_buffer1 as the back buffer.
///It does not wait for a transfer that was issued earlier.
template <class T, int size>
void DoubleBuffer<T, size>::init()
{
	m_dmaPending = false;
	// Give the tag a defined value so that every member is initialised, even
	// before the first backBufferDmaGet/backBufferDmaPut call.
	m_dmaTag = 0;
	m_frontBuffer = &m_buffer0[0];
	m_backBuffer = &m_buffer1[0];
}
///Issues cellDmaLargeGet to fetch numBytes from ea into the current back buffer
///and remembers the tag so that swapBuffers() can wait on it.
///@param ea       source address passed to cellDmaLargeGet
///@param numBytes number of bytes to transfer
///@param tag      DMA tag used for the transfer
template <class T, int size>
void DoubleBuffer<T, size>::backBufferDmaGet(uint64_t ea, unsigned int numBytes, unsigned int tag)
{
	// Record the bookkeeping first, then hand the request to the DMA layer.
	m_dmaTag = tag;
	m_dmaPending = true;

	cellDmaLargeGet(m_backBuffer, ea, numBytes, tag, 0, 0);
}
///Issues cellDmaLargePut to write numBytes from the current back buffer to ea
///and remembers the tag so that swapBuffers() can wait on it.
///@param ea       destination address passed to cellDmaLargePut
///@param numBytes number of bytes to transfer
///@param tag      DMA tag used for the transfer
template <class T, int size>
void DoubleBuffer<T, size>::backBufferDmaPut(uint64_t ea, unsigned int numBytes, unsigned int tag)
{
	// Record the bookkeeping first, then hand the request to the DMA layer.
	m_dmaTag = tag;
	m_dmaPending = true;

	cellDmaLargePut(m_backBuffer, ea, numBytes, tag, 0, 0);
}
///Returns the pointer currently designated as the front buffer.
template <class T, int size>
T* DoubleBuffer<T, size>::getFront()
{
	T* const front = m_frontBuffer;
	return front;
}
///Returns the pointer currently designated as the back buffer
///(the one the DMA get/put commands operate on).
template <class T, int size>
T* DoubleBuffer<T, size>::getBack()
{
	T* const back = m_backBuffer;
	return back;
}
///If a back-buffer DMA was started, waits for its tag to complete; then exchanges
///the front and back buffers.
///@return the new front buffer (the buffer that was the back buffer before the call)
template <class T, int size>
T* DoubleBuffer<T, size>::swapBuffers()
{
	if (m_dmaPending)
	{
		// The tag mask is built from an unsigned literal: with a signed 1,
		// tag 31 would shift into the sign bit of int.
		cellDmaWaitTagStatusAll(1u << m_dmaTag);
		m_dmaPending = false;
	}

	T* const previousFront = m_frontBuffer;
	m_frontBuffer = m_backBuffer;
	m_backBuffer = previousFront;
	return m_frontBuffer;
}
#endif

View File

@@ -1,106 +1,106 @@
#include "SpuFakeDma.h"
#include <LinearMath/btScalar.h> //for btAssert
//Disabling memcpy sometimes helps debugging DMA.
//The copy routines in this file test the macro with #ifdef, so it has to be left
//undefined (comment this line out) to select the byte-by-byte loops; defining it
//as 0 still selects memcpy.
#define USE_MEMCPY 1
///Blocking fetch of a small, arbitrarily aligned piece of main memory.
///This unalignedDma should not be frequently used, only for small data. The bytes are
///first brought into a local scratch buffer at the same offset within a 16-byte line
///as the source address (ea & 0x0f), the transfer on DMA_TAG(1) is waited for, and the
///bytes are then copied to ls.
///@param ls   destination, receives size bytes
///@param ea   source address in main memory
///@param size number of bytes to fetch; must be smaller than 16 (checked with btAssert)
///@return always 0
int stallingUnalignedDmaSmallGet(void *ls, uint64_t ea, uint32_t size)
{
	btAssert(size<16);

	// 32 bytes suffice: the offset is at most 15 and size is at most 15.
	ATTRIBUTE_ALIGNED16(char tmpBuffer[32]);

	char* localStore = (char*)ls;
	uint32_t i;

	///make sure last 4 bits are the same, for cellDmaSmallGet
	uint32_t last4BitsOffset = ea & 0x0f;
	char* tmpTarget = tmpBuffer + last4BitsOffset;

#ifdef WIN32
	// Only the Win32 emulation reads main memory through a host pointer, so the
	// integer-to-pointer cast is confined to this branch.
	char* mainMem = (char*)ea;
#ifdef USE_MEMCPY
	memcpy(tmpTarget,mainMem,size);
#else
	for ( i=0;i<size;i++)
	{
		tmpTarget[i] = mainMem[i];
	}
#endif
#else
	cellDmaSmallGet(tmpTarget,ea,size,DMA_TAG(1),0,0);
#endif

	cellDmaWaitTagStatusAll(DMA_MASK(1));

	//copy into final destination
	//this is slowish, perhaps memcpy on SPU is smarter?
	for (i=0;i<size;i++)
	{
		localStore[i] = tmpTarget[i];
	}
	return 0;
}
#ifdef WIN32
///Win32 stand-in for cellDmaLargeGet: copies size bytes from the address ea to ls.
///The copy is finished when the function returns; tag, tid and rid are not used.
///@return always 0
int cellDmaLargeGet(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid)
{
	char* source = (char*)ea;
	char* destination = (char*)ls;

#ifdef USE_MEMCPY
	memcpy(destination, source, size);
#else
	// byte-by-byte copy, walking both pointers
	for (char* const sourceEnd = source + size; source != sourceEnd; ++source, ++destination)
	{
		*destination = *source;
	}
#endif

	return 0;
}
///Win32 stand-in for cellDmaGet: copies size bytes from the address ea to ls.
///The copy is finished when the function returns; tag, tid and rid are not used.
///@return always 0
int cellDmaGet(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid)
{
	char* source = (char*)ea;
	char* destination = (char*)ls;

#ifdef USE_MEMCPY
	memcpy(destination, source, size);
#else
	// byte-by-byte copy, walking both pointers
	for (char* const sourceEnd = source + size; source != sourceEnd; ++source, ++destination)
	{
		*destination = *source;
	}
#endif //#ifdef USE_MEMCPY

	return 0;
}
///Win32 stand-in for cellDmaLargePut: copies size bytes from ls to the address ea.
///The copy is finished when the function returns; tag, tid and rid are not used.
///@return always 0
int cellDmaLargePut(const void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid)
{
	const char* source = (const char*)ls;
	char* destination = (char*)ea;

#ifdef USE_MEMCPY
	memcpy(destination, source, size);
#else
	// byte-by-byte copy, walking both pointers
	for (const char* const sourceEnd = source + size; source != sourceEnd; ++source, ++destination)
	{
		*destination = *source;
	}
#endif //#ifdef USE_MEMCPY

	return 0;
}
///Win32 stand-in for cellDmaWaitTagStatusAll. It has nothing to wait for: the
///Win32 replacement transfers in this file complete their copy before returning.
void cellDmaWaitTagStatusAll(int ignore)
{
	(void)ignore;
}
#endif //WIN32
#include "SpuFakeDma.h"
#include <LinearMath/btScalar.h> //for btAssert
//Disabling memcpy sometimes helps debugging DMA.
//The copy routines in this file test the macro with #ifdef, so it has to be left
//undefined (comment this line out) to select the byte-by-byte loops; defining it
//as 0 still selects memcpy.
#define USE_MEMCPY 1
///Blocking fetch of a small, arbitrarily aligned piece of main memory.
///This unalignedDma should not be frequently used, only for small data. The bytes are
///first brought into a local scratch buffer at the same offset within a 16-byte line
///as the source address (ea & 0x0f), the transfer on DMA_TAG(1) is waited for, and the
///bytes are then copied to ls.
///@param ls   destination, receives size bytes
///@param ea   source address in main memory
///@param size number of bytes to fetch; must be smaller than 16 (checked with btAssert)
///@return always 0
int stallingUnalignedDmaSmallGet(void *ls, uint64_t ea, uint32_t size)
{
	btAssert(size<16);

	// 32 bytes suffice: the offset is at most 15 and size is at most 15.
	ATTRIBUTE_ALIGNED16(char tmpBuffer[32]);

	char* localStore = (char*)ls;
	uint32_t i;

	///make sure last 4 bits are the same, for cellDmaSmallGet
	uint32_t last4BitsOffset = ea & 0x0f;
	char* tmpTarget = tmpBuffer + last4BitsOffset;

#ifdef WIN32
	// Only the Win32 emulation reads main memory through a host pointer, so the
	// integer-to-pointer cast is confined to this branch.
	char* mainMem = (char*)ea;
#ifdef USE_MEMCPY
	memcpy(tmpTarget,mainMem,size);
#else
	for ( i=0;i<size;i++)
	{
		tmpTarget[i] = mainMem[i];
	}
#endif //USE_MEMCPY
#else
	cellDmaSmallGet(tmpTarget,ea,size,DMA_TAG(1),0,0);
#endif //WIN32

	cellDmaWaitTagStatusAll(DMA_MASK(1));

	//copy into final destination
	//this is slowish, perhaps memcpy on SPU is smarter?
	for (i=0; btLikely( i<size );i++)
	{
		localStore[i] = tmpTarget[i];
	}
	return 0;
}
#ifdef WIN32
///Win32 stand-in for cellDmaLargeGet: copies size bytes from the address ea to ls.
///The copy is finished when the function returns; tag, tid and rid are not used.
///@return always 0
int cellDmaLargeGet(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid)
{
	char* source = (char*)ea;
	char* destination = (char*)ls;

#ifdef USE_MEMCPY
	memcpy(destination, source, size);
#else
	// byte-by-byte copy, walking both pointers
	for (char* const sourceEnd = source + size; source != sourceEnd; ++source, ++destination)
	{
		*destination = *source;
	}
#endif

	return 0;
}
///Win32 stand-in for cellDmaGet: copies size bytes from the address ea to ls.
///The copy is finished when the function returns; tag, tid and rid are not used.
///@return always 0
int cellDmaGet(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid)
{
	char* source = (char*)ea;
	char* destination = (char*)ls;

#ifdef USE_MEMCPY
	memcpy(destination, source, size);
#else
	// byte-by-byte copy, walking both pointers
	for (char* const sourceEnd = source + size; source != sourceEnd; ++source, ++destination)
	{
		*destination = *source;
	}
#endif //#ifdef USE_MEMCPY

	return 0;
}
///Win32 stand-in for cellDmaLargePut: copies size bytes from ls to the address ea.
///The copy is finished when the function returns; tag, tid and rid are not used.
///@return always 0
int cellDmaLargePut(const void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid)
{
	const char* source = (const char*)ls;
	char* destination = (char*)ea;

#ifdef USE_MEMCPY
	memcpy(destination, source, size);
#else
	// byte-by-byte copy, walking both pointers
	for (const char* const sourceEnd = source + size; source != sourceEnd; ++source, ++destination)
	{
		*destination = *source;
	}
#endif //#ifdef USE_MEMCPY

	return 0;
}
///Win32 stand-in for cellDmaWaitTagStatusAll. It has nothing to wait for: the
///Win32 replacement transfers in this file complete their copy before returning.
void cellDmaWaitTagStatusAll(int ignore)
{
	(void)ignore;
}
#endif //WIN32

View File

@@ -1,36 +1,121 @@
#ifndef FAKE_DMA_H
#define FAKE_DMA_H
#include "PlatformDefinitions.h"
#ifdef __CELLOS_LV2__
#include <cell/dma.h>
#include <stdint.h>
/// DMA tag for transfer index xfer (index + 1). The argument is parenthesised so
/// that expressions such as DMA_TAG(a | b) are evaluated as a whole.
#define DMA_TAG(xfer) ((xfer) + 1)
/// Tag mask with only the bit of DMA_TAG(xfer) set.
#define DMA_MASK(xfer) (1 << DMA_TAG(xfer))
#elif defined (WIN32)
/// On Win32 the tag and mask helpers pass their argument through unchanged.
#define DMA_TAG(a) (a)
#define DMA_MASK(a) (a)
/// cellDmaLargeGet Win32 replacements for Cell DMA to allow simulating most of the SPU code (just memcpy)
/// Copies size bytes from the address ea to ls; returns 0.
int cellDmaLargeGet(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid);
/// cellDmaGet Win32 replacement, same behaviour as cellDmaLargeGet above.
int cellDmaGet(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid);
/// cellDmaLargePut Win32 replacements for Cell DMA to allow simulating most of the SPU code (just memcpy)
/// Copies size bytes from ls to the address ea; returns 0.
int cellDmaLargePut(const void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid);
/// cellDmaWaitTagStatusAll Win32 replacements for Cell DMA to allow simulating most of the SPU code (just memcpy)
/// The argument is ignored and the function does nothing.
void cellDmaWaitTagStatusAll(int ignore);
#endif //WIN32
///stallingUnalignedDmaSmallGet internally uses DMA_TAG(1)
///Fetches size bytes (size must be smaller than 16) from main memory address ea into ls,
///waiting for the transfer before returning. Returns 0.
int stallingUnalignedDmaSmallGet(void *ls, uint64_t ea, uint32_t size);
#endif //FAKE_DMA_H
#ifndef FAKE_DMA_H
#define FAKE_DMA_H
#include "PlatformDefinitions.h"
#ifdef __CELLOS_LV2__
#include <cell/dma.h>
#include <stdint.h>
/// DMA tag for transfer index xfer (index + 1). The argument is parenthesised so
/// that expressions such as DMA_TAG(a | b) are evaluated as a whole.
#define DMA_TAG(xfer) ((xfer) + 1)
/// Tag mask with only the bit of DMA_TAG(xfer) set.
#define DMA_MASK(xfer) (1 << DMA_TAG(xfer))
#else
#ifdef WIN32
/// On Win32 the tag and mask helpers pass their argument through unchanged.
#define DMA_TAG(a) (a)
#define DMA_MASK(a) (a)
/// cellDmaLargeGet Win32 replacements for Cell DMA to allow simulating most of the SPU code (just memcpy)
/// Copies size bytes from the address ea to ls; returns 0.
int cellDmaLargeGet(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid);
/// cellDmaGet Win32 replacement, same behaviour as cellDmaLargeGet above.
int cellDmaGet(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid);
/// cellDmaLargePut Win32 replacements for Cell DMA to allow simulating most of the SPU code (just memcpy)
/// Copies size bytes from ls to the address ea; returns 0.
int cellDmaLargePut(const void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid);
/// cellDmaWaitTagStatusAll Win32 replacements for Cell DMA to allow simulating most of the SPU code (just memcpy)
/// The argument is ignored and the function does nothing.
void cellDmaWaitTagStatusAll(int ignore);
#elif defined(USE_LIBSPE2)
/// DMA tag for transfer index xfer (index + 1). The argument is parenthesised so
/// that expressions such as DMA_TAG(a | b) are evaluated as a whole.
#define DMA_TAG(xfer) ((xfer) + 1)
/// Tag mask with only the bit of DMA_TAG(xfer) set.
#define DMA_MASK(xfer) (1 << DMA_TAG(xfer))
#include <spu_mfcio.h>
//#define DEBUG_DMA
#ifdef DEBUG_DMA
/// Prints b with printf when condition a is false; b is used as the format string
/// and gets no further arguments. Wrapped in do/while(0) so that the macro is a
/// single statement and can be followed by `else` in an unbraced if.
/// NOTE(review): this header does not include <stdio.h> itself - confirm printf is
/// declared wherever DEBUG_DMA is enabled.
#define dUASSERT(a,b) do { if (!(a)) { printf(b); } } while (0)
/// Unsigned integer type the checked DMA macros cast addresses and sizes to.
#ifdef USE_ADDR64
#define uintsize unsigned long long
#else
#define uintsize unsigned int
#endif
/// DEBUG_DMA variant of cellDmaLargeGet. Checks alignment, size and null addresses of
/// the request; when a check fails it reports the failing rules through dUASSERT and
/// prints the call site with printf. mfc_get is issued afterwards in every case.
/// The whole expansion is wrapped in do/while(0) so that it is one statement: under an
/// unbraced `if`, both the check and the transfer are now governed by the condition.
/// The arguments are pasted unparenthesised and evaluated several times, so pass plain
/// variables without side effects.
#define cellDmaLargeGet(ls, ea, size, tag, tid, rid) do { \
	if ( (((uintsize)ls%16) != ((uintsize)ea%16)) || ((((uintsize)ea%16) || ((uintsize)ls%16)) && (( ((uintsize)ls%16) != ((uintsize)size%16) ) || ( ((uintsize)ea%16) != ((uintsize)size%16) ) ) ) || ( ((uintsize)size%16) && ((uintsize)size!=1) && ((uintsize)size!=2) && ((uintsize)size!=4) && ((uintsize)size!=8) ) || (size >= 16384) || !(uintsize)ls || !(uintsize)ea) { \
		dUASSERT( (((uintsize)ea % 16) == 0) || (size < 16), "XDR Address not aligned: "); \
		dUASSERT( (((uintsize)ls % 16) == 0) || (size < 16), "LS Address not aligned: "); \
		dUASSERT( ((((uintsize)ls % size) == 0) && (((uintsize)ea % size) == 0)) || (size > 16), "Not naturally aligned: "); \
		dUASSERT((size == 1) || (size == 2) || (size == 4) || (size == 8) || ((size % 16) == 0), "size not a multiple of 16byte: "); \
		dUASSERT(size < 16384, "size too big: "); \
		dUASSERT( ((uintsize)ea%16)==((uintsize)ls%16), "wrong Quadword alignment of LS and EA: "); \
		dUASSERT(ea != 0, "Nullpointer EA: "); dUASSERT(ls != 0, "Nullpointer LS: ");\
		printf("GET %s:%d from: 0x%x, to: 0x%x - %d bytes\n", __FILE__, __LINE__, (unsigned int)ea,(unsigned int)ls,(unsigned int)size);\
	} \
	mfc_get(ls, ea, size, tag, tid, rid); \
} while (0)
/// DEBUG_DMA variant of cellDmaGet. Checks alignment, size and null addresses of
/// the request; when a check fails it reports the failing rules through dUASSERT and
/// prints the call site with printf. mfc_get is issued afterwards in every case.
/// The whole expansion is wrapped in do/while(0) so that it is one statement: under an
/// unbraced `if`, both the check and the transfer are now governed by the condition.
/// The arguments are pasted unparenthesised and evaluated several times, so pass plain
/// variables without side effects.
#define cellDmaGet(ls, ea, size, tag, tid, rid) do { \
	if ( (((uintsize)ls%16) != ((uintsize)ea%16)) || ((((uintsize)ea%16) || ((uintsize)ls%16)) && (( ((uintsize)ls%16) != ((uintsize)size%16) ) || ( ((uintsize)ea%16) != ((uintsize)size%16) ) ) ) || ( ((uintsize)size%16) && ((uintsize)size!=1) && ((uintsize)size!=2) && ((uintsize)size!=4) && ((uintsize)size!=8) ) || (size >= 16384) || !(uintsize)ls || !(uintsize)ea) { \
		dUASSERT( (((uintsize)ea % 16) == 0) || (size < 16), "XDR Address not aligned: "); \
		dUASSERT( (((uintsize)ls % 16) == 0) || (size < 16), "LS Address not aligned: "); \
		dUASSERT( ((((uintsize)ls % size) == 0) && (((uintsize)ea % size) == 0)) || (size > 16), "Not naturally aligned: "); \
		dUASSERT((size == 1) || (size == 2) || (size == 4) || (size == 8) || ((size % 16) == 0), "size not a multiple of 16byte: "); \
		dUASSERT(size < 16384, "size too big: "); \
		dUASSERT( ((uintsize)ea%16)==((uintsize)ls%16), "wrong Quadword alignment of LS and EA: "); \
		dUASSERT(ea != 0, "Nullpointer EA: "); dUASSERT(ls != 0, "Nullpointer LS: ");\
		printf("GET %s:%d from: 0x%x, to: 0x%x - %d bytes\n", __FILE__, __LINE__, (unsigned int)ea,(unsigned int)ls,(unsigned int)size);\
	} \
	mfc_get(ls, ea, size, tag, tid, rid); \
} while (0)
/// DEBUG_DMA variant of cellDmaLargePut. Checks alignment, size and null addresses of
/// the request; when a check fails it reports the failing rules through dUASSERT and
/// prints the call site with printf. mfc_put is issued afterwards in every case.
/// The whole expansion is wrapped in do/while(0) so that it is one statement: under an
/// unbraced `if`, both the check and the transfer are now governed by the condition.
/// The arguments are pasted unparenthesised and evaluated several times, so pass plain
/// variables without side effects.
#define cellDmaLargePut(ls, ea, size, tag, tid, rid) do { \
	if ( (((uintsize)ls%16) != ((uintsize)ea%16)) || ((((uintsize)ea%16) || ((uintsize)ls%16)) && (( ((uintsize)ls%16) != ((uintsize)size%16) ) || ( ((uintsize)ea%16) != ((uintsize)size%16) ) ) ) || ( ((uintsize)size%16) && ((uintsize)size!=1) && ((uintsize)size!=2) && ((uintsize)size!=4) && ((uintsize)size!=8) ) || (size >= 16384) || !(uintsize)ls || !(uintsize)ea) { \
		dUASSERT( (((uintsize)ea % 16) == 0) || (size < 16), "XDR Address not aligned: "); \
		dUASSERT( (((uintsize)ls % 16) == 0) || (size < 16), "LS Address not aligned: "); \
		dUASSERT( ((((uintsize)ls % size) == 0) && (((uintsize)ea % size) == 0)) || (size > 16), "Not naturally aligned: "); \
		dUASSERT((size == 1) || (size == 2) || (size == 4) || (size == 8) || ((size % 16) == 0), "size not a multiple of 16byte: "); \
		dUASSERT(size < 16384, "size too big: "); \
		dUASSERT( ((uintsize)ea%16)==((uintsize)ls%16), "wrong Quadword alignment of LS and EA: "); \
		dUASSERT(ea != 0, "Nullpointer EA: "); dUASSERT(ls != 0, "Nullpointer LS: ");\
		printf("PUT %s:%d from: 0x%x, to: 0x%x - %d bytes\n", __FILE__, __LINE__, (unsigned int)ls,(unsigned int)ea,(unsigned int)size); \
	} \
	mfc_put(ls, ea, size, tag, tid, rid); \
} while (0)
/// DEBUG_DMA variant of cellDmaSmallGet. Checks alignment, size and null addresses of
/// the request; when a check fails it reports the failing rules through dUASSERT and
/// prints the call site with printf. mfc_get is issued afterwards in every case.
/// The whole expansion is wrapped in do/while(0) so that it is one statement: under an
/// unbraced `if`, both the check and the transfer are now governed by the condition.
/// The arguments are pasted unparenthesised and evaluated several times, so pass plain
/// variables without side effects.
#define cellDmaSmallGet(ls, ea, size, tag, tid, rid) do { \
	if ( (((uintsize)ls%16) != ((uintsize)ea%16)) || ((((uintsize)ea%16) || ((uintsize)ls%16)) && (( ((uintsize)ls%16) != ((uintsize)size%16) ) || ( ((uintsize)ea%16) != ((uintsize)size%16) ) ) ) || ( ((uintsize)size%16) && ((uintsize)size!=1) && ((uintsize)size!=2) && ((uintsize)size!=4) && ((uintsize)size!=8) ) || (size >= 16384) || !(uintsize)ls || !(uintsize)ea) { \
		dUASSERT( (((uintsize)ea % 16) == 0) || (size < 16), "XDR Address not aligned: "); \
		dUASSERT( (((uintsize)ls % 16) == 0) || (size < 16), "LS Address not aligned: "); \
		dUASSERT( ((((uintsize)ls % size) == 0) && (((uintsize)ea % size) == 0)) || (size > 16), "Not naturally aligned: "); \
		dUASSERT((size == 1) || (size == 2) || (size == 4) || (size == 8) || ((size % 16) == 0), "size not a multiple of 16byte: "); \
		dUASSERT(size < 16384, "size too big: "); \
		dUASSERT( ((uintsize)ea%16)==((uintsize)ls%16), "wrong Quadword alignment of LS and EA: "); \
		dUASSERT(ea != 0, "Nullpointer EA: "); dUASSERT(ls != 0, "Nullpointer LS: ");\
		printf("GET %s:%d from: 0x%x, to: 0x%x - %d bytes\n", __FILE__, __LINE__, (unsigned int)ea,(unsigned int)ls,(unsigned int)size);\
	} \
	mfc_get(ls, ea, size, tag, tid, rid); \
} while (0)
/// Writes the tag mask and then reads the tag status with mfc_read_tag_status_all()
/// (per spu_mfcio.h this waits until all selected tag groups have completed).
/// Wrapped in do/while(0) so that both calls form one statement; without it, an
/// unbraced `if` would guard only the mask write.
#define cellDmaWaitTagStatusAll(ignore) do { mfc_write_tag_mask(ignore); mfc_read_tag_status_all(); } while (0)
#else
/// Unchecked libspe2 mappings: the cellDma* get/put names forward directly to
/// mfc_get / mfc_put with unchanged arguments.
#define cellDmaLargeGet(ls, ea, size, tag, tid, rid) mfc_get(ls, ea, size, tag, tid, rid)
#define cellDmaGet(ls, ea, size, tag, tid, rid) mfc_get(ls, ea, size, tag, tid, rid)
#define cellDmaLargePut(ls, ea, size, tag, tid, rid) mfc_put(ls, ea, size, tag, tid, rid)
#define cellDmaSmallGet(ls, ea, size, tag, tid, rid) mfc_get(ls, ea, size, tag, tid, rid)
/// Writes the tag mask and then reads the tag status with mfc_read_tag_status_all()
/// (per spu_mfcio.h this waits until all selected tag groups have completed).
/// Wrapped in do/while(0) so that both calls form one statement; without it, an
/// unbraced `if` would guard only the mask write.
#define cellDmaWaitTagStatusAll(ignore) do { mfc_write_tag_mask(ignore); mfc_read_tag_status_all(); } while (0)
#endif // DEBUG_DMA
#endif // WIN32
#endif //__CELLOS_LV2__
///stallingUnalignedDmaSmallGet internally uses DMA_TAG(1)
///Fetches size bytes (size must be smaller than 16) from main memory address ea into ls,
///waiting for the transfer before returning. Returns 0.
int stallingUnalignedDmaSmallGet(void *ls, uint64_t ea, uint32_t size);
#endif //FAKE_DMA_H