bt -> b3 and BT -> B3 rename for content and filenames

This commit is contained in:
erwin coumans
2013-04-28 23:11:10 -07:00
parent 6bcb5b9d5f
commit 7366e262fd
178 changed files with 5218 additions and 5218 deletions

View File

@@ -20,12 +20,12 @@ subject to the following restrictions:
#define KERNEL2 "SubtractKernel"
#include "btBoundSearchCL.h"
#include "b3BoundSearchCL.h"
#include "../../basic_initialize/b3OpenCLUtils.h"
#include "btLauncherCL.h"
#include "b3LauncherCL.h"
#include "../kernels/BoundSearchKernelsCL.h"
btBoundSearchCL::btBoundSearchCL(cl_context ctx, cl_device_id device, cl_command_queue queue, int maxSize)
b3BoundSearchCL::b3BoundSearchCL(cl_context ctx, cl_device_id device, cl_command_queue queue, int maxSize)
:m_context(ctx),
m_device(device),
m_queue(queue)
@@ -38,31 +38,31 @@ btBoundSearchCL::btBoundSearchCL(cl_context ctx, cl_device_id device, cl_command
const char* kernelSource = boundSearchKernelsCL;
cl_program boundSearchProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, kernelSource, &pErrNum,additionalMacros, BOUNDSEARCH_PATH);
btAssert(boundSearchProg);
b3Assert(boundSearchProg);
m_lowerSortDataKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SearchSortDataLowerKernel", &pErrNum, boundSearchProg,additionalMacros );
btAssert(m_lowerSortDataKernel );
b3Assert(m_lowerSortDataKernel );
m_upperSortDataKernel= b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SearchSortDataUpperKernel", &pErrNum, boundSearchProg,additionalMacros );
btAssert(m_upperSortDataKernel);
b3Assert(m_upperSortDataKernel);
m_subtractKernel = 0;
if( maxSize )
{
m_subtractKernel= b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SubtractKernel", &pErrNum, boundSearchProg,additionalMacros );
btAssert(m_subtractKernel);
b3Assert(m_subtractKernel);
}
//m_constBuffer = new btOpenCLArray<btInt4>( device, 1, BufferBase::BUFFER_CONST );
//m_constBuffer = new b3OpenCLArray<b3Int4>( device, 1, BufferBase::BUFFER_CONST );
m_lower = (maxSize == 0)? 0: new btOpenCLArray<unsigned int>(ctx,queue,maxSize );
m_upper = (maxSize == 0)? 0: new btOpenCLArray<unsigned int>(ctx,queue, maxSize );
m_lower = (maxSize == 0)? 0: new b3OpenCLArray<unsigned int>(ctx,queue,maxSize );
m_upper = (maxSize == 0)? 0: new b3OpenCLArray<unsigned int>(ctx,queue, maxSize );
m_filler = new btFillCL(ctx,device,queue);
m_filler = new b3FillCL(ctx,device,queue);
}
btBoundSearchCL::~btBoundSearchCL()
b3BoundSearchCL::~b3BoundSearchCL()
{
delete m_lower;
@@ -77,18 +77,18 @@ btBoundSearchCL::~btBoundSearchCL()
}
void btBoundSearchCL::execute(btOpenCLArray<btSortData>& src, int nSrc, btOpenCLArray<unsigned int>& dst, int nDst, Option option )
void b3BoundSearchCL::execute(b3OpenCLArray<b3SortData>& src, int nSrc, b3OpenCLArray<unsigned int>& dst, int nDst, Option option )
{
btInt4 constBuffer;
b3Int4 constBuffer;
constBuffer.x = nSrc;
constBuffer.y = nDst;
if( option == BOUND_LOWER )
{
btBufferInfoCL bInfo[] = { btBufferInfoCL( src.getBufferCL(), true ), btBufferInfoCL( dst.getBufferCL()) };
b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src.getBufferCL(), true ), b3BufferInfoCL( dst.getBufferCL()) };
btLauncherCL launcher( m_queue, m_lowerSortDataKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
b3LauncherCL launcher( m_queue, m_lowerSortDataKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst( nSrc );
launcher.setConst( nDst );
@@ -96,10 +96,10 @@ void btBoundSearchCL::execute(btOpenCLArray<btSortData>& src, int nSrc, btOpenCL
}
else if( option == BOUND_UPPER )
{
btBufferInfoCL bInfo[] = { btBufferInfoCL( src.getBufferCL(), true ), btBufferInfoCL( dst.getBufferCL() ) };
b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src.getBufferCL(), true ), b3BufferInfoCL( dst.getBufferCL() ) };
btLauncherCL launcher(m_queue, m_upperSortDataKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
b3LauncherCL launcher(m_queue, m_upperSortDataKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst( nSrc );
launcher.setConst( nDst );
@@ -107,10 +107,10 @@ void btBoundSearchCL::execute(btOpenCLArray<btSortData>& src, int nSrc, btOpenCL
}
else if( option == COUNT )
{
btAssert( m_lower );
btAssert( m_upper );
btAssert( m_lower->capacity() <= (int)nDst );
btAssert( m_upper->capacity() <= (int)nDst );
b3Assert( m_lower );
b3Assert( m_upper );
b3Assert( m_lower->capacity() <= (int)nDst );
b3Assert( m_upper->capacity() <= (int)nDst );
int zero = 0;
m_filler->execute( *m_lower, zero, nDst );
@@ -120,10 +120,10 @@ void btBoundSearchCL::execute(btOpenCLArray<btSortData>& src, int nSrc, btOpenCL
execute( src, nSrc, *m_upper, nDst, BOUND_UPPER );
{
btBufferInfoCL bInfo[] = { btBufferInfoCL( m_upper->getBufferCL(), true ), btBufferInfoCL( m_lower->getBufferCL(), true ), btBufferInfoCL( dst.getBufferCL() ) };
b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_upper->getBufferCL(), true ), b3BufferInfoCL( m_lower->getBufferCL(), true ), b3BufferInfoCL( dst.getBufferCL() ) };
btLauncherCL launcher( m_queue, m_subtractKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
b3LauncherCL launcher( m_queue, m_subtractKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst( nSrc );
launcher.setConst( nDst );
@@ -132,21 +132,21 @@ void btBoundSearchCL::execute(btOpenCLArray<btSortData>& src, int nSrc, btOpenCL
}
else
{
btAssert( 0 );
b3Assert( 0 );
}
}
void btBoundSearchCL::executeHost( b3AlignedObjectArray<btSortData>& src, int nSrc,
void b3BoundSearchCL::executeHost( b3AlignedObjectArray<b3SortData>& src, int nSrc,
b3AlignedObjectArray<unsigned int>& dst, int nDst, Option option )
{
for(int i=0; i<nSrc-1; i++)
btAssert( src[i].m_key <= src[i+1].m_key );
b3Assert( src[i].m_key <= src[i+1].m_key );
btSortData minData,zeroData,maxData;
b3SortData minData,zeroData,maxData;
minData.m_key = -1;
minData.m_value = -1;
zeroData.m_key=0;
@@ -158,8 +158,8 @@ void btBoundSearchCL::executeHost( b3AlignedObjectArray<btSortData>& src, int nS
{
for(int i=0; i<nSrc; i++)
{
btSortData& iData = (i==0)? minData: src[i-1];
btSortData& jData = (i==nSrc)? maxData: src[i];
b3SortData& iData = (i==0)? minData: src[i-1];
b3SortData& jData = (i==nSrc)? maxData: src[i];
if( iData.m_key != jData.m_key )
{
@@ -174,8 +174,8 @@ void btBoundSearchCL::executeHost( b3AlignedObjectArray<btSortData>& src, int nS
{
for(int i=1; i<nSrc+1; i++)
{
btSortData& iData = src[i-1];
btSortData& jData = (i==nSrc)? maxData: src[i];
b3SortData& iData = src[i-1];
b3SortData& jData = (i==nSrc)? maxData: src[i];
if( iData.m_key != jData.m_key )
{
@@ -208,6 +208,6 @@ void btBoundSearchCL::executeHost( b3AlignedObjectArray<btSortData>& src, int nS
}
else
{
btAssert( 0 );
b3Assert( 0 );
}
}

View File

@@ -13,8 +13,8 @@ subject to the following restrictions:
*/
//Originally written by Takahiro Harada
#ifndef BT_BOUNDSEARCH_H
#define BT_BOUNDSEARCH_H
#ifndef B3_BOUNDSEARCH_H
#define B3_BOUNDSEARCH_H
#pragma once
@@ -24,10 +24,10 @@ subject to the following restrictions:
#include <AdlPrimitives/Fill/Fill.h>
*/
#include "btOpenCLArray.h"
#include "btFillCL.h"
#include "btRadixSort32CL.h" //for btSortData (perhaps move it?)
class btBoundSearchCL
#include "b3OpenCLArray.h"
#include "b3FillCL.h"
#include "b3RadixSort32CL.h" //for b3SortData (perhaps move it?)
class b3BoundSearchCL
{
public:
@@ -47,21 +47,21 @@ class btBoundSearchCL
cl_kernel m_upperSortDataKernel;
cl_kernel m_subtractKernel;
btOpenCLArray<btInt4>* m_constbtOpenCLArray;
btOpenCLArray<unsigned int>* m_lower;
btOpenCLArray<unsigned int>* m_upper;
b3OpenCLArray<b3Int4>* m_constbtOpenCLArray;
b3OpenCLArray<unsigned int>* m_lower;
b3OpenCLArray<unsigned int>* m_upper;
btFillCL* m_filler;
b3FillCL* m_filler;
btBoundSearchCL(cl_context context, cl_device_id device, cl_command_queue queue, int size);
b3BoundSearchCL(cl_context context, cl_device_id device, cl_command_queue queue, int size);
virtual ~btBoundSearchCL();
virtual ~b3BoundSearchCL();
// src has to be src[i].m_key <= src[i+1].m_key
void execute( btOpenCLArray<btSortData>& src, int nSrc, btOpenCLArray<unsigned int>& dst, int nDst, Option option = BOUND_LOWER );
void execute( b3OpenCLArray<b3SortData>& src, int nSrc, b3OpenCLArray<unsigned int>& dst, int nDst, Option option = BOUND_LOWER );
void executeHost( b3AlignedObjectArray<btSortData>& src, int nSrc, b3AlignedObjectArray<unsigned int>& dst, int nDst, Option option = BOUND_LOWER);
void executeHost( b3AlignedObjectArray<b3SortData>& src, int nSrc, b3AlignedObjectArray<unsigned int>& dst, int nDst, Option option = BOUND_LOWER);
};
#endif //BT_BOUNDSEARCH_H
#endif //B3_BOUNDSEARCH_H

View File

@@ -0,0 +1,19 @@
#ifndef B3_BUFFER_INFO_CL_H
#define B3_BUFFER_INFO_CL_H
#include "b3OpenCLArray.h"
struct b3BufferInfoCL
{
//b3BufferInfoCL(){}
// template<typename T>
b3BufferInfoCL(cl_mem buff, bool isReadOnly = false): m_clBuffer(buff), m_isReadOnly(isReadOnly){}
cl_mem m_clBuffer;
bool m_isReadOnly;
};
#endif //B3_BUFFER_INFO_CL_H

View File

@@ -1,13 +1,13 @@
#include "btFillCL.h"
#include "b3FillCL.h"
#include "../../basic_initialize/b3OpenCLUtils.h"
#include "btBufferInfoCL.h"
#include "btLauncherCL.h"
#include "b3BufferInfoCL.h"
#include "b3LauncherCL.h"
#define FILL_CL_PROGRAM_PATH "opencl/parallel_primitives/kernels/FillKernels.cl"
#include "../kernels/FillKernelsCL.h"
btFillCL::btFillCL(cl_context ctx, cl_device_id device, cl_command_queue queue)
b3FillCL::b3FillCL(cl_context ctx, cl_device_id device, cl_command_queue queue)
:m_commandQueue(queue)
{
const char* kernelSource = fillKernelsCL;
@@ -15,25 +15,25 @@ btFillCL::btFillCL(cl_context ctx, cl_device_id device, cl_command_queue queue)
const char* additionalMacros = "";
cl_program fillProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, kernelSource, &pErrNum,additionalMacros, FILL_CL_PROGRAM_PATH);
btAssert(fillProg);
b3Assert(fillProg);
m_fillIntKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillIntKernel", &pErrNum, fillProg,additionalMacros );
btAssert(m_fillIntKernel);
b3Assert(m_fillIntKernel);
m_fillUnsignedIntKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillUnsignedIntKernel", &pErrNum, fillProg,additionalMacros );
btAssert(m_fillIntKernel);
b3Assert(m_fillIntKernel);
m_fillFloatKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillFloatKernel", &pErrNum, fillProg,additionalMacros );
btAssert(m_fillFloatKernel);
b3Assert(m_fillFloatKernel);
m_fillKernelInt2 = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillInt2Kernel", &pErrNum, fillProg,additionalMacros );
btAssert(m_fillKernelInt2);
b3Assert(m_fillKernelInt2);
}
btFillCL::~btFillCL()
b3FillCL::~b3FillCL()
{
clReleaseKernel(m_fillKernelInt2);
clReleaseKernel(m_fillIntKernel);
@@ -42,12 +42,12 @@ btFillCL::~btFillCL()
}
void btFillCL::execute(btOpenCLArray<float>& src, const float value, int n, int offset)
void b3FillCL::execute(b3OpenCLArray<float>& src, const float value, int n, int offset)
{
btAssert( n>0 );
b3Assert( n>0 );
{
btLauncherCL launcher( m_commandQueue, m_fillFloatKernel );
b3LauncherCL launcher( m_commandQueue, m_fillFloatKernel );
launcher.setBuffer( src.getBufferCL());
launcher.setConst( n );
launcher.setConst( value );
@@ -57,13 +57,13 @@ void btFillCL::execute(btOpenCLArray<float>& src, const float value, int n, int
}
}
void btFillCL::execute(btOpenCLArray<int>& src, const int value, int n, int offset)
void b3FillCL::execute(b3OpenCLArray<int>& src, const int value, int n, int offset)
{
btAssert( n>0 );
b3Assert( n>0 );
{
btLauncherCL launcher( m_commandQueue, m_fillIntKernel );
b3LauncherCL launcher( m_commandQueue, m_fillIntKernel );
launcher.setBuffer(src.getBufferCL());
launcher.setConst( n);
launcher.setConst( value);
@@ -73,15 +73,15 @@ void btFillCL::execute(btOpenCLArray<int>& src, const int value, int n, int offs
}
void btFillCL::execute(btOpenCLArray<unsigned int>& src, const unsigned int value, int n, int offset)
void b3FillCL::execute(b3OpenCLArray<unsigned int>& src, const unsigned int value, int n, int offset)
{
btAssert( n>0 );
b3Assert( n>0 );
{
btBufferInfoCL bInfo[] = { btBufferInfoCL( src.getBufferCL() ) };
b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src.getBufferCL() ) };
btLauncherCL launcher( m_commandQueue, m_fillUnsignedIntKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
b3LauncherCL launcher( m_commandQueue, m_fillUnsignedIntKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst( n );
launcher.setConst(value);
launcher.setConst(offset);
@@ -90,7 +90,7 @@ void btFillCL::execute(btOpenCLArray<unsigned int>& src, const unsigned int valu
}
}
void btFillCL::executeHost(b3AlignedObjectArray<btInt2> &src, const btInt2 &value, int n, int offset)
void b3FillCL::executeHost(b3AlignedObjectArray<b3Int2> &src, const b3Int2 &value, int n, int offset)
{
for (int i=0;i<n;i++)
{
@@ -98,7 +98,7 @@ void btFillCL::executeHost(b3AlignedObjectArray<btInt2> &src, const btInt2 &valu
}
}
void btFillCL::executeHost(b3AlignedObjectArray<int> &src, const int value, int n, int offset)
void b3FillCL::executeHost(b3AlignedObjectArray<int> &src, const int value, int n, int offset)
{
for (int i=0;i<n;i++)
{
@@ -106,16 +106,16 @@ void btFillCL::executeHost(b3AlignedObjectArray<int> &src, const int value, int
}
}
void btFillCL::execute(btOpenCLArray<btInt2> &src, const btInt2 &value, int n, int offset)
void b3FillCL::execute(b3OpenCLArray<b3Int2> &src, const b3Int2 &value, int n, int offset)
{
btAssert( n>0 );
b3Assert( n>0 );
{
btBufferInfoCL bInfo[] = { btBufferInfoCL( src.getBufferCL() ) };
b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src.getBufferCL() ) };
btLauncherCL launcher(m_commandQueue, m_fillKernelInt2);
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
b3LauncherCL launcher(m_commandQueue, m_fillKernelInt2);
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst(n);
launcher.setConst(value);
launcher.setConst(offset);

View File

@@ -0,0 +1,63 @@
#ifndef B3_FILL_CL_H
#define B3_FILL_CL_H
#include "b3OpenCLArray.h"
#include "Bullet3Common/b3Scalar.h"
#include "b3Int2.h"
#include "b3Int4.h"
class b3FillCL
{
cl_command_queue m_commandQueue;
cl_kernel m_fillKernelInt2;
cl_kernel m_fillIntKernel;
cl_kernel m_fillUnsignedIntKernel;
cl_kernel m_fillFloatKernel;
public:
struct b3ConstData
{
union
{
b3Int4 m_data;
b3UnsignedInt4 m_UnsignedData;
};
int m_offset;
int m_n;
int m_padding[2];
};
protected:
public:
b3FillCL(cl_context ctx, cl_device_id device, cl_command_queue queue);
virtual ~b3FillCL();
void execute(b3OpenCLArray<unsigned int>& src, const unsigned int value, int n, int offset = 0);
void execute(b3OpenCLArray<int>& src, const int value, int n, int offset = 0);
void execute(b3OpenCLArray<float>& src, const float value, int n, int offset = 0);
void execute(b3OpenCLArray<b3Int2>& src, const b3Int2& value, int n, int offset = 0);
void executeHost(b3AlignedObjectArray<b3Int2> &src, const b3Int2 &value, int n, int offset);
void executeHost(b3AlignedObjectArray<int> &src, const int value, int n, int offset);
// void execute(b3OpenCLArray<b3Int4>& src, const b3Int4& value, int n, int offset = 0);
};
#endif //B3_FILL_CL_H

View File

@@ -1,7 +1,7 @@
#ifndef BT_INT2_H
#define BT_INT2_H
#ifndef B3_INT2_H
#define B3_INT2_H
struct btUnsignedInt2
struct b3UnsignedInt2
{
union
{
@@ -16,7 +16,7 @@ struct btUnsignedInt2
};
};
struct btInt2
struct b3Int2
{
union
{

View File

@@ -1,11 +1,11 @@
#ifndef BT_INT4_H
#define BT_INT4_H
#ifndef B3_INT4_H
#define B3_INT4_H
#include "Bullet3Common/b3Scalar.h"
ATTRIBUTE_ALIGNED16(struct) btUnsignedInt4
ATTRIBUTE_ALIGNED16(struct) b3UnsignedInt4
{
BT_DECLARE_ALIGNED_ALLOCATOR();
B3_DECLARE_ALIGNED_ALLOCATOR();
union
{
@@ -20,9 +20,9 @@ ATTRIBUTE_ALIGNED16(struct) btUnsignedInt4
};
};
ATTRIBUTE_ALIGNED16(struct) btInt4
ATTRIBUTE_ALIGNED16(struct) b3Int4
{
BT_DECLARE_ALIGNED_ALLOCATOR();
B3_DECLARE_ALIGNED_ALLOCATOR();
union
{
@@ -37,19 +37,19 @@ ATTRIBUTE_ALIGNED16(struct) btInt4
};
};
SIMD_FORCE_INLINE btInt4 btMakeInt4(int x, int y, int z, int w = 0)
SIMD_FORCE_INLINE b3Int4 b3MakeInt4(int x, int y, int z, int w = 0)
{
btInt4 v;
b3Int4 v;
v.s[0] = x; v.s[1] = y; v.s[2] = z; v.s[3] = w;
return v;
}
SIMD_FORCE_INLINE btUnsignedInt4 btMakeUnsignedInt4(unsigned int x, unsigned int y, unsigned int z, unsigned int w = 0)
SIMD_FORCE_INLINE b3UnsignedInt4 b3MakeUnsignedInt4(unsigned int x, unsigned int y, unsigned int z, unsigned int w = 0)
{
btUnsignedInt4 v;
b3UnsignedInt4 v;
v.s[0] = x; v.s[1] = y; v.s[2] = z; v.s[3] = w;
return v;
}
#endif //BT_INT4_H
#endif //B3_INT4_H

View File

@@ -1,17 +1,17 @@
#ifndef BT_LAUNCHER_CL_H
#define BT_LAUNCHER_CL_H
#ifndef B3_LAUNCHER_CL_H
#define B3_LAUNCHER_CL_H
#include "btBufferInfoCL.h"
#include "b3BufferInfoCL.h"
#include "Bullet3Common/b3MinMax.h"
#include "btOpenCLArray.h"
#include "b3OpenCLArray.h"
#include <stdio.h>
#ifdef _WIN32
#pragma warning(disable :4996)
#endif
#define BT_CL_MAX_ARG_SIZE 16
struct btKernelArgData
#define B3_CL_MAX_ARG_SIZE 16
struct b3KernelArgData
{
int m_isBuffer;
int m_argIndex;
@@ -19,28 +19,28 @@ struct btKernelArgData
union
{
cl_mem m_clBuffer;
unsigned char m_argData[BT_CL_MAX_ARG_SIZE];
unsigned char m_argData[B3_CL_MAX_ARG_SIZE];
};
};
class btLauncherCL
class b3LauncherCL
{
cl_command_queue m_commandQueue;
cl_kernel m_kernel;
int m_idx;
b3AlignedObjectArray<btKernelArgData> m_kernelArguments;
b3AlignedObjectArray<b3KernelArgData> m_kernelArguments;
int m_serializationSizeInBytes;
public:
b3AlignedObjectArray<btOpenCLArray<unsigned char>* > m_arrays;
b3AlignedObjectArray<b3OpenCLArray<unsigned char>* > m_arrays;
btLauncherCL(cl_command_queue queue, cl_kernel kernel)
b3LauncherCL(cl_command_queue queue, cl_kernel kernel)
:m_commandQueue(queue),
m_kernel(kernel),
m_idx(0)
@@ -48,7 +48,7 @@ class btLauncherCL
m_serializationSizeInBytes = sizeof(int);
}
virtual ~btLauncherCL()
virtual ~b3LauncherCL()
{
for (int i=0;i<m_arrays.size();i++)
{
@@ -59,7 +59,7 @@ class btLauncherCL
inline void setBuffer( cl_mem clBuffer)
{
btKernelArgData kernelArg;
b3KernelArgData kernelArg;
kernelArg.m_argIndex = m_idx;
kernelArg.m_isBuffer = 1;
kernelArg.m_clBuffer = clBuffer;
@@ -75,23 +75,23 @@ class btLauncherCL
&param_value,
&actualSizeInBytes);
btAssert( err == CL_SUCCESS );
b3Assert( err == CL_SUCCESS );
kernelArg.m_argSizeInBytes = param_value;
m_kernelArguments.push_back(kernelArg);
m_serializationSizeInBytes+= sizeof(btKernelArgData);
m_serializationSizeInBytes+= sizeof(b3KernelArgData);
m_serializationSizeInBytes+=param_value;
cl_int status = clSetKernelArg( m_kernel, m_idx++, sizeof(cl_mem), &clBuffer);
btAssert( status == CL_SUCCESS );
b3Assert( status == CL_SUCCESS );
}
inline void setBuffers( btBufferInfoCL* buffInfo, int n )
inline void setBuffers( b3BufferInfoCL* buffInfo, int n )
{
for(int i=0; i<n; i++)
{
btKernelArgData kernelArg;
b3KernelArgData kernelArg;
kernelArg.m_argIndex = m_idx;
kernelArg.m_isBuffer = 1;
kernelArg.m_clBuffer = buffInfo[i].m_clBuffer;
@@ -107,15 +107,15 @@ class btLauncherCL
&param_value,
&actualSizeInBytes);
btAssert( err == CL_SUCCESS );
b3Assert( err == CL_SUCCESS );
kernelArg.m_argSizeInBytes = param_value;
m_kernelArguments.push_back(kernelArg);
m_serializationSizeInBytes+= sizeof(btKernelArgData);
m_serializationSizeInBytes+= sizeof(b3KernelArgData);
m_serializationSizeInBytes+=param_value;
cl_int status = clSetKernelArg( m_kernel, m_idx++, sizeof(cl_mem), &buffInfo[i].m_clBuffer);
btAssert( status == CL_SUCCESS );
b3Assert( status == CL_SUCCESS );
}
}
@@ -133,12 +133,12 @@ class btLauncherCL
for (int i=0;i<numArguments;i++)
{
btKernelArgData* arg = (btKernelArgData*)&buf[index];
b3KernelArgData* arg = (b3KernelArgData*)&buf[index];
index+=sizeof(btKernelArgData);
index+=sizeof(b3KernelArgData);
if (arg->m_isBuffer)
{
btOpenCLArray<unsigned char>* clData = new btOpenCLArray<unsigned char>(ctx,m_commandQueue, arg->m_argSizeInBytes);
b3OpenCLArray<unsigned char>* clData = new b3OpenCLArray<unsigned char>(ctx,m_commandQueue, arg->m_argSizeInBytes);
clData->resize(arg->m_argSizeInBytes);
clData->copyFromHostPointer(&buf[index], arg->m_argSizeInBytes);
@@ -148,12 +148,12 @@ class btLauncherCL
m_arrays.push_back(clData);
cl_int status = clSetKernelArg( m_kernel, m_idx++, sizeof(cl_mem), &arg->m_clBuffer);
btAssert( status == CL_SUCCESS );
b3Assert( status == CL_SUCCESS );
index+=arg->m_argSizeInBytes;
} else
{
cl_int status = clSetKernelArg( m_kernel, m_idx++, arg->m_argSizeInBytes, &arg->m_argData);
btAssert( status == CL_SUCCESS );
b3Assert( status == CL_SUCCESS );
}
m_kernelArguments.push_back(*arg);
}
@@ -176,7 +176,7 @@ class btLauncherCL
for (int ii=0;ii<numArguments;ii++)
{
btKernelArgData* argGold = (btKernelArgData*)&goldBuffer[index];
b3KernelArgData* argGold = (b3KernelArgData*)&goldBuffer[index];
if (m_kernelArguments[ii].m_argSizeInBytes != argGold->m_argSizeInBytes)
{
@@ -194,7 +194,7 @@ class btLauncherCL
return -3;
}
}
index+=sizeof(btKernelArgData);
index+=sizeof(b3KernelArgData);
if (argGold->m_isBuffer)
{
@@ -209,7 +209,7 @@ class btLauncherCL
cl_int status = 0;
status = clEnqueueReadBuffer( m_commandQueue, m_kernelArguments[ii].m_clBuffer, CL_TRUE, 0, m_kernelArguments[ii].m_argSizeInBytes,
memBuf, 0,0,0 );
btAssert( status==CL_SUCCESS );
b3Assert( status==CL_SUCCESS );
clFinish(m_commandQueue);
for (int b=0;b<m_kernelArguments[ii].m_argSizeInBytes;b++)
@@ -256,7 +256,7 @@ class btLauncherCL
assert(destBufferCapacity>=m_serializationSizeInBytes);
//todo: use the btSerializer for this to allow for 32/64bit, endianness etc
//todo: use the b3Serializer for this to allow for 32/64bit, endianness etc
int numArguments = m_kernelArguments.size();
int curBufferSize = 0;
int* dest = (int*)&destBuffer[curBufferSize];
@@ -267,16 +267,16 @@ class btLauncherCL
for (int i=0;i<this->m_kernelArguments.size();i++)
{
btKernelArgData* arg = (btKernelArgData*) &destBuffer[curBufferSize];
b3KernelArgData* arg = (b3KernelArgData*) &destBuffer[curBufferSize];
*arg = m_kernelArguments[i];
curBufferSize+=sizeof(btKernelArgData);
curBufferSize+=sizeof(b3KernelArgData);
if (arg->m_isBuffer==1)
{
//copy the OpenCL buffer content
cl_int status = 0;
status = clEnqueueReadBuffer( m_commandQueue, arg->m_clBuffer, 0, 0, arg->m_argSizeInBytes,
&destBuffer[curBufferSize], 0,0,0 );
btAssert( status==CL_SUCCESS );
b3Assert( status==CL_SUCCESS );
clFinish(m_commandQueue);
curBufferSize+=arg->m_argSizeInBytes;
}
@@ -317,18 +317,18 @@ class btLauncherCL
inline void setConst( const T& consts )
{
int sz=sizeof(T);
btAssert(sz<=BT_CL_MAX_ARG_SIZE);
btKernelArgData kernelArg;
b3Assert(sz<=B3_CL_MAX_ARG_SIZE);
b3KernelArgData kernelArg;
kernelArg.m_argIndex = m_idx;
kernelArg.m_isBuffer = 0;
T* destArg = (T*)kernelArg.m_argData;
*destArg = consts;
kernelArg.m_argSizeInBytes = sizeof(T);
m_kernelArguments.push_back(kernelArg);
m_serializationSizeInBytes+=sizeof(btKernelArgData);
m_serializationSizeInBytes+=sizeof(b3KernelArgData);
cl_int status = clSetKernelArg( m_kernel, m_idx++, sz, &consts );
btAssert( status == CL_SUCCESS );
b3Assert( status == CL_SUCCESS );
}
inline void launch1D( int numThreads, int localSize = 64)
@@ -342,9 +342,9 @@ class btLauncherCL
size_t lRange[3] = {1,1,1};
lRange[0] = localSizeX;
lRange[1] = localSizeY;
gRange[0] = btMax((size_t)1, (numThreadsX/lRange[0])+(!(numThreadsX%lRange[0])?0:1));
gRange[0] = b3Max((size_t)1, (numThreadsX/lRange[0])+(!(numThreadsX%lRange[0])?0:1));
gRange[0] *= lRange[0];
gRange[1] = btMax((size_t)1, (numThreadsY/lRange[1])+(!(numThreadsY%lRange[1])?0:1));
gRange[1] = b3Max((size_t)1, (numThreadsY/lRange[1])+(!(numThreadsY%lRange[1])?0:1));
gRange[1] *= lRange[1];
cl_int status = clEnqueueNDRangeKernel( m_commandQueue,
@@ -353,11 +353,11 @@ class btLauncherCL
{
printf("Error: OpenCL status = %d\n",status);
}
btAssert( status == CL_SUCCESS );
b3Assert( status == CL_SUCCESS );
}
};
#endif //BT_LAUNCHER_CL_H
#endif //B3_LAUNCHER_CL_H

View File

@@ -1,11 +1,11 @@
#ifndef BT_OPENCL_ARRAY_H
#define BT_OPENCL_ARRAY_H
#ifndef B3_OPENCL_ARRAY_H
#define B3_OPENCL_ARRAY_H
#include "Bullet3Common/b3AlignedObjectArray.h"
#include "../../basic_initialize/b3OpenCLInclude.h"
template <typename T>
class btOpenCLArray
class b3OpenCLArray
{
int m_size;
int m_capacity;
@@ -28,7 +28,7 @@ class btOpenCLArray
m_capacity=0;
}
btOpenCLArray<T>& operator=(const btOpenCLArray<T>& src);
b3OpenCLArray<T>& operator=(const b3OpenCLArray<T>& src);
SIMD_FORCE_INLINE int allocSize(int size)
{
@@ -37,7 +37,7 @@ class btOpenCLArray
public:
btOpenCLArray(cl_context ctx, cl_command_queue queue, int initialCapacity=0, bool allowGrowingCapacity=true)
b3OpenCLArray(cl_context ctx, cl_command_queue queue, int initialCapacity=0, bool allowGrowingCapacity=true)
:m_size(0), m_capacity(0),m_clBuffer(0),
m_clContext(ctx),m_commandQueue(queue),
m_ownsMemory(true),m_allowGrowingCapacity(true)
@@ -61,7 +61,7 @@ public:
}
// we could enable this assignment, but need to make sure to avoid accidental deep copies
// btOpenCLArray<T>& operator=(const b3AlignedObjectArray<T>& src)
// b3OpenCLArray<T>& operator=(const b3AlignedObjectArray<T>& src)
// {
// copyFromArray(src);
// return *this;
@@ -74,7 +74,7 @@ public:
}
virtual ~btOpenCLArray()
virtual ~b3OpenCLArray()
{
deallocate();
m_size=0;
@@ -94,8 +94,8 @@ public:
SIMD_FORCE_INLINE T forcedAt(int n) const
{
btAssert(n>=0);
btAssert(n<capacity());
b3Assert(n>=0);
b3Assert(n<capacity());
T elem;
copyToHostPointer(&elem,1,n,true);
return elem;
@@ -103,8 +103,8 @@ public:
SIMD_FORCE_INLINE T at(int n) const
{
btAssert(n>=0);
btAssert(n<size());
b3Assert(n>=0);
b3Assert(n<size());
T elem;
copyToHostPointer(&elem,1,n,true);
return elem;
@@ -152,18 +152,18 @@ public:
//create a new OpenCL buffer
int memSizeInBytes = sizeof(T)*_Count;
cl_mem buf = clCreateBuffer(m_clContext, CL_MEM_READ_WRITE, memSizeInBytes, NULL, &ciErrNum);
btAssert(ciErrNum==CL_SUCCESS);
b3Assert(ciErrNum==CL_SUCCESS);
//#define BT_ALWAYS_INITIALIZE_OPENCL_BUFFERS
#ifdef BT_ALWAYS_INITIALIZE_OPENCL_BUFFERS
//#define B3_ALWAYS_INITIALIZE_OPENCL_BUFFERS
#ifdef B3_ALWAYS_INITIALIZE_OPENCL_BUFFERS
unsigned char* src = (unsigned char*)malloc(memSizeInBytes);
for (int i=0;i<memSizeInBytes;i++)
src[i] = 0xbb;
ciErrNum = clEnqueueWriteBuffer( m_commandQueue, buf, CL_TRUE, 0, memSizeInBytes, src, 0,0,0 );
btAssert(ciErrNum==CL_SUCCESS);
b3Assert(ciErrNum==CL_SUCCESS);
clFinish(m_commandQueue);
free(src);
#endif //BT_ALWAYS_INITIALIZE_OPENCL_BUFFERS
#endif //B3_ALWAYS_INITIALIZE_OPENCL_BUFFERS
if (copyOldContents)
copyToCL(buf, size());
@@ -177,7 +177,7 @@ public:
} else
{
//fail: assert and
btAssert(0);
b3Assert(0);
deallocate();
}
}
@@ -189,19 +189,19 @@ public:
if (numElements<=0)
return;
btAssert(m_clBuffer);
btAssert(destination);
b3Assert(m_clBuffer);
b3Assert(destination);
//likely some error, destination is same as source
btAssert(m_clBuffer != destination);
b3Assert(m_clBuffer != destination);
btAssert((firstElem+numElements)<=m_size);
b3Assert((firstElem+numElements)<=m_size);
cl_int status = 0;
btAssert(numElements>0);
btAssert(numElements<=m_size);
b3Assert(numElements>0);
b3Assert(numElements<=m_size);
int srcOffsetBytes = sizeof(T)*firstElem;
int dstOffsetInBytes = sizeof(T)*dstOffsetInElems;
@@ -209,7 +209,7 @@ public:
status = clEnqueueCopyBuffer( m_commandQueue, m_clBuffer, destination,
srcOffsetBytes, dstOffsetInBytes, sizeof(T)*numElements, 0, 0, 0 );
btAssert( status == CL_SUCCESS );
b3Assert( status == CL_SUCCESS );
}
void copyFromHost(const b3AlignedObjectArray<T>& srcArray, bool waitForCompletion=true)
@@ -225,13 +225,13 @@ public:
void copyFromHostPointer(const T* src, int numElems, int destFirstElem= 0, bool waitForCompletion=true)
{
btAssert(numElems+destFirstElem <= capacity());
b3Assert(numElems+destFirstElem <= capacity());
cl_int status = 0;
int sizeInBytes=sizeof(T)*numElems;
status = clEnqueueWriteBuffer( m_commandQueue, m_clBuffer, 0, sizeof(T)*destFirstElem, sizeInBytes,
src, 0,0,0 );
btAssert(status == CL_SUCCESS );
b3Assert(status == CL_SUCCESS );
if (waitForCompletion)
clFinish(m_commandQueue);
@@ -247,18 +247,18 @@ public:
void copyToHostPointer(T* destPtr, int numElem, int srcFirstElem=0, bool waitForCompletion=true) const
{
btAssert(numElem+srcFirstElem <= capacity());
b3Assert(numElem+srcFirstElem <= capacity());
cl_int status = 0;
status = clEnqueueReadBuffer( m_commandQueue, m_clBuffer, 0, sizeof(T)*srcFirstElem, sizeof(T)*numElem,
destPtr, 0,0,0 );
btAssert( status==CL_SUCCESS );
b3Assert( status==CL_SUCCESS );
if (waitForCompletion)
clFinish(m_commandQueue);
}
void copyFromOpenCLArray(const btOpenCLArray& src)
void copyFromOpenCLArray(const b3OpenCLArray& src)
{
int newSize = src.size();
resize(newSize);
@@ -271,4 +271,4 @@ public:
};
#endif //BT_OPENCL_ARRAY_H
#endif //B3_OPENCL_ARRAY_H

View File

@@ -1,32 +1,32 @@
#include "btPrefixScanCL.h"
#include "btFillCL.h"
#define BT_PREFIXSCAN_PROG_PATH "opencl/parallel_primitives/kernels/PrefixScanKernels.cl"
#include "b3PrefixScanCL.h"
#include "b3FillCL.h"
#define B3_PREFIXSCAN_PROG_PATH "opencl/parallel_primitives/kernels/PrefixScanKernels.cl"
#include "btLauncherCL.h"
#include "b3LauncherCL.h"
#include "../../basic_initialize/b3OpenCLUtils.h"
#include "../kernels/PrefixScanKernelsCL.h"
btPrefixScanCL::btPrefixScanCL(cl_context ctx, cl_device_id device, cl_command_queue queue, int size)
b3PrefixScanCL::b3PrefixScanCL(cl_context ctx, cl_device_id device, cl_command_queue queue, int size)
:m_commandQueue(queue)
{
const char* scanKernelSource = prefixScanKernelsCL;
cl_int pErrNum;
char* additionalMacros=0;
m_workBuffer = new btOpenCLArray<unsigned int>(ctx,queue,size);
cl_program scanProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, scanKernelSource, &pErrNum,additionalMacros, BT_PREFIXSCAN_PROG_PATH);
btAssert(scanProg);
m_workBuffer = new b3OpenCLArray<unsigned int>(ctx,queue,size);
cl_program scanProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, scanKernelSource, &pErrNum,additionalMacros, B3_PREFIXSCAN_PROG_PATH);
b3Assert(scanProg);
m_localScanKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "LocalScanKernel", &pErrNum, scanProg,additionalMacros );
btAssert(m_localScanKernel );
b3Assert(m_localScanKernel );
m_blockSumKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "TopLevelScanKernel", &pErrNum, scanProg,additionalMacros );
btAssert(m_blockSumKernel );
b3Assert(m_blockSumKernel );
m_propagationKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "AddOffsetKernel", &pErrNum, scanProg,additionalMacros );
btAssert(m_propagationKernel );
b3Assert(m_propagationKernel );
}
btPrefixScanCL::~btPrefixScanCL()
b3PrefixScanCL::~b3PrefixScanCL()
{
delete m_workBuffer;
clReleaseKernel(m_localScanKernel);
@@ -35,7 +35,7 @@ btPrefixScanCL::~btPrefixScanCL()
}
template<class T>
T btNextPowerOf2(T n)
T b3NextPowerOf2(T n)
{
n -= 1;
for(int i=0; i<sizeof(T)*8; i++)
@@ -43,37 +43,37 @@ T btNextPowerOf2(T n)
return n+1;
}
void btPrefixScanCL::execute(btOpenCLArray<unsigned int>& src, btOpenCLArray<unsigned int>& dst, int n, unsigned int* sum)
void b3PrefixScanCL::execute(b3OpenCLArray<unsigned int>& src, b3OpenCLArray<unsigned int>& dst, int n, unsigned int* sum)
{
// btAssert( data->m_option == EXCLUSIVE );
// b3Assert( data->m_option == EXCLUSIVE );
const unsigned int numBlocks = (const unsigned int)( (n+BLOCK_SIZE*2-1)/(BLOCK_SIZE*2) );
dst.resize(src.size());
m_workBuffer->resize(src.size());
btInt4 constBuffer;
b3Int4 constBuffer;
constBuffer.x = n;
constBuffer.y = numBlocks;
constBuffer.z = (int)btNextPowerOf2( numBlocks );
constBuffer.z = (int)b3NextPowerOf2( numBlocks );
btOpenCLArray<unsigned int>* srcNative = &src;
btOpenCLArray<unsigned int>* dstNative = &dst;
b3OpenCLArray<unsigned int>* srcNative = &src;
b3OpenCLArray<unsigned int>* dstNative = &dst;
{
btBufferInfoCL bInfo[] = { btBufferInfoCL( dstNative->getBufferCL() ), btBufferInfoCL( srcNative->getBufferCL() ), btBufferInfoCL( m_workBuffer->getBufferCL() ) };
b3BufferInfoCL bInfo[] = { b3BufferInfoCL( dstNative->getBufferCL() ), b3BufferInfoCL( srcNative->getBufferCL() ), b3BufferInfoCL( m_workBuffer->getBufferCL() ) };
btLauncherCL launcher( m_commandQueue, m_localScanKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
b3LauncherCL launcher( m_commandQueue, m_localScanKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst( constBuffer );
launcher.launch1D( numBlocks*BLOCK_SIZE, BLOCK_SIZE );
}
{
btBufferInfoCL bInfo[] = { btBufferInfoCL( m_workBuffer->getBufferCL() ) };
b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_workBuffer->getBufferCL() ) };
btLauncherCL launcher( m_commandQueue, m_blockSumKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
b3LauncherCL launcher( m_commandQueue, m_blockSumKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst( constBuffer );
launcher.launch1D( BLOCK_SIZE, BLOCK_SIZE );
}
@@ -81,9 +81,9 @@ void btPrefixScanCL::execute(btOpenCLArray<unsigned int>& src, btOpenCLArray<uns
if( numBlocks > 1 )
{
btBufferInfoCL bInfo[] = { btBufferInfoCL( dstNative->getBufferCL() ), btBufferInfoCL( m_workBuffer->getBufferCL() ) };
btLauncherCL launcher( m_commandQueue, m_propagationKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
b3BufferInfoCL bInfo[] = { b3BufferInfoCL( dstNative->getBufferCL() ), b3BufferInfoCL( m_workBuffer->getBufferCL() ) };
b3LauncherCL launcher( m_commandQueue, m_propagationKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst( constBuffer );
launcher.launch1D( (numBlocks-1)*BLOCK_SIZE, BLOCK_SIZE );
}
@@ -98,7 +98,7 @@ void btPrefixScanCL::execute(btOpenCLArray<unsigned int>& src, btOpenCLArray<uns
}
void btPrefixScanCL::executeHost(b3AlignedObjectArray<unsigned int>& src, b3AlignedObjectArray<unsigned int>& dst, int n, unsigned int* sum)
void b3PrefixScanCL::executeHost(b3AlignedObjectArray<unsigned int>& src, b3AlignedObjectArray<unsigned int>& dst, int n, unsigned int* sum)
{
unsigned int s = 0;
//if( data->m_option == EXCLUSIVE )

View File

@@ -1,12 +1,12 @@
#ifndef BT_PREFIX_SCAN_CL_H
#define BT_PREFIX_SCAN_CL_H
#ifndef B3_PREFIX_SCAN_CL_H
#define B3_PREFIX_SCAN_CL_H
#include "btOpenCLArray.h"
#include "btBufferInfoCL.h"
#include "b3OpenCLArray.h"
#include "b3BufferInfoCL.h"
#include "Bullet3Common/b3AlignedObjectArray.h"
class btPrefixScanCL
class b3PrefixScanCL
{
enum
{
@@ -21,17 +21,17 @@ class btPrefixScanCL
cl_kernel m_blockSumKernel;
cl_kernel m_propagationKernel;
btOpenCLArray<unsigned int>* m_workBuffer;
b3OpenCLArray<unsigned int>* m_workBuffer;
public:
btPrefixScanCL(cl_context ctx, cl_device_id device, cl_command_queue queue,int size=0);
b3PrefixScanCL(cl_context ctx, cl_device_id device, cl_command_queue queue,int size=0);
virtual ~btPrefixScanCL();
virtual ~b3PrefixScanCL();
void execute(btOpenCLArray<unsigned int>& src, btOpenCLArray<unsigned int>& dst, int n, unsigned int* sum = 0);
void execute(b3OpenCLArray<unsigned int>& src, b3OpenCLArray<unsigned int>& dst, int n, unsigned int* sum = 0);
void executeHost(b3AlignedObjectArray<unsigned int>& src, b3AlignedObjectArray<unsigned int>& dst, int n, unsigned int* sum);
};
#endif //BT_PREFIX_SCAN_CL_H
#endif //B3_PREFIX_SCAN_CL_H

View File

@@ -1,27 +1,27 @@
#include "btRadixSort32CL.h"
#include "btLauncherCL.h"
#include "b3RadixSort32CL.h"
#include "b3LauncherCL.h"
#include "../../basic_initialize/b3OpenCLUtils.h"
#include "btPrefixScanCL.h"
#include "btFillCL.h"
#include "b3PrefixScanCL.h"
#include "b3FillCL.h"
#define RADIXSORT32_PATH "opencl/parallel_primitives/kernels/RadixSort32Kernels.cl"
#include "../kernels/RadixSort32KernelsCL.h"
btRadixSort32CL::btRadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity)
b3RadixSort32CL::b3RadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity)
:m_commandQueue(queue)
{
btOpenCLDeviceInfo info;
b3OpenCLDeviceInfo info;
b3OpenCLUtils::getDeviceInfo(device,&info);
m_deviceCPU = (info.m_deviceType & CL_DEVICE_TYPE_CPU)!=0;
m_workBuffer1 = new btOpenCLArray<unsigned int>(ctx,queue);
m_workBuffer2 = new btOpenCLArray<unsigned int>(ctx,queue);
m_workBuffer3 = new btOpenCLArray<btSortData>(ctx,queue);
m_workBuffer3a = new btOpenCLArray<unsigned int>(ctx,queue);
m_workBuffer4 = new btOpenCLArray<btSortData>(ctx,queue);
m_workBuffer4a = new btOpenCLArray<unsigned int>(ctx,queue);
m_workBuffer1 = new b3OpenCLArray<unsigned int>(ctx,queue);
m_workBuffer2 = new b3OpenCLArray<unsigned int>(ctx,queue);
m_workBuffer3 = new b3OpenCLArray<b3SortData>(ctx,queue);
m_workBuffer3a = new b3OpenCLArray<unsigned int>(ctx,queue);
m_workBuffer4 = new b3OpenCLArray<b3SortData>(ctx,queue);
m_workBuffer4a = new b3OpenCLArray<unsigned int>(ctx,queue);
if (initialCapacity>0)
@@ -33,8 +33,8 @@ btRadixSort32CL::btRadixSort32CL(cl_context ctx, cl_device_id device, cl_command
m_workBuffer4a->resize(initialCapacity);
}
m_scan = new btPrefixScanCL(ctx,device,queue);
m_fill = new btFillCL(ctx,device,queue);
m_scan = new b3PrefixScanCL(ctx,device,queue);
m_fill = new b3FillCL(ctx,device,queue);
const char* additionalMacros = "";
const char* srcFileNameForCaching="";
@@ -43,15 +43,15 @@ btRadixSort32CL::btRadixSort32CL(cl_context ctx, cl_device_id device, cl_command
const char* kernelSource = radixSort32KernelsCL;
cl_program sortProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, kernelSource, &pErrNum,additionalMacros, RADIXSORT32_PATH);
btAssert(sortProg);
b3Assert(sortProg);
m_streamCountSortDataKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "StreamCountSortDataKernel", &pErrNum, sortProg,additionalMacros );
btAssert(m_streamCountSortDataKernel );
b3Assert(m_streamCountSortDataKernel );
m_streamCountKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "StreamCountKernel", &pErrNum, sortProg,additionalMacros );
btAssert(m_streamCountKernel);
b3Assert(m_streamCountKernel);
@@ -59,23 +59,23 @@ btRadixSort32CL::btRadixSort32CL(cl_context ctx, cl_device_id device, cl_command
{
m_sortAndScatterSortDataKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterSortDataKernelSerial", &pErrNum, sortProg,additionalMacros );
btAssert(m_sortAndScatterSortDataKernel);
b3Assert(m_sortAndScatterSortDataKernel);
m_sortAndScatterKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterKernelSerial", &pErrNum, sortProg,additionalMacros );
btAssert(m_sortAndScatterKernel);
b3Assert(m_sortAndScatterKernel);
} else
{
m_sortAndScatterSortDataKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterSortDataKernel", &pErrNum, sortProg,additionalMacros );
btAssert(m_sortAndScatterSortDataKernel);
b3Assert(m_sortAndScatterSortDataKernel);
m_sortAndScatterKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterKernel", &pErrNum, sortProg,additionalMacros );
btAssert(m_sortAndScatterKernel);
b3Assert(m_sortAndScatterKernel);
}
m_prefixScanKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "PrefixScanKernel", &pErrNum, sortProg,additionalMacros );
btAssert(m_prefixScanKernel);
b3Assert(m_prefixScanKernel);
}
btRadixSort32CL::~btRadixSort32CL()
b3RadixSort32CL::~b3RadixSort32CL()
{
delete m_scan;
delete m_fill;
@@ -93,7 +93,7 @@ btRadixSort32CL::~btRadixSort32CL()
clReleaseKernel(m_prefixScanKernel);
}
void btRadixSort32CL::executeHost(b3AlignedObjectArray<btSortData>& inout, int sortBits /* = 32 */)
void b3RadixSort32CL::executeHost(b3AlignedObjectArray<b3SortData>& inout, int sortBits /* = 32 */)
{
int n = inout.size();
const int BITS_PER_PASS = 8;
@@ -103,10 +103,10 @@ void btRadixSort32CL::executeHost(b3AlignedObjectArray<btSortData>& inout, int s
int tables[NUM_TABLES];
int counter[NUM_TABLES];
btSortData* src = &inout[0];
b3AlignedObjectArray<btSortData> workbuffer;
b3SortData* src = &inout[0];
b3AlignedObjectArray<b3SortData> workbuffer;
workbuffer.resize(inout.size());
btSortData* dst = &workbuffer[0];
b3SortData* dst = &workbuffer[0];
int count=0;
for(int startBit=0; startBit<sortBits; startBit+=BITS_PER_PASS)
@@ -152,21 +152,21 @@ void btRadixSort32CL::executeHost(b3AlignedObjectArray<btSortData>& inout, int s
counter[tableIdx] ++;
}
btSwap( src, dst );
b3Swap( src, dst );
count++;
}
if (count&1)
{
btAssert(0);//need to copy
b3Assert(0);//need to copy
}
}
void btRadixSort32CL::executeHost(btOpenCLArray<btSortData>& keyValuesInOut, int sortBits /* = 32 */)
void b3RadixSort32CL::executeHost(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits /* = 32 */)
{
b3AlignedObjectArray<btSortData> inout;
b3AlignedObjectArray<b3SortData> inout;
keyValuesInOut.copyToHost(inout);
executeHost(inout,sortBits);
@@ -174,8 +174,8 @@ void btRadixSort32CL::executeHost(btOpenCLArray<btSortData>& keyValuesInOut, int
keyValuesInOut.copyFromHost(inout);
}
void btRadixSort32CL::execute(btOpenCLArray<unsigned int>& keysIn, btOpenCLArray<unsigned int>& keysOut, btOpenCLArray<unsigned int>& valuesIn,
btOpenCLArray<unsigned int>& valuesOut, int n, int sortBits)
void b3RadixSort32CL::execute(b3OpenCLArray<unsigned int>& keysIn, b3OpenCLArray<unsigned int>& keysOut, b3OpenCLArray<unsigned int>& valuesIn,
b3OpenCLArray<unsigned int>& valuesOut, int n, int sortBits)
{
}
@@ -184,7 +184,7 @@ void btRadixSort32CL::execute(btOpenCLArray<unsigned int>& keysIn, btOpenCLArray
//#define DEBUG_RADIXSORT2
void btRadixSort32CL::execute(btOpenCLArray<btSortData>& keyValuesInOut, int sortBits /* = 32 */)
void b3RadixSort32CL::execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits /* = 32 */)
{
int originalSize = keyValuesInOut.size();
@@ -194,7 +194,7 @@ void btRadixSort32CL::execute(btOpenCLArray<btSortData>& keyValuesInOut, int sor
int dataAlignment = DATA_ALIGNMENT;
#ifdef DEBUG_RADIXSORT2
b3AlignedObjectArray<btSortData> test2;
b3AlignedObjectArray<b3SortData> test2;
keyValuesInOut.copyToHost(test2);
printf("numElem = %d\n",test2.size());
for (int i=0;i<test2.size();i++)
@@ -204,20 +204,20 @@ void btRadixSort32CL::execute(btOpenCLArray<btSortData>& keyValuesInOut, int sor
}
#endif //DEBUG_RADIXSORT2
btOpenCLArray<btSortData>* src = 0;
b3OpenCLArray<b3SortData>* src = 0;
if (workingSize%dataAlignment)
{
workingSize += dataAlignment-(workingSize%dataAlignment);
m_workBuffer4->copyFromOpenCLArray(keyValuesInOut);
m_workBuffer4->resize(workingSize);
btSortData fillValue;
b3SortData fillValue;
fillValue.m_key = 0xffffffff;
fillValue.m_value = 0xffffffff;
#define USE_BTFILL
#ifdef USE_BTFILL
m_fill->execute((btOpenCLArray<btInt2>&)*m_workBuffer4,(btInt2&)fillValue,workingSize-originalSize,originalSize);
m_fill->execute((b3OpenCLArray<b3Int2>&)*m_workBuffer4,(b3Int2&)fillValue,workingSize-originalSize,originalSize);
#else
//fill the remaining bits (very slow way, todo: fill on GPU/OpenCL side)
@@ -234,7 +234,7 @@ void btRadixSort32CL::execute(btOpenCLArray<btSortData>& keyValuesInOut, int sor
m_workBuffer4->resize(0);
}
btAssert( workingSize%DATA_ALIGNMENT == 0 );
b3Assert( workingSize%DATA_ALIGNMENT == 0 );
int minCap = NUM_BUCKET*NUM_WGS;
@@ -245,20 +245,20 @@ void btRadixSort32CL::execute(btOpenCLArray<btSortData>& keyValuesInOut, int sor
// ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
btAssert( BITS_PER_PASS == 4 );
btAssert( WG_SIZE == 64 );
btAssert( (sortBits&0x3) == 0 );
b3Assert( BITS_PER_PASS == 4 );
b3Assert( WG_SIZE == 64 );
b3Assert( (sortBits&0x3) == 0 );
btOpenCLArray<btSortData>* dst = m_workBuffer3;
b3OpenCLArray<b3SortData>* dst = m_workBuffer3;
btOpenCLArray<unsigned int>* srcHisto = m_workBuffer1;
btOpenCLArray<unsigned int>* destHisto = m_workBuffer2;
b3OpenCLArray<unsigned int>* srcHisto = m_workBuffer1;
b3OpenCLArray<unsigned int>* destHisto = m_workBuffer2;
int nWGs = NUM_WGS;
btConstData cdata;
b3ConstData cdata;
{
int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;//set at 256
@@ -294,10 +294,10 @@ void btRadixSort32CL::execute(btOpenCLArray<btSortData>& keyValuesInOut, int sor
if (src->size())
{
btBufferInfoCL bInfo[] = { btBufferInfoCL( src->getBufferCL(), true ), btBufferInfoCL( srcHisto->getBufferCL() ) };
btLauncherCL launcher(m_commandQueue, m_streamCountSortDataKernel);
b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src->getBufferCL(), true ), b3BufferInfoCL( srcHisto->getBufferCL() ) };
b3LauncherCL launcher(m_commandQueue, m_streamCountSortDataKernel);
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst( cdata );
int num = NUM_WGS*WG_SIZE;
@@ -328,9 +328,9 @@ void btRadixSort32CL::execute(btOpenCLArray<btSortData>& keyValuesInOut, int sor
if (fastScan)
{// prefix scan group histogram
btBufferInfoCL bInfo[] = { btBufferInfoCL( srcHisto->getBufferCL() ) };
btLauncherCL launcher( m_commandQueue, m_prefixScanKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
b3BufferInfoCL bInfo[] = { b3BufferInfoCL( srcHisto->getBufferCL() ) };
b3LauncherCL launcher( m_commandQueue, m_prefixScanKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst( cdata );
launcher.launch1D( 128, 128 );
destHisto = srcHisto;
@@ -362,9 +362,9 @@ void btRadixSort32CL::execute(btOpenCLArray<btSortData>& keyValuesInOut, int sor
if (src->size())
{// local sort and distribute
btBufferInfoCL bInfo[] = { btBufferInfoCL( src->getBufferCL(), true ), btBufferInfoCL( destHisto->getBufferCL(), true ), btBufferInfoCL( dst->getBufferCL() )};
btLauncherCL launcher( m_commandQueue, m_sortAndScatterSortDataKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src->getBufferCL(), true ), b3BufferInfoCL( destHisto->getBufferCL(), true ), b3BufferInfoCL( dst->getBufferCL() )};
b3LauncherCL launcher( m_commandQueue, m_sortAndScatterSortDataKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst( cdata );
launcher.launch1D( nWGs*WG_SIZE, WG_SIZE );
@@ -379,8 +379,8 @@ void btRadixSort32CL::execute(btOpenCLArray<btSortData>& keyValuesInOut, int sor
int startBit = ib;
destHisto->copyToHost(testHist);
b3AlignedObjectArray<btSortData> srcHost;
b3AlignedObjectArray<btSortData> dstHost;
b3AlignedObjectArray<b3SortData> srcHost;
b3AlignedObjectArray<b3SortData> dstHost;
dstHost.resize(src->size());
src->copyToHost(srcHost);
@@ -405,11 +405,11 @@ void btRadixSort32CL::execute(btOpenCLArray<btSortData>& keyValuesInOut, int sor
int counter2[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
int tables[NUM_TABLES];
b3AlignedObjectArray<btSortData> dstHostOK;
b3AlignedObjectArray<b3SortData> dstHostOK;
dstHostOK.resize(src->size());
destHisto->copyToHost(testHist);
b3AlignedObjectArray<btSortData> srcHost;
b3AlignedObjectArray<b3SortData> srcHost;
src->copyToHost(srcHost);
int blockSize = 256;
@@ -435,7 +435,7 @@ void btRadixSort32CL::execute(btOpenCLArray<btSortData>& keyValuesInOut, int sor
}
b3AlignedObjectArray<btSortData> dstHost;
b3AlignedObjectArray<b3SortData> dstHost;
dstHost.resize(src->size());
@@ -449,7 +449,7 @@ void btRadixSort32CL::execute(btOpenCLArray<btSortData>& keyValuesInOut, int sor
int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;
for(int iblock=0; iblock<btMin(cdata.m_nBlocksPerWG, nBlocks); iblock++)
for(int iblock=0; iblock<b3Min(cdata.m_nBlocksPerWG, nBlocks); iblock++)
{
for (int lIdx = 0;lIdx < 64;lIdx++)
{
@@ -470,7 +470,7 @@ void btRadixSort32CL::execute(btOpenCLArray<btSortData>& keyValuesInOut, int sor
int destIndex = testHist[tableIdx*NUM_WGS+wgIdx] + counter[tableIdx];
btSortData ok = dstHostOK[destIndex];
b3SortData ok = dstHostOK[destIndex];
if (ok.m_key != srcHost[i].m_key)
{
@@ -512,8 +512,8 @@ void btRadixSort32CL::execute(btOpenCLArray<btSortData>& keyValuesInOut, int sor
printf("testHist[%d]=%d\n",i,testHist[i]);
}
#endif //DEBUG_RADIXSORT
btSwap(src, dst );
btSwap(srcHisto,destHisto);
b3Swap(src, dst );
b3Swap(srcHisto,destHisto);
#ifdef DEBUG_RADIXSORT2
keyValuesInOut.copyToHost(test2);
@@ -537,7 +537,7 @@ void btRadixSort32CL::execute(btOpenCLArray<btSortData>& keyValuesInOut, int sor
if (count&1)
{
btAssert(0);//need to copy from workbuffer to keyValuesInOut
b3Assert(0);//need to copy from workbuffer to keyValuesInOut
}
if (m_workBuffer4->size())
@@ -565,7 +565,7 @@ void btRadixSort32CL::execute(btOpenCLArray<btSortData>& keyValuesInOut, int sor
void btRadixSort32CL::execute(btOpenCLArray<unsigned int>& keysInOut, int sortBits /* = 32 */)
void b3RadixSort32CL::execute(b3OpenCLArray<unsigned int>& keysInOut, int sortBits /* = 32 */)
{
int originalSize = keysInOut.size();
int workingSize = originalSize;
@@ -573,7 +573,7 @@ void btRadixSort32CL::execute(btOpenCLArray<unsigned int>& keysInOut, int sortBi
int dataAlignment = DATA_ALIGNMENT;
btOpenCLArray<unsigned int>* src = 0;
b3OpenCLArray<unsigned int>* src = 0;
if (workingSize%dataAlignment)
{
@@ -593,7 +593,7 @@ void btRadixSort32CL::execute(btOpenCLArray<unsigned int>& keysInOut, int sortBi
btAssert( workingSize%DATA_ALIGNMENT == 0 );
b3Assert( workingSize%DATA_ALIGNMENT == 0 );
int minCap = NUM_BUCKET*NUM_WGS;
@@ -605,20 +605,20 @@ void btRadixSort32CL::execute(btOpenCLArray<unsigned int>& keysInOut, int sortBi
m_workBuffer3a->resize(workingSize);
// ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
btAssert( BITS_PER_PASS == 4 );
btAssert( WG_SIZE == 64 );
btAssert( (sortBits&0x3) == 0 );
b3Assert( BITS_PER_PASS == 4 );
b3Assert( WG_SIZE == 64 );
b3Assert( (sortBits&0x3) == 0 );
btOpenCLArray<unsigned int>* dst = m_workBuffer3a;
b3OpenCLArray<unsigned int>* dst = m_workBuffer3a;
btOpenCLArray<unsigned int>* srcHisto = m_workBuffer1;
btOpenCLArray<unsigned int>* destHisto = m_workBuffer2;
b3OpenCLArray<unsigned int>* srcHisto = m_workBuffer1;
b3OpenCLArray<unsigned int>* destHisto = m_workBuffer2;
int nWGs = NUM_WGS;
btConstData cdata;
b3ConstData cdata;
{
int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;//set at 256
@@ -641,10 +641,10 @@ void btRadixSort32CL::execute(btOpenCLArray<unsigned int>& keysInOut, int sortBi
if (src->size())
{
btBufferInfoCL bInfo[] = { btBufferInfoCL( src->getBufferCL(), true ), btBufferInfoCL( srcHisto->getBufferCL() ) };
btLauncherCL launcher(m_commandQueue, m_streamCountKernel);
b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src->getBufferCL(), true ), b3BufferInfoCL( srcHisto->getBufferCL() ) };
b3LauncherCL launcher(m_commandQueue, m_streamCountKernel);
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst( cdata );
int num = NUM_WGS*WG_SIZE;
@@ -663,9 +663,9 @@ void btRadixSort32CL::execute(btOpenCLArray<unsigned int>& keysInOut, int sortBi
if (fastScan)
{// prefix scan group histogram
btBufferInfoCL bInfo[] = { btBufferInfoCL( srcHisto->getBufferCL() ) };
btLauncherCL launcher( m_commandQueue, m_prefixScanKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
b3BufferInfoCL bInfo[] = { b3BufferInfoCL( srcHisto->getBufferCL() ) };
b3LauncherCL launcher( m_commandQueue, m_prefixScanKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst( cdata );
launcher.launch1D( 128, 128 );
destHisto = srcHisto;
@@ -677,23 +677,23 @@ void btRadixSort32CL::execute(btOpenCLArray<unsigned int>& keysInOut, int sortBi
if (src->size())
{// local sort and distribute
btBufferInfoCL bInfo[] = { btBufferInfoCL( src->getBufferCL(), true ), btBufferInfoCL( destHisto->getBufferCL(), true ), btBufferInfoCL( dst->getBufferCL() )};
btLauncherCL launcher( m_commandQueue, m_sortAndScatterKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src->getBufferCL(), true ), b3BufferInfoCL( destHisto->getBufferCL(), true ), b3BufferInfoCL( dst->getBufferCL() )};
b3LauncherCL launcher( m_commandQueue, m_sortAndScatterKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst( cdata );
launcher.launch1D( nWGs*WG_SIZE, WG_SIZE );
}
btSwap(src, dst );
btSwap(srcHisto,destHisto);
b3Swap(src, dst );
b3Swap(srcHisto,destHisto);
count++;
}
if (count&1)
{
btAssert(0);//need to copy from workbuffer to keyValuesInOut
b3Assert(0);//need to copy from workbuffer to keyValuesInOut
}
if (m_workBuffer4a->size())

View File

@@ -0,0 +1,85 @@
#ifndef B3_RADIXSORT32_H
#define B3_RADIXSORT32_H
#include "b3OpenCLArray.h"
struct b3SortData
{
int m_key;
int m_value;
};
#include "b3BufferInfoCL.h"
class b3RadixSort32CL
{
b3OpenCLArray<unsigned int>* m_workBuffer1;
b3OpenCLArray<unsigned int>* m_workBuffer2;
b3OpenCLArray<b3SortData>* m_workBuffer3;
b3OpenCLArray<b3SortData>* m_workBuffer4;
b3OpenCLArray<unsigned int>* m_workBuffer3a;
b3OpenCLArray<unsigned int>* m_workBuffer4a;
cl_command_queue m_commandQueue;
cl_kernel m_streamCountSortDataKernel;
cl_kernel m_streamCountKernel;
cl_kernel m_prefixScanKernel;
cl_kernel m_sortAndScatterSortDataKernel;
cl_kernel m_sortAndScatterKernel;
bool m_deviceCPU;
class b3PrefixScanCL* m_scan;
class b3FillCL* m_fill;
public:
struct b3ConstData
{
int m_n;
int m_nWGs;
int m_startBit;
int m_nBlocksPerWG;
};
enum
{
DATA_ALIGNMENT = 256,
WG_SIZE = 64,
BLOCK_SIZE = 256,
ELEMENTS_PER_WORK_ITEM = (BLOCK_SIZE/WG_SIZE),
BITS_PER_PASS = 4,
NUM_BUCKET=(1<<BITS_PER_PASS),
// if you change this, change nPerWI in kernel as well
NUM_WGS = 20*6, // cypress
// NUM_WGS = 24*6, // cayman
// NUM_WGS = 32*4, // nv
};
private:
public:
b3RadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity =0);
virtual ~b3RadixSort32CL();
void execute(b3OpenCLArray<unsigned int>& keysIn, b3OpenCLArray<unsigned int>& keysOut, b3OpenCLArray<unsigned int>& valuesIn,
b3OpenCLArray<unsigned int>& valuesOut, int n, int sortBits = 32);
///keys only
void execute(b3OpenCLArray<unsigned int>& keysInOut, int sortBits = 32 );
void execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits = 32 );
void executeHost(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits = 32);
void executeHost(b3AlignedObjectArray<b3SortData>& keyValuesInOut, int sortBits = 32);
};
#endif //B3_RADIXSORT32_H

View File

@@ -1,19 +0,0 @@
#ifndef BT_BUFFER_INFO_CL_H
#define BT_BUFFER_INFO_CL_H
#include "btOpenCLArray.h"
struct btBufferInfoCL
{
//btBufferInfoCL(){}
// template<typename T>
btBufferInfoCL(cl_mem buff, bool isReadOnly = false): m_clBuffer(buff), m_isReadOnly(isReadOnly){}
cl_mem m_clBuffer;
bool m_isReadOnly;
};
#endif //BT_BUFFER_INFO_CL_H

View File

@@ -1,63 +0,0 @@
#ifndef BT_FILL_CL_H
#define BT_FILL_CL_H
#include "btOpenCLArray.h"
#include "Bullet3Common/b3Scalar.h"
#include "btInt2.h"
#include "btInt4.h"
class btFillCL
{
cl_command_queue m_commandQueue;
cl_kernel m_fillKernelInt2;
cl_kernel m_fillIntKernel;
cl_kernel m_fillUnsignedIntKernel;
cl_kernel m_fillFloatKernel;
public:
struct btConstData
{
union
{
btInt4 m_data;
btUnsignedInt4 m_UnsignedData;
};
int m_offset;
int m_n;
int m_padding[2];
};
protected:
public:
btFillCL(cl_context ctx, cl_device_id device, cl_command_queue queue);
virtual ~btFillCL();
void execute(btOpenCLArray<unsigned int>& src, const unsigned int value, int n, int offset = 0);
void execute(btOpenCLArray<int>& src, const int value, int n, int offset = 0);
void execute(btOpenCLArray<float>& src, const float value, int n, int offset = 0);
void execute(btOpenCLArray<btInt2>& src, const btInt2& value, int n, int offset = 0);
void executeHost(b3AlignedObjectArray<btInt2> &src, const btInt2 &value, int n, int offset);
void executeHost(b3AlignedObjectArray<int> &src, const int value, int n, int offset);
// void execute(btOpenCLArray<btInt4>& src, const btInt4& value, int n, int offset = 0);
};
#endif //BT_FILL_CL_H

View File

@@ -1,85 +0,0 @@
#ifndef BT_RADIXSORT32_H
#define BT_RADIXSORT32_H
#include "btOpenCLArray.h"
struct btSortData
{
int m_key;
int m_value;
};
#include "btBufferInfoCL.h"
class btRadixSort32CL
{
btOpenCLArray<unsigned int>* m_workBuffer1;
btOpenCLArray<unsigned int>* m_workBuffer2;
btOpenCLArray<btSortData>* m_workBuffer3;
btOpenCLArray<btSortData>* m_workBuffer4;
btOpenCLArray<unsigned int>* m_workBuffer3a;
btOpenCLArray<unsigned int>* m_workBuffer4a;
cl_command_queue m_commandQueue;
cl_kernel m_streamCountSortDataKernel;
cl_kernel m_streamCountKernel;
cl_kernel m_prefixScanKernel;
cl_kernel m_sortAndScatterSortDataKernel;
cl_kernel m_sortAndScatterKernel;
bool m_deviceCPU;
class btPrefixScanCL* m_scan;
class btFillCL* m_fill;
public:
struct btConstData
{
int m_n;
int m_nWGs;
int m_startBit;
int m_nBlocksPerWG;
};
enum
{
DATA_ALIGNMENT = 256,
WG_SIZE = 64,
BLOCK_SIZE = 256,
ELEMENTS_PER_WORK_ITEM = (BLOCK_SIZE/WG_SIZE),
BITS_PER_PASS = 4,
NUM_BUCKET=(1<<BITS_PER_PASS),
// if you change this, change nPerWI in kernel as well
NUM_WGS = 20*6, // cypress
// NUM_WGS = 24*6, // cayman
// NUM_WGS = 32*4, // nv
};
private:
public:
btRadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity =0);
virtual ~btRadixSort32CL();
void execute(btOpenCLArray<unsigned int>& keysIn, btOpenCLArray<unsigned int>& keysOut, btOpenCLArray<unsigned int>& valuesIn,
btOpenCLArray<unsigned int>& valuesOut, int n, int sortBits = 32);
///keys only
void execute(btOpenCLArray<unsigned int>& keysInOut, int sortBits = 32 );
void execute(btOpenCLArray<btSortData>& keyValuesInOut, int sortBits = 32 );
void executeHost(btOpenCLArray<btSortData>& keyValuesInOut, int sortBits = 32);
void executeHost(b3AlignedObjectArray<btSortData>& keyValuesInOut, int sortBits = 32);
};
#endif //BT_RADIXSORT32_H