Add the GPU rigid body pipeline from https://github.com/erwincoumans/experiments as a Bullet 3.x preview for Bullet 2.80
Extras/RigidBodyGpuPipeline/opencl/primitives/Adl/Adl.cpp  (new file, 19 lines)
@@ -0,0 +1,19 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.

This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:

1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada


#include <Adl/Adl.h>

//KernelManager* KernelManager::s_kManager = NULL;
Extras/RigidBodyGpuPipeline/opencl/primitives/Adl/Adl.h  (new file, 235 lines)
@@ -0,0 +1,235 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.

This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:

1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada


#ifndef ADL_H
#define ADL_H

#pragma warning( disable : 4996 )
#include <Adl/AdlConfig.h>
#include <Adl/AdlError.h>
#include <algorithm>

#ifndef max
#define max(a,b) (((a) > (b)) ? (a) : (b))
#endif

#ifndef min
#define min(a,b) (((a) < (b)) ? (a) : (b))
#endif

namespace adl
{

enum DeviceType
{
TYPE_CL = 0,
TYPE_DX11 = 1,
TYPE_HOST,
};


struct Device;

struct BufferBase
{
enum BufferType
{
BUFFER,

// for dx
BUFFER_CONST,
BUFFER_STAGING,
BUFFER_APPEND,
BUFFER_RAW,
BUFFER_W_COUNTER,
BUFFER_INDEX,
BUFFER_VERTEX,

// for cl
BUFFER_ZERO_COPY,

};
};

class DeviceUtils
{
public:
struct Config
{
enum DeviceType
{
DEVICE_GPU,
DEVICE_CPU,
};

// for CL
enum DeviceVendor
{
VD_AMD,
VD_INTEL,
VD_NV,
};

Config() : m_type(DEVICE_GPU), m_deviceIdx(0), m_vendor(VD_AMD){}

DeviceType m_type;
int m_deviceIdx;
DeviceVendor m_vendor;
};

__inline
static
int getNDevices( DeviceType type );
__inline
static Device* allocate( DeviceType type, Config& cfg );
__inline
static void deallocate( Device* deviceData );
__inline
static void waitForCompletion( const Device* deviceData );
};

//==========================
// DeviceData
//==========================
struct Kernel;

struct Device
{
typedef DeviceUtils::Config Config;

Device( DeviceType type ) : m_type( type ), m_memoryUsage(0)
{
}

virtual void* getContext() const { return 0; }
virtual void initialize(const Config& cfg){}
virtual void release(){}
virtual void waitForCompletion() const {}
virtual void getDeviceName( char nameOut[128] ) const {}
virtual Kernel* getKernel(const char* fileName, const char* funcName, const char* option = NULL, const char* src = NULL, bool cacheKernel = true ) const { ADLASSERT(0); return 0;}
virtual unsigned int getUsedMemory() const { return m_memoryUsage; }

DeviceType m_type;
unsigned int m_memoryUsage;
};

//==========================
// Buffer
//==========================

template<typename T>
struct HostBuffer;
// overload each deviceDatas
template<typename T>
struct Buffer : public BufferBase
{
__inline
Buffer();
__inline
Buffer(const Device* device, int nElems, BufferType type = BUFFER );
__inline
virtual ~Buffer();

__inline
void setRawPtr( const Device* device, T* ptr, int size, BufferType type = BUFFER );
__inline
void allocate(const Device* device, int nElems, BufferType type = BUFFER );
__inline
void write(T* hostSrcPtr, int nElems, int dstOffsetNElems = 0);
__inline
void read(T* hostDstPtr, int nElems, int srcOffsetNElems = 0) const;
__inline
void write(Buffer<T>& src, int nElems);
__inline
void read(Buffer<T>& dst, int nElems) const;
// __inline
// Buffer<T>& operator = (const Buffer<T>& buffer);
__inline
int getSize() const { return m_size; }

DeviceType getType() const { ADLASSERT( m_device ); return m_device->m_type; }


const Device* m_device;
int m_size;
T* m_ptr;
// for DX11
void* m_uav;
void* m_srv;
bool m_allocated; // todo. move this to a bit
};

class BufferUtils
{
public:
template<DeviceType TYPE, bool COPY, typename T>
__inline
static
typename Buffer<T>* map(const Device* device, const Buffer<T>* in, int copySize = -1);

template<bool COPY, typename T>
__inline
static
void unmap( Buffer<T>* native, const Buffer<T>* orig, int copySize = -1 );
};

//==========================
// HostBuffer
//==========================
struct DeviceHost;

template<typename T>
struct HostBuffer : public Buffer<T>
{
__inline
HostBuffer():Buffer<T>(){}
__inline
HostBuffer(const Device* device, int nElems, BufferType type = BUFFER ) : Buffer<T>(device, nElems, type) {}
// HostBuffer(const Device* deviceData, T* rawPtr, int nElems);


__inline
T& operator[](int idx);
__inline
const T& operator[](int idx) const;
__inline
T* begin() { return m_ptr; }

__inline
HostBuffer<T>& operator = (const Buffer<T>& device);
};

};

#include <Adl/AdlKernel.h>
#if defined(ADL_ENABLE_CL)
#include <Adl/CL/AdlCL.inl>
#endif
#if defined(ADL_ENABLE_DX11)
#include <Adl/DX11/AdlDX11.inl>
#endif

#include <Adl/Host/AdlHost.inl>
#include <Adl/AdlKernel.inl>
#include <Adl/Adl.inl>


#include <Adl/AdlStopwatch.h>

#include <Adl/Host/AdlStopwatchHost.inl>
#include <Adl/AdlStopwatch.inl>

#endif
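Editor's note: Adl.h above is the whole device/buffer abstraction used by the GPU pipeline. As orientation, here is a minimal usage sketch built only from the declarations in this header. It assumes the host backend (TYPE_HOST, always compiled in via Adl/Host/AdlHost.inl); the array size and values are made up for illustration. Buffers are scoped so they are destroyed before DeviceUtils::deallocate, which asserts that no device memory is still in use.

#include <Adl/Adl.h>

int main()
{
    using namespace adl;

    DeviceUtils::Config cfg;                          // defaults: GPU, device 0, AMD vendor
    Device* device = DeviceUtils::allocate( TYPE_HOST, cfg );

    {
        const int n = 16;
        int host[n];
        for( int i=0; i<n; i++ ) host[i] = i;         // illustrative data

        Buffer<int> buf( device, n );                 // allocate n ints on the device
        buf.write( host, n );
        DeviceUtils::waitForCompletion( device );

        int readBack[n];
        buf.read( readBack, n );
        DeviceUtils::waitForCompletion( device );
    }                                                 // buf released here

    DeviceUtils::deallocate( device );                // asserts getUsedMemory() == 0
    return 0;
}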
Extras/RigidBodyGpuPipeline/opencl/primitives/Adl/Adl.inl  (new file, 344 lines)
@@ -0,0 +1,344 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.

This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:

1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada


namespace adl
{

int DeviceUtils::getNDevices( DeviceType type )
{
switch( type )
{
#if defined(ADL_ENABLE_CL)
case TYPE_CL:
return DeviceCL::getNDevices();
#endif
#if defined(ADL_ENABLE_DX11)
case TYPE_DX11:
return DeviceDX11::getNDevices();
#endif
default:
return 1;
};
}

Device* DeviceUtils::allocate( DeviceType type, Config& cfg )
{
Device* deviceData;
switch( type )
{
#if defined(ADL_ENABLE_CL)
case TYPE_CL:
deviceData = new DeviceCL();
break;
#endif
#if defined(ADL_ENABLE_DX11)
case TYPE_DX11:
deviceData = new DeviceDX11();
break;
#endif
case TYPE_HOST:
deviceData = new DeviceHost();
break;
default:
ADLASSERT( 0 );
break;
};
deviceData->initialize( cfg );
return deviceData;
}

void DeviceUtils::deallocate( Device* deviceData )
{
ADLASSERT( deviceData->getUsedMemory() == 0 );
deviceData->release();
delete deviceData;
}

void DeviceUtils::waitForCompletion( const Device* deviceData )
{
deviceData->waitForCompletion();
}

#if defined(ADL_ENABLE_DX11)
#if defined(ADL_ENABLE_CL)
#define SELECT_DEVICEDATA( type, func ) \
switch( type ) \
{ \
case TYPE_CL: ((DeviceCL*)m_device)->func; break; \
case TYPE_DX11: ((DeviceDX11*)m_device)->func; break; \
case TYPE_HOST: ((DeviceHost*)m_device)->func; break; \
default: ADLASSERT(0); break; \
}

#define SELECT_DEVICEDATA1( deviceData, func ) \
switch( deviceData->m_type ) \
{ \
case TYPE_CL: ((DeviceCL*)deviceData)->func; break; \
case TYPE_DX11: ((DeviceDX11*)deviceData)->func; break; \
case TYPE_HOST: ((DeviceHost*)deviceData)->func; break; \
default: ADLASSERT(0); break; \
}
#else
#define SELECT_DEVICEDATA( type, func ) \
switch( type ) \
{ \
case TYPE_DX11: ((DeviceDX11*)m_device)->func; break; \
case TYPE_HOST: ((DeviceHost*)m_device)->func; break; \
default: ADLASSERT(0); break; \
}

#define SELECT_DEVICEDATA1( deviceData, func ) \
switch( deviceData->m_type ) \
{ \
case TYPE_DX11: ((DeviceDX11*)deviceData)->func; break; \
case TYPE_HOST: ((DeviceHost*)deviceData)->func; break; \
default: ADLASSERT(0); break; \
}
#endif
#else
#if defined(ADL_ENABLE_CL)
#define SELECT_DEVICEDATA( type, func ) \
switch( type ) \
{ \
case TYPE_CL: ((DeviceCL*)m_device)->func; break; \
case TYPE_HOST: ((DeviceHost*)m_device)->func; break; \
default: ADLASSERT(0); break; \
}

#define SELECT_DEVICEDATA1( deviceData, func ) \
switch( deviceData->m_type ) \
{ \
case TYPE_CL: ((DeviceCL*)deviceData)->func; break; \
case TYPE_HOST: ((DeviceHost*)deviceData)->func; break; \
default: ADLASSERT(0); break; \
}
#else
#define SELECT_DEVICEDATA( type, func ) \
switch( type ) \
{ \
case TYPE_HOST: ((DeviceHost*)m_device)->func; break; \
default: ADLASSERT(0); break; \
}

#define SELECT_DEVICEDATA1( deviceData, func ) \
switch( deviceData->m_type ) \
{ \
case TYPE_HOST: ((DeviceHost*)deviceData)->func; break; \
default: ADLASSERT(0); break; \
}
#endif
#endif

template<typename T>
Buffer<T>::Buffer()
{
m_device = 0;
m_size = 0;
m_ptr = 0;

m_uav = 0;
m_srv = 0;

m_allocated = false;
}

template<typename T>
Buffer<T>::Buffer(const Device* deviceData, int nElems, BufferType type )
{
m_device = 0;
allocate( deviceData, nElems, type );
}

template<typename T>
Buffer<T>::~Buffer()
{
if( m_allocated )
{
if( m_device )
SELECT_DEVICEDATA( m_device->m_type, deallocate( this ) );
}

m_device = 0;
m_ptr = 0;
m_size = 0;
}

template<typename T>
void Buffer<T>::setRawPtr( const Device* device, T* ptr, int size, BufferType type )
{
ADLASSERT( m_device == 0 );
ADLASSERT( type == BUFFER ); // todo. implement
ADLASSERT( device->m_type != TYPE_DX11 ); // todo. implement set srv, uav

m_device = device;
m_ptr = ptr;
m_size = size;
}

template<typename T>
void Buffer<T>::allocate(const Device* deviceData, int nElems, BufferType type )
{
ADLASSERT( m_device == 0 );
m_device = deviceData;
m_size = 0;
m_ptr = 0;

m_uav = 0;
m_srv = 0;

SELECT_DEVICEDATA( m_device->m_type, allocate( this, nElems, type ) );
m_allocated = true;
}

template<typename T>
void Buffer<T>::write(T* hostPtr, int nElems, int offsetNElems)
{
ADLASSERT( nElems+offsetNElems <= m_size );
SELECT_DEVICEDATA( m_device->m_type, copy(this, hostPtr, nElems, offsetNElems) );
}

template<typename T>
void Buffer<T>::read(T* hostPtr, int nElems, int offsetNElems) const
{
SELECT_DEVICEDATA( m_device->m_type, copy(hostPtr,this, nElems, offsetNElems) );
}

template<typename T>
void Buffer<T>::write(Buffer<T>& src, int nElems)
{
ADLASSERT( nElems <= m_size );
SELECT_DEVICEDATA( m_device->m_type, copy(this, &src, nElems) );
}

template<typename T>
void Buffer<T>::read(Buffer<T>& dst, int nElems) const
{
SELECT_DEVICEDATA( m_device->m_type, copy(&dst, this, nElems) );
}
/*
template<typename T>
Buffer<T>& Buffer<T>::operator = ( const Buffer<T>& buffer )
{
// ADLASSERT( buffer.m_size <= m_size );

SELECT_DEVICEDATA( m_device->m_type, copy(this, &buffer, min2( m_size, buffer.m_size) ) );

return *this;
}
*/

template<DeviceType TYPE, bool COPY, typename T>
__inline
static
typename Buffer<T>* BufferUtils::map(const Device* device, const Buffer<T>* in, int copySize)
{
Buffer<T>* native;
ADLASSERT( device->m_type == TYPE );

if( in->getType() == TYPE )
native = (Buffer<T>*)in;
else
{
ADLASSERT( copySize <= in->getSize() );
copySize = (copySize==-1)? in->getSize() : copySize;

native = new Buffer<T>( device, copySize );
if( COPY )
{
if( in->getType() == TYPE_HOST )
native->write( in->m_ptr, copySize );
else if( native->getType() == TYPE_HOST )
{
in->read( native->m_ptr, copySize );
DeviceUtils::waitForCompletion( in->m_device );
}
else
{
T* tmp = new T[copySize];
in->read( tmp, copySize );
DeviceUtils::waitForCompletion( in->m_device );
native->write( tmp, copySize );
DeviceUtils::waitForCompletion( native->m_device );
delete [] tmp;
}
}
}
return native;
}

template<bool COPY, typename T>
__inline
static
void BufferUtils::unmap( Buffer<T>* native, const Buffer<T>* orig, int copySize )
{
if( native != orig )
{
if( COPY )
{
copySize = (copySize==-1)? orig->getSize() : copySize;
ADLASSERT( copySize <= orig->getSize() );
if( orig->getType() == TYPE_HOST )
{
native->read( orig->m_ptr, copySize );
DeviceUtils::waitForCompletion( native->m_device );
}
else if( native->getType() == TYPE_HOST )
{
Buffer<T>* dst = (Buffer<T>*)orig;
dst->write( native->m_ptr, copySize );
DeviceUtils::waitForCompletion( dst->m_device );
}
else
{
T* tmp = new T[copySize];
native->read( tmp, copySize );
DeviceUtils::waitForCompletion( native->m_device );
Buffer<T>* dst = (Buffer<T>*)orig;
dst->write( tmp, copySize );
DeviceUtils::waitForCompletion( dst->m_device );
delete [] tmp;
}
}
delete native;
}
}


template<typename T>
T& HostBuffer<T>::operator[](int idx)
{
return m_ptr[idx];
}

template<typename T>
const T& HostBuffer<T>::operator[](int idx) const
{
return m_ptr[idx];
}

template<typename T>
HostBuffer<T>& HostBuffer<T>::operator = ( const Buffer<T>& device )
{
ADLASSERT( device.m_size <= m_size );

SELECT_DEVICEDATA1( device.m_device, copy( m_ptr, &device, device.m_size ) );

return *this;
}

#undef SELECT_DEVICEDATA

};
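Editor's note: BufferUtils::map/unmap above implement a copy-on-demand bridge between backends: map returns the input buffer unchanged when it already lives on the requested device type, otherwise it allocates a temporary buffer there (staging through a host array when neither side is TYPE_HOST), and unmap copies the results back and deletes the temporary. A hedged sketch of the intended round trip, where clDevice and hostBuf are assumed to exist (a TYPE_CL device and a CPU-side HostBuffer<float>):

// Move (or alias) hostBuf onto the CL device, copying its contents.
adl::Buffer<float>* native = adl::BufferUtils::map<adl::TYPE_CL, true>( clDevice, &hostBuf );

// ... launch kernels that read and write *native on the CL device ...

// Copy the results back into hostBuf and free the temporary CL buffer.
adl::BufferUtils::unmap<true>( native, &hostBuf );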
@@ -0,0 +1,27 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.

This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:

1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada



//ADL_ENABLE_CL and ADL_ENABLE_DX11 can be set in the build system using C/C++ preprocessor defines
//#define ADL_ENABLE_CL
//#define ADL_ENABLE_DX11

//#define ADL_CL_FORCE_UNCACHE_KERNEL
#define ADL_CL_DUMP_MEMORY_LOG

//load the kernels from string instead of loading them from file
#define ADL_LOAD_KERNEL_FROM_STRING
#define ADL_DUMP_DX11_ERROR
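Editor's note: backend selection is purely compile time; nothing probes for OpenCL or DirectX 11 at runtime, so a project that wants the CL path has to define the switch in the build system or before the header is included. A minimal sketch:

// Either as a compiler option, e.g.  /D ADL_ENABLE_CL  (and /D ADL_ENABLE_DX11 if wanted),
// or before the first include of the library:
#define ADL_ENABLE_CL
#include <Adl/Adl.h>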
Extras/RigidBodyGpuPipeline/opencl/primitives/Adl/AdlError.h  (new file, 80 lines)
@@ -0,0 +1,80 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.

This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:

1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada


#ifndef ADL_ERROR_H
#define ADL_ERROR_H

#if defined(ADL_DUMP_DX11_ERROR)
#include <windows.h>
#endif
#ifdef _DEBUG
#include <assert.h>
#include <stdarg.h>
#include <stdio.h>
#endif


namespace adl
{

#ifdef _DEBUG
#define ADLASSERT(x) if(!(x)){__debugbreak(); }
#else
#define ADLASSERT(x) if(x){}
#endif

#ifdef _DEBUG
#define COMPILE_TIME_ASSERT(x) {int compileTimeAssertFailed[x]; compileTimeAssertFailed[0];}
#else
#define COMPILE_TIME_ASSERT(x)
#endif

#ifdef _DEBUG
__inline
void debugPrintf(const char *fmt, ...)
{
va_list arg;
va_start(arg, fmt);
#if defined(ADL_DUMP_DX11_ERROR)
const int size = 1024*10;
char buf[size];
vsprintf_s( buf, size, fmt, arg );
#ifdef UNICODE
WCHAR wbuf[size];
int sizeWide = MultiByteToWideChar(0,0,buf,-1,wbuf,0);
MultiByteToWideChar(0,0,buf,-1,wbuf,sizeWide);

// swprintf_s( wbuf, 256, L"%s", buf );
OutputDebugString( wbuf );
#else
OutputDebugString( buf );
#endif
#else
vprintf(fmt, arg);
#endif
va_end(arg);
}
#else
__inline
void debugPrintf(const char *fmt, ...)
{
}
#endif

};

#endif
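Editor's note: ADLASSERT breaks into the debugger only in _DEBUG builds (and still evaluates its argument in release), while COMPILE_TIME_ASSERT works by declaring a local array whose size must be a positive constant, so it can only be used inside a function body. A small usage sketch; the Contact4 struct is made up purely for illustration:

struct Contact4 { float m_worldPos[4]; int m_bodyA; int m_bodyB; int m_pad[2]; }; // illustrative

void checkHandle( void* ptr )
{
    COMPILE_TIME_ASSERT( sizeof(Contact4) == 32 );   // debug builds refuse to compile if the layout changes
    ADLASSERT( ptr != 0 );                           // __debugbreak() in _DEBUG, no-op otherwise
    adl::debugPrintf( "handle %p ok\n", ptr );       // OutputDebugString when ADL_DUMP_DX11_ERROR is defined
}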
Extras/RigidBodyGpuPipeline/opencl/primitives/Adl/AdlKernel.h  (new file, 142 lines)
@@ -0,0 +1,142 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.

This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:

1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada


#ifndef ADL_KERNEL_H
#define ADL_KERNEL_H

#include <map>
#include <string>
#include <fstream>

namespace adl
{

//==========================
// Kernel
//==========================
struct Kernel
{
DeviceType m_type;
void* m_kernel;
};

//==========================
// KernelManager
//==========================
class KernelManager
{
public:
typedef std::map<std::string, Kernel*> KMap;

__inline
~KernelManager();

__inline
// static
Kernel* query(const Device* dd, const char* fileName, const char* funcName, const char* option = NULL, const char* src = NULL,
bool cacheKernel = true);

public:
KMap m_map;
};

//==========================
// Launcher
//==========================
class Launcher
{
public:
struct BufferInfo
{
BufferInfo(){}
template<typename T>
BufferInfo(Buffer<T>* buff, bool isReadOnly = false): m_buffer(buff), m_isReadOnly(isReadOnly){}

void* m_buffer;
bool m_isReadOnly;
};

__inline
Launcher(const Device* dd, char* fileName, char* funcName, char* option = NULL);
__inline
Launcher(const Device* dd, Kernel* kernel);
__inline
void setBuffers( BufferInfo* buffInfo, int n );
template<typename T>
__inline
void setConst( Buffer<T>& constBuff, const T& consts );
__inline
void launch1D( int numThreads, int localSize = 64 );
__inline
void launch2D( int numThreadsX, int numThreadsY, int localSizeX = 8, int localSizeY = 8 );

public:
enum
{
CONST_BUFFER_SIZE = 512,
};

const Device* m_deviceData;
Kernel* m_kernel;
int m_idx;
int m_idxRw;
};

template<DeviceType TYPE>
class KernelBuilder
{
public:

__inline
KernelBuilder(): m_ptr(0){}

__inline
void setFromFile( const Device* deviceData, const char* fileName, const char* option = NULL, bool addExtension = false,
bool cacheKernel = true);

__inline
void setFromSrc( const Device* deviceData, const char* src, const char* option = NULL );

__inline
void setFromSrcCached( const Device* deviceData, const char* src, const char* fileName, const char* option );


__inline
void createKernel( const char* funcName, Kernel& kernelOut );

__inline
~KernelBuilder();
// todo. implemement in kernel destructor?
__inline
static void deleteKernel( Kernel& kernel );

private:
enum
{
MAX_PATH_LENGTH = 260,
};
const Device* m_deviceData;
#ifdef UNICODE
wchar_t m_path[MAX_PATH_LENGTH];
#else
char m_path[MAX_PATH_LENGTH];
#endif
void* m_ptr;
};

};

#endif //ADL_KERNEL_H
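Editor's note: Launcher is the dispatch front end declared above; buffers are bound through BufferInfo, small constants go through setConst, and launch1D/launch2D pick the grid size. A hedged sketch of a typical dispatch, assuming a device created as in the earlier sketch and a kernel file "Kernels/Copy" with entry point "CopyKernel" (both names are made up; the backend appends .cl or .hlsl itself):

struct ConstData { int m_n; int m_padding[3]; };     // mirrors an assumed constant block in the kernel

void runCopy( adl::Device* device, adl::Buffer<int>& src, adl::Buffer<int>& dst, int n )
{
    using namespace adl;

    Buffer<ConstData> constBuf( device, 1, BufferBase::BUFFER_CONST );

    // Compiles the kernel on first use; the char* casts match the declared constructor signature.
    Launcher launcher( device, (char*)"Kernels/Copy", (char*)"CopyKernel" );

    Launcher::BufferInfo bInfo[] = { Launcher::BufferInfo( &src, true ), Launcher::BufferInfo( &dst ) };
    launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );

    ConstData c; c.m_n = n;
    launcher.setConst( constBuf, c );

    launcher.launch1D( n, 64 );
    DeviceUtils::waitForCompletion( device );
}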
Extras/RigidBodyGpuPipeline/opencl/primitives/Adl/AdlKernel.inl  (new file, 223 lines)
@@ -0,0 +1,223 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.

This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:

1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada



#ifdef ADL_ENABLE_CL
#include <Adl/CL/AdlKernelUtilsCL.inl>
#endif
#ifdef ADL_ENABLE_DX11
#include <Adl/DX11/AdlKernelUtilsDX11.inl>
#endif

namespace adl
{

//==========================
// KernelManager
//==========================
Kernel* KernelManager::query(const Device* dd, const char* fileName, const char* funcName, const char* option, const char* src,
bool cacheKernel)
{
printf("compiling kernel %s",funcName);
const int charSize = 1024*2;
KernelManager* s_kManager = this;

char fullFineName[charSize];
switch( dd->m_type )
{
case TYPE_CL:
#if defined(ADL_ENABLE_CL)
sprintf_s(fullFineName,charSize,"%s.cl", fileName);
break;
#endif
#if defined(ADL_ENABLE_DX11)
case TYPE_DX11:
sprintf_s(fullFineName,charSize,"%s.hlsl", fileName);
break;
#endif
default:
ADLASSERT(0);
break;
};

char mapName[charSize];
{
if( option )
sprintf_s(mapName, charSize, "%d%s%s%s", (int)dd->getContext(), fullFineName, funcName, option);
else
sprintf_s(mapName, charSize, "%d%s%s", (int)dd->getContext(), fullFineName, funcName);
}

std::string str(mapName);

KMap::iterator iter = s_kManager->m_map.find( str );

Kernel* kernelOut;
if( iter == s_kManager->m_map.end() )
{
kernelOut = new Kernel();

switch( dd->m_type )
{
#if defined(ADL_ENABLE_CL)
case TYPE_CL:
{
KernelBuilder<TYPE_CL> builder;
if( src )
if (cacheKernel)
{
builder.setFromSrcCached( dd, src, fileName, option );
} else
{
builder.setFromSrc( dd, src, option );
}
else
builder.setFromFile( dd, fileName, option, true, cacheKernel );
builder.createKernel( funcName, *kernelOut );
}
break;
#endif
#if defined(ADL_ENABLE_DX11)
case TYPE_DX11:
{
KernelBuilder<TYPE_DX11> builder;
if( src )
builder.setFromSrc( dd, src, option );
else
builder.setFromFile( dd, fileName, option, true, cacheKernel );
builder.createKernel( funcName, *kernelOut );
}
break;
#endif
default:
ADLASSERT(0);
break;
};
s_kManager->m_map.insert( KMap::value_type(str,kernelOut) );
}
else
{
kernelOut = iter->second;
}

printf(" ready\n");
return kernelOut;
}

KernelManager::~KernelManager()
{
for(KMap::iterator iter = m_map.begin(); iter != m_map.end(); iter++)
{
Kernel* k = iter->second;
switch( k->m_type )
{
#if defined(ADL_ENABLE_CL)
case TYPE_CL:
KernelBuilder<TYPE_CL>::deleteKernel( *k );
delete k;
break;
#endif
#if defined(ADL_ENABLE_DX11)
case TYPE_DX11:
KernelBuilder<TYPE_DX11>::deleteKernel( *k );
delete k;
break;
#endif
default:
ADLASSERT(0);
break;
};
}
}

//==========================
// Launcher
//==========================

#if defined(ADL_ENABLE_DX11)
#if defined(ADL_ENABLE_CL)
#define SELECT_LAUNCHER( type, func ) \
switch( type ) \
{ \
case TYPE_CL: LauncherCL::func; break; \
case TYPE_DX11: LauncherDX11::func; break; \
default: ADLASSERT(0); break; \
};
#else
#define SELECT_LAUNCHER( type, func ) \
switch( type ) \
{ \
case TYPE_DX11: LauncherDX11::func; break; \
default: ADLASSERT(0); break; \
};
#endif
#else
#if defined(ADL_ENABLE_CL)
#define SELECT_LAUNCHER( type, func ) \
switch( type ) \
{ \
case TYPE_CL: LauncherCL::func; break; \
default: ADLASSERT(0); break; \
};
#else
#define SELECT_LAUNCHER( type, func ) \
switch( type ) \
{ \
default: ADLASSERT(0); break; \
};
#endif
#endif

Launcher::Launcher(const Device *dd, char *fileName, char *funcName, char *option)
{
m_kernel = dd->getKernel( fileName, funcName, option );
m_deviceData = dd;
m_idx = 0;
m_idxRw = 0;
}

Launcher::Launcher(const Device* dd, Kernel* kernel)
{
m_kernel = kernel;
m_deviceData = dd;
m_idx = 0;
m_idxRw = 0;
}

void Launcher::setBuffers( BufferInfo* buffInfo, int n )
{
SELECT_LAUNCHER( m_deviceData->m_type, setBuffers( this, buffInfo, n ) );
}

template<typename T>
void Launcher::setConst( Buffer<T>& constBuff, const T& consts )
{
SELECT_LAUNCHER( m_deviceData->m_type, setConst( this, constBuff, consts ) );
}

void Launcher::launch1D( int numThreads, int localSize )
{
SELECT_LAUNCHER( m_deviceData->m_type, launch2D( this, numThreads, 1, localSize, 1 ) );
}

void Launcher::launch2D( int numThreadsX, int numThreadsY, int localSizeX, int localSizeY )
{
SELECT_LAUNCHER( m_deviceData->m_type, launch2D( this, numThreadsX, numThreadsY, localSizeX, localSizeY ) );
}

#undef SELECT_LAUNCHER

};
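Editor's note: KernelManager::query keys its cache on the device context pointer plus the file name, entry point and build options, so only the first Launcher for a given kernel pays the compile cost. For a kernel dispatched every frame, the Kernel* can also be fetched once through Device::getKernel and reused with the second Launcher constructor, skipping even the map lookup. A hedged sketch, reusing the made-up names, bInfo and n from the previous example:

// Once, at initialization: compile (or pull from the cache).
adl::Kernel* copyKernel = device->getKernel( "Kernels/Copy", "CopyKernel" );

// Per frame: only argument binding and dispatch.
adl::Launcher launcher( device, copyKernel );
launcher.setBuffers( bInfo, 2 );
launcher.launch1D( n );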
@@ -0,0 +1,81 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.

This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:

1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada



#include <windows.h>

namespace adl
{

struct StopwatchBase
{
__inline
StopwatchBase(): m_device(0){}
__inline
StopwatchBase( const Device* deviceData ){ init(deviceData); }
__inline
virtual ~StopwatchBase(){}

__inline
virtual void init( const Device* deviceData ) = 0;
__inline
virtual void start() = 0;
__inline
virtual void split() = 0;
__inline
virtual void stop() = 0;
__inline
virtual float getMs(int index=0) = 0;
__inline
virtual void getMs( float* times, int capacity ) = 0;
__inline
int getNIntervals() const{ return m_idx-1;}

enum
{
CAPACITY = 64,
};

const Device* m_device;
int m_idx;
};

struct Stopwatch
{
__inline
Stopwatch( const Device* deviceData = NULL ) { m_impl=0; if(deviceData) init(deviceData);}
__inline
~Stopwatch();

__inline
void init( const Device* deviceData );
__inline
void start(){if(!m_impl) init(0); m_impl->start();}
__inline
void split(){m_impl->split();}
__inline
void stop(){m_impl->stop();}
__inline
float getMs(){ return m_impl->getMs();}
__inline
void getMs( float* times, int capacity ){m_impl->getMs(times, capacity);}
__inline
int getNIntervals() const{return m_impl->getNIntervals();}

StopwatchBase* m_impl;
};

};
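Editor's note: Stopwatch is a thin wrapper over StopwatchBase; init picks the backend implementation (currently StopwatchHost for every backend, see the next file), split() records intermediate timestamps, and getMs() reports the intervals in milliseconds. A usage sketch, assuming a device pointer as above and assuming stop() closes the last interval; the phase names are illustrative:

adl::Stopwatch sw( device );

sw.start();
// ... broad phase ...
sw.split();
// ... contact generation ...
sw.stop();

float times[2];
sw.getMs( times, 2 );
adl::debugPrintf( "broadphase %3.2fms, contacts %3.2fms\n", times[0], times[1] );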
@@ -0,0 +1,59 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.

This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:

1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada


namespace adl
{

void Stopwatch::init( const Device* deviceData )
{
ADLASSERT( m_impl == 0 );

if( deviceData )
{
switch( deviceData->m_type )
{
#if defined(ADL_ENABLE_CL)
case TYPE_CL:
m_impl = new StopwatchHost;//StopwatchCL
break;
#endif
#if defined(ADL_ENABLE_DX11)
case TYPE_DX11:
m_impl = new StopwatchHost;//StopwatchDX11;
break;
#endif
case TYPE_HOST:
m_impl = new StopwatchHost;
break;
default:
ADLASSERT(0);
break;
};
}
else
{
m_impl = new StopwatchHost;
}
m_impl->init( deviceData );
}

Stopwatch::~Stopwatch()
{
if( m_impl == 0 ) return;
delete m_impl;
}

};
Extras/RigidBodyGpuPipeline/opencl/primitives/Adl/CL/AdlCL.inl  (new file, 384 lines)
@@ -0,0 +1,384 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.

This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:

1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada



#pragma comment(lib,"OpenCL.lib")
#include <CL/cl.h>
#include <CL/cl_ext.h>
#include <CL/cl_platform.h>

namespace adl
{

struct DeviceCL : public Device
{
typedef DeviceUtils::Config Config;


__inline
DeviceCL() : Device( TYPE_CL ), m_kernelManager(0){}
__inline
void* getContext() const { return m_context; }
__inline
void initialize(const Config& cfg);
__inline
void release();

template<typename T>
__inline
void allocate(Buffer<T>* buf, int nElems, BufferBase::BufferType type);

template<typename T>
__inline
void deallocate(Buffer<T>* buf);

template<typename T>
__inline
void copy(Buffer<T>* dst, const Buffer<T>* src, int nElems,int srcOffsetNElems = 0,int dstOffsetNElems = 0);

template<typename T>
__inline
void copy(T* dst, const Buffer<T>* src, int nElems, int srcOffsetNElems = 0);

template<typename T>
__inline
void copy(Buffer<T>* dst, const T* src, int nElems, int dstOffsetNElems = 0);

__inline
void waitForCompletion() const;

__inline
void getDeviceName( char nameOut[128] ) const;

__inline
static
int getNDevices();

__inline
Kernel* getKernel(const char* fileName, const char* funcName, const char* option = NULL, const char* src = NULL, bool cacheKernel = true )const;


enum
{
MAX_NUM_DEVICES = 6,
};

cl_context m_context;
cl_command_queue m_commandQueue;

cl_device_id m_deviceIdx;

KernelManager* m_kernelManager;
};

//===
//===

void DeviceCL::initialize(const Config& cfg)
{
// DeviceUtils::create( cfg, (DeviceCL*)this );
{
// dd = new DeviceCL();

DeviceCL* deviceData = (DeviceCL*)this;

// cl_device_type deviceType = (driverType == DRIVER_HARDWARE)? CL_DEVICE_TYPE_GPU:CL_DEVICE_TYPE_CPU;
cl_device_type deviceType = (cfg.m_type== Config::DEVICE_GPU)? CL_DEVICE_TYPE_GPU: CL_DEVICE_TYPE_CPU;
// int numContextQueuePairsToCreate = 1;
bool enableProfiling = false;
#ifdef _DEBUG
enableProfiling = true;
#endif
cl_int status;

cl_platform_id platform;
{
cl_uint nPlatforms = 0;
status = clGetPlatformIDs(0, NULL, &nPlatforms);
ADLASSERT( status == CL_SUCCESS );

cl_platform_id pIdx[5];
status = clGetPlatformIDs(nPlatforms, pIdx, NULL);
ADLASSERT( status == CL_SUCCESS );

cl_uint atiIdx = -1;
cl_uint intelIdx = -1;
cl_uint nvIdx = -1;

for(cl_uint i=0; i<nPlatforms; i++)
{
char buff[512];
status = clGetPlatformInfo( pIdx[i], CL_PLATFORM_VENDOR, 512, buff, 0 );
ADLASSERT( status == CL_SUCCESS );

//skip the platform if there are no devices available
cl_uint numDevice;
status = clGetDeviceIDs( pIdx[i], deviceType, 0, NULL, &numDevice );
if (numDevice>0)
{
if( strcmp( buff, "NVIDIA Corporation" )==0 ) nvIdx = i;
if( strcmp( buff, "Advanced Micro Devices, Inc." )==0 ) atiIdx = i;
if( strcmp( buff, "Intel(R) Corporation" )==0 ) intelIdx = i;
}
}

if( deviceType == CL_DEVICE_TYPE_GPU )
{
switch( cfg.m_vendor )
{
case DeviceUtils::Config::VD_AMD:
if( atiIdx == -1 && nvIdx != -1 ) goto USE_NV_GPU;
USE_AMD_GPU:
ADLASSERT(atiIdx != -1 );
platform = pIdx[atiIdx];
break;
case DeviceUtils::Config::VD_NV:
if( atiIdx != -1 && nvIdx == -1 ) goto USE_AMD_GPU;
USE_NV_GPU:
ADLASSERT(nvIdx != -1 );
platform = pIdx[nvIdx];
break;
default:
ADLASSERT(0);
break;
};
}
else if( deviceType == CL_DEVICE_TYPE_CPU )
{
switch( cfg.m_vendor )
{
case DeviceUtils::Config::VD_AMD:
ADLASSERT(atiIdx != -1 );
platform = pIdx[atiIdx];
break;
case DeviceUtils::Config::VD_INTEL:
ADLASSERT(intelIdx != -1 );
platform = pIdx[intelIdx];
break;
default:
ADLASSERT(0);
break;
};
}
}

cl_uint numDevice;
status = clGetDeviceIDs( platform, deviceType, 0, NULL, &numDevice );

// ADLASSERT( cfg.m_deviceIdx < (int)numDevice );

debugPrintf("CL: %d %s Devices ", numDevice, (deviceType==CL_DEVICE_TYPE_GPU)? "GPU":"CPU");

// numContextQueuePairsToCreate = min( (int)numDevice, numContextQueuePairsToCreate );
// numContextQueuePairsToCreate = ( (int)numDevice < numContextQueuePairsToCreate )? numDevice : numContextQueuePairsToCreate;

cl_device_id deviceIds[ MAX_NUM_DEVICES ];

status = clGetDeviceIDs( platform, deviceType, numDevice, deviceIds, NULL );
ADLASSERT( status == CL_SUCCESS );

{ int i = min( (int)numDevice-1, cfg.m_deviceIdx );
m_deviceIdx = deviceIds[i];
deviceData->m_context = clCreateContext( NULL, 1, &deviceData->m_deviceIdx, NULL, NULL, &status );
ADLASSERT( status == CL_SUCCESS );

char buff[512];
status = clGetDeviceInfo( deviceData->m_deviceIdx, CL_DEVICE_NAME, sizeof(buff), &buff, NULL );
ADLASSERT( status == CL_SUCCESS );

debugPrintf("[%s]\n", buff);

deviceData->m_commandQueue = clCreateCommandQueue( deviceData->m_context, deviceData->m_deviceIdx, (enableProfiling)?CL_QUEUE_PROFILING_ENABLE:NULL, NULL );

ADLASSERT( status == CL_SUCCESS );

// status = clSetCommandQueueProperty( commandQueue, CL_QUEUE_PROFILING_ENABLE, CL_TRUE, 0 );
// CLASSERT( status == CL_SUCCESS );

if(0)
{
cl_bool image_support;
clGetDeviceInfo(deviceData->m_deviceIdx, CL_DEVICE_IMAGE_SUPPORT, sizeof(image_support), &image_support, NULL);
debugPrintf(" CL_DEVICE_IMAGE_SUPPORT : %s\n", image_support?"Yes":"No");
}
}
}

m_kernelManager = new KernelManager;
}

void DeviceCL::release()
{
clReleaseCommandQueue( m_commandQueue );
clReleaseContext( m_context );

if( m_kernelManager ) delete m_kernelManager;
}

template<typename T>
void DeviceCL::allocate(Buffer<T>* buf, int nElems, BufferBase::BufferType type)
{
buf->m_device = this;
buf->m_size = nElems;
buf->m_ptr = 0;

if( type == BufferBase::BUFFER_CONST ) return;

#if defined(ADL_CL_DUMP_MEMORY_LOG)
char deviceName[256];
getDeviceName( deviceName );
printf( "adlCLMemoryLog %s : %3.2fMB Allocation: %3.2fKB ", deviceName, m_memoryUsage/1024.f/1024.f, sizeof(T)*nElems/1024.f );
fflush( stdout );
#endif

int sz=sizeof(T)*nElems;

cl_int status = 0;
if( type == BufferBase::BUFFER_ZERO_COPY )
buf->m_ptr = (T*)clCreateBuffer( m_context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sz, 0, &status );
else if( type == BufferBase::BUFFER_RAW )
buf->m_ptr = (T*)clCreateBuffer( m_context, CL_MEM_WRITE_ONLY, sz, 0, &status );
else
buf->m_ptr = (T*)clCreateBuffer( m_context, CL_MEM_READ_WRITE, sz, 0, &status );

m_memoryUsage += buf->m_size*sizeof(T);
#if defined(ADL_CL_DUMP_MEMORY_LOG)
printf( "%s\n", (status==CL_SUCCESS)? "Succeed": "Failed" );
fflush( stdout );
#endif
ADLASSERT( status == CL_SUCCESS );
}

template<typename T>
void DeviceCL::deallocate(Buffer<T>* buf)
{
if( buf->m_ptr )
{
m_memoryUsage -= buf->m_size*sizeof(T);
clReleaseMemObject( (cl_mem)buf->m_ptr );
}
buf->m_device = 0;
buf->m_size = 0;
buf->m_ptr = 0;
}

template<typename T>
void DeviceCL::copy(Buffer<T>* dst, const Buffer<T>* src, int nElems,int srcOffsetNElems,int dstOffsetNElems )
{
if( dst->m_device->m_type == TYPE_CL && src->m_device->m_type == TYPE_CL )
{
cl_int status = 0;
status = clEnqueueCopyBuffer( m_commandQueue, (cl_mem)src->m_ptr, (cl_mem)dst->m_ptr, sizeof(T)*srcOffsetNElems, sizeof(T)*dstOffsetNElems, sizeof(T)*nElems, 0, 0, 0 );
ADLASSERT( status == CL_SUCCESS );
}
else if( src->m_device->m_type == TYPE_HOST )
{
ADLASSERT( dst->getType() == TYPE_CL );
dst->write( src->m_ptr, nElems );
}
else if( dst->m_device->m_type == TYPE_HOST )
{
ADLASSERT( src->getType() == TYPE_CL );
src->read( dst->m_ptr, nElems );
}
else
{
ADLASSERT( 0 );
}
}

template<typename T>
void DeviceCL::copy(T* dst, const Buffer<T>* src, int nElems, int srcOffsetNElems )
{
cl_int status = 0;
status = clEnqueueReadBuffer( m_commandQueue, (cl_mem)src->m_ptr, 0, sizeof(T)*srcOffsetNElems, sizeof(T)*nElems,
dst, 0,0,0 );
ADLASSERT( status == CL_SUCCESS );
}

template<typename T>
void DeviceCL::copy(Buffer<T>* dst, const T* src, int nElems, int dstOffsetNElems )
{
cl_int status = 0;
int sz=sizeof(T)*nElems;
status = clEnqueueWriteBuffer( m_commandQueue, (cl_mem)dst->m_ptr, 0, sizeof(T)*dstOffsetNElems, sz,
src, 0,0,0 );
ADLASSERT( status == CL_SUCCESS );
}

void DeviceCL::waitForCompletion() const
{
clFinish( m_commandQueue );
}

int DeviceCL::getNDevices()
{
cl_device_type deviceType = CL_DEVICE_TYPE_GPU;
cl_int status;

cl_platform_id platform;
{
cl_uint nPlatforms = 0;
status = clGetPlatformIDs(0, NULL, &nPlatforms);
ADLASSERT( status == CL_SUCCESS );

cl_platform_id pIdx[5];
status = clGetPlatformIDs(nPlatforms, pIdx, NULL);
ADLASSERT( status == CL_SUCCESS );

cl_uint nvIdx = -1;
cl_uint atiIdx = -1;
for(cl_uint i=0; i<nPlatforms; i++)
{
char buff[512];
status = clGetPlatformInfo( pIdx[i], CL_PLATFORM_VENDOR, 512, buff, 0 );
ADLASSERT( status == CL_SUCCESS );

if( strcmp( buff, "NVIDIA Corporation" )==0 ) nvIdx = i;
if( strcmp( buff, "Advanced Micro Devices, Inc." )==0 ) atiIdx = i;
}

if( deviceType == CL_DEVICE_TYPE_GPU )
{
if( nvIdx != -1 ) platform = pIdx[nvIdx];
else platform = pIdx[atiIdx];
}
else if( deviceType == CL_DEVICE_TYPE_CPU )
{
platform = pIdx[atiIdx];
}
}

cl_uint numDevice;
status = clGetDeviceIDs( platform, deviceType, 0, NULL, &numDevice );
ADLASSERT( status == CL_SUCCESS );

return numDevice;
}

void DeviceCL::getDeviceName( char nameOut[128] ) const
{
cl_int status;
status = clGetDeviceInfo( m_deviceIdx, CL_DEVICE_NAME, sizeof(char)*128, nameOut, NULL );
ADLASSERT( status == CL_SUCCESS );
}

Kernel* DeviceCL::getKernel(const char* fileName, const char* funcName, const char* option, const char* src, bool cacheKernel )const
{
return m_kernelManager->query( this, fileName, funcName, option, src, cacheKernel );
}

};
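Editor's note: on the OpenCL backend the Config fields actually matter: m_type selects CL_DEVICE_TYPE_GPU or CL_DEVICE_TYPE_CPU, m_vendor picks the platform by its CL_PLATFORM_VENDOR string (falling back to the other GPU vendor when the requested one has no devices), and m_deviceIdx is clamped to the last device found. A sketch of explicitly creating an NVIDIA GPU device, assuming the project was built with ADL_ENABLE_CL:

adl::DeviceUtils::Config cfg;
cfg.m_type = adl::DeviceUtils::Config::DEVICE_GPU;
cfg.m_vendor = adl::DeviceUtils::Config::VD_NV;
cfg.m_deviceIdx = 0;                                   // clamped to numDevice-1 internally

adl::Device* clDevice = adl::DeviceUtils::allocate( adl::TYPE_CL, cfg );

char name[128];
clDevice->getDeviceName( name );
adl::debugPrintf( "running on %s\n", name );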
@@ -0,0 +1,541 @@
|
||||
/*
|
||||
Copyright (c) 2012 Advanced Micro Devices, Inc.
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
//Originally written by Takahiro Harada
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
namespace adl
|
||||
{
|
||||
|
||||
struct KernelCL : public Kernel
|
||||
{
|
||||
cl_kernel& getKernel() { return (cl_kernel&)m_kernel; }
|
||||
};
|
||||
|
||||
static const char* strip(const char* name, const char* pattern)
|
||||
{
|
||||
size_t const patlen = strlen(pattern);
|
||||
size_t patcnt = 0;
|
||||
const char * oriptr;
|
||||
const char * patloc;
|
||||
// find how many times the pattern occurs in the original string
|
||||
for (oriptr = name; patloc = strstr(oriptr, pattern); oriptr = patloc + patlen)
|
||||
{
|
||||
patcnt++;
|
||||
}
|
||||
return oriptr;
|
||||
}
|
||||
|
||||
static bool isFileUpToDate(const char* binaryFileName,const char* srcFileName)
|
||||
|
||||
{
|
||||
bool fileUpToDate = false;
|
||||
|
||||
bool binaryFileValid=false;
|
||||
FILETIME modtimeBinary;
|
||||
|
||||
int nameLength = (int)strlen(binaryFileName)+1;
|
||||
#ifdef UNICODE
|
||||
WCHAR* fName = new WCHAR[nameLength];
|
||||
MultiByteToWideChar(CP_ACP,0,binaryFileName,-1, fName, nameLength);
|
||||
HANDLE binaryFileHandle = CreateFile(fName,GENERIC_READ,0,0,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,0);
|
||||
delete [] fName;
|
||||
#else
|
||||
HANDLE binaryFileHandle = CreateFile(binaryFileName,GENERIC_READ,0,0,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,0);
|
||||
#endif
|
||||
if (binaryFileHandle ==INVALID_HANDLE_VALUE)
|
||||
{
|
||||
DWORD errorCode;
|
||||
errorCode = GetLastError();
|
||||
switch (errorCode)
|
||||
{
|
||||
case ERROR_FILE_NOT_FOUND:
|
||||
{
|
||||
debugPrintf("\nCached file not found %s\n", binaryFileName);
|
||||
break;
|
||||
}
|
||||
case ERROR_PATH_NOT_FOUND:
|
||||
{
|
||||
debugPrintf("\nCached file path not found %s\n", binaryFileName);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
{
|
||||
debugPrintf("\nFailed reading cached file with errorCode = %d\n", errorCode);
|
||||
}
|
||||
}
|
||||
} else
|
||||
{
|
||||
if (GetFileTime(binaryFileHandle, NULL, NULL, &modtimeBinary)==0)
|
||||
{
|
||||
DWORD errorCode;
|
||||
errorCode = GetLastError();
|
||||
debugPrintf("\nGetFileTime errorCode = %d\n", errorCode);
|
||||
} else
|
||||
{
|
||||
binaryFileValid = true;
|
||||
}
|
||||
CloseHandle(binaryFileHandle);
|
||||
}
|
||||
|
||||
if (binaryFileValid)
|
||||
{
|
||||
#ifdef UNICODE
|
||||
int nameLength = (int)strlen(srcFileName)+1;
|
||||
WCHAR* fName = new WCHAR[nameLength];
|
||||
MultiByteToWideChar(CP_ACP,0,srcFileName,-1, fName, nameLength);
|
||||
HANDLE srcFileHandle = CreateFile(fName,GENERIC_READ,0,0,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,0);
|
||||
delete [] fName;
|
||||
#else
|
||||
HANDLE srcFileHandle = CreateFile(srcFileName,GENERIC_READ,0,0,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,0);
|
||||
#endif
|
||||
if (srcFileHandle!=INVALID_HANDLE_VALUE)
|
||||
{
|
||||
FILETIME modtimeSrc;
|
||||
if (GetFileTime(srcFileHandle, NULL, NULL, &modtimeSrc)==0)
|
||||
{
|
||||
DWORD errorCode;
|
||||
errorCode = GetLastError();
|
||||
debugPrintf("\nGetFileTime errorCode = %d\n", errorCode);
|
||||
}
|
||||
if ( ( modtimeSrc.dwHighDateTime < modtimeBinary.dwHighDateTime)
|
||||
||(( modtimeSrc.dwHighDateTime == modtimeBinary.dwHighDateTime)&&(modtimeSrc.dwLowDateTime <= modtimeBinary.dwLowDateTime)))
|
||||
{
|
||||
fileUpToDate=true;
|
||||
} else
|
||||
{
|
||||
debugPrintf("\nCached binary file found (%s), but out-of-date\n",binaryFileName);
|
||||
}
|
||||
CloseHandle(srcFileHandle);
|
||||
}
|
||||
else
|
||||
{
|
||||
#ifdef _DEBUG
|
||||
DWORD errorCode;
|
||||
errorCode = GetLastError();
|
||||
switch (errorCode)
|
||||
{
|
||||
case ERROR_FILE_NOT_FOUND:
|
||||
{
|
||||
debugPrintf("\nSrc file not found %s\n", srcFileName);
|
||||
break;
|
||||
}
|
||||
case ERROR_PATH_NOT_FOUND:
|
||||
{
|
||||
debugPrintf("\nSrc path not found %s\n", srcFileName);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
{
|
||||
debugPrintf("\nnSrc file reading errorCode = %d\n", errorCode);
}
}
ADLASSERT(0);
#else
//if we cannot find the source, assume it is OK in release builds
fileUpToDate = true;
#endif
}
}

return fileUpToDate;
}

template<>
void KernelBuilder<TYPE_CL>::setFromFile( const Device* deviceData, const char* fileName, const char* option, bool addExtension,
bool cacheKernel)
{
m_deviceData = deviceData;

char fileNameWithExtension[256];

if( addExtension )
sprintf_s( fileNameWithExtension, "%s.cl", fileName );
else
sprintf_s( fileNameWithExtension, "%s", fileName );

class File
{
public:
__inline
bool open(const char* fileNameWithExtension)
{
size_t size;
char* str;

// Open file stream
std::fstream f(fileNameWithExtension, (std::fstream::in | std::fstream::binary));

// Check if we have opened file stream
if (f.is_open()) {
size_t sizeFile;
// Find the stream size
f.seekg(0, std::fstream::end);
size = sizeFile = (size_t)f.tellg();
f.seekg(0, std::fstream::beg);

str = new char[size + 1];
if (!str) {
f.close();
return false;
}

// Read file
f.read(str, sizeFile);
f.close();
str[size] = '\0';

m_source = str;

delete[] str;

return true;
}

return false;
}
const std::string& getSource() const {return m_source;}

private:
std::string m_source;
};

cl_program& program = (cl_program&)m_ptr;
cl_int status = 0;

bool cacheBinary = cacheKernel;
#if defined(ADL_CL_FORCE_UNCACHE_KERNEL)
cacheBinary = false;
#endif

char binaryFileName[512];
{
char deviceName[256];
deviceData->getDeviceName(deviceName);
char driverVersion[256];
const DeviceCL* dd = (const DeviceCL*) deviceData;
clGetDeviceInfo(dd->m_deviceIdx, CL_DRIVER_VERSION, 256, &driverVersion, NULL);
const char* strippedFileName = strip(fileName,"\\");
strippedFileName = strip(strippedFileName,"/");

sprintf_s(binaryFileName,"cache/%s.%s.%s.bin",strippedFileName, deviceName,driverVersion );
}

bool upToDate = isFileUpToDate(binaryFileName,fileNameWithExtension);

if( cacheBinary && upToDate)
{
FILE* file = fopen(binaryFileName, "rb");

if( file )
{
fseek( file, 0L, SEEK_END );
size_t binarySize = ftell( file );

rewind( file );
char* binary = new char[binarySize];
fread( binary, sizeof(char), binarySize, file );
fclose( file );

if (binarySize)
{
const DeviceCL* dd = (const DeviceCL*) deviceData;
program = clCreateProgramWithBinary( dd->m_context, 1, &dd->m_deviceIdx, &binarySize, (const unsigned char**)&binary, 0, &status );
ADLASSERT( status == CL_SUCCESS );
status = clBuildProgram( program, 1, &dd->m_deviceIdx, option, 0, 0 );
ADLASSERT( status == CL_SUCCESS );
if( status != CL_SUCCESS )
{
char *build_log;
size_t ret_val_size;
clGetProgramBuildInfo(program, dd->m_deviceIdx, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
build_log = new char[ret_val_size+1];
clGetProgramBuildInfo(program, dd->m_deviceIdx, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);

build_log[ret_val_size] = '\0';

debugPrintf("%s\n", build_log);

delete [] build_log;
ADLASSERT(0);
}

}
}
}
if( !m_ptr )
{
File kernelFile;
ADLASSERT( kernelFile.open( fileNameWithExtension ) );
const char* source = kernelFile.getSource().c_str();
setFromSrc( m_deviceData, source, option );

if( cacheBinary )
{ // write to binary
size_t binarySize;
status = clGetProgramInfo( program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binarySize, 0 );
ADLASSERT( status == CL_SUCCESS );

char* binary = new char[binarySize];

status = clGetProgramInfo( program, CL_PROGRAM_BINARIES, sizeof(char*), &binary, 0 );
ADLASSERT( status == CL_SUCCESS );

{
FILE* file = fopen(binaryFileName, "wb");
if (file)
{
fwrite( binary, sizeof(char), binarySize, file );
fclose( file );
}
}

delete [] binary;
}
}
}

template<>
void KernelBuilder<TYPE_CL>::setFromSrcCached( const Device* deviceData, const char* src, const char* fileName, const char* option )
{
m_deviceData = deviceData;

bool cacheBinary = true;
cl_program& program = (cl_program&)m_ptr;
cl_int status = 0;

char binaryFileName[512];
{
char deviceName[256];
deviceData->getDeviceName(deviceName);
char driverVersion[256];
const DeviceCL* dd = (const DeviceCL*) deviceData;
clGetDeviceInfo(dd->m_deviceIdx, CL_DRIVER_VERSION, 256, &driverVersion, NULL);

const char* strippedFileName = strip(fileName,"\\");
strippedFileName = strip(strippedFileName,"/");

sprintf_s(binaryFileName,"cache/%s.%s.%s.bin",strippedFileName, deviceName,driverVersion );
}

char fileNameWithExtension[256];
sprintf_s(fileNameWithExtension,"%s.cl",fileName);

bool upToDate = isFileUpToDate(binaryFileName,fileNameWithExtension);

if( cacheBinary )
{

bool fileUpToDate = isFileUpToDate(binaryFileName,fileNameWithExtension);

if( fileUpToDate)
{
FILE* file = fopen(binaryFileName, "rb");
if (file)
{
fseek( file, 0L, SEEK_END );
size_t binarySize = ftell( file );
rewind( file );
char* binary = new char[binarySize];
fread( binary, sizeof(char), binarySize, file );
fclose( file );

const DeviceCL* dd = (const DeviceCL*) deviceData;
program = clCreateProgramWithBinary( dd->m_context, 1, &dd->m_deviceIdx, &binarySize, (const unsigned char**)&binary, 0, &status );
ADLASSERT( status == CL_SUCCESS );
status = clBuildProgram( program, 1, &dd->m_deviceIdx, option, 0, 0 );
ADLASSERT( status == CL_SUCCESS );

if( status != CL_SUCCESS )
{
char *build_log;
size_t ret_val_size;
clGetProgramBuildInfo(program, dd->m_deviceIdx, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
build_log = new char[ret_val_size+1];
clGetProgramBuildInfo(program, dd->m_deviceIdx, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);

build_log[ret_val_size] = '\0';

debugPrintf("%s\n", build_log);

delete [] build_log;
ADLASSERT(0);
}
delete[] binary;
}
}
}

if( !m_ptr )
{

setFromSrc( deviceData, src, option );

if( cacheBinary )
{ // write to binary
cl_uint numAssociatedDevices;
status = clGetProgramInfo( program, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &numAssociatedDevices, 0 );
ADLASSERT( status == CL_SUCCESS );
if (numAssociatedDevices==1)
{

size_t binarySize;
status = clGetProgramInfo( program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binarySize, 0 );
ADLASSERT( status == CL_SUCCESS );

char* binary = new char[binarySize];

status = clGetProgramInfo( program, CL_PROGRAM_BINARIES, sizeof(char*), &binary, 0 );
ADLASSERT( status == CL_SUCCESS );

{
FILE* file = fopen(binaryFileName, "wb");
if (file)
{
fwrite( binary, sizeof(char), binarySize, file );
fclose( file );
}
}

delete [] binary;
}
}
}
}

template<>
void KernelBuilder<TYPE_CL>::setFromSrc( const Device* deviceData, const char* src, const char* option )
{
ADLASSERT( deviceData->m_type == TYPE_CL );
m_deviceData = deviceData;
const DeviceCL* dd = (const DeviceCL*) deviceData;

cl_program& program = (cl_program&)m_ptr;
cl_int status = 0;
size_t srcSize[] = {strlen( src )};
program = clCreateProgramWithSource( dd->m_context, 1, &src, srcSize, &status );
ADLASSERT( status == CL_SUCCESS );
status = clBuildProgram( program, 1, &dd->m_deviceIdx, option, NULL, NULL );
if( status != CL_SUCCESS )
{
char *build_log;
size_t ret_val_size;
clGetProgramBuildInfo(program, dd->m_deviceIdx, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
build_log = new char[ret_val_size+1];
clGetProgramBuildInfo(program, dd->m_deviceIdx, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);

build_log[ret_val_size] = '\0';

debugPrintf("%s\n", build_log);
printf("%s\n", build_log);

ADLASSERT(0);
delete [] build_log;

}
}

template<>
KernelBuilder<TYPE_CL>::~KernelBuilder()
{
cl_program program = (cl_program)m_ptr;
clReleaseProgram( program );
}

template<>
void KernelBuilder<TYPE_CL>::createKernel( const char* funcName, Kernel& kernelOut )
{
KernelCL* clKernel = (KernelCL*)&kernelOut;

cl_program program = (cl_program)m_ptr;
cl_int status = 0;
clKernel->getKernel() = clCreateKernel(program, funcName, &status );
ADLASSERT( status == CL_SUCCESS );

kernelOut.m_type = TYPE_CL;
}

template<>
void KernelBuilder<TYPE_CL>::deleteKernel( Kernel& kernel )
{
KernelCL* clKernel = (KernelCL*)&kernel;
clReleaseKernel( clKernel->getKernel() );
}

class LauncherCL
{
public:
typedef Launcher::BufferInfo BufferInfo;

__inline
static void setBuffers( Launcher* launcher, BufferInfo* buffInfo, int n );
template<typename T>
__inline
static void setConst( Launcher* launcher, Buffer<T>& constBuff, const T& consts );
__inline
static void launch2D( Launcher* launcher, int numThreadsX, int numThreadsY, int localSizeX, int localSizeY );
};

void LauncherCL::setBuffers( Launcher* launcher, BufferInfo* buffInfo, int n )
{
KernelCL* clKernel = (KernelCL*)launcher->m_kernel;
for(int i=0; i<n; i++)
{
Buffer<int>* buff = (Buffer<int>*)buffInfo[i].m_buffer;
cl_int status = clSetKernelArg( clKernel->getKernel(), launcher->m_idx++, sizeof(cl_mem), &buff->m_ptr );
ADLASSERT( status == CL_SUCCESS );
}
}

template<typename T>
void LauncherCL::setConst( Launcher* launcher, Buffer<T>& constBuff, const T& consts )
{
KernelCL* clKernel = (KernelCL*)launcher->m_kernel;
int sz=sizeof(T);
cl_int status = clSetKernelArg( clKernel->getKernel(), launcher->m_idx++, sz, &consts );
ADLASSERT( status == CL_SUCCESS );
}

void LauncherCL::launch2D( Launcher* launcher, int numThreadsX, int numThreadsY, int localSizeX, int localSizeY )
{
KernelCL* clKernel = (KernelCL*)launcher->m_kernel;
const DeviceCL* ddcl = (const DeviceCL*)launcher->m_deviceData;
size_t gRange[3] = {1,1,1};
size_t lRange[3] = {1,1,1};
lRange[0] = localSizeX;
lRange[1] = localSizeY;
gRange[0] = max((size_t)1, (numThreadsX/lRange[0])+(!(numThreadsX%lRange[0])?0:1));
gRange[0] *= lRange[0];
gRange[1] = max((size_t)1, (numThreadsY/lRange[1])+(!(numThreadsY%lRange[1])?0:1));
gRange[1] *= lRange[1];

cl_int status = clEnqueueNDRangeKernel( ddcl->m_commandQueue,
clKernel->getKernel(), 2, NULL, gRange, lRange, 0,0,0 );
ADLASSERT( status == CL_SUCCESS );
}

};
@@ -0,0 +1,512 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.

This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:

1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada

#include <windows.h>
#include <d3d11.h>
#include <d3dx11.h>
#include <d3dcompiler.h>
#include <DXGI.h>
#pragma comment(lib,"d3dx11.lib")
#pragma comment(lib,"d3d11.lib")
#pragma comment(lib,"DXGI.lib")

namespace adl
{

#define u32 unsigned int

struct DeviceDX11 : public Device
{
typedef DeviceUtils::Config Config;

__inline
DeviceDX11() : Device( TYPE_DX11 ), m_kernelManager(0){}
__inline
void* getContext() const { return m_context; }
__inline
void initialize(const Config& cfg);
__inline
void release();

template<typename T>
__inline
void allocate(Buffer<T>* buf, int nElems, BufferBase::BufferType type);

template<typename T>
__inline
void deallocate(Buffer<T>* buf);

template<typename T>
__inline
void copy(Buffer<T>* dst, const Buffer<T>* src, int nElems);

template<typename T>
__inline
void copy(T* dst, const Buffer<T>* src, int nElems, int srcOffsetNElems = 0);

template<typename T>
__inline
void copy(Buffer<T>* dst, const T* src, int nElems, int dstOffsetNElems = 0);

__inline
void waitForCompletion() const;

__inline
void getDeviceName( char nameOut[128] ) const;

__inline
static
int getNDevices();

__inline
Kernel* getKernel(const char* fileName, const char* funcName, const char* option = NULL, const char* src = NULL, bool cacheKernel = true )const;

ID3D11DeviceContext* m_context;
ID3D11Device* m_device;
IDXGISwapChain* m_swapChain;

KernelManager* m_kernelManager;
};

template<typename T>
struct BufferDX11 : public Buffer<T>
{
ID3D11Buffer* getBuffer() { return (ID3D11Buffer*)m_ptr; }
ID3D11UnorderedAccessView* getUAV() { return (ID3D11UnorderedAccessView*)m_uav; }
ID3D11ShaderResourceView* getSRV() { return (ID3D11ShaderResourceView*)m_srv; }

ID3D11Buffer** getBufferPtr() { return (ID3D11Buffer**)&m_ptr; }
ID3D11UnorderedAccessView** getUAVPtr() { return (ID3D11UnorderedAccessView**)&m_uav; }
ID3D11ShaderResourceView** getSRVPtr() { return (ID3D11ShaderResourceView**)&m_srv; }
};

#define SAFE_RELEASE(p) { if(p) { (p)->Release(); (p)=NULL; } }

void DeviceDX11::initialize(const Config& cfg)
{
DeviceDX11* deviceData = this;

HRESULT hr = S_OK;
UINT createDeviceFlg = 0;
#ifdef _DEBUG
createDeviceFlg |= D3D11_CREATE_DEVICE_DEBUG;
#endif
D3D_FEATURE_LEVEL fl[] = {
D3D_FEATURE_LEVEL_11_0,
D3D_FEATURE_LEVEL_10_1,
D3D_FEATURE_LEVEL_10_0
};

typedef HRESULT (WINAPI * LPD3D11CREATEDEVICE)( IDXGIAdapter*, D3D_DRIVER_TYPE, HMODULE, u32, D3D_FEATURE_LEVEL*, UINT, u32, ID3D11Device**, D3D_FEATURE_LEVEL*, ID3D11DeviceContext** );

HMODULE moduleD3D11 = 0;
#ifdef UNICODE
moduleD3D11 = LoadLibrary( L"d3d11.dll" );
#else
moduleD3D11 = LoadLibrary( "d3d11.dll" );
#endif
ADLASSERT( moduleD3D11 );

LPD3D11CREATEDEVICE _DynamicD3D11CreateDevice;
_DynamicD3D11CreateDevice = ( LPD3D11CREATEDEVICE )GetProcAddress( moduleD3D11, "D3D11CreateDevice" );

D3D_DRIVER_TYPE type = D3D_DRIVER_TYPE_HARDWARE;
// http://msdn.microsoft.com/en-us/library/ff476082(v=VS.85).aspx
// If you set the pAdapter parameter to a non-NULL value, you must also set the DriverType parameter to the D3D_DRIVER_TYPE_UNKNOWN value. If you set the pAdapter parameter to a non-NULL value and the DriverType parameter to the D3D_DRIVER_TYPE_HARDWARE value, D3D11CreateDevice returns an HRESULT of E_INVALIDARG.
type = D3D_DRIVER_TYPE_UNKNOWN;
/*
// Create a hardware Direct3D 11 device
hr = _DynamicD3D11CreateDevice( NULL,
type, NULL, createDeviceFlg,
fl, _countof(fl), D3D11_SDK_VERSION, &deviceData->m_device, NULL, &deviceData->m_context );
*/
IDXGIAdapter* adapter = NULL;
{// get adapter of the index
IDXGIFactory* factory = NULL;
int targetAdapterIdx = cfg.m_deviceIdx;//min( cfg.m_deviceIdx, getNDevices()-1 );
CreateDXGIFactory( __uuidof(IDXGIFactory), (void**)&factory );

u32 i = 0;
while( factory->EnumAdapters( i, &adapter ) != DXGI_ERROR_NOT_FOUND )
{
if( i== targetAdapterIdx ) break;
i++;
}
factory->Release();
}

// Create a hardware Direct3D 11 device
hr = D3D11CreateDevice( adapter,
type,
NULL, createDeviceFlg,
fl, _countof(fl), D3D11_SDK_VERSION, &deviceData->m_device, NULL, &deviceData->m_context );

ADLASSERT( hr == S_OK );

// Check if the hardware device supports Compute Shader 4.0
D3D11_FEATURE_DATA_D3D10_X_HARDWARE_OPTIONS hwopts;
deviceData->m_device->CheckFeatureSupport(D3D11_FEATURE_D3D10_X_HARDWARE_OPTIONS, &hwopts, sizeof(hwopts));

if( !hwopts.ComputeShaders_Plus_RawAndStructuredBuffers_Via_Shader_4_x )
{
SAFE_RELEASE( deviceData->m_context );
SAFE_RELEASE( deviceData->m_device );

debugPrintf("DX11 GPU is not present\n");
ADLASSERT( 0 );
}

m_kernelManager = new KernelManager;
}

void DeviceDX11::release()
{
SAFE_RELEASE( m_context );
SAFE_RELEASE( m_device );

if( m_kernelManager ) delete m_kernelManager;
}

template<typename T>
void DeviceDX11::allocate(Buffer<T>* buf, int nElems, BufferBase::BufferType type)
{
ADLASSERT( type != BufferBase::BUFFER_ZERO_COPY );

DeviceDX11* deviceData = this;
buf->m_device = deviceData;
buf->m_size = nElems;
BufferDX11<T>* dBuf = (BufferDX11<T>*)buf;

// if( type & BufferBase::BUFFER )
{
HRESULT hr = S_OK;

if( type == BufferBase::BUFFER_CONST )
{
ADLASSERT( nElems == 1 );
D3D11_BUFFER_DESC constant_buffer_desc;
ZeroMemory( &constant_buffer_desc, sizeof(constant_buffer_desc) );
// constant_buffer_desc.ByteWidth = NEXTMULTIPLEOF( sizeof(T), 16 );
constant_buffer_desc.ByteWidth = (((sizeof(T))/(16) + (((sizeof(T))%(16)==0)?0:1))*(16));
// constant_buffer_desc.Usage = D3D11_USAGE_DYNAMIC;
// constant_buffer_desc.BindFlags = D3D11_BIND_CONSTANT_BUFFER;
// constant_buffer_desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE;
constant_buffer_desc.Usage = D3D11_USAGE_DEFAULT;
constant_buffer_desc.BindFlags = D3D11_BIND_CONSTANT_BUFFER;
constant_buffer_desc.CPUAccessFlags = 0;

hr = deviceData->m_device->CreateBuffer( &constant_buffer_desc, NULL, dBuf->getBufferPtr() );
ADLASSERT( hr == S_OK );
return;
}

D3D11_BUFFER_DESC buffer_desc;
ZeroMemory(&buffer_desc, sizeof(buffer_desc));
buffer_desc.ByteWidth = nElems * sizeof(T);

if( type != BufferBase::BUFFER_RAW )
{
buffer_desc.StructureByteStride = sizeof(T);
// buffer_desc.CPUAccessFlags = D3D11_CPU_ACCESS_READ;
}

if( type == BufferBase::BUFFER_STAGING )
{
buffer_desc.Usage = D3D11_USAGE_STAGING;
buffer_desc.CPUAccessFlags = D3D11_CPU_ACCESS_READ;
}
else if( type == BufferBase::BUFFER_INDEX )
{
buffer_desc.Usage = D3D11_USAGE_DEFAULT;
buffer_desc.BindFlags = D3D11_BIND_INDEX_BUFFER;
}
else if( type == BufferBase::BUFFER_VERTEX )
{
buffer_desc.Usage = D3D11_USAGE_DEFAULT;
buffer_desc.BindFlags = D3D11_BIND_VERTEX_BUFFER;
}
else
{
buffer_desc.Usage = D3D11_USAGE_DEFAULT;

buffer_desc.BindFlags = D3D11_BIND_UNORDERED_ACCESS | D3D11_BIND_SHADER_RESOURCE;
buffer_desc.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_STRUCTURED;

// check this
if(type == BufferBase::BUFFER_RAW)
{
// buffer_desc.BindFlags |= D3D11_BIND_INDEX_BUFFER | D3D11_BIND_VERTEX_BUFFER;
buffer_desc.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_ALLOW_RAW_VIEWS | D3D11_RESOURCE_MISC_DRAWINDIRECT_ARGS; // need this to be used for DispatchIndirect
}
}
hr = deviceData->m_device->CreateBuffer(&buffer_desc, NULL, dBuf->getBufferPtr());

ADLASSERT( hr == S_OK );

if( type == BufferBase::BUFFER_INDEX ) return;

if( type == BufferBase::BUFFER ||
type == BufferBase::BUFFER_RAW ||
type == BufferBase::BUFFER_W_COUNTER )
{
// Create UAVs for all CS buffers
D3D11_UNORDERED_ACCESS_VIEW_DESC uavbuffer_desc;
ZeroMemory(&uavbuffer_desc, sizeof(uavbuffer_desc));
uavbuffer_desc.ViewDimension = D3D11_UAV_DIMENSION_BUFFER;

if( type == BufferBase::BUFFER_RAW )
{
uavbuffer_desc.Format = DXGI_FORMAT_R32_TYPELESS;
uavbuffer_desc.Buffer.Flags = D3D11_BUFFER_UAV_FLAG_RAW;
uavbuffer_desc.Buffer.NumElements = buffer_desc.ByteWidth / 4;
}
else
{
uavbuffer_desc.Format = DXGI_FORMAT_UNKNOWN;
uavbuffer_desc.Buffer.NumElements = nElems;
}

if( type == BufferBase::BUFFER_W_COUNTER )
{
uavbuffer_desc.Buffer.Flags = D3D11_BUFFER_UAV_FLAG_COUNTER;
}

hr = deviceData->m_device->CreateUnorderedAccessView(dBuf->getBuffer(), &uavbuffer_desc, dBuf->getUAVPtr());
ADLASSERT( hr == S_OK );

// Create SRVs for all CS buffers
D3D11_SHADER_RESOURCE_VIEW_DESC srvbuffer_desc;
ZeroMemory(&srvbuffer_desc, sizeof(srvbuffer_desc));
if( type == BufferBase::BUFFER_RAW )
{
ADLASSERT( sizeof(T) <= 16 );
srvbuffer_desc.Format = DXGI_FORMAT_R32_UINT;
srvbuffer_desc.Buffer.ElementWidth = nElems;
// if ( buffer_desc.MiscFlags & D3D11_RESOURCE_MISC_BUFFER_ALLOW_RAW_VIEWS )
// {
// srvbuffer_desc.Format = DXGI_FORMAT_R32_TYPELESS;
// srvbuffer_desc.BufferEx.Flags = D3D11_BUFFEREX_SRV_FLAG_RAW;
// srvbuffer_desc.BufferEx.NumElements = buffer_desc.ByteWidth / 4;
}
else
{
srvbuffer_desc.Format = DXGI_FORMAT_UNKNOWN;
srvbuffer_desc.Buffer.ElementWidth = nElems;
}
srvbuffer_desc.ViewDimension = D3D11_SRV_DIMENSION_BUFFER;

hr = deviceData->m_device->CreateShaderResourceView(dBuf->getBuffer(), &srvbuffer_desc, dBuf->getSRVPtr());
ADLASSERT( hr == S_OK );
}
else if( type == BufferBase::BUFFER_APPEND )
{
D3D11_UNORDERED_ACCESS_VIEW_DESC desc;
ZeroMemory( &desc, sizeof(desc) );
desc.ViewDimension = D3D11_UAV_DIMENSION_BUFFER;
desc.Buffer.FirstElement = 0;

desc.Buffer.Flags = D3D11_BUFFER_UAV_FLAG_APPEND;

desc.Format = DXGI_FORMAT_UNKNOWN; // Format must be must be DXGI_FORMAT_UNKNOWN, when creating a View of a Structured Buffer
desc.Buffer.NumElements = buffer_desc.ByteWidth / buffer_desc.StructureByteStride;

hr = deviceData->m_device->CreateUnorderedAccessView( dBuf->getBuffer(), &desc, dBuf->getUAVPtr() );
ADLASSERT( hr == S_OK );
}
}
// else
// {
// ADLASSERT(0);
// }
}

template<typename T>
void DeviceDX11::deallocate(Buffer<T>* buf)
{
BufferDX11<T>* dBuf = (BufferDX11<T>*)buf;

if( dBuf->getBuffer() )
{
dBuf->getBuffer()->Release();
dBuf->m_ptr = NULL;
}
if( dBuf->getUAV() )
{
dBuf->getUAV()->Release();
dBuf->m_uav = NULL;
}
if( dBuf->getSRV() )
{
dBuf->getSRV()->Release();
dBuf->m_srv = NULL;
}
buf->m_device = 0;
}

template<typename T>
void DeviceDX11::copy(Buffer<T>* dst, const Buffer<T>* src, int nElems)
{
if( dst->m_device->m_type == TYPE_DX11 || src->m_device->m_type == TYPE_DX11 )
{
DeviceDX11* deviceData = this;
BufferDX11<T>* dDst = (BufferDX11<T>*)dst;
BufferDX11<T>* dSrc = (BufferDX11<T>*)src;

D3D11_MAPPED_SUBRESOURCE MappedVelResource = {0};

D3D11_BOX destRegion;
destRegion.left = 0*sizeof(T);
destRegion.front = 0;
destRegion.top = 0;
destRegion.bottom = 1;
destRegion.back = 1;
destRegion.right = (0+nElems)*sizeof(T);

deviceData->m_context->CopySubresourceRegion(
dDst->getBuffer(),
0, 0, 0, 0,
dSrc->getBuffer(),
0,
&destRegion );

}
else if( src->m_device->m_type == TYPE_HOST )
{
ADLASSERT( dst->getType() == TYPE_DX11 );
dst->write( src->m_ptr, nElems );
}
else if( dst->m_device->m_type == TYPE_HOST )
{
ADLASSERT( src->getType() == TYPE_DX11 );
src->read( dst->m_ptr, nElems );
}
else
{
ADLASSERT( 0 );
}
}

template<typename T>
void DeviceDX11::copy(T* dst, const Buffer<T>* src, int nElems, int srcOffsetNElems)
{
DeviceDX11* deviceData = this;
BufferDX11<T>* dSrc = (BufferDX11<T>*)src;
Buffer<T> sBuf( deviceData, nElems, BufferBase::BUFFER_STAGING );
BufferDX11<T>* dStagingBuf = (BufferDX11<T>*)&sBuf;

ID3D11Buffer *StagingBuffer = dStagingBuf->getBuffer();
D3D11_MAPPED_SUBRESOURCE MappedVelResource = {0};

D3D11_BOX destRegion;
destRegion.left = srcOffsetNElems*sizeof(T);
destRegion.front = 0;
destRegion.top = 0;
destRegion.bottom = 1;
destRegion.back = 1;
destRegion.right = (srcOffsetNElems+nElems)*sizeof(T);

deviceData->m_context->CopySubresourceRegion(
StagingBuffer,
0, 0, 0, 0,
dSrc->getBuffer(),
0,
&destRegion);

deviceData->m_context->Map(StagingBuffer, 0, D3D11_MAP_READ, 0, &MappedVelResource);
memcpy(dst, MappedVelResource.pData, nElems*sizeof(T));
deviceData->m_context->Unmap(StagingBuffer, 0);
}

template<typename T>
void DeviceDX11::copy(Buffer<T>* dst, const T* src, int nElems, int dstOffsetNElems)
{
BufferDX11<T>* dBuf = (BufferDX11<T>*)dst;

DeviceDX11* deviceData = this;

D3D11_BOX destRegion;
destRegion.left = dstOffsetNElems*sizeof(T);
destRegion.front = 0;
destRegion.top = 0;
destRegion.bottom = 1;
destRegion.back = 1;
destRegion.right = (dstOffsetNElems+nElems)*sizeof(T);
deviceData->m_context->UpdateSubresource(dBuf->getBuffer(), 0, &destRegion, src, 0, 0);
}

void DeviceDX11::waitForCompletion() const
{
const DeviceDX11* deviceData = this;

ID3D11Query* syncQuery;
D3D11_QUERY_DESC qDesc;
qDesc.Query = D3D11_QUERY_EVENT;
qDesc.MiscFlags = 0;
deviceData->m_device->CreateQuery( &qDesc, &syncQuery );
deviceData->m_context->End( syncQuery );
while( deviceData->m_context->GetData( syncQuery, 0,0,0 ) == S_FALSE ){}
syncQuery->Release();
}

int DeviceDX11::getNDevices()
{
IDXGIFactory1* factory = NULL;
IDXGIAdapter1* adapter = NULL;
CreateDXGIFactory1( __uuidof(IDXGIFactory1), (void**)&factory );

u32 i = 0;
while( factory->EnumAdapters1( i, &adapter ) != DXGI_ERROR_NOT_FOUND )
{
i++;
}

factory->Release();
return i;
}

void DeviceDX11::getDeviceName( char nameOut[128] ) const
{
IDXGIAdapter* adapter;// = getAdapterFromDevice( this );
{
IDXGIDevice* pDXGIDevice;

ADLASSERT( m_device->QueryInterface(__uuidof(IDXGIDevice), (void **)&pDXGIDevice) == S_OK );
ADLASSERT( pDXGIDevice->GetParent(__uuidof(IDXGIAdapter), (void **)&adapter) == S_OK );

pDXGIDevice->Release();
}
DXGI_ADAPTER_DESC adapterDesc;
adapter->GetDesc( &adapterDesc );

// wcstombs( nameOut, adapterDesc.Description, 128 );
size_t i;
wcstombs_s( &i, nameOut, 128, adapterDesc.Description, 128 );
}

Kernel* DeviceDX11::getKernel(const char* fileName, const char* funcName, const char* option, const char* src, bool cacheKernel ) const
{
return m_kernelManager->query( this, fileName, funcName, option, src, cacheKernel );
}

#undef u32

#undef SAFE_RELEASE

};
@@ -0,0 +1,348 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.

This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:

1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada

namespace adl
{

#define SAFE_RELEASE(p) { if(p) { (p)->Release(); (p)=NULL; } }

struct KernelDX11 : public Kernel
{
ID3D11ComputeShader* getKernel() { return (ID3D11ComputeShader*)m_kernel; }
ID3D11ComputeShader** getKernelPtr() { return (ID3D11ComputeShader**)&m_kernel; }
};

__inline
#ifdef UNICODE
HRESULT FindDXSDKShaderFileCch( __in_ecount(cchDest) WCHAR* strDestPath,
int cchDest,
__in LPCWSTR strFilename )
#else
HRESULT FindDXSDKShaderFileCch( __in_ecount(cchDest) CHAR* strDestPath,
int cchDest,
__in LPCSTR strFilename )
#endif
{
if( NULL == strFilename || strFilename[0] == 0 || NULL == strDestPath || cchDest < 10 )
return E_INVALIDARG;

// Get the exe name, and exe path
#ifdef UNICODE
WCHAR strExePath[MAX_PATH] =
#else
CHAR strExePath[MAX_PATH] =
#endif
{
0
};
#ifdef UNICODE
WCHAR strExeName[MAX_PATH] =
#else
CHAR strExeName[MAX_PATH] =
#endif
{
0
};
#ifdef UNICODE
WCHAR* strLastSlash = NULL;
#else
CHAR* strLastSlash = NULL;
#endif
GetModuleFileName( NULL, strExePath, MAX_PATH );
strExePath[MAX_PATH - 1] = 0;
#ifdef UNICODE
strLastSlash = wcsrchr( strExePath, TEXT( '\\' ) );
#else
strLastSlash = strrchr( strExePath, TEXT( '\\' ) );
#endif
if( strLastSlash )
{
#ifdef UNICODE
wcscpy_s( strExeName, MAX_PATH, &strLastSlash[1] );
#else

#endif
// Chop the exe name from the exe path
*strLastSlash = 0;

// Chop the .exe from the exe name
#ifdef UNICODE
strLastSlash = wcsrchr( strExeName, TEXT( '.' ) );
#else
strLastSlash = strrchr( strExeName, TEXT( '.' ) );
#endif
if( strLastSlash )
*strLastSlash = 0;
}

// Search in directories:
// .\
// %EXE_DIR%\..\..\%EXE_NAME%
#ifdef UNICODE
wcscpy_s( strDestPath, cchDest, strFilename );
#else
strcpy_s( strDestPath, cchDest, strFilename );
#endif
if( GetFileAttributes( strDestPath ) != 0xFFFFFFFF )
return S_OK;

// swprintf_s( strDestPath, cchDest, L"%s\\..\\..\\%s\\%s", strExePath, strExeName, strFilename );
#ifdef UNICODE
swprintf_s( strDestPath, cchDest, L"%s\\..\\%s\\%s", strExePath, strExeName, strFilename );
#else
sprintf_s( strDestPath, cchDest, "%s\\..\\%s\\%s", strExePath, strExeName, strFilename );
#endif
if( GetFileAttributes( strDestPath ) != 0xFFFFFFFF )
return S_OK;

// On failure, return the file as the path but also return an error code
#ifdef UNICODE
wcscpy_s( strDestPath, cchDest, strFilename );
#else
strcpy_s( strDestPath, cchDest, strFilename );
#endif

ADLASSERT( 0 );

return E_FAIL;
}

template<>
void KernelBuilder<TYPE_DX11>::setFromFile( const Device* deviceData, const char* fileName, const char* option, bool addExtension,
bool cacheKernel)
{
char fileNameWithExtension[256];

if( addExtension )
sprintf_s( fileNameWithExtension, "%s.hlsl", fileName );
else
sprintf_s( fileNameWithExtension, "%s", fileName );

m_deviceData = deviceData;

int nameLength = (int)strlen(fileNameWithExtension)+1;
#ifdef UNICODE
WCHAR* wfileNameWithExtension = new WCHAR[nameLength];
#else
CHAR* wfileNameWithExtension = new CHAR[nameLength];
#endif
memset(wfileNameWithExtension,0,nameLength);
#ifdef UNICODE
MultiByteToWideChar(CP_ACP,0,fileNameWithExtension,-1, wfileNameWithExtension, nameLength);
#else
sprintf_s(wfileNameWithExtension, nameLength, "%s", fileNameWithExtension);
#endif
// swprintf_s(wfileNameWithExtension, nameLength*2, L"%s", fileNameWithExtension);

HRESULT hr;

// Finds the correct path for the shader file.
// This is only required for this sample to be run correctly from within the Sample Browser,
// in your own projects, these lines could be removed safely
hr = FindDXSDKShaderFileCch( m_path, MAX_PATH, wfileNameWithExtension );

delete [] wfileNameWithExtension;

ADLASSERT( hr == S_OK );
}

template<>
void KernelBuilder<TYPE_DX11>::setFromSrc( const Device* deviceData, const char* src, const char* option )
{
m_deviceData = deviceData;
m_ptr = (void*)src;
m_path[0] = '0';
}

template<>
KernelBuilder<TYPE_DX11>::~KernelBuilder()
{

}

template<>
void KernelBuilder<TYPE_DX11>::createKernel( const char* funcName, Kernel& kernelOut )
{
const DeviceDX11* deviceData = (const DeviceDX11*)m_deviceData;
KernelDX11* dxKernel = (KernelDX11*)&kernelOut;
HRESULT hr;

DWORD dwShaderFlags = D3DCOMPILE_ENABLE_STRICTNESS;
#if defined( DEBUG ) || defined( _DEBUG )
// Set the D3DCOMPILE_DEBUG flag to embed debug information in the shaders.
// Setting this flag improves the shader debugging experience, but still allows
// the shaders to be optimized and to run exactly the way they will run in
// the release configuration of this program.
dwShaderFlags |= D3DCOMPILE_DEBUG;
#endif

const D3D_SHADER_MACRO defines[] =
{
#ifdef USE_STRUCTURED_BUFFERS
"USE_STRUCTURED_BUFFERS", "1",
#endif

#ifdef TEST_DOUBLE
"TEST_DOUBLE", "1",
#endif
NULL, NULL
};

// We generally prefer to use the higher CS shader profile when possible as CS 5.0 is better performance on 11-class hardware
LPCSTR pProfile = ( deviceData->m_device->GetFeatureLevel() >= D3D_FEATURE_LEVEL_11_0 ) ? "cs_5_0" : "cs_4_0";

ID3DBlob* pErrorBlob = NULL;
ID3DBlob* pBlob = NULL;
if( m_path[0] == '0' )
{
char* src = (char*)m_ptr;
hr = D3DX11CompileFromMemory( src, strlen(src), 0, defines, NULL, funcName, pProfile,
dwShaderFlags, NULL, NULL, &pBlob, &pErrorBlob, NULL );
}
else
{
hr = D3DX11CompileFromFile( m_path, defines, NULL, funcName, pProfile,
dwShaderFlags, NULL, NULL, &pBlob, &pErrorBlob, NULL );
}

if ( FAILED(hr) )
{
debugPrintf("%s", (char*)pErrorBlob->GetBufferPointer());
}
ADLASSERT( hr == S_OK );

hr = deviceData->m_device->CreateComputeShader( pBlob->GetBufferPointer(), pBlob->GetBufferSize(), NULL,
dxKernel->getKernelPtr() );

#if defined(DEBUG) || defined(PROFILE)
if ( kernelOut.m_kernel )
dxKernel->getKernel()->SetPrivateData( WKPDID_D3DDebugObjectName, lstrlenA(funcName), funcName );
#endif

SAFE_RELEASE( pErrorBlob );
SAFE_RELEASE( pBlob );

kernelOut.m_type = TYPE_DX11;
}

template<>
void KernelBuilder<TYPE_DX11>::deleteKernel( Kernel& kernel )
{
KernelDX11* dxKernel = (KernelDX11*)&kernel;

if( kernel.m_kernel )
{
dxKernel->getKernel()->Release();
kernel.m_kernel = NULL;
}
}

class LauncherDX11
{
public:
typedef Launcher::BufferInfo BufferInfo;

__inline
static void setBuffers( Launcher* launcher, BufferInfo* buffInfo, int n );
template<typename T>
__inline
static void setConst( Launcher* launcher, Buffer<T>& constBuff, const T& consts );
__inline
static void launch2D( Launcher* launcher, int numThreadsX, int numThreadsY, int localSizeX, int localSizeY );
};

void LauncherDX11::setBuffers( Launcher* launcher, BufferInfo* buffInfo, int n )
{
KernelDX11* dxKernel = (KernelDX11*)launcher->m_kernel;
const DeviceDX11* dddx = (const DeviceDX11*)launcher->m_deviceData;

for(int i=0; i<n; i++)
{
BufferDX11<int>* dBuf = (BufferDX11<int>*)buffInfo[i].m_buffer;
if( buffInfo[i].m_isReadOnly )
{
dddx->m_context->CSSetShaderResources( launcher->m_idx++, 1, dBuf->getSRVPtr() );
}
else
{
// todo. cannot initialize append buffer with proper counter value which is the last arg
dddx->m_context->CSSetUnorderedAccessViews( launcher->m_idxRw++, 1, dBuf->getUAVPtr(), 0 );
}
}
}

template<typename T>
void LauncherDX11::setConst( Launcher* launcher, Buffer<T>& constBuff, const T& consts )
{
KernelDX11* dxKernel = (KernelDX11*)launcher->m_kernel;
const DeviceDX11* dddx = (const DeviceDX11*)launcher->m_deviceData;
BufferDX11<T>* dBuf = (BufferDX11<T>*)&constBuff;
/*
D3D11_MAPPED_SUBRESOURCE MappedResource;
dddx->m_context->Map( dBuf->getBuffer(), 0, D3D11_MAP_WRITE_DISCARD, 0, &MappedResource );
memcpy( MappedResource.pData, &consts, sizeof(T) );
dddx->m_context->Unmap( dBuf->getBuffer(), 0 );
*/

dddx->m_context->UpdateSubresource( dBuf->getBuffer(), 0, NULL, &consts, 0, 0 );

dddx->m_context->CSSetConstantBuffers( 0, 1, dBuf->getBufferPtr() );
}

void LauncherDX11::launch2D( Launcher* launcher, int numThreadsX, int numThreadsY, int localSizeX, int localSizeY )
{
KernelDX11* dxKernel = (KernelDX11*)launcher->m_kernel;
const DeviceDX11* dddx = (const DeviceDX11*)launcher->m_deviceData;

dddx->m_context->CSSetShader( dxKernel->getKernel(), NULL, 0 );

int nx, ny, nz;
nx = max( 1, (numThreadsX/localSizeX)+(!(numThreadsX%localSizeX)?0:1) );
ny = max( 1, (numThreadsY/localSizeY)+(!(numThreadsY%localSizeY)?0:1) );
nz = 1;

dddx->m_context->Dispatch( nx, ny, nz );

// set 0 to registers
{
dddx->m_context->CSSetShader( NULL, NULL, 0 );

if( launcher->m_idxRw )
{
ID3D11UnorderedAccessView* aUAViewsNULL[ 16 ] = { 0 };
dddx->m_context->CSSetUnorderedAccessViews( 0,
min( (unsigned int)launcher->m_idxRw, sizeof(aUAViewsNULL)/sizeof(*aUAViewsNULL) ), aUAViewsNULL, NULL );
}

if( launcher->m_idx )
{
ID3D11ShaderResourceView* ppSRVNULL[16] = { 0 };
dddx->m_context->CSSetShaderResources( 0,
min( (unsigned int)launcher->m_idx, sizeof(ppSRVNULL)/sizeof(*ppSRVNULL) ), ppSRVNULL );
}
}
}

#undef SAFE_RELEASE

};
@@ -0,0 +1,131 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.

This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:

1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada

namespace adl
{

struct StopwatchDX11 : public StopwatchBase
{
public:
__inline
StopwatchDX11() : StopwatchBase(){}
__inline
~StopwatchDX11();

__inline
void init( const Device* deviceData );
__inline
void start();
__inline
void split();
__inline
void stop();
__inline
float getMs(int index=0);
__inline
void getMs( float* times, int capacity );

public:
ID3D11Query* m_tQuery[CAPACITY+1];
ID3D11Query* m_fQuery;
UINT64 m_t[CAPACITY];
};

void StopwatchDX11::init( const Device* deviceData )
{
ADLASSERT( deviceData->m_type == TYPE_DX11 );
m_device = deviceData;
{
D3D11_QUERY_DESC qDesc;
qDesc.Query = D3D11_QUERY_TIMESTAMP_DISJOINT;
qDesc.MiscFlags = 0;
((const DeviceDX11*)m_device)->m_device->CreateQuery( &qDesc, &m_fQuery );
}
for(int i=0; i<CAPACITY+1; i++)
{
D3D11_QUERY_DESC qDesc;
qDesc.Query = D3D11_QUERY_TIMESTAMP;
qDesc.MiscFlags = 0;
((const DeviceDX11*)m_device)->m_device->CreateQuery( &qDesc, &m_tQuery[i] );
}
}

StopwatchDX11::~StopwatchDX11()
{
m_fQuery->Release();
for(int i=0; i<CAPACITY+1; i++)
{
m_tQuery[i]->Release();
}
}

void StopwatchDX11::start()
{
m_idx = 0;
((const DeviceDX11*)m_device)->m_context->Begin( m_fQuery );
((const DeviceDX11*)m_device)->m_context->End( m_tQuery[m_idx++] );
}

void StopwatchDX11::split()
{
if( m_idx < CAPACITY )
((const DeviceDX11*)m_device)->m_context->End( m_tQuery[m_idx++] );
}

void StopwatchDX11::stop()
{
((const DeviceDX11*)m_device)->m_context->End( m_tQuery[m_idx++] );
((const DeviceDX11*)m_device)->m_context->End( m_fQuery );
}

float StopwatchDX11::getMs(int index)
{
D3D11_QUERY_DATA_TIMESTAMP_DISJOINT d;
// m_deviceData->m_context->End( m_fQuery );
while( ((const DeviceDX11*)m_device)->m_context->GetData( m_fQuery, &d,sizeof(D3D11_QUERY_DATA_TIMESTAMP_DISJOINT),0 ) == S_FALSE ) {}

while( ((const DeviceDX11*)m_device)->m_context->GetData( m_tQuery[0], &m_t[index],sizeof(UINT64),0 ) == S_FALSE ){}
while( ((const DeviceDX11*)m_device)->m_context->GetData( m_tQuery[1], &m_t[index+1],sizeof(UINT64),0 ) == S_FALSE ){}

ADLASSERT( d.Disjoint == false );

float elapsedMs = (m_t[index+1] - m_t[index])/(float)d.Frequency*1000;
return elapsedMs;

}

void StopwatchDX11::getMs( float* times, int capacity )
{
ADLASSERT( capacity <= CAPACITY );

D3D11_QUERY_DATA_TIMESTAMP_DISJOINT d;
while( ((const DeviceDX11*)m_device)->m_context->GetData( m_fQuery, &d,sizeof(D3D11_QUERY_DATA_TIMESTAMP_DISJOINT),0 ) == S_FALSE ) {}

for(int i=0; i<m_idx; i++)
{
while( ((const DeviceDX11*)m_device)->m_context->GetData( m_tQuery[i], &m_t[i],sizeof(UINT64),0 ) == S_FALSE ){}
}

ADLASSERT( d.Disjoint == false );

for(int i=0; i<capacity; i++)
{
times[i] = (m_t[i+1] - m_t[i])/(float)d.Frequency*1000;
}
}

};
@@ -0,0 +1,107 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.

This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:

1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada

namespace adl
{

struct DeviceHost : public Device
{
DeviceHost() : Device( TYPE_HOST ){}

__inline
void initialize(const Config& cfg);
__inline
void release();

template<typename T>
__inline
void allocate(Buffer<T>* buf, int nElems, BufferBase::BufferType type);

template<typename T>
__inline
void deallocate(Buffer<T>* buf);

template<typename T>
__inline
void copy(Buffer<T>* dst, const Buffer<T>* src, int nElems);

template<typename T>
__inline
void copy(T* dst, const Buffer<T>* src, int nElems, int offsetNElems = 0);

template<typename T>
__inline
void copy(Buffer<T>* dst, const T* src, int nElems, int offsetNElems = 0);

__inline
void waitForCompletion() const;
};

void DeviceHost::initialize(const Config& cfg)
{

}

void DeviceHost::release()
{

}

template<typename T>
void DeviceHost::allocate(Buffer<T>* buf, int nElems, BufferBase::BufferType type)
{
buf->m_device = this;

if( type == BufferBase::BUFFER_CONST ) return;

buf->m_ptr = new T[nElems];
ADLASSERT( buf->m_ptr );
buf->m_size = nElems;
}

template<typename T>
void DeviceHost::deallocate(Buffer<T>* buf)
{
if( buf->m_ptr ) delete [] buf->m_ptr;
}

template<typename T>
void DeviceHost::copy(Buffer<T>* dst, const Buffer<T>* src, int nElems)
{
copy( dst, src->m_ptr, nElems );
}

template<typename T>
void DeviceHost::copy(T* dst, const Buffer<T>* src, int nElems, int srcOffsetNElems)
{
ADLASSERT( src->getType() == TYPE_HOST );
memcpy( dst, src->m_ptr+srcOffsetNElems, nElems*sizeof(T) );
}

template<typename T>
void DeviceHost::copy(Buffer<T>* dst, const T* src, int nElems, int dstOffsetNElems)
{
ADLASSERT( dst->getType() == TYPE_HOST );
memcpy( dst->m_ptr+dstOffsetNElems, src, nElems*sizeof(T) );
}

void DeviceHost::waitForCompletion() const
{

}

};
@@ -0,0 +1,119 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.

This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:

1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada

#ifdef _WIN32
#include <windows.h>
#else
#include <sys/time.h>
#endif

namespace adl
{

class StopwatchHost : public StopwatchBase
{
public:
__inline
StopwatchHost();
__inline
void init( const Device* deviceData );
__inline
void start();
__inline
void split();
__inline
void stop();
__inline
float getMs(int index=0);
__inline
void getMs( float* times, int capacity );

private:
#ifdef _WIN32
LARGE_INTEGER m_frequency;
LARGE_INTEGER m_t[CAPACITY];
#else
struct timeval mStartTime;
timeval m_t[CAPACITY];
#endif
};

__inline
StopwatchHost::StopwatchHost()
: StopwatchBase()
{
}

__inline
void StopwatchHost::init( const Device* deviceData )
{
m_device = deviceData;
#ifdef _WIN32
QueryPerformanceFrequency( &m_frequency );
#else
gettimeofday(&mStartTime, 0);
#endif
}

__inline
void StopwatchHost::start()
{
m_idx = 0;
#ifdef _WIN32
QueryPerformanceCounter(&m_t[m_idx++]);
#else
gettimeofday(&m_t[m_idx++], 0);
#endif
}

__inline
void StopwatchHost::split()
{
#ifdef _WIN32
QueryPerformanceCounter(&m_t[m_idx++]);
#else
gettimeofday(&m_t[m_idx++], 0);
#endif
}

__inline
void StopwatchHost::stop()
{
split();
}

__inline
float StopwatchHost::getMs(int index)
{
#ifdef _WIN32
return (float)(1000*(m_t[index+1].QuadPart - m_t[index].QuadPart))/m_frequency.QuadPart;
#else
return (m_t[index+1].tv_sec - m_t[index].tv_sec) * 1000 +
(m_t[index+1].tv_usec - m_t[index].tv_usec) / 1000;
#endif
}

__inline
void StopwatchHost::getMs(float* times, int capacity)
{
for(int i=0; i<capacity; i++) times[i] = 0.f;

for(int i=0; i<min(capacity, m_idx-1); i++)
{
times[i] = getMs(i);
}
}

};
@@ -0,0 +1,73 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.

This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:

1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada

#pragma once

#include <Adl/Adl.h>
#include <AdlPrimitives/Math/Math.h>

namespace adl
{

class CopyBase
{
public:
enum Option
{
PER_WI_1,
PER_WI_2,
PER_WI_4,
};
};

template<DeviceType TYPE>
class Copy : public CopyBase
{
public:
typedef Launcher::BufferInfo BufferInfo;

struct Data
{
const Device* m_device;
Kernel* m_copy1F4Kernel;
Kernel* m_copy2F4Kernel;
Kernel* m_copy4F4Kernel;
Kernel* m_copyF1Kernel;
Kernel* m_copyF2Kernel;
Buffer<int4>* m_constBuffer;
};

static
Data* allocate(const Device* deviceData);

static
void deallocate(Data* data);

static
void execute( Data* data, Buffer<float4>& dst, Buffer<float4>& src, int n, Option option = PER_WI_1);

static
void execute( Data* data, Buffer<float2>& dst, Buffer<float2>& src, int n);

static
void execute( Data* data, Buffer<float>& dst, Buffer<float>& src, int n);
};

#include <AdlPrimitives/Copy/CopyHost.inl>
#include <AdlPrimitives/Copy/Copy.inl>

};
@@ -0,0 +1,151 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.

This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:

1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada


#define PATH "..\\..\\opencl\\primitives\\AdlPrimitives\\Copy\\CopyKernels"
#define KERNEL0 "Copy1F4Kernel"
#define KERNEL1 "Copy2F4Kernel"
#define KERNEL2 "Copy4F4Kernel"
#define KERNEL3 "CopyF1Kernel"
#define KERNEL4 "CopyF2Kernel"

#include <AdlPrimitives/Copy/CopyKernelsCL.h>
#include <AdlPrimitives/Copy/CopyKernelsDX11.h>

template<DeviceType TYPE>
typename Copy<TYPE>::Data* Copy<TYPE>::allocate( const Device* device )
{
    ADLASSERT( TYPE == device->m_type );

    const char* src[] =
#if defined(ADL_LOAD_KERNEL_FROM_STRING)
        {copyKernelsCL, copyKernelsDX11};
//      ADLASSERT(0);
#else
        {0,0};
#endif

    Data* data = new Data;
    data->m_device = device;
    data->m_copy1F4Kernel = device->getKernel( PATH, KERNEL0, 0, src[TYPE] );
    data->m_copy2F4Kernel = device->getKernel( PATH, KERNEL1, 0, src[TYPE] );
    data->m_copy4F4Kernel = device->getKernel( PATH, KERNEL2, 0, src[TYPE] );
    data->m_copyF1Kernel = device->getKernel( PATH, KERNEL3, 0, src[TYPE] );
    data->m_copyF2Kernel = device->getKernel( PATH, KERNEL4, 0, src[TYPE] );
    data->m_constBuffer = new Buffer<int4>( device, 1, BufferBase::BUFFER_CONST );

    return data;
}

template<DeviceType TYPE>
void Copy<TYPE>::deallocate( Data* data )
{
    delete data->m_constBuffer;
    delete data;
}

template<DeviceType TYPE>
void Copy<TYPE>::execute( Data* data, Buffer<float4>& dst, Buffer<float4>& src, int n, Option option )
{
    ADLASSERT( TYPE == dst.getType() );
    ADLASSERT( TYPE == src.getType() );

    int4 constBuffer;
    constBuffer.x = n;

    switch (option)
    {
    case PER_WI_1:
        {
            BufferInfo bInfo[] = { BufferInfo( &dst ), BufferInfo( &src, true ) };

            Launcher launcher( data->m_device, data->m_copy1F4Kernel );
            launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
            launcher.setConst( *data->m_constBuffer, constBuffer );
            launcher.launch1D( n/1 );
        }
        break;
    case PER_WI_2:
        {
            ADLASSERT( n%2 == 0 );
            BufferInfo bInfo[] = { BufferInfo( &dst ), BufferInfo( &src, true ) };

            Launcher launcher( data->m_device, data->m_copy2F4Kernel );
            launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
            launcher.setConst( *data->m_constBuffer, constBuffer );
            launcher.launch1D( n/2 );
        }
        break;
    case PER_WI_4:
        {
            ADLASSERT( n%4 == 0 );
            BufferInfo bInfo[] = { BufferInfo( &dst ), BufferInfo( &src, true ) };

            Launcher launcher( data->m_device, data->m_copy4F4Kernel );
            launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
            launcher.setConst( *data->m_constBuffer, constBuffer );
            launcher.launch1D( n/4 );
        }
        break;
    default:
        ADLASSERT(0);
        break;
    };
}

template<DeviceType TYPE>
void Copy<TYPE>::execute( Data* data, Buffer<float2>& dst, Buffer<float2>& src, int n )
{
    ADLASSERT( TYPE == dst.getType() );
    ADLASSERT( TYPE == src.getType() );

    int4 constBuffer;
    constBuffer.x = n;

    BufferInfo bInfo[] = { BufferInfo( &dst ), BufferInfo( &src, true ) };

    Launcher launcher( data->m_device, data->m_copyF2Kernel );
    launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
    launcher.setConst( *data->m_constBuffer, constBuffer );
    launcher.launch1D( n/1 );
}

template<DeviceType TYPE>
void Copy<TYPE>::execute( Data* data, Buffer<float>& dst, Buffer<float>& src, int n )
{
    ADLASSERT( TYPE == dst.getType() );
    ADLASSERT( TYPE == src.getType() );

    int4 constBuffer;
    constBuffer.x = n;

    BufferInfo bInfo[] = { BufferInfo( &dst ), BufferInfo( &src, true ) };

    Launcher launcher( data->m_device, data->m_copyF1Kernel );
    launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
    launcher.setConst( *data->m_constBuffer, constBuffer );
    launcher.launch1D( n/1 );
}


#undef PATH
#undef KERNEL0
#undef KERNEL1
#undef KERNEL2
#undef KERNEL3
#undef KERNEL4
@@ -0,0 +1,85 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.

This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:

1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada


template<>
class Copy<TYPE_HOST> : public CopyBase
{
    public:
        typedef Launcher::BufferInfo BufferInfo;

        struct Data
        {
        };

        static
        Data* allocate(const Device* deviceData)
        {
            ADLASSERT( TYPE_HOST == deviceData->m_type );
            return 0;
        }

        static
        void deallocate(Data* data)
        {
            return;
        }

        static
        void execute( Data* data, Buffer<float4>& dst, Buffer<float4>& src, int n, Option option = PER_WI_1)
        {
            ADLASSERT( TYPE_HOST == dst.getType() );
            ADLASSERT( TYPE_HOST == src.getType() );

            HostBuffer<float4>& dstH = (HostBuffer<float4>&)dst;
            HostBuffer<float4>& srcH = (HostBuffer<float4>&)src;

            for(int i=0; i<n; i++)
            {
                dstH[i] = srcH[i];
            }
        }

        static
        void execute( Data* data, Buffer<float2>& dst, Buffer<float2>& src, int n)
        {
            ADLASSERT( TYPE_HOST == dst.getType() );
            ADLASSERT( TYPE_HOST == src.getType() );

            HostBuffer<float2>& dstH = (HostBuffer<float2>&)dst;
            HostBuffer<float2>& srcH = (HostBuffer<float2>&)src;

            for(int i=0; i<n; i++)
            {
                dstH[i] = srcH[i];
            }
        }

        static
        void execute( Data* data, Buffer<float>& dst, Buffer<float>& src, int n)
        {
            ADLASSERT( TYPE_HOST == dst.getType() );
            ADLASSERT( TYPE_HOST == src.getType() );

            HostBuffer<float>& dstH = (HostBuffer<float>&)dst;
            HostBuffer<float>& srcH = (HostBuffer<float>&)src;

            for(int i=0; i<n; i++)
            {
                dstH[i] = srcH[i];
            }
        }
};

@@ -0,0 +1,128 @@
|
||||
/*
|
||||
Copyright (c) 2012 Advanced Micro Devices, Inc.
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
//Originally written by Takahiro Harada
|
||||
|
||||
#pragma OPENCL EXTENSION cl_amd_printf : enable
|
||||
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
|
||||
|
||||
typedef unsigned int u32;
|
||||
#define GET_GROUP_IDX get_group_id(0)
|
||||
#define GET_LOCAL_IDX get_local_id(0)
|
||||
#define GET_GLOBAL_IDX get_global_id(0)
|
||||
#define GET_GROUP_SIZE get_local_size(0)
|
||||
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
|
||||
#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)
|
||||
#define AtomInc(x) atom_inc(&(x))
|
||||
#define AtomInc1(x, out) out = atom_inc(&(x))
|
||||
|
||||
#define make_uint4 (uint4)
|
||||
#define make_uint2 (uint2)
|
||||
#define make_int2 (int2)
|
||||
|
||||
typedef struct
|
||||
{
|
||||
int m_n;
|
||||
int m_padding[3];
|
||||
} ConstBuffer;
|
||||
|
||||
|
||||
|
||||
__kernel
|
||||
__attribute__((reqd_work_group_size(64,1,1)))
|
||||
void Copy1F4Kernel(__global float4* dst, __global float4* src,
|
||||
ConstBuffer cb)
|
||||
{
|
||||
int gIdx = GET_GLOBAL_IDX;
|
||||
|
||||
if( gIdx < cb.m_n )
|
||||
{
|
||||
float4 a0 = src[gIdx];
|
||||
|
||||
dst[ gIdx ] = a0;
|
||||
}
|
||||
}
|
||||
|
||||
__kernel
|
||||
__attribute__((reqd_work_group_size(64,1,1)))
|
||||
void Copy2F4Kernel(__global float4* dst, __global float4* src,
|
||||
ConstBuffer cb)
|
||||
{
|
||||
int gIdx = GET_GLOBAL_IDX;
|
||||
|
||||
if( 2*gIdx <= cb.m_n )
|
||||
{
|
||||
float4 a0 = src[gIdx*2+0];
|
||||
float4 a1 = src[gIdx*2+1];
|
||||
|
||||
dst[ gIdx*2+0 ] = a0;
|
||||
dst[ gIdx*2+1 ] = a1;
|
||||
}
|
||||
}
|
||||
|
||||
__kernel
|
||||
__attribute__((reqd_work_group_size(64,1,1)))
|
||||
void Copy4F4Kernel(__global float4* dst, __global float4* src,
|
||||
ConstBuffer cb)
|
||||
{
|
||||
int gIdx = GET_GLOBAL_IDX;
|
||||
|
||||
if( 4*gIdx <= cb.m_n )
|
||||
{
|
||||
int idx0 = gIdx*4+0;
|
||||
int idx1 = gIdx*4+1;
|
||||
int idx2 = gIdx*4+2;
|
||||
int idx3 = gIdx*4+3;
|
||||
|
||||
float4 a0 = src[idx0];
|
||||
float4 a1 = src[idx1];
|
||||
float4 a2 = src[idx2];
|
||||
float4 a3 = src[idx3];
|
||||
|
||||
dst[ idx0 ] = a0;
|
||||
dst[ idx1 ] = a1;
|
||||
dst[ idx2 ] = a2;
|
||||
dst[ idx3 ] = a3;
|
||||
}
|
||||
}
|
||||
|
||||
__kernel
|
||||
__attribute__((reqd_work_group_size(64,1,1)))
|
||||
void CopyF1Kernel(__global float* dstF1, __global float* srcF1,
|
||||
ConstBuffer cb)
|
||||
{
|
||||
int gIdx = GET_GLOBAL_IDX;
|
||||
|
||||
if( gIdx < cb.m_n )
|
||||
{
|
||||
float a0 = srcF1[gIdx];
|
||||
|
||||
dstF1[ gIdx ] = a0;
|
||||
}
|
||||
}
|
||||
|
||||
__kernel
|
||||
__attribute__((reqd_work_group_size(64,1,1)))
|
||||
void CopyF2Kernel(__global float2* dstF2, __global float2* srcF2,
|
||||
ConstBuffer cb)
|
||||
{
|
||||
int gIdx = GET_GLOBAL_IDX;
|
||||
|
||||
if( gIdx < cb.m_n )
|
||||
{
|
||||
float2 a0 = srcF2[gIdx];
|
||||
|
||||
dstF2[ gIdx ] = a0;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,130 @@
|
||||
/*
|
||||
Copyright (c) 2012 Advanced Micro Devices, Inc.
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
//Originally written by Takahiro Harada
|
||||
|
||||
|
||||
typedef uint u32;
|
||||
|
||||
#define GET_GROUP_IDX groupIdx.x
|
||||
#define GET_LOCAL_IDX localIdx.x
|
||||
#define GET_GLOBAL_IDX globalIdx.x
|
||||
#define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync()
|
||||
#define GROUP_MEM_FENCE
|
||||
#define DEFAULT_ARGS uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID
|
||||
#define AtomInc(x) InterlockedAdd(x, 1)
|
||||
#define AtomInc1(x, out) InterlockedAdd(x, 1, out)
|
||||
|
||||
#define make_uint4 uint4
|
||||
#define make_uint2 uint2
|
||||
#define make_int2 int2
|
||||
|
||||
#define WG_SIZE 64
|
||||
|
||||
#define GET_GROUP_SIZE WG_SIZE
|
||||
|
||||
|
||||
|
||||
cbuffer CB : register( b0 )
|
||||
{
|
||||
int m_n;
|
||||
int m_padding[3];
|
||||
};
|
||||
|
||||
RWStructuredBuffer<float4> dst : register( u0 );
|
||||
StructuredBuffer<float4> src : register( t0 );
|
||||
|
||||
[numthreads(WG_SIZE, 1, 1)]
|
||||
void Copy1F4Kernel( DEFAULT_ARGS )
|
||||
{
|
||||
int gIdx = GET_GLOBAL_IDX;
|
||||
|
||||
if( gIdx < m_n )
|
||||
{
|
||||
float4 a0 = src[gIdx];
|
||||
|
||||
dst[ gIdx ] = a0;
|
||||
}
|
||||
}
|
||||
|
||||
[numthreads(WG_SIZE, 1, 1)]
|
||||
void Copy2F4Kernel( DEFAULT_ARGS )
|
||||
{
|
||||
int gIdx = GET_GLOBAL_IDX;
|
||||
|
||||
if( 2*gIdx <= m_n )
|
||||
{
|
||||
float4 a0 = src[gIdx*2+0];
|
||||
float4 a1 = src[gIdx*2+1];
|
||||
|
||||
dst[ gIdx*2+0 ] = a0;
|
||||
dst[ gIdx*2+1 ] = a1;
|
||||
}
|
||||
}
|
||||
|
||||
[numthreads(WG_SIZE, 1, 1)]
|
||||
void Copy4F4Kernel( DEFAULT_ARGS )
|
||||
{
|
||||
int gIdx = GET_GLOBAL_IDX;
|
||||
|
||||
if( 4*gIdx <= m_n )
|
||||
{
|
||||
int idx0 = gIdx*4+0;
|
||||
int idx1 = gIdx*4+1;
|
||||
int idx2 = gIdx*4+2;
|
||||
int idx3 = gIdx*4+3;
|
||||
|
||||
float4 a0 = src[idx0];
|
||||
float4 a1 = src[idx1];
|
||||
float4 a2 = src[idx2];
|
||||
float4 a3 = src[idx3];
|
||||
|
||||
dst[ idx0 ] = a0;
|
||||
dst[ idx1 ] = a1;
|
||||
dst[ idx2 ] = a2;
|
||||
dst[ idx3 ] = a3;
|
||||
}
|
||||
}
|
||||
|
||||
RWStructuredBuffer<float> dstF1 : register( u0 );
|
||||
StructuredBuffer<float> srcF1 : register( t0 );
|
||||
|
||||
[numthreads(WG_SIZE, 1, 1)]
|
||||
void CopyF1Kernel( DEFAULT_ARGS )
|
||||
{
|
||||
int gIdx = GET_GLOBAL_IDX;
|
||||
|
||||
if( gIdx < m_n )
|
||||
{
|
||||
float a0 = srcF1[gIdx];
|
||||
|
||||
dstF1[ gIdx ] = a0;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
RWStructuredBuffer<float2> dstF2 : register( u0 );
|
||||
StructuredBuffer<float2> srcF2 : register( t0 );
|
||||
|
||||
[numthreads(WG_SIZE, 1, 1)]
|
||||
void CopyF2Kernel( DEFAULT_ARGS )
|
||||
{
|
||||
int gIdx = GET_GLOBAL_IDX;
|
||||
|
||||
if( gIdx < m_n )
|
||||
{
|
||||
float2 a0 = srcF2[gIdx];
|
||||
|
||||
dstF2[ gIdx ] = a0;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,119 @@
|
||||
static const char* copyKernelsCL= \
|
||||
"/*\n"
|
||||
" 2011 Takahiro Harada\n"
|
||||
"*/\n"
|
||||
"\n"
|
||||
"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
|
||||
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
|
||||
"\n"
|
||||
"typedef unsigned int u32;\n"
|
||||
"#define GET_GROUP_IDX get_group_id(0)\n"
|
||||
"#define GET_LOCAL_IDX get_local_id(0)\n"
|
||||
"#define GET_GLOBAL_IDX get_global_id(0)\n"
|
||||
"#define GET_GROUP_SIZE get_local_size(0)\n"
|
||||
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
|
||||
"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
|
||||
"#define AtomInc(x) atom_inc(&(x))\n"
|
||||
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
|
||||
"\n"
|
||||
"#define make_uint4 (uint4)\n"
|
||||
"#define make_uint2 (uint2)\n"
|
||||
"#define make_int2 (int2)\n"
|
||||
"\n"
|
||||
"typedef struct\n"
|
||||
"{\n"
|
||||
" int m_n;\n"
|
||||
" int m_padding[3];\n"
|
||||
"} ConstBuffer;\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"__kernel\n"
|
||||
"__attribute__((reqd_work_group_size(64,1,1)))\n"
|
||||
"void Copy1F4Kernel(__global float4* dst, __global float4* src, \n"
|
||||
" ConstBuffer cb)\n"
|
||||
"{\n"
|
||||
" int gIdx = GET_GLOBAL_IDX;\n"
|
||||
"\n"
|
||||
" if( gIdx < cb.m_n )\n"
|
||||
" {\n"
|
||||
" float4 a0 = src[gIdx];\n"
|
||||
"\n"
|
||||
" dst[ gIdx ] = a0;\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"__kernel\n"
|
||||
"__attribute__((reqd_work_group_size(64,1,1)))\n"
|
||||
"void Copy2F4Kernel(__global float4* dst, __global float4* src, \n"
|
||||
" ConstBuffer cb)\n"
|
||||
"{\n"
|
||||
" int gIdx = GET_GLOBAL_IDX;\n"
|
||||
"\n"
|
||||
" if( 2*gIdx <= cb.m_n )\n"
|
||||
" {\n"
|
||||
" float4 a0 = src[gIdx*2+0];\n"
|
||||
" float4 a1 = src[gIdx*2+1];\n"
|
||||
"\n"
|
||||
" dst[ gIdx*2+0 ] = a0;\n"
|
||||
" dst[ gIdx*2+1 ] = a1;\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"__kernel\n"
|
||||
"__attribute__((reqd_work_group_size(64,1,1)))\n"
|
||||
"void Copy4F4Kernel(__global float4* dst, __global float4* src, \n"
|
||||
" ConstBuffer cb)\n"
|
||||
"{\n"
|
||||
" int gIdx = GET_GLOBAL_IDX;\n"
|
||||
"\n"
|
||||
" if( 4*gIdx <= cb.m_n )\n"
|
||||
" {\n"
|
||||
" int idx0 = gIdx*4+0;\n"
|
||||
" int idx1 = gIdx*4+1;\n"
|
||||
" int idx2 = gIdx*4+2;\n"
|
||||
" int idx3 = gIdx*4+3;\n"
|
||||
"\n"
|
||||
" float4 a0 = src[idx0];\n"
|
||||
" float4 a1 = src[idx1];\n"
|
||||
" float4 a2 = src[idx2];\n"
|
||||
" float4 a3 = src[idx3];\n"
|
||||
"\n"
|
||||
" dst[ idx0 ] = a0;\n"
|
||||
" dst[ idx1 ] = a1;\n"
|
||||
" dst[ idx2 ] = a2;\n"
|
||||
" dst[ idx3 ] = a3;\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"__kernel\n"
|
||||
"__attribute__((reqd_work_group_size(64,1,1)))\n"
|
||||
"void CopyF1Kernel(__global float* dstF1, __global float* srcF1, \n"
|
||||
" ConstBuffer cb)\n"
|
||||
"{\n"
|
||||
" int gIdx = GET_GLOBAL_IDX;\n"
|
||||
"\n"
|
||||
" if( gIdx < cb.m_n )\n"
|
||||
" {\n"
|
||||
" float a0 = srcF1[gIdx];\n"
|
||||
"\n"
|
||||
" dstF1[ gIdx ] = a0;\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"__kernel\n"
|
||||
"__attribute__((reqd_work_group_size(64,1,1)))\n"
|
||||
"void CopyF2Kernel(__global float2* dstF2, __global float2* srcF2, \n"
|
||||
" ConstBuffer cb)\n"
|
||||
"{\n"
|
||||
" int gIdx = GET_GLOBAL_IDX;\n"
|
||||
"\n"
|
||||
" if( gIdx < cb.m_n )\n"
|
||||
" {\n"
|
||||
" float2 a0 = srcF2[gIdx];\n"
|
||||
"\n"
|
||||
" dstF2[ gIdx ] = a0;\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
;
|
||||
@@ -0,0 +1,120 @@
|
||||
static const char* copyKernelsDX11= \
|
||||
"/*\n"
|
||||
" 2011 Takahiro Harada\n"
|
||||
"*/\n"
|
||||
"\n"
|
||||
"typedef uint u32;\n"
|
||||
"\n"
|
||||
"#define GET_GROUP_IDX groupIdx.x\n"
|
||||
"#define GET_LOCAL_IDX localIdx.x\n"
|
||||
"#define GET_GLOBAL_IDX globalIdx.x\n"
|
||||
"#define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync()\n"
|
||||
"#define GROUP_MEM_FENCE\n"
|
||||
"#define DEFAULT_ARGS uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID\n"
|
||||
"#define AtomInc(x) InterlockedAdd(x, 1)\n"
|
||||
"#define AtomInc1(x, out) InterlockedAdd(x, 1, out)\n"
|
||||
"\n"
|
||||
"#define make_uint4 uint4\n"
|
||||
"#define make_uint2 uint2\n"
|
||||
"#define make_int2 int2\n"
|
||||
"\n"
|
||||
"#define WG_SIZE 64\n"
|
||||
"\n"
|
||||
"#define GET_GROUP_SIZE WG_SIZE\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"cbuffer CB : register( b0 )\n"
|
||||
"{\n"
|
||||
" int m_n;\n"
|
||||
" int m_padding[3];\n"
|
||||
"};\n"
|
||||
"\n"
|
||||
"RWStructuredBuffer<float4> dst : register( u0 );\n"
|
||||
"StructuredBuffer<float4> src : register( t0 );\n"
|
||||
"\n"
|
||||
"[numthreads(WG_SIZE, 1, 1)]\n"
|
||||
"void Copy1F4Kernel( DEFAULT_ARGS )\n"
|
||||
"{\n"
|
||||
" int gIdx = GET_GLOBAL_IDX;\n"
|
||||
"\n"
|
||||
" if( gIdx < m_n )\n"
|
||||
" {\n"
|
||||
" float4 a0 = src[gIdx];\n"
|
||||
"\n"
|
||||
" dst[ gIdx ] = a0;\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"[numthreads(WG_SIZE, 1, 1)]\n"
|
||||
"void Copy2F4Kernel( DEFAULT_ARGS )\n"
|
||||
"{\n"
|
||||
" int gIdx = GET_GLOBAL_IDX;\n"
|
||||
"\n"
|
||||
" if( 2*gIdx <= m_n )\n"
|
||||
" {\n"
|
||||
" float4 a0 = src[gIdx*2+0];\n"
|
||||
" float4 a1 = src[gIdx*2+1];\n"
|
||||
"\n"
|
||||
" dst[ gIdx*2+0 ] = a0;\n"
|
||||
" dst[ gIdx*2+1 ] = a1;\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"[numthreads(WG_SIZE, 1, 1)]\n"
|
||||
"void Copy4F4Kernel( DEFAULT_ARGS )\n"
|
||||
"{\n"
|
||||
" int gIdx = GET_GLOBAL_IDX;\n"
|
||||
"\n"
|
||||
" if( 4*gIdx <= m_n )\n"
|
||||
" {\n"
|
||||
" int idx0 = gIdx*4+0;\n"
|
||||
" int idx1 = gIdx*4+1;\n"
|
||||
" int idx2 = gIdx*4+2;\n"
|
||||
" int idx3 = gIdx*4+3;\n"
|
||||
"\n"
|
||||
" float4 a0 = src[idx0];\n"
|
||||
" float4 a1 = src[idx1];\n"
|
||||
" float4 a2 = src[idx2];\n"
|
||||
" float4 a3 = src[idx3];\n"
|
||||
"\n"
|
||||
" dst[ idx0 ] = a0;\n"
|
||||
" dst[ idx1 ] = a1;\n"
|
||||
" dst[ idx2 ] = a2;\n"
|
||||
" dst[ idx3 ] = a3;\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"RWStructuredBuffer<float> dstF1 : register( u0 );\n"
|
||||
"StructuredBuffer<float> srcF1 : register( t0 );\n"
|
||||
"\n"
|
||||
"[numthreads(WG_SIZE, 1, 1)]\n"
|
||||
"void CopyF1Kernel( DEFAULT_ARGS )\n"
|
||||
"{\n"
|
||||
" int gIdx = GET_GLOBAL_IDX;\n"
|
||||
"\n"
|
||||
" if( gIdx < m_n )\n"
|
||||
" {\n"
|
||||
" float a0 = srcF1[gIdx];\n"
|
||||
"\n"
|
||||
" dstF1[ gIdx ] = a0;\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"RWStructuredBuffer<float2> dstF2 : register( u0 );\n"
|
||||
"StructuredBuffer<float2> srcF2 : register( t0 );\n"
|
||||
"\n"
|
||||
"[numthreads(WG_SIZE, 1, 1)]\n"
|
||||
"void CopyF2Kernel( DEFAULT_ARGS )\n"
|
||||
"{\n"
|
||||
" int gIdx = GET_GLOBAL_IDX;\n"
|
||||
"\n"
|
||||
" if( gIdx < m_n )\n"
|
||||
" {\n"
|
||||
" float2 a0 = srcF2[gIdx];\n"
|
||||
"\n"
|
||||
" dstF2[ gIdx ] = a0;\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
;
|
||||
@@ -0,0 +1,77 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.

This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:

1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada

#pragma once

#include <Adl/Adl.h>
#include <AdlPrimitives/Math/Math.h>

namespace adl
{

class FillBase
{
    public:
        enum Option
        {

        };
};

template<DeviceType TYPE>
class Fill
{
    public:
        typedef Launcher::BufferInfo BufferInfo;

        struct ConstData
        {
            int4 m_data;
            int m_offset;
            int m_n;
            int m_padding[2];
        };

        struct Data
        {
            const Device* m_device;
            Kernel* m_fillIntKernel;
            Kernel* m_fillInt2Kernel;
            Kernel* m_fillInt4Kernel;
            Buffer<ConstData>* m_constBuffer;
        };

        static
        Data* allocate(const Device* deviceData);

        static
        void deallocate(Data* data);

        static
        void execute(Data* data, Buffer<int>& src, const int& value, int n, int offset = 0);

        static
        void execute(Data* data, Buffer<int2>& src, const int2& value, int n, int offset = 0);

        static
        void execute(Data* data, Buffer<int4>& src, const int4& value, int n, int offset = 0);

};


#include <AdlPrimitives/Fill/FillHost.inl>
#include <AdlPrimitives/Fill/Fill.inl>

};
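A similarly minimal sketch for the Fill primitive declared above, again assuming an already-initialized adl::Device and a Buffer<int> created elsewhere (neither is shown in this diff), with the header assumed to live at <AdlPrimitives/Fill/Fill.h>.

#include <Adl/Adl.h>
#include <AdlPrimitives/Fill/Fill.h>   // assumed location of the header above

using namespace adl;

// Writes 0 into flags[0 .. n-1] on the host backend; the int2/int4 overloads
// and the CL/DX11 backends follow the same allocate/execute/deallocate pattern.
void clearFlagsHost( const Device* hostDevice, Buffer<int>& flags, int n )
{
    Fill<TYPE_HOST>::Data* data = Fill<TYPE_HOST>::allocate( hostDevice );

    Fill<TYPE_HOST>::execute( data, flags, 0, n /*count*/, 0 /*offset*/ );

    Fill<TYPE_HOST>::deallocate( data );
}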
@@ -0,0 +1,123 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.

This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:

1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada


//#define PATH "..\\..\\AdlPrimitives\\Fill\\FillKernels"
#define PATH "..\\..\\opencl\\primitives\\AdlPrimitives\\Fill\\FillKernels"
#define KERNEL0 "FillIntKernel"
#define KERNEL1 "FillInt2Kernel"
#define KERNEL2 "FillInt4Kernel"

#include <AdlPrimitives/Fill/FillKernelsCL.h>
#include <AdlPrimitives/Fill/FillKernelsDX11.h>

template<DeviceType TYPE>
typename Fill<TYPE>::Data* Fill<TYPE>::allocate( const Device* device )
{
    ADLASSERT( TYPE == device->m_type );

    const char* src[] =
#if defined(ADL_LOAD_KERNEL_FROM_STRING)
        {fillKernelsCL, fillKernelsDX11};
#else
        {0,0};
#endif

    Data* data = new Data;
    data->m_device = device;
    data->m_fillIntKernel = device->getKernel( PATH, KERNEL0, 0, src[TYPE] );
    data->m_fillInt2Kernel = device->getKernel( PATH, KERNEL1, 0, src[TYPE] );
    data->m_fillInt4Kernel = device->getKernel( PATH, KERNEL2, 0, src[TYPE] );
    data->m_constBuffer = new Buffer<ConstData>( device, 1, BufferBase::BUFFER_CONST );

    return data;
}

template<DeviceType TYPE>
void Fill<TYPE>::deallocate( Data* data )
{
    delete data->m_constBuffer;
    delete data;
}

template<DeviceType TYPE>
void Fill<TYPE>::execute(Data* data, Buffer<int>& src, const int& value, int n, int offset)
{
    ADLASSERT( n>0 );
    ConstData constBuffer;
    {
        constBuffer.m_offset = offset;
        constBuffer.m_n = n;
        constBuffer.m_data = make_int4( value );
    }

    {
        BufferInfo bInfo[] = { BufferInfo( &src ) };

        Launcher launcher( data->m_device, data->m_fillIntKernel );
        launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
        launcher.setConst( *data->m_constBuffer, constBuffer );
        launcher.launch1D( n );
    }
}

template<DeviceType TYPE>
void Fill<TYPE>::execute(Data* data, Buffer<int2>& src, const int2& value, int n, int offset)
{
    ADLASSERT( n>0 );
    ConstData constBuffer;
    {
        constBuffer.m_offset = offset;
        constBuffer.m_n = n;
        constBuffer.m_data = make_int4( value.x, value.y, 0, 0 );
    }

    {
        BufferInfo bInfo[] = { BufferInfo( &src ) };

        Launcher launcher( data->m_device, data->m_fillInt2Kernel );
        launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
        launcher.setConst( *data->m_constBuffer, constBuffer );
        launcher.launch1D( n );
    }
}

template<DeviceType TYPE>
void Fill<TYPE>::execute(Data* data, Buffer<int4>& src, const int4& value, int n, int offset)
{
    ADLASSERT( n>0 );
    ConstData constBuffer;
    {
        constBuffer.m_offset = offset;
        constBuffer.m_n = n;
        constBuffer.m_data = value;
    }

    {
        BufferInfo bInfo[] = { BufferInfo( &src ) };

        Launcher launcher( data->m_device, data->m_fillInt4Kernel );
        launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
        launcher.setConst( *data->m_constBuffer, constBuffer );
        launcher.launch1D( n );
    }
}

#undef PATH
#undef KERNEL0
#undef KERNEL1
#undef KERNEL2

@@ -0,0 +1,99 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.

This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:

1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada

template<>
class Fill<TYPE_HOST>
{
    public:
        struct Data
        {
        };

        static
        Data* allocate(const Device* deviceData)
        {
            return 0;
        }

        static
        void deallocate(Data* data)
        {

        }

        template<typename T>
        static
        void executeImpl(Data* data, Buffer<T>& src, const T& value, int n, int offset = 0)
        {
            ADLASSERT( src.getType() == TYPE_HOST );
            ADLASSERT( src.m_size >= offset+n );
            HostBuffer<T>& hSrc = (HostBuffer<T>&)src;

            for(int idx=offset; idx<offset+n; idx++)
            {
                hSrc[idx] = value;
            }
        }

        static
        void execute(Data* data, Buffer<int>& src, const int& value, int n, int offset = 0)
        {
            executeImpl( data, src, value, n, offset );
        }

        static
        void execute(Data* data, Buffer<int2>& src, const int2& value, int n, int offset = 0)
        {
            executeImpl( data, src, value, n, offset );
        }

        static
        void execute(Data* data, Buffer<int4>& src, const int4& value, int n, int offset = 0)
        {
            executeImpl( data, src, value, n, offset );
        }

/*
        static
        void execute(Data* data, Buffer<int>& src, int value, int n, int offset = 0)
        {
            ADLASSERT( src.getType() == TYPE_HOST );
            ADLASSERT( src.m_size <= offset+n );
            HostBuffer<u32>& hSrc = (HostBuffer<u32>&)src;

            for(int idx=offset; idx<offset+n; idx++)
            {
                src[i] = value;
            }
        }

        static
        void execute(Data* data, Buffer<int2>& src, const int2& value, int n, int offset = 0)
        {
            ADLASSERT( src.getType() == TYPE_HOST );
            ADLASSERT( src.m_size <= offset+n );

        }

        static
        void execute(Data* data, Buffer<int4>& src, const int4& value, int n, int offset = 0)
        {
            ADLASSERT( src.getType() == TYPE_HOST );
            ADLASSERT( src.m_size <= offset+n );

        }
*/
};

@@ -0,0 +1,81 @@
|
||||
/*
|
||||
Copyright (c) 2012 Advanced Micro Devices, Inc.
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
//Originally written by Takahiro Harada
|
||||
|
||||
|
||||
#pragma OPENCL EXTENSION cl_amd_printf : enable
|
||||
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
|
||||
|
||||
typedef unsigned int u32;
|
||||
#define GET_GROUP_IDX get_group_id(0)
|
||||
#define GET_LOCAL_IDX get_local_id(0)
|
||||
#define GET_GLOBAL_IDX get_global_id(0)
|
||||
#define GET_GROUP_SIZE get_local_size(0)
|
||||
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
|
||||
#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)
|
||||
#define AtomInc(x) atom_inc(&(x))
|
||||
#define AtomInc1(x, out) out = atom_inc(&(x))
|
||||
|
||||
#define make_uint4 (uint4)
|
||||
#define make_uint2 (uint2)
|
||||
#define make_int2 (int2)
|
||||
|
||||
typedef struct
|
||||
{
|
||||
int4 m_data;
|
||||
int m_offset;
|
||||
int m_n;
|
||||
int m_padding[2];
|
||||
} ConstBuffer;
|
||||
|
||||
|
||||
__kernel
|
||||
__attribute__((reqd_work_group_size(64,1,1)))
|
||||
void FillIntKernel(__global int* dstInt,
|
||||
ConstBuffer cb)
|
||||
{
|
||||
int gIdx = GET_GLOBAL_IDX;
|
||||
|
||||
if( gIdx < cb.m_n )
|
||||
{
|
||||
dstInt[ cb.m_offset+gIdx ] = cb.m_data.x;
|
||||
}
|
||||
}
|
||||
|
||||
__kernel
|
||||
__attribute__((reqd_work_group_size(64,1,1)))
|
||||
void FillInt2Kernel(__global int2* dstInt2,
|
||||
ConstBuffer cb)
|
||||
{
|
||||
int gIdx = GET_GLOBAL_IDX;
|
||||
|
||||
if( gIdx < cb.m_n )
|
||||
{
|
||||
dstInt2[ cb.m_offset+gIdx ] = make_int2( cb.m_data.x, cb.m_data.y );
|
||||
}
|
||||
}
|
||||
|
||||
__kernel
|
||||
__attribute__((reqd_work_group_size(64,1,1)))
|
||||
void FillInt4Kernel(__global int4* dstInt4,
|
||||
ConstBuffer cb)
|
||||
{
|
||||
int gIdx = GET_GLOBAL_IDX;
|
||||
|
||||
if( gIdx < cb.m_n )
|
||||
{
|
||||
dstInt4[ cb.m_offset+gIdx ] = cb.m_data;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,79 @@
|
||||
/*
|
||||
Copyright (c) 2012 Advanced Micro Devices, Inc.
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
//Originally written by Takahiro Harada
|
||||
|
||||
|
||||
typedef uint u32;
|
||||
|
||||
#define GET_GROUP_IDX groupIdx.x
|
||||
#define GET_LOCAL_IDX localIdx.x
|
||||
#define GET_GLOBAL_IDX globalIdx.x
|
||||
#define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync()
|
||||
#define GROUP_MEM_FENCE
|
||||
#define DEFAULT_ARGS uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID
|
||||
#define AtomInc(x) InterlockedAdd(x, 1)
|
||||
#define AtomInc1(x, out) InterlockedAdd(x, 1, out)
|
||||
|
||||
#define make_uint4 uint4
|
||||
#define make_uint2 uint2
|
||||
#define make_int2 int2
|
||||
|
||||
|
||||
cbuffer CB : register( b0 )
|
||||
{
|
||||
int4 m_data;
|
||||
int m_offset;
|
||||
int m_n;
|
||||
int m_padding[2];
|
||||
};
|
||||
|
||||
|
||||
RWStructuredBuffer<int> dstInt : register( u0 );
|
||||
|
||||
[numthreads(64, 1, 1)]
|
||||
void FillIntKernel( DEFAULT_ARGS )
|
||||
{
|
||||
int gIdx = GET_GLOBAL_IDX;
|
||||
|
||||
if( gIdx < m_n )
|
||||
{
|
||||
dstInt[ m_offset+gIdx ] = m_data.x;
|
||||
}
|
||||
}
|
||||
|
||||
RWStructuredBuffer<int2> dstInt2 : register( u0 );
|
||||
|
||||
[numthreads(64, 1, 1)]
|
||||
void FillInt2Kernel( DEFAULT_ARGS )
|
||||
{
|
||||
int gIdx = GET_GLOBAL_IDX;
|
||||
|
||||
if( gIdx < m_n )
|
||||
{
|
||||
dstInt2[ m_offset+gIdx ] = make_int2( m_data.x, m_data.y );
|
||||
}
|
||||
}
|
||||
|
||||
RWStructuredBuffer<int4> dstInt4 : register( u0 );
|
||||
|
||||
[numthreads(64, 1, 1)]
|
||||
void FillInt4Kernel( DEFAULT_ARGS )
|
||||
{
|
||||
int gIdx = GET_GLOBAL_IDX;
|
||||
|
||||
if( gIdx < m_n )
|
||||
{
|
||||
dstInt4[ m_offset+gIdx ] = m_data;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,71 @@
|
||||
static const char* fillKernelsCL= \
|
||||
"/*\n"
|
||||
" 2011 Takahiro Harada\n"
|
||||
"*/\n"
|
||||
"\n"
|
||||
"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
|
||||
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
|
||||
"\n"
|
||||
"typedef unsigned int u32;\n"
|
||||
"#define GET_GROUP_IDX get_group_id(0)\n"
|
||||
"#define GET_LOCAL_IDX get_local_id(0)\n"
|
||||
"#define GET_GLOBAL_IDX get_global_id(0)\n"
|
||||
"#define GET_GROUP_SIZE get_local_size(0)\n"
|
||||
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
|
||||
"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
|
||||
"#define AtomInc(x) atom_inc(&(x))\n"
|
||||
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
|
||||
"\n"
|
||||
"#define make_uint4 (uint4)\n"
|
||||
"#define make_uint2 (uint2)\n"
|
||||
"#define make_int2 (int2)\n"
|
||||
"\n"
|
||||
"typedef struct\n"
|
||||
"{\n"
|
||||
" int4 m_data;\n"
|
||||
" int m_offset;\n"
|
||||
" int m_n;\n"
|
||||
" int m_padding[2];\n"
|
||||
"} ConstBuffer;\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"__kernel\n"
|
||||
"__attribute__((reqd_work_group_size(64,1,1)))\n"
|
||||
"void FillIntKernel(__global int* dstInt, \n"
|
||||
" ConstBuffer cb)\n"
|
||||
"{\n"
|
||||
" int gIdx = GET_GLOBAL_IDX;\n"
|
||||
"\n"
|
||||
" if( gIdx < cb.m_n )\n"
|
||||
" {\n"
|
||||
" dstInt[ cb.m_offset+gIdx ] = cb.m_data.x;\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"__kernel\n"
|
||||
"__attribute__((reqd_work_group_size(64,1,1)))\n"
|
||||
"void FillInt2Kernel(__global int2* dstInt2, \n"
|
||||
" ConstBuffer cb)\n"
|
||||
"{\n"
|
||||
" int gIdx = GET_GLOBAL_IDX;\n"
|
||||
"\n"
|
||||
" if( gIdx < cb.m_n )\n"
|
||||
" {\n"
|
||||
" dstInt2[ cb.m_offset+gIdx ] = make_int2( cb.m_data.x, cb.m_data.y );\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"__kernel\n"
|
||||
"__attribute__((reqd_work_group_size(64,1,1)))\n"
|
||||
"void FillInt4Kernel(__global int4* dstInt4, \n"
|
||||
" ConstBuffer cb)\n"
|
||||
"{\n"
|
||||
" int gIdx = GET_GLOBAL_IDX;\n"
|
||||
"\n"
|
||||
" if( gIdx < cb.m_n )\n"
|
||||
" {\n"
|
||||
" dstInt4[ cb.m_offset+gIdx ] = cb.m_data;\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
;
|
||||
@@ -0,0 +1,69 @@
|
||||
static const char* fillKernelsDX11= \
|
||||
"/*\n"
|
||||
" 2011 Takahiro Harada\n"
|
||||
"*/\n"
|
||||
"\n"
|
||||
"typedef uint u32;\n"
|
||||
"\n"
|
||||
"#define GET_GROUP_IDX groupIdx.x\n"
|
||||
"#define GET_LOCAL_IDX localIdx.x\n"
|
||||
"#define GET_GLOBAL_IDX globalIdx.x\n"
|
||||
"#define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync()\n"
|
||||
"#define GROUP_MEM_FENCE\n"
|
||||
"#define DEFAULT_ARGS uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID\n"
|
||||
"#define AtomInc(x) InterlockedAdd(x, 1)\n"
|
||||
"#define AtomInc1(x, out) InterlockedAdd(x, 1, out)\n"
|
||||
"\n"
|
||||
"#define make_uint4 uint4\n"
|
||||
"#define make_uint2 uint2\n"
|
||||
"#define make_int2 int2\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"cbuffer CB : register( b0 )\n"
|
||||
"{\n"
|
||||
" int4 m_data;\n"
|
||||
" int m_offset;\n"
|
||||
" int m_n;\n"
|
||||
" int m_padding[2];\n"
|
||||
"};\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"RWStructuredBuffer<int> dstInt : register( u0 );\n"
|
||||
"\n"
|
||||
"[numthreads(64, 1, 1)]\n"
|
||||
"void FillIntKernel( DEFAULT_ARGS )\n"
|
||||
"{\n"
|
||||
" int gIdx = GET_GLOBAL_IDX;\n"
|
||||
"\n"
|
||||
" if( gIdx < m_n )\n"
|
||||
" {\n"
|
||||
" dstInt[ m_offset+gIdx ] = m_data.x;\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"RWStructuredBuffer<int2> dstInt2 : register( u0 );\n"
|
||||
"\n"
|
||||
"[numthreads(64, 1, 1)]\n"
|
||||
"void FillInt2Kernel( DEFAULT_ARGS )\n"
|
||||
"{\n"
|
||||
" int gIdx = GET_GLOBAL_IDX;\n"
|
||||
"\n"
|
||||
" if( gIdx < m_n )\n"
|
||||
" {\n"
|
||||
" dstInt2[ m_offset+gIdx ] = make_int2( m_data.x, m_data.y );\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"RWStructuredBuffer<int4> dstInt4 : register( u0 );\n"
|
||||
"\n"
|
||||
"[numthreads(64, 1, 1)]\n"
|
||||
"void FillInt4Kernel( DEFAULT_ARGS )\n"
|
||||
"{\n"
|
||||
" int gIdx = GET_GLOBAL_IDX;\n"
|
||||
"\n"
|
||||
" if( gIdx < m_n )\n"
|
||||
" {\n"
|
||||
" dstInt4[ m_offset+gIdx ] = m_data;\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
;
|
||||
@@ -0,0 +1,231 @@
|
||||
/*
|
||||
Copyright (c) 2012 Advanced Micro Devices, Inc.
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
//Originally written by Takahiro Harada
|
||||
|
||||
#ifndef ARRAY_H
|
||||
#define ARRAY_H
|
||||
|
||||
#include <string.h>
|
||||
#include <malloc.h>
|
||||
#include <Common/Base/Error.h>
|
||||
#include <new.h>
|
||||
|
||||
namespace adl
|
||||
{
|
||||
|
||||
template <class T>
|
||||
class Array
|
||||
{
|
||||
public:
|
||||
__inline
|
||||
Array();
|
||||
__inline
|
||||
Array(int size);
|
||||
__inline
|
||||
~Array();
|
||||
__inline
|
||||
T& operator[] (int idx);
|
||||
__inline
|
||||
const T& operator[] (int idx) const;
|
||||
__inline
|
||||
void pushBack(const T& elem);
|
||||
__inline
|
||||
void popBack();
|
||||
__inline
|
||||
void clear();
|
||||
__inline
|
||||
void setSize(int size);
|
||||
__inline
|
||||
int getSize() const;
|
||||
__inline
|
||||
T* begin();
|
||||
__inline
|
||||
const T* begin() const;
|
||||
__inline
|
||||
T* end();
|
||||
__inline
|
||||
const T* end() const;
|
||||
__inline
|
||||
int indexOf(const T& data) const;
|
||||
__inline
|
||||
void removeAt(int idx);
|
||||
__inline
|
||||
T& expandOne();
|
||||
|
||||
private:
|
||||
Array(const Array& a){}
|
||||
|
||||
private:
|
||||
enum
|
||||
{
|
||||
DEFAULT_SIZE = 128,
|
||||
INCREASE_SIZE = 128,
|
||||
};
|
||||
|
||||
T* m_data;
|
||||
int m_size;
|
||||
int m_capacity;
|
||||
};
|
||||
|
||||
template<class T>
|
||||
Array<T>::Array()
|
||||
{
|
||||
m_size = 0;
|
||||
m_capacity = DEFAULT_SIZE;
|
||||
// m_data = new T[ m_capacity ];
|
||||
m_data = (T*)_aligned_malloc(sizeof(T)*m_capacity, 16);
|
||||
for(int i=0; i<m_capacity; i++) new(&m_data[i])T;
|
||||
}
|
||||
|
||||
template<class T>
|
||||
Array<T>::Array(int size)
|
||||
{
|
||||
m_size = size;
|
||||
m_capacity = size;
|
||||
// m_data = new T[ m_capacity ];
|
||||
m_data = (T*)_aligned_malloc(sizeof(T)*m_capacity, 16);
|
||||
for(int i=0; i<m_capacity; i++) new(&m_data[i])T;
|
||||
}
|
||||
|
||||
template<class T>
|
||||
Array<T>::~Array()
|
||||
{
|
||||
if( m_data )
|
||||
{
|
||||
// delete [] m_data;
|
||||
_aligned_free( m_data );
|
||||
m_data = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
template<class T>
|
||||
T& Array<T>::operator[](int idx)
|
||||
{
|
||||
ADLASSERT(idx<m_size);
|
||||
return m_data[idx];
|
||||
}
|
||||
|
||||
template<class T>
|
||||
const T& Array<T>::operator[](int idx) const
|
||||
{
|
||||
ADLASSERT(idx<m_size);
|
||||
return m_data[idx];
|
||||
}
|
||||
|
||||
template<class T>
|
||||
void Array<T>::pushBack(const T& elem)
|
||||
{
|
||||
if( m_size == m_capacity )
|
||||
{
|
||||
int oldCap = m_capacity;
|
||||
m_capacity += INCREASE_SIZE;
|
||||
// T* s = new T[m_capacity];
|
||||
T* s = (T*)_aligned_malloc(sizeof(T)*m_capacity, 16);
|
||||
memcpy( s, m_data, sizeof(T)*oldCap );
|
||||
// delete [] m_data;
|
||||
_aligned_free( m_data );
|
||||
m_data = s;
|
||||
}
|
||||
m_data[ m_size++ ] = elem;
|
||||
}
|
||||
|
||||
template<class T>
|
||||
void Array<T>::popBack()
|
||||
{
|
||||
ADLASSERT( m_size>0 );
|
||||
m_size--;
|
||||
}
|
||||
|
||||
template<class T>
|
||||
void Array<T>::clear()
|
||||
{
|
||||
m_size = 0;
|
||||
}
|
||||
|
||||
template<class T>
|
||||
void Array<T>::setSize(int size)
|
||||
{
|
||||
if( size > m_capacity )
|
||||
{
|
||||
int oldCap = m_capacity;
|
||||
m_capacity = size;
|
||||
// T* s = new T[m_capacity];
|
||||
T* s = (T*)_aligned_malloc(sizeof(T)*m_capacity, 16);
|
||||
for(int i=0; i<m_capacity; i++) new(&s[i])T;
|
||||
memcpy( s, m_data, sizeof(T)*oldCap );
|
||||
// delete [] m_data;
|
||||
_aligned_free( m_data );
|
||||
m_data = s;
|
||||
}
|
||||
m_size = size;
|
||||
}
|
||||
|
||||
template<class T>
|
||||
int Array<T>::getSize() const
|
||||
{
|
||||
return m_size;
|
||||
}
|
||||
|
||||
template<class T>
|
||||
const T* Array<T>::begin() const
|
||||
{
|
||||
return m_data;
|
||||
}
|
||||
|
||||
template<class T>
|
||||
T* Array<T>::begin()
|
||||
{
|
||||
return m_data;
|
||||
}
|
||||
|
||||
template<class T>
|
||||
T* Array<T>::end()
|
||||
{
|
||||
return m_data+m_size;
|
||||
}
|
||||
|
||||
template<class T>
|
||||
const T* Array<T>::end() const
|
||||
{
|
||||
return m_data+m_size;
|
||||
}
|
||||
|
||||
template<class T>
|
||||
int Array<T>::indexOf(const T& data) const
|
||||
{
|
||||
for(int i=0; i<m_size; i++)
|
||||
{
|
||||
if( data == m_data[i] ) return i;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
template<class T>
|
||||
void Array<T>::removeAt(int idx)
|
||||
{
|
||||
ADLASSERT(idx<m_size);
|
||||
m_data[idx] = m_data[--m_size];
|
||||
}
|
||||
|
||||
template<class T>
|
||||
T& Array<T>::expandOne()
|
||||
{
|
||||
setSize( m_size+1 );
|
||||
return m_data[ m_size-1 ];
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
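A short usage sketch of the adl::Array<T> container declared above. The include path is assumed, and because growth copies elements with memcpy the container should only hold trivially copyable types.

#include <AdlPrimitives/Common/Array.h>   // assumed path of the header above

void arrayExample()
{
    adl::Array<int> values;           // default capacity of 128, grows by INCREASE_SIZE
    for(int i=0; i<4; i++)
        values.pushBack( i*i );

    int idx = values.indexOf( 4 );    // linear search, returns -1 if absent
    if( idx >= 0 )
        values.removeAt( idx );       // moves the last element into the hole, order not preserved

    int sum = 0;
    for(int i=0; i<values.getSize(); i++)
        sum += values[i];
}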
@@ -0,0 +1,173 @@
|
||||
/*
|
||||
Copyright (c) 2012 Advanced Micro Devices, Inc.
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
//Originally written by Takahiro Harada
|
||||
|
||||
|
||||
__inline
|
||||
float2 make_float2(float x, float y)
|
||||
{
|
||||
float2 v;
|
||||
v.s[0] = x; v.s[1] = y;
|
||||
return v;
|
||||
}
|
||||
|
||||
__inline
|
||||
float2 make_float2(float x)
|
||||
{
|
||||
return make_float2(x,x);
|
||||
}
|
||||
|
||||
__inline
|
||||
float2 make_float2(const int2& x)
|
||||
{
|
||||
return make_float2((float)x.s[0], (float)x.s[1]);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
__inline
|
||||
float2 operator-(const float2& a)
|
||||
{
|
||||
return make_float2(-a.x, -a.y);
|
||||
}
|
||||
|
||||
__inline
|
||||
float2 operator*(const float2& a, const float2& b)
|
||||
{
|
||||
float2 out;
|
||||
out.s[0] = a.s[0]*b.s[0];
|
||||
out.s[1] = a.s[1]*b.s[1];
|
||||
return out;
|
||||
}
|
||||
|
||||
__inline
|
||||
float2 operator*(float a, const float2& b)
|
||||
{
|
||||
return make_float2(a*b.s[0], a*b.s[1]);
|
||||
}
|
||||
|
||||
__inline
|
||||
float2 operator*(const float2& b, float a)
|
||||
{
|
||||
return make_float2(a*b.s[0], a*b.s[1]);
|
||||
}
|
||||
|
||||
__inline
|
||||
void operator*=(float2& a, const float2& b)
|
||||
{
|
||||
a.s[0]*=b.s[0];
|
||||
a.s[1]*=b.s[1];
|
||||
}
|
||||
|
||||
__inline
|
||||
void operator*=(float2& a, float b)
|
||||
{
|
||||
a.s[0]*=b;
|
||||
a.s[1]*=b;
|
||||
}
|
||||
|
||||
__inline
|
||||
float2 operator/(const float2& a, const float2& b)
|
||||
{
|
||||
float2 out;
|
||||
out.s[0] = a.s[0]/b.s[0];
|
||||
out.s[1] = a.s[1]/b.s[1];
|
||||
return out;
|
||||
}
|
||||
|
||||
__inline
|
||||
float2 operator/(const float2& b, float a)
|
||||
{
|
||||
return make_float2(b.s[0]/a, b.s[1]/a);
|
||||
}
|
||||
|
||||
__inline
|
||||
void operator/=(float2& a, const float2& b)
|
||||
{
|
||||
a.s[0]/=b.s[0];
|
||||
a.s[1]/=b.s[1];
|
||||
}
|
||||
|
||||
__inline
|
||||
void operator/=(float2& a, float b)
|
||||
{
|
||||
a.s[0]/=b;
|
||||
a.s[1]/=b;
|
||||
}
|
||||
//
|
||||
|
||||
__inline
|
||||
float2 operator+(const float2& a, const float2& b)
|
||||
{
|
||||
float2 out;
|
||||
out.s[0] = a.s[0]+b.s[0];
|
||||
out.s[1] = a.s[1]+b.s[1];
|
||||
return out;
|
||||
}
|
||||
|
||||
__inline
|
||||
float2 operator+(const float2& a, float b)
|
||||
{
|
||||
float2 out;
|
||||
out.s[0] = a.s[0]+b;
|
||||
out.s[1] = a.s[1]+b;
|
||||
return out;
|
||||
}
|
||||
|
||||
__inline
|
||||
float2 operator-(const float2& a, const float2& b)
|
||||
{
|
||||
float2 out;
|
||||
out.s[0] = a.s[0]-b.s[0];
|
||||
out.s[1] = a.s[1]-b.s[1];
|
||||
return out;
|
||||
}
|
||||
|
||||
__inline
|
||||
float2 operator-(const float2& a, float b)
|
||||
{
|
||||
float2 out;
|
||||
out.s[0] = a.s[0]-b;
|
||||
out.s[1] = a.s[1]-b;
|
||||
return out;
|
||||
}
|
||||
|
||||
__inline
|
||||
void operator+=(float2& a, const float2& b)
|
||||
{
|
||||
a.s[0]+=b.s[0];
|
||||
a.s[1]+=b.s[1];
|
||||
}
|
||||
|
||||
__inline
|
||||
void operator+=(float2& a, float b)
|
||||
{
|
||||
a.s[0]+=b;
|
||||
a.s[1]+=b;
|
||||
}
|
||||
|
||||
__inline
|
||||
void operator-=(float2& a, const float2& b)
|
||||
{
|
||||
a.s[0]-=b.s[0];
|
||||
a.s[1]-=b.s[1];
|
||||
}
|
||||
|
||||
__inline
|
||||
void operator-=(float2& a, float b)
|
||||
{
|
||||
a.s[0]-=b;
|
||||
a.s[1]-=b;
|
||||
}
|
||||
@@ -0,0 +1,375 @@
|
||||
/*
|
||||
Copyright (c) 2012 Advanced Micro Devices, Inc.
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
//Originally written by Takahiro Harada
|
||||
|
||||
|
||||
//#define CHECK_ALIGNMENT(a) ADLASSERT((u32(&(a)) & 0xf) == 0);
|
||||
#define CHECK_ALIGNMENT(a) a;
|
||||
|
||||
|
||||
__inline
|
||||
float4 make_float4(float x, float y, float z, float w = 0.f)
|
||||
{
|
||||
float4 v;
|
||||
v.x = x; v.y = y; v.z = z; v.w = w;
|
||||
return v;
|
||||
}
|
||||
|
||||
__inline
|
||||
float4 make_float4(float x)
|
||||
{
|
||||
return make_float4(x,x,x,x);
|
||||
}
|
||||
|
||||
__inline
|
||||
float4 make_float4(const int4& x)
|
||||
{
|
||||
return make_float4((float)x.s[0], (float)x.s[1], (float)x.s[2], (float)x.s[3]);
|
||||
}
|
||||
|
||||
__inline
|
||||
int4 make_int4(int x, int y, int z, int w = 0)
|
||||
{
|
||||
int4 v;
|
||||
v.s[0] = x; v.s[1] = y; v.s[2] = z; v.s[3] = w;
|
||||
return v;
|
||||
}
|
||||
|
||||
__inline
|
||||
int4 make_int4(int x)
|
||||
{
|
||||
return make_int4(x,x,x,x);
|
||||
}
|
||||
|
||||
__inline
|
||||
int4 make_int4(const float4& x)
|
||||
{
|
||||
return make_int4((int)x.x, (int)x.y, (int)x.z, (int)x.w);
|
||||
}
|
||||
|
||||
__inline
|
||||
int2 make_int2(int a, int b)
|
||||
{
|
||||
int2 ans; ans.x = a; ans.y = b;
|
||||
return ans;
|
||||
}
|
||||
|
||||
__inline
|
||||
bool operator ==(const int2& a, const int2& b)
|
||||
{
|
||||
return a.x==b.x && a.y==b.y;
|
||||
}
|
||||
|
||||
__inline
|
||||
bool operator ==(const int4& a, const int4& b)
|
||||
{
|
||||
return a.x==b.x && a.y==b.y && a.z==b.z && a.w==b.w;
|
||||
}
|
||||
|
||||
__inline
|
||||
bool operator ==(const float2& a, const float2& b)
|
||||
{
|
||||
return a.x==b.x && a.y==b.y;
|
||||
}
|
||||
|
||||
__inline
|
||||
bool operator ==(const float4& a, const float4& b)
|
||||
{
|
||||
return a.x==b.x && a.y==b.y && a.z==b.z && a.w==b.w;
|
||||
}
|
||||
|
||||
__inline
|
||||
float4 operator-(const float4& a)
|
||||
{
|
||||
return make_float4(-a.x, -a.y, -a.z, -a.w);
|
||||
}
|
||||
|
||||
__inline
|
||||
float4 operator*(const float4& a, const float4& b)
|
||||
{
|
||||
// ADLASSERT((u32(&a) & 0xf) == 0);
|
||||
|
||||
float4 out;
|
||||
out.s[0] = a.s[0]*b.s[0];
|
||||
out.s[1] = a.s[1]*b.s[1];
|
||||
out.s[2] = a.s[2]*b.s[2];
|
||||
out.s[3] = a.s[3]*b.s[3];
|
||||
return out;
|
||||
}
|
||||
|
||||
__inline
|
||||
float4 operator*(float a, const float4& b)
|
||||
{
|
||||
return make_float4(a*b.s[0], a*b.s[1], a*b.s[2], a*b.s[3]);
|
||||
}
|
||||
|
||||
__inline
|
||||
float4 operator*(const float4& b, float a)
|
||||
{
|
||||
CHECK_ALIGNMENT(b);
|
||||
|
||||
return make_float4(a*b.s[0], a*b.s[1], a*b.s[2], a*b.s[3]);
|
||||
}
|
||||
|
||||
__inline
|
||||
void operator*=(float4& a, const float4& b)
|
||||
{
|
||||
CHECK_ALIGNMENT(a);
|
||||
|
||||
a.s[0]*=b.s[0];
|
||||
a.s[1]*=b.s[1];
|
||||
a.s[2]*=b.s[2];
|
||||
a.s[3]*=b.s[3];
|
||||
}
|
||||
|
||||
__inline
|
||||
void operator*=(float4& a, float b)
|
||||
{
|
||||
CHECK_ALIGNMENT(a);
|
||||
|
||||
a.s[0]*=b;
|
||||
a.s[1]*=b;
|
||||
a.s[2]*=b;
|
||||
a.s[3]*=b;
|
||||
}
|
||||
/*
|
||||
__inline
|
||||
bool operator ==(const float4& a, const float4& b)
|
||||
{
|
||||
|
||||
|
||||
}
|
||||
*/
|
||||
//
|
||||
__inline
|
||||
float4 operator/(const float4& a, const float4& b)
|
||||
{
|
||||
CHECK_ALIGNMENT(a);
|
||||
|
||||
float4 out;
|
||||
out.s[0] = a.s[0]/b.s[0];
|
||||
out.s[1] = a.s[1]/b.s[1];
|
||||
out.s[2] = a.s[2]/b.s[2];
|
||||
out.s[3] = a.s[3]/b.s[3];
|
||||
return out;
|
||||
}
|
||||
|
||||
__inline
|
||||
float4 operator/(const float4& b, float a)
|
||||
{
|
||||
CHECK_ALIGNMENT(b);
|
||||
|
||||
return make_float4(b.s[0]/a, b.s[1]/a, b.s[2]/a, b.s[3]/a);
|
||||
}
|
||||
|
||||
__inline
|
||||
void operator/=(float4& a, const float4& b)
|
||||
{
|
||||
a.s[0]/=b.s[0];
|
||||
a.s[1]/=b.s[1];
|
||||
a.s[2]/=b.s[2];
|
||||
a.s[3]/=b.s[3];
|
||||
}
|
||||
|
||||
__inline
|
||||
void operator/=(float4& a, float b)
|
||||
{
|
||||
ADLASSERT((u32(&a) & 0xf) == 0);
|
||||
|
||||
a.s[0]/=b;
|
||||
a.s[1]/=b;
|
||||
a.s[2]/=b;
|
||||
a.s[3]/=b;
|
||||
}
|
||||
//
|
||||
|
||||
__inline
|
||||
float4 operator+(const float4& a, const float4& b)
|
||||
{
|
||||
CHECK_ALIGNMENT(a);
|
||||
|
||||
float4 out;
|
||||
out.s[0] = a.s[0]+b.s[0];
|
||||
out.s[1] = a.s[1]+b.s[1];
|
||||
out.s[2] = a.s[2]+b.s[2];
|
||||
out.s[3] = a.s[3]+b.s[3];
|
||||
return out;
|
||||
}
|
||||
|
||||
__inline
|
||||
float4 operator+(const float4& a, float b)
|
||||
{
|
||||
CHECK_ALIGNMENT(a);
|
||||
|
||||
float4 out;
|
||||
out.s[0] = a.s[0]+b;
|
||||
out.s[1] = a.s[1]+b;
|
||||
out.s[2] = a.s[2]+b;
|
||||
out.s[3] = a.s[3]+b;
|
||||
return out;
|
||||
}
|
||||
|
||||
__inline
|
||||
float4 operator-(const float4& a, const float4& b)
|
||||
{
|
||||
CHECK_ALIGNMENT(a);
|
||||
|
||||
float4 out;
|
||||
out.s[0] = a.s[0]-b.s[0];
|
||||
out.s[1] = a.s[1]-b.s[1];
|
||||
out.s[2] = a.s[2]-b.s[2];
|
||||
out.s[3] = a.s[3]-b.s[3];
|
||||
return out;
|
||||
}
|
||||
|
||||
__inline
|
||||
float4 operator-(const float4& a, float b)
|
||||
{
|
||||
CHECK_ALIGNMENT(a);
|
||||
|
||||
float4 out;
|
||||
out.s[0] = a.s[0]-b;
|
||||
out.s[1] = a.s[1]-b;
|
||||
out.s[2] = a.s[2]-b;
|
||||
out.s[3] = a.s[3]-b;
|
||||
return out;
|
||||
}
|
||||
|
||||
__inline
|
||||
void operator+=(float4& a, const float4& b)
|
||||
{
|
||||
CHECK_ALIGNMENT(a);
|
||||
|
||||
a.s[0]+=b.s[0];
|
||||
a.s[1]+=b.s[1];
|
||||
a.s[2]+=b.s[2];
|
||||
a.s[3]+=b.s[3];
|
||||
}
|
||||
|
||||
__inline
|
||||
void operator+=(float4& a, float b)
|
||||
{
|
||||
CHECK_ALIGNMENT(a);
|
||||
|
||||
a.s[0]+=b;
|
||||
a.s[1]+=b;
|
||||
a.s[2]+=b;
|
||||
a.s[3]+=b;
|
||||
}
|
||||
|
||||
__inline
|
||||
void operator-=(float4& a, const float4& b)
|
||||
{
|
||||
CHECK_ALIGNMENT(a);
|
||||
|
||||
a.s[0]-=b.s[0];
|
||||
a.s[1]-=b.s[1];
|
||||
a.s[2]-=b.s[2];
|
||||
a.s[3]-=b.s[3];
|
||||
}
|
||||
|
||||
__inline
|
||||
void operator-=(float4& a, float b)
|
||||
{
|
||||
CHECK_ALIGNMENT(a);
|
||||
|
||||
a.s[0]-=b;
|
||||
a.s[1]-=b;
|
||||
a.s[2]-=b;
|
||||
a.s[3]-=b;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
__inline
|
||||
float4 cross3(const float4& a, const float4& b)
|
||||
{
|
||||
return make_float4(a.s[1]*b.s[2]-a.s[2]*b.s[1],
|
||||
a.s[2]*b.s[0]-a.s[0]*b.s[2],
|
||||
a.s[0]*b.s[1]-a.s[1]*b.s[0],
|
||||
0);
|
||||
}
|
||||
|
||||
__inline
|
||||
float dot3F4(const float4& a, const float4& b)
|
||||
{
|
||||
return a.x*b.x+a.y*b.y+a.z*b.z;
|
||||
}
|
||||
|
||||
__inline
|
||||
float length3(const float4& a)
|
||||
{
|
||||
return sqrtf(dot3F4(a,a));
|
||||
}
|
||||
|
||||
__inline
|
||||
float dot4(const float4& a, const float4& b)
|
||||
{
|
||||
return a.x*b.x+a.y*b.y+a.z*b.z+a.w*b.w;
|
||||
}
|
||||
|
||||
// for height
|
||||
__inline
|
||||
float dot3w1(const float4& point, const float4& eqn)
|
||||
{
|
||||
return point.x*eqn.x+point.y*eqn.y+point.z*eqn.z+eqn.w;
|
||||
}
|
||||
|
||||
__inline
|
||||
float4 normalize3(const float4& a)
|
||||
{
|
||||
float length = sqrtf(dot3F4(a, a));
|
||||
return 1.f/length * a;
|
||||
}
|
||||
|
||||
__inline
|
||||
float4 normalize4(const float4& a)
|
||||
{
|
||||
float length = sqrtf(dot4(a, a));
|
||||
return 1.f/length * a;
|
||||
}
|
||||
|
||||
__inline
|
||||
float4 createEquation(const float4& a, const float4& b, const float4& c)
|
||||
{
|
||||
float4 eqn;
|
||||
float4 ab = b-a;
|
||||
float4 ac = c-a;
|
||||
eqn = normalize3( cross3(ab, ac) );
|
||||
eqn.w = -dot3F4(eqn,a);
|
||||
return eqn;
|
||||
}
|
||||
|
||||
__inline
|
||||
float intersectPlaneLine( const float4& planeEqn, const float4& vec, const float4& orig )
|
||||
{
|
||||
return (-planeEqn.w - dot3F4(planeEqn, orig))/dot3F4(planeEqn, vec);
|
||||
}
|
||||
|
||||
template<>
|
||||
__inline
|
||||
float4 max2(const float4& a, const float4& b)
|
||||
{
|
||||
return make_float4( max2(a.x,b.x), max2(a.y,b.y), max2(a.z,b.z), max2(a.w,b.w) );
|
||||
}
|
||||
|
||||
template<>
|
||||
__inline
|
||||
float4 min2(const float4& a, const float4& b)
|
||||
{
|
||||
return make_float4( min2(a.x,b.x), min2(a.y,b.y), min2(a.z,b.z), min2(a.w,b.w) );
|
||||
}
|
||||
|
||||
@@ -0,0 +1,224 @@
|
||||
/*
|
||||
Copyright (c) 2012 Advanced Micro Devices, Inc.
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
//Originally written by Takahiro Harada
|
||||
|
||||
|
||||
#ifndef CL_MATH_H
|
||||
#define CL_MATH_H
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <math.h>
|
||||
#include <float.h>
|
||||
#include <xmmintrin.h>
|
||||
|
||||
|
||||
#include <Adl/Adl.h>
|
||||
|
||||
#include <algorithm>
|
||||
#define pxSort std::sort
|
||||
|
||||
#define PI 3.14159265358979323846f
|
||||
#define NEXTMULTIPLEOF(num, alignment) (((num)/(alignment) + (((num)%(alignment)==0)?0:1))*(alignment))
|
||||
|
||||
|
||||
#define _MEM_CLASSALIGN16 __declspec(align(16))
|
||||
#define _MEM_ALIGNED_ALLOCATOR16 void* operator new(size_t size) { return _aligned_malloc( size, 16 ); } \
|
||||
void operator delete(void *p) { _aligned_free( p ); } \
|
||||
void* operator new[](size_t size) { return _aligned_malloc( size, 16 ); } \
|
||||
void operator delete[](void *p) { _aligned_free( p ); } \
|
||||
void* operator new(size_t size, void* p) { return p; } \
|
||||
void operator delete(void *p, void* pp) {}
|
||||
|
||||
namespace adl
|
||||
{
|
||||
|
||||
template<class T>
|
||||
T nextPowerOf2(T n)
|
||||
{
|
||||
n -= 1;
|
||||
for(int i=0; i<sizeof(T)*8; i++)
|
||||
n = n | (n>>i);
|
||||
return n+1;
|
||||
}
|
||||
|
||||
typedef unsigned int u32;
|
||||
typedef unsigned short u16;
|
||||
typedef unsigned char u8;
|
||||
|
||||
_MEM_CLASSALIGN16
|
||||
struct float4
|
||||
{
|
||||
_MEM_ALIGNED_ALLOCATOR16;
|
||||
union
|
||||
{
|
||||
struct
|
||||
{
|
||||
float x,y,z,w;
|
||||
};
|
||||
struct
|
||||
{
|
||||
float s[4];
|
||||
};
|
||||
__m128 m_quad;
|
||||
};
|
||||
};
|
||||
|
||||
_MEM_CLASSALIGN16
|
||||
struct int4
|
||||
{
|
||||
_MEM_ALIGNED_ALLOCATOR16;
|
||||
union
|
||||
{
|
||||
struct
|
||||
{
|
||||
int x,y,z,w;
|
||||
};
|
||||
struct
|
||||
{
|
||||
int s[4];
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
_MEM_CLASSALIGN16
|
||||
struct uint4
|
||||
{
|
||||
_MEM_ALIGNED_ALLOCATOR16;
|
||||
union
|
||||
{
|
||||
struct
|
||||
{
|
||||
u32 x,y,z,w;
|
||||
};
|
||||
struct
|
||||
{
|
||||
u32 s[4];
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
struct int2
|
||||
{
|
||||
union
|
||||
{
|
||||
struct
|
||||
{
|
||||
int x,y;
|
||||
};
|
||||
struct
|
||||
{
|
||||
int s[2];
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
struct float2
|
||||
{
|
||||
union
|
||||
{
|
||||
struct
|
||||
{
|
||||
float x,y;
|
||||
};
|
||||
struct
|
||||
{
|
||||
float s[2];
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
__inline
|
||||
T max2(const T& a, const T& b)
|
||||
{
|
||||
return (a>b)? a:b;
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
__inline
|
||||
T min2(const T& a, const T& b)
|
||||
{
|
||||
return (a<b)? a:b;
|
||||
}
|
||||
|
||||
|
||||
#include <AdlPrimitives/Math/Float4.inl>
|
||||
#include <AdlPrimitives/Math/Float2.inl>
|
||||
|
||||
|
||||
template<typename T>
|
||||
void swap2(T& a, T& b)
|
||||
{
|
||||
T tmp = a;
|
||||
a = b;
|
||||
b = tmp;
|
||||
}
|
||||
|
||||
|
||||
__inline
|
||||
void seedRandom(int seed)
|
||||
{
|
||||
srand( seed );
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
__inline
|
||||
T getRandom(const T& minV, const T& maxV)
|
||||
{
|
||||
float r = (rand()%10000)/10000.f;
|
||||
T range = maxV - minV;
|
||||
return (T)(minV + r*range);
|
||||
}
|
||||
|
||||
template<>
|
||||
__inline
|
||||
float4 getRandom(const float4& minV, const float4& maxV)
|
||||
{
|
||||
float4 r = make_float4( (rand()%10000)/10000.f, (rand()%10000)/10000.f, (rand()%10000)/10000.f, (rand()%10000)/10000.f );
|
||||
float4 range = maxV - minV;
|
||||
return (minV + r*range);
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<typename T>
|
||||
T* addByteOffset(void* baseAddr, u32 offset)
|
||||
{
|
||||
return (T*)(((u32)baseAddr)+offset);
|
||||
}
|
||||
|
||||
|
||||
struct Pair32
|
||||
{
|
||||
Pair32(){}
|
||||
Pair32(u32 a, u32 b) : m_a(a), m_b(b){}
|
||||
|
||||
u32 m_a;
|
||||
u32 m_b;
|
||||
};
|
||||
|
||||
struct PtrPair
|
||||
{
|
||||
PtrPair(){}
|
||||
PtrPair(void* a, void* b) : m_a(a), m_b(b){}
|
||||
template<typename T>
|
||||
PtrPair(T* a, T* b) : m_a((void*)a), m_b((void*)b){}
|
||||
|
||||
void* m_a;
|
||||
void* m_b;
|
||||
};
|
||||
|
||||
};
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,357 @@
|
||||
|
||||
/*
|
||||
Copyright (c) 2012 Advanced Micro Devices, Inc.
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
//Originally written by Takahiro Harada
|
||||
|
||||
|
||||
#pragma OPENCL EXTENSION cl_amd_printf : enable
|
||||
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
|
||||
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
|
||||
#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
|
||||
#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable
|
||||
#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable
|
||||
|
||||
typedef unsigned int u32;
|
||||
typedef unsigned short u16;
|
||||
typedef unsigned char u8;
|
||||
|
||||
#define GET_GROUP_IDX get_group_id(0)
|
||||
#define GET_LOCAL_IDX get_local_id(0)
|
||||
#define GET_GLOBAL_IDX get_global_id(0)
|
||||
#define GET_GROUP_SIZE get_local_size(0)
|
||||
#define GET_NUM_GROUPS get_num_groups(0)
|
||||
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
|
||||
#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)
|
||||
#define AtomInc(x) atom_inc(&(x))
|
||||
#define AtomInc1(x, out) out = atom_inc(&(x))
|
||||
#define AppendInc(x, out) out = atomic_inc(x)
|
||||
#define AtomAdd(x, value) atom_add(&(x), value)
|
||||
#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )
|
||||
#define AtomXhg(x, value) atom_xchg ( &(x), value )
|
||||
|
||||
|
||||
#define SELECT_UINT4( b, a, condition ) select( b,a,condition )
|
||||
|
||||
#define make_float4 (float4)
|
||||
#define make_float2 (float2)
|
||||
#define make_uint4 (uint4)
|
||||
#define make_int4 (int4)
|
||||
#define make_uint2 (uint2)
|
||||
#define make_int2 (int2)
|
||||
|
||||
|
||||
#define max2 max
|
||||
#define min2 min
|
||||
|
||||
|
||||
///////////////////////////////////////
|
||||
// Vector
|
||||
///////////////////////////////////////
|
||||
__inline
|
||||
float fastDiv(float numerator, float denominator)
|
||||
{
|
||||
return native_divide(numerator, denominator);
|
||||
// return numerator/denominator;
|
||||
}
|
||||
|
||||
__inline
|
||||
float4 fastDiv4(float4 numerator, float4 denominator)
|
||||
{
|
||||
return native_divide(numerator, denominator);
|
||||
}
|
||||
|
||||
__inline
|
||||
float fastSqrtf(float f2)
|
||||
{
|
||||
return native_sqrt(f2);
|
||||
// return sqrt(f2);
|
||||
}
|
||||
|
||||
__inline
|
||||
float fastRSqrt(float f2)
|
||||
{
|
||||
return native_rsqrt(f2);
|
||||
}
|
||||
|
||||
__inline
|
||||
float fastLength4(float4 v)
|
||||
{
|
||||
return fast_length(v);
|
||||
}
|
||||
|
||||
__inline
|
||||
float4 fastNormalize4(float4 v)
|
||||
{
|
||||
return fast_normalize(v);
|
||||
}
|
||||
|
||||
|
||||
__inline
|
||||
float sqrtf(float a)
|
||||
{
|
||||
// return sqrt(a);
|
||||
return native_sqrt(a);
|
||||
}
|
||||
|
||||
__inline
|
||||
float4 cross3(float4 a, float4 b)
|
||||
{
|
||||
return cross(a,b);
|
||||
}
|
||||
|
||||
__inline
|
||||
float dot3F4(float4 a, float4 b)
|
||||
{
|
||||
float4 a1 = make_float4(a.xyz,0.f);
|
||||
float4 b1 = make_float4(b.xyz,0.f);
|
||||
return dot(a1, b1);
|
||||
}
|
||||
|
||||
__inline
|
||||
float length3(const float4 a)
|
||||
{
|
||||
return sqrtf(dot3F4(a,a));
|
||||
}
|
||||
|
||||
__inline
|
||||
float dot4(const float4 a, const float4 b)
|
||||
{
|
||||
return dot( a, b );
|
||||
}
|
||||
|
||||
// for height
|
||||
__inline
|
||||
float dot3w1(const float4 point, const float4 eqn)
|
||||
{
|
||||
return dot3F4(point,eqn) + eqn.w;
|
||||
}
|
||||
|
||||
__inline
|
||||
float4 normalize3(const float4 a)
|
||||
{
|
||||
float4 n = make_float4(a.x, a.y, a.z, 0.f);
|
||||
return fastNormalize4( n );
|
||||
// float length = sqrtf(dot3F4(a, a));
|
||||
// return 1.f/length * a;
|
||||
}
|
||||
|
||||
__inline
|
||||
float4 normalize4(const float4 a)
|
||||
{
|
||||
float length = sqrtf(dot4(a, a));
|
||||
return 1.f/length * a;
|
||||
}
|
||||
|
||||
__inline
|
||||
float4 createEquation(const float4 a, const float4 b, const float4 c)
|
||||
{
|
||||
float4 eqn;
|
||||
float4 ab = b-a;
|
||||
float4 ac = c-a;
|
||||
eqn = normalize3( cross3(ab, ac) );
|
||||
eqn.w = -dot3F4(eqn,a);
|
||||
return eqn;
|
||||
}
|
||||
|
||||
///////////////////////////////////////
|
||||
// Matrix3x3
|
||||
///////////////////////////////////////
|
||||
|
||||
typedef struct
|
||||
{
|
||||
float4 m_row[3];
|
||||
}Matrix3x3;
|
||||
|
||||
__inline
|
||||
Matrix3x3 mtZero();
|
||||
|
||||
__inline
|
||||
Matrix3x3 mtIdentity();
|
||||
|
||||
__inline
|
||||
Matrix3x3 mtTranspose(Matrix3x3 m);
|
||||
|
||||
__inline
|
||||
Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b);
|
||||
|
||||
__inline
|
||||
float4 mtMul1(Matrix3x3 a, float4 b);
|
||||
|
||||
__inline
|
||||
float4 mtMul3(float4 a, Matrix3x3 b);
|
||||
|
||||
__inline
|
||||
Matrix3x3 mtZero()
|
||||
{
|
||||
Matrix3x3 m;
|
||||
m.m_row[0] = (float4)(0.f);
|
||||
m.m_row[1] = (float4)(0.f);
|
||||
m.m_row[2] = (float4)(0.f);
|
||||
return m;
|
||||
}
|
||||
|
||||
__inline
|
||||
Matrix3x3 mtIdentity()
|
||||
{
|
||||
Matrix3x3 m;
|
||||
m.m_row[0] = (float4)(1,0,0,0);
|
||||
m.m_row[1] = (float4)(0,1,0,0);
|
||||
m.m_row[2] = (float4)(0,0,1,0);
|
||||
return m;
|
||||
}
|
||||
|
||||
__inline
|
||||
Matrix3x3 mtTranspose(Matrix3x3 m)
|
||||
{
|
||||
Matrix3x3 out;
|
||||
out.m_row[0] = (float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);
|
||||
out.m_row[1] = (float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);
|
||||
out.m_row[2] = (float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);
|
||||
return out;
|
||||
}
|
||||
|
||||
__inline
|
||||
Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b)
|
||||
{
|
||||
Matrix3x3 transB;
|
||||
transB = mtTranspose( b );
|
||||
Matrix3x3 ans;
|
||||
// why this doesn't run when 0ing in the for{}
|
||||
a.m_row[0].w = 0.f;
|
||||
a.m_row[1].w = 0.f;
|
||||
a.m_row[2].w = 0.f;
|
||||
for(int i=0; i<3; i++)
|
||||
{
|
||||
// a.m_row[i].w = 0.f;
|
||||
ans.m_row[i].x = dot3F4(a.m_row[i],transB.m_row[0]);
|
||||
ans.m_row[i].y = dot3F4(a.m_row[i],transB.m_row[1]);
|
||||
ans.m_row[i].z = dot3F4(a.m_row[i],transB.m_row[2]);
|
||||
ans.m_row[i].w = 0.f;
|
||||
}
|
||||
return ans;
|
||||
}
|
||||
|
||||
__inline
|
||||
float4 mtMul1(Matrix3x3 a, float4 b)
|
||||
{
|
||||
float4 ans;
|
||||
ans.x = dot3F4( a.m_row[0], b );
|
||||
ans.y = dot3F4( a.m_row[1], b );
|
||||
ans.z = dot3F4( a.m_row[2], b );
|
||||
ans.w = 0.f;
|
||||
return ans;
|
||||
}
|
||||
|
||||
__inline
|
||||
float4 mtMul3(float4 a, Matrix3x3 b)
|
||||
{
|
||||
float4 colx = make_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);
|
||||
float4 coly = make_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);
|
||||
float4 colz = make_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);
|
||||
|
||||
float4 ans;
|
||||
ans.x = dot3F4( a, colx );
|
||||
ans.y = dot3F4( a, coly );
|
||||
ans.z = dot3F4( a, colz );
|
||||
return ans;
|
||||
}
|
||||
|
||||
///////////////////////////////////////
|
||||
// Quaternion
|
||||
///////////////////////////////////////
|
||||
|
||||
typedef float4 Quaternion;
|
||||
|
||||
__inline
|
||||
Quaternion qtMul(Quaternion a, Quaternion b);
|
||||
|
||||
__inline
|
||||
Quaternion qtNormalize(Quaternion in);
|
||||
|
||||
__inline
|
||||
float4 qtRotate(Quaternion q, float4 vec);
|
||||
|
||||
__inline
|
||||
Quaternion qtInvert(Quaternion q);
|
||||
|
||||
__inline
|
||||
Matrix3x3 qtGetRotationMatrix(Quaternion q);
|
||||
|
||||
|
||||
|
||||
__inline
|
||||
Quaternion qtMul(Quaternion a, Quaternion b)
|
||||
{
|
||||
Quaternion ans;
|
||||
ans = cross3( a, b );
|
||||
ans += a.w*b+b.w*a;
|
||||
// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);
|
||||
ans.w = a.w*b.w - dot3F4(a, b);
|
||||
return ans;
|
||||
}
|
||||
|
||||
__inline
|
||||
Quaternion qtNormalize(Quaternion in)
|
||||
{
|
||||
return fastNormalize4(in);
|
||||
// in /= length( in );
|
||||
// return in;
|
||||
}
|
||||
__inline
|
||||
float4 qtRotate(Quaternion q, float4 vec)
|
||||
{
|
||||
Quaternion qInv = qtInvert( q );
|
||||
float4 vcpy = vec;
|
||||
vcpy.w = 0.f;
|
||||
float4 out = qtMul(qtMul(q,vcpy),qInv);
|
||||
return out;
|
||||
}
|
||||
|
||||
__inline
|
||||
Quaternion qtInvert(Quaternion q)
|
||||
{
|
||||
return (Quaternion)(-q.xyz, q.w);
|
||||
}
|
||||
|
||||
__inline
|
||||
float4 qtInvRotate(const Quaternion q, float4 vec)
|
||||
{
|
||||
return qtRotate( qtInvert( q ), vec );
|
||||
}
|
||||
|
||||
__inline
|
||||
Matrix3x3 qtGetRotationMatrix(Quaternion quat)
|
||||
{
|
||||
float4 quat2 = (float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);
|
||||
Matrix3x3 out;
|
||||
|
||||
out.m_row[0].x=1-2*quat2.y-2*quat2.z;
|
||||
out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;
|
||||
out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;
|
||||
out.m_row[0].w = 0.f;
|
||||
|
||||
out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;
|
||||
out.m_row[1].y=1-2*quat2.x-2*quat2.z;
|
||||
out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;
|
||||
out.m_row[1].w = 0.f;
|
||||
|
||||
out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;
|
||||
out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;
|
||||
out.m_row[2].z=1-2*quat2.x-2*quat2.y;
|
||||
out.m_row[2].w = 0.f;
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
|
||||
@@ -0,0 +1,197 @@
|
||||
/*
|
||||
Copyright (c) 2012 Advanced Micro Devices, Inc.
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
//Originally written by Takahiro Harada
|
||||
|
||||
#ifndef MATRIX3X3_H
|
||||
#define MATRIX3X3_H
|
||||
|
||||
#include <AdlPrimitives/Math/Math.h>
|
||||
|
||||
///////////////////////////////////////
|
||||
// Matrix3x3
|
||||
///////////////////////////////////////
|
||||
namespace adl
|
||||
{
|
||||
|
||||
typedef
|
||||
_MEM_CLASSALIGN16 struct
|
||||
{
|
||||
_MEM_ALIGNED_ALLOCATOR16;
|
||||
float4 m_row[3];
|
||||
}Matrix3x3;
|
||||
|
||||
__inline
|
||||
Matrix3x3 mtZero();
|
||||
|
||||
__inline
|
||||
Matrix3x3 mtIdentity();
|
||||
|
||||
__inline
|
||||
Matrix3x3 mtDiagonal(float a, float b, float c);
|
||||
|
||||
__inline
|
||||
Matrix3x3 mtTranspose(const Matrix3x3& m);
|
||||
|
||||
__inline
|
||||
Matrix3x3 mtMul(const Matrix3x3& a, const Matrix3x3& b);
|
||||
|
||||
__inline
|
||||
float4 mtMul1(const Matrix3x3& a, const float4& b);
|
||||
|
||||
__inline
|
||||
Matrix3x3 mtMul2(float a, const Matrix3x3& b);
|
||||
|
||||
__inline
|
||||
float4 mtMul3(const float4& b, const Matrix3x3& a);
|
||||
|
||||
__inline
|
||||
Matrix3x3 mtInvert(const Matrix3x3& m);
|
||||
|
||||
__inline
|
||||
Matrix3x3 mtZero()
|
||||
{
|
||||
Matrix3x3 m;
|
||||
m.m_row[0] = make_float4(0.f);
|
||||
m.m_row[1] = make_float4(0.f);
|
||||
m.m_row[2] = make_float4(0.f);
|
||||
return m;
|
||||
}
|
||||
|
||||
__inline
|
||||
Matrix3x3 mtIdentity()
|
||||
{
|
||||
Matrix3x3 m;
|
||||
m.m_row[0] = make_float4(1,0,0);
|
||||
m.m_row[1] = make_float4(0,1,0);
|
||||
m.m_row[2] = make_float4(0,0,1);
|
||||
return m;
|
||||
}
|
||||
|
||||
__inline
|
||||
Matrix3x3 mtDiagonal(float a, float b, float c)
|
||||
{
|
||||
Matrix3x3 m;
|
||||
m.m_row[0] = make_float4(a,0,0);
|
||||
m.m_row[1] = make_float4(0,b,0);
|
||||
m.m_row[2] = make_float4(0,0,c);
|
||||
return m;
|
||||
}
|
||||
|
||||
__inline
|
||||
Matrix3x3 mtTranspose(const Matrix3x3& m)
|
||||
{
|
||||
Matrix3x3 out;
|
||||
out.m_row[0] = make_float4(m.m_row[0].s[0], m.m_row[1].s[0], m.m_row[2].s[0], 0.f);
|
||||
out.m_row[1] = make_float4(m.m_row[0].s[1], m.m_row[1].s[1], m.m_row[2].s[1], 0.f);
|
||||
out.m_row[2] = make_float4(m.m_row[0].s[2], m.m_row[1].s[2], m.m_row[2].s[2], 0.f);
|
||||
return out;
|
||||
}
|
||||
|
||||
__inline
|
||||
Matrix3x3 mtMul(const Matrix3x3& a, const Matrix3x3& b)
|
||||
{
|
||||
Matrix3x3 transB;
|
||||
transB = mtTranspose( b );
|
||||
Matrix3x3 ans;
|
||||
for(int i=0; i<3; i++)
|
||||
{
|
||||
ans.m_row[i].s[0] = dot3F4(a.m_row[i],transB.m_row[0]);
|
||||
ans.m_row[i].s[1] = dot3F4(a.m_row[i],transB.m_row[1]);
|
||||
ans.m_row[i].s[2] = dot3F4(a.m_row[i],transB.m_row[2]);
|
||||
}
|
||||
return ans;
|
||||
}
|
||||
|
||||
__inline
|
||||
float4 mtMul1(const Matrix3x3& a, const float4& b)
|
||||
{
|
||||
float4 ans;
|
||||
ans.s[0] = dot3F4( a.m_row[0], b );
|
||||
ans.s[1] = dot3F4( a.m_row[1], b );
|
||||
ans.s[2] = dot3F4( a.m_row[2], b );
|
||||
return ans;
|
||||
}
|
||||
|
||||
__inline
|
||||
Matrix3x3 mtMul2(float a, const Matrix3x3& b)
|
||||
{
|
||||
Matrix3x3 ans;
|
||||
ans.m_row[0] = a*b.m_row[0];
|
||||
ans.m_row[1] = a*b.m_row[1];
|
||||
ans.m_row[2] = a*b.m_row[2];
|
||||
return ans;
|
||||
}
|
||||
|
||||
__inline
|
||||
float4 mtMul3(const float4& a, const Matrix3x3& b)
|
||||
{
|
||||
float4 ans;
|
||||
ans.x = a.x*b.m_row[0].x + a.y*b.m_row[1].x + a.z*b.m_row[2].x;
|
||||
ans.y = a.x*b.m_row[0].y + a.y*b.m_row[1].y + a.z*b.m_row[2].y;
|
||||
ans.z = a.x*b.m_row[0].z + a.y*b.m_row[1].z + a.z*b.m_row[2].z;
|
||||
return ans;
|
||||
}
|
||||
|
||||
__inline
|
||||
Matrix3x3 mtInvert(const Matrix3x3& m)
|
||||
{
|
||||
float det = m.m_row[0].s[0]*m.m_row[1].s[1]*m.m_row[2].s[2]+m.m_row[1].s[0]*m.m_row[2].s[1]*m.m_row[0].s[2]+m.m_row[2].s[0]*m.m_row[0].s[1]*m.m_row[1].s[2]
|
||||
-m.m_row[0].s[0]*m.m_row[2].s[1]*m.m_row[1].s[2]-m.m_row[2].s[0]*m.m_row[1].s[1]*m.m_row[0].s[2]-m.m_row[1].s[0]*m.m_row[0].s[1]*m.m_row[2].s[2];
|
||||
|
||||
ADLASSERT( det );
|
||||
|
||||
Matrix3x3 ans;
|
||||
ans.m_row[0].s[0] = m.m_row[1].s[1]*m.m_row[2].s[2] - m.m_row[1].s[2]*m.m_row[2].s[1];
|
||||
ans.m_row[0].s[1] = m.m_row[0].s[2]*m.m_row[2].s[1] - m.m_row[0].s[1]*m.m_row[2].s[2];
|
||||
ans.m_row[0].s[2] = m.m_row[0].s[1]*m.m_row[1].s[2] - m.m_row[0].s[2]*m.m_row[1].s[1];
|
||||
ans.m_row[0].w = 0.f;
|
||||
|
||||
ans.m_row[1].s[0] = m.m_row[1].s[2]*m.m_row[2].s[0] - m.m_row[1].s[0]*m.m_row[2].s[2];
|
||||
ans.m_row[1].s[1] = m.m_row[0].s[0]*m.m_row[2].s[2] - m.m_row[0].s[2]*m.m_row[2].s[0];
|
||||
ans.m_row[1].s[2] = m.m_row[0].s[2]*m.m_row[1].s[0] - m.m_row[0].s[0]*m.m_row[1].s[2];
|
||||
ans.m_row[1].w = 0.f;
|
||||
|
||||
ans.m_row[2].s[0] = m.m_row[1].s[0]*m.m_row[2].s[1] - m.m_row[1].s[1]*m.m_row[2].s[0];
|
||||
ans.m_row[2].s[1] = m.m_row[0].s[1]*m.m_row[2].s[0] - m.m_row[0].s[0]*m.m_row[2].s[1];
|
||||
ans.m_row[2].s[2] = m.m_row[0].s[0]*m.m_row[1].s[1] - m.m_row[0].s[1]*m.m_row[1].s[0];
|
||||
ans.m_row[2].w = 0.f;
|
||||
|
||||
ans = mtMul2((1.0f/det), ans);
|
||||
return ans;
|
||||
}
|
||||
|
||||
__inline
|
||||
Matrix3x3 mtSet( const float4& a, const float4& b, const float4& c )
|
||||
{
|
||||
Matrix3x3 m;
|
||||
m.m_row[0] = a;
|
||||
m.m_row[1] = b;
|
||||
m.m_row[2] = c;
|
||||
return m;
|
||||
}
|
||||
|
||||
__inline
|
||||
Matrix3x3 operator+(const Matrix3x3& a, const Matrix3x3& b)
|
||||
{
|
||||
Matrix3x3 out;
|
||||
out.m_row[0] = a.m_row[0] + b.m_row[0];
|
||||
out.m_row[1] = a.m_row[1] + b.m_row[1];
|
||||
out.m_row[2] = a.m_row[2] + b.m_row[2];
|
||||
return out;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
@@ -0,0 +1,159 @@
|
||||
/*
|
||||
Copyright (c) 2012 Advanced Micro Devices, Inc.
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
//Originally written by Takahiro Harada
|
||||
|
||||
|
||||
#ifndef QUATERNION_H
|
||||
#define QUATERNION_H
|
||||
|
||||
#include <AdlPrimitives/Math/Matrix3x3.h>
|
||||
|
||||
namespace adl
|
||||
{
|
||||
|
||||
typedef float4 Quaternion;
|
||||
|
||||
__inline
|
||||
Quaternion qtSet(const float4& axis, float angle);
|
||||
|
||||
__inline
|
||||
Quaternion qtMul(const Quaternion& a, const Quaternion& b);
|
||||
|
||||
__inline
|
||||
float4 qtRotate(const Quaternion& q, const float4& vec);
|
||||
|
||||
__inline
|
||||
float4 qtInvRotate(const Quaternion& q, const float4& vec);
|
||||
|
||||
__inline
|
||||
Quaternion qtInvert(const Quaternion& q);
|
||||
|
||||
__inline
|
||||
Matrix3x3 qtGetRotationMatrix(const Quaternion& quat);
|
||||
|
||||
__inline
|
||||
Quaternion qtNormalize(const Quaternion& q);
|
||||
|
||||
__inline
|
||||
Quaternion qtGetIdentity() { return make_float4(0,0,0,1); }
|
||||
|
||||
__inline
|
||||
Quaternion qtSet(const float4& axis, float angle)
|
||||
{
|
||||
float4 nAxis = normalize3( axis );
|
||||
|
||||
Quaternion q;
|
||||
q.s[0] = nAxis.s[0]*sin(angle/2);
|
||||
q.s[1] = nAxis.s[1]*sin(angle/2);
|
||||
q.s[2] = nAxis.s[2]*sin(angle/2);
|
||||
q.s[3] = cos(angle/2);
|
||||
return q;
|
||||
}
|
||||
|
||||
__inline
|
||||
Quaternion qtMul(const Quaternion& a, const Quaternion& b)
|
||||
{
|
||||
Quaternion ans;
|
||||
ans = cross3( a, b );
|
||||
ans += a.s[3]*b + b.s[3]*a;
|
||||
ans.s[3] = a.s[3]*b.s[3] - (a.s[0]*b.s[0]+a.s[1]*b.s[1]+a.s[2]*b.s[2]);
|
||||
return ans;
|
||||
}
|
||||
|
||||
__inline
|
||||
float4 qtRotate(const Quaternion& q, const float4& vec)
|
||||
{
|
||||
Quaternion vecQ = vec;
|
||||
vecQ.s[3] = 0.f;
|
||||
Quaternion qInv = qtInvert( q );
|
||||
float4 out = qtMul(qtMul(q,vecQ),qInv);
|
||||
return out;
|
||||
}
|
||||
|
||||
__inline
|
||||
float4 qtInvRotate(const Quaternion& q, const float4& vec)
|
||||
{
|
||||
return qtRotate( qtInvert( q ), vec );
|
||||
}
|
||||
|
||||
__inline
|
||||
Quaternion qtInvert(const Quaternion& q)
|
||||
{
|
||||
Quaternion ans;
|
||||
ans.s[0] = -q.s[0];
|
||||
ans.s[1] = -q.s[1];
|
||||
ans.s[2] = -q.s[2];
|
||||
ans.s[3] = q.s[3];
|
||||
return ans;
|
||||
}
|
||||
|
||||
__inline
|
||||
Matrix3x3 qtGetRotationMatrix(const Quaternion& quat)
|
||||
{
|
||||
float4 quat2 = make_float4(quat.s[0]*quat.s[0], quat.s[1]*quat.s[1], quat.s[2]*quat.s[2], 0.f);
|
||||
Matrix3x3 out;
|
||||
|
||||
out.m_row[0].s[0]=1-2*quat2.s[1]-2*quat2.s[2];
|
||||
out.m_row[0].s[1]=2*quat.s[0]*quat.s[1]-2*quat.s[3]*quat.s[2];
|
||||
out.m_row[0].s[2]=2*quat.s[0]*quat.s[2]+2*quat.s[3]*quat.s[1];
|
||||
out.m_row[0].s[3] = 0.f;
|
||||
|
||||
out.m_row[1].s[0]=2*quat.s[0]*quat.s[1]+2*quat.s[3]*quat.s[2];
|
||||
out.m_row[1].s[1]=1-2*quat2.s[0]-2*quat2.s[2];
|
||||
out.m_row[1].s[2]=2*quat.s[1]*quat.s[2]-2*quat.s[3]*quat.s[0];
|
||||
out.m_row[1].s[3] = 0.f;
|
||||
|
||||
out.m_row[2].s[0]=2*quat.s[0]*quat.s[2]-2*quat.s[3]*quat.s[1];
|
||||
out.m_row[2].s[1]=2*quat.s[1]*quat.s[2]+2*quat.s[3]*quat.s[0];
|
||||
out.m_row[2].s[2]=1-2*quat2.s[0]-2*quat2.s[1];
|
||||
out.m_row[2].s[3] = 0.f;
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
__inline
|
||||
Quaternion qtGetQuaternion(const Matrix3x3* m)
|
||||
{
|
||||
Quaternion q;
|
||||
q.w = sqrtf( m[0].m_row[0].x + m[0].m_row[1].y + m[0].m_row[2].z + 1 ) * 0.5f;
|
||||
float inv4w = 1.f/(4.f*q.w);
|
||||
q.x = (m[0].m_row[2].y-m[0].m_row[1].z)*inv4w;
|
||||
q.y = (m[0].m_row[0].z-m[0].m_row[2].x)*inv4w;
|
||||
q.z = (m[0].m_row[1].x-m[0].m_row[0].y)*inv4w;
|
||||
|
||||
return q;
|
||||
}
|
||||
|
||||
__inline
|
||||
Quaternion qtNormalize(const Quaternion& q)
|
||||
{
|
||||
return normalize4(q);
|
||||
}
|
||||
|
||||
__inline
|
||||
float4 transform(const float4& p, const float4& translation, const Quaternion& orientation)
|
||||
{
|
||||
return qtRotate( orientation, p ) + translation;
|
||||
}
|
||||
|
||||
__inline
|
||||
float4 invTransform(const float4& p, const float4& translation, const Quaternion& orientation)
|
||||
{
|
||||
return qtRotate( qtInvert( orientation ), p-translation ); // use qtInvRotate
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
@@ -0,0 +1,73 @@
|
||||
/*
|
||||
Copyright (c) 2012 Advanced Micro Devices, Inc.
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
//Originally written by Takahiro Harada
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <Adl/Adl.h>
|
||||
#include <AdlPrimitives/Math/Math.h>
|
||||
|
||||
namespace adl
|
||||
{
|
||||
|
||||
class PrefixScanBase
|
||||
{
|
||||
public:
|
||||
enum Option
|
||||
{
|
||||
INCLUSIVE,
|
||||
EXCLUSIVE
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
template<DeviceType TYPE>
|
||||
class PrefixScan : public PrefixScanBase
|
||||
{
|
||||
public:
|
||||
typedef Launcher::BufferInfo BufferInfo;
|
||||
|
||||
enum
|
||||
{
|
||||
BLOCK_SIZE = 128
|
||||
};
|
||||
|
||||
struct Data
|
||||
{
|
||||
Option m_option;
|
||||
const Device* m_device;
|
||||
Kernel* m_localScanKernel;
|
||||
Kernel* m_blockSumKernel;
|
||||
Kernel* m_propagationKernel;
|
||||
Buffer<u32>* m_workBuffer;
|
||||
Buffer<int4>* m_constBuffer[3];// todo. dx need one for each
|
||||
int m_maxSize;
|
||||
};
|
||||
|
||||
static
|
||||
Data* allocate(const Device* deviceData, int maxSize, Option option = EXCLUSIVE);
|
||||
|
||||
static
|
||||
void deallocate(Data* data);
|
||||
|
||||
static
|
||||
void execute(Data* data, Buffer<u32>& src, Buffer<u32>& dst, int n, u32* sum = 0);
|
||||
};
|
||||
|
||||
|
||||
|
||||
#include <AdlPrimitives/Scan/PrefixScanHost.inl>
|
||||
#include <AdlPrimitives/Scan/PrefixScan.inl>
|
||||
|
||||
};
|
||||
@@ -0,0 +1,125 @@
|
||||
/*
|
||||
Copyright (c) 2012 Advanced Micro Devices, Inc.
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
//Originally written by Takahiro Harada
|
||||
|
||||
|
||||
#define PATH "..\\..\\opencl\\primitives\\AdlPrimitives\\Scan\\PrefixScanKernels"
|
||||
#define KERNEL0 "LocalScanKernel"
|
||||
#define KERNEL1 "TopLevelScanKernel"
|
||||
#define KERNEL2 "AddOffsetKernel"
|
||||
|
||||
#include <AdlPrimitives/Scan/PrefixScanKernelsCL.h>
|
||||
#include <AdlPrimitives/Scan/PrefixScanKernelsDX11.h>
|
||||
|
||||
template<DeviceType TYPE>
|
||||
typename PrefixScan<TYPE>::Data* PrefixScan<TYPE>::allocate(const Device* device, int maxSize, Option option)
|
||||
{
|
||||
ADLASSERT( TYPE == device->m_type );
|
||||
|
||||
ADLASSERT( maxSize <= BLOCK_SIZE*2*2048 );
|
||||
|
||||
const char* src[] =
|
||||
#if defined(ADL_LOAD_KERNEL_FROM_STRING)
|
||||
{prefixScanKernelsCL, prefixScanKernelsDX11};
|
||||
#else
|
||||
{0,0};
|
||||
#endif
|
||||
Data* data = new Data;
|
||||
data->m_device = device;
|
||||
data->m_localScanKernel = device->getKernel( PATH, KERNEL0, 0, src[TYPE] );
|
||||
data->m_blockSumKernel = device->getKernel( PATH, KERNEL1, 0, src[TYPE] );
|
||||
data->m_propagationKernel = device->getKernel( PATH, KERNEL2, 0, src[TYPE] );
|
||||
|
||||
int bufSize = (NEXTMULTIPLEOF( max2( maxSize/BLOCK_SIZE, (int)BLOCK_SIZE ), BLOCK_SIZE )+1);
|
||||
data->m_workBuffer = new Buffer<u32>( device, bufSize );
|
||||
data->m_constBuffer[0] = new Buffer<int4>( device, 1, BufferBase::BUFFER_CONST );
|
||||
data->m_constBuffer[1] = new Buffer<int4>( device, 1, BufferBase::BUFFER_CONST );
|
||||
data->m_constBuffer[2] = new Buffer<int4>( device, 1, BufferBase::BUFFER_CONST );
|
||||
|
||||
data->m_maxSize = maxSize;
|
||||
data->m_option = option;
|
||||
|
||||
return data;
|
||||
}
|
||||
|
||||
template<DeviceType TYPE>
|
||||
void PrefixScan<TYPE>::deallocate(Data* data)
|
||||
{
|
||||
delete data->m_workBuffer;
|
||||
delete data->m_constBuffer[0];
|
||||
delete data->m_constBuffer[1];
|
||||
delete data->m_constBuffer[2];
|
||||
delete data;
|
||||
}
|
||||
|
||||
template<DeviceType TYPE>
|
||||
void PrefixScan<TYPE>::execute(Data* data, Buffer<u32>& src, Buffer<u32>& dst, int n, u32* sum)
|
||||
{
|
||||
ADLASSERT( data );
|
||||
ADLASSERT( n <= data->m_maxSize );
|
||||
ADLASSERT( data->m_option == EXCLUSIVE );
|
||||
const u32 numBlocks = u32( (n+BLOCK_SIZE*2-1)/(BLOCK_SIZE*2) );
|
||||
|
||||
|
||||
int4 constBuffer;
|
||||
constBuffer.x = n;
|
||||
constBuffer.y = numBlocks;
|
||||
constBuffer.z = (int)nextPowerOf2( numBlocks );
|
||||
|
||||
Buffer<u32>* srcNative = BufferUtils::map<TYPE, true>( data->m_device, &src );
|
||||
Buffer<u32>* dstNative = BufferUtils::map<TYPE, false>( data->m_device, &dst );
|
||||
|
||||
{
|
||||
BufferInfo bInfo[] = { BufferInfo( dstNative ), BufferInfo( srcNative ), BufferInfo( data->m_workBuffer ) };
|
||||
|
||||
Launcher launcher( data->m_device, data->m_localScanKernel );
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
|
||||
launcher.setConst( *data->m_constBuffer[0], constBuffer );
|
||||
launcher.launch1D( numBlocks*BLOCK_SIZE, BLOCK_SIZE );
|
||||
}
|
||||
|
||||
{
|
||||
BufferInfo bInfo[] = { BufferInfo( data->m_workBuffer ) };
|
||||
|
||||
Launcher launcher( data->m_device, data->m_blockSumKernel );
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
|
||||
launcher.setConst( *data->m_constBuffer[1], constBuffer );
|
||||
launcher.launch1D( BLOCK_SIZE, BLOCK_SIZE );
|
||||
}
|
||||
|
||||
|
||||
if( numBlocks > 1 )
|
||||
{
|
||||
BufferInfo bInfo[] = { BufferInfo( dstNative ), BufferInfo( data->m_workBuffer ) };
|
||||
Launcher launcher( data->m_device, data->m_propagationKernel );
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
|
||||
launcher.setConst( *data->m_constBuffer[2], constBuffer );
|
||||
launcher.launch1D( (numBlocks-1)*BLOCK_SIZE, BLOCK_SIZE );
|
||||
}
|
||||
|
||||
DeviceUtils::waitForCompletion( data->m_device );
|
||||
if( sum )
|
||||
{
|
||||
dstNative->read( sum, 1, n-1);
|
||||
}
|
||||
DeviceUtils::waitForCompletion( data->m_device );
|
||||
|
||||
BufferUtils::unmap<false>( srcNative, &src );
|
||||
BufferUtils::unmap<true>( dstNative, &dst );
|
||||
}
|
||||
|
||||
#undef PATH
|
||||
#undef KERNEL0
|
||||
#undef KERNEL1
|
||||
#undef KERNEL2
|
||||
@@ -0,0 +1,74 @@
|
||||
/*
|
||||
Copyright (c) 2012 Advanced Micro Devices, Inc.
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
//Originally written by Takahiro Harada
|
||||
|
||||
|
||||
template<>
|
||||
class PrefixScan<TYPE_HOST> : public PrefixScanBase
|
||||
{
|
||||
public:
|
||||
struct Data
|
||||
{
|
||||
Option m_option;
|
||||
};
|
||||
|
||||
static
|
||||
Data* allocate(const Device* deviceData, int maxSize, Option option = EXCLUSIVE)
|
||||
{
|
||||
ADLASSERT( deviceData->m_type == TYPE_HOST );
|
||||
|
||||
Data* data = new Data;
|
||||
data->m_option = option;
|
||||
return data;
|
||||
}
|
||||
|
||||
static
|
||||
void deallocate(Data* data)
|
||||
{
|
||||
delete data;
|
||||
}
|
||||
|
||||
static
|
||||
void execute(Data* data, Buffer<u32>& src, Buffer<u32>& dst, int n, u32* sum = 0)
|
||||
{
|
||||
ADLASSERT( src.getType() == TYPE_HOST && dst.getType() == TYPE_HOST );
|
||||
HostBuffer<u32>& hSrc = (HostBuffer<u32>&)src;
|
||||
HostBuffer<u32>& hDst = (HostBuffer<u32>&)dst;
|
||||
|
||||
u32 s = 0;
|
||||
if( data->m_option == EXCLUSIVE )
|
||||
{
|
||||
for(int i=0; i<n; i++)
|
||||
{
|
||||
hDst[i] = s;
|
||||
s += hSrc[i];
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for(int i=0; i<n; i++)
|
||||
{
|
||||
s += hSrc[i];
|
||||
hDst[i] = s;
|
||||
}
|
||||
}
|
||||
|
||||
if( sum )
|
||||
{
|
||||
*sum = hDst[n-1];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
};
|
||||
@@ -0,0 +1,153 @@
|
||||
/*
|
||||
Copyright (c) 2012 Advanced Micro Devices, Inc.
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
//Originally written by Takahiro Harada
|
||||
|
||||
|
||||
typedef unsigned int u32;
|
||||
#define GET_GROUP_IDX get_group_id(0)
|
||||
#define GET_LOCAL_IDX get_local_id(0)
|
||||
#define GET_GLOBAL_IDX get_global_id(0)
|
||||
#define GET_GROUP_SIZE get_local_size(0)
|
||||
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
|
||||
|
||||
// takahiro end
|
||||
#define WG_SIZE 128
|
||||
|
||||
typedef struct
|
||||
{
|
||||
uint m_numElems;
|
||||
uint m_numBlocks;
|
||||
uint m_numScanBlocks;
|
||||
uint m_padding[1];
|
||||
} ConstBuffer;
|
||||
|
||||
|
||||
u32 ScanExclusive(__local u32* data, u32 n, int lIdx, int lSize)
|
||||
{
|
||||
u32 blocksum;
|
||||
int offset = 1;
|
||||
for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)
|
||||
{
|
||||
GROUP_LDS_BARRIER;
|
||||
for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)
|
||||
{
|
||||
int ai = offset*(2*iIdx+1)-1;
|
||||
int bi = offset*(2*iIdx+2)-1;
|
||||
data[bi] += data[ai];
|
||||
}
|
||||
}
|
||||
|
||||
GROUP_LDS_BARRIER;
|
||||
|
||||
if( lIdx == 0 )
|
||||
{
|
||||
blocksum = data[ n-1 ];
|
||||
data[ n-1 ] = 0;
|
||||
}
|
||||
|
||||
GROUP_LDS_BARRIER;
|
||||
|
||||
offset >>= 1;
|
||||
for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )
|
||||
{
|
||||
GROUP_LDS_BARRIER;
|
||||
for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )
|
||||
{
|
||||
int ai = offset*(2*iIdx+1)-1;
|
||||
int bi = offset*(2*iIdx+2)-1;
|
||||
u32 temp = data[ai];
|
||||
data[ai] = data[bi];
|
||||
data[bi] += temp;
|
||||
}
|
||||
}
|
||||
GROUP_LDS_BARRIER;
|
||||
|
||||
return blocksum;
|
||||
}
|
||||
|
||||
|
||||
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
|
||||
__kernel
|
||||
void LocalScanKernel(__global u32* dst, __global u32 *src, __global u32 *sumBuffer,
|
||||
ConstBuffer cb)
|
||||
{
|
||||
__local u32 ldsData[WG_SIZE*2];
|
||||
|
||||
int gIdx = GET_GLOBAL_IDX;
|
||||
int lIdx = GET_LOCAL_IDX;
|
||||
|
||||
ldsData[2*lIdx] = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0;
|
||||
ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0;
|
||||
|
||||
u32 sum = ScanExclusive(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);
|
||||
|
||||
if( lIdx == 0 ) sumBuffer[GET_GROUP_IDX] = sum;
|
||||
|
||||
if( (2*gIdx) < cb.m_numElems )
|
||||
{
|
||||
dst[2*gIdx] = ldsData[2*lIdx];
|
||||
}
|
||||
if( (2*gIdx + 1) < cb.m_numElems )
|
||||
{
|
||||
dst[2*gIdx + 1] = ldsData[2*lIdx + 1];
|
||||
}
|
||||
}
|
||||
|
||||
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
|
||||
__kernel
|
||||
void AddOffsetKernel(__global u32 *dst, __global u32 *blockSum, ConstBuffer cb)
|
||||
{
|
||||
const u32 blockSize = WG_SIZE*2;
|
||||
|
||||
int myIdx = GET_GROUP_IDX+1;
|
||||
int lIdx = GET_LOCAL_IDX;
|
||||
|
||||
u32 iBlockSum = blockSum[myIdx];
|
||||
|
||||
int endValue = min((myIdx+1)*(blockSize), cb.m_numElems);
|
||||
for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE)
|
||||
{
|
||||
dst[i] += iBlockSum;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
|
||||
__kernel
|
||||
void TopLevelScanKernel(__global u32* dst, ConstBuffer cb)
|
||||
{
|
||||
__local u32 ldsData[2048];
|
||||
int gIdx = GET_GLOBAL_IDX;
|
||||
int lIdx = GET_LOCAL_IDX;
|
||||
int lSize = GET_GROUP_SIZE;
|
||||
|
||||
for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize )
|
||||
{
|
||||
ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0;
|
||||
}
|
||||
|
||||
GROUP_LDS_BARRIER;
|
||||
|
||||
u32 sum = ScanExclusive(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);
|
||||
|
||||
for(int i=lIdx; i<cb.m_numBlocks; i+=lSize )
|
||||
{
|
||||
dst[i] = ldsData[i];
|
||||
}
|
||||
|
||||
if( gIdx == 0 )
|
||||
{
|
||||
dst[cb.m_numBlocks] = sum;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,157 @@
|
||||
/*
|
||||
Copyright (c) 2012 Advanced Micro Devices, Inc.
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
//Originally written by Takahiro Harada
|
||||
|
||||
|
||||
typedef uint u32;
|
||||
|
||||
#define GET_GROUP_IDX groupIdx.x
|
||||
#define GET_LOCAL_IDX localIdx.x
|
||||
#define GET_GLOBAL_IDX globalIdx.x
|
||||
#define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync()
|
||||
|
||||
// takahiro end
|
||||
#define WG_SIZE 128
|
||||
|
||||
#define GET_GROUP_SIZE WG_SIZE
|
||||
|
||||
|
||||
cbuffer SortCB : register( b0 )
|
||||
{
|
||||
int m_numElems;
|
||||
int m_numBlocks;
|
||||
int m_numScanBlocks;
|
||||
};
|
||||
|
||||
RWStructuredBuffer<uint> dst : register( u0 );
|
||||
RWStructuredBuffer<uint> src : register( u1 );
|
||||
RWStructuredBuffer<uint> sumBuffer : register( u2 );
|
||||
|
||||
|
||||
groupshared u32 ldsData[2048];
|
||||
|
||||
u32 ScanExclusive(u32 n, int lIdx, int lSize)
|
||||
{
|
||||
u32 blocksum;
|
||||
int offset = 1;
|
||||
for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)
|
||||
{
|
||||
GROUP_LDS_BARRIER;
|
||||
for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)
|
||||
{
|
||||
int ai = offset*(2*iIdx+1)-1;
|
||||
int bi = offset*(2*iIdx+2)-1;
|
||||
ldsData[bi] += ldsData[ai];
|
||||
}
|
||||
}
|
||||
|
||||
GROUP_LDS_BARRIER;
|
||||
|
||||
if( lIdx == 0 )
|
||||
{
|
||||
blocksum = ldsData[ n-1 ];
|
||||
ldsData[ n-1 ] = 0;
|
||||
}
|
||||
|
||||
GROUP_LDS_BARRIER;
|
||||
|
||||
offset >>= 1;
|
||||
for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )
|
||||
{
|
||||
GROUP_LDS_BARRIER;
|
||||
for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )
|
||||
{
|
||||
int ai = offset*(2*iIdx+1)-1;
|
||||
int bi = offset*(2*iIdx+2)-1;
|
||||
u32 temp = ldsData[ai];
|
||||
ldsData[ai] = ldsData[bi];
|
||||
ldsData[bi] += temp;
|
||||
}
|
||||
}
|
||||
GROUP_LDS_BARRIER;
|
||||
|
||||
return blocksum;
|
||||
}
|
||||
|
||||
[numthreads(WG_SIZE, 1, 1)]
|
||||
void LocalScanKernel(uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID)
|
||||
{
|
||||
int gIdx = GET_GLOBAL_IDX;
|
||||
int lIdx = GET_LOCAL_IDX;
|
||||
|
||||
ldsData[2*lIdx] = ( 2*gIdx < m_numElems )? src[2*gIdx]: 0;
|
||||
ldsData[2*lIdx + 1] = ( 2*gIdx+1 < m_numElems )? src[2*gIdx + 1]: 0;
|
||||
|
||||
u32 sum = ScanExclusive(WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);
|
||||
|
||||
if( lIdx == 0 ) sumBuffer[GET_GROUP_IDX] = sum;
|
||||
|
||||
if( (2*gIdx) < m_numElems )
|
||||
{
|
||||
dst[2*gIdx] = ldsData[2*lIdx];
|
||||
}
|
||||
if( (2*gIdx + 1) < m_numElems )
|
||||
{
|
||||
dst[2*gIdx + 1] = ldsData[2*lIdx + 1];
|
||||
}
|
||||
}
|
||||
|
||||
[numthreads(WG_SIZE, 1, 1)]
|
||||
void TopLevelScanKernel(uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID)
|
||||
{
|
||||
int gIdx = GET_GLOBAL_IDX;
|
||||
int lIdx = GET_LOCAL_IDX;
|
||||
int lSize = GET_GROUP_SIZE;
|
||||
|
||||
for(int i=lIdx; i<m_numScanBlocks; i+=lSize )
|
||||
{
|
||||
ldsData[i] = (i<m_numBlocks)? dst[i]:0;
|
||||
}
|
||||
|
||||
GROUP_LDS_BARRIER;
|
||||
|
||||
u32 sum = ScanExclusive(m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);
|
||||
|
||||
for(int i=lIdx; i<m_numBlocks; i+=lSize )
|
||||
{
|
||||
dst[i] = ldsData[i];
|
||||
}
|
||||
|
||||
if( gIdx == 0 )
|
||||
{
|
||||
dst[m_numBlocks] = sum;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
RWStructuredBuffer<uint> blockSum2 : register( u1 );
|
||||
|
||||
[numthreads(WG_SIZE, 1, 1)]
|
||||
void AddOffsetKernel(uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID)
|
||||
{
|
||||
const u32 blockSize = WG_SIZE*2;
|
||||
|
||||
int myIdx = GET_GROUP_IDX+1;
|
||||
int llIdx = GET_LOCAL_IDX;
|
||||
|
||||
u32 iBlockSum = blockSum2[myIdx];
|
||||
|
||||
int endValue = min((myIdx+1)*(blockSize), m_numElems);
|
||||
for(int i=myIdx*blockSize+llIdx; i<endValue; i+=GET_GROUP_SIZE)
|
||||
{
|
||||
dst[i] += iBlockSum;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,143 @@
|
||||
static const char* prefixScanKernelsCL= \
|
||||
"/*\n"
|
||||
" 2011 Takahiro Harada\n"
|
||||
"*/\n"
|
||||
"\n"
|
||||
"typedef unsigned int u32;\n"
|
||||
"#define GET_GROUP_IDX get_group_id(0)\n"
|
||||
"#define GET_LOCAL_IDX get_local_id(0)\n"
|
||||
"#define GET_GLOBAL_IDX get_global_id(0)\n"
|
||||
"#define GET_GROUP_SIZE get_local_size(0)\n"
|
||||
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
|
||||
"\n"
|
||||
"// takahiro end\n"
|
||||
"#define WG_SIZE 128\n"
|
||||
"\n"
|
||||
"typedef struct\n"
|
||||
"{\n"
|
||||
" uint m_numElems;\n"
|
||||
" uint m_numBlocks;\n"
|
||||
" uint m_numScanBlocks;\n"
|
||||
" uint m_padding[1];\n"
|
||||
"} ConstBuffer;\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"u32 ScanExclusive(__local u32* data, u32 n, int lIdx, int lSize)\n"
|
||||
"{\n"
|
||||
" u32 blocksum;\n"
|
||||
" int offset = 1;\n"
|
||||
" for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)\n"
|
||||
" {\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
" for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)\n"
|
||||
" {\n"
|
||||
" int ai = offset*(2*iIdx+1)-1;\n"
|
||||
" int bi = offset*(2*iIdx+2)-1;\n"
|
||||
" data[bi] += data[ai];\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
"\n"
|
||||
" if( lIdx == 0 )\n"
|
||||
" {\n"
|
||||
" blocksum = data[ n-1 ];\n"
|
||||
" data[ n-1 ] = 0;\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
"\n"
|
||||
" offset >>= 1;\n"
|
||||
" for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )\n"
|
||||
" {\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
" for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )\n"
|
||||
" {\n"
|
||||
" int ai = offset*(2*iIdx+1)-1;\n"
|
||||
" int bi = offset*(2*iIdx+2)-1;\n"
|
||||
" u32 temp = data[ai];\n"
|
||||
" data[ai] = data[bi];\n"
|
||||
" data[bi] += temp;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
"\n"
|
||||
" return blocksum;\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
|
||||
"__kernel\n"
|
||||
"void LocalScanKernel(__global u32* dst, __global u32 *src, __global u32 *sumBuffer,\n"
|
||||
" ConstBuffer cb)\n"
|
||||
"{\n"
|
||||
" __local u32 ldsData[WG_SIZE*2];\n"
|
||||
"\n"
|
||||
" int gIdx = GET_GLOBAL_IDX;\n"
|
||||
" int lIdx = GET_LOCAL_IDX;\n"
|
||||
"\n"
|
||||
" ldsData[2*lIdx] = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0;\n"
|
||||
" ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0;\n"
|
||||
"\n"
|
||||
" u32 sum = ScanExclusive(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
|
||||
"\n"
|
||||
" if( lIdx == 0 ) sumBuffer[GET_GROUP_IDX] = sum;\n"
|
||||
"\n"
|
||||
" if( (2*gIdx) < cb.m_numElems )\n"
|
||||
" {\n"
|
||||
" dst[2*gIdx] = ldsData[2*lIdx];\n"
|
||||
" }\n"
|
||||
" if( (2*gIdx + 1) < cb.m_numElems )\n"
|
||||
" {\n"
|
||||
" dst[2*gIdx + 1] = ldsData[2*lIdx + 1];\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
|
||||
"__kernel\n"
|
||||
"void AddOffsetKernel(__global u32 *dst, __global u32 *blockSum, ConstBuffer cb)\n"
|
||||
"{\n"
|
||||
" const u32 blockSize = WG_SIZE*2;\n"
|
||||
"\n"
|
||||
" int myIdx = GET_GROUP_IDX+1;\n"
|
||||
" int lIdx = GET_LOCAL_IDX;\n"
|
||||
"\n"
|
||||
" u32 iBlockSum = blockSum[myIdx];\n"
|
||||
"\n"
|
||||
" int endValue = min((myIdx+1)*(blockSize), cb.m_numElems);\n"
|
||||
" for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE)\n"
|
||||
" {\n"
|
||||
" dst[i] += iBlockSum;\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
|
||||
"__kernel\n"
|
||||
"void TopLevelScanKernel(__global u32* dst, ConstBuffer cb)\n"
|
||||
"{\n"
|
||||
" __local u32 ldsData[2048];\n"
|
||||
" int gIdx = GET_GLOBAL_IDX;\n"
|
||||
" int lIdx = GET_LOCAL_IDX;\n"
|
||||
" int lSize = GET_GROUP_SIZE;\n"
|
||||
"\n"
|
||||
" for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize )\n"
|
||||
" {\n"
|
||||
" ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0;\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
"\n"
|
||||
" u32 sum = ScanExclusive(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
|
||||
"\n"
|
||||
" for(int i=lIdx; i<cb.m_numBlocks; i+=lSize )\n"
|
||||
" {\n"
|
||||
" dst[i] = ldsData[i];\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" if( gIdx == 0 )\n"
|
||||
" {\n"
|
||||
" dst[cb.m_numBlocks] = sum;\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
;
|
||||
@@ -0,0 +1,147 @@
|
||||
static const char* prefixScanKernelsDX11= \
|
||||
"/*\n"
|
||||
" 2011 Takahiro Harada\n"
|
||||
"*/\n"
|
||||
"\n"
|
||||
"typedef uint u32;\n"
|
||||
"\n"
|
||||
"#define GET_GROUP_IDX groupIdx.x\n"
|
||||
"#define GET_LOCAL_IDX localIdx.x\n"
|
||||
"#define GET_GLOBAL_IDX globalIdx.x\n"
|
||||
"#define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync()\n"
|
||||
"\n"
|
||||
"// takahiro end\n"
|
||||
"#define WG_SIZE 128\n"
|
||||
"\n"
|
||||
"#define GET_GROUP_SIZE WG_SIZE\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"cbuffer SortCB : register( b0 )\n"
|
||||
"{\n"
|
||||
" int m_numElems;\n"
|
||||
" int m_numBlocks;\n"
|
||||
" int m_numScanBlocks;\n"
|
||||
"};\n"
|
||||
" \n"
|
||||
"RWStructuredBuffer<uint> dst : register( u0 );\n"
|
||||
"RWStructuredBuffer<uint> src : register( u1 );\n"
|
||||
"RWStructuredBuffer<uint> sumBuffer : register( u2 );\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"groupshared u32 ldsData[2048];\n"
|
||||
"\n"
|
||||
"u32 ScanExclusive(u32 n, int lIdx, int lSize)\n"
|
||||
"{\n"
|
||||
" u32 blocksum;\n"
|
||||
" int offset = 1;\n"
|
||||
" for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)\n"
|
||||
" {\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
" for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)\n"
|
||||
" {\n"
|
||||
" int ai = offset*(2*iIdx+1)-1;\n"
|
||||
" int bi = offset*(2*iIdx+2)-1;\n"
|
||||
" ldsData[bi] += ldsData[ai];\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
"\n"
|
||||
" if( lIdx == 0 )\n"
|
||||
" {\n"
|
||||
" blocksum = ldsData[ n-1 ];\n"
|
||||
" ldsData[ n-1 ] = 0;\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
"\n"
|
||||
" offset >>= 1;\n"
|
||||
" for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )\n"
|
||||
" {\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
" for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )\n"
|
||||
" {\n"
|
||||
" int ai = offset*(2*iIdx+1)-1;\n"
|
||||
" int bi = offset*(2*iIdx+2)-1;\n"
|
||||
" u32 temp = ldsData[ai];\n"
|
||||
" ldsData[ai] = ldsData[bi];\n"
|
||||
" ldsData[bi] += temp;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
"\n"
|
||||
" return blocksum;\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"[numthreads(WG_SIZE, 1, 1)]\n"
|
||||
"void LocalScanKernel(uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID)\n"
|
||||
"{\n"
|
||||
" int gIdx = GET_GLOBAL_IDX;\n"
|
||||
" int lIdx = GET_LOCAL_IDX;\n"
|
||||
"\n"
|
||||
" ldsData[2*lIdx] = ( 2*gIdx < m_numElems )? src[2*gIdx]: 0;\n"
|
||||
" ldsData[2*lIdx + 1] = ( 2*gIdx+1 < m_numElems )? src[2*gIdx + 1]: 0;\n"
|
||||
"\n"
|
||||
" u32 sum = ScanExclusive(WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
|
||||
"\n"
|
||||
" if( lIdx == 0 ) sumBuffer[GET_GROUP_IDX] = sum;\n"
|
||||
"\n"
|
||||
" if( (2*gIdx) < m_numElems )\n"
|
||||
" {\n"
|
||||
" dst[2*gIdx] = ldsData[2*lIdx];\n"
|
||||
" }\n"
|
||||
" if( (2*gIdx + 1) < m_numElems )\n"
|
||||
" {\n"
|
||||
" dst[2*gIdx + 1] = ldsData[2*lIdx + 1];\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"[numthreads(WG_SIZE, 1, 1)]\n"
|
||||
"void TopLevelScanKernel(uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID)\n"
|
||||
"{\n"
|
||||
" int gIdx = GET_GLOBAL_IDX;\n"
|
||||
" int lIdx = GET_LOCAL_IDX;\n"
|
||||
" int lSize = GET_GROUP_SIZE;\n"
|
||||
"\n"
|
||||
" for(int i=lIdx; i<m_numScanBlocks; i+=lSize )\n"
|
||||
" {\n"
|
||||
" ldsData[i] = (i<m_numBlocks)? dst[i]:0;\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
"\n"
|
||||
" u32 sum = ScanExclusive(m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
|
||||
"\n"
|
||||
" for(int i=lIdx; i<m_numBlocks; i+=lSize )\n"
|
||||
" {\n"
|
||||
" dst[i] = ldsData[i];\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" if( gIdx == 0 )\n"
|
||||
" {\n"
|
||||
" dst[m_numBlocks] = sum;\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"\n"
|
||||
" \n"
|
||||
"RWStructuredBuffer<uint> blockSum2 : register( u1 );\n"
|
||||
"\n"
|
||||
"[numthreads(WG_SIZE, 1, 1)]\n"
|
||||
"void AddOffsetKernel(uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID)\n"
|
||||
"{\n"
|
||||
" const u32 blockSize = WG_SIZE*2;\n"
|
||||
"\n"
|
||||
" int myIdx = GET_GROUP_IDX+1;\n"
|
||||
" int llIdx = GET_LOCAL_IDX;\n"
|
||||
"\n"
|
||||
" u32 iBlockSum = blockSum2[myIdx];\n"
|
||||
"\n"
|
||||
" int endValue = min((myIdx+1)*(blockSize), m_numElems);\n"
|
||||
" for(int i=myIdx*blockSize+llIdx; i<endValue; i+=GET_GROUP_SIZE)\n"
|
||||
" {\n"
|
||||
" dst[i] += iBlockSum;\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
;
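// Reference sketch (plain C++, illustrative only): the three DX11 kernels above form a
// classic block-wise exclusive scan - LocalScanKernel scans each 256-element block and
// emits its block sum, TopLevelScanKernel exclusively scans the block sums, and
// AddOffsetKernel adds each block's offset back in.  The function below mirrors that
// flow on the CPU; the function name and block size here are illustrative assumptions.
#include <algorithm>
#include <vector>

std::vector<unsigned int> exclusiveScanReference(const std::vector<unsigned int>& src)
{
	const int blockSize = 256;                               // WG_SIZE*2 in the kernels above
	const int n = (int)src.size();
	const int nBlocks = (n + blockSize - 1) / blockSize;
	std::vector<unsigned int> dst(n), blockSum(nBlocks + 1, 0);

	for (int b = 0; b < nBlocks; b++)                        // LocalScanKernel: per-block exclusive scan
	{
		unsigned int sum = 0;
		for (int i = b * blockSize; i < std::min((b + 1) * blockSize, n); i++)
		{
			dst[i] = sum;
			sum += src[i];
		}
		blockSum[b] = sum;
	}

	unsigned int running = 0;                                // TopLevelScanKernel: scan the block sums
	for (int b = 0; b <= nBlocks; b++)
	{
		unsigned int s = blockSum[b];
		blockSum[b] = running;
		running += s;
	}

	for (int b = 1; b < nBlocks; b++)                        // AddOffsetKernel: add each block's offset
		for (int i = b * blockSize; i < std::min((b + 1) * blockSize, n); i++)
			dst[i] += blockSum[b];

	return dst;
}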
|
||||
@@ -0,0 +1,73 @@
|
||||
/*
|
||||
Copyright (c) 2012 Advanced Micro Devices, Inc.
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
//Originally written by Takahiro Harada
|
||||
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <Adl/Adl.h>
|
||||
#include <AdlPrimitives/Math/Math.h>
|
||||
#include <AdlPrimitives/Sort/SortData.h>
|
||||
#include <AdlPrimitives/Fill/Fill.h>
|
||||
|
||||
namespace adl
|
||||
{
|
||||
|
||||
class BoundSearchBase
|
||||
{
|
||||
public:
|
||||
enum Option
|
||||
{
|
||||
BOUND_LOWER,
|
||||
BOUND_UPPER,
|
||||
COUNT,
|
||||
};
|
||||
};
|
||||
|
||||
template<DeviceType TYPE>
|
||||
class BoundSearch : public BoundSearchBase
|
||||
{
|
||||
public:
|
||||
typedef Launcher::BufferInfo BufferInfo;
|
||||
|
||||
struct Data
|
||||
{
|
||||
const Device* m_device;
|
||||
Kernel* m_lowerSortDataKernel;
|
||||
Kernel* m_upperSortDataKernel;
|
||||
Kernel* m_subtractKernel;
|
||||
Buffer<int4>* m_constBuffer;
|
||||
Buffer<u32>* m_lower;
|
||||
Buffer<u32>* m_upper;
|
||||
typename Fill<TYPE>::Data* m_fillData;
|
||||
};
|
||||
|
||||
static
|
||||
Data* allocate(const Device* deviceData, int maxSize = 0);
|
||||
|
||||
static
|
||||
void deallocate(Data* data);
|
||||
|
||||
// src has to be src[i].m_key <= src[i+1].m_key
|
||||
static
|
||||
void execute(Data* data, Buffer<SortData>& src, u32 nSrc, Buffer<u32>& dst, u32 nDst, Option option = BOUND_LOWER );
|
||||
|
||||
// static
|
||||
// void execute(Data* data, Buffer<u32>& src, Buffer<u32>& dst, int n, Option option = );
|
||||
};
|
||||
|
||||
#include <AdlPrimitives/Search/BoundSearchHost.inl>
|
||||
#include <AdlPrimitives/Search/BoundSearch.inl>
|
||||
|
||||
};
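// Rough usage sketch for the BoundSearch primitive declared above (illustrative only;
// the buffer names and the OpenCL backend choice are assumptions):
//
//	BoundSearch<TYPE_CL>::Data* bs = BoundSearch<TYPE_CL>::allocate( device, nDst ); // non-zero maxSize enables COUNT
//	// sortedPairs must already satisfy sortedPairs[i].m_key <= sortedPairs[i+1].m_key
//	BoundSearch<TYPE_CL>::execute( bs, sortedPairs, nSrc, lowerBounds, nDst, BoundSearchBase::BOUND_LOWER );
//	BoundSearch<TYPE_CL>::execute( bs, sortedPairs, nSrc, counts, nDst, BoundSearchBase::COUNT );
//	BoundSearch<TYPE_CL>::deallocate( bs );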
|
||||
@@ -0,0 +1,128 @@
|
||||
/*
|
||||
Copyright (c) 2012 Advanced Micro Devices, Inc.
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
//Originally written by Takahiro Harada
|
||||
|
||||
#define PATH "..\\..\\opencl\\primitives\\AdlPrimitives\\Search\\BoundSearchKernels"
|
||||
#define KERNEL0 "SearchSortDataLowerKernel"
|
||||
#define KERNEL1 "SearchSortDataUpperKernel"
|
||||
#define KERNEL2 "SubtractKernel"
|
||||
|
||||
#include <AdlPrimitives/Search/BoundSearchKernelsCL.h>
|
||||
#include <AdlPrimitives/Search/BoundSearchKernelsDX11.h>
|
||||
|
||||
template<DeviceType TYPE>
|
||||
typename BoundSearch<TYPE>::Data* BoundSearch<TYPE>::allocate(const Device* device, int maxSize)
|
||||
{
|
||||
ADLASSERT( TYPE == device->m_type );
|
||||
|
||||
const char* src[] =
|
||||
#if defined(ADL_LOAD_KERNEL_FROM_STRING)
|
||||
{boundSearchKernelsCL, boundSearchKernelsDX11};
|
||||
#else
|
||||
{0,0};
|
||||
#endif
|
||||
|
||||
Data* data = new Data;
|
||||
|
||||
data->m_device = device;
|
||||
data->m_lowerSortDataKernel = device->getKernel( PATH, KERNEL0, 0, src[TYPE] );
|
||||
data->m_upperSortDataKernel = device->getKernel( PATH, KERNEL1, 0, src[TYPE] );
|
||||
data->m_constBuffer = new Buffer<int4>( device, 1, BufferBase::BUFFER_CONST );
|
||||
if( maxSize )
|
||||
{
|
||||
data->m_subtractKernel = device->getKernel( PATH, KERNEL2, 0, src[TYPE] );
|
||||
}
|
||||
data->m_lower = (maxSize == 0)? 0: new Buffer<u32>( device, maxSize );
|
||||
data->m_upper = (maxSize == 0)? 0: new Buffer<u32>( device, maxSize );
|
||||
data->m_fillData = (maxSize == 0)? 0: Fill<TYPE>::allocate( device );
|
||||
|
||||
return data;
|
||||
}
|
||||
|
||||
template<DeviceType TYPE>
|
||||
void BoundSearch<TYPE>::deallocate(Data* data)
|
||||
{
|
||||
delete data->m_constBuffer;
|
||||
if( data->m_lower ) delete data->m_lower;
|
||||
if( data->m_upper ) delete data->m_upper;
|
||||
if( data->m_fillData ) Fill<TYPE>::deallocate( data->m_fillData );
|
||||
delete data;
|
||||
}
|
||||
|
||||
template<DeviceType TYPE>
|
||||
void BoundSearch<TYPE>::execute(Data* data, Buffer<SortData>& src, u32 nSrc, Buffer<u32>& dst, u32 nDst, Option option )
|
||||
{
|
||||
int4 constBuffer;
|
||||
constBuffer.x = nSrc;
|
||||
constBuffer.y = nDst;
|
||||
|
||||
Buffer<SortData>* srcNative = BufferUtils::map<TYPE, true>( data->m_device, &src );
|
||||
Buffer<u32>* dstNative = BufferUtils::map<TYPE, false>( data->m_device, &dst );
|
||||
|
||||
if( option == BOUND_LOWER )
|
||||
{
|
||||
BufferInfo bInfo[] = { BufferInfo( srcNative, true ), BufferInfo( dstNative ) };
|
||||
|
||||
Launcher launcher( data->m_device, data->m_lowerSortDataKernel );
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
|
||||
launcher.setConst( *data->m_constBuffer, constBuffer );
|
||||
launcher.launch1D( nSrc, 64 );
|
||||
}
|
||||
else if( option == BOUND_UPPER )
|
||||
{
|
||||
BufferInfo bInfo[] = { BufferInfo( srcNative, true ), BufferInfo( dstNative ) };
|
||||
|
||||
Launcher launcher( data->m_device, data->m_upperSortDataKernel );
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
|
||||
launcher.setConst( *data->m_constBuffer, constBuffer );
|
||||
launcher.launch1D( nSrc+1, 64 );
|
||||
}
|
||||
else if( option == COUNT )
|
||||
{
|
||||
ADLASSERT( data->m_lower );
|
||||
ADLASSERT( data->m_upper );
|
||||
ADLASSERT( data->m_lower->getSize() <= (int)nDst );
|
||||
ADLASSERT( data->m_upper->getSize() <= (int)nDst );
|
||||
|
||||
int zero = 0;
|
||||
Fill<TYPE>::execute( data->m_fillData, (Buffer<int>&)*data->m_lower, zero, nDst );
|
||||
Fill<TYPE>::execute( data->m_fillData, (Buffer<int>&)*data->m_upper, zero, nDst );
|
||||
|
||||
execute( data, src, nSrc, *data->m_lower, nDst, BOUND_LOWER );
|
||||
execute( data, src, nSrc, *data->m_upper, nDst, BOUND_UPPER );
|
||||
|
||||
{
|
||||
BufferInfo bInfo[] = { BufferInfo( data->m_upper, true ), BufferInfo( data->m_lower, true ), BufferInfo( dstNative ) };
|
||||
|
||||
Launcher launcher( data->m_device, data->m_subtractKernel );
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
|
||||
launcher.setConst( *data->m_constBuffer, constBuffer );
|
||||
launcher.launch1D( nDst, 64 );
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
ADLASSERT( 0 );
|
||||
}
|
||||
|
||||
BufferUtils::unmap<false>( srcNative, &src );
|
||||
BufferUtils::unmap<true>( dstNative, &dst );
|
||||
}
|
||||
|
||||
|
||||
#undef PATH
|
||||
#undef KERNEL0
|
||||
#undef KERNEL1
|
||||
#undef KERNEL2
|
||||
|
||||
@@ -0,0 +1,111 @@
|
||||
/*
|
||||
Copyright (c) 2012 Advanced Micro Devices, Inc.
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
//Originally written by Takahiro Harada
|
||||
|
||||
|
||||
template<>
|
||||
class BoundSearch<TYPE_HOST> : public BoundSearchBase
|
||||
{
|
||||
public:
|
||||
typedef Launcher::BufferInfo BufferInfo;
|
||||
|
||||
struct Data
|
||||
{
|
||||
const Device* m_device;
|
||||
};
|
||||
|
||||
static
|
||||
Data* allocate(const Device* deviceData, int maxSize = 0)
|
||||
{
|
||||
ADLASSERT( deviceData->m_type == TYPE_HOST );
|
||||
Data* data = new Data;
|
||||
data->m_device = deviceData;
|
||||
return data;
|
||||
}
|
||||
|
||||
static
|
||||
void deallocate(Data* data)
|
||||
{
|
||||
delete data;
|
||||
}
|
||||
|
||||
static
|
||||
void execute(Data* data, Buffer<SortData>& rawSrc, u32 nSrc, Buffer<u32>& rawDst, u32 nDst, Option option = BOUND_LOWER)
|
||||
{
|
||||
ADLASSERT( rawSrc.getType() == TYPE_HOST );
|
||||
ADLASSERT( rawDst.getType() == TYPE_HOST );
|
||||
|
||||
HostBuffer<SortData>& src = *(HostBuffer<SortData>*)&rawSrc;
|
||||
HostBuffer<u32>& dst = *(HostBuffer<u32>*)&rawDst;
|
||||
|
||||
for(u32 i=0; i+1<nSrc; i++)	// i+1<nSrc avoids u32 wrap-around when nSrc is 0
|
||||
ADLASSERT( src[i].m_key <= src[i+1].m_key );
|
||||
|
||||
if( option == BOUND_LOWER )
|
||||
{
|
||||
for(u32 i=0; i<nSrc; i++)
|
||||
{
|
||||
SortData iData = (i==0)? SortData(-1,-1): src[i-1];	// copy by value; a non-const reference cannot bind to the temporary
|
||||
SortData jData = (i==nSrc)? SortData(nDst, nDst): src[i];
|
||||
|
||||
if( iData.m_key != jData.m_key )
|
||||
{
|
||||
// for(u32 k=iData.m_key+1; k<=min(jData.m_key,nDst-1); k++)
|
||||
u32 k = jData.m_key;
|
||||
{
|
||||
dst[k] = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else if( option == BOUND_UPPER )
|
||||
{
|
||||
for(u32 i=0; i<nSrc+1; i++)
|
||||
{
|
||||
SortData iData = (i==0)? SortData(0,0): src[i-1];	// copy by value, as in the lower-bound branch
|
||||
SortData jData = (i==nSrc)? SortData(nDst, nDst): src[i];
|
||||
|
||||
if( iData.m_key != jData.m_key )
|
||||
{
|
||||
// for(u32 k=iData.m_key; k<min(jData.m_key,nDst); k++)
|
||||
u32 k = iData.m_key;
|
||||
{
|
||||
dst[k] = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else if( option == COUNT )
|
||||
{
|
||||
HostBuffer<u32> lower( data->m_device, nDst );
|
||||
HostBuffer<u32> upper( data->m_device, nDst );
|
||||
|
||||
for(u32 i=0; i<nDst; i++) { lower[i] = upper[i] = 0; }
|
||||
|
||||
execute( data, rawSrc, nSrc, lower, nDst, BOUND_LOWER );
|
||||
execute( data, rawSrc, nSrc, upper, nDst, BOUND_UPPER );
|
||||
|
||||
for(u32 i=0; i<nDst; i++) { dst[i] = upper[i] - lower[i]; }
|
||||
}
|
||||
else
|
||||
{
|
||||
ADLASSERT( 0 );
|
||||
}
|
||||
}
|
||||
|
||||
// static
|
||||
// void execute(Data* data, Buffer<u32>& src, Buffer<u32>& dst, int n, Option option = );
|
||||
};
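// Worked example of the host semantics above (illustrative): with sorted keys
// {1,1,3,3,3} (nSrc=5) and nDst=5,
//	BOUND_LOWER writes dst[k] = index of the first element whose key is k    -> dst[1]=0, dst[3]=2
//	BOUND_UPPER writes dst[k] = one past the index of the last such element  -> dst[1]=2, dst[3]=5
//	COUNT       writes dst[k] = upper[k] - lower[k]                          -> dst[1]=2, dst[3]=3
// Entries for keys that never occur are only meaningful in COUNT mode, which
// zero-fills its work buffers before taking the upper-lower difference.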
|
||||
|
||||
|
||||
@@ -0,0 +1,112 @@
|
||||
/*
|
||||
Copyright (c) 2012 Advanced Micro Devices, Inc.
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
//Originally written by Takahiro Harada
|
||||
|
||||
|
||||
typedef unsigned int u32;
|
||||
#define GET_GROUP_IDX get_group_id(0)
|
||||
#define GET_LOCAL_IDX get_local_id(0)
|
||||
#define GET_GLOBAL_IDX get_global_id(0)
|
||||
#define GET_GROUP_SIZE get_local_size(0)
|
||||
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
|
||||
|
||||
typedef struct
|
||||
{
|
||||
u32 m_key;
|
||||
u32 m_value;
|
||||
}SortData;
|
||||
|
||||
|
||||
|
||||
typedef struct
|
||||
{
|
||||
u32 m_nSrc;
|
||||
u32 m_nDst;
|
||||
u32 m_padding[2];
|
||||
} ConstBuffer;
|
||||
|
||||
|
||||
|
||||
__attribute__((reqd_work_group_size(64,1,1)))
|
||||
__kernel
|
||||
void SearchSortDataLowerKernel(__global SortData* src, __global u32 *dst,
|
||||
ConstBuffer cb)
|
||||
{
|
||||
int gIdx = GET_GLOBAL_IDX;
|
||||
u32 nSrc = cb.m_nSrc;
|
||||
u32 nDst = cb.m_nDst;
|
||||
|
||||
if( gIdx < nSrc )
|
||||
{
|
||||
SortData first; first.m_key = (u32)(-1); first.m_value = (u32)(-1);
|
||||
SortData end; end.m_key = nDst; end.m_value = nDst;
|
||||
|
||||
SortData iData = (gIdx==0)? first: src[gIdx-1];
|
||||
SortData jData = (gIdx==nSrc)? end: src[gIdx];
|
||||
|
||||
if( iData.m_key != jData.m_key )
|
||||
{
|
||||
// for(u32 k=iData.m_key+1; k<=min(jData.m_key, nDst-1); k++)
|
||||
u32 k = jData.m_key;
|
||||
{
|
||||
dst[k] = gIdx;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
__attribute__((reqd_work_group_size(64,1,1)))
|
||||
__kernel
|
||||
void SearchSortDataUpperKernel(__global SortData* src, __global u32 *dst,
|
||||
ConstBuffer cb)
|
||||
{
|
||||
int gIdx = GET_GLOBAL_IDX;
|
||||
u32 nSrc = cb.m_nSrc;
|
||||
u32 nDst = cb.m_nDst;
|
||||
|
||||
if( gIdx < nSrc+1 )
|
||||
{
|
||||
SortData first; first.m_key = 0; first.m_value = 0;
|
||||
SortData end; end.m_key = nDst; end.m_value = nDst;
|
||||
|
||||
SortData iData = (gIdx==0)? first: src[gIdx-1];
|
||||
SortData jData = (gIdx==nSrc)? end: src[gIdx];
|
||||
|
||||
if( iData.m_key != jData.m_key )
|
||||
{
|
||||
// for(u32 k=iData.m_key; k<min(jData.m_key, nDst); k++)
|
||||
u32 k = iData.m_key;
|
||||
{
|
||||
dst[k] = gIdx;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__attribute__((reqd_work_group_size(64,1,1)))
|
||||
__kernel
|
||||
void SubtractKernel(__global u32* A, __global u32 *B, __global u32 *C,
|
||||
ConstBuffer cb)
|
||||
{
|
||||
int gIdx = GET_GLOBAL_IDX;
|
||||
u32 nSrc = cb.m_nSrc;
|
||||
u32 nDst = cb.m_nDst;
|
||||
|
||||
if( gIdx < nDst )
|
||||
{
|
||||
C[gIdx] = A[gIdx] - B[gIdx];
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,104 @@
|
||||
/*
|
||||
Copyright (c) 2012 Advanced Micro Devices, Inc.
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
//Originally written by Takahiro Harada
|
||||
|
||||
|
||||
typedef uint u32;
|
||||
|
||||
#define GET_GROUP_IDX groupIdx.x
|
||||
#define GET_LOCAL_IDX localIdx.x
|
||||
#define GET_GLOBAL_IDX globalIdx.x
|
||||
#define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync()
|
||||
#define DEFAULT_ARGS uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID
|
||||
#define AtomInc(x) InterlockedAdd(x, 1)
|
||||
#define AtomInc1(x, out) InterlockedAdd(x, 1, out)
|
||||
|
||||
|
||||
|
||||
typedef struct
|
||||
{
|
||||
u32 m_key;
|
||||
u32 m_value;
|
||||
}SortData;
|
||||
|
||||
|
||||
|
||||
cbuffer SortCB : register( b0 )
|
||||
{
|
||||
u32 m_nSrc;
|
||||
u32 m_nDst;
|
||||
u32 m_padding[2];
|
||||
};
|
||||
|
||||
|
||||
StructuredBuffer<SortData> src : register( t0 );
|
||||
RWStructuredBuffer<u32> dst : register( u0 );
|
||||
|
||||
|
||||
[numthreads(64, 1, 1)]
|
||||
void SearchSortDataLowerKernel( DEFAULT_ARGS )
|
||||
{
|
||||
int gIdx = GET_GLOBAL_IDX;
|
||||
u32 nSrc = m_nSrc;
|
||||
u32 nDst = m_nDst;
|
||||
|
||||
if( gIdx < nSrc )
|
||||
{
|
||||
SortData iData;
|
||||
SortData jData;
|
||||
if( gIdx==0 ) iData.m_key = iData.m_value = (u32)-1;
|
||||
else iData = src[gIdx-1];
|
||||
|
||||
if( gIdx==nSrc ) jData.m_key = jData.m_value = nDst;
|
||||
else jData = src[gIdx];
|
||||
|
||||
if( iData.m_key != jData.m_key )
|
||||
{
|
||||
// for(u32 k=iData.m_key+1; k<=min(jData.m_key, nDst-1); k++)
|
||||
u32 k = jData.m_key;
|
||||
{
|
||||
dst[k] = gIdx;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
[numthreads(64, 1, 1)]
|
||||
void SearchSortDataUpperKernel( DEFAULT_ARGS )
|
||||
{
|
||||
int gIdx = GET_GLOBAL_IDX;
|
||||
u32 nSrc = m_nSrc;
|
||||
u32 nDst = m_nDst;
|
||||
|
||||
if( gIdx < nSrc+1 )
|
||||
{
|
||||
SortData iData;
|
||||
SortData jData;
|
||||
if( gIdx==0 ) iData.m_key = iData.m_value = 0;
|
||||
else iData = src[gIdx-1];
|
||||
|
||||
if( gIdx==nSrc ) jData.m_key = jData.m_value = nDst;
|
||||
else jData = src[gIdx];
|
||||
|
||||
if( iData.m_key != jData.m_key )
|
||||
{
|
||||
// for(u32 k=iData.m_key; k<min(jData.m_key, nDst); k++)
|
||||
u32 k = iData.m_key;
|
||||
{
|
||||
dst[k] = gIdx;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,102 @@
|
||||
static const char* boundSearchKernelsCL= \
|
||||
"/*\n"
|
||||
" 2011 Takahiro Harada\n"
|
||||
"*/\n"
|
||||
"\n"
|
||||
"typedef unsigned int u32;\n"
|
||||
"#define GET_GROUP_IDX get_group_id(0)\n"
|
||||
"#define GET_LOCAL_IDX get_local_id(0)\n"
|
||||
"#define GET_GLOBAL_IDX get_global_id(0)\n"
|
||||
"#define GET_GROUP_SIZE get_local_size(0)\n"
|
||||
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
|
||||
"\n"
|
||||
"typedef struct\n"
|
||||
"{\n"
|
||||
" u32 m_key; \n"
|
||||
" u32 m_value;\n"
|
||||
"}SortData;\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"typedef struct\n"
|
||||
"{\n"
|
||||
" u32 m_nSrc;\n"
|
||||
" u32 m_nDst;\n"
|
||||
" u32 m_padding[2];\n"
|
||||
"} ConstBuffer;\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"__attribute__((reqd_work_group_size(64,1,1)))\n"
|
||||
"__kernel\n"
|
||||
"void SearchSortDataLowerKernel(__global SortData* src, __global u32 *dst, \n"
|
||||
" ConstBuffer cb)\n"
|
||||
"{\n"
|
||||
" int gIdx = GET_GLOBAL_IDX;\n"
|
||||
" u32 nSrc = cb.m_nSrc;\n"
|
||||
" u32 nDst = cb.m_nDst;\n"
|
||||
"\n"
|
||||
" if( gIdx < nSrc )\n"
|
||||
" {\n"
|
||||
" SortData first; first.m_key = (u32)(-1); first.m_value = (u32)(-1);\n"
|
||||
" SortData end; end.m_key = nDst; end.m_value = nDst;\n"
|
||||
"\n"
|
||||
" SortData iData = (gIdx==0)? first: src[gIdx-1];\n"
|
||||
" SortData jData = (gIdx==nSrc)? end: src[gIdx];\n"
|
||||
"\n"
|
||||
" if( iData.m_key != jData.m_key )\n"
|
||||
" {\n"
|
||||
"// for(u32 k=iData.m_key+1; k<=min(jData.m_key, nDst-1); k++)\n"
|
||||
" u32 k = jData.m_key;\n"
|
||||
" {\n"
|
||||
" dst[k] = gIdx;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"__attribute__((reqd_work_group_size(64,1,1)))\n"
|
||||
"__kernel\n"
|
||||
"void SearchSortDataUpperKernel(__global SortData* src, __global u32 *dst, \n"
|
||||
" ConstBuffer cb)\n"
|
||||
"{\n"
|
||||
" int gIdx = GET_GLOBAL_IDX;\n"
|
||||
" u32 nSrc = cb.m_nSrc;\n"
|
||||
" u32 nDst = cb.m_nDst;\n"
|
||||
"\n"
|
||||
" if( gIdx < nSrc+1 )\n"
|
||||
" {\n"
|
||||
" SortData first; first.m_key = 0; first.m_value = 0;\n"
|
||||
" SortData end; end.m_key = nDst; end.m_value = nDst;\n"
|
||||
"\n"
|
||||
" SortData iData = (gIdx==0)? first: src[gIdx-1];\n"
|
||||
" SortData jData = (gIdx==nSrc)? end: src[gIdx];\n"
|
||||
"\n"
|
||||
" if( iData.m_key != jData.m_key )\n"
|
||||
" {\n"
|
||||
"// for(u32 k=iData.m_key; k<min(jData.m_key, nDst); k++)\n"
|
||||
" u32 k = iData.m_key;\n"
|
||||
" {\n"
|
||||
" dst[k] = gIdx;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"__attribute__((reqd_work_group_size(64,1,1)))\n"
|
||||
"__kernel\n"
|
||||
"void SubtractKernel(__global u32* A, __global u32 *B, __global u32 *C, \n"
|
||||
" ConstBuffer cb)\n"
|
||||
"{\n"
|
||||
" int gIdx = GET_GLOBAL_IDX;\n"
|
||||
" u32 nSrc = cb.m_nSrc;\n"
|
||||
" u32 nDst = cb.m_nDst;\n"
|
||||
"\n"
|
||||
" if( gIdx < nDst )\n"
|
||||
" {\n"
|
||||
" C[gIdx] = A[gIdx] - B[gIdx];\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
;
|
||||
@@ -0,0 +1,94 @@
|
||||
static const char* boundSearchKernelsDX11= \
|
||||
"/*\n"
|
||||
" 2011 Takahiro Harada\n"
|
||||
"*/\n"
|
||||
"\n"
|
||||
"typedef uint u32;\n"
|
||||
"\n"
|
||||
"#define GET_GROUP_IDX groupIdx.x\n"
|
||||
"#define GET_LOCAL_IDX localIdx.x\n"
|
||||
"#define GET_GLOBAL_IDX globalIdx.x\n"
|
||||
"#define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync()\n"
|
||||
"#define DEFAULT_ARGS uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID\n"
|
||||
"#define AtomInc(x) InterlockedAdd(x, 1)\n"
|
||||
"#define AtomInc1(x, out) InterlockedAdd(x, 1, out)\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"typedef struct\n"
|
||||
"{\n"
|
||||
" u32 m_key; \n"
|
||||
" u32 m_value;\n"
|
||||
"}SortData;\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"cbuffer SortCB : register( b0 )\n"
|
||||
"{\n"
|
||||
" u32 m_nSrc;\n"
|
||||
" u32 m_nDst;\n"
|
||||
" u32 m_padding[2];\n"
|
||||
"};\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"StructuredBuffer<SortData> src : register( t0 );\n"
|
||||
"RWStructuredBuffer<u32> dst : register( u0 );\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"[numthreads(64, 1, 1)]\n"
|
||||
"void SearchSortDataLowerKernel( DEFAULT_ARGS )\n"
|
||||
"{\n"
|
||||
" int gIdx = GET_GLOBAL_IDX;\n"
|
||||
" u32 nSrc = m_nSrc;\n"
|
||||
" u32 nDst = m_nDst;\n"
|
||||
"\n"
|
||||
" if( gIdx < nSrc )\n"
|
||||
" {\n"
|
||||
" SortData iData;\n"
|
||||
" SortData jData;\n"
|
||||
" if( gIdx==0 ) iData.m_key = iData.m_value = (u32)-1;\n"
|
||||
" else iData = src[gIdx-1];\n"
|
||||
"\n"
|
||||
" if( gIdx==nSrc ) jData.m_key = jData.m_value = nDst;\n"
|
||||
" else jData = src[gIdx];\n"
|
||||
"\n"
|
||||
" if( iData.m_key != jData.m_key )\n"
|
||||
" {\n"
|
||||
"// for(u32 k=iData.m_key+1; k<=min(jData.m_key, nDst-1); k++)\n"
|
||||
" u32 k = jData.m_key;\n"
|
||||
" {\n"
|
||||
" dst[k] = gIdx;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"[numthreads(64, 1, 1)]\n"
|
||||
"void SearchSortDataUpperKernel( DEFAULT_ARGS )\n"
|
||||
"{\n"
|
||||
" int gIdx = GET_GLOBAL_IDX;\n"
|
||||
" u32 nSrc = m_nSrc;\n"
|
||||
" u32 nDst = m_nDst;\n"
|
||||
"\n"
|
||||
" if( gIdx < nSrc+1 )\n"
|
||||
" {\n"
|
||||
" SortData iData;\n"
|
||||
" SortData jData;\n"
|
||||
" if( gIdx==0 ) iData.m_key = iData.m_value = 0;\n"
|
||||
" else iData = src[gIdx-1];\n"
|
||||
"\n"
|
||||
" if( gIdx==nSrc ) jData.m_key = jData.m_value = nDst;\n"
|
||||
" else jData = src[gIdx];\n"
|
||||
"\n"
|
||||
" if( iData.m_key != jData.m_key )\n"
|
||||
" {\n"
|
||||
"// for(u32 k=iData.m_key; k<min(jData.m_key, nDst); k++)\n"
|
||||
" u32 k = iData.m_key;\n"
|
||||
" {\n"
|
||||
" dst[k] = gIdx;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
;
|
||||
@@ -0,0 +1,53 @@
|
||||
/*
|
||||
2011 Takahiro Harada
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <Adl/Adl.h>
|
||||
#include <AdlPrimitives/Math/Math.h>
|
||||
#include <AdlPrimitives/Sort/SortData.h>
|
||||
#include <AdlPrimitives/Scan/PrefixScan.h>
|
||||
|
||||
namespace adl
|
||||
{
|
||||
|
||||
class RadixSortBase
|
||||
{
|
||||
public:
|
||||
enum Option
|
||||
{
|
||||
SORT_SIMPLE,
|
||||
SORT_STANDARD,
|
||||
SORT_ADVANCED
|
||||
};
|
||||
};
|
||||
|
||||
template<DeviceType TYPE>
|
||||
class RadixSort : public RadixSortBase
|
||||
{
|
||||
public:
|
||||
struct Data
|
||||
{
|
||||
Option m_option;
|
||||
const Device* m_deviceData;
|
||||
typename PrefixScan<TYPE>::Data* m_scanData;
|
||||
int m_maxSize;
|
||||
};
|
||||
|
||||
|
||||
static
|
||||
Data* allocate(const Device* deviceData, int maxSize, Option option = SORT_STANDARD);
|
||||
|
||||
static
|
||||
void deallocate(Data* data);
|
||||
|
||||
static
|
||||
void execute(Data* data, Buffer<SortData>& inout, int n, int sortBits = 32);
|
||||
};
|
||||
|
||||
|
||||
#include <AdlPrimitives/Sort/RadixSort.inl>
|
||||
#include <AdlPrimitives/Sort/RadixSortHost.inl>
|
||||
|
||||
};
|
||||
@@ -0,0 +1,58 @@
|
||||
/*
|
||||
2011 Takahiro Harada
|
||||
*/
|
||||
|
||||
#include <AdlPrimitives/Sort/RadixSortSimple.inl>
|
||||
#include <AdlPrimitives/Sort/RadixSortStandard.inl>
|
||||
#include <AdlPrimitives/Sort/RadixSortAdvanced.inl>
|
||||
|
||||
|
||||
#define DISPATCH_IMPL(x) \
|
||||
switch( data->m_option ) \
|
||||
{ \
|
||||
case SORT_SIMPLE: RadixSortSimple<TYPE>::x; break; \
|
||||
case SORT_STANDARD: RadixSortStandard<TYPE>::x; break; \
|
||||
case SORT_ADVANCED: RadixSortAdvanced<TYPE>::x; break; \
|
||||
default:ADLASSERT(0);break; \
|
||||
}
|
||||
|
||||
template<DeviceType TYPE>
|
||||
typename RadixSort<TYPE>::Data* RadixSort<TYPE>::allocate(const Device* deviceData, int maxSize, Option option)
|
||||
{
|
||||
ADLASSERT( TYPE == deviceData->m_type );
|
||||
|
||||
void* dataOut = 0;	// avoid returning an uninitialized pointer if the option is invalid
|
||||
switch( option )
|
||||
{
|
||||
case SORT_SIMPLE:
|
||||
dataOut = RadixSortSimple<TYPE>::allocate( deviceData, maxSize, option );
|
||||
break;
|
||||
case SORT_STANDARD:
|
||||
dataOut = RadixSortStandard<TYPE>::allocate( deviceData, maxSize, option );
|
||||
break;
|
||||
case SORT_ADVANCED:
|
||||
dataOut = RadixSortAdvanced<TYPE>::allocate( deviceData, maxSize, option );
|
||||
break;
|
||||
default:
|
||||
ADLASSERT(0);
|
||||
break;
|
||||
}
|
||||
return (typename RadixSort<TYPE>::Data*)dataOut;
|
||||
}
|
||||
|
||||
template<DeviceType TYPE>
|
||||
void RadixSort<TYPE>::deallocate(Data* data)
|
||||
{
|
||||
DISPATCH_IMPL( deallocate( data ) );
|
||||
}
|
||||
|
||||
template<DeviceType TYPE>
|
||||
void RadixSort<TYPE>::execute(Data* data, Buffer<SortData>& inout, int n, int sortBits)
|
||||
{
|
||||
DISPATCH_IMPL( execute( data, inout, n, sortBits ) );
|
||||
}
|
||||
|
||||
|
||||
#undef DISPATCH_IMPL
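// Rough usage of the dispatch above (illustrative; buffer setup and names are assumed):
//
//	RadixSort<TYPE_CL>::Data* s = RadixSort<TYPE_CL>::allocate( device, maxN, RadixSortBase::SORT_STANDARD );
//	RadixSort<TYPE_CL>::execute( s, keyValuePairs, n );       // full 32-bit sort
//	RadixSort<TYPE_CL>::execute( s, keyValuePairs, n, 16 );   // only the low 16 key bits matter
//	RadixSort<TYPE_CL>::deallocate( s );
//
// deallocate() and execute() read m_option back out of the Data block, so DISPATCH_IMPL
// always routes to the implementation chosen at allocate() time.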
|
||||
|
||||
|
||||
@@ -0,0 +1,98 @@
|
||||
/*
|
||||
2011 Takahiro Harada
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <Adl/Adl.h>
|
||||
#include <AdlPrimitives/Math/Math.h>
|
||||
#include <AdlPrimitives/Copy/Copy.h>
|
||||
#include <AdlPrimitives/Sort/SortData.h>
|
||||
|
||||
namespace adl
|
||||
{
|
||||
|
||||
class RadixSort32Base
|
||||
{
|
||||
public:
|
||||
// enum Option
|
||||
// {
|
||||
// SORT_SIMPLE,
|
||||
// SORT_STANDARD,
|
||||
// SORT_ADVANCED
|
||||
// };
|
||||
};
|
||||
|
||||
template<DeviceType TYPE>
|
||||
class RadixSort32 : public RadixSort32Base
|
||||
{
|
||||
public:
|
||||
typedef Launcher::BufferInfo BufferInfo;
|
||||
|
||||
enum
|
||||
{
|
||||
DATA_ALIGNMENT = 256,
|
||||
WG_SIZE = 64,
|
||||
ELEMENTS_PER_WORK_ITEM = (256/WG_SIZE),
|
||||
BITS_PER_PASS = 4,
|
||||
|
||||
// if you change this, change nPerWI in kernel as well
|
||||
NUM_WGS = 20*6, // cypress
|
||||
// NUM_WGS = 24*6, // cayman
|
||||
// NUM_WGS = 32*4, // nv
|
||||
};
|
||||
|
||||
struct ConstData
|
||||
{
|
||||
int m_n;
|
||||
int m_nWGs;
|
||||
int m_startBit;
|
||||
int m_nBlocksPerWG;
|
||||
};
|
||||
|
||||
struct Data
|
||||
{
|
||||
const Device* m_device;
|
||||
int m_maxSize;
|
||||
|
||||
Kernel* m_streamCountKernel;
|
||||
Kernel* m_streamCountSortDataKernel;
|
||||
Kernel* m_prefixScanKernel;
|
||||
Kernel* m_sortAndScatterKernel;
|
||||
Kernel* m_sortAndScatterKeyValueKernel;
|
||||
Kernel* m_sortAndScatterSortDataKernel;
|
||||
|
||||
Buffer<u32>* m_workBuffer0;
|
||||
Buffer<u32>* m_workBuffer1;
|
||||
Buffer<u32>* m_workBuffer2;
|
||||
Buffer<SortData>* m_workBuffer3;
|
||||
|
||||
Buffer<ConstData>* m_constBuffer[32/BITS_PER_PASS];
|
||||
|
||||
typename Copy<TYPE>::Data* m_copyData;
|
||||
};
|
||||
|
||||
static
|
||||
Data* allocate(const Device* device, int maxSize);
|
||||
|
||||
static
|
||||
void deallocate(Data* data);
|
||||
|
||||
static
|
||||
void execute(Data* data, Buffer<u32>& inout, int n, int sortBits = 32);
|
||||
|
||||
static
|
||||
void execute(Data* data, Buffer<u32>& in, Buffer<u32>& out, int n, int sortBits = 32);
|
||||
|
||||
static
|
||||
void execute(Data* data, Buffer<u32>& keysIn, Buffer<u32>& keysOut, Buffer<u32>& valuesIn, Buffer<u32>& valuesOut, int n, int sortBits = 32);
|
||||
|
||||
static
|
||||
void execute(Data* data, Buffer<SortData>& keyValuesInOut, int n, int sortBits = 32 );
|
||||
};
|
||||
|
||||
|
||||
#include <AdlPrimitives/Sort/RadixSort32Host.inl>
|
||||
#include <AdlPrimitives/Sort/RadixSort32.inl>
|
||||
|
||||
};
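// Sizing implied by the constants above (illustrative numbers): each work item handles
// ELEMENTS_PER_WORK_ITEM = 256/WG_SIZE = 4 keys, so one 64-thread work group owns a
// 256-key block.  For n = 131072 keys that is nBlocks = 131072/256 = 512 blocks, spread
// over NUM_WGS = 120 persistent work groups as nBlocksPerWG = ceil(512/120) = 5.  The
// per-pass histogram therefore needs NUM_WGS*(1<<BITS_PER_PASS) = 120*16 = 1920 counters.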
|
||||
@@ -0,0 +1,346 @@
|
||||
/*
|
||||
2011 Takahiro Harada
|
||||
*/
|
||||
|
||||
#define PATH "..\\..\\opencl\\primitives\\AdlPrimitives\\Sort\\RadixSort32Kernels"
|
||||
#define RADIXSORT32_KERNEL0 "StreamCountKernel"
|
||||
#define RADIXSORT32_KERNEL1 "PrefixScanKernel"
|
||||
#define RADIXSORT32_KERNEL2 "SortAndScatterKernel"
|
||||
#define RADIXSORT32_KERNEL3 "SortAndScatterKeyValueKernel"
|
||||
#define RADIXSORT32_KERNEL4 "SortAndScatterSortDataKernel"
|
||||
#define RADIXSORT32_KERNEL5 "StreamCountSortDataKernel"
|
||||
|
||||
#include "RadixSort32KernelsCL.h"
|
||||
#include "RadixSort32KernelsDX11.h"
|
||||
|
||||
// todo. The shader compiler (June 2010 DX SDK) doesn't allow placing barriers in SortAndScatterKernel,
|
||||
// so it only works on GPUs with a 64-wide SIMD, where the work group runs in lockstep anyway.
|
||||
|
||||
template<DeviceType TYPE>
|
||||
typename RadixSort32<TYPE>::Data* RadixSort32<TYPE>::allocate( const Device* device, int maxSize )
|
||||
{
|
||||
ADLASSERT( TYPE == device->m_type );
|
||||
|
||||
const char* src[] =
|
||||
#if defined(ADL_LOAD_KERNEL_FROM_STRING)
|
||||
{radixSort32KernelsCL, radixSort32KernelsDX11};
|
||||
#else
|
||||
{0,0};
|
||||
#endif
|
||||
|
||||
Data* data = new Data;
|
||||
data->m_device = device;
|
||||
data->m_maxSize = maxSize;
|
||||
data->m_streamCountKernel = device->getKernel( PATH, RADIXSORT32_KERNEL0, 0, src[TYPE] );
|
||||
data->m_streamCountSortDataKernel = device->getKernel( PATH, RADIXSORT32_KERNEL5, 0, src[TYPE] );
|
||||
|
||||
|
||||
|
||||
data->m_prefixScanKernel = device->getKernel( PATH, RADIXSORT32_KERNEL1, 0, src[TYPE] );
|
||||
data->m_sortAndScatterKernel = device->getKernel( PATH, RADIXSORT32_KERNEL2, 0, src[TYPE] );
|
||||
data->m_sortAndScatterKeyValueKernel = device->getKernel( PATH, RADIXSORT32_KERNEL3, 0, src[TYPE] );
|
||||
data->m_sortAndScatterSortDataKernel = device->getKernel( PATH, RADIXSORT32_KERNEL4, 0, src[TYPE] );
|
||||
|
||||
int histogramSize = NUM_WGS*(1<<BITS_PER_PASS);	// one counter per 4-bit bucket per work group
|
||||
|
||||
data->m_workBuffer0 = new Buffer<u32>( device, maxSize );
|
||||
data->m_workBuffer1 = new Buffer<u32>( device, histogramSize );
|
||||
data->m_workBuffer2 = new Buffer<u32>( device, maxSize );
|
||||
data->m_workBuffer3 = new Buffer<SortData>(device,maxSize);
|
||||
|
||||
|
||||
for(int i=0; i<32/BITS_PER_PASS; i++)
|
||||
data->m_constBuffer[i] = new Buffer<ConstData>( device, 1, BufferBase::BUFFER_CONST );
|
||||
|
||||
data->m_copyData = Copy<TYPE>::allocate( device );
|
||||
|
||||
return data;
|
||||
}
|
||||
|
||||
template<DeviceType TYPE>
|
||||
void RadixSort32<TYPE>::deallocate( Data* data )
|
||||
{
|
||||
delete data->m_workBuffer0;
|
||||
delete data->m_workBuffer1;
|
||||
delete data->m_workBuffer2;
|
||||
delete data->m_workBuffer3;
|
||||
|
||||
for(int i=0; i<32/BITS_PER_PASS; i++)
|
||||
delete data->m_constBuffer[i];
|
||||
|
||||
Copy<TYPE>::deallocate( data->m_copyData );
|
||||
|
||||
delete data;
|
||||
}
|
||||
|
||||
template<DeviceType TYPE>
|
||||
void RadixSort32<TYPE>::execute(Data* data, Buffer<u32>& inout, int n, int sortBits /* = 32 */ )
|
||||
{
|
||||
ADLASSERT( n%DATA_ALIGNMENT == 0 );
|
||||
ADLASSERT( n <= data->m_maxSize );
|
||||
// ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
|
||||
ADLASSERT( BITS_PER_PASS == 4 );
|
||||
ADLASSERT( WG_SIZE == 64 );
|
||||
ADLASSERT( (sortBits&0x3) == 0 );
|
||||
|
||||
Buffer<u32>* src = &inout;
|
||||
Buffer<u32>* dst = data->m_workBuffer0;
|
||||
Buffer<u32>* histogramBuffer = data->m_workBuffer1;
|
||||
|
||||
int nWGs = NUM_WGS;
|
||||
ConstData cdata;
|
||||
{
|
||||
int nBlocks = (n+ELEMENTS_PER_WORK_ITEM*WG_SIZE-1)/(ELEMENTS_PER_WORK_ITEM*WG_SIZE);
|
||||
|
||||
cdata.m_n = n;
|
||||
cdata.m_nWGs = NUM_WGS;
|
||||
cdata.m_startBit = 0;
|
||||
cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1)/cdata.m_nWGs;
|
||||
|
||||
if( nBlocks < NUM_WGS )
|
||||
{
|
||||
cdata.m_nBlocksPerWG = 1;
|
||||
nWGs = nBlocks;
|
||||
}
|
||||
}
|
||||
|
||||
for(int ib=0; ib<sortBits; ib+=4)
|
||||
{
|
||||
cdata.m_startBit = ib;
|
||||
{
|
||||
BufferInfo bInfo[] = { BufferInfo( src, true ), BufferInfo( histogramBuffer ) };
|
||||
Launcher launcher( data->m_device, data->m_streamCountKernel );
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
|
||||
launcher.setConst( *data->m_constBuffer[ib/4], cdata );
|
||||
launcher.launch1D( NUM_WGS*WG_SIZE, WG_SIZE );
|
||||
}
|
||||
{// prefix scan group histogram
|
||||
BufferInfo bInfo[] = { BufferInfo( histogramBuffer ) };
|
||||
Launcher launcher( data->m_device, data->m_prefixScanKernel );
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
|
||||
launcher.setConst( *data->m_constBuffer[ib/4], cdata );
|
||||
launcher.launch1D( 128, 128 );
|
||||
}
|
||||
{// local sort and distribute
|
||||
BufferInfo bInfo[] = { BufferInfo( src, true ), BufferInfo( histogramBuffer, true ), BufferInfo( dst ) };
|
||||
Launcher launcher( data->m_device, data->m_sortAndScatterKernel );
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
|
||||
launcher.setConst( *data->m_constBuffer[ib/4], cdata );
|
||||
launcher.launch1D( nWGs*WG_SIZE, WG_SIZE );
|
||||
}
|
||||
swap2( src, dst );
|
||||
}
|
||||
|
||||
if( src != &inout )
|
||||
{
|
||||
Copy<TYPE>::execute( data->m_copyData, (Buffer<float>&)inout, (Buffer<float>&)*src, n );
|
||||
}
|
||||
}
|
||||
|
||||
template<DeviceType TYPE>
|
||||
void RadixSort32<TYPE>::execute(Data* data, Buffer<u32>& in, Buffer<u32>& out, int n, int sortBits /* = 32 */ )
|
||||
{
|
||||
ADLASSERT( n%DATA_ALIGNMENT == 0 );
|
||||
ADLASSERT( n <= data->m_maxSize );
|
||||
// ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
|
||||
ADLASSERT( BITS_PER_PASS == 4 );
|
||||
ADLASSERT( WG_SIZE == 64 );
|
||||
ADLASSERT( (sortBits&0x3) == 0 );
|
||||
|
||||
Buffer<u32>* src = &in;
|
||||
Buffer<u32>* dst = data->m_workBuffer0;
|
||||
Buffer<u32>* histogramBuffer = data->m_workBuffer1;
|
||||
|
||||
int nWGs = NUM_WGS;
|
||||
ConstData cdata;
|
||||
{
|
||||
int nBlocks = (n+ELEMENTS_PER_WORK_ITEM*WG_SIZE-1)/(ELEMENTS_PER_WORK_ITEM*WG_SIZE);
|
||||
cdata.m_n = n;
|
||||
cdata.m_nWGs = NUM_WGS;
|
||||
cdata.m_startBit = 0;
|
||||
cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1)/cdata.m_nWGs;
|
||||
if( nBlocks < NUM_WGS )
|
||||
{
|
||||
cdata.m_nBlocksPerWG = 1;
|
||||
nWGs = nBlocks;
|
||||
}
|
||||
}
|
||||
|
||||
if( sortBits == 4 ) dst = &out;
|
||||
|
||||
for(int ib=0; ib<sortBits; ib+=4)
|
||||
{
|
||||
if( ib==4 )
|
||||
{
|
||||
dst = &out;
|
||||
}
|
||||
|
||||
cdata.m_startBit = ib;
|
||||
{
|
||||
BufferInfo bInfo[] = { BufferInfo( src, true ), BufferInfo( histogramBuffer ) };
|
||||
Launcher launcher( data->m_device, data->m_streamCountKernel );
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
|
||||
launcher.setConst( *data->m_constBuffer[ib/4], cdata );
|
||||
launcher.launch1D( NUM_WGS*WG_SIZE, WG_SIZE );
|
||||
}
|
||||
{// prefix scan group histogram
|
||||
BufferInfo bInfo[] = { BufferInfo( histogramBuffer ) };
|
||||
Launcher launcher( data->m_device, data->m_prefixScanKernel );
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
|
||||
launcher.setConst( *data->m_constBuffer[ib/4], cdata );
|
||||
launcher.launch1D( 128, 128 );
|
||||
}
|
||||
{// local sort and distribute
|
||||
BufferInfo bInfo[] = { BufferInfo( src, true ), BufferInfo( histogramBuffer, true ), BufferInfo( dst ) };
|
||||
Launcher launcher( data->m_device, data->m_sortAndScatterKernel );
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
|
||||
launcher.setConst( *data->m_constBuffer[ib/4], cdata );
|
||||
launcher.launch1D( nWGs*WG_SIZE, WG_SIZE );
|
||||
}
|
||||
swap2( src, dst );
|
||||
}
|
||||
}
|
||||
|
||||
template<DeviceType TYPE>
|
||||
void RadixSort32<TYPE>::execute(Data* data, Buffer<u32>& keysIn, Buffer<u32>& keysOut, Buffer<u32>& valuesIn, Buffer<u32>& valuesOut, int n, int sortBits /* = 32 */)
|
||||
{
|
||||
ADLASSERT( n%DATA_ALIGNMENT == 0 );
|
||||
ADLASSERT( n <= data->m_maxSize );
|
||||
// ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
|
||||
ADLASSERT( BITS_PER_PASS == 4 );
|
||||
ADLASSERT( WG_SIZE == 64 );
|
||||
ADLASSERT( (sortBits&0x3) == 0 );
|
||||
|
||||
Buffer<u32>* src = &keysIn;
|
||||
Buffer<u32>* srcVal = &valuesIn;
|
||||
Buffer<u32>* dst = data->m_workBuffer0;
|
||||
Buffer<u32>* dstVal = data->m_workBuffer2;
|
||||
Buffer<u32>* histogramBuffer = data->m_workBuffer1;
|
||||
|
||||
int nWGs = NUM_WGS;
|
||||
ConstData cdata;
|
||||
{
|
||||
int nBlocks = (n+ELEMENTS_PER_WORK_ITEM*WG_SIZE-1)/(ELEMENTS_PER_WORK_ITEM*WG_SIZE);
|
||||
cdata.m_n = n;
|
||||
cdata.m_nWGs = NUM_WGS;
|
||||
cdata.m_startBit = 0;
|
||||
cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1)/cdata.m_nWGs;
|
||||
if( nBlocks < NUM_WGS )
|
||||
{
|
||||
cdata.m_nBlocksPerWG = 1;
|
||||
nWGs = nBlocks;
|
||||
}
|
||||
}
|
||||
|
||||
if( sortBits == 4 )
|
||||
{
|
||||
dst = &keysOut;
|
||||
dstVal = &valuesOut;
|
||||
}
|
||||
|
||||
for(int ib=0; ib<sortBits; ib+=4)
|
||||
{
|
||||
if( ib==4 )
|
||||
{
|
||||
dst = &keysOut;
|
||||
dstVal = &valuesOut;
|
||||
}
|
||||
|
||||
cdata.m_startBit = ib;
|
||||
{
|
||||
BufferInfo bInfo[] = { BufferInfo( src, true ), BufferInfo( histogramBuffer ) };
|
||||
Launcher launcher( data->m_device, data->m_streamCountKernel );
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
|
||||
launcher.setConst( *data->m_constBuffer[ib/4], cdata );
|
||||
launcher.launch1D( NUM_WGS*WG_SIZE, WG_SIZE );
|
||||
}
|
||||
{// prefix scan group histogram
|
||||
BufferInfo bInfo[] = { BufferInfo( histogramBuffer ) };
|
||||
Launcher launcher( data->m_device, data->m_prefixScanKernel );
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
|
||||
launcher.setConst( *data->m_constBuffer[ib/4], cdata );
|
||||
launcher.launch1D( 128, 128 );
|
||||
}
|
||||
{// local sort and distribute
|
||||
BufferInfo bInfo[] = { BufferInfo( src, true ), BufferInfo( srcVal, true ), BufferInfo( histogramBuffer, true ), BufferInfo( dst ), BufferInfo( dstVal ) };
|
||||
Launcher launcher( data->m_device, data->m_sortAndScatterKeyValueKernel );
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
|
||||
launcher.setConst( *data->m_constBuffer[ib/4], cdata );
|
||||
launcher.launch1D( nWGs*WG_SIZE, WG_SIZE );
|
||||
}
|
||||
swap2( src, dst );
|
||||
swap2( srcVal, dstVal );
|
||||
}
|
||||
}
|
||||
|
||||
template<DeviceType TYPE>
|
||||
void RadixSort32<TYPE>::execute(Data* data, Buffer<SortData>& keyValuesInOut, int n, int sortBits /* = 32 */)
|
||||
{
|
||||
ADLASSERT( n%DATA_ALIGNMENT == 0 );
|
||||
ADLASSERT( n <= data->m_maxSize );
|
||||
// ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
|
||||
ADLASSERT( BITS_PER_PASS == 4 );
|
||||
ADLASSERT( WG_SIZE == 64 );
|
||||
ADLASSERT( (sortBits&0x3) == 0 );
|
||||
|
||||
Buffer<SortData>* src = &keyValuesInOut;
|
||||
Buffer<SortData>* dst = data->m_workBuffer3;
|
||||
|
||||
Buffer<u32>* histogramBuffer = data->m_workBuffer1;
|
||||
|
||||
int nWGs = NUM_WGS;
|
||||
ConstData cdata;
|
||||
{
|
||||
int nBlocks = (n+ELEMENTS_PER_WORK_ITEM*WG_SIZE-1)/(ELEMENTS_PER_WORK_ITEM*WG_SIZE);
|
||||
cdata.m_n = n;
|
||||
cdata.m_nWGs = NUM_WGS;
|
||||
cdata.m_startBit = 0;
|
||||
cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1)/cdata.m_nWGs;
|
||||
if( nBlocks < NUM_WGS )
|
||||
{
|
||||
cdata.m_nBlocksPerWG = 1;
|
||||
nWGs = nBlocks;
|
||||
}
|
||||
}
|
||||
|
||||
int count=0;
|
||||
for(int ib=0; ib<sortBits; ib+=4)
|
||||
{
|
||||
cdata.m_startBit = ib;
|
||||
{
|
||||
BufferInfo bInfo[] = { BufferInfo( src, true ), BufferInfo( histogramBuffer ) };
|
||||
Launcher launcher( data->m_device, data->m_streamCountSortDataKernel);
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
|
||||
launcher.setConst( *data->m_constBuffer[ib/4], cdata );
|
||||
launcher.launch1D( NUM_WGS*WG_SIZE, WG_SIZE );
|
||||
}
|
||||
{// prefix scan group histogram
|
||||
BufferInfo bInfo[] = { BufferInfo( histogramBuffer ) };
|
||||
Launcher launcher( data->m_device, data->m_prefixScanKernel );
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
|
||||
launcher.setConst( *data->m_constBuffer[ib/4], cdata );
|
||||
launcher.launch1D( 128, 128 );
|
||||
}
|
||||
{// local sort and distribute
|
||||
BufferInfo bInfo[] = { BufferInfo( src, true ), BufferInfo( histogramBuffer, true ), BufferInfo( dst )};
|
||||
Launcher launcher( data->m_device, data->m_sortAndScatterSortDataKernel );
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
|
||||
launcher.setConst( *data->m_constBuffer[ib/4], cdata );
|
||||
launcher.launch1D( nWGs*WG_SIZE, WG_SIZE );
|
||||
}
|
||||
swap2( src, dst );
|
||||
count++;
|
||||
}
|
||||
|
||||
if (count&1)
|
||||
{
|
||||
ADLASSERT(0);//need to copy from workbuffer to keyValuesInOut
|
||||
|
||||
}
|
||||
}
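// Summary of the per-pass pipeline coded above: every 4-bit digit launches
//	1. StreamCountKernel (or its SortData variant)  - per-work-group histogram of the digit,
//	2. PrefixScanKernel                             - exclusive scan of the NUM_WGS*16 counters,
//	3. SortAndScatterKernel (or the key-value/SortData variants) - local sort of each
//	   256-key block and scatter to the globally scanned positions,
// after which src and dst are ping-ponged with swap2() for the next 4 bits.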
|
||||
#undef PATH
|
||||
#undef RADIXSORT32_KERNEL0
|
||||
#undef RADIXSORT32_KERNEL1
|
||||
#undef RADIXSORT32_KERNEL2
|
||||
#undef RADIXSORT32_KERNEL3
#undef RADIXSORT32_KERNEL4
#undef RADIXSORT32_KERNEL5
|
||||
|
||||
@@ -0,0 +1,163 @@
|
||||
/*
|
||||
2011 Takahiro Harada
|
||||
*/
|
||||
|
||||
template<>
|
||||
class RadixSort32<TYPE_HOST> : public RadixSort32Base
|
||||
{
|
||||
public:
|
||||
typedef Launcher::BufferInfo BufferInfo;
|
||||
|
||||
enum
|
||||
{
|
||||
BITS_PER_PASS = 8,
|
||||
NUM_TABLES = (1<<BITS_PER_PASS),
|
||||
};
|
||||
|
||||
struct Data
|
||||
{
|
||||
HostBuffer<u32>* m_workBuffer;
|
||||
};
|
||||
|
||||
static
|
||||
Data* allocate(const Device* device, int maxSize)
|
||||
{
|
||||
ADLASSERT( device->m_type == TYPE_HOST );
|
||||
|
||||
Data* data = new Data;
|
||||
data->m_workBuffer = new HostBuffer<u32>( device, maxSize );
|
||||
return data;
|
||||
}
|
||||
|
||||
static
|
||||
void deallocate(Data* data)
|
||||
{
|
||||
delete data->m_workBuffer;
|
||||
delete data;
|
||||
}
|
||||
|
||||
static
|
||||
void execute(Data* data, Buffer<u32>& inout, int n, int sortBits = 32)
|
||||
{
|
||||
ADLASSERT( inout.getType() == TYPE_HOST );
|
||||
|
||||
int tables[NUM_TABLES];
|
||||
int counter[NUM_TABLES];
|
||||
|
||||
u32* src = inout.m_ptr;
|
||||
u32* dst = data->m_workBuffer->m_ptr;
|
||||
|
||||
for(int startBit=0; startBit<sortBits; startBit+=BITS_PER_PASS)
|
||||
{
|
||||
for(int i=0; i<NUM_TABLES; i++)
|
||||
{
|
||||
tables[i] = 0;
|
||||
}
|
||||
|
||||
for(int i=0; i<n; i++)
|
||||
{
|
||||
int tableIdx = (src[i] >> startBit) & (NUM_TABLES-1);
|
||||
tables[tableIdx]++;
|
||||
}
|
||||
|
||||
// prefix scan
|
||||
int sum = 0;
|
||||
for(int i=0; i<NUM_TABLES; i++)
|
||||
{
|
||||
int iData = tables[i];
|
||||
tables[i] = sum;
|
||||
sum += iData;
|
||||
counter[i] = 0;
|
||||
}
|
||||
|
||||
// distribute
|
||||
for(int i=0; i<n; i++)
|
||||
{
|
||||
int tableIdx = (src[i] >> startBit) & (NUM_TABLES-1);
|
||||
|
||||
dst[tables[tableIdx] + counter[tableIdx]] = src[i];
|
||||
counter[tableIdx] ++;
|
||||
}
|
||||
|
||||
swap2( src, dst );
|
||||
}
|
||||
|
||||
{
|
||||
if( src != inout.m_ptr )
|
||||
{
|
||||
memcpy( dst, src, sizeof(u32)*n );
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
static
|
||||
void execute(Data* data, Buffer<u32>& keyInout, Buffer<u32>& valueInout, int n, int sortBits = 32)	// values are reordered in place, so not const
|
||||
{
|
||||
ADLASSERT( keyInout.getType() == TYPE_HOST );
|
||||
|
||||
int tables[NUM_TABLES];
|
||||
int counter[NUM_TABLES];
|
||||
|
||||
u32* src = keyInout.m_ptr;
|
||||
u32* dst = data->m_workBuffer->m_ptr;
|
||||
|
||||
HostBuffer<u32> bufVal(valueInout.m_device, valueInout.m_size);
|
||||
bufVal.write(valueInout.m_ptr, valueInout.m_size);
|
||||
|
||||
u32* srcVal = valueInout.m_ptr;
|
||||
u32* dstVal = bufVal.m_ptr;
|
||||
|
||||
for(int startBit=0; startBit<sortBits; startBit+=BITS_PER_PASS)
|
||||
{
|
||||
for(int i=0; i<NUM_TABLES; i++)
|
||||
{
|
||||
tables[i] = 0;
|
||||
}
|
||||
|
||||
for(int i=0; i<n; i++)
|
||||
{
|
||||
int tableIdx = (src[i] >> startBit) & (NUM_TABLES-1);
|
||||
tables[tableIdx]++;
|
||||
}
|
||||
|
||||
// prefix scan
|
||||
int sum = 0;
|
||||
for(int i=0; i<NUM_TABLES; i++)
|
||||
{
|
||||
int iData = tables[i];
|
||||
tables[i] = sum;
|
||||
sum += iData;
|
||||
counter[i] = 0;
|
||||
}
|
||||
|
||||
// distribute
|
||||
for(int i=0; i<n; i++)
|
||||
{
|
||||
int tableIdx = (src[i] >> startBit) & (NUM_TABLES-1);
|
||||
int newIdx = tables[tableIdx] + counter[tableIdx];
|
||||
dst[newIdx] = src[i];
|
||||
dstVal[newIdx] = srcVal[i];
|
||||
counter[tableIdx]++;
|
||||
}
|
||||
|
||||
swap2( src, dst );
|
||||
swap2( srcVal, dstVal );
|
||||
}
|
||||
|
||||
{
|
||||
if( src != keyInout.m_ptr )
|
||||
{
|
||||
memcpy( dst, src, sizeof(u32)*n );
|
||||
}
|
||||
|
||||
if( srcVal != valueInout.m_ptr )
|
||||
{
|
||||
memcpy( dstVal, srcVal, sizeof(u32)*n );
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
};
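// Worked example of one 8-bit counting pass as coded above (illustrative): with
// src = {0x0203, 0x0101, 0x0202, 0x0101} and startBit = 0 the low bytes are
// {03, 01, 02, 01}, so tables[] counts {01:2, 02:1, 03:1}; after the exclusive prefix
// scan tables[0x01]=0, tables[0x02]=2, tables[0x03]=3, and the distribute loop writes
// dst = {0x0101, 0x0101, 0x0202, 0x0203}, a stable reorder by the low byte.  The next
// pass repeats this on bits 8..15.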
|
||||
|
||||
|
||||
(4 file diffs suppressed because they are too large)
@@ -0,0 +1,985 @@
|
||||
/*
|
||||
2011 Takahiro Harada
|
||||
*/
|
||||
|
||||
typedef uint u32;
|
||||
|
||||
#define GET_GROUP_IDX groupIdx.x
|
||||
#define GET_LOCAL_IDX localIdx.x
|
||||
#define GET_GLOBAL_IDX globalIdx.x
|
||||
#define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync()
|
||||
#define DEFAULT_ARGS uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID
|
||||
#define AtomInc(x) InterlockedAdd(x, 1)
|
||||
#define AtomInc1(x, out) InterlockedAdd(x, 1, out)
|
||||
|
||||
#define min2 min
|
||||
#define max2 max
|
||||
|
||||
|
||||
cbuffer CB0 : register( b0 )
|
||||
{
|
||||
int m_startBit;
|
||||
int m_totalBlocks;
|
||||
int m_nWorkGroupsToExecute;
|
||||
int m_nBlocksPerGroup;
|
||||
|
||||
};
|
||||
|
||||
|
||||
typedef struct {
|
||||
unsigned int key;
|
||||
unsigned int value;
|
||||
} KeyValuePair;
|
||||
|
||||
|
||||
StructuredBuffer<u32> rHistogram : register(t0);
|
||||
|
||||
RWStructuredBuffer<KeyValuePair> dataToSort : register( u0 );
|
||||
RWStructuredBuffer<KeyValuePair> dataToSortOut : register( u1 );
|
||||
|
||||
|
||||
|
||||
#define WG_SIZE 128
|
||||
#define ELEMENTS_PER_WORK_ITEM 4
|
||||
#define BITS_PER_PASS 4
|
||||
#define NUM_BUCKET (1<<BITS_PER_PASS)
|
||||
|
||||
|
||||
groupshared u32 sorterSharedMemory[max(WG_SIZE*2*2, WG_SIZE*ELEMENTS_PER_WORK_ITEM*2)];
|
||||
groupshared u32 localHistogramToCarry[NUM_BUCKET];
|
||||
groupshared u32 localHistogram[NUM_BUCKET*2];
|
||||
groupshared u32 localHistogramMat[NUM_BUCKET*WG_SIZE];
|
||||
groupshared u32 localPrefixSum[NUM_BUCKET];
|
||||
|
||||
|
||||
|
||||
#define SET_LOCAL_SORT_DATA(idx, sortDataIn) sorterSharedMemory[2*(idx)+0] = sortDataIn.key; sorterSharedMemory[2*(idx)+1] = sortDataIn.value;
|
||||
#define GET_LOCAL_SORT_DATA(idx, sortDataOut) sortDataOut.key = sorterSharedMemory[2*(idx)+0]; sortDataOut.value = sorterSharedMemory[2*(idx)+1];
|
||||
|
||||
|
||||
|
||||
uint4 prefixScanVector( uint4 data )
|
||||
{
|
||||
data.y += data.x;
|
||||
data.w += data.z;
|
||||
data.z += data.y;
|
||||
data.w += data.y;
|
||||
return data;
|
||||
}
|
||||
|
||||
uint prefixScanVectorEx( inout uint4 data )
|
||||
{
|
||||
uint4 backup = data;
|
||||
data.y += data.x;
|
||||
data.w += data.z;
|
||||
data.z += data.y;
|
||||
data.w += data.y;
|
||||
uint sum = data.w;
|
||||
data -= backup;
|
||||
return sum;
|
||||
}
|
||||
|
||||
uint localPrefixScan128( uint pData, uint lIdx, inout uint totalSum )
|
||||
{
|
||||
{ // Set data
|
||||
sorterSharedMemory[lIdx] = 0;
|
||||
sorterSharedMemory[lIdx+WG_SIZE] = pData;
|
||||
}
|
||||
|
||||
GROUP_LDS_BARRIER;
|
||||
|
||||
{ // Prefix sum
|
||||
int idx = 2*lIdx + (WG_SIZE+1);
|
||||
if( lIdx < 64 )
|
||||
{
|
||||
sorterSharedMemory[idx] += sorterSharedMemory[idx-1];
|
||||
sorterSharedMemory[idx] += sorterSharedMemory[idx-2];
|
||||
sorterSharedMemory[idx] += sorterSharedMemory[idx-4];
|
||||
sorterSharedMemory[idx] += sorterSharedMemory[idx-8];
|
||||
sorterSharedMemory[idx] += sorterSharedMemory[idx-16];
|
||||
sorterSharedMemory[idx] += sorterSharedMemory[idx-32];
|
||||
sorterSharedMemory[idx] += sorterSharedMemory[idx-64];
|
||||
}
|
||||
if( lIdx < 64 ) sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];
|
||||
}
|
||||
|
||||
GROUP_LDS_BARRIER;
|
||||
|
||||
totalSum = sorterSharedMemory[WG_SIZE*2-1];
|
||||
return sorterSharedMemory[lIdx+127];
|
||||
}
|
||||
|
||||
void localPrefixScan128Dual( uint pData0, uint pData1, uint lIdx,
|
||||
inout uint rank0, inout uint rank1,
|
||||
inout uint totalSum0, inout uint totalSum1 )
|
||||
{
|
||||
{ // Set data
|
||||
sorterSharedMemory[lIdx] = 0;
|
||||
sorterSharedMemory[lIdx+WG_SIZE] = pData0;
|
||||
sorterSharedMemory[2*WG_SIZE+lIdx] = 0;
|
||||
sorterSharedMemory[2*WG_SIZE+lIdx+WG_SIZE] = pData1;
|
||||
}
|
||||
|
||||
GROUP_LDS_BARRIER;
|
||||
|
||||
// if( lIdx < 128 ) // todo. assert wg size is 128
|
||||
{ // Prefix sum
|
||||
int blockIdx = lIdx/64;
|
||||
int groupIdx = lIdx%64;
|
||||
int idx = 2*groupIdx + (WG_SIZE+1) + (2*WG_SIZE)*blockIdx;
|
||||
|
||||
sorterSharedMemory[idx] += sorterSharedMemory[idx-1];
|
||||
sorterSharedMemory[idx] += sorterSharedMemory[idx-2];
|
||||
sorterSharedMemory[idx] += sorterSharedMemory[idx-4];
|
||||
sorterSharedMemory[idx] += sorterSharedMemory[idx-8];
|
||||
sorterSharedMemory[idx] += sorterSharedMemory[idx-16];
|
||||
sorterSharedMemory[idx] += sorterSharedMemory[idx-32];
|
||||
sorterSharedMemory[idx] += sorterSharedMemory[idx-64];
|
||||
|
||||
sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];
|
||||
}
|
||||
|
||||
GROUP_LDS_BARRIER;
|
||||
|
||||
totalSum0 = sorterSharedMemory[WG_SIZE*2-1];
|
||||
rank0 = sorterSharedMemory[lIdx+127];
|
||||
totalSum1 = sorterSharedMemory[2*WG_SIZE+WG_SIZE*2-1];
|
||||
rank1 = sorterSharedMemory[2*WG_SIZE+lIdx+127];
|
||||
}
|
||||
|
||||
uint4 localPrefixSum128V( uint4 pData, uint lIdx, inout uint totalSum )
{
	{ // Set data
		sorterSharedMemory[lIdx] = 0;
		sorterSharedMemory[lIdx+WG_SIZE] = prefixScanVectorEx( pData );
	}

	GROUP_LDS_BARRIER;

	{ // Prefix sum
		int idx = 2*lIdx + (WG_SIZE+1);
		if( lIdx < 64 )
		{
			sorterSharedMemory[idx] += sorterSharedMemory[idx-1];
			sorterSharedMemory[idx] += sorterSharedMemory[idx-2];
			sorterSharedMemory[idx] += sorterSharedMemory[idx-4];
			sorterSharedMemory[idx] += sorterSharedMemory[idx-8];
			sorterSharedMemory[idx] += sorterSharedMemory[idx-16];
			sorterSharedMemory[idx] += sorterSharedMemory[idx-32];
			sorterSharedMemory[idx] += sorterSharedMemory[idx-64];

			sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];
		}
	}

	GROUP_LDS_BARRIER;

	totalSum = sorterSharedMemory[WG_SIZE*2-1];
	uint addValue = sorterSharedMemory[lIdx+127];
	return pData + uint4(addValue, addValue, addValue, addValue);
}

void localPrefixSum128Dual( uint4 pData0, uint4 pData1, uint lIdx,
	inout uint4 dataOut0, inout uint4 dataOut1,
	inout uint totalSum0, inout uint totalSum1 )
{
/*
	dataOut0 = localPrefixSum128V( pData0, lIdx, totalSum0 );
	GROUP_LDS_BARRIER;
	dataOut1 = localPrefixSum128V( pData1, lIdx, totalSum1 );
	return;
*/

	uint4 backup0 = pData0;
	uint4 backup1 = pData1;

	{ // Prefix sum in a vector
		pData0 = prefixScanVector( pData0 );
		pData1 = prefixScanVector( pData1 );
	}

	{ // Set data
		sorterSharedMemory[lIdx] = 0;
		sorterSharedMemory[lIdx+WG_SIZE] = pData0.w;
		sorterSharedMemory[2*WG_SIZE+lIdx] = 0;
		sorterSharedMemory[2*WG_SIZE+lIdx+WG_SIZE] = pData1.w;
	}

	GROUP_LDS_BARRIER;

//	if( lIdx < 128 ) // todo. assert wg size is 128
	{ // Prefix sum
		int blockIdx = lIdx/64;
		int groupIdx = lIdx%64;
		int idx = 2*groupIdx + (WG_SIZE+1) + (2*WG_SIZE)*blockIdx;

		sorterSharedMemory[idx] += sorterSharedMemory[idx-1];
		sorterSharedMemory[idx] += sorterSharedMemory[idx-2];
		sorterSharedMemory[idx] += sorterSharedMemory[idx-4];
		sorterSharedMemory[idx] += sorterSharedMemory[idx-8];
		sorterSharedMemory[idx] += sorterSharedMemory[idx-16];
		sorterSharedMemory[idx] += sorterSharedMemory[idx-32];
		sorterSharedMemory[idx] += sorterSharedMemory[idx-64];

		sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];
	}

	GROUP_LDS_BARRIER;

	totalSum0 = sorterSharedMemory[WG_SIZE*2-1];
	{
		uint addValue = sorterSharedMemory[lIdx+127];
		dataOut0 = pData0 + uint4(addValue, addValue, addValue, addValue) - backup0;
	}

	totalSum1 = sorterSharedMemory[2*WG_SIZE+WG_SIZE*2-1];
	{
		uint addValue = sorterSharedMemory[2*WG_SIZE+lIdx+127];
		dataOut1 = pData1 + uint4(addValue, addValue, addValue, addValue) - backup1;
	}
}

uint4 extractKeys(uint4 data, uint targetKey)
{
	uint4 key;
	key.x = data.x == targetKey ? 1:0;
	key.y = data.y == targetKey ? 1:0;
	key.z = data.z == targetKey ? 1:0;
	key.w = data.w == targetKey ? 1:0;
	return key;
}

uint4 extractKeysByBits(uint4 data, uint targetKey)
{
	uint4 key;
	uint mask = 1<<targetKey;
	key.x = (data.x & mask) >> targetKey;
	key.y = (data.y & mask) >> targetKey;
	key.z = (data.z & mask) >> targetKey;
	key.w = (data.w & mask) >> targetKey;
	return key;
}

uint packKeys(uint lower, uint upper)
{
	return lower|(upper<<16);
}

uint4 packKeys(uint4 lower, uint4 upper)
{
	return uint4( lower.x|(upper.x<<16), lower.y|(upper.y<<16), lower.z|(upper.z<<16), lower.w|(upper.w<<16) );
}

uint extractLower( uint data )
{
	return data&0xffff;
}

uint extractUpper( uint data )
{
	return (data>>16)&0xffff;
}

uint4 extractLower( uint4 data )
{
	return uint4( data.x&0xffff, data.y&0xffff, data.z&0xffff, data.w&0xffff );
}

uint4 extractUpper( uint4 data )
{
	return uint4( (data.x>>16)&0xffff, (data.y>>16)&0xffff, (data.z>>16)&0xffff, (data.w>>16)&0xffff );
}

[numthreads(WG_SIZE, 1, 1)]
void SortAndScatterKernel( DEFAULT_ARGS )
{
	u32 lIdx = GET_LOCAL_IDX;
	u32 wgIdx = GET_GROUP_IDX;

	if( lIdx < (NUM_BUCKET) )
	{
		localHistogramToCarry[lIdx] = rHistogram[lIdx*m_nWorkGroupsToExecute + wgIdx];
	}

	GROUP_LDS_BARRIER;

	for(uint igroup=wgIdx*m_nBlocksPerGroup; igroup<min2(m_totalBlocks,(wgIdx+1)*m_nBlocksPerGroup); igroup++)
	{
		u32 myHistogram;
		if( lIdx < (NUM_BUCKET) )
		{
			localPrefixSum[lIdx] = 0.f;
		}

		u32 newOffset[4];
		KeyValuePair myData[4];
		{ // read data
			int numLocalElements = WG_SIZE*ELEMENTS_PER_WORK_ITEM;
			uint startAddress = igroup*numLocalElements + lIdx*4;

			myData[0] = dataToSort[startAddress+0];
			myData[1] = dataToSort[startAddress+1];
			myData[2] = dataToSort[startAddress+2];
			myData[3] = dataToSort[startAddress+3];

			newOffset[0] = newOffset[1] = newOffset[2] = newOffset[3] = 0;
		}

		int localOffset = 0;
		uint4 b = uint4((myData[0].key>>m_startBit) & 0xf, (myData[1].key>>m_startBit) & 0xf, (myData[2].key>>m_startBit) & 0xf, (myData[3].key>>m_startBit) & 0xf);
		for(uint targetKey=0; targetKey<(NUM_BUCKET); targetKey+=4)
		{
			uint4 key[4];
			uint keySet[2];
			{ // pack 4
				uint4 scannedKey[4];
				key[0] = scannedKey[0] = extractKeys( b, targetKey+0 );
				key[1] = scannedKey[1] = extractKeys( b, targetKey+1 );
				key[2] = scannedKey[2] = extractKeys( b, targetKey+2 );
				key[3] = scannedKey[3] = extractKeys( b, targetKey+3 );
				{
					uint s[4];
					s[0] = prefixScanVectorEx( scannedKey[0] );
					s[1] = prefixScanVectorEx( scannedKey[1] );
					s[2] = prefixScanVectorEx( scannedKey[2] );
					s[3] = prefixScanVectorEx( scannedKey[3] );
					keySet[0] = packKeys( s[0], s[1] );
					keySet[1] = packKeys( s[2], s[3] );
				}
			}

			uint dstAddressBase[4];
			{

				uint totalSumPacked[2];
				uint dstAddressPacked[2];

				localPrefixScan128Dual( keySet[0], keySet[1], lIdx, dstAddressPacked[0], dstAddressPacked[1], totalSumPacked[0], totalSumPacked[1] );

				dstAddressBase[0] = extractLower( dstAddressPacked[0] );
				dstAddressBase[1] = extractUpper( dstAddressPacked[0] );
				dstAddressBase[2] = extractLower( dstAddressPacked[1] );
				dstAddressBase[3] = extractUpper( dstAddressPacked[1] );

				uint4 histogram;
				histogram.x = extractLower(totalSumPacked[0]);
				histogram.y = extractUpper(totalSumPacked[0]);
				histogram.z = extractLower(totalSumPacked[1]);
				histogram.w = extractUpper(totalSumPacked[1]);

				if( lIdx == targetKey + 0 ) myHistogram = histogram.x;
				else if( lIdx == targetKey + 1 ) myHistogram = histogram.y;
				else if( lIdx == targetKey + 2 ) myHistogram = histogram.z;
				else if( lIdx == targetKey + 3 ) myHistogram = histogram.w;

				uint histogramSum = prefixScanVectorEx( histogram );

				if( lIdx == targetKey + 0 ) localPrefixSum[targetKey+0] = localOffset+histogram.x;
				else if( lIdx == targetKey + 1 ) localPrefixSum[targetKey+1] = localOffset+histogram.y;
				else if( lIdx == targetKey + 2 ) localPrefixSum[targetKey+2] = localOffset+histogram.z;
				else if( lIdx == targetKey + 3 ) localPrefixSum[targetKey+3] = localOffset+histogram.w;

				localOffset += histogramSum;
			}

			GROUP_LDS_BARRIER;


			for(int ie=0; ie<4; ie++)
			{
				uint4 scannedKey = key[ie];
				prefixScanVectorEx( scannedKey );

				uint offset = localPrefixSum[targetKey + ie] + dstAddressBase[ie];
				uint4 dstAddress = uint4( offset, offset, offset, offset ) + scannedKey;

				newOffset[0] += dstAddress.x*key[ie].x;
				newOffset[1] += dstAddress.y*key[ie].y;
				newOffset[2] += dstAddress.z*key[ie].z;
				newOffset[3] += dstAddress.w*key[ie].w;
			}
		}

		{ // local scatter
			SET_LOCAL_SORT_DATA(newOffset[0], myData[0]);
			SET_LOCAL_SORT_DATA(newOffset[1], myData[1]);
			SET_LOCAL_SORT_DATA(newOffset[2], myData[2]);
			SET_LOCAL_SORT_DATA(newOffset[3], myData[3]);
		}

		GROUP_LDS_BARRIER;

		{ // write data
			for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)
			{
				int dataIdx = 4*lIdx+i;
				KeyValuePair localData; GET_LOCAL_SORT_DATA( dataIdx, localData );
				int binIdx = (localData.key >> m_startBit) & 0xf;
				int groupOffset = localHistogramToCarry[binIdx];
				int myIdx = dataIdx - localPrefixSum[binIdx];

				dataToSortOut[ groupOffset + myIdx ] = localData;
			}
		}

		GROUP_LDS_BARRIER;
		if( lIdx < NUM_BUCKET )
		{
			localHistogramToCarry[lIdx] += myHistogram;
		}
		GROUP_LDS_BARRIER;
	}
}


[numthreads(WG_SIZE, 1, 1)]
void SortAndScatterKernel1( DEFAULT_ARGS )
{
	u32 lIdx = GET_LOCAL_IDX;
	u32 wgIdx = GET_GROUP_IDX;

	if( lIdx < (NUM_BUCKET) )
	{
		localHistogramToCarry[lIdx] = rHistogram[lIdx*m_nWorkGroupsToExecute + wgIdx.x];
	}

	GROUP_LDS_BARRIER;

	for(uint igroup=wgIdx.x*m_nBlocksPerGroup; igroup<min2(m_totalBlocks,(wgIdx.x+1)*m_nBlocksPerGroup); igroup++)
	{
		u32 myHistogram;

		KeyValuePair myData[4];
		uint startAddrBlock;
		{ // read data
			int numLocalElements = WG_SIZE*ELEMENTS_PER_WORK_ITEM;
			startAddrBlock = lIdx*4;
			uint startAddress = igroup*numLocalElements + startAddrBlock;

			myData[0] = dataToSort[startAddress+0];
			myData[1] = dataToSort[startAddress+1];
			myData[2] = dataToSort[startAddress+2];
			myData[3] = dataToSort[startAddress+3];
		}

		// local sort
		for(int ib=m_startBit; ib<m_startBit+BITS_PER_PASS; ib++)
		{
			uint4 keys = uint4(~(myData[0].key>>ib) & 0x1, ~(myData[1].key>>ib) & 0x1, ~(myData[2].key>>ib) & 0x1, ~(myData[3].key>>ib) & 0x1);
			uint total;
			uint4 rankOfP = localPrefixSum128V( keys, lIdx, total );
			uint4 rankOfN = uint4(startAddrBlock, startAddrBlock+1, startAddrBlock+2, startAddrBlock+3) - rankOfP + uint4( total, total, total, total );

			uint4 myAddr = (keys==uint4(1,1,1,1))? rankOfP: rankOfN;

			GROUP_LDS_BARRIER;

			SET_LOCAL_SORT_DATA( myAddr.x, myData[0] );
			SET_LOCAL_SORT_DATA( myAddr.y, myData[1] );
			SET_LOCAL_SORT_DATA( myAddr.z, myData[2] );
			SET_LOCAL_SORT_DATA( myAddr.w, myData[3] );

			GROUP_LDS_BARRIER;

			GET_LOCAL_SORT_DATA( startAddrBlock+0, myData[0] );
			GET_LOCAL_SORT_DATA( startAddrBlock+1, myData[1] );
			GET_LOCAL_SORT_DATA( startAddrBlock+2, myData[2] );
			GET_LOCAL_SORT_DATA( startAddrBlock+3, myData[3] );
		}

		{// create histogram -> prefix sum
			if( lIdx < NUM_BUCKET )
			{
				localHistogram[lIdx] = 0;
				localHistogram[NUM_BUCKET+lIdx] = 0;
			}
			GROUP_LDS_BARRIER;
			uint4 keys = uint4((myData[0].key>>m_startBit) & 0xf, (myData[1].key>>m_startBit) & 0xf, (myData[2].key>>m_startBit) & 0xf, (myData[3].key>>m_startBit) & 0xf);

			InterlockedAdd( localHistogram[NUM_BUCKET+keys.x], 1 );
			InterlockedAdd( localHistogram[NUM_BUCKET+keys.y], 1 );
			InterlockedAdd( localHistogram[NUM_BUCKET+keys.z], 1 );
			InterlockedAdd( localHistogram[NUM_BUCKET+keys.w], 1 );

			GROUP_LDS_BARRIER;

			uint hIdx = NUM_BUCKET+lIdx;
			if( lIdx < NUM_BUCKET )
			{
				myHistogram = localHistogram[hIdx];
			}
			GROUP_LDS_BARRIER;

			if( lIdx < NUM_BUCKET )
			{
				localHistogram[hIdx] = localHistogram[hIdx-1];

				localHistogram[hIdx] += localHistogram[hIdx-1];
				localHistogram[hIdx] += localHistogram[hIdx-2];
				localHistogram[hIdx] += localHistogram[hIdx-4];
				localHistogram[hIdx] += localHistogram[hIdx-8];
			}

			GROUP_LDS_BARRIER;
		}
/*
		{// write back
			int numLocalElements = WG_SIZE*ELEMENTS_PER_WORK_ITEM;
			startAddrBlock = lIdx*4;
			uint startAddress = igroup*numLocalElements + startAddrBlock;

			for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)
			{
				dataToSortOut[ startAddress+ie ] = myData[ie];
			}
		}
*/
		{
			for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)
			{
				int dataIdx = startAddrBlock+ie;
				int binIdx = (myData[ie].key>>m_startBit)&0xf;
				int groupOffset = localHistogramToCarry[binIdx];
				int myIdx = dataIdx - localHistogram[NUM_BUCKET+binIdx];
				dataToSortOut[ groupOffset + myIdx ] = myData[ie];
			}
		}

		GROUP_LDS_BARRIER;
		if( lIdx < NUM_BUCKET )
		{
			localHistogramToCarry[lIdx] += myHistogram;
		}
		GROUP_LDS_BARRIER;

	}
}

/*
|
||||
[numthreads(WG_SIZE, 1, 1)]
|
||||
void SortAndScatterKernel1( uint3 gIdx : SV_GroupID, uint3 lIdx : SV_GroupThreadID )
|
||||
{
|
||||
if( lIdx.x < (NUM_BUCKET) )
|
||||
{
|
||||
localHistogramToCarry[lIdx.x] = rHistogram[lIdx.x*m_nWorkGroupsToExecute + gIdx.x];
|
||||
}
|
||||
|
||||
GROUP_LDS_BARRIER;
|
||||
|
||||
for(uint igroup=gIdx.x*m_nBlocksPerGroup; igroup<min2(m_totalBlocks,(gIdx.x+1)*m_nBlocksPerGroup); igroup++)
|
||||
{
|
||||
u32 myHistogram;
|
||||
|
||||
KeyValuePair myData[4];
|
||||
uint startAddrBlock;
|
||||
{ // read data
|
||||
int numLocalElements = WG_SIZE*ELEMENTS_PER_WORK_ITEM;
|
||||
startAddrBlock = lIdx.x*4;
|
||||
uint startAddress = igroup*numLocalElements + startAddrBlock;
|
||||
|
||||
myData[0] = dataToSort[startAddress+0];
|
||||
myData[1] = dataToSort[startAddress+1];
|
||||
myData[2] = dataToSort[startAddress+2];
|
||||
myData[3] = dataToSort[startAddress+3];
|
||||
}
|
||||
|
||||
for(int ib=m_startBit; ib<m_startBit+BITS_PER_PASS; ib++)
|
||||
{
|
||||
uint4 keys = uint4(~(myData[0].key>>ib) & 0x1, ~(myData[1].key>>ib) & 0x1, ~(myData[2].key>>ib) & 0x1, ~(myData[3].key>>ib) & 0x1);
|
||||
uint total;
|
||||
uint4 rankOfP = localPrefixSum128V( keys, lIdx.x, total );
|
||||
uint4 rankOfN = uint4(startAddrBlock, startAddrBlock+1, startAddrBlock+2, startAddrBlock+3) - rankOfP + uint4( total, total, total, total );
|
||||
|
||||
uint4 myAddr = (keys==uint4(1,1,1,1))? rankOfP: rankOfN;
|
||||
|
||||
GROUP_LDS_BARRIER;
|
||||
|
||||
SET_LOCAL_SORT_DATA( myAddr.x, myData[0] );
|
||||
SET_LOCAL_SORT_DATA( myAddr.y, myData[1] );
|
||||
SET_LOCAL_SORT_DATA( myAddr.z, myData[2] );
|
||||
SET_LOCAL_SORT_DATA( myAddr.w, myData[3] );
|
||||
|
||||
GROUP_LDS_BARRIER;
|
||||
|
||||
GET_LOCAL_SORT_DATA( startAddrBlock+0, myData[0] );
|
||||
GET_LOCAL_SORT_DATA( startAddrBlock+1, myData[1] );
|
||||
GET_LOCAL_SORT_DATA( startAddrBlock+2, myData[2] );
|
||||
GET_LOCAL_SORT_DATA( startAddrBlock+3, myData[3] );
|
||||
}
|
||||
|
||||
{// create histogram -> prefix sum
|
||||
if( lIdx.x < NUM_BUCKET )
|
||||
{
|
||||
localHistogram[lIdx.x] = 0;
|
||||
localHistogram[NUM_BUCKET+lIdx.x] = 0;
|
||||
}
|
||||
GROUP_LDS_BARRIER;
|
||||
uint4 keys = uint4((myData[0].key>>m_startBit) & 0xf, (myData[1].key>>m_startBit) & 0xf, (myData[2].key>>m_startBit) & 0xf, (myData[3].key>>m_startBit) & 0xf);
|
||||
|
||||
InterlockedAdd( localHistogram[NUM_BUCKET+keys.x], 1 );
|
||||
InterlockedAdd( localHistogram[NUM_BUCKET+keys.y], 1 );
|
||||
InterlockedAdd( localHistogram[NUM_BUCKET+keys.z], 1 );
|
||||
InterlockedAdd( localHistogram[NUM_BUCKET+keys.w], 1 );
|
||||
|
||||
GROUP_LDS_BARRIER;
|
||||
|
||||
uint hIdx = NUM_BUCKET+lIdx.x;
|
||||
if( lIdx.x < NUM_BUCKET )
|
||||
{
|
||||
myHistogram = localHistogram[hIdx];
|
||||
}
|
||||
GROUP_LDS_BARRIER;
|
||||
|
||||
|
||||
if( lIdx.x < NUM_BUCKET )
|
||||
{
|
||||
localHistogram[hIdx] = localHistogram[hIdx-1];
|
||||
|
||||
localHistogram[hIdx] += localHistogram[hIdx-1];
|
||||
localHistogram[hIdx] += localHistogram[hIdx-2];
|
||||
localHistogram[hIdx] += localHistogram[hIdx-4];
|
||||
localHistogram[hIdx] += localHistogram[hIdx-8];
|
||||
}
|
||||
|
||||
GROUP_LDS_BARRIER;
|
||||
}
|
||||
{// write back
|
||||
for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)
|
||||
{
|
||||
int dataIdx = startAddrBlock+ie;
|
||||
int binIdx = (myData[ie].key>>m_startBit)&0xf;
|
||||
int groupOffset = localHistogramToCarry[binIdx];
|
||||
int myIdx = dataIdx - localHistogram[NUM_BUCKET+binIdx];
|
||||
|
||||
dataToSortOut[ groupOffset + myIdx ] = myData[ie];
|
||||
}
|
||||
}
|
||||
|
||||
GROUP_LDS_BARRIER;
|
||||
if( lIdx.x < NUM_BUCKET )
|
||||
{
|
||||
localHistogramToCarry[lIdx.x] += myHistogram;
|
||||
}
|
||||
GROUP_LDS_BARRIER;
|
||||
|
||||
}
|
||||
}
|
||||
*/
StructuredBuffer<KeyValuePair> dataToSort1 : register( t0 );
RWStructuredBuffer<u32> wHistogram1 : register(u0);

#define MY_HISTOGRAM(idx) localHistogramMat[(idx)*WG_SIZE+lIdx.x]

[numthreads(WG_SIZE, 1, 1)]
void StreamCountKernel( DEFAULT_ARGS )
{
	u32 lIdx = GET_LOCAL_IDX;
	u32 wgIdx = GET_GROUP_IDX;

	int myHistogram[NUM_BUCKET];

	for(int i=0; i<NUM_BUCKET; i++)
	{
		MY_HISTOGRAM(i) = 0;
	}

	for(uint igroup=wgIdx.x*m_nBlocksPerGroup; igroup<min2(m_totalBlocks,(wgIdx.x+1)*m_nBlocksPerGroup); igroup++)
	{
		uint localKeys[4];
		{ // read data
			int numLocalElements = WG_SIZE*ELEMENTS_PER_WORK_ITEM;

			uint4 localAddress = uint4(lIdx, lIdx, lIdx, lIdx)*4+uint4(0,1,2,3);
			uint4 globalAddress = uint4(igroup,igroup,igroup,igroup)*numLocalElements + localAddress;

			KeyValuePair localData0 = dataToSort1[globalAddress.x];
			KeyValuePair localData1 = dataToSort1[globalAddress.y];
			KeyValuePair localData2 = dataToSort1[globalAddress.z];
			KeyValuePair localData3 = dataToSort1[globalAddress.w];

			localKeys[0] = (localData0.key >> m_startBit) & 0xf;
			localKeys[1] = (localData1.key >> m_startBit) & 0xf;
			localKeys[2] = (localData2.key >> m_startBit) & 0xf;
			localKeys[3] = (localData3.key >> m_startBit) & 0xf;
		}

		MY_HISTOGRAM( localKeys[0] )++;
		MY_HISTOGRAM( localKeys[1] )++;
		MY_HISTOGRAM( localKeys[2] )++;
		MY_HISTOGRAM( localKeys[3] )++;
	}

	GROUP_LDS_BARRIER;

	{ // reduce to 1
		if( lIdx < 64 )//WG_SIZE/2 )
		{
			for(int i=0; i<NUM_BUCKET/2; i++)
			{
				int idx = lIdx;
				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+64];
				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+32];
				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+16];
				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+8];
				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+4];
				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+2];
				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+1];
			}
		}
		else if( lIdx < 128 )
		{
			for(int i=NUM_BUCKET/2; i<NUM_BUCKET; i++)
			{
				int idx = lIdx-64;
				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+64];
				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+32];
				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+16];
				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+8];
				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+4];
				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+2];
				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+1];
			}
		}
	}

	GROUP_LDS_BARRIER;

	{ // write data
		if( lIdx < NUM_BUCKET )
		{
			wHistogram1[ lIdx*m_nWorkGroupsToExecute + wgIdx.x ] = localHistogramMat[ lIdx*WG_SIZE+0 ];
		}
	}
}
|
||||
|
||||
/*
|
||||
[numthreads(WG_SIZE, 1, 1)]
|
||||
void StreamCountKernel( uint3 gIdx : SV_GroupID, uint3 lIdx : SV_GroupThreadID )
|
||||
{
|
||||
int myHistogram[NUM_BUCKET];
|
||||
|
||||
for(int i=0; i<NUM_BUCKET; i++)
|
||||
{
|
||||
myHistogram[i] = 0;
|
||||
}
|
||||
|
||||
for(uint igroup=gIdx.x*m_nBlocksPerGroup; igroup<min2(m_totalBlocks,(gIdx.x+1)*m_nBlocksPerGroup); igroup++)
|
||||
{
|
||||
uint localKeys[4];
|
||||
{ // read data
|
||||
int numLocalElements = WG_SIZE*ELEMENTS_PER_WORK_ITEM;
|
||||
|
||||
uint4 localAddress = uint4(lIdx.x, lIdx.x, lIdx.x, lIdx.x)*4+uint4(0,1,2,3);
|
||||
uint4 globalAddress = uint4(igroup,igroup,igroup,igroup)*numLocalElements + localAddress;
|
||||
|
||||
KeyValuePair localData0 = dataToSort1[globalAddress.x];
|
||||
KeyValuePair localData1 = dataToSort1[globalAddress.y];
|
||||
KeyValuePair localData2 = dataToSort1[globalAddress.z];
|
||||
KeyValuePair localData3 = dataToSort1[globalAddress.w];
|
||||
|
||||
localKeys[0] = (localData0.key >> m_startBit) & 0xf;
|
||||
localKeys[1] = (localData1.key >> m_startBit) & 0xf;
|
||||
localKeys[2] = (localData2.key >> m_startBit) & 0xf;
|
||||
localKeys[3] = (localData3.key >> m_startBit) & 0xf;
|
||||
}
|
||||
|
||||
myHistogram[ localKeys[0] ]++;
|
||||
myHistogram[ localKeys[1] ]++;
|
||||
myHistogram[ localKeys[2] ]++;
|
||||
myHistogram[ localKeys[3] ]++;
|
||||
}
|
||||
|
||||
{ // move to shared
|
||||
for(int i=0; i<NUM_BUCKET; i++)
|
||||
{
|
||||
localHistogramMat[i*WG_SIZE+lIdx.x] = myHistogram[i];
|
||||
}
|
||||
}
|
||||
|
||||
GROUP_LDS_BARRIER;
|
||||
|
||||
{ // reduce to 1
|
||||
if( lIdx.x < 64 )//WG_SIZE/2 )
|
||||
{
|
||||
for(int i=0; i<NUM_BUCKET/2; i++)
|
||||
{
|
||||
int idx = lIdx.x;
|
||||
localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+64];
|
||||
localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+32];
|
||||
localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+16];
|
||||
localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+8];
|
||||
localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+4];
|
||||
localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+2];
|
||||
localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+1];
|
||||
}
|
||||
}
|
||||
else if( lIdx.x < 128 )
|
||||
{
|
||||
for(int i=NUM_BUCKET/2; i<NUM_BUCKET; i++)
|
||||
{
|
||||
int idx = lIdx.x-64;
|
||||
localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+64];
|
||||
localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+32];
|
||||
localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+16];
|
||||
localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+8];
|
||||
localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+4];
|
||||
localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+2];
|
||||
localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+1];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
GROUP_LDS_BARRIER;
|
||||
|
||||
{ // write data
|
||||
if( lIdx.x < NUM_BUCKET )
|
||||
{
|
||||
wHistogram1[ lIdx.x*m_nWorkGroupsToExecute + gIdx.x ] = localHistogramMat[ lIdx.x*WG_SIZE+0 ];
|
||||
}
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
/*
|
||||
// for MAX_WG_SIZE 20
|
||||
[numthreads(WG_SIZE, 1, 1)]
|
||||
void PrefixScanKernel( uint3 gIdx : SV_GroupID, uint3 lIdx : SV_GroupThreadID )
|
||||
{
|
||||
uint4 myData = uint4(0,0,0,0);
|
||||
if( 4*lIdx.x+0 < NUM_BUCKET*m_nWorkGroupsToExecute )
|
||||
myData.x = wHistogram1[4*lIdx.x+0];
|
||||
if( 4*lIdx.x+1 < NUM_BUCKET*m_nWorkGroupsToExecute )
|
||||
myData.y = wHistogram1[4*lIdx.x+1];
|
||||
if( 4*lIdx.x+2 < NUM_BUCKET*m_nWorkGroupsToExecute )
|
||||
myData.z = wHistogram1[4*lIdx.x+2];
|
||||
if( 4*lIdx.x+3 < NUM_BUCKET*m_nWorkGroupsToExecute )
|
||||
myData.w = wHistogram1[4*lIdx.x+3];
|
||||
|
||||
uint totalSum;
|
||||
|
||||
uint4 scanned = localPrefixSum128V( myData, lIdx.x, totalSum );
|
||||
|
||||
wHistogram1[4*lIdx.x+0] = scanned.x;
|
||||
wHistogram1[4*lIdx.x+1] = scanned.y;
|
||||
wHistogram1[4*lIdx.x+2] = scanned.z;
|
||||
wHistogram1[4*lIdx.x+3] = scanned.w;
|
||||
}
|
||||
*/
// for MAX_WG_SIZE 80
// can hold up to WG_SIZE*12 (128*12 > 80*16 )
[numthreads(WG_SIZE, 1, 1)]
void PrefixScanKernel( DEFAULT_ARGS )
{
	u32 lIdx = GET_LOCAL_IDX;
	u32 wgIdx = GET_GROUP_IDX;

	uint data[12] = {0,0,0,0,0,0,0,0,0,0,0,0};
	for(int i=0; i<12; i++)
	{
		if( int(12*lIdx+i) < NUM_BUCKET*m_nWorkGroupsToExecute )
			data[i] = wHistogram1[12*lIdx+i];
	}

	uint4 myData = uint4(0,0,0,0);
	myData.x = data[0] + data[1];
	myData.y = data[2] + data[3];
	myData.z = data[4] + data[5];
	myData.w = data[6] + data[7];


	uint totalSum;
	uint4 scanned = localPrefixSum128V( myData, lIdx, totalSum );

	data[11] = scanned.w + data[9] + data[10];
	data[10] = scanned.w + data[9];
	data[9] = scanned.w;
	data[8] = scanned.z + data[6] + data[7];
	data[7] = scanned.z + data[6];
	data[6] = scanned.z;
	data[5] = scanned.y + data[3] + data[4];
	data[4] = scanned.y + data[3];
	data[3] = scanned.y;
	data[2] = scanned.x + data[0] + data[1];
	data[1] = scanned.x + data[0];
	data[0] = scanned.x;

	for(int i=0; i<12; i++)
	{
		wHistogram1[12*lIdx+i] = data[i];
	}
}
/*
|
||||
[numthreads(WG_SIZE, 1, 1)]
|
||||
void PrefixScanKernel( DEFAULT_ARGS )
|
||||
{
|
||||
u32 lIdx = GET_LOCAL_IDX;
|
||||
u32 wgIdx = GET_GROUP_IDX;
|
||||
|
||||
uint data[8] = {0,0,0,0,0,0,0,0};
|
||||
for(int i=0; i<8; i++)
|
||||
{
|
||||
if( int(8*lIdx+i) < NUM_BUCKET*m_nWorkGroupsToExecute )
|
||||
data[i] = wHistogram1[8*lIdx+i];
|
||||
}
|
||||
|
||||
uint4 myData = uint4(0,0,0,0);
|
||||
myData.x = data[0] + data[1];
|
||||
myData.y = data[2] + data[3];
|
||||
myData.z = data[4] + data[5];
|
||||
myData.w = data[6] + data[7];
|
||||
|
||||
|
||||
uint totalSum;
|
||||
uint4 scanned = localPrefixSum128V( myData, lIdx, totalSum );
|
||||
|
||||
data[7] = scanned.w + data[6];
|
||||
data[6] = scanned.w;// + data[5];
|
||||
data[5] = scanned.z + data[4];
|
||||
data[4] = scanned.z;// + data[3];
|
||||
data[3] = scanned.y + data[2];
|
||||
data[2] = scanned.y;// + data[1];
|
||||
data[1] = scanned.x + data[0];
|
||||
data[0] = scanned.x;
|
||||
|
||||
for(int i=0; i<8; i++)
|
||||
{
|
||||
wHistogram1[8*lIdx+i] = data[i];
|
||||
}
|
||||
}
|
||||
*/


[numthreads(WG_SIZE, 1, 1)]
void CopyKernel( DEFAULT_ARGS )
{
	u32 lIdx = GET_LOCAL_IDX;
	u32 wgIdx = GET_GROUP_IDX;

	for(uint igroup=wgIdx.x*m_nBlocksPerGroup; igroup<min2(m_totalBlocks,(wgIdx.x+1)*m_nBlocksPerGroup); igroup++)
	{
		KeyValuePair myData[4];
		uint startAddrBlock;
		{ // read data
			int numLocalElements = WG_SIZE*ELEMENTS_PER_WORK_ITEM;
			startAddrBlock = lIdx*4;
			uint startAddress = igroup*numLocalElements + startAddrBlock;

			myData[0] = dataToSort[startAddress+0];
			myData[1] = dataToSort[startAddress+1];
			myData[2] = dataToSort[startAddress+2];
			myData[3] = dataToSort[startAddress+3];
		}

		{
			int numLocalElements = WG_SIZE*ELEMENTS_PER_WORK_ITEM;
			uint startAddress = igroup*numLocalElements + startAddrBlock;

			dataToSortOut[startAddress+0] = myData[0];
			dataToSortOut[startAddress+1] = myData[1];
			dataToSortOut[startAddress+2] = myData[2];
			dataToSortOut[startAddress+3] = myData[3];
		}
	}
}
@@ -0,0 +1,987 @@
|
||||
static const char* radixSortAdvancedKernelsDX11= \
|
||||
"/*\n"
|
||||
" 2011 Takahiro Harada\n"
|
||||
"*/\n"
|
||||
"\n"
|
||||
"typedef uint u32;\n"
|
||||
"\n"
|
||||
"#define GET_GROUP_IDX groupIdx.x\n"
|
||||
"#define GET_LOCAL_IDX localIdx.x\n"
|
||||
"#define GET_GLOBAL_IDX globalIdx.x\n"
|
||||
"#define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync()\n"
|
||||
"#define DEFAULT_ARGS uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID\n"
|
||||
"#define AtomInc(x) InterlockedAdd(x, 1)\n"
|
||||
"#define AtomInc1(x, out) InterlockedAdd(x, 1, out)\n"
|
||||
"\n"
|
||||
"#define min2 min\n"
|
||||
"#define max2 max\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"cbuffer CB0 : register( b0 )\n"
|
||||
"{\n"
|
||||
" int m_startBit;\n"
|
||||
" int m_totalBlocks;\n"
|
||||
" int m_nWorkGroupsToExecute;\n"
|
||||
" int m_nBlocksPerGroup;\n"
|
||||
"\n"
|
||||
"};\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"typedef struct {\n"
|
||||
" unsigned int key;\n"
|
||||
" unsigned int value;\n"
|
||||
"} KeyValuePair;\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"StructuredBuffer<u32> rHistogram : register(t0);\n"
|
||||
"\n"
|
||||
"RWStructuredBuffer<KeyValuePair> dataToSort : register( u0 );\n"
|
||||
"RWStructuredBuffer<KeyValuePair> dataToSortOut : register( u1 );\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"#define WG_SIZE 128\n"
|
||||
"#define ELEMENTS_PER_WORK_ITEM 4\n"
|
||||
"#define BITS_PER_PASS 4\n"
|
||||
"#define NUM_BUCKET (1<<BITS_PER_PASS)\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"groupshared u32 sorterSharedMemory[max(WG_SIZE*2*2, WG_SIZE*ELEMENTS_PER_WORK_ITEM*2)];\n"
|
||||
"groupshared u32 localHistogramToCarry[NUM_BUCKET];\n"
|
||||
"groupshared u32 localHistogram[NUM_BUCKET*2];\n"
|
||||
"groupshared u32 localHistogramMat[NUM_BUCKET*WG_SIZE];\n"
|
||||
"groupshared u32 localPrefixSum[NUM_BUCKET];\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"#define SET_LOCAL_SORT_DATA(idx, sortDataIn) sorterSharedMemory[2*(idx)+0] = sortDataIn.key; sorterSharedMemory[2*(idx)+1] = sortDataIn.value; \n"
|
||||
"#define GET_LOCAL_SORT_DATA(idx, sortDataOut) sortDataOut.key = sorterSharedMemory[2*(idx)+0]; sortDataOut.value = sorterSharedMemory[2*(idx)+1];\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"uint4 prefixScanVector( uint4 data )\n"
|
||||
"{\n"
|
||||
" data.y += data.x;\n"
|
||||
" data.w += data.z;\n"
|
||||
" data.z += data.y;\n"
|
||||
" data.w += data.y;\n"
|
||||
" return data;\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"uint prefixScanVectorEx( inout uint4 data )\n"
|
||||
"{\n"
|
||||
" uint4 backup = data;\n"
|
||||
" data.y += data.x;\n"
|
||||
" data.w += data.z;\n"
|
||||
" data.z += data.y;\n"
|
||||
" data.w += data.y;\n"
|
||||
" uint sum = data.w;\n"
|
||||
" data -= backup;\n"
|
||||
" return sum;\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"uint localPrefixScan128( uint pData, uint lIdx, inout uint totalSum )\n"
|
||||
"{\n"
|
||||
" { // Set data\n"
|
||||
" sorterSharedMemory[lIdx] = 0;\n"
|
||||
" sorterSharedMemory[lIdx+WG_SIZE] = pData;\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
"\n"
|
||||
" { // Prefix sum\n"
|
||||
" int idx = 2*lIdx + (WG_SIZE+1);\n"
|
||||
" if( lIdx < 64 )\n"
|
||||
" {\n"
|
||||
" sorterSharedMemory[idx] += sorterSharedMemory[idx-1];\n"
|
||||
" sorterSharedMemory[idx] += sorterSharedMemory[idx-2]; \n"
|
||||
" sorterSharedMemory[idx] += sorterSharedMemory[idx-4];\n"
|
||||
" sorterSharedMemory[idx] += sorterSharedMemory[idx-8]; \n"
|
||||
" sorterSharedMemory[idx] += sorterSharedMemory[idx-16];\n"
|
||||
" sorterSharedMemory[idx] += sorterSharedMemory[idx-32]; \n"
|
||||
" sorterSharedMemory[idx] += sorterSharedMemory[idx-64];\n"
|
||||
" }\n"
|
||||
" if( lIdx < 64 ) sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
"\n"
|
||||
" totalSum = sorterSharedMemory[WG_SIZE*2-1];\n"
|
||||
" return sorterSharedMemory[lIdx+127];\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"void localPrefixScan128Dual( uint pData0, uint pData1, uint lIdx, \n"
|
||||
" inout uint rank0, inout uint rank1,\n"
|
||||
" inout uint totalSum0, inout uint totalSum1 )\n"
|
||||
"{\n"
|
||||
" { // Set data\n"
|
||||
" sorterSharedMemory[lIdx] = 0;\n"
|
||||
" sorterSharedMemory[lIdx+WG_SIZE] = pData0;\n"
|
||||
" sorterSharedMemory[2*WG_SIZE+lIdx] = 0;\n"
|
||||
" sorterSharedMemory[2*WG_SIZE+lIdx+WG_SIZE] = pData1;\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
"\n"
|
||||
"// if( lIdx < 128 ) // todo. assert wg size is 128\n"
|
||||
" { // Prefix sum\n"
|
||||
" int blockIdx = lIdx/64;\n"
|
||||
" int groupIdx = lIdx%64;\n"
|
||||
" int idx = 2*groupIdx + (WG_SIZE+1) + (2*WG_SIZE)*blockIdx;\n"
|
||||
"\n"
|
||||
" sorterSharedMemory[idx] += sorterSharedMemory[idx-1];\n"
|
||||
" sorterSharedMemory[idx] += sorterSharedMemory[idx-2];\n"
|
||||
" sorterSharedMemory[idx] += sorterSharedMemory[idx-4];\n"
|
||||
" sorterSharedMemory[idx] += sorterSharedMemory[idx-8];\n"
|
||||
" sorterSharedMemory[idx] += sorterSharedMemory[idx-16];\n"
|
||||
" sorterSharedMemory[idx] += sorterSharedMemory[idx-32]; \n"
|
||||
" sorterSharedMemory[idx] += sorterSharedMemory[idx-64];\n"
|
||||
"\n"
|
||||
" sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
"\n"
|
||||
" totalSum0 = sorterSharedMemory[WG_SIZE*2-1];\n"
|
||||
" rank0 = sorterSharedMemory[lIdx+127];\n"
|
||||
" totalSum1 = sorterSharedMemory[2*WG_SIZE+WG_SIZE*2-1];\n"
|
||||
" rank1 = sorterSharedMemory[2*WG_SIZE+lIdx+127];\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"uint4 localPrefixSum128V( uint4 pData, uint lIdx, inout uint totalSum )\n"
|
||||
"{\n"
|
||||
" { // Set data\n"
|
||||
" sorterSharedMemory[lIdx] = 0;\n"
|
||||
" sorterSharedMemory[lIdx+WG_SIZE] = prefixScanVectorEx( pData );\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
"\n"
|
||||
" { // Prefix sum\n"
|
||||
" int idx = 2*lIdx + (WG_SIZE+1);\n"
|
||||
" if( lIdx < 64 )\n"
|
||||
" {\n"
|
||||
" sorterSharedMemory[idx] += sorterSharedMemory[idx-1];\n"
|
||||
" sorterSharedMemory[idx] += sorterSharedMemory[idx-2]; \n"
|
||||
" sorterSharedMemory[idx] += sorterSharedMemory[idx-4];\n"
|
||||
" sorterSharedMemory[idx] += sorterSharedMemory[idx-8]; \n"
|
||||
" sorterSharedMemory[idx] += sorterSharedMemory[idx-16];\n"
|
||||
" sorterSharedMemory[idx] += sorterSharedMemory[idx-32]; \n"
|
||||
" sorterSharedMemory[idx] += sorterSharedMemory[idx-64];\n"
|
||||
"\n"
|
||||
" sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
"\n"
|
||||
" totalSum = sorterSharedMemory[WG_SIZE*2-1];\n"
|
||||
" uint addValue = sorterSharedMemory[lIdx+127];\n"
|
||||
" return pData + uint4(addValue, addValue, addValue, addValue);\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"void localPrefixSum128Dual( uint4 pData0, uint4 pData1, uint lIdx, \n"
|
||||
" inout uint4 dataOut0, inout uint4 dataOut1, \n"
|
||||
" inout uint totalSum0, inout uint totalSum1 )\n"
|
||||
"{\n"
|
||||
"/*\n"
|
||||
" dataOut0 = localPrefixSum128V( pData0, lIdx, totalSum0 );\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
" dataOut1 = localPrefixSum128V( pData1, lIdx, totalSum1 );\n"
|
||||
" return;\n"
|
||||
"*/\n"
|
||||
"\n"
|
||||
" uint4 backup0 = pData0;\n"
|
||||
" uint4 backup1 = pData1;\n"
|
||||
"\n"
|
||||
" { // Prefix sum in a vector\n"
|
||||
" pData0 = prefixScanVector( pData0 );\n"
|
||||
" pData1 = prefixScanVector( pData1 );\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" { // Set data\n"
|
||||
" sorterSharedMemory[lIdx] = 0;\n"
|
||||
" sorterSharedMemory[lIdx+WG_SIZE] = pData0.w;\n"
|
||||
" sorterSharedMemory[2*WG_SIZE+lIdx] = 0;\n"
|
||||
" sorterSharedMemory[2*WG_SIZE+lIdx+WG_SIZE] = pData1.w;\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
"\n"
|
||||
"// if( lIdx < 128 ) // todo. assert wg size is 128\n"
|
||||
" { // Prefix sum\n"
|
||||
" int blockIdx = lIdx/64;\n"
|
||||
" int groupIdx = lIdx%64;\n"
|
||||
" int idx = 2*groupIdx + (WG_SIZE+1) + (2*WG_SIZE)*blockIdx;\n"
|
||||
"\n"
|
||||
" sorterSharedMemory[idx] += sorterSharedMemory[idx-1];\n"
|
||||
" sorterSharedMemory[idx] += sorterSharedMemory[idx-2];\n"
|
||||
" sorterSharedMemory[idx] += sorterSharedMemory[idx-4];\n"
|
||||
" sorterSharedMemory[idx] += sorterSharedMemory[idx-8];\n"
|
||||
" sorterSharedMemory[idx] += sorterSharedMemory[idx-16];\n"
|
||||
" sorterSharedMemory[idx] += sorterSharedMemory[idx-32]; \n"
|
||||
" sorterSharedMemory[idx] += sorterSharedMemory[idx-64];\n"
|
||||
"\n"
|
||||
" sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
"\n"
|
||||
" totalSum0 = sorterSharedMemory[WG_SIZE*2-1];\n"
|
||||
" {\n"
|
||||
" uint addValue = sorterSharedMemory[lIdx+127];\n"
|
||||
" dataOut0 = pData0 + uint4(addValue, addValue, addValue, addValue) - backup0;\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" totalSum1 = sorterSharedMemory[2*WG_SIZE+WG_SIZE*2-1];\n"
|
||||
" {\n"
|
||||
" uint addValue = sorterSharedMemory[2*WG_SIZE+lIdx+127];\n"
|
||||
" dataOut1 = pData1 + uint4(addValue, addValue, addValue, addValue) - backup1;\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"uint4 extractKeys(uint4 data, uint targetKey)\n"
|
||||
"{\n"
|
||||
" uint4 key;\n"
|
||||
" key.x = data.x == targetKey ? 1:0;\n"
|
||||
" key.y = data.y == targetKey ? 1:0;\n"
|
||||
" key.z = data.z == targetKey ? 1:0;\n"
|
||||
" key.w = data.w == targetKey ? 1:0;\n"
|
||||
" return key;\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"uint4 extractKeysByBits(uint4 data, uint targetKey)\n"
|
||||
"{\n"
|
||||
" uint4 key;\n"
|
||||
" uint mask = 1<<targetKey;\n"
|
||||
" key.x = (data.x & mask) >> targetKey;\n"
|
||||
" key.y = (data.y & mask) >> targetKey;\n"
|
||||
" key.z = (data.z & mask) >> targetKey;\n"
|
||||
" key.w = (data.w & mask) >> targetKey;\n"
|
||||
" return key;\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"uint packKeys(uint lower, uint upper)\n"
|
||||
"{\n"
|
||||
" return lower|(upper<<16);\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"uint4 packKeys(uint4 lower, uint4 upper)\n"
|
||||
"{\n"
|
||||
" return uint4( lower.x|(upper.x<<16), lower.y|(upper.y<<16), lower.z|(upper.z<<16), lower.w|(upper.w<<16) );\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"uint extractLower( uint data )\n"
|
||||
"{\n"
|
||||
" return data&0xffff;\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"uint extractUpper( uint data )\n"
|
||||
"{\n"
|
||||
" return (data>>16)&0xffff;\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"uint4 extractLower( uint4 data )\n"
|
||||
"{\n"
|
||||
" return uint4( data.x&0xffff, data.y&0xffff, data.z&0xffff, data.w&0xffff );\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"uint4 extractUpper( uint4 data )\n"
|
||||
"{\n"
|
||||
" return uint4( (data.x>>16)&0xffff, (data.y>>16)&0xffff, (data.z>>16)&0xffff, (data.w>>16)&0xffff );\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"[numthreads(WG_SIZE, 1, 1)]\n"
|
||||
"void SortAndScatterKernel( DEFAULT_ARGS ) \n"
|
||||
"{\n"
|
||||
" u32 lIdx = GET_LOCAL_IDX;\n"
|
||||
" u32 wgIdx = GET_GROUP_IDX;\n"
|
||||
"\n"
|
||||
" if( lIdx < (NUM_BUCKET) )\n"
|
||||
" {\n"
|
||||
" localHistogramToCarry[lIdx] = rHistogram[lIdx*m_nWorkGroupsToExecute + wgIdx];\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
"\n"
|
||||
" for(uint igroup=wgIdx*m_nBlocksPerGroup; igroup<min2(m_totalBlocks,(wgIdx+1)*m_nBlocksPerGroup); igroup++)\n"
|
||||
" {\n"
|
||||
" u32 myHistogram;\n"
|
||||
" if( lIdx < (NUM_BUCKET) )\n"
|
||||
" {\n"
|
||||
" localPrefixSum[lIdx] = 0.f;\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" u32 newOffset[4];\n"
|
||||
" KeyValuePair myData[4];\n"
|
||||
" { // read data\n"
|
||||
" int numLocalElements = WG_SIZE*ELEMENTS_PER_WORK_ITEM;\n"
|
||||
" uint startAddress = igroup*numLocalElements + lIdx*4;\n"
|
||||
"\n"
|
||||
" myData[0] = dataToSort[startAddress+0];\n"
|
||||
" myData[1] = dataToSort[startAddress+1];\n"
|
||||
" myData[2] = dataToSort[startAddress+2];\n"
|
||||
" myData[3] = dataToSort[startAddress+3];\n"
|
||||
"\n"
|
||||
" newOffset[0] = newOffset[1] = newOffset[2] = newOffset[3] = 0;\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" int localOffset = 0;\n"
|
||||
" uint4 b = uint4((myData[0].key>>m_startBit) & 0xf, (myData[1].key>>m_startBit) & 0xf, (myData[2].key>>m_startBit) & 0xf, (myData[3].key>>m_startBit) & 0xf);\n"
|
||||
" for(uint targetKey=0; targetKey<(NUM_BUCKET); targetKey+=4)\n"
|
||||
" {\n"
|
||||
" uint4 key[4];\n"
|
||||
" uint keySet[2];\n"
|
||||
" { // pack 4\n"
|
||||
" uint4 scannedKey[4];\n"
|
||||
" key[0] = scannedKey[0] = extractKeys( b, targetKey+0 );\n"
|
||||
" key[1] = scannedKey[1] = extractKeys( b, targetKey+1 );\n"
|
||||
" key[2] = scannedKey[2] = extractKeys( b, targetKey+2 );\n"
|
||||
" key[3] = scannedKey[3] = extractKeys( b, targetKey+3 );\n"
|
||||
" {\n"
|
||||
" uint s[4];\n"
|
||||
" s[0] = prefixScanVectorEx( scannedKey[0] );\n"
|
||||
" s[1] = prefixScanVectorEx( scannedKey[1] );\n"
|
||||
" s[2] = prefixScanVectorEx( scannedKey[2] );\n"
|
||||
" s[3] = prefixScanVectorEx( scannedKey[3] );\n"
|
||||
" keySet[0] = packKeys( s[0], s[1] );\n"
|
||||
" keySet[1] = packKeys( s[2], s[3] );\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" uint dstAddressBase[4];\n"
|
||||
" {\n"
|
||||
"\n"
|
||||
" uint totalSumPacked[2];\n"
|
||||
" uint dstAddressPacked[2];\n"
|
||||
"\n"
|
||||
" localPrefixScan128Dual( keySet[0], keySet[1], lIdx, dstAddressPacked[0], dstAddressPacked[1], totalSumPacked[0], totalSumPacked[1] );\n"
|
||||
"\n"
|
||||
" dstAddressBase[0] = extractLower( dstAddressPacked[0] );\n"
|
||||
" dstAddressBase[1] = extractUpper( dstAddressPacked[0] );\n"
|
||||
" dstAddressBase[2] = extractLower( dstAddressPacked[1] );\n"
|
||||
" dstAddressBase[3] = extractUpper( dstAddressPacked[1] );\n"
|
||||
"\n"
|
||||
" uint4 histogram;\n"
|
||||
" histogram.x = extractLower(totalSumPacked[0]);\n"
|
||||
" histogram.y = extractUpper(totalSumPacked[0]);\n"
|
||||
" histogram.z = extractLower(totalSumPacked[1]);\n"
|
||||
" histogram.w = extractUpper(totalSumPacked[1]);\n"
|
||||
"\n"
|
||||
" if( lIdx == targetKey + 0 ) myHistogram = histogram.x;\n"
|
||||
" else if( lIdx == targetKey + 1 ) myHistogram = histogram.y;\n"
|
||||
" else if( lIdx == targetKey + 2 ) myHistogram = histogram.z;\n"
|
||||
" else if( lIdx == targetKey + 3 ) myHistogram = histogram.w;\n"
|
||||
" \n"
|
||||
" uint histogramSum = prefixScanVectorEx( histogram );\n"
|
||||
"\n"
|
||||
" if( lIdx == targetKey + 0 ) localPrefixSum[targetKey+0] = localOffset+histogram.x;\n"
|
||||
" else if( lIdx == targetKey + 1 ) localPrefixSum[targetKey+1] = localOffset+histogram.y;\n"
|
||||
" else if( lIdx == targetKey + 2 ) localPrefixSum[targetKey+2] = localOffset+histogram.z;\n"
|
||||
" else if( lIdx == targetKey + 3 ) localPrefixSum[targetKey+3] = localOffset+histogram.w;\n"
|
||||
"\n"
|
||||
" localOffset += histogramSum;\n"
|
||||
" }\n"
|
||||
" \n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
"\n"
|
||||
"\n"
|
||||
" for(int ie=0; ie<4; ie++)\n"
|
||||
" {\n"
|
||||
" uint4 scannedKey = key[ie];\n"
|
||||
" prefixScanVectorEx( scannedKey );\n"
|
||||
"\n"
|
||||
" uint offset = localPrefixSum[targetKey + ie] + dstAddressBase[ie];\n"
|
||||
" uint4 dstAddress = uint4( offset, offset, offset, offset ) + scannedKey;\n"
|
||||
"\n"
|
||||
" newOffset[0] += dstAddress.x*key[ie].x;\n"
|
||||
" newOffset[1] += dstAddress.y*key[ie].y;\n"
|
||||
" newOffset[2] += dstAddress.z*key[ie].z;\n"
|
||||
" newOffset[3] += dstAddress.w*key[ie].w;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" { // local scatter\n"
|
||||
" SET_LOCAL_SORT_DATA(newOffset[0], myData[0]);\n"
|
||||
" SET_LOCAL_SORT_DATA(newOffset[1], myData[1]);\n"
|
||||
" SET_LOCAL_SORT_DATA(newOffset[2], myData[2]);\n"
|
||||
" SET_LOCAL_SORT_DATA(newOffset[3], myData[3]);\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
"\n"
|
||||
" { // write data\n"
|
||||
" for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
|
||||
" {\n"
|
||||
" int dataIdx = 4*lIdx+i;\n"
|
||||
" KeyValuePair localData; GET_LOCAL_SORT_DATA( dataIdx, localData );\n"
|
||||
" int binIdx = (localData.key >> m_startBit) & 0xf;\n"
|
||||
" int groupOffset = localHistogramToCarry[binIdx];\n"
|
||||
" int myIdx = dataIdx - localPrefixSum[binIdx];\n"
|
||||
"\n"
|
||||
" dataToSortOut[ groupOffset + myIdx ] = localData;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
" if( lIdx < NUM_BUCKET )\n"
|
||||
" {\n"
|
||||
" localHistogramToCarry[lIdx] += myHistogram;\n"
|
||||
" }\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"[numthreads(WG_SIZE, 1, 1)]\n"
|
||||
"void SortAndScatterKernel1( DEFAULT_ARGS )\n"
|
||||
"{\n"
|
||||
" u32 lIdx = GET_LOCAL_IDX;\n"
|
||||
" u32 wgIdx = GET_GROUP_IDX;\n"
|
||||
"\n"
|
||||
" if( lIdx < (NUM_BUCKET) )\n"
|
||||
" {\n"
|
||||
" localHistogramToCarry[lIdx] = rHistogram[lIdx*m_nWorkGroupsToExecute + wgIdx.x];\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
"\n"
|
||||
" for(uint igroup=wgIdx.x*m_nBlocksPerGroup; igroup<min2(m_totalBlocks,(wgIdx.x+1)*m_nBlocksPerGroup); igroup++)\n"
|
||||
" {\n"
|
||||
" u32 myHistogram;\n"
|
||||
"\n"
|
||||
" KeyValuePair myData[4];\n"
|
||||
" uint startAddrBlock;\n"
|
||||
" { // read data\n"
|
||||
" int numLocalElements = WG_SIZE*ELEMENTS_PER_WORK_ITEM;\n"
|
||||
" startAddrBlock = lIdx*4;\n"
|
||||
" uint startAddress = igroup*numLocalElements + startAddrBlock;\n"
|
||||
"\n"
|
||||
" myData[0] = dataToSort[startAddress+0];\n"
|
||||
" myData[1] = dataToSort[startAddress+1];\n"
|
||||
" myData[2] = dataToSort[startAddress+2];\n"
|
||||
" myData[3] = dataToSort[startAddress+3];\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" // local sort\n"
|
||||
" for(int ib=m_startBit; ib<m_startBit+BITS_PER_PASS; ib++)\n"
|
||||
" {\n"
|
||||
" uint4 keys = uint4(~(myData[0].key>>ib) & 0x1, ~(myData[1].key>>ib) & 0x1, ~(myData[2].key>>ib) & 0x1, ~(myData[3].key>>ib) & 0x1);\n"
|
||||
" uint total;\n"
|
||||
" uint4 rankOfP = localPrefixSum128V( keys, lIdx, total );\n"
|
||||
" uint4 rankOfN = uint4(startAddrBlock, startAddrBlock+1, startAddrBlock+2, startAddrBlock+3) - rankOfP + uint4( total, total, total, total );\n"
|
||||
"\n"
|
||||
" uint4 myAddr = (keys==uint4(1,1,1,1))? rankOfP: rankOfN;\n"
|
||||
" \n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
"\n"
|
||||
" SET_LOCAL_SORT_DATA( myAddr.x, myData[0] );\n"
|
||||
" SET_LOCAL_SORT_DATA( myAddr.y, myData[1] );\n"
|
||||
" SET_LOCAL_SORT_DATA( myAddr.z, myData[2] );\n"
|
||||
" SET_LOCAL_SORT_DATA( myAddr.w, myData[3] );\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
" \n"
|
||||
" GET_LOCAL_SORT_DATA( startAddrBlock+0, myData[0] );\n"
|
||||
" GET_LOCAL_SORT_DATA( startAddrBlock+1, myData[1] );\n"
|
||||
" GET_LOCAL_SORT_DATA( startAddrBlock+2, myData[2] );\n"
|
||||
" GET_LOCAL_SORT_DATA( startAddrBlock+3, myData[3] );\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" {// create histogram -> prefix sum\n"
|
||||
" if( lIdx < NUM_BUCKET )\n"
|
||||
" {\n"
|
||||
" localHistogram[lIdx] = 0;\n"
|
||||
" localHistogram[NUM_BUCKET+lIdx] = 0;\n"
|
||||
" }\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
" uint4 keys = uint4((myData[0].key>>m_startBit) & 0xf, (myData[1].key>>m_startBit) & 0xf, (myData[2].key>>m_startBit) & 0xf, (myData[3].key>>m_startBit) & 0xf);\n"
|
||||
" \n"
|
||||
" InterlockedAdd( localHistogram[NUM_BUCKET+keys.x], 1 );\n"
|
||||
" InterlockedAdd( localHistogram[NUM_BUCKET+keys.y], 1 );\n"
|
||||
" InterlockedAdd( localHistogram[NUM_BUCKET+keys.z], 1 );\n"
|
||||
" InterlockedAdd( localHistogram[NUM_BUCKET+keys.w], 1 );\n"
|
||||
" \n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
" \n"
|
||||
" uint hIdx = NUM_BUCKET+lIdx;\n"
|
||||
" if( lIdx < NUM_BUCKET )\n"
|
||||
" {\n"
|
||||
" myHistogram = localHistogram[hIdx];\n"
|
||||
" }\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
" \n"
|
||||
" if( lIdx < NUM_BUCKET )\n"
|
||||
" {\n"
|
||||
" localHistogram[hIdx] = localHistogram[hIdx-1];\n"
|
||||
"\n"
|
||||
" localHistogram[hIdx] += localHistogram[hIdx-1];\n"
|
||||
" localHistogram[hIdx] += localHistogram[hIdx-2];\n"
|
||||
" localHistogram[hIdx] += localHistogram[hIdx-4];\n"
|
||||
" localHistogram[hIdx] += localHistogram[hIdx-8];\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
" }\n"
|
||||
"/*\n"
|
||||
" {// write back\n"
|
||||
" int numLocalElements = WG_SIZE*ELEMENTS_PER_WORK_ITEM;\n"
|
||||
" startAddrBlock = lIdx*4;\n"
|
||||
" uint startAddress = igroup*numLocalElements + startAddrBlock;\n"
|
||||
"\n"
|
||||
" for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)\n"
|
||||
" {\n"
|
||||
" dataToSortOut[ startAddress+ie ] = myData[ie];\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
"*/\n"
|
||||
" {\n"
|
||||
" for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)\n"
|
||||
" {\n"
|
||||
" int dataIdx = startAddrBlock+ie;\n"
|
||||
" int binIdx = (myData[ie].key>>m_startBit)&0xf;\n"
|
||||
" int groupOffset = localHistogramToCarry[binIdx];\n"
|
||||
" int myIdx = dataIdx - localHistogram[NUM_BUCKET+binIdx];\n"
|
||||
" dataToSortOut[ groupOffset + myIdx ] = myData[ie];\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" \n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
" if( lIdx < NUM_BUCKET )\n"
|
||||
" {\n"
|
||||
" localHistogramToCarry[lIdx] += myHistogram;\n"
|
||||
" }\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
" \n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"/*\n"
|
||||
"[numthreads(WG_SIZE, 1, 1)]\n"
|
||||
"void SortAndScatterKernel1( uint3 gIdx : SV_GroupID, uint3 lIdx : SV_GroupThreadID )\n"
|
||||
"{\n"
|
||||
" if( lIdx.x < (NUM_BUCKET) )\n"
|
||||
" {\n"
|
||||
" localHistogramToCarry[lIdx.x] = rHistogram[lIdx.x*m_nWorkGroupsToExecute + gIdx.x];\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
"\n"
|
||||
" for(uint igroup=gIdx.x*m_nBlocksPerGroup; igroup<min2(m_totalBlocks,(gIdx.x+1)*m_nBlocksPerGroup); igroup++)\n"
|
||||
" {\n"
|
||||
" u32 myHistogram;\n"
|
||||
"\n"
|
||||
" KeyValuePair myData[4];\n"
|
||||
" uint startAddrBlock;\n"
|
||||
" { // read data\n"
|
||||
" int numLocalElements = WG_SIZE*ELEMENTS_PER_WORK_ITEM;\n"
|
||||
" startAddrBlock = lIdx.x*4;\n"
|
||||
" uint startAddress = igroup*numLocalElements + startAddrBlock;\n"
|
||||
"\n"
|
||||
" myData[0] = dataToSort[startAddress+0];\n"
|
||||
" myData[1] = dataToSort[startAddress+1];\n"
|
||||
" myData[2] = dataToSort[startAddress+2];\n"
|
||||
" myData[3] = dataToSort[startAddress+3];\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" for(int ib=m_startBit; ib<m_startBit+BITS_PER_PASS; ib++)\n"
|
||||
" {\n"
|
||||
" uint4 keys = uint4(~(myData[0].key>>ib) & 0x1, ~(myData[1].key>>ib) & 0x1, ~(myData[2].key>>ib) & 0x1, ~(myData[3].key>>ib) & 0x1);\n"
|
||||
" uint total;\n"
|
||||
" uint4 rankOfP = localPrefixSum128V( keys, lIdx.x, total );\n"
|
||||
" uint4 rankOfN = uint4(startAddrBlock, startAddrBlock+1, startAddrBlock+2, startAddrBlock+3) - rankOfP + uint4( total, total, total, total );\n"
|
||||
"\n"
|
||||
" uint4 myAddr = (keys==uint4(1,1,1,1))? rankOfP: rankOfN;\n"
|
||||
" \n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
"\n"
|
||||
" SET_LOCAL_SORT_DATA( myAddr.x, myData[0] );\n"
|
||||
" SET_LOCAL_SORT_DATA( myAddr.y, myData[1] );\n"
|
||||
" SET_LOCAL_SORT_DATA( myAddr.z, myData[2] );\n"
|
||||
" SET_LOCAL_SORT_DATA( myAddr.w, myData[3] );\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
" \n"
|
||||
" GET_LOCAL_SORT_DATA( startAddrBlock+0, myData[0] );\n"
|
||||
" GET_LOCAL_SORT_DATA( startAddrBlock+1, myData[1] );\n"
|
||||
" GET_LOCAL_SORT_DATA( startAddrBlock+2, myData[2] );\n"
|
||||
" GET_LOCAL_SORT_DATA( startAddrBlock+3, myData[3] );\n"
|
||||
" }\n"
|
||||
" \n"
|
||||
" {// create histogram -> prefix sum\n"
|
||||
" if( lIdx.x < NUM_BUCKET )\n"
|
||||
" {\n"
|
||||
" localHistogram[lIdx.x] = 0;\n"
|
||||
" localHistogram[NUM_BUCKET+lIdx.x] = 0;\n"
|
||||
" }\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
" uint4 keys = uint4((myData[0].key>>m_startBit) & 0xf, (myData[1].key>>m_startBit) & 0xf, (myData[2].key>>m_startBit) & 0xf, (myData[3].key>>m_startBit) & 0xf);\n"
|
||||
" \n"
|
||||
" InterlockedAdd( localHistogram[NUM_BUCKET+keys.x], 1 );\n"
|
||||
" InterlockedAdd( localHistogram[NUM_BUCKET+keys.y], 1 );\n"
|
||||
" InterlockedAdd( localHistogram[NUM_BUCKET+keys.z], 1 );\n"
|
||||
" InterlockedAdd( localHistogram[NUM_BUCKET+keys.w], 1 );\n"
|
||||
" \n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
" \n"
|
||||
" uint hIdx = NUM_BUCKET+lIdx.x;\n"
|
||||
" if( lIdx.x < NUM_BUCKET )\n"
|
||||
" {\n"
|
||||
" myHistogram = localHistogram[hIdx];\n"
|
||||
" }\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
" \n"
|
||||
"\n"
|
||||
" if( lIdx.x < NUM_BUCKET )\n"
|
||||
" {\n"
|
||||
" localHistogram[hIdx] = localHistogram[hIdx-1];\n"
|
||||
"\n"
|
||||
" localHistogram[hIdx] += localHistogram[hIdx-1];\n"
|
||||
" localHistogram[hIdx] += localHistogram[hIdx-2];\n"
|
||||
" localHistogram[hIdx] += localHistogram[hIdx-4];\n"
|
||||
" localHistogram[hIdx] += localHistogram[hIdx-8];\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
" }\n"
|
||||
" {// write back\n"
|
||||
" for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)\n"
|
||||
" {\n"
|
||||
" int dataIdx = startAddrBlock+ie;\n"
|
||||
" int binIdx = (myData[ie].key>>m_startBit)&0xf;\n"
|
||||
" int groupOffset = localHistogramToCarry[binIdx];\n"
|
||||
" int myIdx = dataIdx - localHistogram[NUM_BUCKET+binIdx];\n"
|
||||
" \n"
|
||||
" dataToSortOut[ groupOffset + myIdx ] = myData[ie];\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" \n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
" if( lIdx.x < NUM_BUCKET )\n"
|
||||
" {\n"
|
||||
" localHistogramToCarry[lIdx.x] += myHistogram;\n"
|
||||
" }\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
" \n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"*/\n"
|
||||
"\n"
|
||||
"StructuredBuffer<KeyValuePair> dataToSort1 : register( t0 );\n"
|
||||
"RWStructuredBuffer<u32> wHistogram1 : register(u0);\n"
|
||||
"\n"
|
||||
"#define MY_HISTOGRAM(idx) localHistogramMat[(idx)*WG_SIZE+lIdx.x]\n"
|
||||
"\n"
|
||||
"[numthreads(WG_SIZE, 1, 1)]\n"
|
||||
"void StreamCountKernel( DEFAULT_ARGS ) \n"
|
||||
"{\n"
|
||||
" u32 lIdx = GET_LOCAL_IDX;\n"
|
||||
" u32 wgIdx = GET_GROUP_IDX;\n"
|
||||
"\n"
|
||||
" int myHistogram[NUM_BUCKET];\n"
|
||||
"\n"
|
||||
" for(int i=0; i<NUM_BUCKET; i++)\n"
|
||||
" {\n"
|
||||
" MY_HISTOGRAM(i) = 0;\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" for(uint igroup=wgIdx.x*m_nBlocksPerGroup; igroup<min2(m_totalBlocks,(wgIdx.x+1)*m_nBlocksPerGroup); igroup++)\n"
|
||||
" {\n"
|
||||
" uint localKeys[4];\n"
|
||||
" { // read data\n"
|
||||
" int numLocalElements = WG_SIZE*ELEMENTS_PER_WORK_ITEM;\n"
|
||||
"\n"
|
||||
" uint4 localAddress = uint4(lIdx, lIdx, lIdx, lIdx)*4+uint4(0,1,2,3);\n"
|
||||
" uint4 globalAddress = uint4(igroup,igroup,igroup,igroup)*numLocalElements + localAddress;\n"
|
||||
"\n"
|
||||
" KeyValuePair localData0 = dataToSort1[globalAddress.x];\n"
|
||||
" KeyValuePair localData1 = dataToSort1[globalAddress.y];\n"
|
||||
" KeyValuePair localData2 = dataToSort1[globalAddress.z];\n"
|
||||
" KeyValuePair localData3 = dataToSort1[globalAddress.w];\n"
|
||||
"\n"
|
||||
" localKeys[0] = (localData0.key >> m_startBit) & 0xf;\n"
|
||||
" localKeys[1] = (localData1.key >> m_startBit) & 0xf;\n"
|
||||
" localKeys[2] = (localData2.key >> m_startBit) & 0xf;\n"
|
||||
" localKeys[3] = (localData3.key >> m_startBit) & 0xf;\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" MY_HISTOGRAM( localKeys[0] )++;\n"
|
||||
" MY_HISTOGRAM( localKeys[1] )++;\n"
|
||||
" MY_HISTOGRAM( localKeys[2] )++;\n"
|
||||
" MY_HISTOGRAM( localKeys[3] )++;\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
"\n"
|
||||
" { // reduce to 1\n"
|
||||
" if( lIdx < 64 )//WG_SIZE/2 )\n"
|
||||
" {\n"
|
||||
" for(int i=0; i<NUM_BUCKET/2; i++)\n"
|
||||
" {\n"
|
||||
" int idx = lIdx;\n"
|
||||
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+64];\n"
|
||||
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+32];\n"
|
||||
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+16];\n"
|
||||
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+8];\n"
|
||||
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+4];\n"
|
||||
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+2];\n"
|
||||
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+1];\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" else if( lIdx < 128 )\n"
|
||||
" {\n"
|
||||
" for(int i=NUM_BUCKET/2; i<NUM_BUCKET; i++)\n"
|
||||
" {\n"
|
||||
" int idx = lIdx-64;\n"
|
||||
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+64];\n"
|
||||
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+32];\n"
|
||||
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+16];\n"
|
||||
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+8];\n"
|
||||
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+4];\n"
|
||||
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+2];\n"
|
||||
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+1];\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
"\n"
|
||||
" { // write data\n"
|
||||
" if( lIdx < NUM_BUCKET )\n"
|
||||
" {\n"
|
||||
" wHistogram1[ lIdx*m_nWorkGroupsToExecute + wgIdx.x ] = localHistogramMat[ lIdx*WG_SIZE+0 ];\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"/*\n"
|
||||
"[numthreads(WG_SIZE, 1, 1)]\n"
|
||||
"void StreamCountKernel( uint3 gIdx : SV_GroupID, uint3 lIdx : SV_GroupThreadID ) \n"
|
||||
"{\n"
|
||||
" int myHistogram[NUM_BUCKET];\n"
|
||||
"\n"
|
||||
" for(int i=0; i<NUM_BUCKET; i++)\n"
|
||||
" {\n"
|
||||
" myHistogram[i] = 0;\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" for(uint igroup=gIdx.x*m_nBlocksPerGroup; igroup<min2(m_totalBlocks,(gIdx.x+1)*m_nBlocksPerGroup); igroup++)\n"
|
||||
" {\n"
|
||||
" uint localKeys[4];\n"
|
||||
" { // read data\n"
|
||||
" int numLocalElements = WG_SIZE*ELEMENTS_PER_WORK_ITEM;\n"
|
||||
"\n"
|
||||
" uint4 localAddress = uint4(lIdx.x, lIdx.x, lIdx.x, lIdx.x)*4+uint4(0,1,2,3);\n"
|
||||
" uint4 globalAddress = uint4(igroup,igroup,igroup,igroup)*numLocalElements + localAddress;\n"
|
||||
"\n"
|
||||
" KeyValuePair localData0 = dataToSort1[globalAddress.x];\n"
|
||||
" KeyValuePair localData1 = dataToSort1[globalAddress.y];\n"
|
||||
" KeyValuePair localData2 = dataToSort1[globalAddress.z];\n"
|
||||
" KeyValuePair localData3 = dataToSort1[globalAddress.w];\n"
|
||||
"\n"
|
||||
" localKeys[0] = (localData0.key >> m_startBit) & 0xf;\n"
|
||||
" localKeys[1] = (localData1.key >> m_startBit) & 0xf;\n"
|
||||
" localKeys[2] = (localData2.key >> m_startBit) & 0xf;\n"
|
||||
" localKeys[3] = (localData3.key >> m_startBit) & 0xf;\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" myHistogram[ localKeys[0] ]++;\n"
|
||||
" myHistogram[ localKeys[1] ]++;\n"
|
||||
" myHistogram[ localKeys[2] ]++;\n"
|
||||
" myHistogram[ localKeys[3] ]++;\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" { // move to shared\n"
|
||||
" for(int i=0; i<NUM_BUCKET; i++)\n"
|
||||
" {\n"
|
||||
" localHistogramMat[i*WG_SIZE+lIdx.x] = myHistogram[i];\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
"\n"
|
||||
" { // reduce to 1\n"
|
||||
" if( lIdx.x < 64 )//WG_SIZE/2 )\n"
|
||||
" {\n"
|
||||
" for(int i=0; i<NUM_BUCKET/2; i++)\n"
|
||||
" {\n"
|
||||
" int idx = lIdx.x;\n"
|
||||
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+64];\n"
|
||||
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+32];\n"
|
||||
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+16];\n"
|
||||
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+8];\n"
|
||||
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+4];\n"
|
||||
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+2];\n"
|
||||
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+1];\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" else if( lIdx.x < 128 )\n"
|
||||
" {\n"
|
||||
" for(int i=NUM_BUCKET/2; i<NUM_BUCKET; i++)\n"
|
||||
" {\n"
|
||||
" int idx = lIdx.x-64;\n"
|
||||
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+64];\n"
|
||||
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+32];\n"
|
||||
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+16];\n"
|
||||
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+8];\n"
|
||||
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+4];\n"
|
||||
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+2];\n"
|
||||
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+1];\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
"\n"
|
||||
" { // write data\n"
|
||||
" if( lIdx.x < NUM_BUCKET )\n"
|
||||
" {\n"
|
||||
" wHistogram1[ lIdx.x*m_nWorkGroupsToExecute + gIdx.x ] = localHistogramMat[ lIdx.x*WG_SIZE+0 ];\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"*/\n"
|
||||
"\n"
|
||||
"/*\n"
|
||||
"// for MAX_WG_SIZE 20\n"
|
||||
"[numthreads(WG_SIZE, 1, 1)]\n"
|
||||
"void PrefixScanKernel( uint3 gIdx : SV_GroupID, uint3 lIdx : SV_GroupThreadID ) \n"
|
||||
"{\n"
|
||||
" uint4 myData = uint4(0,0,0,0);\n"
|
||||
" if( 4*lIdx.x+0 < NUM_BUCKET*m_nWorkGroupsToExecute )\n"
|
||||
" myData.x = wHistogram1[4*lIdx.x+0];\n"
|
||||
" if( 4*lIdx.x+1 < NUM_BUCKET*m_nWorkGroupsToExecute )\n"
|
||||
" myData.y = wHistogram1[4*lIdx.x+1];\n"
|
||||
" if( 4*lIdx.x+2 < NUM_BUCKET*m_nWorkGroupsToExecute )\n"
|
||||
" myData.z = wHistogram1[4*lIdx.x+2];\n"
|
||||
" if( 4*lIdx.x+3 < NUM_BUCKET*m_nWorkGroupsToExecute )\n"
|
||||
" myData.w = wHistogram1[4*lIdx.x+3];\n"
|
||||
"\n"
|
||||
" uint totalSum;\n"
|
||||
"\n"
|
||||
" uint4 scanned = localPrefixSum128V( myData, lIdx.x, totalSum );\n"
|
||||
"\n"
|
||||
" wHistogram1[4*lIdx.x+0] = scanned.x;\n"
|
||||
" wHistogram1[4*lIdx.x+1] = scanned.y;\n"
|
||||
" wHistogram1[4*lIdx.x+2] = scanned.z;\n"
|
||||
" wHistogram1[4*lIdx.x+3] = scanned.w;\n"
|
||||
"}\n"
|
||||
"*/\n"
|
||||
"\n"
|
||||
"// for MAX_WG_SIZE 80\n"
|
||||
"// can hold up to WG_SIZE*12 (128*12 > 80*16 )\n"
|
||||
"[numthreads(WG_SIZE, 1, 1)]\n"
|
||||
"void PrefixScanKernel( DEFAULT_ARGS )\n"
|
||||
"{\n"
|
||||
" u32 lIdx = GET_LOCAL_IDX;\n"
|
||||
" u32 wgIdx = GET_GROUP_IDX;\n"
|
||||
"\n"
|
||||
" uint data[12] = {0,0,0,0,0,0,0,0,0,0,0,0};\n"
|
||||
" for(int i=0; i<12; i++)\n"
|
||||
" {\n"
|
||||
" if( int(12*lIdx+i) < NUM_BUCKET*m_nWorkGroupsToExecute )\n"
|
||||
" data[i] = wHistogram1[12*lIdx+i];\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" uint4 myData = uint4(0,0,0,0);\n"
|
||||
" myData.x = data[0] + data[1];\n"
|
||||
" myData.y = data[2] + data[3];\n"
|
||||
" myData.z = data[4] + data[5];\n"
|
||||
" myData.w = data[6] + data[7];\n"
|
||||
"\n"
|
||||
"\n"
|
||||
" uint totalSum;\n"
|
||||
" uint4 scanned = localPrefixSum128V( myData, lIdx, totalSum );\n"
|
||||
"\n"
|
||||
" data[11] = scanned.w + data[9] + data[10];\n"
|
||||
" data[10] = scanned.w + data[9];\n"
|
||||
" data[9] = scanned.w;\n"
|
||||
" data[8] = scanned.z + data[6] + data[7];\n"
|
||||
" data[7] = scanned.z + data[6];\n"
|
||||
" data[6] = scanned.z;\n"
|
||||
" data[5] = scanned.y + data[3] + data[4];\n"
|
||||
" data[4] = scanned.y + data[3];\n"
|
||||
" data[3] = scanned.y;\n"
|
||||
" data[2] = scanned.x + data[0] + data[1];\n"
|
||||
" data[1] = scanned.x + data[0];\n"
|
||||
" data[0] = scanned.x;\n"
|
||||
"\n"
|
||||
" for(int i=0; i<12; i++)\n"
|
||||
" {\n"
|
||||
" wHistogram1[12*lIdx+i] = data[i];\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"/*\n"
|
||||
"[numthreads(WG_SIZE, 1, 1)]\n"
|
||||
"void PrefixScanKernel( DEFAULT_ARGS )\n"
|
||||
"{\n"
|
||||
" u32 lIdx = GET_LOCAL_IDX;\n"
|
||||
" u32 wgIdx = GET_GROUP_IDX;\n"
|
||||
"\n"
|
||||
" uint data[8] = {0,0,0,0,0,0,0,0};\n"
|
||||
" for(int i=0; i<8; i++)\n"
|
||||
" {\n"
|
||||
" if( int(8*lIdx+i) < NUM_BUCKET*m_nWorkGroupsToExecute )\n"
|
||||
" data[i] = wHistogram1[8*lIdx+i];\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" uint4 myData = uint4(0,0,0,0);\n"
|
||||
" myData.x = data[0] + data[1];\n"
|
||||
" myData.y = data[2] + data[3];\n"
|
||||
" myData.z = data[4] + data[5];\n"
|
||||
" myData.w = data[6] + data[7];\n"
|
||||
"\n"
|
||||
"\n"
|
||||
" uint totalSum;\n"
|
||||
" uint4 scanned = localPrefixSum128V( myData, lIdx, totalSum );\n"
|
||||
"\n"
|
||||
" data[7] = scanned.w + data[6];\n"
|
||||
" data[6] = scanned.w;// + data[5];\n"
|
||||
" data[5] = scanned.z + data[4];\n"
|
||||
" data[4] = scanned.z;// + data[3];\n"
|
||||
" data[3] = scanned.y + data[2];\n"
|
||||
" data[2] = scanned.y;// + data[1];\n"
|
||||
" data[1] = scanned.x + data[0];\n"
|
||||
" data[0] = scanned.x;\n"
|
||||
"\n"
|
||||
" for(int i=0; i<8; i++)\n"
|
||||
" {\n"
|
||||
" wHistogram1[8*lIdx+i] = data[i];\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"*/\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"[numthreads(WG_SIZE, 1, 1)]\n"
|
||||
"void CopyKernel( DEFAULT_ARGS )\n"
|
||||
"{\n"
|
||||
" u32 lIdx = GET_LOCAL_IDX;\n"
|
||||
" u32 wgIdx = GET_GROUP_IDX;\n"
|
||||
"\n"
|
||||
" for(uint igroup=wgIdx.x*m_nBlocksPerGroup; igroup<min2(m_totalBlocks,(wgIdx.x+1)*m_nBlocksPerGroup); igroup++)\n"
|
||||
" {\n"
|
||||
" KeyValuePair myData[4];\n"
|
||||
" uint startAddrBlock;\n"
|
||||
" { // read data\n"
|
||||
" int numLocalElements = WG_SIZE*ELEMENTS_PER_WORK_ITEM;\n"
|
||||
" startAddrBlock = lIdx*4;\n"
|
||||
" uint startAddress = igroup*numLocalElements + startAddrBlock;\n"
|
||||
"\n"
|
||||
" myData[0] = dataToSort[startAddress+0];\n"
|
||||
" myData[1] = dataToSort[startAddress+1];\n"
|
||||
" myData[2] = dataToSort[startAddress+2];\n"
|
||||
" myData[3] = dataToSort[startAddress+3];\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" {\n"
|
||||
" int numLocalElements = WG_SIZE*ELEMENTS_PER_WORK_ITEM;\n"
|
||||
" uint startAddress = igroup*numLocalElements + startAddrBlock;\n"
|
||||
"\n"
|
||||
" dataToSortOut[startAddress+0] = myData[0];\n"
|
||||
" dataToSortOut[startAddress+1] = myData[1];\n"
|
||||
" dataToSortOut[startAddress+2] = myData[2];\n"
|
||||
" dataToSortOut[startAddress+3] = myData[3];\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
;
|
||||
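The "for MAX_WG_SIZE 80" PrefixScanKernel in the string above packs up to 12 histogram entries into each thread: it sums the entries in groups of three, scans the per-thread four-component group sums with localPrefixSum128V, and then rebuilds the exclusive prefix of every entry from its group's scanned value plus the entries that precede it inside the same group. A scalar model of that packing, offered only as an illustration (the function below is hypothetical and not part of these kernels):

#include <vector>

// Exclusive prefix scan that mirrors the 12-per-thread packing of PrefixScanKernel:
// sum groups of three entries, scan the group sums, then expand within each group.
std::vector<unsigned int> packedExclusiveScan( const std::vector<unsigned int>& data )
{
	const size_t GROUP = 3;
	size_t numGroups = ( data.size() + GROUP - 1 ) / GROUP;

	std::vector<unsigned int> groupSum( numGroups, 0 );      // the uint4 components in the kernel
	for( size_t i = 0; i < data.size(); i++ )
		groupSum[ i / GROUP ] += data[i];

	std::vector<unsigned int> groupPrefix( numGroups );      // localPrefixSum128V in the kernel
	unsigned int running = 0;
	for( size_t g = 0; g < numGroups; g++ )
	{
		groupPrefix[g] = running;
		running += groupSum[g];
	}

	std::vector<unsigned int> out( data.size() );            // the data[0..11] rewrite in the kernel
	for( size_t i = 0; i < data.size(); i++ )
	{
		unsigned int v = groupPrefix[ i / GROUP ];
		for( size_t j = ( i / GROUP ) * GROUP; j < i; j++ )
			v += data[j];
		out[i] = v;
	}
	return out;
}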
@@ -0,0 +1,93 @@
|
||||
/*
|
||||
2011 Takahiro Harada
|
||||
*/
|
||||
|
||||
template<>
|
||||
class RadixSort<TYPE_HOST> : public RadixSortBase
|
||||
{
|
||||
public:
|
||||
struct Data
|
||||
{
|
||||
HostBuffer<SortData>* m_workBuffer;
|
||||
};
|
||||
|
||||
enum
|
||||
{
|
||||
BITS_PER_PASS = 8,
|
||||
NUM_TABLES = (1<<BITS_PER_PASS),
|
||||
};
|
||||
|
||||
static
|
||||
Data* allocate(const Device* deviceData, int maxSize, Option option = SORT_STANDARD)
|
||||
{
|
||||
ADLASSERT( deviceData->m_type == TYPE_HOST );
|
||||
|
||||
Data* data = new Data;
|
||||
data->m_workBuffer = new HostBuffer<SortData>( deviceData, maxSize );
|
||||
return data;
|
||||
}
|
||||
|
||||
static
|
||||
void deallocate(Data* data)
|
||||
{
|
||||
delete data->m_workBuffer;
|
||||
delete data;
|
||||
}
|
||||
|
||||
static
|
||||
void execute(Data* data, Buffer<SortData>& inout, int n, int sortBits = 32)
|
||||
{
|
||||
ADLASSERT( inout.getType() == TYPE_HOST );
|
||||
|
||||
int tables[NUM_TABLES];
|
||||
int counter[NUM_TABLES];
|
||||
|
||||
SortData* src = inout.m_ptr;
|
||||
SortData* dst = data->m_workBuffer->m_ptr;
|
||||
|
||||
int count=0;
|
||||
for(int startBit=0; startBit<sortBits; startBit+=BITS_PER_PASS)
|
||||
{
|
||||
for(int i=0; i<NUM_TABLES; i++)
|
||||
{
|
||||
tables[i] = 0;
|
||||
}
|
||||
|
||||
for(int i=0; i<n; i++)
|
||||
{
|
||||
int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES-1);
|
||||
tables[tableIdx]++;
|
||||
}
|
||||
|
||||
// prefix scan
|
||||
int sum = 0;
|
||||
for(int i=0; i<NUM_TABLES; i++)
|
||||
{
|
||||
int iData = tables[i];
|
||||
tables[i] = sum;
|
||||
sum += iData;
|
||||
counter[i] = 0;
|
||||
}
|
||||
|
||||
// distribute
|
||||
for(int i=0; i<n; i++)
|
||||
{
|
||||
int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES-1);
|
||||
|
||||
dst[tables[tableIdx] + counter[tableIdx]] = src[i];
|
||||
counter[tableIdx] ++;
|
||||
}
|
||||
|
||||
swap2( src, dst );
|
||||
count++;
|
||||
}
|
||||
|
||||
{
|
||||
if (count&1)
|
||||
//if( src != inout.m_ptr )
|
||||
{
|
||||
memcpy( dst, src, sizeof(SortData)*n );
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
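The host backend above keeps the same allocate/execute/deallocate interface as the GPU backends, using a single scratch HostBuffer and 8 bits (256 counting tables) per pass. A minimal usage sketch, assuming a host Device and a host-resident Buffer<SortData> have already been created elsewhere; only the calls shown above are used:

// Sketch only: sort n key/value pairs with the TYPE_HOST backend.
// `device` must be a host device and `pairs` a HostBuffer<SortData>, as asserted above.
void sortOnHost( const Device* device, Buffer<SortData>& pairs, int n )
{
	// scratch buffer sized once for the largest sort that will be run
	RadixSort<TYPE_HOST>::Data* data = RadixSort<TYPE_HOST>::allocate( device, n );

	// full 32-bit keys: four passes of BITS_PER_PASS = 8 bits
	RadixSort<TYPE_HOST>::execute( data, pairs, n, 32 );

	RadixSort<TYPE_HOST>::deallocate( data );
}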
@@ -0,0 +1,134 @@
|
||||
static const char* radixSortSimpleKernelsCL = \
|
||||
"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
|
||||
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
|
||||
"\n"
|
||||
"typedef unsigned int u32;\n"
|
||||
"#define GET_GROUP_IDX get_group_id(0)\n"
|
||||
"#define GET_LOCAL_IDX get_local_id(0)\n"
|
||||
"#define GET_GLOBAL_IDX get_global_id(0)\n"
|
||||
"#define GET_GROUP_SIZE get_local_size(0)\n"
|
||||
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
|
||||
"#define AtomInc(x) atom_inc(&(x))\n"
|
||||
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"#define WG_SIZE 128\n"
|
||||
"#define NUM_PER_WI 4\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"typedef struct\n"
|
||||
"{\n"
|
||||
" u32 m_key;\n"
|
||||
" u32 m_value;\n"
|
||||
"}SortData;\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"typedef struct\n"
|
||||
"{\n"
|
||||
" u32 m_startBit;\n"
|
||||
" u32 m_numGroups;\n"
|
||||
" u32 m_padding[2];\n"
|
||||
"} ConstBuffer;\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"__kernel\n"
|
||||
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
|
||||
"void LocalCountKernel(__global SortData* sortData,\n"
|
||||
" __global u32* ldsHistogramOut,\n"
|
||||
" ConstBuffer cb)\n"
|
||||
"{\n"
|
||||
" __local u32 ldsHistogram[16][256];\n"
|
||||
"\n"
|
||||
" int lIdx = GET_LOCAL_IDX;\n"
|
||||
" int gIdx = GET_GLOBAL_IDX;\n"
|
||||
"\n"
|
||||
" for(int i=0; i<16; i++)\n"
|
||||
" {\n"
|
||||
" ldsHistogram[i][lIdx] = 0.f;\n"
|
||||
" ldsHistogram[i][lIdx+128] = 0.f;\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
"\n"
|
||||
" SortData datas[NUM_PER_WI];\n"
|
||||
" datas[0] = sortData[gIdx*NUM_PER_WI+0];\n"
|
||||
" datas[1] = sortData[gIdx*NUM_PER_WI+1];\n"
|
||||
" datas[2] = sortData[gIdx*NUM_PER_WI+2];\n"
|
||||
" datas[3] = sortData[gIdx*NUM_PER_WI+3];\n"
|
||||
"\n"
|
||||
" datas[0].m_key = (datas[0].m_key >> cb.m_startBit) & 0xff;\n"
|
||||
" datas[1].m_key = (datas[1].m_key >> cb.m_startBit) & 0xff;\n"
|
||||
" datas[2].m_key = (datas[2].m_key >> cb.m_startBit) & 0xff;\n"
|
||||
" datas[3].m_key = (datas[3].m_key >> cb.m_startBit) & 0xff;\n"
|
||||
"\n"
|
||||
" int tableIdx = lIdx%16;\n"
|
||||
"\n"
|
||||
" AtomInc(ldsHistogram[tableIdx][datas[0].m_key]);\n"
|
||||
" AtomInc(ldsHistogram[tableIdx][datas[1].m_key]);\n"
|
||||
" AtomInc(ldsHistogram[tableIdx][datas[2].m_key]);\n"
|
||||
" AtomInc(ldsHistogram[tableIdx][datas[3].m_key]);\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
"\n"
|
||||
" u32 sum0, sum1;\n"
|
||||
" sum0 = sum1 = 0;\n"
|
||||
" for(int i=0; i<16; i++)\n"
|
||||
" {\n"
|
||||
" sum0 += ldsHistogram[i][lIdx];\n"
|
||||
" sum1 += ldsHistogram[i][lIdx+128];\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" ldsHistogramOut[lIdx*cb.m_numGroups+GET_GROUP_IDX] = sum0;\n"
|
||||
" ldsHistogramOut[(lIdx+128)*cb.m_numGroups+GET_GROUP_IDX] = sum1;\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"__kernel\n"
|
||||
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
|
||||
"void ScatterKernel(__global SortData* sortData,\n"
|
||||
" __global SortData* sortDataOut,\n"
|
||||
" __global u32* scannedHistogram,\n"
|
||||
" ConstBuffer cb)\n"
|
||||
"{\n"
|
||||
" __local u32 ldsCurrentLocation[256];\n"
|
||||
"\n"
|
||||
" int lIdx = GET_LOCAL_IDX;\n"
|
||||
" int gIdx = GET_GLOBAL_IDX;\n"
|
||||
"\n"
|
||||
" {\n"
|
||||
" ldsCurrentLocation[lIdx] = scannedHistogram[lIdx*cb.m_numGroups+GET_GROUP_IDX];\n"
|
||||
" ldsCurrentLocation[lIdx+128] = scannedHistogram[(lIdx+128)*cb.m_numGroups+GET_GROUP_IDX];\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
"\n"
|
||||
" SortData datas[NUM_PER_WI];\n"
|
||||
" int keys[NUM_PER_WI];\n"
|
||||
" datas[0] = sortData[gIdx*NUM_PER_WI+0];\n"
|
||||
" datas[1] = sortData[gIdx*NUM_PER_WI+1];\n"
|
||||
" datas[2] = sortData[gIdx*NUM_PER_WI+2];\n"
|
||||
" datas[3] = sortData[gIdx*NUM_PER_WI+3];\n"
|
||||
"\n"
|
||||
" keys[0] = (datas[0].m_key >> cb.m_startBit) & 0xff;\n"
|
||||
" keys[1] = (datas[1].m_key >> cb.m_startBit) & 0xff;\n"
|
||||
" keys[2] = (datas[2].m_key >> cb.m_startBit) & 0xff;\n"
|
||||
" keys[3] = (datas[3].m_key >> cb.m_startBit) & 0xff;\n"
|
||||
"\n"
|
||||
" int dst[NUM_PER_WI];\n"
|
||||
" for(int i=0; i<WG_SIZE; i++)\n"
|
||||
" {\n"
|
||||
" if( i==lIdx )\n"
|
||||
" {\n"
|
||||
" AtomInc1(ldsCurrentLocation[keys[0]], dst[0]);\n"
|
||||
" AtomInc1(ldsCurrentLocation[keys[1]], dst[1]);\n"
|
||||
" AtomInc1(ldsCurrentLocation[keys[2]], dst[2]);\n"
|
||||
" AtomInc1(ldsCurrentLocation[keys[3]], dst[3]);\n"
|
||||
" }\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
" }\n"
|
||||
" sortDataOut[dst[0]] = datas[0];\n"
|
||||
" sortDataOut[dst[1]] = datas[1];\n"
|
||||
" sortDataOut[dst[2]] = datas[2];\n"
|
||||
" sortDataOut[dst[3]] = datas[3];\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"";
|
||||
@@ -0,0 +1,131 @@
|
||||
static const char* radixSortSimpleKernelsDX11 = \
|
||||
"typedef uint u32;\n"
|
||||
"\n"
|
||||
"#define GET_GROUP_IDX groupIdx.x\n"
|
||||
"#define GET_LOCAL_IDX localIdx.x\n"
|
||||
"#define GET_GLOBAL_IDX globalIdx.x\n"
|
||||
"#define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync()\n"
|
||||
"#define DEFAULT_ARGS uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID\n"
|
||||
"#define AtomInc(x) InterlockedAdd(x, 1)\n"
|
||||
"#define AtomInc1(x, out) InterlockedAdd(x, 1, out)\n"
|
||||
"\n"
|
||||
"// takahiro end\n"
|
||||
"#define WG_SIZE 128\n"
|
||||
"#define NUM_PER_WI 4\n"
|
||||
"\n"
|
||||
"#define GET_GROUP_SIZE WG_SIZE\n"
|
||||
"\n"
|
||||
"typedef struct\n"
|
||||
"{\n"
|
||||
" u32 m_key;\n"
|
||||
" u32 m_value;\n"
|
||||
"}SortData;\n"
|
||||
"\n"
|
||||
"cbuffer SortCB : register( b0 )\n"
|
||||
"{\n"
|
||||
" u32 m_startBit;\n"
|
||||
" u32 m_numGroups;\n"
|
||||
" u32 m_padding[2];\n"
|
||||
"};\n"
|
||||
"\n"
|
||||
"StructuredBuffer<SortData> sortData : register( t0 );\n"
|
||||
"RWStructuredBuffer<u32> ldsHistogramOut : register( u0 );\n"
|
||||
"\n"
|
||||
"groupshared u32 ldsHistogram[16][256];\n"
|
||||
"\n"
|
||||
"[numthreads(WG_SIZE, 1, 1)]\n"
|
||||
"void LocalCountKernel( DEFAULT_ARGS )\n"
|
||||
"{\n"
|
||||
" int lIdx = GET_LOCAL_IDX;\n"
|
||||
" int gIdx = GET_GLOBAL_IDX;\n"
|
||||
"\n"
|
||||
" for(int i=0; i<16; i++)\n"
|
||||
" {\n"
|
||||
" ldsHistogram[i][lIdx] = 0.f;\n"
|
||||
" ldsHistogram[i][lIdx+128] = 0.f;\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
"\n"
|
||||
" SortData datas[NUM_PER_WI];\n"
|
||||
" datas[0] = sortData[gIdx*NUM_PER_WI+0];\n"
|
||||
" datas[1] = sortData[gIdx*NUM_PER_WI+1];\n"
|
||||
" datas[2] = sortData[gIdx*NUM_PER_WI+2];\n"
|
||||
" datas[3] = sortData[gIdx*NUM_PER_WI+3];\n"
|
||||
"\n"
|
||||
" datas[0].m_key = (datas[0].m_key >> m_startBit) & 0xff;\n"
|
||||
" datas[1].m_key = (datas[1].m_key >> m_startBit) & 0xff;\n"
|
||||
" datas[2].m_key = (datas[2].m_key >> m_startBit) & 0xff;\n"
|
||||
" datas[3].m_key = (datas[3].m_key >> m_startBit) & 0xff;\n"
|
||||
"\n"
|
||||
" int tableIdx = lIdx%16;\n"
|
||||
"\n"
|
||||
" AtomInc(ldsHistogram[tableIdx][datas[0].m_key]);\n"
|
||||
" AtomInc(ldsHistogram[tableIdx][datas[1].m_key]);\n"
|
||||
" AtomInc(ldsHistogram[tableIdx][datas[2].m_key]);\n"
|
||||
" AtomInc(ldsHistogram[tableIdx][datas[3].m_key]);\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
"\n"
|
||||
" u32 sum0, sum1;\n"
|
||||
" sum0 = sum1 = 0;\n"
|
||||
" for(int i=0; i<16; i++)\n"
|
||||
" {\n"
|
||||
" sum0 += ldsHistogram[i][lIdx];\n"
|
||||
" sum1 += ldsHistogram[i][lIdx+128];\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" ldsHistogramOut[lIdx*m_numGroups+GET_GROUP_IDX] = sum0;\n"
|
||||
" ldsHistogramOut[(lIdx+128)*m_numGroups+GET_GROUP_IDX] = sum1;\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"RWStructuredBuffer<SortData> sortDataOut : register( u0 );\n"
|
||||
"RWStructuredBuffer<u32> scannedHistogram : register( u1 );\n"
|
||||
"\n"
|
||||
"groupshared u32 ldsCurrentLocation[256];\n"
|
||||
"\n"
|
||||
"[numthreads(WG_SIZE, 1, 1)]\n"
|
||||
"void ScatterKernel( DEFAULT_ARGS )\n"
|
||||
"{\n"
|
||||
" int lIdx = GET_LOCAL_IDX;\n"
|
||||
" int gIdx = GET_GLOBAL_IDX;\n"
|
||||
"\n"
|
||||
" {\n"
|
||||
" ldsCurrentLocation[lIdx] = scannedHistogram[lIdx*m_numGroups+GET_GROUP_IDX];\n"
|
||||
" ldsCurrentLocation[lIdx+128] = scannedHistogram[(lIdx+128)*m_numGroups+GET_GROUP_IDX];\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
"\n"
|
||||
" SortData datas[NUM_PER_WI];\n"
|
||||
" int keys[NUM_PER_WI];\n"
|
||||
" datas[0] = sortData[gIdx*NUM_PER_WI+0];\n"
|
||||
" datas[1] = sortData[gIdx*NUM_PER_WI+1];\n"
|
||||
" datas[2] = sortData[gIdx*NUM_PER_WI+2];\n"
|
||||
" datas[3] = sortData[gIdx*NUM_PER_WI+3];\n"
|
||||
"\n"
|
||||
" keys[0] = (datas[0].m_key >> m_startBit) & 0xff;\n"
|
||||
" keys[1] = (datas[1].m_key >> m_startBit) & 0xff;\n"
|
||||
" keys[2] = (datas[2].m_key >> m_startBit) & 0xff;\n"
|
||||
" keys[3] = (datas[3].m_key >> m_startBit) & 0xff;\n"
|
||||
"\n"
|
||||
" int dst[NUM_PER_WI];\n"
|
||||
" for(int i=0; i<WG_SIZE; i++)\n"
|
||||
"// for(int i=0; i<m_padding[0]; i++) // to reduce compile time\n"
|
||||
" {\n"
|
||||
" if( i==lIdx )\n"
|
||||
" {\n"
|
||||
" AtomInc1(ldsCurrentLocation[keys[0]], dst[0]);\n"
|
||||
" AtomInc1(ldsCurrentLocation[keys[1]], dst[1]);\n"
|
||||
" AtomInc1(ldsCurrentLocation[keys[2]], dst[2]);\n"
|
||||
" AtomInc1(ldsCurrentLocation[keys[3]], dst[3]);\n"
|
||||
" }\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
" }\n"
|
||||
" sortDataOut[dst[0]] = datas[0];\n"
|
||||
" sortDataOut[dst[1]] = datas[1];\n"
|
||||
" sortDataOut[dst[2]] = datas[2];\n"
|
||||
" sortDataOut[dst[3]] = datas[3];\n"
|
||||
"}\n"
|
||||
"";
|
||||
@@ -0,0 +1,147 @@
|
||||
/*
|
||||
Bullet Continuous Collision Detection and Physics Library
|
||||
Copyright (c) 2011 Advanced Micro Devices, Inc. http://bulletphysics.org
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
//Author Takahiro Harada
|
||||
|
||||
#pragma OPENCL EXTENSION cl_amd_printf : enable
|
||||
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
|
||||
|
||||
typedef unsigned int u32;
|
||||
#define GET_GROUP_IDX get_group_id(0)
|
||||
#define GET_LOCAL_IDX get_local_id(0)
|
||||
#define GET_GLOBAL_IDX get_global_id(0)
|
||||
#define GET_GROUP_SIZE get_local_size(0)
|
||||
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
|
||||
#define AtomInc(x) atom_inc(&(x))
|
||||
#define AtomInc1(x, out) out = atom_inc(&(x))
|
||||
|
||||
|
||||
#define WG_SIZE 128
|
||||
#define NUM_PER_WI 4
|
||||
|
||||
|
||||
typedef struct
|
||||
{
|
||||
u32 m_key;
|
||||
u32 m_value;
|
||||
}SortData;
|
||||
|
||||
|
||||
typedef struct
|
||||
{
|
||||
u32 m_startBit;
|
||||
u32 m_numGroups;
|
||||
u32 m_padding[2];
|
||||
} ConstBuffer;
|
||||
|
||||
|
||||
__kernel
|
||||
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
|
||||
void LocalCountKernel(__global SortData* sortData,
|
||||
__global u32* ldsHistogramOut,
|
||||
ConstBuffer cb)
|
||||
{
|
||||
__local u32 ldsHistogram[16][256];
|
||||
|
||||
int lIdx = GET_LOCAL_IDX;
|
||||
int gIdx = GET_GLOBAL_IDX;
|
||||
|
||||
for(int i=0; i<16; i++)
|
||||
{
|
||||
		ldsHistogram[i][lIdx] = 0;
|
||||
		ldsHistogram[i][lIdx+128] = 0;
|
||||
}
|
||||
|
||||
GROUP_LDS_BARRIER;
|
||||
|
||||
SortData datas[NUM_PER_WI];
|
||||
datas[0] = sortData[gIdx*NUM_PER_WI+0];
|
||||
datas[1] = sortData[gIdx*NUM_PER_WI+1];
|
||||
datas[2] = sortData[gIdx*NUM_PER_WI+2];
|
||||
datas[3] = sortData[gIdx*NUM_PER_WI+3];
|
||||
|
||||
datas[0].m_key = (datas[0].m_key >> cb.m_startBit) & 0xff;
|
||||
datas[1].m_key = (datas[1].m_key >> cb.m_startBit) & 0xff;
|
||||
datas[2].m_key = (datas[2].m_key >> cb.m_startBit) & 0xff;
|
||||
datas[3].m_key = (datas[3].m_key >> cb.m_startBit) & 0xff;
|
||||
|
||||
int tableIdx = lIdx%16;
|
||||
|
||||
AtomInc(ldsHistogram[tableIdx][datas[0].m_key]);
|
||||
AtomInc(ldsHistogram[tableIdx][datas[1].m_key]);
|
||||
AtomInc(ldsHistogram[tableIdx][datas[2].m_key]);
|
||||
AtomInc(ldsHistogram[tableIdx][datas[3].m_key]);
|
||||
|
||||
GROUP_LDS_BARRIER;
|
||||
|
||||
u32 sum0, sum1;
|
||||
sum0 = sum1 = 0;
|
||||
for(int i=0; i<16; i++)
|
||||
{
|
||||
sum0 += ldsHistogram[i][lIdx];
|
||||
sum1 += ldsHistogram[i][lIdx+128];
|
||||
}
|
||||
|
||||
ldsHistogramOut[lIdx*cb.m_numGroups+GET_GROUP_IDX] = sum0;
|
||||
ldsHistogramOut[(lIdx+128)*cb.m_numGroups+GET_GROUP_IDX] = sum1;
|
||||
}
|
||||
|
||||
__kernel
|
||||
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
|
||||
void ScatterKernel(__global SortData* sortData,
|
||||
__global SortData* sortDataOut,
|
||||
__global u32* scannedHistogram,
|
||||
ConstBuffer cb)
|
||||
{
|
||||
__local u32 ldsCurrentLocation[256];
|
||||
|
||||
int lIdx = GET_LOCAL_IDX;
|
||||
int gIdx = GET_GLOBAL_IDX;
|
||||
|
||||
{
|
||||
ldsCurrentLocation[lIdx] = scannedHistogram[lIdx*cb.m_numGroups+GET_GROUP_IDX];
|
||||
ldsCurrentLocation[lIdx+128] = scannedHistogram[(lIdx+128)*cb.m_numGroups+GET_GROUP_IDX];
|
||||
}
|
||||
|
||||
GROUP_LDS_BARRIER;
|
||||
|
||||
SortData datas[NUM_PER_WI];
|
||||
int keys[NUM_PER_WI];
|
||||
datas[0] = sortData[gIdx*NUM_PER_WI+0];
|
||||
datas[1] = sortData[gIdx*NUM_PER_WI+1];
|
||||
datas[2] = sortData[gIdx*NUM_PER_WI+2];
|
||||
datas[3] = sortData[gIdx*NUM_PER_WI+3];
|
||||
|
||||
keys[0] = (datas[0].m_key >> cb.m_startBit) & 0xff;
|
||||
keys[1] = (datas[1].m_key >> cb.m_startBit) & 0xff;
|
||||
keys[2] = (datas[2].m_key >> cb.m_startBit) & 0xff;
|
||||
keys[3] = (datas[3].m_key >> cb.m_startBit) & 0xff;
|
||||
|
||||
int dst[NUM_PER_WI];
|
||||
for(int i=0; i<WG_SIZE; i++)
|
||||
{
|
||||
if( i==lIdx )
|
||||
{
|
||||
AtomInc1(ldsCurrentLocation[keys[0]], dst[0]);
|
||||
AtomInc1(ldsCurrentLocation[keys[1]], dst[1]);
|
||||
AtomInc1(ldsCurrentLocation[keys[2]], dst[2]);
|
||||
AtomInc1(ldsCurrentLocation[keys[3]], dst[3]);
|
||||
}
|
||||
GROUP_LDS_BARRIER;
|
||||
}
|
||||
sortDataOut[dst[0]] = datas[0];
|
||||
sortDataOut[dst[1]] = datas[1];
|
||||
sortDataOut[dst[2]] = datas[2];
|
||||
sortDataOut[dst[3]] = datas[3];
|
||||
}
|
||||
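The two kernels in the file above implement one 8-bit counting pass. LocalCountKernel stores its 256-bucket histogram radix-major (bucket*m_numGroups + group), so one exclusive prefix scan over that flat array gives every (bucket, group) pair its global output offset; ScatterKernel then claims destinations through the serialized if(i==lIdx) loop, which preserves the input order of equal keys and keeps the pass stable. A serial reference of the same pass, a sketch under the assumption that the key layout matches the kernels (the function name is hypothetical):

#include <vector>

struct SortData { unsigned int m_key, m_value; };   // mirrors the kernel struct

// One 8-bit pass: histogram, exclusive scan, stable scatter.
void countingPassReference( const SortData* src, SortData* dst, int n, int startBit )
{
	const int NUM_BUCKETS = 256;
	std::vector<int> bucketStart( NUM_BUCKETS, 0 );

	for( int i = 0; i < n; i++ )                              // LocalCountKernel
		bucketStart[ ( src[i].m_key >> startBit ) & 0xff ]++;

	int running = 0;                                          // the exclusive scan ScatterKernel reads as scannedHistogram
	for( int b = 0; b < NUM_BUCKETS; b++ )
	{
		int count = bucketStart[b];
		bucketStart[b] = running;
		running += count;
	}

	for( int i = 0; i < n; i++ )                              // ScatterKernel, claims in input order
		dst[ bucketStart[ ( src[i].m_key >> startBit ) & 0xff ]++ ] = src[i];
}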
@@ -0,0 +1,133 @@
|
||||
/*
|
||||
2011 Takahiro Harada
|
||||
*/
|
||||
|
||||
typedef uint u32;
|
||||
|
||||
#define GET_GROUP_IDX groupIdx.x
|
||||
#define GET_LOCAL_IDX localIdx.x
|
||||
#define GET_GLOBAL_IDX globalIdx.x
|
||||
#define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync()
|
||||
#define DEFAULT_ARGS uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID
|
||||
#define AtomInc(x) InterlockedAdd(x, 1)
|
||||
#define AtomInc1(x, out) InterlockedAdd(x, 1, out)
|
||||
|
||||
// takahiro end
|
||||
#define WG_SIZE 128
|
||||
#define NUM_PER_WI 4
|
||||
|
||||
#define GET_GROUP_SIZE WG_SIZE
|
||||
|
||||
typedef struct
|
||||
{
|
||||
u32 m_key;
|
||||
u32 m_value;
|
||||
}SortData;
|
||||
|
||||
cbuffer SortCB : register( b0 )
|
||||
{
|
||||
u32 m_startBit;
|
||||
u32 m_numGroups;
|
||||
u32 m_padding[2];
|
||||
};
|
||||
|
||||
StructuredBuffer<SortData> sortData : register( t0 );
|
||||
RWStructuredBuffer<u32> ldsHistogramOut : register( u0 );
|
||||
|
||||
groupshared u32 ldsHistogram[16][256];
|
||||
|
||||
[numthreads(WG_SIZE, 1, 1)]
|
||||
void LocalCountKernel( DEFAULT_ARGS )
|
||||
{
|
||||
int lIdx = GET_LOCAL_IDX;
|
||||
int gIdx = GET_GLOBAL_IDX;
|
||||
|
||||
for(int i=0; i<16; i++)
|
||||
{
|
||||
		ldsHistogram[i][lIdx] = 0;
|
||||
		ldsHistogram[i][lIdx+128] = 0;
|
||||
}
|
||||
|
||||
GROUP_LDS_BARRIER;
|
||||
|
||||
SortData datas[NUM_PER_WI];
|
||||
datas[0] = sortData[gIdx*NUM_PER_WI+0];
|
||||
datas[1] = sortData[gIdx*NUM_PER_WI+1];
|
||||
datas[2] = sortData[gIdx*NUM_PER_WI+2];
|
||||
datas[3] = sortData[gIdx*NUM_PER_WI+3];
|
||||
|
||||
datas[0].m_key = (datas[0].m_key >> m_startBit) & 0xff;
|
||||
datas[1].m_key = (datas[1].m_key >> m_startBit) & 0xff;
|
||||
datas[2].m_key = (datas[2].m_key >> m_startBit) & 0xff;
|
||||
datas[3].m_key = (datas[3].m_key >> m_startBit) & 0xff;
|
||||
|
||||
int tableIdx = lIdx%16;
|
||||
|
||||
AtomInc(ldsHistogram[tableIdx][datas[0].m_key]);
|
||||
AtomInc(ldsHistogram[tableIdx][datas[1].m_key]);
|
||||
AtomInc(ldsHistogram[tableIdx][datas[2].m_key]);
|
||||
AtomInc(ldsHistogram[tableIdx][datas[3].m_key]);
|
||||
|
||||
GROUP_LDS_BARRIER;
|
||||
|
||||
u32 sum0, sum1;
|
||||
sum0 = sum1 = 0;
|
||||
for(int i=0; i<16; i++)
|
||||
{
|
||||
sum0 += ldsHistogram[i][lIdx];
|
||||
sum1 += ldsHistogram[i][lIdx+128];
|
||||
}
|
||||
|
||||
ldsHistogramOut[lIdx*m_numGroups+GET_GROUP_IDX] = sum0;
|
||||
ldsHistogramOut[(lIdx+128)*m_numGroups+GET_GROUP_IDX] = sum1;
|
||||
}
|
||||
|
||||
|
||||
RWStructuredBuffer<SortData> sortDataOut : register( u0 );
|
||||
RWStructuredBuffer<u32> scannedHistogram : register( u1 );
|
||||
|
||||
groupshared u32 ldsCurrentLocation[256];
|
||||
|
||||
[numthreads(WG_SIZE, 1, 1)]
|
||||
void ScatterKernel( DEFAULT_ARGS )
|
||||
{
|
||||
int lIdx = GET_LOCAL_IDX;
|
||||
int gIdx = GET_GLOBAL_IDX;
|
||||
|
||||
{
|
||||
ldsCurrentLocation[lIdx] = scannedHistogram[lIdx*m_numGroups+GET_GROUP_IDX];
|
||||
ldsCurrentLocation[lIdx+128] = scannedHistogram[(lIdx+128)*m_numGroups+GET_GROUP_IDX];
|
||||
}
|
||||
|
||||
GROUP_LDS_BARRIER;
|
||||
|
||||
SortData datas[NUM_PER_WI];
|
||||
int keys[NUM_PER_WI];
|
||||
datas[0] = sortData[gIdx*NUM_PER_WI+0];
|
||||
datas[1] = sortData[gIdx*NUM_PER_WI+1];
|
||||
datas[2] = sortData[gIdx*NUM_PER_WI+2];
|
||||
datas[3] = sortData[gIdx*NUM_PER_WI+3];
|
||||
|
||||
keys[0] = (datas[0].m_key >> m_startBit) & 0xff;
|
||||
keys[1] = (datas[1].m_key >> m_startBit) & 0xff;
|
||||
keys[2] = (datas[2].m_key >> m_startBit) & 0xff;
|
||||
keys[3] = (datas[3].m_key >> m_startBit) & 0xff;
|
||||
|
||||
int dst[NUM_PER_WI];
|
||||
for(int i=0; i<WG_SIZE; i++)
|
||||
// for(int i=0; i<m_padding[0]; i++) // to reduce compile time
|
||||
{
|
||||
if( i==lIdx )
|
||||
{
|
||||
AtomInc1(ldsCurrentLocation[keys[0]], dst[0]);
|
||||
AtomInc1(ldsCurrentLocation[keys[1]], dst[1]);
|
||||
AtomInc1(ldsCurrentLocation[keys[2]], dst[2]);
|
||||
AtomInc1(ldsCurrentLocation[keys[3]], dst[3]);
|
||||
}
|
||||
GROUP_LDS_BARRIER;
|
||||
}
|
||||
sortDataOut[dst[0]] = datas[0];
|
||||
sortDataOut[dst[1]] = datas[1];
|
||||
sortDataOut[dst[2]] = datas[2];
|
||||
sortDataOut[dst[3]] = datas[3];
|
||||
}
|
||||
@@ -0,0 +1,149 @@
|
||||
static const char* radixSortSimpleKernelsCL= \
|
||||
"/*\n"
|
||||
"Bullet Continuous Collision Detection and Physics Library\n"
|
||||
"Copyright (c) 2011 Advanced Micro Devices, Inc. http://bulletphysics.org\n"
|
||||
"\n"
|
||||
"This software is provided 'as-is', without any express or implied warranty.\n"
|
||||
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
|
||||
"Permission is granted to anyone to use this software for any purpose, \n"
|
||||
"including commercial applications, and to alter it and redistribute it freely, \n"
|
||||
"subject to the following restrictions:\n"
|
||||
"\n"
|
||||
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
|
||||
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
|
||||
"3. This notice may not be removed or altered from any source distribution.\n"
|
||||
"*/\n"
|
||||
"//Author Takahiro Harada\n"
|
||||
"\n"
|
||||
"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
|
||||
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
|
||||
"\n"
|
||||
"typedef unsigned int u32;\n"
|
||||
"#define GET_GROUP_IDX get_group_id(0)\n"
|
||||
"#define GET_LOCAL_IDX get_local_id(0)\n"
|
||||
"#define GET_GLOBAL_IDX get_global_id(0)\n"
|
||||
"#define GET_GROUP_SIZE get_local_size(0)\n"
|
||||
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
|
||||
"#define AtomInc(x) atom_inc(&(x))\n"
|
||||
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"#define WG_SIZE 128\n"
|
||||
"#define NUM_PER_WI 4\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"typedef struct\n"
|
||||
"{\n"
|
||||
" u32 m_key; \n"
|
||||
" u32 m_value;\n"
|
||||
"}SortData;\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"typedef struct\n"
|
||||
"{\n"
|
||||
" u32 m_startBit;\n"
|
||||
" u32 m_numGroups;\n"
|
||||
" u32 m_padding[2];\n"
|
||||
"} ConstBuffer;\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"__kernel\n"
|
||||
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
|
||||
"void LocalCountKernel(__global SortData* sortData, \n"
|
||||
" __global u32* ldsHistogramOut,\n"
|
||||
" ConstBuffer cb)\n"
|
||||
"{\n"
|
||||
" __local u32 ldsHistogram[16][256];\n"
|
||||
"\n"
|
||||
" int lIdx = GET_LOCAL_IDX;\n"
|
||||
" int gIdx = GET_GLOBAL_IDX;\n"
|
||||
" \n"
|
||||
" for(int i=0; i<16; i++)\n"
|
||||
" {\n"
|
||||
" ldsHistogram[i][lIdx] = 0.f;\n"
|
||||
" ldsHistogram[i][lIdx+128] = 0.f;\n"
|
||||
" }\n"
|
||||
" \n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
" \n"
|
||||
" SortData datas[NUM_PER_WI];\n"
|
||||
" datas[0] = sortData[gIdx*NUM_PER_WI+0];\n"
|
||||
" datas[1] = sortData[gIdx*NUM_PER_WI+1];\n"
|
||||
" datas[2] = sortData[gIdx*NUM_PER_WI+2];\n"
|
||||
" datas[3] = sortData[gIdx*NUM_PER_WI+3];\n"
|
||||
"\n"
|
||||
" datas[0].m_key = (datas[0].m_key >> cb.m_startBit) & 0xff;\n"
|
||||
" datas[1].m_key = (datas[1].m_key >> cb.m_startBit) & 0xff;\n"
|
||||
" datas[2].m_key = (datas[2].m_key >> cb.m_startBit) & 0xff;\n"
|
||||
" datas[3].m_key = (datas[3].m_key >> cb.m_startBit) & 0xff;\n"
|
||||
"\n"
|
||||
" int tableIdx = lIdx%16;\n"
|
||||
" \n"
|
||||
" AtomInc(ldsHistogram[tableIdx][datas[0].m_key]);\n"
|
||||
" AtomInc(ldsHistogram[tableIdx][datas[1].m_key]);\n"
|
||||
" AtomInc(ldsHistogram[tableIdx][datas[2].m_key]);\n"
|
||||
" AtomInc(ldsHistogram[tableIdx][datas[3].m_key]);\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
" \n"
|
||||
" u32 sum0, sum1;\n"
|
||||
" sum0 = sum1 = 0;\n"
|
||||
" for(int i=0; i<16; i++)\n"
|
||||
" {\n"
|
||||
" sum0 += ldsHistogram[i][lIdx];\n"
|
||||
" sum1 += ldsHistogram[i][lIdx+128];\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" ldsHistogramOut[lIdx*cb.m_numGroups+GET_GROUP_IDX] = sum0;\n"
|
||||
" ldsHistogramOut[(lIdx+128)*cb.m_numGroups+GET_GROUP_IDX] = sum1;\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"__kernel\n"
|
||||
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
|
||||
"void ScatterKernel(__global SortData* sortData,\n"
|
||||
" __global SortData* sortDataOut,\n"
|
||||
" __global u32* scannedHistogram, \n"
|
||||
" ConstBuffer cb)\n"
|
||||
"{\n"
|
||||
" __local u32 ldsCurrentLocation[256];\n"
|
||||
"\n"
|
||||
" int lIdx = GET_LOCAL_IDX;\n"
|
||||
" int gIdx = GET_GLOBAL_IDX;\n"
|
||||
" \n"
|
||||
" {\n"
|
||||
" ldsCurrentLocation[lIdx] = scannedHistogram[lIdx*cb.m_numGroups+GET_GROUP_IDX];\n"
|
||||
" ldsCurrentLocation[lIdx+128] = scannedHistogram[(lIdx+128)*cb.m_numGroups+GET_GROUP_IDX];\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
" \n"
|
||||
" SortData datas[NUM_PER_WI];\n"
|
||||
" int keys[NUM_PER_WI];\n"
|
||||
" datas[0] = sortData[gIdx*NUM_PER_WI+0];\n"
|
||||
" datas[1] = sortData[gIdx*NUM_PER_WI+1];\n"
|
||||
" datas[2] = sortData[gIdx*NUM_PER_WI+2];\n"
|
||||
" datas[3] = sortData[gIdx*NUM_PER_WI+3];\n"
|
||||
"\n"
|
||||
" keys[0] = (datas[0].m_key >> cb.m_startBit) & 0xff;\n"
|
||||
" keys[1] = (datas[1].m_key >> cb.m_startBit) & 0xff;\n"
|
||||
" keys[2] = (datas[2].m_key >> cb.m_startBit) & 0xff;\n"
|
||||
" keys[3] = (datas[3].m_key >> cb.m_startBit) & 0xff;\n"
|
||||
"\n"
|
||||
" int dst[NUM_PER_WI];\n"
|
||||
" for(int i=0; i<WG_SIZE; i++)\n"
|
||||
" {\n"
|
||||
" if( i==lIdx )\n"
|
||||
" {\n"
|
||||
" AtomInc1(ldsCurrentLocation[keys[0]], dst[0]);\n"
|
||||
" AtomInc1(ldsCurrentLocation[keys[1]], dst[1]);\n"
|
||||
" AtomInc1(ldsCurrentLocation[keys[2]], dst[2]);\n"
|
||||
" AtomInc1(ldsCurrentLocation[keys[3]], dst[3]);\n"
|
||||
" }\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
" }\n"
|
||||
" sortDataOut[dst[0]] = datas[0];\n"
|
||||
" sortDataOut[dst[1]] = datas[1];\n"
|
||||
" sortDataOut[dst[2]] = datas[2];\n"
|
||||
" sortDataOut[dst[3]] = datas[3];\n"
|
||||
"}\n"
|
||||
;
|
||||
@@ -0,0 +1,135 @@
|
||||
static const char* radixSortSimpleKernelsDX11= \
|
||||
"/*\n"
|
||||
" 2011 Takahiro Harada\n"
|
||||
"*/\n"
|
||||
"\n"
|
||||
"typedef uint u32;\n"
|
||||
"\n"
|
||||
"#define GET_GROUP_IDX groupIdx.x\n"
|
||||
"#define GET_LOCAL_IDX localIdx.x\n"
|
||||
"#define GET_GLOBAL_IDX globalIdx.x\n"
|
||||
"#define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync()\n"
|
||||
"#define DEFAULT_ARGS uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID\n"
|
||||
"#define AtomInc(x) InterlockedAdd(x, 1)\n"
|
||||
"#define AtomInc1(x, out) InterlockedAdd(x, 1, out)\n"
|
||||
"\n"
|
||||
"// takahiro end\n"
|
||||
"#define WG_SIZE 128\n"
|
||||
"#define NUM_PER_WI 4\n"
|
||||
"\n"
|
||||
"#define GET_GROUP_SIZE WG_SIZE\n"
|
||||
"\n"
|
||||
"typedef struct\n"
|
||||
"{\n"
|
||||
" u32 m_key; \n"
|
||||
" u32 m_value;\n"
|
||||
"}SortData;\n"
|
||||
"\n"
|
||||
"cbuffer SortCB : register( b0 )\n"
|
||||
"{\n"
|
||||
" u32 m_startBit;\n"
|
||||
" u32 m_numGroups;\n"
|
||||
" u32 m_padding[2];\n"
|
||||
"};\n"
|
||||
" \n"
|
||||
"StructuredBuffer<SortData> sortData : register( t0 );\n"
|
||||
"RWStructuredBuffer<u32> ldsHistogramOut : register( u0 );\n"
|
||||
"\n"
|
||||
"groupshared u32 ldsHistogram[16][256];\n"
|
||||
"\n"
|
||||
"[numthreads(WG_SIZE, 1, 1)]\n"
|
||||
"void LocalCountKernel( DEFAULT_ARGS )\n"
|
||||
"{\n"
|
||||
" int lIdx = GET_LOCAL_IDX;\n"
|
||||
" int gIdx = GET_GLOBAL_IDX;\n"
|
||||
" \n"
|
||||
" for(int i=0; i<16; i++)\n"
|
||||
" {\n"
|
||||
" ldsHistogram[i][lIdx] = 0.f;\n"
|
||||
" ldsHistogram[i][lIdx+128] = 0.f;\n"
|
||||
" }\n"
|
||||
" \n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
" \n"
|
||||
" SortData datas[NUM_PER_WI];\n"
|
||||
" datas[0] = sortData[gIdx*NUM_PER_WI+0];\n"
|
||||
" datas[1] = sortData[gIdx*NUM_PER_WI+1];\n"
|
||||
" datas[2] = sortData[gIdx*NUM_PER_WI+2];\n"
|
||||
" datas[3] = sortData[gIdx*NUM_PER_WI+3];\n"
|
||||
"\n"
|
||||
" datas[0].m_key = (datas[0].m_key >> m_startBit) & 0xff;\n"
|
||||
" datas[1].m_key = (datas[1].m_key >> m_startBit) & 0xff;\n"
|
||||
" datas[2].m_key = (datas[2].m_key >> m_startBit) & 0xff;\n"
|
||||
" datas[3].m_key = (datas[3].m_key >> m_startBit) & 0xff;\n"
|
||||
"\n"
|
||||
" int tableIdx = lIdx%16;\n"
|
||||
" \n"
|
||||
" AtomInc(ldsHistogram[tableIdx][datas[0].m_key]);\n"
|
||||
" AtomInc(ldsHistogram[tableIdx][datas[1].m_key]);\n"
|
||||
" AtomInc(ldsHistogram[tableIdx][datas[2].m_key]);\n"
|
||||
" AtomInc(ldsHistogram[tableIdx][datas[3].m_key]);\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
" \n"
|
||||
" u32 sum0, sum1;\n"
|
||||
" sum0 = sum1 = 0;\n"
|
||||
" for(int i=0; i<16; i++)\n"
|
||||
" {\n"
|
||||
" sum0 += ldsHistogram[i][lIdx];\n"
|
||||
" sum1 += ldsHistogram[i][lIdx+128];\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" ldsHistogramOut[lIdx*m_numGroups+GET_GROUP_IDX] = sum0;\n"
|
||||
" ldsHistogramOut[(lIdx+128)*m_numGroups+GET_GROUP_IDX] = sum1;\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"RWStructuredBuffer<SortData> sortDataOut : register( u0 );\n"
|
||||
"RWStructuredBuffer<u32> scannedHistogram : register( u1 );\n"
|
||||
"\n"
|
||||
"groupshared u32 ldsCurrentLocation[256];\n"
|
||||
"\n"
|
||||
"[numthreads(WG_SIZE, 1, 1)]\n"
|
||||
"void ScatterKernel( DEFAULT_ARGS )\n"
|
||||
"{\n"
|
||||
" int lIdx = GET_LOCAL_IDX;\n"
|
||||
" int gIdx = GET_GLOBAL_IDX;\n"
|
||||
" \n"
|
||||
" {\n"
|
||||
" ldsCurrentLocation[lIdx] = scannedHistogram[lIdx*m_numGroups+GET_GROUP_IDX];\n"
|
||||
" ldsCurrentLocation[lIdx+128] = scannedHistogram[(lIdx+128)*m_numGroups+GET_GROUP_IDX];\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
" \n"
|
||||
" SortData datas[NUM_PER_WI];\n"
|
||||
" int keys[NUM_PER_WI];\n"
|
||||
" datas[0] = sortData[gIdx*NUM_PER_WI+0];\n"
|
||||
" datas[1] = sortData[gIdx*NUM_PER_WI+1];\n"
|
||||
" datas[2] = sortData[gIdx*NUM_PER_WI+2];\n"
|
||||
" datas[3] = sortData[gIdx*NUM_PER_WI+3];\n"
|
||||
"\n"
|
||||
" keys[0] = (datas[0].m_key >> m_startBit) & 0xff;\n"
|
||||
" keys[1] = (datas[1].m_key >> m_startBit) & 0xff;\n"
|
||||
" keys[2] = (datas[2].m_key >> m_startBit) & 0xff;\n"
|
||||
" keys[3] = (datas[3].m_key >> m_startBit) & 0xff;\n"
|
||||
"\n"
|
||||
" int dst[NUM_PER_WI];\n"
|
||||
" for(int i=0; i<WG_SIZE; i++)\n"
|
||||
"// for(int i=0; i<m_padding[0]; i++) // to reduce compile time\n"
|
||||
" {\n"
|
||||
" if( i==lIdx )\n"
|
||||
" {\n"
|
||||
" AtomInc1(ldsCurrentLocation[keys[0]], dst[0]);\n"
|
||||
" AtomInc1(ldsCurrentLocation[keys[1]], dst[1]);\n"
|
||||
" AtomInc1(ldsCurrentLocation[keys[2]], dst[2]);\n"
|
||||
" AtomInc1(ldsCurrentLocation[keys[3]], dst[3]);\n"
|
||||
" }\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
" }\n"
|
||||
" sortDataOut[dst[0]] = datas[0];\n"
|
||||
" sortDataOut[dst[1]] = datas[1];\n"
|
||||
" sortDataOut[dst[2]] = datas[2];\n"
|
||||
" sortDataOut[dst[3]] = datas[3];\n"
|
||||
"}\n"
|
||||
;
|
||||
@@ -0,0 +1,177 @@
|
||||
/*
|
||||
2011 Takahiro Harada
|
||||
*/
|
||||
|
||||
#define PATH "..\\..\\opencl\\primitives\\AdlPrimitives\\Sort\\RadixSortStandardKernels"
|
||||
#define KERNEL0 "LocalSortKernel"
|
||||
#define KERNEL1 "ScatterKernel"
|
||||
#define KERNEL2 "CopyKernel"
|
||||
|
||||
#include <AdlPrimitives/Sort/RadixSortStandardKernelsCL.h>
|
||||
#include <AdlPrimitives/Sort/RadixSortStandardKernelsDX11.h>
|
||||
|
||||
template<DeviceType type>
|
||||
class RadixSortStandard : public RadixSortBase
|
||||
{
|
||||
public:
|
||||
typedef Launcher::BufferInfo BufferInfo;
|
||||
|
||||
enum
|
||||
{
|
||||
WG_SIZE = 128,
|
||||
NUM_PER_WI = 4,
|
||||
|
||||
BITS_PER_PASS = 4,
|
||||
};
|
||||
|
||||
struct Data : public RadixSort<type>::Data
|
||||
{
|
||||
Kernel* m_localSortKernel;
|
||||
Kernel* m_scatterKernel;
|
||||
Kernel* m_copyKernel;
|
||||
|
||||
Buffer<u32>* m_workBuffer0;
|
||||
Buffer<u32>* m_workBuffer1;
|
||||
Buffer<u32>* m_workBuffer2;
|
||||
Buffer<SortData>* m_workBuffer3;
|
||||
Buffer<int4>* m_constBuffer[32/BITS_PER_PASS];
|
||||
};
|
||||
|
||||
|
||||
static
|
||||
Data* allocate(const Device* deviceData, int maxSize, Option option = SORT_NORMAL);
|
||||
|
||||
static
|
||||
void deallocate(void* data);
|
||||
|
||||
static
|
||||
void execute(void* data, Buffer<SortData>& inout, int n, int sortBits);
|
||||
};
|
||||
|
||||
template<DeviceType type>
|
||||
typename RadixSortStandard<type>::Data* RadixSortStandard<type>::allocate(const Device* deviceData, int maxSize, Option option)
|
||||
{
|
||||
ADLASSERT( type == deviceData->m_type );
|
||||
|
||||
u32 maxNumGroups = (maxSize+WG_SIZE*NUM_PER_WI-1)/(WG_SIZE*NUM_PER_WI);
|
||||
|
||||
const char* src[] =
|
||||
#if defined(ADL_LOAD_KERNEL_FROM_STRING)
|
||||
{radixSortStandardKernelsCL,radixSortStandardKernelsDX11};
|
||||
// ADLASSERT(0);
|
||||
#else
|
||||
{0,0};
|
||||
#endif
|
||||
|
||||
Data* data = new Data;
|
||||
data->m_option = option;
|
||||
data->m_deviceData = deviceData;
|
||||
|
||||
data->m_localSortKernel = deviceData->getKernel( PATH, KERNEL0, 0, src[type] );
|
||||
data->m_scatterKernel = deviceData->getKernel( PATH, KERNEL1, 0, src[type] );
|
||||
data->m_copyKernel = deviceData->getKernel( PATH, KERNEL2, 0, src[type] );
|
||||
|
||||
// is this correct?
|
||||
data->m_scanData = PrefixScan<type>::allocate( deviceData, maxNumGroups*(1<<BITS_PER_PASS) );
|
||||
|
||||
data->m_workBuffer0 = new Buffer<u32>( deviceData, maxNumGroups*(1<<BITS_PER_PASS) );
|
||||
data->m_workBuffer1 = new Buffer<u32>( deviceData, maxNumGroups*(1<<BITS_PER_PASS) );
|
||||
data->m_workBuffer2 = new Buffer<u32>( deviceData, maxNumGroups*(1<<BITS_PER_PASS) );
|
||||
data->m_workBuffer3 = new Buffer<SortData>( deviceData, maxSize );
|
||||
for(int i=0; i<32/BITS_PER_PASS; i++)
|
||||
data->m_constBuffer[i] = new Buffer<int4>( deviceData, 1, BufferBase::BUFFER_CONST );
|
||||
data->m_maxSize = maxSize;
|
||||
|
||||
return data;
|
||||
}
|
||||
|
||||
template<DeviceType type>
|
||||
void RadixSortStandard<type>::deallocate(void* rawData)
|
||||
{
|
||||
Data* data = (Data*)rawData;
|
||||
|
||||
delete data->m_workBuffer0;
|
||||
delete data->m_workBuffer1;
|
||||
delete data->m_workBuffer2;
|
||||
delete data->m_workBuffer3;
|
||||
for(int i=0; i<32/BITS_PER_PASS; i++)
|
||||
delete data->m_constBuffer[i];
|
||||
|
||||
PrefixScan<type>::deallocate( data->m_scanData );
|
||||
|
||||
delete data;
|
||||
}
|
||||
|
||||
template<DeviceType type>
|
||||
void RadixSortStandard<type>::execute(void* rawData, Buffer<SortData>& inout, int n, int sortBits)
|
||||
{
|
||||
Data* data = (Data*)rawData;
|
||||
|
||||
ADLASSERT( n%512 == 0 );
|
||||
ADLASSERT( n <= data->m_maxSize );
|
||||
ADLASSERT( NUM_PER_WI == 4 );
|
||||
|
||||
Buffer<SortData>* src = BufferUtils::map<type, true>( data->m_deviceData, &inout );
|
||||
Buffer<SortData>* dst = data->m_workBuffer3;
|
||||
|
||||
const Device* deviceData = data->m_deviceData;
|
||||
|
||||
int numGroups = (n+WG_SIZE*NUM_PER_WI-1)/(WG_SIZE*NUM_PER_WI);
|
||||
|
||||
int4 constBuffer;
|
||||
|
||||
int iPass = 0;
|
||||
for(int startBit=0; startBit<sortBits; startBit+=BITS_PER_PASS, iPass++)
|
||||
{
|
||||
constBuffer.x = startBit;
|
||||
constBuffer.y = numGroups;
|
||||
constBuffer.z = WG_SIZE;
|
||||
|
||||
{
|
||||
BufferInfo bInfo[] = { BufferInfo( src ), BufferInfo( data->m_workBuffer0 ), BufferInfo( data->m_workBuffer1 ) };
|
||||
|
||||
Launcher launcher( deviceData, data->m_localSortKernel );
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
|
||||
launcher.setConst( *data->m_constBuffer[iPass], constBuffer );
|
||||
launcher.launch1D( WG_SIZE*numGroups, WG_SIZE );
|
||||
}
|
||||
|
||||
PrefixScan<type>::execute( data->m_scanData, *data->m_workBuffer0, *data->m_workBuffer2, numGroups*(1<<BITS_PER_PASS) );
|
||||
|
||||
{
|
||||
BufferInfo bInfo[] = { BufferInfo( src, true ), BufferInfo( data->m_workBuffer2, true ), BufferInfo( data->m_workBuffer1, true ),
|
||||
BufferInfo( dst ) };
|
||||
|
||||
Launcher launcher( deviceData, data->m_scatterKernel );
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
|
||||
launcher.setConst( *data->m_constBuffer[iPass], constBuffer );
|
||||
launcher.launch1D( WG_SIZE*numGroups, WG_SIZE );
|
||||
}
|
||||
|
||||
if(0)
|
||||
{
|
||||
BufferInfo bInfo[] = { BufferInfo( dst, true ), BufferInfo( src ) };
|
||||
|
||||
Launcher launcher( deviceData, data->m_copyKernel );
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
|
||||
launcher.launch1D( n, WG_SIZE );
|
||||
}
|
||||
swap2( src, dst );
|
||||
}
|
||||
|
||||
if( src != &inout )
|
||||
{
|
||||
BufferInfo bInfo[] = { BufferInfo( src, true ), BufferInfo( dst ) };
|
||||
|
||||
Launcher launcher( deviceData, data->m_copyKernel );
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
|
||||
launcher.launch1D( n, WG_SIZE );
|
||||
}
|
||||
|
||||
BufferUtils::unmap<true>( src, &inout );
|
||||
}
|
||||
|
||||
#undef PATH
|
||||
#undef KERNEL0
|
||||
#undef KERNEL1
|
||||
#undef KERNEL2
|
||||
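For RadixSortStandard the dispatch sizes follow directly from WG_SIZE = 128, NUM_PER_WI = 4 and BITS_PER_PASS = 4: n must be a multiple of 512 (as asserted in execute()), each pass launches n/512 work-groups, and the per-pass histogram handed to PrefixScan has numGroups*16 entries, which is why allocate() sizes the scan data and the work buffers to maxNumGroups*(1<<BITS_PER_PASS). A small worked sketch of those numbers (illustration only; the function is hypothetical):

#include <stdio.h>

// Reproduces the size formulas used in allocate()/execute() above.
// Example: n = 65536, sortBits = 32 gives 128 groups, 8 passes, 2048 histogram entries per pass.
void printStandardSortSizes( int n, int sortBits )
{
	const int WG_SIZE = 128, NUM_PER_WI = 4, BITS_PER_PASS = 4;

	int numGroups        = ( n + WG_SIZE*NUM_PER_WI - 1 ) / ( WG_SIZE*NUM_PER_WI );
	int numPasses        = ( sortBits + BITS_PER_PASS - 1 ) / BITS_PER_PASS;
	int histogramEntries = numGroups * ( 1 << BITS_PER_PASS );

	printf( "groups per pass=%d  passes=%d  histogram entries per pass=%d\n",
		numGroups, numPasses, histogramEntries );
}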
@@ -0,0 +1,345 @@
|
||||
/*
|
||||
Bullet Continuous Collision Detection and Physics Library
|
||||
Copyright (c) 2011 Advanced Micro Devices, Inc. http://bulletphysics.org
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
//Author Takahiro Harada
|
||||
|
||||
|
||||
#pragma OPENCL EXTENSION cl_amd_printf : enable
|
||||
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
|
||||
|
||||
typedef unsigned int u32;
|
||||
#define GET_GROUP_IDX get_group_id(0)
|
||||
#define GET_LOCAL_IDX get_local_id(0)
|
||||
#define GET_GLOBAL_IDX get_global_id(0)
|
||||
#define GET_GROUP_SIZE get_local_size(0)
|
||||
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
|
||||
#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)
|
||||
#define AtomInc(x) atom_inc(&(x))
|
||||
#define AtomInc1(x, out) out = atom_inc(&(x))
|
||||
|
||||
#define make_uint4 (uint4)
|
||||
#define make_uint2 (uint2)
|
||||
|
||||
#define SELECT_UINT4( b, a, condition ) select( b,a,condition )
|
||||
|
||||
#define WG_SIZE 128
|
||||
#define NUM_PER_WI 4
|
||||
|
||||
|
||||
typedef struct
|
||||
{
|
||||
u32 m_key;
|
||||
u32 m_value;
|
||||
}SortData;
|
||||
|
||||
|
||||
typedef struct
|
||||
{
|
||||
u32 m_startBit;
|
||||
u32 m_numGroups;
|
||||
u32 m_padding[2];
|
||||
} ConstBuffer;
|
||||
|
||||
#define BITS_PER_PASS 4
|
||||
|
||||
|
||||
|
||||
uint4 prefixScanVector( uint4 data )
|
||||
{
|
||||
data.y += data.x;
|
||||
data.w += data.z;
|
||||
data.z += data.y;
|
||||
data.w += data.y;
|
||||
return data;
|
||||
}
|
||||
|
||||
uint prefixScanVectorEx( uint4* data )
|
||||
{
|
||||
uint4 backup = data[0];
|
||||
data[0].y += data[0].x;
|
||||
data[0].w += data[0].z;
|
||||
data[0].z += data[0].y;
|
||||
data[0].w += data[0].y;
|
||||
uint sum = data[0].w;
|
||||
*data -= backup;
|
||||
return sum;
|
||||
}
|
||||
|
||||
uint4 localPrefixSum128V( uint4 pData, uint lIdx, uint* totalSum, __local u32 sorterSharedMemory[] )
|
||||
{
|
||||
{ // Set data
|
||||
sorterSharedMemory[lIdx] = 0;
|
||||
sorterSharedMemory[lIdx+WG_SIZE] = prefixScanVectorEx( &pData );
|
||||
}
|
||||
|
||||
GROUP_LDS_BARRIER;
|
||||
|
||||
{ // Prefix sum
|
||||
int idx = 2*lIdx + (WG_SIZE+1);
|
||||
if( lIdx < 64 )
|
||||
{
|
||||
sorterSharedMemory[idx] += sorterSharedMemory[idx-1];
|
||||
GROUP_MEM_FENCE;
|
||||
sorterSharedMemory[idx] += sorterSharedMemory[idx-2];
|
||||
GROUP_MEM_FENCE;
|
||||
sorterSharedMemory[idx] += sorterSharedMemory[idx-4];
|
||||
GROUP_MEM_FENCE;
|
||||
sorterSharedMemory[idx] += sorterSharedMemory[idx-8];
|
||||
GROUP_MEM_FENCE;
|
||||
sorterSharedMemory[idx] += sorterSharedMemory[idx-16];
|
||||
GROUP_MEM_FENCE;
|
||||
sorterSharedMemory[idx] += sorterSharedMemory[idx-32];
|
||||
GROUP_MEM_FENCE;
|
||||
sorterSharedMemory[idx] += sorterSharedMemory[idx-64];
|
||||
GROUP_MEM_FENCE;
|
||||
|
||||
sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];
|
||||
GROUP_MEM_FENCE;
|
||||
}
|
||||
}
|
||||
|
||||
GROUP_LDS_BARRIER;
|
||||
|
||||
*totalSum = sorterSharedMemory[WG_SIZE*2-1];
|
||||
uint addValue = sorterSharedMemory[lIdx+127];
|
||||
return pData + make_uint4(addValue, addValue, addValue, addValue);
|
||||
}
|
||||
|
||||
|
||||
void generateHistogram(u32 lIdx, u32 wgIdx,
|
||||
uint4 sortedData,
|
||||
__local u32 *histogram)
|
||||
{
|
||||
if( lIdx < (1<<BITS_PER_PASS) )
|
||||
{
|
||||
histogram[lIdx] = 0;
|
||||
}
|
||||
|
||||
int mask = ((1<<BITS_PER_PASS)-1);
|
||||
uint4 keys = make_uint4( (sortedData.x)&mask, (sortedData.y)&mask, (sortedData.z)&mask, (sortedData.w)&mask );
|
||||
|
||||
GROUP_LDS_BARRIER;
|
||||
|
||||
AtomInc( histogram[keys.x] );
|
||||
AtomInc( histogram[keys.y] );
|
||||
AtomInc( histogram[keys.z] );
|
||||
AtomInc( histogram[keys.w] );
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
__kernel
|
||||
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
|
||||
void LocalSortKernel(__global SortData* sortDataIn,
|
||||
__global u32* ldsHistogramOut0,
|
||||
__global u32* ldsHistogramOut1,
|
||||
ConstBuffer cb)
|
||||
{
|
||||
|
||||
__local u32 ldsSortData[ WG_SIZE*NUM_PER_WI + 16 ];
|
||||
|
||||
int nElemsPerWG = WG_SIZE*NUM_PER_WI;
|
||||
u32 lIdx = GET_LOCAL_IDX;
|
||||
u32 wgIdx = GET_GROUP_IDX;
|
||||
u32 wgSize = GET_GROUP_SIZE;
|
||||
|
||||
uint4 localAddr = make_uint4(lIdx*4+0,lIdx*4+1,lIdx*4+2,lIdx*4+3);
|
||||
|
||||
|
||||
SortData sortData[NUM_PER_WI];
|
||||
|
||||
{
|
||||
u32 offset = nElemsPerWG*wgIdx;
|
||||
sortData[0] = sortDataIn[offset+localAddr.x];
|
||||
sortData[1] = sortDataIn[offset+localAddr.y];
|
||||
sortData[2] = sortDataIn[offset+localAddr.z];
|
||||
sortData[3] = sortDataIn[offset+localAddr.w];
|
||||
}
|
||||
|
||||
int bitIdx = cb.m_startBit;
|
||||
do
|
||||
{
|
||||
// what is this?
|
||||
// if( lIdx == wgSize-1 ) ldsSortData[256] = sortData[3].m_key;
|
||||
u32 mask = (1<<bitIdx);
|
||||
uint4 cmpResult = make_uint4( sortData[0].m_key & mask, sortData[1].m_key & mask, sortData[2].m_key & mask, sortData[3].m_key & mask );
|
||||
uint4 prefixSum = SELECT_UINT4( make_uint4(1,1,1,1), make_uint4(0,0,0,0), cmpResult != make_uint4(0,0,0,0) );
|
||||
u32 total;
|
||||
prefixSum = localPrefixSum128V( prefixSum, lIdx, &total, ldsSortData );
|
||||
|
||||
{
|
||||
uint4 dstAddr = localAddr - prefixSum + make_uint4( total, total, total, total );
|
||||
dstAddr = SELECT_UINT4( prefixSum, dstAddr, cmpResult != make_uint4(0, 0, 0, 0) );
|
||||
|
||||
GROUP_LDS_BARRIER;
|
||||
|
||||
ldsSortData[dstAddr.x] = sortData[0].m_key;
|
||||
ldsSortData[dstAddr.y] = sortData[1].m_key;
|
||||
ldsSortData[dstAddr.z] = sortData[2].m_key;
|
||||
ldsSortData[dstAddr.w] = sortData[3].m_key;
|
||||
|
||||
GROUP_LDS_BARRIER;
|
||||
|
||||
sortData[0].m_key = ldsSortData[localAddr.x];
|
||||
sortData[1].m_key = ldsSortData[localAddr.y];
|
||||
sortData[2].m_key = ldsSortData[localAddr.z];
|
||||
sortData[3].m_key = ldsSortData[localAddr.w];
|
||||
|
||||
GROUP_LDS_BARRIER;
|
||||
|
||||
ldsSortData[dstAddr.x] = sortData[0].m_value;
|
||||
ldsSortData[dstAddr.y] = sortData[1].m_value;
|
||||
ldsSortData[dstAddr.z] = sortData[2].m_value;
|
||||
ldsSortData[dstAddr.w] = sortData[3].m_value;
|
||||
|
||||
GROUP_LDS_BARRIER;
|
||||
|
||||
sortData[0].m_value = ldsSortData[localAddr.x];
|
||||
sortData[1].m_value = ldsSortData[localAddr.y];
|
||||
sortData[2].m_value = ldsSortData[localAddr.z];
|
||||
sortData[3].m_value = ldsSortData[localAddr.w];
|
||||
|
||||
GROUP_LDS_BARRIER;
|
||||
}
|
||||
bitIdx ++;
|
||||
}
|
||||
while( bitIdx <(cb.m_startBit+BITS_PER_PASS) );
|
||||
|
||||
	{ // generate histogram
|
||||
uint4 localKeys = make_uint4( sortData[0].m_key>>cb.m_startBit, sortData[1].m_key>>cb.m_startBit,
|
||||
sortData[2].m_key>>cb.m_startBit, sortData[3].m_key>>cb.m_startBit );
|
||||
|
||||
generateHistogram( lIdx, wgIdx, localKeys, ldsSortData );
|
||||
|
||||
GROUP_LDS_BARRIER;
|
||||
|
||||
int nBins = (1<<BITS_PER_PASS);
|
||||
if( lIdx < nBins )
|
||||
{
|
||||
u32 histValues = ldsSortData[lIdx];
|
||||
|
||||
u32 globalAddresses = nBins*wgIdx + lIdx;
|
||||
u32 globalAddressesRadixMajor = cb.m_numGroups*lIdx + wgIdx;
|
||||
|
||||
ldsHistogramOut0[globalAddressesRadixMajor] = histValues;
|
||||
ldsHistogramOut1[globalAddresses] = histValues;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
{ // write
|
||||
u32 offset = nElemsPerWG*wgIdx;
|
||||
uint4 dstAddr = make_uint4(offset+localAddr.x, offset+localAddr.y, offset+localAddr.z, offset+localAddr.w );
|
||||
|
||||
sortDataIn[ dstAddr.x + 0 ] = sortData[0];
|
||||
sortDataIn[ dstAddr.x + 1 ] = sortData[1];
|
||||
sortDataIn[ dstAddr.x + 2 ] = sortData[2];
|
||||
sortDataIn[ dstAddr.x + 3 ] = sortData[3];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
__kernel
|
||||
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
|
||||
void ScatterKernel(__global SortData *src,
|
||||
__global u32 *histogramGlobalRadixMajor,
|
||||
__global u32 *histogramLocalGroupMajor,
|
||||
__global SortData *dst,
|
||||
ConstBuffer cb)
|
||||
{
|
||||
__local u32 sorterLocalMemory[3*(1<<BITS_PER_PASS)];
|
||||
__local u32 *ldsLocalHistogram = sorterLocalMemory + (1<<BITS_PER_PASS);
|
||||
__local u32 *ldsGlobalHistogram = sorterLocalMemory;
|
||||
|
||||
|
||||
u32 lIdx = GET_LOCAL_IDX;
|
||||
u32 wgIdx = GET_GROUP_IDX;
|
||||
u32 ldsOffset = (1<<BITS_PER_PASS);
|
||||
|
||||
// load and prefix scan local histogram
|
||||
if( lIdx < ((1<<BITS_PER_PASS)/2) )
|
||||
{
|
||||
uint2 myIdx = make_uint2(lIdx, lIdx+8);
|
||||
|
||||
ldsLocalHistogram[ldsOffset+myIdx.x] = histogramLocalGroupMajor[(1<<BITS_PER_PASS)*wgIdx + myIdx.x];
|
||||
ldsLocalHistogram[ldsOffset+myIdx.y] = histogramLocalGroupMajor[(1<<BITS_PER_PASS)*wgIdx + myIdx.y];
|
||||
ldsLocalHistogram[ldsOffset+myIdx.x-(1<<BITS_PER_PASS)] = 0;
|
||||
ldsLocalHistogram[ldsOffset+myIdx.y-(1<<BITS_PER_PASS)] = 0;
|
||||
|
||||
int idx = ldsOffset+2*lIdx;
|
||||
ldsLocalHistogram[idx] += ldsLocalHistogram[idx-1];
|
||||
GROUP_MEM_FENCE;
|
||||
ldsLocalHistogram[idx] += ldsLocalHistogram[idx-2];
|
||||
GROUP_MEM_FENCE;
|
||||
ldsLocalHistogram[idx] += ldsLocalHistogram[idx-4];
|
||||
GROUP_MEM_FENCE;
|
||||
ldsLocalHistogram[idx] += ldsLocalHistogram[idx-8];
|
||||
GROUP_MEM_FENCE;
|
||||
|
||||
// Propagate intermediate values through
|
||||
ldsLocalHistogram[idx-1] += ldsLocalHistogram[idx-2];
|
||||
GROUP_MEM_FENCE;
|
||||
|
||||
// Grab and propagate for whole WG - loading the - 1 value
|
||||
uint2 localValues;
|
||||
localValues.x = ldsLocalHistogram[ldsOffset+myIdx.x-1];
|
||||
localValues.y = ldsLocalHistogram[ldsOffset+myIdx.y-1];
|
||||
|
||||
ldsLocalHistogram[myIdx.x] = localValues.x;
|
||||
ldsLocalHistogram[myIdx.y] = localValues.y;
|
||||
|
||||
|
||||
ldsGlobalHistogram[myIdx.x] = histogramGlobalRadixMajor[cb.m_numGroups*myIdx.x + wgIdx];
|
||||
ldsGlobalHistogram[myIdx.y] = histogramGlobalRadixMajor[cb.m_numGroups*myIdx.y + wgIdx];
|
||||
}
|
||||
|
||||
GROUP_LDS_BARRIER;
|
||||
|
||||
uint4 localAddr = make_uint4(lIdx*4+0,lIdx*4+1,lIdx*4+2,lIdx*4+3);
|
||||
|
||||
SortData sortData[4];
|
||||
{
|
||||
uint4 globalAddr = wgIdx*WG_SIZE*NUM_PER_WI + localAddr;
|
||||
sortData[0] = src[globalAddr.x];
|
||||
sortData[1] = src[globalAddr.y];
|
||||
sortData[2] = src[globalAddr.z];
|
||||
sortData[3] = src[globalAddr.w];
|
||||
}
|
||||
|
||||
uint cmpValue = ((1<<BITS_PER_PASS)-1);
|
||||
uint4 radix = make_uint4( (sortData[0].m_key>>cb.m_startBit)&cmpValue, (sortData[1].m_key>>cb.m_startBit)&cmpValue,
|
||||
		(sortData[2].m_key>>cb.m_startBit)&cmpValue, (sortData[3].m_key>>cb.m_startBit)&cmpValue );
|
||||
|
||||
// data is already sorted. So simply subtract local prefix sum
|
||||
uint4 dstAddr;
|
||||
dstAddr.x = ldsGlobalHistogram[radix.x] + (localAddr.x - ldsLocalHistogram[radix.x]);
|
||||
dstAddr.y = ldsGlobalHistogram[radix.y] + (localAddr.y - ldsLocalHistogram[radix.y]);
|
||||
dstAddr.z = ldsGlobalHistogram[radix.z] + (localAddr.z - ldsLocalHistogram[radix.z]);
|
||||
dstAddr.w = ldsGlobalHistogram[radix.w] + (localAddr.w - ldsLocalHistogram[radix.w]);
|
||||
|
||||
dst[dstAddr.x] = sortData[0];
|
||||
dst[dstAddr.y] = sortData[1];
|
||||
dst[dstAddr.z] = sortData[2];
|
||||
dst[dstAddr.w] = sortData[3];
|
||||
}
|
||||
|
||||
__kernel
|
||||
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
|
||||
void CopyKernel(__global SortData *src, __global SortData *dst)
|
||||
{
|
||||
dst[ GET_GLOBAL_IDX ] = src[ GET_GLOBAL_IDX ];
|
||||
}
|
||||
@@ -0,0 +1,322 @@
|
||||
/*
|
||||
2011 Takahiro Harada
|
||||
*/
|
||||
|
||||
typedef uint u32;
|
||||
|
||||
#define GET_GROUP_IDX groupIdx.x
|
||||
#define GET_LOCAL_IDX localIdx.x
|
||||
#define GET_GLOBAL_IDX globalIdx.x
|
||||
#define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync()
|
||||
#define GROUP_MEM_FENCE
|
||||
#define DEFAULT_ARGS uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID
|
||||
#define AtomInc(x) InterlockedAdd(x, 1)
|
||||
#define AtomInc1(x, out) InterlockedAdd(x, 1, out)
|
||||
|
||||
#define make_uint4 uint4
|
||||
#define make_uint2 uint2
|
||||
|
||||
uint4 SELECT_UINT4(uint4 b,uint4 a,uint4 condition ){ return make_uint4( ((condition).x)?a.x:b.x, ((condition).y)?a.y:b.y, ((condition).z)?a.z:b.z, ((condition).w)?a.w:b.w ); }
|
||||
|
||||
// takahiro end
|
||||
#define WG_SIZE 128
|
||||
#define NUM_PER_WI 4
|
||||
|
||||
#define GET_GROUP_SIZE WG_SIZE
|
||||
|
||||
typedef struct
|
||||
{
|
||||
u32 m_key;
|
||||
u32 m_value;
|
||||
}SortData;
|
||||
|
||||
cbuffer SortCB : register( b0 )
|
||||
{
|
||||
u32 m_startBit;
|
||||
u32 m_numGroups;
|
||||
u32 m_padding[2];
|
||||
};
|
||||
|
||||
#define BITS_PER_PASS 4
|
||||
|
||||
|
||||
uint4 prefixScanVector( uint4 data )
|
||||
{
|
||||
data.y += data.x;
|
||||
data.w += data.z;
|
||||
data.z += data.y;
|
||||
data.w += data.y;
|
||||
return data;
|
||||
}
|
||||
|
||||
uint prefixScanVectorEx( inout uint4 data )
|
||||
{
|
||||
uint4 backup = data;
|
||||
data.y += data.x;
|
||||
data.w += data.z;
|
||||
data.z += data.y;
|
||||
data.w += data.y;
|
||||
uint sum = data.w;
|
||||
data -= backup;
|
||||
return sum;
|
||||
}
|
||||
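prefixScanVectorEx above turns a uint4 into its exclusive prefix sums with four additions and returns the total. A small self-contained C++ check of the same add sequence, included only as a reference sketch:

#include <array>
#include <cassert>
#include <cstdint>

// Mirror of the four-add exclusive scan used above, on a plain array.
static uint32_t prefixScanVectorEx(std::array<uint32_t, 4>& d)
{
    const std::array<uint32_t, 4> backup = d;
    d[1] += d[0];          // y = x+y
    d[3] += d[2];          // w = z+w
    d[2] += d[1];          // z = x+y+z
    d[3] += d[1];          // w = x+y+z+w  (inclusive scan complete)
    const uint32_t sum = d[3];
    for (int i = 0; i < 4; ++i) d[i] -= backup[i];  // inclusive -> exclusive
    return sum;
}

int main()
{
    std::array<uint32_t, 4> v = {3, 1, 4, 1};
    const uint32_t total = prefixScanVectorEx(v);
    assert(total == 9);
    assert(v[0] == 0 && v[1] == 3 && v[2] == 4 && v[3] == 8);
    return 0;
}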
|
||||
|
||||
|
||||
RWStructuredBuffer<SortData> sortDataIn : register( u0 );
|
||||
RWStructuredBuffer<u32> ldsHistogramOut0 : register( u1 );
|
||||
RWStructuredBuffer<u32> ldsHistogramOut1 : register( u2 );
|
||||
|
||||
groupshared u32 ldsSortData[ WG_SIZE*NUM_PER_WI + 16 ];
|
||||
|
||||
|
||||
uint4 localPrefixSum128V( uint4 pData, uint lIdx, inout uint totalSum )
|
||||
{
|
||||
{ // Set data
|
||||
ldsSortData[lIdx] = 0;
|
||||
ldsSortData[lIdx+WG_SIZE] = prefixScanVectorEx( pData );
|
||||
}
|
||||
|
||||
GROUP_LDS_BARRIER;
|
||||
|
||||
{ // Prefix sum
|
||||
int idx = 2*lIdx + (WG_SIZE+1);
|
||||
if( lIdx < 64 )
|
||||
{
|
||||
ldsSortData[idx] += ldsSortData[idx-1];
|
||||
GROUP_MEM_FENCE;
|
||||
ldsSortData[idx] += ldsSortData[idx-2];
|
||||
GROUP_MEM_FENCE;
|
||||
ldsSortData[idx] += ldsSortData[idx-4];
|
||||
GROUP_MEM_FENCE;
|
||||
ldsSortData[idx] += ldsSortData[idx-8];
|
||||
GROUP_MEM_FENCE;
|
||||
ldsSortData[idx] += ldsSortData[idx-16];
|
||||
GROUP_MEM_FENCE;
|
||||
ldsSortData[idx] += ldsSortData[idx-32];
|
||||
GROUP_MEM_FENCE;
|
||||
ldsSortData[idx] += ldsSortData[idx-64];
|
||||
GROUP_MEM_FENCE;
|
||||
|
||||
ldsSortData[idx-1] += ldsSortData[idx-2];
|
||||
GROUP_MEM_FENCE;
|
||||
}
|
||||
}
|
||||
|
||||
GROUP_LDS_BARRIER;
|
||||
|
||||
totalSum = ldsSortData[WG_SIZE*2-1];
|
||||
uint addValue = ldsSortData[lIdx+127];
|
||||
return pData + make_uint4(addValue, addValue, addValue, addValue);
|
||||
}
|
||||
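localPrefixSum128V combines two scan levels: each thread scans its own four values, writes its lane total to LDS, 64 threads then scan the 128 totals with log-stepped strided additions, and finally each thread adds the prefix of the totals back to its lane. A serial C++ reference of the same decomposition (a sketch, not the ADL code):

#include <cstdint>
#include <vector>

// Two-level exclusive scan over WG_SIZE*4 values, mirroring the kernel's
// decomposition: a 4-wide exclusive scan per "thread", then an exclusive
// scan of the per-thread totals, then the total-prefix is added back.
// Serial reference only; the kernel does the middle step with log2(128)
// strided additions in LDS.
static void blockExclusiveScan(std::vector<uint32_t>& v)   // v.size() == 128*4
{
    const int wgSize = 128;
    std::vector<uint32_t> threadTotal(wgSize, 0);
    for (int t = 0; t < wgSize; ++t)
    {
        uint32_t run = 0;
        for (int k = 0; k < 4; ++k)            // per-thread exclusive scan
        {
            const uint32_t x = v[t * 4 + k];
            v[t * 4 + k] = run;
            run += x;
        }
        threadTotal[t] = run;                  // lane total, stored to LDS in the kernel
    }
    uint32_t carry = 0;
    for (int t = 0; t < wgSize; ++t)           // exclusive scan of the totals
    {
        for (int k = 0; k < 4; ++k) v[t * 4 + k] += carry;   // 'addValue' in the kernel
        carry += threadTotal[t];
    }
}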
|
||||
void generateHistogram(u32 lIdx, u32 wgIdx,
|
||||
uint4 sortedData)
|
||||
{
|
||||
if( lIdx < (1<<BITS_PER_PASS) )
|
||||
{
|
||||
ldsSortData[lIdx] = 0;
|
||||
}
|
||||
|
||||
int mask = ((1<<BITS_PER_PASS)-1);
|
||||
uint4 keys = make_uint4( (sortedData.x)&mask, (sortedData.y)&mask, (sortedData.z)&mask, (sortedData.w)&mask );
|
||||
|
||||
GROUP_LDS_BARRIER;
|
||||
|
||||
AtomInc( ldsSortData[keys.x] );
|
||||
AtomInc( ldsSortData[keys.y] );
|
||||
AtomInc( ldsSortData[keys.z] );
|
||||
AtomInc( ldsSortData[keys.w] );
|
||||
}
|
||||
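generateHistogram counts how many of the work group's keys fall into each of the 16 buckets of the current 4-bit digit, using one LDS counter per bucket and AtomInc. A serial C++ equivalent for reference (illustrative names only):

#include <cstdint>
#include <vector>

// Serial equivalent of generateHistogram: count how many keys fall into
// each of the 16 buckets of the current 4-bit digit. The kernel does the
// same with one LDS counter per bucket and an atomic increment.
static void countDigits(const std::vector<uint32_t>& keys, uint32_t startBit,
                        uint32_t histogram[16])
{
    for (int b = 0; b < 16; ++b) histogram[b] = 0;
    for (uint32_t key : keys)
        ++histogram[(key >> startBit) & 0xf];
}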
|
||||
[numthreads(WG_SIZE, 1, 1)]
|
||||
void LocalSortKernel( DEFAULT_ARGS )
|
||||
{
|
||||
int nElemsPerWG = WG_SIZE*NUM_PER_WI;
|
||||
u32 lIdx = GET_LOCAL_IDX;
|
||||
u32 wgIdx = GET_GROUP_IDX;
|
||||
u32 wgSize = GET_GROUP_SIZE;
|
||||
|
||||
uint4 localAddr = make_uint4(lIdx*4+0,lIdx*4+1,lIdx*4+2,lIdx*4+3);
|
||||
|
||||
|
||||
SortData sortData[NUM_PER_WI];
|
||||
|
||||
{
|
||||
u32 offset = nElemsPerWG*wgIdx;
|
||||
sortData[0] = sortDataIn[offset+localAddr.x];
|
||||
sortData[1] = sortDataIn[offset+localAddr.y];
|
||||
sortData[2] = sortDataIn[offset+localAddr.z];
|
||||
sortData[3] = sortDataIn[offset+localAddr.w];
|
||||
}
|
||||
|
||||
int bitIdx = m_startBit;
|
||||
do
|
||||
{
|
||||
// what is this?
|
||||
// if( lIdx == wgSize-1 ) ldsSortData[256] = sortData[3].m_key;
|
||||
u32 mask = (1<<bitIdx);
|
||||
uint4 cmpResult = make_uint4( sortData[0].m_key & mask, sortData[1].m_key & mask, sortData[2].m_key & mask, sortData[3].m_key & mask );
|
||||
uint4 prefixSum = SELECT_UINT4( make_uint4(1,1,1,1), make_uint4(0,0,0,0), cmpResult != make_uint4(0,0,0,0) );
|
||||
u32 total;
|
||||
prefixSum = localPrefixSum128V( prefixSum, lIdx, total );
|
||||
|
||||
{
|
||||
uint4 dstAddr = localAddr - prefixSum + make_uint4( total, total, total, total );
|
||||
dstAddr = SELECT_UINT4( prefixSum, dstAddr, cmpResult != make_uint4(0, 0, 0, 0) );
|
||||
|
||||
GROUP_LDS_BARRIER;
|
||||
|
||||
ldsSortData[dstAddr.x] = sortData[0].m_key;
|
||||
ldsSortData[dstAddr.y] = sortData[1].m_key;
|
||||
ldsSortData[dstAddr.z] = sortData[2].m_key;
|
||||
ldsSortData[dstAddr.w] = sortData[3].m_key;
|
||||
|
||||
GROUP_LDS_BARRIER;
|
||||
|
||||
sortData[0].m_key = ldsSortData[localAddr.x];
|
||||
sortData[1].m_key = ldsSortData[localAddr.y];
|
||||
sortData[2].m_key = ldsSortData[localAddr.z];
|
||||
sortData[3].m_key = ldsSortData[localAddr.w];
|
||||
|
||||
GROUP_LDS_BARRIER;
|
||||
|
||||
ldsSortData[dstAddr.x] = sortData[0].m_value;
|
||||
ldsSortData[dstAddr.y] = sortData[1].m_value;
|
||||
ldsSortData[dstAddr.z] = sortData[2].m_value;
|
||||
ldsSortData[dstAddr.w] = sortData[3].m_value;
|
||||
|
||||
GROUP_LDS_BARRIER;
|
||||
|
||||
sortData[0].m_value = ldsSortData[localAddr.x];
|
||||
sortData[1].m_value = ldsSortData[localAddr.y];
|
||||
sortData[2].m_value = ldsSortData[localAddr.z];
|
||||
sortData[3].m_value = ldsSortData[localAddr.w];
|
||||
|
||||
GROUP_LDS_BARRIER;
|
||||
}
|
||||
bitIdx ++;
|
||||
}
|
||||
while( bitIdx <(m_startBit+BITS_PER_PASS) );
|
||||
|
||||
{ // generate histogram
|
||||
uint4 localKeys = make_uint4( sortData[0].m_key>>m_startBit, sortData[1].m_key>>m_startBit,
|
||||
sortData[2].m_key>>m_startBit, sortData[3].m_key>>m_startBit );
|
||||
|
||||
generateHistogram( lIdx, wgIdx, localKeys );
|
||||
|
||||
GROUP_LDS_BARRIER;
|
||||
|
||||
int nBins = (1<<BITS_PER_PASS);
|
||||
if( lIdx < nBins )
|
||||
{
|
||||
u32 histValues = ldsSortData[lIdx];
|
||||
|
||||
u32 globalAddresses = nBins*wgIdx + lIdx;
|
||||
u32 globalAddressesRadixMajor = m_numGroups*lIdx + wgIdx;
|
||||
|
||||
ldsHistogramOut0[globalAddressesRadixMajor] = histValues;
|
||||
ldsHistogramOut1[globalAddresses] = histValues;
|
||||
}
|
||||
}
|
||||
|
||||
{ // write
|
||||
u32 offset = nElemsPerWG*wgIdx;
|
||||
uint4 dstAddr = make_uint4(offset+localAddr.x, offset+localAddr.y, offset+localAddr.z, offset+localAddr.w );
|
||||
|
||||
sortDataIn[ dstAddr.x + 0 ] = sortData[0];
|
||||
sortDataIn[ dstAddr.x + 1 ] = sortData[1];
|
||||
sortDataIn[ dstAddr.x + 2 ] = sortData[2];
|
||||
sortDataIn[ dstAddr.x + 3 ] = sortData[3];
|
||||
}
|
||||
}
|
||||
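The do/while loop in LocalSortKernel is a stable 1-bit split repeated BITS_PER_PASS times: keys with the current bit clear are packed to the front in their original order, keys with the bit set follow, so after four bits the group is sorted on the 4-bit digit. A serial C++ sketch of one split pass (not the kernel's LDS implementation):

#include <cstdint>
#include <utility>
#include <vector>

// One pass of the stable 1-bit split performed inside LocalSortKernel:
// keys with the selected bit clear keep their relative order at the front,
// keys with the bit set follow, also in order.
static void splitByBit(std::vector<std::pair<uint32_t, uint32_t>>& kv, uint32_t bit)
{
    std::vector<std::pair<uint32_t, uint32_t>> out(kv.size());
    const uint32_t mask = 1u << bit;
    size_t zeros = 0;
    for (const auto& e : kv) if ((e.first & mask) == 0) ++zeros;   // 'total' in the kernel

    size_t zeroIdx = 0, oneIdx = zeros;
    for (const auto& e : kv)
    {
        if ((e.first & mask) == 0) out[zeroIdx++] = e;   // rank among zero-bit keys
        else                       out[oneIdx++]  = e;   // total + rank among one-bit keys
    }
    kv.swap(out);
}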
|
||||
StructuredBuffer<SortData> src : register( t0 );
|
||||
StructuredBuffer<u32> histogramGlobalRadixMajor : register( t1 );
|
||||
StructuredBuffer<u32> histogramLocalGroupMajor : register( t2 );
|
||||
|
||||
RWStructuredBuffer<SortData> dst : register( u0 );
|
||||
|
||||
groupshared u32 ldsLocalHistogram[ 2*(1<<BITS_PER_PASS) ];
|
||||
groupshared u32 ldsGlobalHistogram[ (1<<BITS_PER_PASS) ];
|
||||
|
||||
|
||||
[numthreads(WG_SIZE, 1, 1)]
|
||||
void ScatterKernel( DEFAULT_ARGS )
|
||||
{
|
||||
u32 lIdx = GET_LOCAL_IDX;
|
||||
u32 wgIdx = GET_GROUP_IDX;
|
||||
u32 ldsOffset = (1<<BITS_PER_PASS);
|
||||
|
||||
// load and prefix scan local histogram
|
||||
if( lIdx < ((1<<BITS_PER_PASS)/2) )
|
||||
{
|
||||
uint2 myIdx = make_uint2(lIdx, lIdx+8);
|
||||
|
||||
ldsLocalHistogram[ldsOffset+myIdx.x] = histogramLocalGroupMajor[(1<<BITS_PER_PASS)*wgIdx + myIdx.x];
|
||||
ldsLocalHistogram[ldsOffset+myIdx.y] = histogramLocalGroupMajor[(1<<BITS_PER_PASS)*wgIdx + myIdx.y];
|
||||
ldsLocalHistogram[ldsOffset+myIdx.x-(1<<BITS_PER_PASS)] = 0;
|
||||
ldsLocalHistogram[ldsOffset+myIdx.y-(1<<BITS_PER_PASS)] = 0;
|
||||
|
||||
int idx = ldsOffset+2*lIdx;
|
||||
ldsLocalHistogram[idx] += ldsLocalHistogram[idx-1];
|
||||
GROUP_MEM_FENCE;
|
||||
ldsLocalHistogram[idx] += ldsLocalHistogram[idx-2];
|
||||
GROUP_MEM_FENCE;
|
||||
ldsLocalHistogram[idx] += ldsLocalHistogram[idx-4];
|
||||
GROUP_MEM_FENCE;
|
||||
ldsLocalHistogram[idx] += ldsLocalHistogram[idx-8];
|
||||
GROUP_MEM_FENCE;
|
||||
|
||||
// Propagate intermediate values through
|
||||
ldsLocalHistogram[idx-1] += ldsLocalHistogram[idx-2];
|
||||
GROUP_MEM_FENCE;
|
||||
|
||||
// Grab and propagate for the whole WG, loading the value at [index-1]
|
||||
uint2 localValues;
|
||||
localValues.x = ldsLocalHistogram[ldsOffset+myIdx.x-1];
|
||||
localValues.y = ldsLocalHistogram[ldsOffset+myIdx.y-1];
|
||||
|
||||
ldsLocalHistogram[myIdx.x] = localValues.x;
|
||||
ldsLocalHistogram[myIdx.y] = localValues.y;
|
||||
|
||||
|
||||
ldsGlobalHistogram[myIdx.x] = histogramGlobalRadixMajor[m_numGroups*myIdx.x + wgIdx];
|
||||
ldsGlobalHistogram[myIdx.y] = histogramGlobalRadixMajor[m_numGroups*myIdx.y + wgIdx];
|
||||
}
|
||||
|
||||
GROUP_LDS_BARRIER;
|
||||
|
||||
uint4 localAddr = make_uint4(lIdx*4+0,lIdx*4+1,lIdx*4+2,lIdx*4+3);
|
||||
|
||||
SortData sortData[4];
|
||||
{
|
||||
uint4 globalAddr = wgIdx*WG_SIZE*NUM_PER_WI + localAddr;
|
||||
sortData[0] = src[globalAddr.x];
|
||||
sortData[1] = src[globalAddr.y];
|
||||
sortData[2] = src[globalAddr.z];
|
||||
sortData[3] = src[globalAddr.w];
|
||||
}
|
||||
|
||||
uint cmpValue = ((1<<BITS_PER_PASS)-1);
|
||||
uint4 radix = make_uint4( (sortData[0].m_key>>m_startBit)&cmpValue, (sortData[1].m_key>>m_startBit)&cmpValue,
|
||||
	(sortData[2].m_key>>m_startBit)&cmpValue, (sortData[3].m_key>>m_startBit)&cmpValue );
|
||||
|
||||
// data is already sorted. So simply subtract local prefix sum
|
||||
uint4 dstAddr;
|
||||
dstAddr.x = ldsGlobalHistogram[radix.x] + (localAddr.x - ldsLocalHistogram[radix.x]);
|
||||
dstAddr.y = ldsGlobalHistogram[radix.y] + (localAddr.y - ldsLocalHistogram[radix.y]);
|
||||
dstAddr.z = ldsGlobalHistogram[radix.z] + (localAddr.z - ldsLocalHistogram[radix.z]);
|
||||
dstAddr.w = ldsGlobalHistogram[radix.w] + (localAddr.w - ldsLocalHistogram[radix.w]);
|
||||
|
||||
dst[dstAddr.x] = sortData[0];
|
||||
dst[dstAddr.y] = sortData[1];
|
||||
dst[dstAddr.z] = sortData[2];
|
||||
dst[dstAddr.w] = sortData[3];
|
||||
}
|
||||
|
||||
[numthreads(WG_SIZE, 1, 1)]
|
||||
void CopyKernel( DEFAULT_ARGS )
|
||||
{
|
||||
dst[ GET_GLOBAL_IDX ] = src[ GET_GLOBAL_IDX ];
|
||||
}
|
||||
@@ -0,0 +1,347 @@
|
||||
static const char* radixSortStandardKernelsCL= \
|
||||
"/*\n"
|
||||
"Bullet Continuous Collision Detection and Physics Library\n"
|
||||
"Copyright (c) 2011 Advanced Micro Devices, Inc. http://bulletphysics.org\n"
|
||||
"\n"
|
||||
"This software is provided 'as-is', without any express or implied warranty.\n"
|
||||
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
|
||||
"Permission is granted to anyone to use this software for any purpose, \n"
|
||||
"including commercial applications, and to alter it and redistribute it freely, \n"
|
||||
"subject to the following restrictions:\n"
|
||||
"\n"
|
||||
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
|
||||
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
|
||||
"3. This notice may not be removed or altered from any source distribution.\n"
|
||||
"*/\n"
|
||||
"//Author Takahiro Harada\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
|
||||
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
|
||||
"\n"
|
||||
"typedef unsigned int u32;\n"
|
||||
"#define GET_GROUP_IDX get_group_id(0)\n"
|
||||
"#define GET_LOCAL_IDX get_local_id(0)\n"
|
||||
"#define GET_GLOBAL_IDX get_global_id(0)\n"
|
||||
"#define GET_GROUP_SIZE get_local_size(0)\n"
|
||||
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
|
||||
"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
|
||||
"#define AtomInc(x) atom_inc(&(x))\n"
|
||||
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
|
||||
"\n"
|
||||
"#define make_uint4 (uint4)\n"
|
||||
"#define make_uint2 (uint2)\n"
|
||||
"\n"
|
||||
"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
|
||||
"\n"
|
||||
"#define WG_SIZE 128\n"
|
||||
"#define NUM_PER_WI 4\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"typedef struct\n"
|
||||
"{\n"
|
||||
" u32 m_key; \n"
|
||||
" u32 m_value;\n"
|
||||
"}SortData;\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"typedef struct\n"
|
||||
"{\n"
|
||||
" u32 m_startBit;\n"
|
||||
" u32 m_numGroups;\n"
|
||||
" u32 m_padding[2];\n"
|
||||
"} ConstBuffer;\n"
|
||||
"\n"
|
||||
"#define BITS_PER_PASS 4\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"uint4 prefixScanVector( uint4 data )\n"
|
||||
"{\n"
|
||||
" data.y += data.x;\n"
|
||||
" data.w += data.z;\n"
|
||||
" data.z += data.y;\n"
|
||||
" data.w += data.y;\n"
|
||||
" return data;\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"uint prefixScanVectorEx( uint4* data )\n"
|
||||
"{\n"
|
||||
" uint4 backup = data[0];\n"
|
||||
" data[0].y += data[0].x;\n"
|
||||
" data[0].w += data[0].z;\n"
|
||||
" data[0].z += data[0].y;\n"
|
||||
" data[0].w += data[0].y;\n"
|
||||
" uint sum = data[0].w;\n"
|
||||
" *data -= backup;\n"
|
||||
" return sum;\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"uint4 localPrefixSum128V( uint4 pData, uint lIdx, uint* totalSum, __local u32 sorterSharedMemory[] )\n"
|
||||
"{\n"
|
||||
" { // Set data\n"
|
||||
" sorterSharedMemory[lIdx] = 0;\n"
|
||||
" sorterSharedMemory[lIdx+WG_SIZE] = prefixScanVectorEx( &pData );\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
"\n"
|
||||
" { // Prefix sum\n"
|
||||
" int idx = 2*lIdx + (WG_SIZE+1);\n"
|
||||
" if( lIdx < 64 )\n"
|
||||
" {\n"
|
||||
" sorterSharedMemory[idx] += sorterSharedMemory[idx-1];\n"
|
||||
" GROUP_MEM_FENCE;\n"
|
||||
" sorterSharedMemory[idx] += sorterSharedMemory[idx-2]; \n"
|
||||
" GROUP_MEM_FENCE;\n"
|
||||
" sorterSharedMemory[idx] += sorterSharedMemory[idx-4];\n"
|
||||
" GROUP_MEM_FENCE;\n"
|
||||
" sorterSharedMemory[idx] += sorterSharedMemory[idx-8];\n"
|
||||
" GROUP_MEM_FENCE;\n"
|
||||
" sorterSharedMemory[idx] += sorterSharedMemory[idx-16];\n"
|
||||
" GROUP_MEM_FENCE;\n"
|
||||
" sorterSharedMemory[idx] += sorterSharedMemory[idx-32]; \n"
|
||||
" GROUP_MEM_FENCE;\n"
|
||||
" sorterSharedMemory[idx] += sorterSharedMemory[idx-64];\n"
|
||||
" GROUP_MEM_FENCE;\n"
|
||||
"\n"
|
||||
" sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];\n"
|
||||
" GROUP_MEM_FENCE;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
"\n"
|
||||
" *totalSum = sorterSharedMemory[WG_SIZE*2-1];\n"
|
||||
" uint addValue = sorterSharedMemory[lIdx+127];\n"
|
||||
" return pData + make_uint4(addValue, addValue, addValue, addValue);\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"void generateHistogram(u32 lIdx, u32 wgIdx, \n"
|
||||
" uint4 sortedData,\n"
|
||||
" __local u32 *histogram)\n"
|
||||
"{\n"
|
||||
" if( lIdx < (1<<BITS_PER_PASS) )\n"
|
||||
" {\n"
|
||||
" histogram[lIdx] = 0;\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" int mask = ((1<<BITS_PER_PASS)-1);\n"
|
||||
" uint4 keys = make_uint4( (sortedData.x)&mask, (sortedData.y)&mask, (sortedData.z)&mask, (sortedData.w)&mask );\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
" \n"
|
||||
" AtomInc( histogram[keys.x] );\n"
|
||||
" AtomInc( histogram[keys.y] );\n"
|
||||
" AtomInc( histogram[keys.z] );\n"
|
||||
" AtomInc( histogram[keys.w] );\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"//\n"
|
||||
"//\n"
|
||||
"//\n"
|
||||
"\n"
|
||||
"__kernel\n"
|
||||
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
|
||||
"void LocalSortKernel(__global SortData* sortDataIn, \n"
|
||||
" __global u32* ldsHistogramOut0,\n"
|
||||
" __global u32* ldsHistogramOut1,\n"
|
||||
" ConstBuffer cb)\n"
|
||||
"{\n"
|
||||
"\n"
|
||||
" __local u32 ldsSortData[ WG_SIZE*NUM_PER_WI + 16 ];\n"
|
||||
"\n"
|
||||
" int nElemsPerWG = WG_SIZE*NUM_PER_WI;\n"
|
||||
" u32 lIdx = GET_LOCAL_IDX;\n"
|
||||
" u32 wgIdx = GET_GROUP_IDX;\n"
|
||||
" u32 wgSize = GET_GROUP_SIZE;\n"
|
||||
"\n"
|
||||
" uint4 localAddr = make_uint4(lIdx*4+0,lIdx*4+1,lIdx*4+2,lIdx*4+3);\n"
|
||||
"\n"
|
||||
"\n"
|
||||
" SortData sortData[NUM_PER_WI];\n"
|
||||
"\n"
|
||||
" {\n"
|
||||
" u32 offset = nElemsPerWG*wgIdx;\n"
|
||||
" sortData[0] = sortDataIn[offset+localAddr.x];\n"
|
||||
" sortData[1] = sortDataIn[offset+localAddr.y];\n"
|
||||
" sortData[2] = sortDataIn[offset+localAddr.z];\n"
|
||||
" sortData[3] = sortDataIn[offset+localAddr.w];\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" int bitIdx = cb.m_startBit;\n"
|
||||
" do\n"
|
||||
" {\n"
|
||||
"// what is this?\n"
|
||||
"// if( lIdx == wgSize-1 ) ldsSortData[256] = sortData[3].m_key;\n"
|
||||
" u32 mask = (1<<bitIdx);\n"
|
||||
" uint4 cmpResult = make_uint4( sortData[0].m_key & mask, sortData[1].m_key & mask, sortData[2].m_key & mask, sortData[3].m_key & mask );\n"
|
||||
" uint4 prefixSum = SELECT_UINT4( make_uint4(1,1,1,1), make_uint4(0,0,0,0), cmpResult != make_uint4(0,0,0,0) );\n"
|
||||
" u32 total;\n"
|
||||
" prefixSum = localPrefixSum128V( prefixSum, lIdx, &total, ldsSortData );\n"
|
||||
"\n"
|
||||
" {\n"
|
||||
" uint4 dstAddr = localAddr - prefixSum + make_uint4( total, total, total, total );\n"
|
||||
" dstAddr = SELECT_UINT4( prefixSum, dstAddr, cmpResult != make_uint4(0, 0, 0, 0) );\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
"\n"
|
||||
" ldsSortData[dstAddr.x] = sortData[0].m_key;\n"
|
||||
" ldsSortData[dstAddr.y] = sortData[1].m_key;\n"
|
||||
" ldsSortData[dstAddr.z] = sortData[2].m_key;\n"
|
||||
" ldsSortData[dstAddr.w] = sortData[3].m_key;\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
"\n"
|
||||
" sortData[0].m_key = ldsSortData[localAddr.x];\n"
|
||||
" sortData[1].m_key = ldsSortData[localAddr.y];\n"
|
||||
" sortData[2].m_key = ldsSortData[localAddr.z];\n"
|
||||
" sortData[3].m_key = ldsSortData[localAddr.w];\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
"\n"
|
||||
" ldsSortData[dstAddr.x] = sortData[0].m_value;\n"
|
||||
" ldsSortData[dstAddr.y] = sortData[1].m_value;\n"
|
||||
" ldsSortData[dstAddr.z] = sortData[2].m_value;\n"
|
||||
" ldsSortData[dstAddr.w] = sortData[3].m_value;\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
"\n"
|
||||
" sortData[0].m_value = ldsSortData[localAddr.x];\n"
|
||||
" sortData[1].m_value = ldsSortData[localAddr.y];\n"
|
||||
" sortData[2].m_value = ldsSortData[localAddr.z];\n"
|
||||
" sortData[3].m_value = ldsSortData[localAddr.w];\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
" }\n"
|
||||
" bitIdx ++;\n"
|
||||
" }\n"
|
||||
" while( bitIdx <(cb.m_startBit+BITS_PER_PASS) );\n"
|
||||
"\n"
|
||||
" { // generate historgram\n"
|
||||
" uint4 localKeys = make_uint4( sortData[0].m_key>>cb.m_startBit, sortData[1].m_key>>cb.m_startBit, \n"
|
||||
" sortData[2].m_key>>cb.m_startBit, sortData[3].m_key>>cb.m_startBit );\n"
|
||||
"\n"
|
||||
" generateHistogram( lIdx, wgIdx, localKeys, ldsSortData );\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
"\n"
|
||||
" int nBins = (1<<BITS_PER_PASS);\n"
|
||||
" if( lIdx < nBins )\n"
|
||||
" {\n"
|
||||
" u32 histValues = ldsSortData[lIdx];\n"
|
||||
"\n"
|
||||
" u32 globalAddresses = nBins*wgIdx + lIdx;\n"
|
||||
" u32 globalAddressesRadixMajor = cb.m_numGroups*lIdx + wgIdx;\n"
|
||||
" \n"
|
||||
" ldsHistogramOut0[globalAddressesRadixMajor] = histValues;\n"
|
||||
" ldsHistogramOut1[globalAddresses] = histValues;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
"\n"
|
||||
" { // write\n"
|
||||
" u32 offset = nElemsPerWG*wgIdx;\n"
|
||||
" uint4 dstAddr = make_uint4(offset+localAddr.x, offset+localAddr.y, offset+localAddr.z, offset+localAddr.w );\n"
|
||||
"\n"
|
||||
" sortDataIn[ dstAddr.x + 0 ] = sortData[0];\n"
|
||||
" sortDataIn[ dstAddr.x + 1 ] = sortData[1];\n"
|
||||
" sortDataIn[ dstAddr.x + 2 ] = sortData[2];\n"
|
||||
" sortDataIn[ dstAddr.x + 3 ] = sortData[3];\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"__kernel\n"
|
||||
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
|
||||
"void ScatterKernel(__global SortData *src,\n"
|
||||
" __global u32 *histogramGlobalRadixMajor,\n"
|
||||
" __global u32 *histogramLocalGroupMajor,\n"
|
||||
" __global SortData *dst,\n"
|
||||
" ConstBuffer cb)\n"
|
||||
"{\n"
|
||||
" __local u32 sorterLocalMemory[3*(1<<BITS_PER_PASS)];\n"
|
||||
" __local u32 *ldsLocalHistogram = sorterLocalMemory + (1<<BITS_PER_PASS);\n"
|
||||
" __local u32 *ldsGlobalHistogram = sorterLocalMemory;\n"
|
||||
"\n"
|
||||
"\n"
|
||||
" u32 lIdx = GET_LOCAL_IDX;\n"
|
||||
" u32 wgIdx = GET_GROUP_IDX;\n"
|
||||
" u32 ldsOffset = (1<<BITS_PER_PASS);\n"
|
||||
"\n"
|
||||
" // load and prefix scan local histogram\n"
|
||||
" if( lIdx < ((1<<BITS_PER_PASS)/2) )\n"
|
||||
" {\n"
|
||||
" uint2 myIdx = make_uint2(lIdx, lIdx+8);\n"
|
||||
"\n"
|
||||
" ldsLocalHistogram[ldsOffset+myIdx.x] = histogramLocalGroupMajor[(1<<BITS_PER_PASS)*wgIdx + myIdx.x];\n"
|
||||
" ldsLocalHistogram[ldsOffset+myIdx.y] = histogramLocalGroupMajor[(1<<BITS_PER_PASS)*wgIdx + myIdx.y];\n"
|
||||
" ldsLocalHistogram[ldsOffset+myIdx.x-(1<<BITS_PER_PASS)] = 0;\n"
|
||||
" ldsLocalHistogram[ldsOffset+myIdx.y-(1<<BITS_PER_PASS)] = 0;\n"
|
||||
"\n"
|
||||
" int idx = ldsOffset+2*lIdx;\n"
|
||||
" ldsLocalHistogram[idx] += ldsLocalHistogram[idx-1];\n"
|
||||
" GROUP_MEM_FENCE;\n"
|
||||
" ldsLocalHistogram[idx] += ldsLocalHistogram[idx-2];\n"
|
||||
" GROUP_MEM_FENCE;\n"
|
||||
" ldsLocalHistogram[idx] += ldsLocalHistogram[idx-4];\n"
|
||||
" GROUP_MEM_FENCE;\n"
|
||||
" ldsLocalHistogram[idx] += ldsLocalHistogram[idx-8];\n"
|
||||
" GROUP_MEM_FENCE;\n"
|
||||
"\n"
|
||||
" // Propagate intermediate values through\n"
|
||||
" ldsLocalHistogram[idx-1] += ldsLocalHistogram[idx-2];\n"
|
||||
" GROUP_MEM_FENCE;\n"
|
||||
"\n"
|
||||
" // Grab and propagate for whole WG - loading the - 1 value\n"
|
||||
" uint2 localValues;\n"
|
||||
" localValues.x = ldsLocalHistogram[ldsOffset+myIdx.x-1];\n"
|
||||
" localValues.y = ldsLocalHistogram[ldsOffset+myIdx.y-1];\n"
|
||||
"\n"
|
||||
" ldsLocalHistogram[myIdx.x] = localValues.x;\n"
|
||||
" ldsLocalHistogram[myIdx.y] = localValues.y;\n"
|
||||
"\n"
|
||||
"\n"
|
||||
" ldsGlobalHistogram[myIdx.x] = histogramGlobalRadixMajor[cb.m_numGroups*myIdx.x + wgIdx];\n"
|
||||
" ldsGlobalHistogram[myIdx.y] = histogramGlobalRadixMajor[cb.m_numGroups*myIdx.y + wgIdx];\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
"\n"
|
||||
" uint4 localAddr = make_uint4(lIdx*4+0,lIdx*4+1,lIdx*4+2,lIdx*4+3);\n"
|
||||
"\n"
|
||||
" SortData sortData[4];\n"
|
||||
" {\n"
|
||||
" uint4 globalAddr = wgIdx*WG_SIZE*NUM_PER_WI + localAddr;\n"
|
||||
" sortData[0] = src[globalAddr.x];\n"
|
||||
" sortData[1] = src[globalAddr.y];\n"
|
||||
" sortData[2] = src[globalAddr.z];\n"
|
||||
" sortData[3] = src[globalAddr.w];\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" uint cmpValue = ((1<<BITS_PER_PASS)-1);\n"
|
||||
" uint4 radix = make_uint4( (sortData[0].m_key>>cb.m_startBit)&cmpValue, (sortData[1].m_key>>cb.m_startBit)&cmpValue, \n"
|
||||
" (sortData[2].m_key>>cb.m_startBit)&cmpValue, (sortData[3].m_key>>cb.m_startBit)&cmpValue );;\n"
|
||||
"\n"
|
||||
" // data is already sorted. So simply subtract local prefix sum\n"
|
||||
" uint4 dstAddr;\n"
|
||||
" dstAddr.x = ldsGlobalHistogram[radix.x] + (localAddr.x - ldsLocalHistogram[radix.x]);\n"
|
||||
" dstAddr.y = ldsGlobalHistogram[radix.y] + (localAddr.y - ldsLocalHistogram[radix.y]);\n"
|
||||
" dstAddr.z = ldsGlobalHistogram[radix.z] + (localAddr.z - ldsLocalHistogram[radix.z]);\n"
|
||||
" dstAddr.w = ldsGlobalHistogram[radix.w] + (localAddr.w - ldsLocalHistogram[radix.w]);\n"
|
||||
"\n"
|
||||
" dst[dstAddr.x] = sortData[0];\n"
|
||||
" dst[dstAddr.y] = sortData[1];\n"
|
||||
" dst[dstAddr.z] = sortData[2];\n"
|
||||
" dst[dstAddr.w] = sortData[3];\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"__kernel\n"
|
||||
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
|
||||
"void CopyKernel(__global SortData *src, __global SortData *dst)\n"
|
||||
"{\n"
|
||||
" dst[ GET_GLOBAL_IDX ] = src[ GET_GLOBAL_IDX ];\n"
|
||||
"}\n"
|
||||
;
|
||||
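The string above embeds the OpenCL kernel source so it can be compiled at runtime without reading a file, the same mechanism RadixSortSimple::allocate selects with ADL_LOAD_KERNEL_FROM_STRING further down in this commit. As a hedged illustration only, this is how such a string is typically handed to the raw OpenCL C API; the ADL Device/kernel-loading wrappers do this internally, and the helper below is not part of them:

#include <CL/cl.h>

// Hedged sketch: compile the embedded source string and fetch one kernel.
// 'context' and 'device' are assumed to be valid objects created elsewhere;
// error handling is reduced to a single check per call for brevity.
static cl_kernel buildScatterKernel(cl_context context, cl_device_id device,
                                    const char* source /* e.g. radixSortStandardKernelsCL */)
{
    cl_int err = CL_SUCCESS;
    cl_program program = clCreateProgramWithSource(context, 1, &source, nullptr, &err);
    if (err != CL_SUCCESS) return nullptr;
    err = clBuildProgram(program, 1, &device, "", nullptr, nullptr);
    if (err != CL_SUCCESS) { clReleaseProgram(program); return nullptr; }
    cl_kernel kernel = clCreateKernel(program, "ScatterKernel", &err);
    clReleaseProgram(program);   // the kernel object keeps the program alive
    return (err == CL_SUCCESS) ? kernel : nullptr;
}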
@@ -0,0 +1,324 @@
|
||||
static const char* radixSortStandardKernelsDX11= \
|
||||
"/*\n"
|
||||
" 2011 Takahiro Harada\n"
|
||||
"*/\n"
|
||||
"\n"
|
||||
"typedef uint u32;\n"
|
||||
"\n"
|
||||
"#define GET_GROUP_IDX groupIdx.x\n"
|
||||
"#define GET_LOCAL_IDX localIdx.x\n"
|
||||
"#define GET_GLOBAL_IDX globalIdx.x\n"
|
||||
"#define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync()\n"
|
||||
"#define GROUP_MEM_FENCE\n"
|
||||
"#define DEFAULT_ARGS uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID\n"
|
||||
"#define AtomInc(x) InterlockedAdd(x, 1)\n"
|
||||
"#define AtomInc1(x, out) InterlockedAdd(x, 1, out)\n"
|
||||
"\n"
|
||||
"#define make_uint4 uint4\n"
|
||||
"#define make_uint2 uint2\n"
|
||||
"\n"
|
||||
"uint4 SELECT_UINT4(uint4 b,uint4 a,uint4 condition ){ return make_uint4( ((condition).x)?a.x:b.x, ((condition).y)?a.y:b.y, ((condition).z)?a.z:b.z, ((condition).w)?a.w:b.w ); }\n"
|
||||
"\n"
|
||||
"// takahiro end\n"
|
||||
"#define WG_SIZE 128\n"
|
||||
"#define NUM_PER_WI 4\n"
|
||||
"\n"
|
||||
"#define GET_GROUP_SIZE WG_SIZE\n"
|
||||
"\n"
|
||||
"typedef struct\n"
|
||||
"{\n"
|
||||
" u32 m_key; \n"
|
||||
" u32 m_value;\n"
|
||||
"}SortData;\n"
|
||||
"\n"
|
||||
"cbuffer SortCB : register( b0 )\n"
|
||||
"{\n"
|
||||
" u32 m_startBit;\n"
|
||||
" u32 m_numGroups;\n"
|
||||
" u32 m_padding[2];\n"
|
||||
"};\n"
|
||||
"\n"
|
||||
"#define BITS_PER_PASS 4\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"uint4 prefixScanVector( uint4 data )\n"
|
||||
"{\n"
|
||||
" data.y += data.x;\n"
|
||||
" data.w += data.z;\n"
|
||||
" data.z += data.y;\n"
|
||||
" data.w += data.y;\n"
|
||||
" return data;\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"uint prefixScanVectorEx( inout uint4 data )\n"
|
||||
"{\n"
|
||||
" uint4 backup = data;\n"
|
||||
" data.y += data.x;\n"
|
||||
" data.w += data.z;\n"
|
||||
" data.z += data.y;\n"
|
||||
" data.w += data.y;\n"
|
||||
" uint sum = data.w;\n"
|
||||
" data -= backup;\n"
|
||||
" return sum;\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"RWStructuredBuffer<SortData> sortDataIn : register( u0 );\n"
|
||||
"RWStructuredBuffer<u32> ldsHistogramOut0 : register( u1 );\n"
|
||||
"RWStructuredBuffer<u32> ldsHistogramOut1 : register( u2 );\n"
|
||||
"\n"
|
||||
"groupshared u32 ldsSortData[ WG_SIZE*NUM_PER_WI + 16 ];\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"uint4 localPrefixSum128V( uint4 pData, uint lIdx, inout uint totalSum )\n"
|
||||
"{\n"
|
||||
" { // Set data\n"
|
||||
" ldsSortData[lIdx] = 0;\n"
|
||||
" ldsSortData[lIdx+WG_SIZE] = prefixScanVectorEx( pData );\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
"\n"
|
||||
" { // Prefix sum\n"
|
||||
" int idx = 2*lIdx + (WG_SIZE+1);\n"
|
||||
" if( lIdx < 64 )\n"
|
||||
" {\n"
|
||||
" ldsSortData[idx] += ldsSortData[idx-1];\n"
|
||||
" GROUP_MEM_FENCE;\n"
|
||||
" ldsSortData[idx] += ldsSortData[idx-2]; \n"
|
||||
" GROUP_MEM_FENCE;\n"
|
||||
" ldsSortData[idx] += ldsSortData[idx-4];\n"
|
||||
" GROUP_MEM_FENCE;\n"
|
||||
" ldsSortData[idx] += ldsSortData[idx-8];\n"
|
||||
" GROUP_MEM_FENCE;\n"
|
||||
" ldsSortData[idx] += ldsSortData[idx-16];\n"
|
||||
" GROUP_MEM_FENCE;\n"
|
||||
" ldsSortData[idx] += ldsSortData[idx-32]; \n"
|
||||
" GROUP_MEM_FENCE;\n"
|
||||
" ldsSortData[idx] += ldsSortData[idx-64];\n"
|
||||
" GROUP_MEM_FENCE;\n"
|
||||
"\n"
|
||||
" ldsSortData[idx-1] += ldsSortData[idx-2];\n"
|
||||
" GROUP_MEM_FENCE;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
"\n"
|
||||
" totalSum = ldsSortData[WG_SIZE*2-1];\n"
|
||||
" uint addValue = ldsSortData[lIdx+127];\n"
|
||||
" return pData + make_uint4(addValue, addValue, addValue, addValue);\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"void generateHistogram(u32 lIdx, u32 wgIdx, \n"
|
||||
" uint4 sortedData)\n"
|
||||
"{\n"
|
||||
" if( lIdx < (1<<BITS_PER_PASS) )\n"
|
||||
" {\n"
|
||||
" ldsSortData[lIdx] = 0;\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" int mask = ((1<<BITS_PER_PASS)-1);\n"
|
||||
" uint4 keys = make_uint4( (sortedData.x)&mask, (sortedData.y)&mask, (sortedData.z)&mask, (sortedData.w)&mask );\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
" \n"
|
||||
" AtomInc( ldsSortData[keys.x] );\n"
|
||||
" AtomInc( ldsSortData[keys.y] );\n"
|
||||
" AtomInc( ldsSortData[keys.z] );\n"
|
||||
" AtomInc( ldsSortData[keys.w] );\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"[numthreads(WG_SIZE, 1, 1)]\n"
|
||||
"void LocalSortKernel( DEFAULT_ARGS )\n"
|
||||
"{\n"
|
||||
" int nElemsPerWG = WG_SIZE*NUM_PER_WI;\n"
|
||||
" u32 lIdx = GET_LOCAL_IDX;\n"
|
||||
" u32 wgIdx = GET_GROUP_IDX;\n"
|
||||
" u32 wgSize = GET_GROUP_SIZE;\n"
|
||||
"\n"
|
||||
" uint4 localAddr = make_uint4(lIdx*4+0,lIdx*4+1,lIdx*4+2,lIdx*4+3);\n"
|
||||
"\n"
|
||||
"\n"
|
||||
" SortData sortData[NUM_PER_WI];\n"
|
||||
"\n"
|
||||
" {\n"
|
||||
" u32 offset = nElemsPerWG*wgIdx;\n"
|
||||
" sortData[0] = sortDataIn[offset+localAddr.x];\n"
|
||||
" sortData[1] = sortDataIn[offset+localAddr.y];\n"
|
||||
" sortData[2] = sortDataIn[offset+localAddr.z];\n"
|
||||
" sortData[3] = sortDataIn[offset+localAddr.w];\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" int bitIdx = m_startBit;\n"
|
||||
" do\n"
|
||||
" {\n"
|
||||
"// what is this?\n"
|
||||
"// if( lIdx == wgSize-1 ) ldsSortData[256] = sortData[3].m_key;\n"
|
||||
" u32 mask = (1<<bitIdx);\n"
|
||||
" uint4 cmpResult = make_uint4( sortData[0].m_key & mask, sortData[1].m_key & mask, sortData[2].m_key & mask, sortData[3].m_key & mask );\n"
|
||||
" uint4 prefixSum = SELECT_UINT4( make_uint4(1,1,1,1), make_uint4(0,0,0,0), cmpResult != make_uint4(0,0,0,0) );\n"
|
||||
" u32 total;\n"
|
||||
" prefixSum = localPrefixSum128V( prefixSum, lIdx, total );\n"
|
||||
"\n"
|
||||
" {\n"
|
||||
" uint4 dstAddr = localAddr - prefixSum + make_uint4( total, total, total, total );\n"
|
||||
" dstAddr = SELECT_UINT4( prefixSum, dstAddr, cmpResult != make_uint4(0, 0, 0, 0) );\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
"\n"
|
||||
" ldsSortData[dstAddr.x] = sortData[0].m_key;\n"
|
||||
" ldsSortData[dstAddr.y] = sortData[1].m_key;\n"
|
||||
" ldsSortData[dstAddr.z] = sortData[2].m_key;\n"
|
||||
" ldsSortData[dstAddr.w] = sortData[3].m_key;\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
"\n"
|
||||
" sortData[0].m_key = ldsSortData[localAddr.x];\n"
|
||||
" sortData[1].m_key = ldsSortData[localAddr.y];\n"
|
||||
" sortData[2].m_key = ldsSortData[localAddr.z];\n"
|
||||
" sortData[3].m_key = ldsSortData[localAddr.w];\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
"\n"
|
||||
" ldsSortData[dstAddr.x] = sortData[0].m_value;\n"
|
||||
" ldsSortData[dstAddr.y] = sortData[1].m_value;\n"
|
||||
" ldsSortData[dstAddr.z] = sortData[2].m_value;\n"
|
||||
" ldsSortData[dstAddr.w] = sortData[3].m_value;\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
"\n"
|
||||
" sortData[0].m_value = ldsSortData[localAddr.x];\n"
|
||||
" sortData[1].m_value = ldsSortData[localAddr.y];\n"
|
||||
" sortData[2].m_value = ldsSortData[localAddr.z];\n"
|
||||
" sortData[3].m_value = ldsSortData[localAddr.w];\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
" }\n"
|
||||
" bitIdx ++;\n"
|
||||
" }\n"
|
||||
" while( bitIdx <(m_startBit+BITS_PER_PASS) );\n"
|
||||
"\n"
|
||||
" { // generate historgram\n"
|
||||
" uint4 localKeys = make_uint4( sortData[0].m_key>>m_startBit, sortData[1].m_key>>m_startBit, \n"
|
||||
" sortData[2].m_key>>m_startBit, sortData[3].m_key>>m_startBit );\n"
|
||||
"\n"
|
||||
" generateHistogram( lIdx, wgIdx, localKeys );\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
"\n"
|
||||
" int nBins = (1<<BITS_PER_PASS);\n"
|
||||
" if( lIdx < nBins )\n"
|
||||
" {\n"
|
||||
" u32 histValues = ldsSortData[lIdx];\n"
|
||||
"\n"
|
||||
" u32 globalAddresses = nBins*wgIdx + lIdx;\n"
|
||||
" u32 globalAddressesRadixMajor = m_numGroups*lIdx + wgIdx;\n"
|
||||
" \n"
|
||||
" ldsHistogramOut0[globalAddressesRadixMajor] = histValues;\n"
|
||||
" ldsHistogramOut1[globalAddresses] = histValues;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" { // write\n"
|
||||
" u32 offset = nElemsPerWG*wgIdx;\n"
|
||||
" uint4 dstAddr = make_uint4(offset+localAddr.x, offset+localAddr.y, offset+localAddr.z, offset+localAddr.w );\n"
|
||||
"\n"
|
||||
" sortDataIn[ dstAddr.x + 0 ] = sortData[0];\n"
|
||||
" sortDataIn[ dstAddr.x + 1 ] = sortData[1];\n"
|
||||
" sortDataIn[ dstAddr.x + 2 ] = sortData[2];\n"
|
||||
" sortDataIn[ dstAddr.x + 3 ] = sortData[3];\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"StructuredBuffer<SortData> src : register( t0 );\n"
|
||||
"StructuredBuffer<u32> histogramGlobalRadixMajor : register( t1 );\n"
|
||||
"StructuredBuffer<u32> histogramLocalGroupMajor : register( t2 );\n"
|
||||
"\n"
|
||||
"RWStructuredBuffer<SortData> dst : register( u0 );\n"
|
||||
"\n"
|
||||
"groupshared u32 ldsLocalHistogram[ 2*(1<<BITS_PER_PASS) ];\n"
|
||||
"groupshared u32 ldsGlobalHistogram[ (1<<BITS_PER_PASS) ];\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"[numthreads(WG_SIZE, 1, 1)]\n"
|
||||
"void ScatterKernel( DEFAULT_ARGS )\n"
|
||||
"{\n"
|
||||
" u32 lIdx = GET_LOCAL_IDX;\n"
|
||||
" u32 wgIdx = GET_GROUP_IDX;\n"
|
||||
" u32 ldsOffset = (1<<BITS_PER_PASS);\n"
|
||||
"\n"
|
||||
" // load and prefix scan local histogram\n"
|
||||
" if( lIdx < ((1<<BITS_PER_PASS)/2) )\n"
|
||||
" {\n"
|
||||
" uint2 myIdx = make_uint2(lIdx, lIdx+8);\n"
|
||||
"\n"
|
||||
" ldsLocalHistogram[ldsOffset+myIdx.x] = histogramLocalGroupMajor[(1<<BITS_PER_PASS)*wgIdx + myIdx.x];\n"
|
||||
" ldsLocalHistogram[ldsOffset+myIdx.y] = histogramLocalGroupMajor[(1<<BITS_PER_PASS)*wgIdx + myIdx.y];\n"
|
||||
" ldsLocalHistogram[ldsOffset+myIdx.x-(1<<BITS_PER_PASS)] = 0;\n"
|
||||
" ldsLocalHistogram[ldsOffset+myIdx.y-(1<<BITS_PER_PASS)] = 0;\n"
|
||||
"\n"
|
||||
" int idx = ldsOffset+2*lIdx;\n"
|
||||
" ldsLocalHistogram[idx] += ldsLocalHistogram[idx-1];\n"
|
||||
" GROUP_MEM_FENCE;\n"
|
||||
" ldsLocalHistogram[idx] += ldsLocalHistogram[idx-2];\n"
|
||||
" GROUP_MEM_FENCE;\n"
|
||||
" ldsLocalHistogram[idx] += ldsLocalHistogram[idx-4];\n"
|
||||
" GROUP_MEM_FENCE;\n"
|
||||
" ldsLocalHistogram[idx] += ldsLocalHistogram[idx-8];\n"
|
||||
" GROUP_MEM_FENCE;\n"
|
||||
"\n"
|
||||
" // Propagate intermediate values through\n"
|
||||
" ldsLocalHistogram[idx-1] += ldsLocalHistogram[idx-2];\n"
|
||||
" GROUP_MEM_FENCE;\n"
|
||||
"\n"
|
||||
" // Grab and propagate for whole WG - loading the - 1 value\n"
|
||||
" uint2 localValues;\n"
|
||||
" localValues.x = ldsLocalHistogram[ldsOffset+myIdx.x-1];\n"
|
||||
" localValues.y = ldsLocalHistogram[ldsOffset+myIdx.y-1];\n"
|
||||
"\n"
|
||||
" ldsLocalHistogram[myIdx.x] = localValues.x;\n"
|
||||
" ldsLocalHistogram[myIdx.y] = localValues.y;\n"
|
||||
"\n"
|
||||
"\n"
|
||||
" ldsGlobalHistogram[myIdx.x] = histogramGlobalRadixMajor[m_numGroups*myIdx.x + wgIdx];\n"
|
||||
" ldsGlobalHistogram[myIdx.y] = histogramGlobalRadixMajor[m_numGroups*myIdx.y + wgIdx];\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
"\n"
|
||||
" uint4 localAddr = make_uint4(lIdx*4+0,lIdx*4+1,lIdx*4+2,lIdx*4+3);\n"
|
||||
"\n"
|
||||
" SortData sortData[4];\n"
|
||||
" {\n"
|
||||
" uint4 globalAddr = wgIdx*WG_SIZE*NUM_PER_WI + localAddr;\n"
|
||||
" sortData[0] = src[globalAddr.x];\n"
|
||||
" sortData[1] = src[globalAddr.y];\n"
|
||||
" sortData[2] = src[globalAddr.z];\n"
|
||||
" sortData[3] = src[globalAddr.w];\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" uint cmpValue = ((1<<BITS_PER_PASS)-1);\n"
|
||||
" uint4 radix = make_uint4( (sortData[0].m_key>>m_startBit)&cmpValue, (sortData[1].m_key>>m_startBit)&cmpValue, \n"
|
||||
" (sortData[2].m_key>>m_startBit)&cmpValue, (sortData[3].m_key>>m_startBit)&cmpValue );;\n"
|
||||
"\n"
|
||||
" // data is already sorted. So simply subtract local prefix sum\n"
|
||||
" uint4 dstAddr;\n"
|
||||
" dstAddr.x = ldsGlobalHistogram[radix.x] + (localAddr.x - ldsLocalHistogram[radix.x]);\n"
|
||||
" dstAddr.y = ldsGlobalHistogram[radix.y] + (localAddr.y - ldsLocalHistogram[radix.y]);\n"
|
||||
" dstAddr.z = ldsGlobalHistogram[radix.z] + (localAddr.z - ldsLocalHistogram[radix.z]);\n"
|
||||
" dstAddr.w = ldsGlobalHistogram[radix.w] + (localAddr.w - ldsLocalHistogram[radix.w]);\n"
|
||||
"\n"
|
||||
" dst[dstAddr.x] = sortData[0];\n"
|
||||
" dst[dstAddr.y] = sortData[1];\n"
|
||||
" dst[dstAddr.z] = sortData[2];\n"
|
||||
" dst[dstAddr.w] = sortData[3];\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"[numthreads(WG_SIZE, 1, 1)]\n"
|
||||
"void CopyKernel( DEFAULT_ARGS )\n"
|
||||
"{\n"
|
||||
" dst[ GET_GLOBAL_IDX ] = src[ GET_GLOBAL_IDX ];\n"
|
||||
"}\n"
|
||||
;
|
||||
@@ -0,0 +1,31 @@
|
||||
/*
		2011 Takahiro Harada
*/

#pragma once

#include <AdlPrimitives/Math/Math.h>

namespace adl
{

struct SortData
{
	SortData(){}
	SortData( u32 key, u32 value ) : m_key(key), m_value(value) {}

	union
	{
		u32 m_key;
		struct { u16 m_key16[2]; };
	};
	u32 m_value;

	friend bool operator <(const SortData& a, const SortData& b)
	{
		return a.m_key < b.m_key;
	}
};


};
|
||||
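SortData pairs a 32-bit key with a 32-bit value and compares by key only, which makes a host-side reference ordering with the standard library straightforward when checking sorter output. A self-contained C++ sketch of that ordering; a local stand-in struct is used here instead of including the ADL header, and the sample data is made up:

#include <algorithm>
#include <cstdint>
#include <vector>

// Self-contained stand-in for adl::SortData, used only to show the
// key-only ordering a host-side reference sort would apply.
struct SortDataRef
{
    uint32_t m_key;
    uint32_t m_value;
};

int main()
{
    std::vector<SortDataRef> data = {{42u, 0u}, {7u, 1u}, {42u, 2u}, {0u, 3u}};
    // Same criterion as adl::SortData::operator<: compare keys only; a
    // stable sort preserves the value order among equal keys, like the
    // GPU radix sort does.
    std::stable_sort(data.begin(), data.end(),
                     [](const SortDataRef& a, const SortDataRef& b) { return a.m_key < b.m_key; });
    return 0;
}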
@@ -0,0 +1,146 @@
|
||||
/*
|
||||
2011 Takahiro Harada
|
||||
*/
|
||||
|
||||
#define PATH "..\\..\\AdlPrimitives\\Sort\\RadixSortAdvancedKernels"
|
||||
#define KERNEL0 "StreamCountKernel"
|
||||
#define KERNEL1 "SortAndScatterKernel1"
|
||||
#define KERNEL2 "PrefixScanKernel"
|
||||
|
||||
template<DeviceType type>
|
||||
class RadixSortAdvanced : public RadixSortBase
|
||||
{
|
||||
public:
|
||||
typedef Launcher::BufferInfo BufferInfo;
|
||||
|
||||
enum
|
||||
{
|
||||
WG_SIZE = 128,
|
||||
NUM_PER_WI = 4,
|
||||
MAX_NUM_WORKGROUPS = 60,
|
||||
};
|
||||
|
||||
struct Data : public RadixSort<type>::Data
|
||||
{
|
||||
Kernel* m_localCountKernel;
|
||||
Kernel* m_scatterKernel;
|
||||
Kernel* m_scanKernel;
|
||||
|
||||
Buffer<u32>* m_workBuffer0;
|
||||
Buffer<SortData>* m_workBuffer1;
|
||||
Buffer<int4>* m_constBuffer[32/4];
|
||||
};
|
||||
|
||||
|
||||
static
|
||||
Data* allocate(const Device* deviceData, int maxSize, Option option = SORT_NORMAL);
|
||||
|
||||
static
|
||||
void deallocate(void* data);
|
||||
|
||||
static
|
||||
void execute(void* data, Buffer<SortData>& inout, int n, int sortBits);
|
||||
};
|
||||
|
||||
template<DeviceType type>
|
||||
typename RadixSortAdvanced<type>::Data* RadixSortAdvanced<type>::allocate(const Device* deviceData, int maxSize, Option option)
|
||||
{
|
||||
ADLASSERT( type == deviceData->m_type );
|
||||
|
||||
const char* src[] = { 0, 0, 0 };
|
||||
|
||||
Data* data = new Data;
|
||||
data->m_option = option;
|
||||
data->m_deviceData = deviceData;
|
||||
|
||||
data->m_localCountKernel = deviceData->getKernel( PATH, KERNEL0, 0, src[type] );
|
||||
data->m_scatterKernel = deviceData->getKernel( PATH, KERNEL1, 0, src[type] );
|
||||
data->m_scanKernel = deviceData->getKernel( PATH, KERNEL2, 0, src[type] );
|
||||
|
||||
data->m_workBuffer0 = new Buffer<u32>( deviceData, MAX_NUM_WORKGROUPS*16 );
|
||||
data->m_workBuffer1 = new Buffer<SortData>( deviceData, maxSize );
|
||||
for(int i=0; i<32/4; i++)
|
||||
data->m_constBuffer[i] = new Buffer<int4>( deviceData, 1, BufferBase::BUFFER_CONST );
|
||||
data->m_maxSize = maxSize;
|
||||
|
||||
return data;
|
||||
}
|
||||
|
||||
template<DeviceType type>
|
||||
void RadixSortAdvanced<type>::deallocate(void* rawData)
|
||||
{
|
||||
Data* data = (Data*)rawData;
|
||||
|
||||
delete data->m_workBuffer0;
|
||||
delete data->m_workBuffer1;
|
||||
for(int i=0; i<32/4; i++)
|
||||
delete data->m_constBuffer[i];
|
||||
|
||||
delete data;
|
||||
}
|
||||
|
||||
template<DeviceType type>
|
||||
void RadixSortAdvanced<type>::execute(void* rawData, Buffer<SortData>& inout, int n, int sortBits)
|
||||
{
|
||||
Data* data = (Data*)rawData;
|
||||
|
||||
ADLASSERT( sortBits == 32 );
|
||||
|
||||
ADLASSERT( NUM_PER_WI == 4 );
|
||||
ADLASSERT( n%(WG_SIZE*NUM_PER_WI) == 0 );
|
||||
ADLASSERT( MAX_NUM_WORKGROUPS < 128*8/16 );
|
||||
|
||||
Buffer<SortData>* src = &inout;
|
||||
Buffer<SortData>* dst = data->m_workBuffer1;
|
||||
|
||||
const Device* deviceData = data->m_deviceData;
|
||||
|
||||
int nBlocks = n/(NUM_PER_WI*WG_SIZE);
|
||||
const int nWorkGroupsToExecute = min2((int)MAX_NUM_WORKGROUPS, nBlocks);
|
||||
int nBlocksPerGroup = (nBlocks+nWorkGroupsToExecute-1)/nWorkGroupsToExecute;
|
||||
ADLASSERT( nWorkGroupsToExecute <= MAX_NUM_WORKGROUPS );
|
||||
|
||||
int4 constBuffer = make_int4(0, nBlocks, nWorkGroupsToExecute, nBlocksPerGroup);
|
||||
|
||||
int iPass = 0;
|
||||
int startBit = 0;
|
||||
for(int startBit=0; startBit<32; startBit+=4, iPass++)
|
||||
{
|
||||
constBuffer.x = startBit;
|
||||
|
||||
{
|
||||
BufferInfo bInfo[] = { BufferInfo( src, true ), BufferInfo( data->m_workBuffer0 ) };
|
||||
|
||||
Launcher launcher( deviceData, data->m_localCountKernel );
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
|
||||
launcher.setConst( *data->m_constBuffer[iPass], constBuffer );
|
||||
launcher.launch1D( WG_SIZE* nWorkGroupsToExecute, WG_SIZE );
|
||||
}
|
||||
|
||||
|
||||
{
|
||||
BufferInfo bInfo[] = { BufferInfo( data->m_workBuffer0 ) };
|
||||
|
||||
Launcher launcher( deviceData, data->m_scanKernel );
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
|
||||
launcher.setConst( *data->m_constBuffer[iPass], constBuffer );
|
||||
launcher.launch1D( WG_SIZE, WG_SIZE );
|
||||
}
|
||||
|
||||
{
|
||||
BufferInfo bInfo[] = { BufferInfo( data->m_workBuffer0, true ), BufferInfo( src ), BufferInfo( dst ) };
|
||||
|
||||
Launcher launcher( deviceData, data->m_scatterKernel );
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
|
||||
launcher.setConst( *data->m_constBuffer[iPass], constBuffer );
|
||||
launcher.launch1D( WG_SIZE*nWorkGroupsToExecute, WG_SIZE );
|
||||
}
|
||||
|
||||
swap2( src, dst );
|
||||
}
|
||||
}
|
||||
|
||||
#undef PATH
|
||||
#undef KERNEL0
|
||||
#undef KERNEL1
|
||||
#undef KERNEL2
|
||||
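RadixSortAdvanced::execute makes eight 4-bit passes; each pass dispatches a digit count (StreamCountKernel), an exclusive scan of the counts (PrefixScanKernel), and a scatter (SortAndScatterKernel1), ping-ponging between the two buffers. A serial C++ reference with the same pass structure, useful for validating GPU output; this is a sketch under those assumptions, not the ADL implementation:

#include <cstdint>
#include <utility>
#include <vector>

// Serial LSD radix sort over 4-bit digits with the same count/scan/scatter
// pass structure as RadixSortAdvanced::execute (8 passes for 32-bit keys).
static void radixSortReference(std::vector<std::pair<uint32_t, uint32_t>>& inout)
{
    std::vector<std::pair<uint32_t, uint32_t>> tmp(inout.size());
    auto* src = &inout;
    auto* dst = &tmp;
    for (uint32_t startBit = 0; startBit < 32; startBit += 4)
    {
        uint32_t count[16] = {};                         // StreamCountKernel
        for (const auto& e : *src) ++count[(e.first >> startBit) & 0xf];

        uint32_t offset[16];                             // PrefixScanKernel
        uint32_t running = 0;
        for (int d = 0; d < 16; ++d) { offset[d] = running; running += count[d]; }

        for (const auto& e : *src)                       // SortAndScatterKernel1
            (*dst)[offset[(e.first >> startBit) & 0xf]++] = e;

        std::swap(src, dst);                             // ping-pong, as in execute()
    }
    // An even number of passes leaves the sorted result back in 'inout'.
}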
@@ -0,0 +1,149 @@
|
||||
/*
|
||||
2011 Takahiro Harada
|
||||
*/
|
||||
|
||||
#define PATH "..\\..\\opencl\\primitives\\AdlPrimitives\\Sort\\RadixSortSimpleKernels"
|
||||
#define KERNEL0 "LocalCountKernel"
|
||||
#define KERNEL1 "ScatterKernel"
|
||||
|
||||
#include <AdlPrimitives/Sort/RadixSortSimpleCL.h>
|
||||
#include <AdlPrimitives/Sort/RadixSortSimpleDX11.h>
|
||||
|
||||
template<DeviceType type>
|
||||
class RadixSortSimple : public RadixSortBase
|
||||
{
|
||||
public:
|
||||
typedef Launcher::BufferInfo BufferInfo;
|
||||
|
||||
enum
|
||||
{
|
||||
WG_SIZE = 128,
|
||||
NUM_PER_WI = 4,
|
||||
};
|
||||
|
||||
struct Data : public RadixSort<type>::Data
|
||||
{
|
||||
Kernel* m_localCountKernel;
|
||||
Kernel* m_scatterKernel;
|
||||
|
||||
Buffer<u32>* m_workBuffer0;
|
||||
Buffer<u32>* m_workBuffer1;
|
||||
Buffer<SortData>* m_workBuffer2;
|
||||
Buffer<int4>* m_constBuffer[4];
|
||||
};
|
||||
|
||||
|
||||
static
|
||||
Data* allocate(const Device* deviceData, int maxSize, Option option = SORT_NORMAL);
|
||||
|
||||
static
|
||||
void deallocate(void* data);
|
||||
|
||||
static
|
||||
void execute(void* data, Buffer<SortData>& inout, int n, int sortBits);
|
||||
};
|
||||
|
||||
template<DeviceType type>
|
||||
typename RadixSortSimple<type>::Data* RadixSortSimple<type>::allocate(const Device* deviceData, int maxSize, Option option)
|
||||
{
|
||||
ADLASSERT( type == deviceData->m_type );
|
||||
|
||||
const char* src[] =
|
||||
#if defined(ADL_LOAD_KERNEL_FROM_STRING)
|
||||
{radixSortSimpleKernelsCL, radixSortSimpleKernelsDX11};
|
||||
#else
|
||||
{ 0, 0 };
|
||||
#endif
|
||||
u32 maxNumGroups = (maxSize+WG_SIZE*NUM_PER_WI-1)/(WG_SIZE*NUM_PER_WI);
|
||||
|
||||
Data* data = new Data;
|
||||
data->m_option = option;
|
||||
data->m_deviceData = deviceData;
|
||||
|
||||
data->m_localCountKernel = deviceData->getKernel( PATH, KERNEL0, 0, src[type] );
|
||||
data->m_scatterKernel = deviceData->getKernel( PATH, KERNEL1, 0, src[type] );
|
||||
|
||||
data->m_scanData = PrefixScan<type>::allocate( deviceData, maxSize );
|
||||
|
||||
data->m_workBuffer0 = new Buffer<u32>( deviceData, maxNumGroups*256 );
|
||||
data->m_workBuffer1 = new Buffer<u32>( deviceData, maxNumGroups*256 );
|
||||
data->m_workBuffer2 = new Buffer<SortData>( deviceData, maxSize );
|
||||
data->m_constBuffer[0] = new Buffer<int4>( deviceData, 1, BufferBase::BUFFER_CONST );
|
||||
data->m_constBuffer[1] = new Buffer<int4>( deviceData, 1, BufferBase::BUFFER_CONST );
|
||||
data->m_constBuffer[2] = new Buffer<int4>( deviceData, 1, BufferBase::BUFFER_CONST );
|
||||
data->m_constBuffer[3] = new Buffer<int4>( deviceData, 1, BufferBase::BUFFER_CONST );
|
||||
data->m_maxSize = maxSize;
|
||||
|
||||
return data;
|
||||
}
|
||||
|
||||
template<DeviceType type>
|
||||
void RadixSortSimple<type>::deallocate(void* rawData)
|
||||
{
|
||||
Data* data = (Data*)rawData;
|
||||
|
||||
delete data->m_workBuffer0;
|
||||
delete data->m_workBuffer1;
|
||||
delete data->m_workBuffer2;
|
||||
delete data->m_constBuffer[0];
|
||||
delete data->m_constBuffer[1];
|
||||
delete data->m_constBuffer[2];
|
||||
delete data->m_constBuffer[3];
|
||||
|
||||
PrefixScan<type>::deallocate( data->m_scanData );
|
||||
|
||||
delete data;
|
||||
}
|
||||
|
||||
template<DeviceType type>
|
||||
void RadixSortSimple<type>::execute(void* rawData, Buffer<SortData>& inout, int n, int sortBits)
|
||||
{
|
||||
Data* data = (Data*)rawData;
|
||||
|
||||
ADLASSERT( sortBits == 32 );
|
||||
ADLASSERT( n%512 == 0 );
|
||||
ADLASSERT( n <= data->m_maxSize );
|
||||
|
||||
Buffer<SortData>* src = &inout;
|
||||
Buffer<SortData>* dst = data->m_workBuffer2;
|
||||
|
||||
const Device* deviceData = data->m_deviceData;
|
||||
|
||||
int numGroups = (n+WG_SIZE*NUM_PER_WI-1)/(WG_SIZE*NUM_PER_WI);
|
||||
|
||||
int4 constBuffer;
|
||||
|
||||
int iPass = 0;
|
||||
for(int startBit=0; startBit<32; startBit+=8, iPass++)
|
||||
{
|
||||
constBuffer.x = startBit;
|
||||
constBuffer.y = numGroups;
|
||||
constBuffer.z = WG_SIZE;
|
||||
|
||||
{
|
||||
BufferInfo bInfo[] = { BufferInfo( src, true ), BufferInfo( data->m_workBuffer0 ) };
|
||||
|
||||
Launcher launcher( deviceData, data->m_localCountKernel );
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
|
||||
launcher.setConst( *data->m_constBuffer[iPass], constBuffer );
|
||||
launcher.launch1D( WG_SIZE*numGroups, WG_SIZE );
|
||||
}
|
||||
|
||||
PrefixScan<type>::execute( data->m_scanData, *data->m_workBuffer0, *data->m_workBuffer1, numGroups*256 );
|
||||
|
||||
{
|
||||
BufferInfo bInfo[] = { BufferInfo( src, true ), BufferInfo( dst ), BufferInfo( data->m_workBuffer1 ) };
|
||||
|
||||
Launcher launcher( deviceData, data->m_scatterKernel );
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
|
||||
launcher.setConst( *data->m_constBuffer[iPass], constBuffer );
|
||||
launcher.launch1D( WG_SIZE*numGroups, WG_SIZE );
|
||||
}
|
||||
|
||||
swap2( src, dst );
|
||||
}
|
||||
}
|
||||
|
||||
#undef PATH
|
||||
#undef KERNEL0
|
||||
#undef KERNEL1
|
||||
@@ -0,0 +1,13 @@
|
||||
#!/usr/bin/env python
import sys
import os
import shutil

arg = sys.argv[1]
fh = open(arg)

print('static const char* '+sys.argv[2]+'= \\')
for line in fh.readlines():
	a = line.strip('\n')
	print('"'+a+'\\n"')
print(';')
|
||||
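stringify.py quotes each source line and appends an escaped newline so a whole kernel file becomes one C string literal (radixSortStandardKernelsCL above is exactly such output). For a hypothetical two-line kernel file, the generated header would look roughly like this; the file and variable names here are made up for illustration:

// Output of: stringify.py MyKernels.cl myKernelsCL > MyKernelsCL.h   (hypothetical names)
static const char* myKernelsCL= \
"__kernel void Foo(__global int* a)\n"
"{ a[get_global_id(0)] = 0; }\n"
;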
@@ -0,0 +1,22 @@
|
||||
stringify.py Fill/FillKernels.cl fillKernelsCL >Fill/FillKernelsCL.h
stringify.py Fill/FillKernels.hlsl fillKernelsDX11 >Fill/FillKernelsDX11.h
stringify.py Scan/PrefixScanKernels.cl prefixScanKernelsCL >Scan/PrefixScanKernelsCL.h
stringify.py Scan/PrefixScanKernels.hlsl prefixScanKernelsDX11 >Scan/PrefixScanKernelsDX11.h
stringify.py Search/BoundSearchKernels.cl boundSearchKernelsCL >Search/BoundSearchKernelsCL.h
stringify.py Search/BoundSearchKernels.hlsl boundSearchKernelsDX11 >Search/BoundSearchKernelsDX11.h
stringify.py Sort/RadixSortSimpleKernels.cl radixSortSimpleKernelsCL >Sort/RadixSortSimpleKernelsCL.h
stringify.py Sort/RadixSortSimpleKernels.hlsl radixSortSimpleKernelsDX11 >Sort/RadixSortSimpleKernelsDX11.h
stringify.py Sort/RadixSortStandardKernels.cl radixSortStandardKernelsCL >Sort/RadixSortStandardKernelsCL.h

stringify.py Sort/RadixSort32Kernels.cl radixSort32KernelsCL >Sort/RadixSort32KernelsCL.h
stringify.py Sort/RadixSort32Kernels.hlsl radixSort32KernelsDX11 >Sort/RadixSort32KernelsDX11.h

stringify.py Copy/CopyKernels.cl copyKernelsCL >Copy/CopyKernelsCL.h
stringify.py Copy/CopyKernels.hlsl copyKernelsDX11 >Copy/CopyKernelsDX11.h

stringify.py Sort/RadixSortStandardKernels.hlsl radixSortStandardKernelsDX11 >Sort/RadixSortStandardKernelsDX11.h
stringify.py Sort/RadixSortAdvancedKernels.hlsl radixSortAdvancedKernelsDX11 >Sort/RadixSortAdvancedKernelsDX11.h
@@ -0,0 +1,31 @@
|
||||
|
||||
hasCL = findOpenCL_AMD()
|
||||
hasDX11 = findDirectX11()
|
||||
|
||||
if (hasCL) then
|
||||
|
||||
project "OpenCL_DX11_primitives_test_AMD"
|
||||
|
||||
initOpenCL_AMD()
|
||||
|
||||
if (hasDX11) then
|
||||
initDirectX11()
|
||||
end
|
||||
|
||||
language "C++"
|
||||
|
||||
kind "ConsoleApp"
|
||||
targetdir "../../../../bin"
|
||||
includedirs {"..","../.."}
|
||||
|
||||
links {
|
||||
"OpenCL"
|
||||
}
|
||||
|
||||
files {
|
||||
"../main.cpp",
|
||||
"../RadixSortBenchmark.h",
|
||||
"../UnitTests.h"
|
||||
}
|
||||
|
||||
end
|
||||
@@ -0,0 +1,31 @@
|
||||
|
||||
hasCL = findOpenCL_Intel()
|
||||
hasDX11 = findDirectX11()
|
||||
|
||||
if (hasCL) then
|
||||
|
||||
project "OpenCL_DX11_primitives_test_Intel"
|
||||
|
||||
initOpenCL_Intel()
|
||||
|
||||
if (hasDX11) then
|
||||
initDirectX11()
|
||||
end
|
||||
|
||||
language "C++"
|
||||
|
||||
kind "ConsoleApp"
|
||||
targetdir "../../../../bin"
|
||||
includedirs {"..","../.."}
|
||||
|
||||
links {
|
||||
"OpenCL"
|
||||
}
|
||||
|
||||
files {
|
||||
"../main.cpp",
|
||||
"../RadixSortBenchmark.h",
|
||||
"../UnitTests.h"
|
||||
}
|
||||
|
||||
end
|
||||
@@ -0,0 +1,103 @@
|
||||
/*
|
||||
Copyright (c) 2012 Advanced Micro Devices, Inc.
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
//Originally written by Takahiro Harada
|
||||
|
||||
|
||||
#include <AdlPrimitives/Copy/Copy.h>
|
||||
|
||||
|
||||
|
||||
template<DeviceType TYPE>
|
||||
__inline
|
||||
void copyTest( Device* device )
|
||||
{
|
||||
int size = 65*1024;
|
||||
|
||||
Buffer<float4> buf0( device, size );
|
||||
Buffer<float4> buf1( device, size );
|
||||
|
||||
Stopwatch sw( device );
|
||||
|
||||
Copy<TYPE>::Data* data = Copy<TYPE>::allocate( device );
|
||||
|
||||
for(int i=0; i<10; i++)
|
||||
Copy<TYPE>::execute( data, buf1, buf0, size, CopyBase::PER_WI_1 );
|
||||
DeviceUtils::waitForCompletion( device );
|
||||
|
||||
{
|
||||
const int nTests = 12;
|
||||
|
||||
float t[nTests];
|
||||
|
||||
for(int ii=0; ii<nTests; ii++)
|
||||
{
|
||||
int iter = 1<<ii;
|
||||
|
||||
DeviceUtils::waitForCompletion( device );
|
||||
sw.start();
|
||||
for(int i=0; i<iter; i++)
|
||||
{
|
||||
Copy<TYPE>::execute( data, buf1, buf0, size, CopyBase::PER_WI_1 );
|
||||
}
|
||||
DeviceUtils::waitForCompletion( device );
|
||||
sw.stop();
|
||||
|
||||
t[ii] = sw.getMs()/(float)iter;
|
||||
}
|
||||
|
||||
for(int ii=0; ii<nTests; ii++)
|
||||
{
|
||||
printf("%d: %3.4fms (%3.2fGB/s)\n", (1<<ii), t[ii], size*16*2/1024.f/1024.f/t[ii]);
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
}
|
||||
|
||||
Copy<TYPE>::deallocate( data );
|
||||
}
|
||||
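copyTest moves 16 bytes per float4 element in each direction, and with sw.getMs() reporting milliseconds, size*16*2/1024/1024/t works out to MiB per millisecond, which is approximately the GB/s figure the printf reports. A small C++ restatement of that arithmetic, as a sketch only:

#include <cstdio>

// Restate the bandwidth figure printed by copyTest: a copy of 'size'
// float4 elements reads and writes 16 bytes each; with the elapsed time
// in milliseconds, bytes / 2^20 / ms is MiB per ms, i.e. roughly GB/s
// (1 MiB per ms is about 1.05 GB/s), which is what the printf reports.
static float copyBandwidthGBs(int size, float elapsedMs)
{
    const float bytesMoved = size * 16.0f * 2.0f;        // read + write
    return bytesMoved / 1024.0f / 1024.0f / elapsedMs;   // MiB per ms, ~GB/s
}

int main()
{
    std::printf("%3.2f GB/s\n", copyBandwidthGBs(65 * 1024, 1.0f));
    return 0;
}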
|
||||
void launchOverheadBenchmark()
|
||||
{
|
||||
printf("LaunchOverheadBenchmark\n");
|
||||
|
||||
|
||||
Device* ddcl;
|
||||
#if defined(ADL_ENABLE_DX11)
|
||||
Device* dddx;
|
||||
#endif
|
||||
{
|
||||
DeviceUtils::Config cfg;
|
||||
ddcl = DeviceUtils::allocate( TYPE_CL, cfg );
|
||||
#if defined(ADL_ENABLE_DX11)
|
||||
dddx = DeviceUtils::allocate( TYPE_DX11, cfg );
|
||||
#endif
|
||||
}
|
||||
|
||||
{
|
||||
printf("CL\n");
|
||||
copyTest<TYPE_CL>( ddcl );
|
||||
}
|
||||
#ifdef ADL_ENABLE_DX11
|
||||
{
|
||||
printf("DX11\n");
|
||||
copyTest<TYPE_DX11>( dddx );
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
//1, 2, 4, 8, 16, 32, 64, 128, 256,
|
||||
|
||||
@@ -0,0 +1,31 @@
|
||||
|
||||
hasCL = findOpenCL_NVIDIA()
|
||||
hasDX11 = findDirectX11()
|
||||
|
||||
if (hasCL) then
|
||||
|
||||
project "OpenCL_DX11_primitives_test_NVIDIA"
|
||||
|
||||
initOpenCL_NVIDIA()
|
||||
|
||||
if (hasDX11) then
|
||||
initDirectX11()
|
||||
end
|
||||
|
||||
language "C++"
|
||||
|
||||
kind "ConsoleApp"
|
||||
targetdir "../../../../bin"
|
||||
includedirs {"..","../.."}
|
||||
|
||||
links {
|
||||
"OpenCL"
|
||||
}
|
||||
|
||||
files {
|
||||
"../main.cpp",
|
||||
"../RadixSortBenchmark.h",
|
||||
"../UnitTests.h"
|
||||
}
|
||||
|
||||
end
|
||||
@@ -0,0 +1,121 @@

template<DeviceType TYPE>
void run( Device* device, int minSize = 512, int maxSize = 64*1024 )//, int increment = 512 )
{
	ADLASSERT( TYPE == device->m_type );

	Stopwatch sw( device );

	// RadixSort<TYPE>::Data* data0 = RadixSort<TYPE>::allocate( device, maxSize, RadixSortBase::SORT_SIMPLE );
	RadixSort<TYPE>::Data* data0 = RadixSort<TYPE>::allocate( device, maxSize, RadixSortBase::SORT_STANDARD );
	RadixSort<TYPE>::Data* data1 = RadixSort<TYPE>::allocate( device, maxSize, RadixSortBase::SORT_STANDARD );
	RadixSort<TYPE>::Data* data2 = RadixSort<TYPE>::allocate( device, maxSize, RadixSortBase::SORT_ADVANCED );

	Buffer<SortData> buf0( device, maxSize );
	Buffer<SortData> buf1( device, maxSize );
	Buffer<SortData> buf2( device, maxSize );

	SortData* input = new SortData[ maxSize ];

	// for(int iter = minSize; iter<=maxSize; iter+=increment)
	for(int iter = minSize; iter<=maxSize; iter*=2)
	{
		int size = NEXTMULTIPLEOF( iter, 512 );

		for(int i=0; i<size; i++) input[i] = SortData( getRandom(0,0xff), i );

		buf0.write( input, size );
		buf1.write( input, size );
		buf2.write( input, size );
		DeviceUtils::waitForCompletion( device );


		sw.start();

		RadixSort<TYPE>::execute( data0, buf0, size );

		sw.split();

		RadixSort<TYPE>::execute( data1, buf1, size );

		sw.split();

		RadixSort<TYPE>::execute( data2, buf2, size );

		sw.stop();


		float t[3];
		sw.getMs( t, 3 );
		// printf(" %d %3.2f %3.2f %3.2f\n", size, t[0], t[1], t[2]);
		printf(" %d %3.2f %3.2f\n", size, t[1], t[2]);
	}

	RadixSort<TYPE>::deallocate( data0 );
	RadixSort<TYPE>::deallocate( data1 );
	RadixSort<TYPE>::deallocate( data2 );

	delete [] input;
}

template<DeviceType TYPE>
void run32( Device* device, int size )
{
	//Cayman: 4194.30Keys: 373.05MKeys/s
	//Cypress: 4194.30Keys: 315.13MKeys/s
	ADLASSERT( TYPE == device->m_type );

	Stopwatch sw( device );

	RadixSort32<TYPE>::Data* data = RadixSort32<TYPE>::allocate( device, size );
	Copy<TYPE>::Data* copyData = Copy<TYPE>::allocate( device );

	Buffer<u32> inputMaster( device, size );
	Buffer<u32> input( device, size );
	Buffer<u32> output( device, size );
	{
		u32* host = new u32[size];
		for(int i=0; i<size; i++) host[i] = getRandom(0u, 0xffffffffu);
		inputMaster.write( host, size );
		DeviceUtils::waitForCompletion( device );
		delete [] host;
	}

	int nIter = 100;
	sw.start();
	for(int iter=0; iter<nIter; iter++)
	{
		// Copy<TYPE>::execute( copyData, (Buffer<float>&)input, (Buffer<float>&)inputMaster, size );
		// RadixSort32<TYPE>::execute( data, input, size );
		RadixSort32<TYPE>::execute( data, input, output, size );
	}
	sw.stop();

	{
		float tInS = sw.getMs()/1000.f/(float)nIter;
		float mKeysPerS = size/1000.f/1000.f/tInS;
		printf("%3.2fMKeys: %3.2fMKeys/s\n", size/1000.f, mKeysPerS);
	}

	RadixSort32<TYPE>::deallocate( data );
	Copy<TYPE>::deallocate( copyData );
}

template<DeviceType TYPE>
void radixSortBenchmark()
{

	Device* device;
	{
		DeviceUtils::Config cfg;
		device = DeviceUtils::allocate( TYPE, cfg );
	}

	run32<TYPE>( device, 256*1024*8*2 );
	// run32<TYPE>( device, 256*20*6 );

	// run<TYPE>( device, 512, 1024*128*4 );

	DeviceUtils::deallocate( device );

}
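The MKeys/s figure in run32() is the key count divided by the average time of one RadixSort32 pass over nIter launches. As a worked example (the timing value is assumed for illustration), 256*1024*8*2 keys at roughly 11.2 ms per sort gives about 373 MKeys/s, which matches the Cayman number quoted in the comment above:

#include <cstdio>

int main()
{
	const int   numKeys   = 256*1024*8*2;   // 4,194,304 keys, as passed by radixSortBenchmark()
	const float timeInS   = 0.01124f;       // assumed time for one sort, for illustration only
	const float mKeysPerS = numKeys/1000.f/1000.f/timeInS;
	printf( "%3.2f MKeys/s\n", mKeysPerS );  // ~373 MKeys/s
	return 0;
}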
@@ -0,0 +1,801 @@
|
||||
/*
|
||||
Copyright (c) 2012 Advanced Micro Devices, Inc.
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
//Originally written by Takahiro Harada
|
||||
|
||||
|
||||
#include <AdlPrimitives/Scan/PrefixScan.h>
|
||||
#include <AdlPrimitives/Sort/RadixSort.h>
|
||||
#include <AdlPrimitives/Sort/RadixSort32.h>
|
||||
#include <AdlPrimitives/Search/BoundSearch.h>
|
||||
#include <AdlPrimitives/Fill/Fill.h>
|
||||
#include <AdlPrimitives/Copy/Copy.h>
|
||||
|
||||
#include <time.h>
|
||||
|
||||
using namespace adl;
|
||||
|
||||
#define NUM_TESTS 10
|
||||
|
||||
int g_nPassed = 0;
|
||||
int g_nFailed = 0;
|
||||
bool g_testFailed = 0;
|
||||
|
||||
//#define TEST_INIT bool g_testFailed = 0;
|
||||
#define TEST_INIT g_testFailed = 0;
|
||||
#define TEST_ASSERT(x) if( !(x) ){g_testFailed = 1;}
|
||||
//#define TEST_ASSERT(x) if( !(x) ){g_testFailed = 1;ADLASSERT(x);}
|
||||
#define TEST_REPORT(testName) printf("[%s] %s\n",(g_testFailed)?"X":"O", testName); if(g_testFailed) g_nFailed++; else g_nPassed++;
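TEST_INIT clears the per-test failure flag, TEST_ASSERT latches a failure without aborting, and TEST_REPORT prints an [O]/[X] line and bumps the global pass/fail counters. A hypothetical test built from these macros (not part of this file, shown only to illustrate how they compose):

void exampleTest()
{
	TEST_INIT;                    // g_testFailed = 0

	int sum = 2 + 2;
	TEST_ASSERT( sum == 4 );      // sets g_testFailed on failure, execution continues

	TEST_REPORT( "exampleTest" ); // prints "[O] exampleTest" (or "[X]") and updates the counters
}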
|
||||
|
||||
void memCpyTest( Device* deviceData )
|
||||
{
|
||||
TEST_INIT;
|
||||
int maxSize = 64*1024;
|
||||
Buffer<u32> buff( deviceData, maxSize );
|
||||
|
||||
u32* hostBuff = new u32[maxSize];
|
||||
|
||||
for(int iter=0; iter<NUM_TESTS; iter++)
|
||||
{
|
||||
int size = getRandom( 1024, maxSize );
|
||||
|
||||
for(int i=0; i<size; i++) hostBuff[i] = i;
|
||||
|
||||
buff.write( hostBuff, size );
|
||||
|
||||
DeviceUtils::waitForCompletion( deviceData );
|
||||
for(int i=0; i<size; i++) hostBuff[i] = 0;
|
||||
|
||||
buff.read( hostBuff, size );
|
||||
|
||||
DeviceUtils::waitForCompletion( deviceData );
|
||||
for(int i=0; i<size; i++) TEST_ASSERT( hostBuff[i] == i );
|
||||
}
|
||||
|
||||
delete [] hostBuff;
|
||||
TEST_REPORT( "memCpyTest" );
|
||||
}
|
||||
|
||||
void kernelTest( Device* deviceData )
|
||||
{
|
||||
TEST_INIT;
|
||||
|
||||
KernelManager* manager = new KernelManager();
|
||||
|
||||
Kernel* kernel = manager->query(deviceData, ".\\Kernel", "VectorAddKernel" );
|
||||
|
||||
{
|
||||
int size = 1024;
|
||||
Buffer<int> buf0( deviceData, size );
|
||||
Buffer<int> buf1( deviceData, size );
|
||||
Buffer<float4> cBuf( deviceData, 1, BufferBase::BUFFER_CONST );
|
||||
int* hostBuf0 = new int[size];
|
||||
int* hostBuf1 = new int[size];
|
||||
for(int i=0; i<size; i++) { hostBuf0[i] = i; hostBuf1[i] = 1; }
|
||||
buf0.write( hostBuf0, size );
|
||||
buf1.write( hostBuf1, size );
|
||||
DeviceUtils::waitForCompletion( deviceData );
|
||||
|
||||
float4 constBuffer;
|
||||
constBuffer.x = (float)size;
|
||||
constBuffer.y = 2.f;
|
||||
constBuffer.z = 0.f;
|
||||
constBuffer.w = 0.f;
|
||||
{
|
||||
Launcher::BufferInfo bInfo[] = { Launcher::BufferInfo( (Buffer<float>*)&buf0 ), Launcher::BufferInfo( (Buffer<float>*)&buf1, true ) };
|
||||
|
||||
Launcher launcher( deviceData, kernel );
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
|
||||
launcher.setConst( (Buffer<float4>&)cBuf, constBuffer );
|
||||
launcher.launch1D( size );
|
||||
|
||||
buf0.read( hostBuf0, size );
|
||||
buf1.read( hostBuf1, size );
|
||||
DeviceUtils::waitForCompletion( deviceData );
|
||||
}
|
||||
|
||||
for(int i=0; i<size; i++) { TEST_ASSERT( hostBuf0[i] == i+1+2 ); }
|
||||
|
||||
delete [] hostBuf0;
|
||||
delete [] hostBuf1;
|
||||
}
|
||||
TEST_REPORT( "kernelTest" );
|
||||
}
|
||||
|
||||
void stopwatchTest( Device* deviceData )
|
||||
{
|
||||
{
|
||||
Stopwatch sw( deviceData );
|
||||
|
||||
sw.start();
|
||||
Sleep(2);
|
||||
sw.split();
|
||||
Sleep(2);
|
||||
sw.stop();
|
||||
|
||||
float t[2];
|
||||
sw.getMs( t, 2 );
|
||||
}
|
||||
}
|
||||
|
||||
template<DeviceType type>
|
||||
void scanTest( Device* deviceGPU, Device* deviceHost )
|
||||
{
|
||||
TEST_INIT;
|
||||
|
||||
ADLASSERT( type == deviceGPU->m_type );
|
||||
|
||||
int maxSize = 1024*256;
|
||||
|
||||
HostBuffer<u32> buf0( deviceHost, maxSize );
|
||||
HostBuffer<u32> buf1( deviceHost, maxSize );
|
||||
Buffer<u32> buf2( deviceGPU, maxSize );
|
||||
Buffer<u32> buf3( deviceGPU, maxSize );
|
||||
|
||||
PrefixScan<type>::Data* data0 = PrefixScan<type>::allocate( deviceGPU, maxSize );
|
||||
PrefixScan<TYPE_HOST>::Data* data1 = PrefixScan<TYPE_HOST>::allocate( deviceHost, maxSize );
|
||||
|
||||
int dx = maxSize/NUM_TESTS;
|
||||
for(int iter=0; iter<NUM_TESTS; iter++)
|
||||
{
|
||||
int size = min2( 128+dx*iter, maxSize );
|
||||
|
||||
for(int i=0; i<size; i++) buf0[i] = 1;
|
||||
buf2.write( buf0.m_ptr, size );
|
||||
DeviceUtils::waitForCompletion( deviceGPU );
|
||||
|
||||
u32 sumHost, sumGPU;
|
||||
PrefixScan<TYPE_HOST>::execute( data1, buf0, buf1, size, &sumHost );
|
||||
PrefixScan<type>::execute( data0, buf2, buf3, size, &sumGPU );
|
||||
|
||||
buf3.read( buf0.m_ptr, size );
|
||||
DeviceUtils::waitForCompletion( deviceGPU );
|
||||
TEST_ASSERT( sumHost == sumGPU );
|
||||
for(int i=0; i<size; i++) TEST_ASSERT( buf1[i] == buf0[i] );
|
||||
}
|
||||
|
||||
PrefixScan<TYPE_HOST>::deallocate( data1 );
|
||||
PrefixScan<type>::deallocate( data0 );
|
||||
|
||||
TEST_REPORT( "scanTest" );
|
||||
}
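scanTest only cross-checks the GPU scan and its returned sum against the host implementation, so the exact convention is whatever PrefixScan implements. As an illustration only (assuming an exclusive scan; the test itself does not pin this down), a minimal host prefix sum that also returns the total looks like this:

#include <cstdio>

typedef unsigned int u32;

// Minimal exclusive prefix sum returning the total, for illustration only.
static u32 prefixScanHost( const u32* in, u32* out, int n )
{
	u32 running = 0;
	for( int i=0; i<n; i++ )
	{
		out[i] = running;   // element i gets the sum of everything before it
		running += in[i];
	}
	return running;         // total of all inputs
}

int main()
{
	u32 in[4] = { 1, 1, 1, 1 };
	u32 out[4];
	u32 sum = prefixScanHost( in, out, 4 );
	printf( "%u %u %u %u (sum=%u)\n", out[0], out[1], out[2], out[3], sum );  // 0 1 2 3 (sum=4)
	return 0;
}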
|
||||
|
||||
template<DeviceType type, RadixSortBase::Option SORT_TYPE>
|
||||
bool radixSortTest( Device* deviceGPU, Device* deviceHost )
|
||||
{
|
||||
TEST_INIT;
|
||||
ADLASSERT( type == deviceGPU->m_type );
|
||||
|
||||
int maxSize = 1024*256;
|
||||
|
||||
HostBuffer<SortData> buf0( deviceHost, maxSize );
|
||||
HostBuffer<SortData> buf1( deviceHost, maxSize );
|
||||
Buffer<SortData> buf2( deviceGPU, maxSize );
|
||||
|
||||
RadixSort<TYPE_HOST>::Data* dataH = RadixSort<TYPE_HOST>::allocate( deviceHost, maxSize, RadixSortBase::SORT_SIMPLE );
|
||||
RadixSort<type>::Data* dataC = RadixSort<type>::allocate( deviceGPU, maxSize, SORT_TYPE );
|
||||
|
||||
int dx = maxSize/NUM_TESTS;
|
||||
for(int iter=0; iter<NUM_TESTS; iter++)
|
||||
{
|
||||
int size = min2( 128+dx*iter, maxSize-512 );
|
||||
size = NEXTMULTIPLEOF( size, 512 );
|
||||
|
||||
for(int i=0; i<size; i++) buf0[i] = SortData( getRandom(0,0xff), i );
|
||||
buf2.write( buf0.m_ptr, size );
|
||||
DeviceUtils::waitForCompletion( deviceGPU );
|
||||
|
||||
RadixSort<TYPE_HOST>::execute( dataH, buf0, size );
|
||||
RadixSort<type>::execute( dataC, buf2, size );
|
||||
|
||||
buf2.read( buf1.m_ptr, size );
|
||||
DeviceUtils::waitForCompletion( deviceGPU );
|
||||
for(int i=0; i<size; i++) TEST_ASSERT( buf0[i].m_value == buf1[i].m_value && buf0[i].m_key == buf1[i].m_key );
|
||||
}
|
||||
|
||||
RadixSort<TYPE_HOST>::deallocate( dataH );
|
||||
RadixSort<type>::deallocate( dataC );
|
||||
|
||||
return g_testFailed;
|
||||
}
|
||||
|
||||
template<DeviceType type>
|
||||
void radixSortSimpleTest( Device* deviceGPU, Device* deviceHost )
|
||||
{
|
||||
TEST_INIT;
|
||||
g_testFailed = radixSortTest<type, RadixSortBase::SORT_SIMPLE>(deviceGPU, deviceHost);
|
||||
TEST_REPORT( "radixSortSimpleTest" );
|
||||
}
|
||||
|
||||
template<DeviceType type>
|
||||
void radixSortStandardTest( Device* deviceGPU, Device* deviceHost )
|
||||
{
|
||||
TEST_INIT;
|
||||
g_testFailed = radixSortTest<type, RadixSortBase::SORT_STANDARD>(deviceGPU, deviceHost);
|
||||
TEST_REPORT( "radixSortStandardTest" );
|
||||
}
|
||||
|
||||
template<DeviceType type>
|
||||
void radixSortAdvancedTest( Device* deviceGPU, Device* deviceHost )
|
||||
{
|
||||
TEST_INIT;
|
||||
g_testFailed = radixSortTest<type, RadixSortBase::SORT_ADVANCED>(deviceGPU, deviceHost);
|
||||
TEST_REPORT( "radixSortAdvancedTest" );
|
||||
}
|
||||
|
||||
template<DeviceType type>
|
||||
void boundSearchTest( Device* deviceGPU, Device* deviceHost )
|
||||
{
|
||||
TEST_INIT;
|
||||
|
||||
ADLASSERT( type == deviceGPU->m_type );
|
||||
|
||||
int maxSize = 1024*256;
|
||||
int bucketSize = 256;
|
||||
|
||||
HostBuffer<SortData> buf0( deviceHost, maxSize );
|
||||
HostBuffer<u32> lowerH( deviceHost, maxSize );
|
||||
HostBuffer<u32> upperH( deviceHost, maxSize );
|
||||
|
||||
Buffer<SortData> buf( deviceGPU, maxSize );
|
||||
Buffer<u32> lower( deviceGPU, maxSize );
|
||||
Buffer<u32> upper( deviceGPU, maxSize );
|
||||
|
||||
BoundSearch<type>::Data* dataH = BoundSearch<type>::allocate( deviceGPU );
|
||||
RadixSort<TYPE_HOST>::Data* dataHSort = RadixSort<TYPE_HOST>::allocate( deviceHost, maxSize, RadixSortBase::SORT_SIMPLE );
|
||||
|
||||
int dx = maxSize/NUM_TESTS;
|
||||
for(int iter=0; iter<NUM_TESTS; iter++)
|
||||
{
|
||||
int size = min2( 128+dx*iter, maxSize );
|
||||
for(int i=0; i<size; i++) buf0[i] = SortData( getRandom(0,bucketSize), i );
|
||||
RadixSort<TYPE_HOST>::execute( dataHSort, buf0, size );
|
||||
buf.write( buf0.m_ptr, size );
|
||||
{
|
||||
u32* host = new u32[size];
|
||||
for(int i=0; i<size; i++) host[i] = -1;
|
||||
lower.write( host, size );
|
||||
upper.write( host, size );
|
||||
}
|
||||
DeviceUtils::waitForCompletion( deviceGPU );
|
||||
|
||||
BoundSearch<type>::execute( dataH, buf, size, lower, bucketSize, BoundSearchBase::BOUND_LOWER );
|
||||
BoundSearch<type>::execute( dataH, buf, size, upper, bucketSize, BoundSearchBase::BOUND_UPPER );
|
||||
|
||||
lower.read( lowerH.m_ptr, bucketSize );
|
||||
upper.read( upperH.m_ptr, bucketSize );
|
||||
DeviceUtils::waitForCompletion( deviceGPU );
|
||||
/*
|
||||
for(u32 i=1; i<(u32)bucketSize; i++)
|
||||
{
|
||||
for(u32 j=lowerH[i-1]; j<lowerH[i]; j++)
|
||||
{
|
||||
TEST_ASSERT( buf0[j].m_key < i );
|
||||
}
|
||||
}
|
||||
|
||||
for(u32 i=0; i<(u32)bucketSize; i++)
|
||||
{
|
||||
int jMin = (i==0)?0:upperH[i-1];
|
||||
for(u32 j=jMin; j<upperH[i]; j++)
|
||||
{
|
||||
TEST_ASSERT( buf0[j].m_key <= i );
|
||||
}
|
||||
}
|
||||
*/
|
||||
for(u32 i=0; i<(u32)bucketSize; i++)
|
||||
{
|
||||
for(u32 j=lowerH[i]; j<upperH[i]; j++)
|
||||
{
|
||||
if ( buf0[j].m_key != i )
|
||||
{
|
||||
printf("error %d != %d\n",buf0[j].m_key,i);
|
||||
}
|
||||
TEST_ASSERT( buf0[j].m_key == i );
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
BoundSearch<type>::deallocate( dataH );
|
||||
RadixSort<TYPE_HOST>::deallocate( dataHSort );
|
||||
|
||||
TEST_REPORT( "boundSearchTest" );
|
||||
}
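The verification loop above treats lower[i]..upper[i] as the half-open range of sorted elements whose key equals bucket i, i.e. the same semantics as std::lower_bound/std::upper_bound applied per bucket. A small host-side illustration in standard C++ (not the ADL API):

#include <algorithm>
#include <cstdio>

int main()
{
	// Sorted keys, as produced by the host radix sort in the test above
	int keys[] = { 0, 0, 1, 3, 3, 3, 7 };
	const int n = sizeof(keys)/sizeof(keys[0]);

	int bucket = 3;
	int* lo = std::lower_bound( keys, keys + n, bucket );  // first element >= 3
	int* up = std::upper_bound( keys, keys + n, bucket );  // first element > 3

	// Every element in [lo, up) has key == bucket, mirroring the TEST_ASSERT above.
	printf( "bucket %d occupies indices [%d, %d)\n", bucket, (int)(lo - keys), (int)(up - keys) );
	return 0;
}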
|
||||
|
||||
template<DeviceType type>
|
||||
void fillIntTest( Device* deviceGPU, Device* deviceHost )
|
||||
{
|
||||
TEST_INIT;
|
||||
|
||||
ADLASSERT( type == deviceGPU->m_type );
|
||||
|
||||
int maxSize = 1024*256;
|
||||
|
||||
HostBuffer<int> buf0( deviceHost, maxSize );
|
||||
HostBuffer<int> buf1( deviceHost, maxSize );
|
||||
Buffer<int> buf2( deviceGPU, maxSize );
|
||||
|
||||
Fill<TYPE_HOST>::Data* data0 = Fill<TYPE_HOST>::allocate( deviceHost );
|
||||
Fill<type>::Data* data1 = Fill<type>::allocate( deviceGPU );
|
||||
|
||||
int dx = maxSize/NUM_TESTS;
|
||||
for(int iter=0; iter<NUM_TESTS; iter++)
|
||||
{
|
||||
int size = min2( 128+dx*iter, maxSize );
|
||||
for(int i=0; i<size; i++) buf0[i] = -1;
|
||||
buf2.write( buf0.m_ptr, size );
|
||||
DeviceUtils::waitForCompletion( deviceGPU );
|
||||
|
||||
Fill<TYPE_HOST>::execute( data0, buf0, 12, size );
|
||||
Fill<type>::execute( data1, buf2, 12, size );
|
||||
|
||||
buf2.read( buf1.m_ptr, size );
|
||||
DeviceUtils::waitForCompletion( deviceGPU );
|
||||
for(int i=0; i<size; i++) TEST_ASSERT( buf1[i] == buf0[i] );
|
||||
}
|
||||
|
||||
Fill<TYPE_HOST>::deallocate( data0 );
|
||||
Fill<type>::deallocate( data1 );
|
||||
|
||||
TEST_REPORT( "fillIntTest" );
|
||||
}
|
||||
|
||||
template<DeviceType type>
|
||||
void fillInt2Test( Device* deviceGPU, Device* deviceHost )
|
||||
{
|
||||
TEST_INIT;
|
||||
|
||||
ADLASSERT( type == deviceGPU->m_type );
|
||||
|
||||
int maxSize = 1024*256;
|
||||
|
||||
HostBuffer<int2> buf0( deviceHost, maxSize );
|
||||
HostBuffer<int2> buf1( deviceHost, maxSize );
|
||||
Buffer<int2> buf2( deviceGPU, maxSize );
|
||||
|
||||
Fill<TYPE_HOST>::Data* data0 = Fill<TYPE_HOST>::allocate( deviceHost );
|
||||
Fill<type>::Data* data1 = Fill<type>::allocate( deviceGPU );
|
||||
|
||||
int dx = maxSize/NUM_TESTS;
|
||||
for(int iter=0; iter<NUM_TESTS; iter++)
|
||||
{
|
||||
int size = min2( 128+dx*iter, maxSize );
|
||||
for(int i=0; i<size; i++) buf0[i] = make_int2( -1, -1 );
|
||||
buf2.write( buf0.m_ptr, size );
|
||||
DeviceUtils::waitForCompletion( deviceGPU );
|
||||
|
||||
Fill<TYPE_HOST>::execute( data0, buf0, make_int2( 12, 12 ), size );
|
||||
Fill<type>::execute( data1, buf2, make_int2( 12, 12 ), size );
|
||||
|
||||
buf2.read( buf1.m_ptr, size );
|
||||
DeviceUtils::waitForCompletion( deviceGPU );
|
||||
for(int i=0; i<size; i++) TEST_ASSERT( buf1[i] == buf0[i] );
|
||||
}
|
||||
|
||||
Fill<TYPE_HOST>::deallocate( data0 );
|
||||
Fill<type>::deallocate( data1 );
|
||||
|
||||
TEST_REPORT( "fillInt2Test" );
|
||||
}
|
||||
|
||||
template<DeviceType type>
|
||||
void fillInt4Test( Device* deviceGPU, Device* deviceHost )
|
||||
{
|
||||
TEST_INIT;
|
||||
|
||||
ADLASSERT( type == deviceGPU->m_type );
|
||||
|
||||
int maxSize = 1024*256;
|
||||
|
||||
HostBuffer<int4> buf0( deviceHost, maxSize );
|
||||
HostBuffer<int4> buf1( deviceHost, maxSize );
|
||||
Buffer<int4> buf2( deviceGPU, maxSize );
|
||||
|
||||
Fill<TYPE_HOST>::Data* data0 = Fill<TYPE_HOST>::allocate( deviceHost );
|
||||
Fill<type>::Data* data1 = Fill<type>::allocate( deviceGPU );
|
||||
|
||||
int dx = maxSize/NUM_TESTS;
|
||||
for(int iter=0; iter<NUM_TESTS; iter++)
|
||||
{
|
||||
int size = min2( 128+dx*iter, maxSize );
|
||||
for(int i=0; i<size; i++) buf0[i] = make_int4( -1 );
|
||||
buf2.write( buf0.m_ptr, size );
|
||||
DeviceUtils::waitForCompletion( deviceGPU );
|
||||
|
||||
Fill<TYPE_HOST>::execute( data0, buf0, make_int4( 12 ), size );
|
||||
Fill<type>::execute( data1, buf2, make_int4( 12 ), size );
|
||||
|
||||
buf2.read( buf1.m_ptr, size );
|
||||
DeviceUtils::waitForCompletion( deviceGPU );
|
||||
for(int i=0; i<size; i++) TEST_ASSERT( buf1[i] == buf0[i] );
|
||||
}
|
||||
|
||||
Fill<TYPE_HOST>::deallocate( data0 );
|
||||
Fill<type>::deallocate( data1 );
|
||||
|
||||
TEST_REPORT( "fillInt4Test" );
|
||||
}
|
||||
|
||||
|
||||
template<DeviceType type, CopyBase::Option OPTION>
|
||||
bool CopyF4Test( Device* deviceGPU, Device* deviceHost )
|
||||
{
|
||||
TEST_INIT;
|
||||
ADLASSERT( type == deviceGPU->m_type );
|
||||
|
||||
int maxSize = 1024*256;
|
||||
|
||||
HostBuffer<float4> buf0( deviceHost, maxSize );
|
||||
HostBuffer<float4> buf1( deviceHost, maxSize );
|
||||
Buffer<float4> buf2( deviceGPU, maxSize );
|
||||
Buffer<float4> buf3( deviceGPU, maxSize );
|
||||
HostBuffer<float4> devResult( deviceHost, maxSize );
|
||||
|
||||
Copy<TYPE_HOST>::Data* data0 = Copy<TYPE_HOST>::allocate( deviceHost );
|
||||
Copy<type>::Data* data1 = Copy<type>::allocate( deviceGPU );
|
||||
|
||||
int dx = maxSize/NUM_TESTS;
|
||||
for(int iter=0; iter<NUM_TESTS; iter++)
|
||||
{
|
||||
int size = min2( 128+dx*iter, maxSize-4 );
|
||||
size = NEXTMULTIPLEOF( size, 4 );
|
||||
float r = 10000.f;
|
||||
for(int i=0; i<size; i++) buf0[i] = make_float4( getRandom( -r, r ), getRandom( -r, r ), getRandom( -r, r ), getRandom( -r, r ) );
|
||||
buf2.write( buf0.m_ptr, size );
|
||||
DeviceUtils::waitForCompletion( deviceGPU );
|
||||
|
||||
Copy<TYPE_HOST>::execute( data0, buf1, buf0, size, OPTION );
|
||||
Copy<type>::execute( data1, buf3, buf2, size, OPTION );
|
||||
|
||||
buf3.read( devResult.m_ptr, size );
|
||||
DeviceUtils::waitForCompletion( deviceGPU );
|
||||
for(int i=0; i<size; i++)
|
||||
{
|
||||
TEST_ASSERT( buf1[i] == devResult[i] );
|
||||
TEST_ASSERT( buf0[i] == devResult[i] );
|
||||
}
|
||||
}
|
||||
|
||||
Copy<TYPE_HOST>::deallocate( data0 );
|
||||
Copy<type>::deallocate( data1 );
|
||||
|
||||
return g_testFailed;
|
||||
}
|
||||
|
||||
template<DeviceType type>
|
||||
void Copy1F4Test( Device* deviceGPU, Device* deviceHost )
|
||||
{
|
||||
TEST_INIT;
|
||||
g_testFailed = CopyF4Test<type, CopyBase::PER_WI_1>( deviceGPU, deviceHost );
|
||||
TEST_REPORT( "Copy1F4Test" );
|
||||
}
|
||||
|
||||
template<DeviceType type>
|
||||
void Copy2F4Test( Device* deviceGPU, Device* deviceHost )
|
||||
{
|
||||
TEST_INIT;
|
||||
g_testFailed = CopyF4Test<type, CopyBase::PER_WI_2>( deviceGPU, deviceHost );
|
||||
TEST_REPORT( "Copy2F4Test" );
|
||||
}
|
||||
|
||||
template<DeviceType type>
|
||||
void Copy4F4Test( Device* deviceGPU, Device* deviceHost )
|
||||
{
|
||||
TEST_INIT;
|
||||
g_testFailed = CopyF4Test<type, CopyBase::PER_WI_4>( deviceGPU, deviceHost );
|
||||
TEST_REPORT( "Copy4F4Test" );
|
||||
}
|
||||
|
||||
|
||||
template<DeviceType type>
|
||||
void CopyF1Test( Device* deviceGPU, Device* deviceHost )
|
||||
{
|
||||
TEST_INIT;
|
||||
ADLASSERT( type == deviceGPU->m_type );
|
||||
|
||||
int maxSize = 1024*256;
|
||||
|
||||
HostBuffer<float> buf0( deviceHost, maxSize );
|
||||
HostBuffer<float> buf1( deviceHost, maxSize );
|
||||
Buffer<float> buf2( deviceGPU, maxSize );
|
||||
Buffer<float> buf3( deviceGPU, maxSize );
|
||||
HostBuffer<float> devResult( deviceHost, maxSize );
|
||||
|
||||
Copy<TYPE_HOST>::Data* data0 = Copy<TYPE_HOST>::allocate( deviceHost );
|
||||
Copy<type>::Data* data1 = Copy<type>::allocate( deviceGPU );
|
||||
|
||||
int dx = maxSize/NUM_TESTS;
|
||||
for(int iter=0; iter<NUM_TESTS; iter++)
|
||||
{
|
||||
int size = min2( 128+dx*iter, maxSize-4 );
|
||||
size = NEXTMULTIPLEOF( size, 4 );
|
||||
float r = 10000.f;
|
||||
for(int i=0; i<size; i++) buf0[i] = getRandom( -r, r );
|
||||
buf2.write( buf0.m_ptr, size );
|
||||
DeviceUtils::waitForCompletion( deviceGPU );
|
||||
|
||||
Copy<TYPE_HOST>::execute( data0, buf1, buf0, size );
|
||||
Copy<type>::execute( data1, buf3, buf2, size );
|
||||
|
||||
buf3.read( devResult.m_ptr, size );
|
||||
DeviceUtils::waitForCompletion( deviceGPU );
|
||||
for(int i=0; i<size; i++)
|
||||
{
|
||||
TEST_ASSERT( buf1[i] == devResult[i] );
|
||||
TEST_ASSERT( buf0[i] == devResult[i] );
|
||||
}
|
||||
}
|
||||
|
||||
Copy<TYPE_HOST>::deallocate( data0 );
|
||||
Copy<type>::deallocate( data1 );
|
||||
|
||||
TEST_REPORT( "CopyF1Test" );
|
||||
}
|
||||
|
||||
template<DeviceType type>
|
||||
void CopyF2Test( Device* deviceGPU, Device* deviceHost )
|
||||
{
|
||||
TEST_INIT;
|
||||
ADLASSERT( type == deviceGPU->m_type );
|
||||
|
||||
int maxSize = 1024*256;
|
||||
|
||||
HostBuffer<float2> buf0( deviceHost, maxSize );
|
||||
HostBuffer<float2> buf1( deviceHost, maxSize );
|
||||
Buffer<float2> buf2( deviceGPU, maxSize );
|
||||
Buffer<float2> buf3( deviceGPU, maxSize );
|
||||
HostBuffer<float2> devResult( deviceHost, maxSize );
|
||||
|
||||
Copy<TYPE_HOST>::Data* data0 = Copy<TYPE_HOST>::allocate( deviceHost );
|
||||
Copy<type>::Data* data1 = Copy<type>::allocate( deviceGPU );
|
||||
|
||||
int dx = maxSize/NUM_TESTS;
|
||||
for(int iter=0; iter<NUM_TESTS; iter++)
|
||||
{
|
||||
int size = min2( 128+dx*iter, maxSize-4 );
|
||||
size = NEXTMULTIPLEOF( size, 4 );
|
||||
float r = 10000.f;
|
||||
for(int i=0; i<size; i++) buf0[i] = make_float2( getRandom( -r, r ), getRandom( -r, r ) );
|
||||
buf2.write( buf0.m_ptr, size );
|
||||
DeviceUtils::waitForCompletion( deviceGPU );
|
||||
|
||||
Copy<TYPE_HOST>::execute( data0, buf1, buf0, size );
|
||||
Copy<type>::execute( data1, buf3, buf2, size );
|
||||
|
||||
buf3.read( devResult.m_ptr, size );
|
||||
DeviceUtils::waitForCompletion( deviceGPU );
|
||||
for(int i=0; i<size; i++)
|
||||
{
|
||||
TEST_ASSERT( buf1[i] == devResult[i] );
|
||||
TEST_ASSERT( buf0[i] == devResult[i] );
|
||||
}
|
||||
}
|
||||
|
||||
Copy<TYPE_HOST>::deallocate( data0 );
|
||||
Copy<type>::deallocate( data1 );
|
||||
|
||||
TEST_REPORT( "CopyF2Test" );
|
||||
}
|
||||
|
||||
template<DeviceType type>
|
||||
void radixSort32Test( Device* deviceGPU, Device* deviceHost )
|
||||
{
|
||||
TEST_INIT;
|
||||
ADLASSERT( type == deviceGPU->m_type );
|
||||
|
||||
int maxSize = 1024*256;
|
||||
|
||||
HostBuffer<u32> buf0( deviceHost, maxSize );
|
||||
HostBuffer<u32> buf1( deviceHost, maxSize );
|
||||
Buffer<u32> buf2( deviceGPU, maxSize );
|
||||
|
||||
RadixSort32<TYPE_HOST>::Data* dataH = RadixSort32<TYPE_HOST>::allocate( deviceHost, maxSize );
|
||||
RadixSort32<type>::Data* dataC = RadixSort32<type>::allocate( deviceGPU, maxSize );
|
||||
|
||||
int dx = maxSize/NUM_TESTS;
|
||||
for(int iter=0; iter<NUM_TESTS; iter++)
|
||||
{
|
||||
int size = min2( 128+dx*iter, maxSize-512 );
|
||||
size = NEXTMULTIPLEOF( size, 512 );
|
||||
|
||||
for(int i=0; i<size; i++) buf0[i] = getRandom(0u,0xffffffffu);
|
||||
buf2.write( buf0.m_ptr, size );
|
||||
DeviceUtils::waitForCompletion( deviceGPU );
|
||||
|
||||
RadixSort32<TYPE_HOST>::execute( dataH, buf0, size, 32 );
|
||||
RadixSort32<type>::execute( dataC, buf2, size, 32 );
|
||||
|
||||
buf2.read( buf1.m_ptr, size );
|
||||
DeviceUtils::waitForCompletion( deviceGPU );
|
||||
// for(int i=0; i<size-1; i++) TEST_ASSERT( buf1[i] <= buf1[i+1] );
|
||||
for(int i=0; i<size; i++) TEST_ASSERT( buf0[i] == buf1[i] );
|
||||
}
|
||||
|
||||
RadixSort32<TYPE_HOST>::deallocate( dataH );
|
||||
RadixSort32<type>::deallocate( dataC );
|
||||
|
||||
TEST_REPORT( "RadixSort32Test" );
|
||||
}
|
||||
|
||||
template<DeviceType type>
|
||||
void radixSortKeyValue32Test( Device* deviceGPU, Device* deviceHost )
|
||||
{
|
||||
TEST_INIT;
|
||||
ADLASSERT( type == deviceGPU->m_type );
|
||||
|
||||
int maxSize = 1024*256;
|
||||
|
||||
	// Host buffers
	HostBuffer<u32> buf0( deviceHost, maxSize ); // keys, sorted on the host (the reference result).
	HostBuffer<u32> buf1( deviceHost, maxSize ); // keys sorted on the device, read back and compared against buf0.
	HostBuffer<u32> buf2( deviceHost, maxSize ); // values paired with buf0, sorted on the host.
	HostBuffer<u32> buf3( deviceHost, maxSize ); // values sorted on the device, read back and compared against buf2.

	// Device buffers
	Buffer<u32> buf4( deviceGPU, maxSize ); // input keys for the device sort.
	Buffer<u32> buf5( deviceGPU, maxSize ); // output keys from the device sort; read back into buf1.
	Buffer<u32> buf6( deviceGPU, maxSize ); // input values for the device sort.
	Buffer<u32> buf7( deviceGPU, maxSize ); // output values from the device sort; read back into buf3.
|
||||
|
||||
RadixSort32<TYPE_HOST>::Data* dataH = RadixSort32<TYPE_HOST>::allocate( deviceHost, maxSize );
|
||||
RadixSort32<type>::Data* dataC = RadixSort32<type>::allocate( deviceGPU, maxSize );
|
||||
|
||||
int dx = maxSize/NUM_TESTS;
|
||||
|
||||
for(int iter=0; iter<NUM_TESTS; iter++)
|
||||
{
|
||||
int size = min2( 128+dx*iter, maxSize-512 );
|
||||
size = NEXTMULTIPLEOF( size, 512 );
|
||||
|
||||
// keys
|
||||
seedRandom((int)time(NULL)/2);
|
||||
for(int i=0; i<size; i++) buf0[i] = getRandom(0u,0xffffffffu);
|
||||
buf4.write( buf0.m_ptr, size );
|
||||
DeviceUtils::waitForCompletion( deviceGPU );
|
||||
|
||||
// values
|
||||
seedRandom((int)time(NULL)/2);
|
||||
for(int i=0; i<size; i++) buf2[i] = getRandom(0u,0xffffffffu);
|
||||
buf6.write( buf2.m_ptr, size );
|
||||
DeviceUtils::waitForCompletion( deviceGPU );
|
||||
|
||||
RadixSort32<TYPE_HOST>::execute( dataH, buf0, buf2, size, 32 );
|
||||
RadixSort32<type>::execute( dataC, buf4, buf5, buf6, buf7, size, 32 );
|
||||
buf5.read( buf1.m_ptr, size );
|
||||
buf7.read( buf3.m_ptr, size );
|
||||
|
||||
DeviceUtils::waitForCompletion( deviceGPU );
|
||||
|
||||
for(int i=0; i<size; i++)
|
||||
{
|
||||
// Comparing keys. One is done by Host and the other is done by Device.
|
||||
TEST_ASSERT( buf0[i] == buf1[i] );
|
||||
|
||||
// Comparing values. One is done by Host and the other is done by Device.
|
||||
TEST_ASSERT( buf2[i] == buf3[i] );
|
||||
}
|
||||
}
|
||||
|
||||
RadixSort32<TYPE_HOST>::deallocate( dataH );
|
||||
RadixSort32<type>::deallocate( dataC );
|
||||
|
||||
TEST_REPORT( "RadixSortKeyValue32Test" );
|
||||
}
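The host-vs-device comparison above amounts to checking the GPU result against a key-ordered, stable rearrangement of the key/value pairs. A hedged standalone sketch of that reference behaviour using the standard library (std::stable_sort with a key-only comparator; RadixSort32 itself is not used here):

#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

typedef unsigned int u32;

// Compare key/value pairs by key only, so a stable sort preserves the input
// order of equal keys -- the same guarantee a radix sort gives.
static bool byKey( const std::pair<u32,u32>& a, const std::pair<u32,u32>& b )
{
	return a.first < b.first;
}

int main()
{
	std::vector< std::pair<u32,u32> > kv;
	kv.push_back( std::make_pair( 30u, 0u ) );
	kv.push_back( std::make_pair( 10u, 1u ) );
	kv.push_back( std::make_pair( 10u, 2u ) );

	std::stable_sort( kv.begin(), kv.end(), byKey );

	for( size_t i=0; i<kv.size(); i++ )
		printf( "key=%u value=%u\n", kv[i].first, kv[i].second );  // 10/1, 10/2, 30/0
	return 0;
}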
|
||||
|
||||
#if defined(ADL_ENABLE_DX11)
|
||||
#define RUN_GPU( func ) func(ddcl); func(dddx);
|
||||
#define RUN_GPU_TEMPLATE( func ) func<TYPE_CL>( ddcl, ddhost ); func<TYPE_DX11>( dddx, ddhost );
|
||||
#define RUN_CL_TEMPLATE( func ) func<TYPE_CL>( ddcl, ddhost );
|
||||
#else
|
||||
#define RUN_GPU( func ) func(ddcl);
|
||||
#define RUN_GPU_TEMPLATE( func ) func<TYPE_CL>( ddcl, ddhost );
|
||||
#endif
|
||||
#define RUN_ALL( func ) RUN_GPU( func ); func(ddhost);
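These RUN_* helpers simply fan one test out over the devices allocated in runAllTest(). With ADL_ENABLE_DX11 defined, for example, the two macro invocations shown as comments below expand to the explicit calls next to them:

// RUN_GPU_TEMPLATE( scanTest ) expands to:
scanTest<TYPE_CL>( ddcl, ddhost );
scanTest<TYPE_DX11>( dddx, ddhost );

// RUN_ALL( memCpyTest ) expands to:
memCpyTest(ddcl); memCpyTest(dddx); memCpyTest(ddhost);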
|
||||
|
||||
void runAllTest()
|
||||
{
|
||||
g_nPassed = 0;
|
||||
g_nFailed = 0;
|
||||
|
||||
|
||||
Device* ddcl;
|
||||
Device* ddhost;
|
||||
#if defined(ADL_ENABLE_DX11)
|
||||
Device* dddx;
|
||||
#endif
|
||||
|
||||
{
|
||||
DeviceUtils::Config cfg;
|
||||
|
||||
// Choose AMD or NVidia
|
||||
#ifdef CL_PLATFORM_AMD
|
||||
cfg.m_vendor = adl::DeviceUtils::Config::VD_AMD;
|
||||
#endif
|
||||
|
||||
#ifdef CL_PLATFORM_INTEL
|
||||
cfg.m_vendor = adl::DeviceUtils::Config::VD_INTEL;
|
||||
cfg.m_type = DeviceUtils::Config::DEVICE_CPU;
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef CL_PLATFORM_NVIDIA
|
||||
cfg.m_vendor = adl::DeviceUtils::Config::VD_NV;
|
||||
#endif
|
||||
|
||||
|
||||
ddcl = DeviceUtils::allocate( TYPE_CL, cfg );
|
||||
ddhost = DeviceUtils::allocate( TYPE_HOST, cfg );
|
||||
// cfg.m_type = DeviceUtils::Config::DEVICE_GPU;
|
||||
#if defined(ADL_ENABLE_DX11)
|
||||
dddx = DeviceUtils::allocate( TYPE_DX11, cfg );
|
||||
#endif
|
||||
}
|
||||
|
||||
{
|
||||
char name[128];
|
||||
ddcl->getDeviceName( name );
|
||||
printf("CL: %s\n", name);
|
||||
#ifdef ADL_ENABLE_DX11
|
||||
dddx->getDeviceName( name );
|
||||
printf("DX11: %s\n", name);
|
||||
#endif
|
||||
}
|
||||
|
||||
RUN_GPU_TEMPLATE( radixSort32Test );
|
||||
RUN_GPU_TEMPLATE( radixSortKeyValue32Test );
|
||||
|
||||
if (1)
|
||||
{
|
||||
RUN_GPU_TEMPLATE( CopyF1Test );
|
||||
RUN_GPU_TEMPLATE( CopyF2Test );
|
||||
|
||||
boundSearchTest<TYPE_HOST>( ddhost, ddhost );
|
||||
// fillTest<TYPE_HOST>( ddhost, ddhost );
|
||||
// fillTest<TYPE_CL>( ddcl, ddhost );
|
||||
|
||||
|
||||
|
||||
|
||||
RUN_GPU_TEMPLATE( boundSearchTest );
|
||||
|
||||
RUN_GPU_TEMPLATE( fillIntTest );
|
||||
RUN_GPU_TEMPLATE( fillInt2Test );
|
||||
RUN_GPU_TEMPLATE( fillInt4Test );
|
||||
|
||||
RUN_ALL( stopwatchTest );
|
||||
RUN_ALL( memCpyTest );
|
||||
// RUN_GPU( kernelTest );
|
||||
RUN_GPU_TEMPLATE( scanTest );
|
||||
RUN_GPU_TEMPLATE( radixSortSimpleTest );
|
||||
|
||||
RUN_GPU_TEMPLATE( radixSortStandardTest );
|
||||
|
||||
RUN_GPU_TEMPLATE( radixSort32Test );
|
||||
|
||||
// RUN_GPU_TEMPLATE( boundSearchTest );
|
||||
RUN_GPU_TEMPLATE( Copy1F4Test );
|
||||
RUN_GPU_TEMPLATE( Copy2F4Test );
|
||||
RUN_GPU_TEMPLATE( Copy4F4Test );
|
||||
}
|
||||
|
||||
DeviceUtils::deallocate( ddcl );
|
||||
DeviceUtils::deallocate( ddhost );
|
||||
#if defined(ADL_ENABLE_DX11)
|
||||
DeviceUtils::deallocate( dddx );
|
||||
#endif
|
||||
|
||||
printf("=========\n%d Passed\n%d Failed\n", g_nPassed, g_nFailed);
|
||||
|
||||
|
||||
}
|
||||
118
Extras/RigidBodyGpuPipeline/opencl/primitives/AdlTest/main.cpp
Normal file
@@ -0,0 +1,118 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.

This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:

1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada


#include <stdio.h>

#include <Adl/Adl.h>
#include <AdlPrimitives/Math/Math.h>

#include "UnitTests.h"
#include "RadixSortBenchmark.h"
#include "LaunchOverheadBenchmark.h"


#undef NUM_TESTS


struct ConstBuffer
{
	float4 m_a;
	float4 m_b;
	float4 m_c;
};

int main()
{
	if(0)
	{	// radix sort test
		Device* deviceHost;
		Device* deviceGPU;
		{
			DeviceUtils::Config cfg;

			// Choose AMD or NVidia
#ifdef CL_PLATFORM_AMD
			cfg.m_vendor = DeviceUtils::Config::VD_AMD;
#endif

#ifdef CL_PLATFORM_INTEL
			cfg.m_vendor = DeviceUtils::Config::VD_INTEL;
#endif

#ifdef CL_PLATFORM_NVIDIA
			cfg.m_vendor = adl::DeviceUtils::Config::VD_NV;
#endif
			deviceGPU = DeviceUtils::allocate( TYPE_DX11, cfg );
			deviceHost = DeviceUtils::allocate( TYPE_HOST, cfg );
		}

		{
			int maxSize = 512*20;
			int size = maxSize;

			HostBuffer<SortData> buf0( deviceHost, maxSize );
			HostBuffer<SortData> buf1( deviceHost, maxSize );
			Buffer<SortData> buf2( deviceGPU, maxSize );

			RadixSort<TYPE_HOST>::Data* dataH = RadixSort<TYPE_HOST>::allocate( deviceHost, maxSize, RadixSortBase::SORT_STANDARD );
			RadixSort<TYPE_DX11>::Data* dataC = RadixSort<TYPE_DX11>::allocate( deviceGPU, maxSize, RadixSortBase::SORT_ADVANCED );

			{
				size = NEXTMULTIPLEOF( size, 512 );

				for(int i=0; i<size; i++) buf0[i] = SortData( getRandom(0,0xfff), i );
				buf2.write( buf0.m_ptr, size );
				DeviceUtils::waitForCompletion( deviceGPU );

				RadixSort<TYPE_HOST>::execute( dataH, buf0, size );
				RadixSort<TYPE_DX11>::execute( dataC, buf2, size );

				buf2.read( buf1.m_ptr, size );
				DeviceUtils::waitForCompletion( deviceGPU );
				for(int i=0; i<size; i++) ADLASSERT( buf0[i].m_value == buf1[i].m_value && buf0[i].m_key == buf1[i].m_key );
			}

			RadixSort<TYPE_HOST>::deallocate( dataH );
			RadixSort<TYPE_DX11>::deallocate( dataC );
		}

		DeviceUtils::deallocate( deviceHost );
		DeviceUtils::deallocate( deviceGPU );
	}

	if(0)
	{
		launchOverheadBenchmark();
	}

	if(0)
	{
		radixSortBenchmark<TYPE_DX11>();
	}

	if(0)
	{
		radixSortBenchmark<TYPE_CL>();
	}

	if(1)
	{
		runAllTest();
	}
	printf("End, press <enter>\n");
	getchar();
}
@@ -0,0 +1,4 @@

include "AMD"
include "NVIDIA"
include "Intel"
@@ -0,0 +1,29 @@

	hasCL = findOpenCL_AMD()
	hasDX11 = findDirectX11()

	if (hasCL) then

		project "OpenCL_DX11_radixsort_benchmark_AMD"

		initOpenCL_AMD()

		if (hasDX11) then
			initDirectX11()
		end

		language "C++"

		kind "ConsoleApp"
		targetdir "../../../../bin"
		includedirs {"..","../.."}

		links {
			"OpenCL"
		}

		files {
			"../test_large_problem_sorting.cpp"
		}

	end
@@ -0,0 +1,29 @@

	hasCL = findOpenCL_NVIDIA()
	hasDX11 = findDirectX11()

	if (hasCL) then

		project "OpenCL_DX11_radixsort_benchmark_NVIDIA"

		initOpenCL_NVIDIA()

		if (hasDX11) then
			initDirectX11()
		end

		language "C++"

		kind "ConsoleApp"
		targetdir "../../../../bin"
		includedirs {"..","../.."}

		links {
			"OpenCL"
		}

		files {
			"../test_large_problem_sorting.cpp"
		}

	end
@@ -0,0 +1,2 @@
include "AMD"
include "NVIDIA"
@@ -0,0 +1,705 @@
|
||||
/******************************************************************************
|
||||
* Copyright 2010 Duane Merrill
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
* AUTHORS' REQUEST:
|
||||
*
|
||||
* If you use|reference|benchmark this code, please cite our Technical
|
||||
* Report (http://www.cs.virginia.edu/~dgm4d/papers/RadixSortTR.pdf):
|
||||
*
|
||||
* @TechReport{ Merrill:Sorting:2010,
|
||||
* author = "Duane Merrill and Andrew Grimshaw",
|
||||
* title = "Revisiting Sorting for GPGPU Stream Architectures",
|
||||
* year = "2010",
|
||||
* institution = "University of Virginia, Department of Computer Science",
|
||||
* address = "Charlottesville, VA, USA",
|
||||
* number = "CS2010-03"
|
||||
* }
|
||||
*
|
||||
* For more information, see our Google Code project site:
|
||||
* http://code.google.com/p/back40computing/
|
||||
*
|
||||
* Thanks!
|
||||
******************************************************************************/
|
||||
|
||||
/******************************************************************************
|
||||
* Simple test driver program for *large-problem* radix sorting.
|
||||
*
|
||||
* Useful for demonstrating how to integrate radix sorting into
|
||||
* your application
|
||||
******************************************************************************/
|
||||
|
||||
/******************************************************************************
|
||||
* Converted from CUDA to OpenCL/DirectCompute by Erwin Coumans
|
||||
******************************************************************************/
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
#include <float.h>
|
||||
#include <algorithm>
|
||||
#include <string>
|
||||
|
||||
#define BUFFERSIZE_WORKAROUND
|
||||
|
||||
//#include <iostream>
|
||||
#include <sstream>
|
||||
/**********************
|
||||
*
|
||||
*/
|
||||
|
||||
#include "Adl/Adl.h"
|
||||
#include "AdlPrimitives/Sort/RadixSort32.h"
|
||||
#include "AdlPrimitives/Sort/SortData.h"
|
||||
|
||||
using namespace adl;
|
||||
|
||||
|
||||
/***********************
|
||||
*
|
||||
*/
|
||||
|
||||
bool g_verbose;
|
||||
|
||||
|
||||
/******************************************************************************
|
||||
* Routines
|
||||
******************************************************************************/
|
||||
|
||||
|
||||
/**
|
||||
* Keys-only sorting. Uses the GPU to sort the specified vector of elements for the given
|
||||
* number of iterations, displaying runtime information.
|
||||
*
|
||||
* @param[in] num_elements
|
||||
* Size in elements of the vector to sort
|
||||
* @param[in] h_keys
|
||||
* Vector of keys to sort
|
||||
* @param[in] iterations
|
||||
* Number of times to invoke the GPU sorting primitive
|
||||
* @param[in] cfg
|
||||
* Config
|
||||
*/
|
||||
template <typename K, DeviceType type>
|
||||
void TimedSort(
|
||||
unsigned int num_elements,
|
||||
K *h_keys,
|
||||
unsigned int iterations, const DeviceUtils::Config& cfg)
|
||||
{
|
||||
std::string sType = "No type selected";
|
||||
|
||||
if (type == TYPE_CL)
|
||||
sType = "OpenCL";
|
||||
else if (type == TYPE_DX11)
|
||||
sType = "DX11";
|
||||
|
||||
printf("Keys-only, %s, %d iterations, %d elements\n", sType.c_str(), iterations, num_elements);
|
||||
|
||||
int max_elements = num_elements;
|
||||
|
||||
#ifdef BUFFERSIZE_WORKAROUND
|
||||
if (max_elements < 1024*256)
|
||||
max_elements = 1024*256;
|
||||
#endif
|
||||
|
||||
// Allocate device storage
|
||||
Device* deviceData = NULL;
|
||||
|
||||
if ( type == TYPE_CL )
|
||||
deviceData = new DeviceCL();
|
||||
#ifdef ADL_ENABLE_DX11
|
||||
else if ( type == TYPE_DX11 )
|
||||
deviceData = new DeviceDX11();
|
||||
#endif //ADL_ENABLE_DX11
|
||||
|
||||
deviceData->initialize(cfg);
|
||||
|
||||
RadixSort32<type>::Data* planData = RadixSort32<type>::allocate( deviceData, max_elements);
|
||||
|
||||
{
|
||||
Buffer<unsigned int> keysInOut(deviceData,max_elements);
|
||||
|
||||
// Create sorting enactor
|
||||
keysInOut.write(h_keys,num_elements);
|
||||
DeviceUtils::waitForCompletion( deviceData);
|
||||
|
||||
RadixSort32<type>::execute( planData,keysInOut,num_elements, 32);
|
||||
DeviceUtils::waitForCompletion( deviceData);
|
||||
|
||||
// Perform the timed number of sorting iterations
|
||||
double elapsed = 0;
|
||||
float duration = 0;
|
||||
StopwatchHost watch;
|
||||
watch.init(deviceData);
|
||||
|
||||
watch.start();
|
||||
|
||||
for (int i = 0; i < iterations; i++)
|
||||
{
|
||||
|
||||
// Move a fresh copy of the problem into device storage
|
||||
keysInOut.write(h_keys,num_elements);
|
||||
DeviceUtils::waitForCompletion( deviceData);
|
||||
|
||||
// Start GPU timing record
|
||||
watch.start();
|
||||
|
||||
// Call the sorting API routine
|
||||
RadixSort32<type>::execute( planData,keysInOut,num_elements, 32);
|
||||
DeviceUtils::waitForCompletion( deviceData);
|
||||
|
||||
watch.stop();
|
||||
duration = watch.getMs();
|
||||
|
||||
// End GPU timing record
|
||||
elapsed += (double) duration;
|
||||
}
|
||||
|
||||
// Display timing information
|
||||
double avg_runtime = elapsed / iterations;
|
||||
// double throughput = ((double) num_elements) / avg_runtime / 1000.0 / 1000.0;
|
||||
// printf(", %f GPU ms, %f x10^9 elts/sec\n", avg_runtime, throughput);
|
||||
double throughput = ((double) num_elements) / avg_runtime / 1000.0 ;
|
||||
printf(", %f GPU ms, %f x10^6 elts/sec\n", avg_runtime, throughput);
|
||||
|
||||
// Copy out data
|
||||
keysInOut.read(h_keys,num_elements);
|
||||
|
||||
DeviceUtils::waitForCompletion( deviceData);
|
||||
|
||||
}
|
||||
// Free allocated memory
|
||||
RadixSort32<type>::deallocate( planData);
|
||||
delete deviceData;
|
||||
// Clean up events
|
||||
}
|
||||
|
||||
/**
|
||||
* Key-value sorting. Uses the GPU to sort the specified vector of elements for the given
|
||||
* number of iterations, displaying runtime information.
|
||||
*
|
||||
* @param[in] num_elements
|
||||
* Size in elements of the vector to sort
|
||||
* @param[in] h_keys
|
||||
* Vector of keys to sort
|
||||
* @param[in,out] h_values
|
||||
* Vector of values to sort
|
||||
* @param[in] iterations
|
||||
* Number of times to invoke the GPU sorting primitive
|
||||
* @param[in] cfg
|
||||
* Config
|
||||
*/
|
||||
template <typename K, typename V, DeviceType type>
|
||||
void TimedSort(
|
||||
unsigned int num_elements,
|
||||
K *h_keys,
|
||||
V *h_values,
|
||||
unsigned int iterations, const DeviceUtils::Config& cfg)
|
||||
{
|
||||
std::string sType = "No type selected";
|
||||
|
||||
if (type == TYPE_CL)
|
||||
sType = "OpenCL";
|
||||
else if (type == TYPE_DX11)
|
||||
sType = "DX11";
|
||||
|
||||
printf("Key-values, %s, %d iterations, %d elements\n", sType.c_str(), iterations, num_elements);
|
||||
|
||||
int max_elements = num_elements;
|
||||
|
||||
#ifdef BUFFERSIZE_WORKAROUND
|
||||
if (max_elements < 1024*256)
|
||||
max_elements = 1024*256;
|
||||
#endif
|
||||
|
||||
// Allocate device storage
|
||||
Device* deviceData = NULL;
|
||||
|
||||
if ( type == TYPE_CL )
|
||||
deviceData = new DeviceCL();
|
||||
#ifdef ADL_ENABLE_DX11
|
||||
else if ( type == TYPE_DX11 )
|
||||
deviceData = new DeviceDX11();
|
||||
#endif //ADL_ENABLE_DX11
|
||||
|
||||
deviceData->initialize(cfg);
|
||||
RadixSort32<type>::Data* planData = RadixSort32<type>::allocate( deviceData, max_elements);
|
||||
{
|
||||
Buffer<unsigned int> keysIn(deviceData,max_elements);
|
||||
Buffer<unsigned int> valuesIn(deviceData,max_elements);
|
||||
|
||||
Buffer<unsigned int> keysOut(deviceData,max_elements);
|
||||
Buffer<unsigned int> valuesOut(deviceData,max_elements);
|
||||
|
||||
//printf("Key-values, %d iterations, %d elements", iterations, num_elements);
|
||||
|
||||
// Create sorting enactor
|
||||
keysIn.write(h_keys,num_elements);
|
||||
DeviceUtils::waitForCompletion( deviceData);
|
||||
valuesIn.write(h_values,num_elements);
|
||||
DeviceUtils::waitForCompletion( deviceData);
|
||||
|
||||
|
||||
// Perform a single sorting iteration to allocate memory, prime code caches, etc.
|
||||
//RadixSort<type>::execute( planData, buffer, num_elements );
|
||||
|
||||
//RadixSort32<type>::execute( planData, keysIn,keysOut, valuesIn,valuesOut, num_elements, 32);
|
||||
RadixSort32<type>::execute( planData, keysIn,keysOut, valuesIn,valuesOut, num_elements, 32);
|
||||
DeviceUtils::waitForCompletion( deviceData);
|
||||
|
||||
// Perform the timed number of sorting iterations
|
||||
double elapsed = 0;
|
||||
float duration = 0;
|
||||
StopwatchHost watch;
|
||||
watch.init(deviceData);
|
||||
|
||||
watch.start();
|
||||
|
||||
for (int i = 0; i < iterations; i++)
|
||||
{
|
||||
|
||||
// Move a fresh copy of the problem into device storage
|
||||
keysIn.write(h_keys,num_elements);
|
||||
valuesIn.write(h_values,num_elements);
|
||||
|
||||
DeviceUtils::waitForCompletion( deviceData);
|
||||
|
||||
// Start GPU timing record
|
||||
watch.start();
|
||||
|
||||
// Call the sorting API routine
|
||||
|
||||
RadixSort32<type>::execute( planData, keysIn,keysOut, valuesIn,valuesOut, num_elements, 32);
|
||||
|
||||
DeviceUtils::waitForCompletion( deviceData);
|
||||
|
||||
watch.stop();
|
||||
duration = watch.getMs();
|
||||
|
||||
// End GPU timing record
|
||||
elapsed += (double) duration;
|
||||
}
|
||||
|
||||
// Display timing information
|
||||
double avg_runtime = elapsed / iterations;
|
||||
// double throughput = ((double) num_elements) / avg_runtime / 1000.0 / 1000.0;
|
||||
// printf(", %f GPU ms, %f x10^9 elts/sec\n", avg_runtime, throughput);
|
||||
double throughput = ((double) num_elements) / avg_runtime / 1000.0 ;
|
||||
printf(", %f GPU ms, %f x10^6 elts/sec\n", avg_runtime, throughput);
|
||||
|
||||
//memset(h_keys,1,num_elements);
|
||||
//memset(h_values,1,num_elements);
|
||||
// Copy out data
|
||||
keysOut.read(h_keys,num_elements);
|
||||
valuesOut.read(h_values,num_elements);
|
||||
|
||||
DeviceUtils::waitForCompletion( deviceData);
|
||||
}
|
||||
|
||||
// Free allocated memory
|
||||
RadixSort32<type>::deallocate( planData);
|
||||
delete deviceData;
|
||||
// Clean up events
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Generates random 32-bit keys.
|
||||
*
|
||||
* We always take the second-order byte from rand() because the higher-order
|
||||
* bits returned by rand() are commonly considered more uniformly distributed
|
||||
* than the lower-order bits.
|
||||
*
|
||||
* We can decrease the entropy level of keys by adopting the technique
|
||||
* of Thearling and Smith in which keys are computed from the bitwise AND of
|
||||
* multiple random samples:
|
||||
*
|
||||
* entropy_reduction | Effectively-unique bits per key
|
||||
* -----------------------------------------------------
|
||||
* -1 | 0
|
||||
* 0 | 32
|
||||
* 1 | 25.95
|
||||
* 2 | 17.41
|
||||
* 3 | 10.78
|
||||
* 4 | 6.42
|
||||
* ... | ...
|
||||
*
|
||||
*/
|
||||
template <typename K>
|
||||
void RandomBits(K &key, int entropy_reduction = 0, int lower_key_bits = sizeof(K) * 8)
|
||||
{
|
||||
const unsigned int NUM_UCHARS = (sizeof(K) + sizeof(unsigned char) - 1) / sizeof(unsigned char);
|
||||
unsigned char key_bits[NUM_UCHARS];
|
||||
|
||||
do {
|
||||
|
||||
for (int j = 0; j < NUM_UCHARS; j++) {
|
||||
unsigned char quarterword = 0xff;
|
||||
for (int i = 0; i <= entropy_reduction; i++) {
|
||||
quarterword &= (rand() >> 7);
|
||||
}
|
||||
key_bits[j] = quarterword;
|
||||
}
|
||||
|
||||
if (lower_key_bits < sizeof(K) * 8) {
|
||||
unsigned long long base = 0;
|
||||
memcpy(&base, key_bits, sizeof(K));
|
||||
base &= (1 << lower_key_bits) - 1;
|
||||
memcpy(key_bits, &base, sizeof(K));
|
||||
}
|
||||
|
||||
memcpy(&key, key_bits, sizeof(K));
|
||||
|
||||
} while (key != key); // avoids NaNs when generating random floating point numbers
|
||||
}
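The entropy-reduction trick described in the comment above works because AND-ing (entropy_reduction + 1) independent random samples leaves each bit set with probability 2^-(entropy_reduction+1), so keys become progressively less unique. A small self-contained illustration of that effect (not part of the original test driver):

#include <cstdio>
#include <cstdlib>

// AND together (e + 1) random bytes, as RandomBits() does per byte, and count
// how many bits survive on average: roughly 4, 2, 1, 0.5, 0.25 bits per byte.
int main()
{
	srand(0);
	for( int e = 0; e <= 4; e++ )
	{
		int setBits = 0;
		const int samples = 100000;
		for( int s = 0; s < samples; s++ )
		{
			unsigned char b = 0xff;
			for( int i = 0; i <= e; i++ )
				b &= (unsigned char)( rand() >> 7 );   // same sampling as RandomBits() above
			for( int bit = 0; bit < 8; bit++ )
				setBits += ( b >> bit ) & 1;
		}
		printf( "entropy_reduction=%d: %.2f bits set per byte on average\n",
			e, setBits / (float)samples );
	}
	return 0;
}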
|
||||
|
||||
|
||||
/******************************************************************************
|
||||
* Templated routines for printing keys/values to the console
|
||||
******************************************************************************/
|
||||
|
||||
template<typename T>
|
||||
void PrintValue(T val) {
|
||||
printf("%d", val);
|
||||
}
|
||||
|
||||
template<>
|
||||
void PrintValue<float>(float val) {
|
||||
printf("%f", val);
|
||||
}
|
||||
|
||||
template<>
|
||||
void PrintValue<double>(double val) {
|
||||
printf("%f", val);
|
||||
}
|
||||
|
||||
template<>
|
||||
void PrintValue<unsigned char>(unsigned char val) {
|
||||
printf("%u", val);
|
||||
}
|
||||
|
||||
template<>
|
||||
void PrintValue<unsigned short>(unsigned short val) {
|
||||
printf("%u", val);
|
||||
}
|
||||
|
||||
template<>
|
||||
void PrintValue<unsigned int>(unsigned int val) {
|
||||
printf("%u", val);
|
||||
}
|
||||
|
||||
template<>
|
||||
void PrintValue<long>(long val) {
|
||||
printf("%ld", val);
|
||||
}
|
||||
|
||||
template<>
|
||||
void PrintValue<unsigned long>(unsigned long val) {
|
||||
printf("%lu", val);
|
||||
}
|
||||
|
||||
template<>
|
||||
void PrintValue<long long>(long long val) {
|
||||
printf("%lld", val);
|
||||
}
|
||||
|
||||
template<>
|
||||
void PrintValue<unsigned long long>(unsigned long long val) {
|
||||
printf("%llu", val);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Compares the equivalence of two arrays
|
||||
*/
|
||||
template <typename T, typename SizeT>
|
||||
int CompareResults(T* computed, T* reference, SizeT len, bool verbose = true)
|
||||
{
|
||||
printf("\n");
|
||||
for (SizeT i = 0; i < len; i++) {
|
||||
|
||||
if (computed[i] != reference[i]) {
|
||||
printf("INCORRECT: [%lu]: ", (unsigned long) i);
|
||||
PrintValue<T>(computed[i]);
|
||||
printf(" != ");
|
||||
PrintValue<T>(reference[i]);
|
||||
|
||||
if (verbose) {
|
||||
printf("\nresult[...");
|
||||
for (size_t j = (i >= 5) ? i - 5 : 0; (j < i + 5) && (j < len); j++) {
|
||||
PrintValue<T>(computed[j]);
|
||||
printf(", ");
|
||||
}
|
||||
printf("...]");
|
||||
printf("\nreference[...");
|
||||
for (size_t j = (i >= 5) ? i - 5 : 0; (j < i + 5) && (j < len); j++) {
|
||||
PrintValue<T>(reference[j]);
|
||||
printf(", ");
|
||||
}
|
||||
printf("...]");
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
printf("CORRECT\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates an example sorting problem whose keys is a vector of the specified
|
||||
* number of K elements, values of V elements, and then dispatches the problem
|
||||
* to the GPU for the given number of iterations, displaying runtime information.
|
||||
*
|
||||
* @param[in] iterations
|
||||
* Number of times to invoke the GPU sorting primitive
|
||||
* @param[in] num_elements
|
||||
* Size in elements of the vector to sort
|
||||
* @param[in] cfg
|
||||
* Config
|
||||
*/
|
||||
template<typename K, typename V, DeviceType type>
|
||||
void TestSort(
|
||||
unsigned int iterations,
|
||||
int num_elements,
|
||||
bool keys_only, const DeviceUtils::Config& cfg)
|
||||
{
|
||||
// Allocate the sorting problem on the host and fill the keys with random bytes
|
||||
|
||||
K *h_keys = NULL;
|
||||
K *h_reference_keys = NULL;
|
||||
V *h_values = NULL;
|
||||
h_keys = (K*) malloc(num_elements * sizeof(K));
|
||||
h_reference_keys = (K*) malloc(num_elements * sizeof(K));
|
||||
if (!keys_only) h_values = (V*) malloc(num_elements * sizeof(V));
|
||||
|
||||
|
||||
// Use random bits
|
||||
for (unsigned int i = 0; i < num_elements; ++i) {
|
||||
RandomBits<K>(h_keys[i], 0);
|
||||
//h_keys[i] = 0xffffffffu-i;
|
||||
if (!keys_only)
|
||||
h_values[i] = h_keys[i];//0xffffffffu-i;
|
||||
|
||||
h_reference_keys[i] = h_keys[i];
|
||||
}
|
||||
|
||||
// Run the timing test
|
||||
if (keys_only) {
|
||||
TimedSort<K, type>(num_elements, h_keys, iterations, cfg);
|
||||
} else {
|
||||
TimedSort<K, V, type>(num_elements, h_keys, h_values, iterations, cfg);
|
||||
}
|
||||
|
||||
// cudaThreadSynchronize();
|
||||
|
||||
// Display sorted key data
|
||||
if (g_verbose) {
|
||||
printf("\n\nKeys:\n");
|
||||
for (int i = 0; i < num_elements; i++) {
|
||||
PrintValue<K>(h_keys[i]);
|
||||
printf(", ");
|
||||
}
|
||||
printf("\n\n");
|
||||
}
|
||||
|
||||
// Verify solution
|
||||
std::sort(h_reference_keys, h_reference_keys + num_elements);
|
||||
CompareResults<K>(h_keys, h_reference_keys, num_elements, true);
|
||||
printf("\n");
|
||||
fflush(stdout);
|
||||
|
||||
// Free our allocated host memory
|
||||
if (h_keys != NULL) free(h_keys);
|
||||
if (h_values != NULL) free(h_values);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Displays the commandline usage for this tool
|
||||
*/
|
||||
void Usage()
|
||||
{
|
||||
printf("\ntest_large_problem_sorting [--device=<device index>] [--v] [--i=<num-iterations>] [--n=<num-elements>] [--keys-only]\n");
|
||||
printf("\n");
|
||||
printf("\t--v\tDisplays sorted results to the console.\n");
|
||||
printf("\n");
|
||||
printf("\t--i\tPerforms the sorting operation <num-iterations> times\n");
|
||||
printf("\t\t\ton the device. Re-copies original input each time. Default = 1\n");
|
||||
printf("\n");
|
||||
printf("\t--n\tThe number of elements to comprise the sample problem\n");
|
||||
printf("\t\t\tDefault = 512\n");
|
||||
printf("\n");
|
||||
printf("\t--keys-only\tSpecifies that keys are not accommodated by value pairings\n");
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
|
||||
/******************************************************************************
|
||||
* Command-line parsing
|
||||
******************************************************************************/
|
||||
#include <map>
|
||||
#include <algorithm>
|
||||
#include <string>
|
||||
|
||||
class CommandLineArgs
|
||||
{
|
||||
protected:
|
||||
|
||||
std::map<std::string, std::string> pairs;
|
||||
|
||||
public:
|
||||
|
||||
// Constructor
|
||||
CommandLineArgs(int argc, char **argv)
|
||||
{
|
||||
using namespace std;
|
||||
|
||||
for (int i = 1; i < argc; i++)
|
||||
{
|
||||
string arg = argv[i];
|
||||
|
||||
if ((arg[0] != '-') || (arg[1] != '-')) {
|
||||
continue;
|
||||
}
|
||||
|
||||
string::size_type pos;
|
||||
string key, val;
|
||||
if ((pos = arg.find( '=')) == string::npos) {
|
||||
key = string(arg, 2, arg.length() - 2);
|
||||
val = "";
|
||||
} else {
|
||||
key = string(arg, 2, pos - 2);
|
||||
val = string(arg, pos + 1, arg.length() - 1);
|
||||
}
|
||||
pairs[key] = val;
|
||||
}
|
||||
}
|
||||
|
||||
bool CheckCmdLineFlag(const char* arg_name)
|
||||
{
|
||||
using namespace std;
|
||||
map<string, string>::iterator itr;
|
||||
if ((itr = pairs.find(arg_name)) != pairs.end()) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void GetCmdLineArgument(const char *arg_name, T &val);
|
||||
|
||||
int ParsedArgc()
|
||||
{
|
||||
return pairs.size();
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
void CommandLineArgs::GetCmdLineArgument(const char *arg_name, T &val)
|
||||
{
|
||||
using namespace std;
|
||||
map<string, string>::iterator itr;
|
||||
if ((itr = pairs.find(arg_name)) != pairs.end()) {
|
||||
istringstream strstream(itr->second);
|
||||
strstream >> val;
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
void CommandLineArgs::GetCmdLineArgument<char*>(const char* arg_name, char* &val)
|
||||
{
|
||||
using namespace std;
|
||||
map<string, string>::iterator itr;
|
||||
if ((itr = pairs.find(arg_name)) != pairs.end()) {
|
||||
|
||||
string s = itr->second;
|
||||
val = (char*) malloc(sizeof(char) * (s.length() + 1));
|
||||
strcpy(val, s.c_str());
|
||||
|
||||
} else {
|
||||
val = NULL;
|
||||
}
|
||||
}
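Assuming the CommandLineArgs class defined above, a caller consumes the flags documented by Usage() as in the sketch below (the sample command line is illustrative; main() further down performs the same parsing):

// e.g. invoked as: test_large_problem_sorting --i=10 --n=4096 --keys-only --v
CommandLineArgs args( argc, argv );

unsigned int iterations   = 1;    // defaults stay in place unless the flag is present
unsigned int num_elements = 512;
args.GetCmdLineArgument( "i", iterations );
args.GetCmdLineArgument( "n", num_elements );
bool keys_only = args.CheckCmdLineFlag( "keys-only" );
bool verbose   = args.CheckCmdLineFlag( "v" );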
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/******************************************************************************
|
||||
* Main
|
||||
******************************************************************************/
|
||||
|
||||
int main( int argc, char** argv)
|
||||
{
|
||||
|
||||
//srand(time(NULL));
|
||||
srand(0); // presently deterministic
|
||||
|
||||
unsigned int num_elements = 1024*1024*12;//16*1024;//8*524288;//2048;//512;//524288;
|
||||
unsigned int iterations = 10;
|
||||
bool keys_only;
|
||||
|
||||
//
|
||||
// Check command line arguments
|
||||
//
|
||||
|
||||
CommandLineArgs args(argc,argv);
|
||||
|
||||
if (args.CheckCmdLineFlag("help"))
|
||||
{
|
||||
Usage();
|
||||
return 0;
|
||||
}
|
||||
|
||||
args.GetCmdLineArgument("i", iterations);
|
||||
args.GetCmdLineArgument("n", num_elements);
|
||||
keys_only = args.CheckCmdLineFlag("keys-only");
|
||||
g_verbose = args.CheckCmdLineFlag("v");
|
||||
|
||||
DeviceUtils::Config cfg;
|
||||
|
||||
// Choose AMD or NVidia
|
||||
#ifdef CL_PLATFORM_AMD
|
||||
cfg.m_vendor = DeviceUtils::Config::VD_AMD;
|
||||
#endif
|
||||
|
||||
#ifdef CL_PLATFORM_NVIDIA
|
||||
cfg.m_vendor = DeviceUtils::Config::VD_NV;
|
||||
#endif
|
||||
|
||||
TestSort<unsigned int, unsigned int, TYPE_CL>(
|
||||
iterations,
|
||||
num_elements,
|
||||
keys_only, cfg);
|
||||
|
||||
#ifdef ADL_ENABLE_DX11
|
||||
TestSort<unsigned int, unsigned int, TYPE_DX11>(
|
||||
iterations,
|
||||
num_elements,
|
||||
keys_only, cfg);
|
||||
#endif //ADL_ENABLE_DX11
|
||||
}
|
||||
|
||||
|
||||
|
||||