Add the GPU rigid body pipeline from https://github.com/erwincoumans/experiments as a Bullet 3.x preview for Bullet 2.80
This commit is contained in:
@@ -0,0 +1,73 @@
|
||||
/*
|
||||
Copyright (c) 2012 Advanced Micro Devices, Inc.
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
//Originally written by Takahiro Harada
|
||||
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <Adl/Adl.h>
|
||||
#include <AdlPrimitives/Math/Math.h>
|
||||
|
||||
namespace adl
|
||||
{
|
||||
|
||||
/// Non-template base shared by every Copy<TYPE> backend; it only carries the
/// option enum so callers can name the options without a device type.
class CopyBase
{
	public:
		/// Selects the float4 copy kernel used by the templated Copy<TYPE>::execute().
		/// The suffix is the number of float4 elements one work-item copies
		/// (execute() launches n, n/2 or n/4 work-items respectively).
		enum Option
		{
			PER_WI_1 = 0,
			PER_WI_2 = 1,
			PER_WI_4 = 2
		};
};
|
||||
|
||||
template<DeviceType TYPE>
|
||||
class Copy : public CopyBase
|
||||
{
|
||||
public:
|
||||
typedef Launcher::BufferInfo BufferInfo;
|
||||
|
||||
struct Data
|
||||
{
|
||||
const Device* m_device;
|
||||
Kernel* m_copy1F4Kernel;
|
||||
Kernel* m_copy2F4Kernel;
|
||||
Kernel* m_copy4F4Kernel;
|
||||
Kernel* m_copyF1Kernel;
|
||||
Kernel* m_copyF2Kernel;
|
||||
Buffer<int4>* m_constBuffer;
|
||||
};
|
||||
|
||||
static
|
||||
Data* allocate(const Device* deviceData);
|
||||
|
||||
static
|
||||
void deallocate(Data* data);
|
||||
|
||||
static
|
||||
void execute( Data* data, Buffer<float4>& dst, Buffer<float4>& src, int n, Option option = PER_WI_1);
|
||||
|
||||
static
|
||||
void execute( Data* data, Buffer<float2>& dst, Buffer<float2>& src, int n);
|
||||
|
||||
static
|
||||
void execute( Data* data, Buffer<float>& dst, Buffer<float>& src, int n);
|
||||
};
|
||||
|
||||
|
||||
#include <AdlPrimitives/Copy/CopyHost.inl>
|
||||
#include <AdlPrimitives/Copy/Copy.inl>
|
||||
|
||||
};
|
||||
@@ -0,0 +1,151 @@
|
||||
/*
|
||||
Copyright (c) 2012 Advanced Micro Devices, Inc.
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
//Originally written by Takahiro Harada
|
||||
|
||||
|
||||
|
||||
// Kernel source location (without file extension) and the entry-point names
// passed to Device::getKernel() in allocate(). All of them are #undef'd again
// at the end of this file.
#define PATH	"..\\..\\opencl\\primitives\\AdlPrimitives\\Copy\\CopyKernels"
#define KERNEL0	"Copy1F4Kernel"
#define KERNEL1	"Copy2F4Kernel"
#define KERNEL2	"Copy4F4Kernel"
#define KERNEL3	"CopyF1Kernel"
#define KERNEL4	"CopyF2Kernel"
|
||||
|
||||
#include <AdlPrimitives/Copy/CopyKernelsCL.h>
|
||||
#include <AdlPrimitives/Copy/CopyKernelsDX11.h>
|
||||
|
||||
|
||||
template<DeviceType TYPE>
|
||||
typename Copy<TYPE>::Data* Copy<TYPE>::allocate( const Device* device )
|
||||
{
|
||||
ADLASSERT( TYPE == device->m_type );
|
||||
|
||||
|
||||
const char* src[] =
|
||||
#if defined(ADL_LOAD_KERNEL_FROM_STRING)
|
||||
{copyKernelsCL, copyKernelsDX11};
|
||||
// ADLASSERT(0);
|
||||
#else
|
||||
{0,0};
|
||||
#endif
|
||||
|
||||
Data* data = new Data;
|
||||
data->m_device = device;
|
||||
data->m_copy1F4Kernel = device->getKernel( PATH, KERNEL0, 0, src[TYPE] );
|
||||
data->m_copy2F4Kernel = device->getKernel( PATH, KERNEL1, 0, src[TYPE] );
|
||||
data->m_copy4F4Kernel = device->getKernel( PATH, KERNEL2, 0, src[TYPE] );
|
||||
data->m_copyF1Kernel = device->getKernel( PATH, KERNEL3, 0, src[TYPE] );
|
||||
data->m_copyF2Kernel = device->getKernel( PATH, KERNEL4, 0, src[TYPE] );
|
||||
data->m_constBuffer = new Buffer<int4>( device, 1, BufferBase::BUFFER_CONST );
|
||||
|
||||
return data;
|
||||
}
|
||||
|
||||
template<DeviceType TYPE>
|
||||
void Copy<TYPE>::deallocate( Data* data )
|
||||
{
|
||||
delete data->m_constBuffer;
|
||||
delete data;
|
||||
}
|
||||
|
||||
template<DeviceType TYPE>
|
||||
void Copy<TYPE>::execute( Data* data, Buffer<float4>& dst, Buffer<float4>& src, int n, Option option )
|
||||
{
|
||||
ADLASSERT( TYPE == dst.getType() );
|
||||
ADLASSERT( TYPE == src.getType() );
|
||||
|
||||
int4 constBuffer;
|
||||
constBuffer.x = n;
|
||||
|
||||
switch (option)
|
||||
{
|
||||
case PER_WI_1:
|
||||
{
|
||||
BufferInfo bInfo[] = { BufferInfo( &dst ), BufferInfo( &src, true ) };
|
||||
|
||||
Launcher launcher( data->m_device, data->m_copy1F4Kernel );
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
|
||||
launcher.setConst( *data->m_constBuffer, constBuffer );
|
||||
launcher.launch1D( n/1 );
|
||||
}
|
||||
break;
|
||||
case PER_WI_2:
|
||||
{
|
||||
ADLASSERT( n%2 == 0 );
|
||||
BufferInfo bInfo[] = { BufferInfo( &dst ), BufferInfo( &src, true ) };
|
||||
|
||||
Launcher launcher( data->m_device, data->m_copy2F4Kernel );
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
|
||||
launcher.setConst( *data->m_constBuffer, constBuffer );
|
||||
launcher.launch1D( n/2 );
|
||||
}
|
||||
break;
|
||||
case PER_WI_4:
|
||||
{
|
||||
ADLASSERT( n%4 == 0 );
|
||||
BufferInfo bInfo[] = { BufferInfo( &dst ), BufferInfo( &src, true ) };
|
||||
|
||||
Launcher launcher( data->m_device, data->m_copy4F4Kernel );
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
|
||||
launcher.setConst( *data->m_constBuffer, constBuffer );
|
||||
launcher.launch1D( n/4 );
|
||||
}
|
||||
break;
|
||||
default:
|
||||
ADLASSERT(0);
|
||||
break;
|
||||
};
|
||||
}
|
||||
|
||||
template<DeviceType TYPE>
|
||||
void Copy<TYPE>::execute( Data* data, Buffer<float2>& dst, Buffer<float2>& src, int n )
|
||||
{
|
||||
ADLASSERT( TYPE == dst.getType() );
|
||||
ADLASSERT( TYPE == src.getType() );
|
||||
|
||||
int4 constBuffer;
|
||||
constBuffer.x = n;
|
||||
|
||||
BufferInfo bInfo[] = { BufferInfo( &dst ), BufferInfo( &src, true ) };
|
||||
|
||||
Launcher launcher( data->m_device, data->m_copyF2Kernel );
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
|
||||
launcher.setConst( *data->m_constBuffer, constBuffer );
|
||||
launcher.launch1D( n/1 );
|
||||
}
|
||||
|
||||
template<DeviceType TYPE>
|
||||
void Copy<TYPE>::execute( Data* data, Buffer<float>& dst, Buffer<float>& src, int n )
|
||||
{
|
||||
ADLASSERT( TYPE == dst.getType() );
|
||||
ADLASSERT( TYPE == src.getType() );
|
||||
|
||||
int4 constBuffer;
|
||||
constBuffer.x = n;
|
||||
|
||||
BufferInfo bInfo[] = { BufferInfo( &dst ), BufferInfo( &src, true ) };
|
||||
|
||||
Launcher launcher( data->m_device, data->m_copyF1Kernel );
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
|
||||
launcher.setConst( *data->m_constBuffer, constBuffer );
|
||||
launcher.launch1D( n/1 );
|
||||
}
|
||||
|
||||
|
||||
#undef PATH
|
||||
#undef KERNEL0
|
||||
#undef KERNEL1
|
||||
#undef KERNEL2
|
||||
#undef KERNEL3
|
||||
#undef KERNEL4
|
||||
@@ -0,0 +1,85 @@
|
||||
/*
|
||||
Copyright (c) 2012 Advanced Micro Devices, Inc.
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
//Originally written by Takahiro Harada
|
||||
|
||||
|
||||
template<>
|
||||
class Copy<TYPE_HOST> : public CopyBase
|
||||
{
|
||||
public:
|
||||
typedef Launcher::BufferInfo BufferInfo;
|
||||
|
||||
struct Data
|
||||
{
|
||||
};
|
||||
|
||||
static
|
||||
Data* allocate(const Device* deviceData)
|
||||
{
|
||||
ADLASSERT( TYPE_HOST == deviceData->m_type );
|
||||
return 0;
|
||||
}
|
||||
|
||||
static
|
||||
void deallocate(Data* data)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
static
|
||||
void execute( Data* data, Buffer<float4>& dst, Buffer<float4>& src, int n, Option option = PER_WI_1)
|
||||
{
|
||||
ADLASSERT( TYPE_HOST == dst.getType() );
|
||||
ADLASSERT( TYPE_HOST == src.getType() );
|
||||
|
||||
HostBuffer<float4>& dstH = (HostBuffer<float4>&)dst;
|
||||
HostBuffer<float4>& srcH = (HostBuffer<float4>&)src;
|
||||
|
||||
for(int i=0; i<n; i++)
|
||||
{
|
||||
dstH[i] = srcH[i];
|
||||
}
|
||||
}
|
||||
|
||||
static
|
||||
void execute( Data* data, Buffer<float2>& dst, Buffer<float2>& src, int n)
|
||||
{
|
||||
ADLASSERT( TYPE_HOST == dst.getType() );
|
||||
ADLASSERT( TYPE_HOST == src.getType() );
|
||||
|
||||
HostBuffer<float2>& dstH = (HostBuffer<float2>&)dst;
|
||||
HostBuffer<float2>& srcH = (HostBuffer<float2>&)src;
|
||||
|
||||
for(int i=0; i<n; i++)
|
||||
{
|
||||
dstH[i] = srcH[i];
|
||||
}
|
||||
}
|
||||
|
||||
static
|
||||
void execute( Data* data, Buffer<float>& dst, Buffer<float>& src, int n)
|
||||
{
|
||||
ADLASSERT( TYPE_HOST == dst.getType() );
|
||||
ADLASSERT( TYPE_HOST == src.getType() );
|
||||
|
||||
HostBuffer<float>& dstH = (HostBuffer<float>&)dst;
|
||||
HostBuffer<float>& srcH = (HostBuffer<float>&)src;
|
||||
|
||||
for(int i=0; i<n; i++)
|
||||
{
|
||||
dstH[i] = srcH[i];
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
@@ -0,0 +1,128 @@
|
||||
/*
|
||||
Copyright (c) 2012 Advanced Micro Devices, Inc.
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
//Originally written by Takahiro Harada
|
||||
|
||||
// Extensions requested for this program: AMD printf and 32-bit local-memory atomics.
#pragma OPENCL EXTENSION cl_amd_printf : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable

typedef unsigned int u32;

// Shorthands for the dimension-0 work-item / work-group queries and for
// local-memory synchronisation. The copy kernels below only use GET_GLOBAL_IDX.
#define GET_GROUP_IDX		get_group_id(0)
#define GET_LOCAL_IDX		get_local_id(0)
#define GET_GLOBAL_IDX		get_global_id(0)
#define GET_GROUP_SIZE		get_local_size(0)
#define GROUP_LDS_BARRIER	barrier(CLK_LOCAL_MEM_FENCE)
#define GROUP_MEM_FENCE		mem_fence(CLK_LOCAL_MEM_FENCE)

// Atomic increment; AtomInc1 additionally stores the value returned by atom_inc in `out`.
#define AtomInc(x)			atom_inc(&(x))
#define AtomInc1(x, out)	out = atom_inc(&(x))

// Vector "constructors" spelled as OpenCL vector-literal casts.
#define make_uint4	(uint4)
#define make_uint2	(uint2)
#define make_int2	(int2)
|
||||
// Kernel constants, passed by value to every copy kernel.
// Four ints in total; the host side fills it from an int4 whose .x holds the element count.
typedef struct
{
	int m_n;			// number of elements to copy
	int m_padding[3];	// unused by the kernels
} ConstBuffer;
|
||||
|
||||
|
||||
|
||||
// Copies one float4 per work-item: dst[i] = src[i] for i < cb.m_n.
// Work-items whose global index is at or beyond cb.m_n do nothing.
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
void Copy1F4Kernel(__global float4* dst, __global float4* src,
					ConstBuffer cb)
{
	const int i = GET_GLOBAL_IDX;
	if( i >= cb.m_n )
		return;

	dst[i] = src[i];
}
|
||||
|
||||
// Copies two consecutive float4 per work-item: elements 2*gIdx and 2*gIdx+1.
// cb.m_n is the total element count (the host wrapper asserts it is even).
//
// The guard requires BOTH indices to be inside [0, cb.m_n). The previous test
// `2*gIdx <= cb.m_n` let the work-item with 2*gIdx == cb.m_n through, which
// read and wrote elements m_n and m_n+1, one pair past the end of the buffers.
// Such a work-item exists whenever the launch is padded to the required
// work-group size of 64.
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
void Copy2F4Kernel(__global float4* dst, __global float4* src,
					ConstBuffer cb)
{
	int gIdx = GET_GLOBAL_IDX;

	if( 2*gIdx+1 < cb.m_n )
	{
		float4 a0 = src[gIdx*2+0];
		float4 a1 = src[gIdx*2+1];

		dst[ gIdx*2+0 ] = a0;
		dst[ gIdx*2+1 ] = a1;
	}
}
|
||||
|
||||
// Copies four consecutive float4 per work-item: elements 4*gIdx .. 4*gIdx+3.
// cb.m_n is the total element count (the host wrapper asserts it is a multiple of 4).
//
// The guard requires ALL four indices to be inside [0, cb.m_n). The previous
// test `4*gIdx <= cb.m_n` let the work-item with 4*gIdx == cb.m_n through,
// which read and wrote elements m_n .. m_n+3 past the end of the buffers.
// Such a work-item exists whenever the launch is padded to the required
// work-group size of 64.
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
void Copy4F4Kernel(__global float4* dst, __global float4* src,
					ConstBuffer cb)
{
	int gIdx = GET_GLOBAL_IDX;

	if( 4*gIdx+3 < cb.m_n )
	{
		int idx0 = gIdx*4+0;
		int idx1 = gIdx*4+1;
		int idx2 = gIdx*4+2;
		int idx3 = gIdx*4+3;

		// All loads are issued before the stores.
		float4 a0 = src[idx0];
		float4 a1 = src[idx1];
		float4 a2 = src[idx2];
		float4 a3 = src[idx3];

		dst[ idx0 ] = a0;
		dst[ idx1 ] = a1;
		dst[ idx2 ] = a2;
		dst[ idx3 ] = a3;
	}
}
|
||||
|
||||
// Copies one float per work-item: dstF1[i] = srcF1[i] for i < cb.m_n.
// Work-items whose global index is at or beyond cb.m_n do nothing.
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
void CopyF1Kernel(__global float* dstF1, __global float* srcF1,
					ConstBuffer cb)
{
	const int i = GET_GLOBAL_IDX;
	if( i >= cb.m_n )
		return;

	dstF1[i] = srcF1[i];
}
|
||||
|
||||
// Copies one float2 per work-item: dstF2[i] = srcF2[i] for i < cb.m_n.
// Work-items whose global index is at or beyond cb.m_n do nothing.
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
void CopyF2Kernel(__global float2* dstF2, __global float2* srcF2,
					ConstBuffer cb)
{
	const int i = GET_GLOBAL_IDX;
	if( i >= cb.m_n )
		return;

	dstF2[i] = srcF2[i];
}
|
||||
|
||||
@@ -0,0 +1,130 @@
|
||||
/*
|
||||
Copyright (c) 2012 Advanced Micro Devices, Inc.
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
//Originally written by Takahiro Harada
|
||||
|
||||
|
||||
typedef uint u32;

// Shorthands over the system-value arguments declared by DEFAULT_ARGS.
// The copy kernels below only use GET_GLOBAL_IDX.
#define GET_GROUP_IDX		groupIdx.x
#define GET_LOCAL_IDX		localIdx.x
#define GET_GLOBAL_IDX		globalIdx.x
#define GROUP_LDS_BARRIER	GroupMemoryBarrierWithGroupSync()
#define GROUP_MEM_FENCE

// Parameter list shared by every compute-shader entry point in this file.
#define DEFAULT_ARGS uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID

// Atomic increment; AtomInc1 additionally receives the original value in `out`.
#define AtomInc(x)			InterlockedAdd(x, 1)
#define AtomInc1(x, out)	InterlockedAdd(x, 1, out)

// Vector "constructors" mapped onto the HLSL vector types.
#define make_uint4	uint4
#define make_uint2	uint2
#define make_int2	int2

// Thread-group size used by every [numthreads] attribute below.
#define WG_SIZE 64

#define GET_GROUP_SIZE WG_SIZE
|
||||
|
||||
|
||||
|
||||
// Kernel constants at b0: four ints in total.
cbuffer CB : register( b0 )
{
	int m_n;			// number of elements to copy
	int m_padding[3];	// unused by the kernels
};

// Buffers used by the three float4 kernels (Copy1F4 / Copy2F4 / Copy4F4):
// destination as UAV u0, source as SRV t0.
RWStructuredBuffer<float4> dst : register( u0 );
StructuredBuffer<float4> src : register( t0 );
|
||||
|
||||
// Copies one float4 per thread: dst[i] = src[i] for i < m_n.
// Threads whose dispatch index is at or beyond m_n do nothing.
[numthreads(WG_SIZE, 1, 1)]
void Copy1F4Kernel( DEFAULT_ARGS )
{
	const int i = GET_GLOBAL_IDX;
	if( i >= m_n )
		return;

	dst[i] = src[i];
}
|
||||
|
||||
// Copies two consecutive float4 per thread: elements 2*gIdx and 2*gIdx+1.
// m_n is the total element count (the host wrapper asserts it is even).
//
// The guard requires BOTH indices to be inside [0, m_n). The previous test
// `2*gIdx <= m_n` let the thread with 2*gIdx == m_n through, which accessed
// elements m_n and m_n+1, one pair past the requested range. Such a thread
// exists whenever the dispatch is padded to whole groups of WG_SIZE threads.
[numthreads(WG_SIZE, 1, 1)]
void Copy2F4Kernel( DEFAULT_ARGS )
{
	int gIdx = GET_GLOBAL_IDX;

	if( 2*gIdx+1 < m_n )
	{
		float4 a0 = src[gIdx*2+0];
		float4 a1 = src[gIdx*2+1];

		dst[ gIdx*2+0 ] = a0;
		dst[ gIdx*2+1 ] = a1;
	}
}
|
||||
|
||||
// Copies four consecutive float4 per thread: elements 4*gIdx .. 4*gIdx+3.
// m_n is the total element count (the host wrapper asserts it is a multiple of 4).
//
// The guard requires ALL four indices to be inside [0, m_n). The previous test
// `4*gIdx <= m_n` let the thread with 4*gIdx == m_n through, which accessed
// elements m_n .. m_n+3 past the requested range. Such a thread exists
// whenever the dispatch is padded to whole groups of WG_SIZE threads.
[numthreads(WG_SIZE, 1, 1)]
void Copy4F4Kernel( DEFAULT_ARGS )
{
	int gIdx = GET_GLOBAL_IDX;

	if( 4*gIdx+3 < m_n )
	{
		int idx0 = gIdx*4+0;
		int idx1 = gIdx*4+1;
		int idx2 = gIdx*4+2;
		int idx3 = gIdx*4+3;

		// All loads are issued before the stores.
		float4 a0 = src[idx0];
		float4 a1 = src[idx1];
		float4 a2 = src[idx2];
		float4 a3 = src[idx3];

		dst[ idx0 ] = a0;
		dst[ idx1 ] = a1;
		dst[ idx2 ] = a2;
		dst[ idx3 ] = a3;
	}
}
|
||||
|
||||
// Buffers for the float kernel. They use the same u0 / t0 slots as the
// float4 buffers; only CopyF1Kernel references them.
RWStructuredBuffer<float> dstF1 : register( u0 );
StructuredBuffer<float> srcF1 : register( t0 );

// Copies one float per thread: dstF1[i] = srcF1[i] for i < m_n.
// Threads whose dispatch index is at or beyond m_n do nothing.
[numthreads(WG_SIZE, 1, 1)]
void CopyF1Kernel( DEFAULT_ARGS )
{
	const int i = GET_GLOBAL_IDX;
	if( i >= m_n )
		return;

	dstF1[i] = srcF1[i];
}
|
||||
|
||||
// Buffers for the float2 kernel. They use the same u0 / t0 slots as the
// float4 buffers; only CopyF2Kernel references them.
RWStructuredBuffer<float2> dstF2 : register( u0 );
StructuredBuffer<float2> srcF2 : register( t0 );

// Copies one float2 per thread: dstF2[i] = srcF2[i] for i < m_n.
// Threads whose dispatch index is at or beyond m_n do nothing.
[numthreads(WG_SIZE, 1, 1)]
void CopyF2Kernel( DEFAULT_ARGS )
{
	const int i = GET_GLOBAL_IDX;
	if( i >= m_n )
		return;

	dstF2[i] = srcF2[i];
}
|
||||
@@ -0,0 +1,119 @@
|
||||
// OpenCL source of the copy kernels, embedded as one string so the program can
// be built without reading the .cl file (used by Copy<TYPE>::allocate() when
// ADL_LOAD_KERNEL_FROM_STRING is defined).
//
// Only the bounds guards of Copy2F4Kernel and Copy4F4Kernel differ from the
// previous text: they were `2*gIdx <= cb.m_n` / `4*gIdx <= cb.m_n`, which let
// the work-item sitting exactly at the end of the range read and write past
// element m_n-1. They now require the last index a work-item touches to be
// inside the range.
static const char* copyKernelsCL =
"/*\n"
" 2011 Takahiro Harada\n"
"*/\n"
"\n"
"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
"#define AtomInc(x) atom_inc(&(x))\n"
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
"\n"
"#define make_uint4 (uint4)\n"
"#define make_uint2 (uint2)\n"
"#define make_int2 (int2)\n"
"\n"
"typedef struct\n"
"{\n"
" int m_n;\n"
" int m_padding[3];\n"
"} ConstBuffer;\n"
"\n"
"\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void Copy1F4Kernel(__global float4* dst, __global float4* src, \n"
" ConstBuffer cb)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < cb.m_n )\n"
" {\n"
" float4 a0 = src[gIdx];\n"
"\n"
" dst[ gIdx ] = a0;\n"
" }\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void Copy2F4Kernel(__global float4* dst, __global float4* src, \n"
" ConstBuffer cb)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
// Fixed guard: both 2*gIdx and 2*gIdx+1 must be valid indices.
" if( 2*gIdx+1 < cb.m_n )\n"
" {\n"
" float4 a0 = src[gIdx*2+0];\n"
" float4 a1 = src[gIdx*2+1];\n"
"\n"
" dst[ gIdx*2+0 ] = a0;\n"
" dst[ gIdx*2+1 ] = a1;\n"
" }\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void Copy4F4Kernel(__global float4* dst, __global float4* src, \n"
" ConstBuffer cb)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
// Fixed guard: 4*gIdx .. 4*gIdx+3 must all be valid indices.
" if( 4*gIdx+3 < cb.m_n )\n"
" {\n"
" int idx0 = gIdx*4+0;\n"
" int idx1 = gIdx*4+1;\n"
" int idx2 = gIdx*4+2;\n"
" int idx3 = gIdx*4+3;\n"
"\n"
" float4 a0 = src[idx0];\n"
" float4 a1 = src[idx1];\n"
" float4 a2 = src[idx2];\n"
" float4 a3 = src[idx3];\n"
"\n"
" dst[ idx0 ] = a0;\n"
" dst[ idx1 ] = a1;\n"
" dst[ idx2 ] = a2;\n"
" dst[ idx3 ] = a3;\n"
" }\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void CopyF1Kernel(__global float* dstF1, __global float* srcF1, \n"
" ConstBuffer cb)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < cb.m_n )\n"
" {\n"
" float a0 = srcF1[gIdx];\n"
"\n"
" dstF1[ gIdx ] = a0;\n"
" }\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void CopyF2Kernel(__global float2* dstF2, __global float2* srcF2, \n"
" ConstBuffer cb)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < cb.m_n )\n"
" {\n"
" float2 a0 = srcF2[gIdx];\n"
"\n"
" dstF2[ gIdx ] = a0;\n"
" }\n"
"}\n"
"\n"
;
|
||||
@@ -0,0 +1,120 @@
|
||||
// HLSL (DirectCompute) source of the copy kernels, embedded as one string so
// the shaders can be built without reading the .hlsl file (used by
// Copy<TYPE>::allocate() when ADL_LOAD_KERNEL_FROM_STRING is defined).
//
// Only the bounds guards of Copy2F4Kernel and Copy4F4Kernel differ from the
// previous text: they were `2*gIdx <= m_n` / `4*gIdx <= m_n`, which let the
// thread sitting exactly at the end of the range access elements past m_n-1.
// They now require the last index a thread touches to be inside the range.
static const char* copyKernelsDX11 =
"/*\n"
" 2011 Takahiro Harada\n"
"*/\n"
"\n"
"typedef uint u32;\n"
"\n"
"#define GET_GROUP_IDX groupIdx.x\n"
"#define GET_LOCAL_IDX localIdx.x\n"
"#define GET_GLOBAL_IDX globalIdx.x\n"
"#define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync()\n"
"#define GROUP_MEM_FENCE\n"
"#define DEFAULT_ARGS uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID\n"
"#define AtomInc(x) InterlockedAdd(x, 1)\n"
"#define AtomInc1(x, out) InterlockedAdd(x, 1, out)\n"
"\n"
"#define make_uint4 uint4\n"
"#define make_uint2 uint2\n"
"#define make_int2 int2\n"
"\n"
"#define WG_SIZE 64\n"
"\n"
"#define GET_GROUP_SIZE WG_SIZE\n"
"\n"
"\n"
"\n"
"cbuffer CB : register( b0 )\n"
"{\n"
" int m_n;\n"
" int m_padding[3];\n"
"};\n"
"\n"
"RWStructuredBuffer<float4> dst : register( u0 );\n"
"StructuredBuffer<float4> src : register( t0 );\n"
"\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void Copy1F4Kernel( DEFAULT_ARGS )\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < m_n )\n"
" {\n"
" float4 a0 = src[gIdx];\n"
"\n"
" dst[ gIdx ] = a0;\n"
" }\n"
"}\n"
"\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void Copy2F4Kernel( DEFAULT_ARGS )\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
// Fixed guard: both 2*gIdx and 2*gIdx+1 must be valid indices.
" if( 2*gIdx+1 < m_n )\n"
" {\n"
" float4 a0 = src[gIdx*2+0];\n"
" float4 a1 = src[gIdx*2+1];\n"
"\n"
" dst[ gIdx*2+0 ] = a0;\n"
" dst[ gIdx*2+1 ] = a1;\n"
" }\n"
"}\n"
"\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void Copy4F4Kernel( DEFAULT_ARGS )\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
// Fixed guard: 4*gIdx .. 4*gIdx+3 must all be valid indices.
" if( 4*gIdx+3 < m_n )\n"
" {\n"
" int idx0 = gIdx*4+0;\n"
" int idx1 = gIdx*4+1;\n"
" int idx2 = gIdx*4+2;\n"
" int idx3 = gIdx*4+3;\n"
"\n"
" float4 a0 = src[idx0];\n"
" float4 a1 = src[idx1];\n"
" float4 a2 = src[idx2];\n"
" float4 a3 = src[idx3];\n"
"\n"
" dst[ idx0 ] = a0;\n"
" dst[ idx1 ] = a1;\n"
" dst[ idx2 ] = a2;\n"
" dst[ idx3 ] = a3;\n"
" }\n"
"}\n"
"\n"
"RWStructuredBuffer<float> dstF1 : register( u0 );\n"
"StructuredBuffer<float> srcF1 : register( t0 );\n"
"\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void CopyF1Kernel( DEFAULT_ARGS )\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < m_n )\n"
" {\n"
" float a0 = srcF1[gIdx];\n"
"\n"
" dstF1[ gIdx ] = a0;\n"
" }\n"
"\n"
"}\n"
"\n"
"RWStructuredBuffer<float2> dstF2 : register( u0 );\n"
"StructuredBuffer<float2> srcF2 : register( t0 );\n"
"\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void CopyF2Kernel( DEFAULT_ARGS )\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < m_n )\n"
" {\n"
" float2 a0 = srcF2[gIdx];\n"
"\n"
" dstF2[ gIdx ] = a0;\n"
" }\n"
"}\n"
;
|
||||
Reference in New Issue
Block a user