Files
bullet3/opencl/lds_bank_conflict/lds_kernels.cl
2013-03-11 22:03:27 +01:00

172 lines
5.6 KiB
Common Lisp

#define TILE_DIM 32
#define BLOCK_ROWS 8
/*// simple copy kernel (CUDA)
// Used as reference case representing best effective bandwidth.
__global__ void copy(float *odata, const float *idata)
{
int x = blockIdx.x * TILE_DIM + threadIdx.x;
int y = blockIdx.y * TILE_DIM + threadIdx.y;
int width = gridDim.x * TILE_DIM;
for (int j = 0; j < TILE_DIM; j+= BLOCK_ROWS)
odata[(y+j)*width + x] = idata[(y+j)*width + x];
}
*/
// simple copy kernel (OpenCL)
__kernel void copyKernel(__global float* odata, __global const float* idata)
{
int x = get_group_id(0) * get_num_groups(0) + get_local_id(0);
int y = get_group_id(1) * get_num_groups(1) + get_local_id(1);
int width = get_num_groups(0) * get_local_size(0);
for (int j = 0; j < get_num_groups(1); j+= get_local_size(1))
{
odata[(y+j)*width + x] = idata[(y+j)*width + x];
}
}
/*
// copy kernel using shared memory (CUDA)
// Also used as reference case, demonstrating effect of using shared memory.
__global__ void copySharedMem(float *odata, const float *idata)
{
__shared__ float tile[TILE_DIM * TILE_DIM];
int x = blockIdx.x * TILE_DIM + threadIdx.x;
int y = blockIdx.y * TILE_DIM + threadIdx.y;
int width = gridDim.x * TILE_DIM;
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
tile[(threadIdx.y+j)*TILE_DIM + threadIdx.x] = idata[(y+j)*width + x];
__syncthreads();
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
odata[(y+j)*width + x] = tile[(threadIdx.y+j)*TILE_DIM + threadIdx.x];
}
*/
// copy kernel using shared memory (OpenCL)
// Also used as reference case, demonstrating effect of using shared memory.
__kernel void copySharedMemKernel(__global float *odata, __global const float *idata)
{
__local float tile[TILE_DIM * TILE_DIM];
int x = get_group_id(0) * get_num_groups(0) + get_local_id(0);
int y = get_group_id(1) * get_num_groups(1) + get_local_id(1);
int width = get_num_groups(0) * get_local_size(0);
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
tile[(get_local_id(1)+j)*TILE_DIM + get_local_id(0)] = idata[(y+j)*width + x];
barrier(CLK_LOCAL_MEM_FENCE);
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
odata[(y+j)*width + x] = tile[(get_local_id(1)+j)*TILE_DIM + get_local_id(0)];
}
/*
// naive transpose (CUDA)
// Simplest transpose; doesn't use shared memory.
// Global memory reads are coalesced but writes are not.
__global__ void transposeNaive(float *odata, const float *idata)
{
int x = blockIdx.x * TILE_DIM + threadIdx.x;
int y = blockIdx.y * TILE_DIM + threadIdx.y;
int width = gridDim.x * TILE_DIM;
for (int j = 0; j < TILE_DIM; j+= BLOCK_ROWS)
odata[x*width + (y+j)] = idata[(y+j)*width + x];
}
*/
// naive transpose (OpenCL)
// Simplest transpose; doesn't use shared memory.
// Global memory reads are coalesced but writes are not.
__kernel void transposeNaiveKernel(__global float *odata, __global const float *idata)
{
int x = get_group_id(0) * get_num_groups(0) + get_local_id(0);
int y = get_group_id(1) * get_num_groups(1) + get_local_id(1);
int width = get_num_groups(0) * get_local_size(0);
for (int j = 0; j < TILE_DIM; j+= BLOCK_ROWS)
odata[x*width + (y+j)] = idata[(y+j)*width + x];
}
/*
// coalesced transpose (CUDA)
// Uses shared memory to achieve coalesing in both reads and writes
// Tile width == #banks causes shared memory bank conflicts.
__global__ void transposeCoalesced(float *odata, const float *idata)
{
__shared__ float tile[TILE_DIM][TILE_DIM];
int x = blockIdx.x * TILE_DIM + threadIdx.x;
int y = blockIdx.y * TILE_DIM + threadIdx.y;
int width = gridDim.x * TILE_DIM;
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x];
__syncthreads();
x = blockIdx.y * TILE_DIM + threadIdx.x; // transpose block offset
y = blockIdx.x * TILE_DIM + threadIdx.y;
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j];
}
*/
// coalesced transpose (OpenCL)
// Uses shared memory to achieve coalesing in both reads and writes
// Tile width == #banks causes shared memory bank conflicts.
__kernel void transposeCoalescedKernel(__global float *odata, __global const float *idata)
{
__local float tile[TILE_DIM][TILE_DIM];
int x = get_group_id(0) * get_num_groups(0) + get_local_id(0);
int y = get_group_id(1) * get_num_groups(1) + get_local_id(1);
int width = get_num_groups(0) * get_local_size(0);
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
tile[get_local_id(1)+j][get_local_id(0)] = idata[(y+j)*width + x];
barrier(CLK_LOCAL_MEM_FENCE);
x = get_group_id(1) * TILE_DIM + get_local_id(0);
y = get_group_id(0) * TILE_DIM + get_local_id(1);
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
odata[(y+j)*width + x] = tile[get_local_id(0)][get_local_id(1) + j];
}
// No bank-conflict transpose (OpenCL)
// Same as transposeCoalesced except the first tile dimension is padded
// to avoid shared memory bank conflicts.
__kernel void transposeNoBankConflictsKernel(__global float *odata, __global const float *idata)
{
__local float tile[TILE_DIM][TILE_DIM+1];
int x = get_group_id(0) * get_num_groups(0) + get_local_id(0);
int y = get_group_id(1) * get_num_groups(1) + get_local_id(1);
int width = get_num_groups(0) * get_local_size(0);
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
tile[get_local_id(1)+j][get_local_id(0)] = idata[(y+j)*width + x];
barrier(CLK_LOCAL_MEM_FENCE);
x = get_group_id(1) * TILE_DIM + get_local_id(0);
y = get_group_id(0) * TILE_DIM + get_local_id(1);
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
odata[(y+j)*width + x] = tile[get_local_id(0)][get_local_id(1) + j];
}