#define TILE_DIM 32
#define BLOCK_ROWS 8

/*
// simple copy kernel (CUDA)
// Used as reference case representing best effective bandwidth.
__global__ void copy(float *odata, const float *idata)
{
  int x = blockIdx.x * TILE_DIM + threadIdx.x;
  int y = blockIdx.y * TILE_DIM + threadIdx.y;
  int width = gridDim.x * TILE_DIM;

  for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
    odata[(y+j)*width + x] = idata[(y+j)*width + x];
}
*/

// simple copy kernel (OpenCL)
// Used as reference case representing best effective bandwidth.
__kernel void copyKernel(__global float *odata, __global const float *idata)
{
  // Work-groups are TILE_DIM x BLOCK_ROWS work-items; each group covers a
  // TILE_DIM x TILE_DIM tile, so each work-item processes TILE_DIM/BLOCK_ROWS
  // elements.
  int x = get_group_id(0) * TILE_DIM + get_local_id(0);
  int y = get_group_id(1) * TILE_DIM + get_local_id(1);
  int width = get_num_groups(0) * TILE_DIM;

  for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
    odata[(y+j)*width + x] = idata[(y+j)*width + x];
}

/*
// copy kernel using shared memory (CUDA)
// Also used as reference case, demonstrating effect of using shared memory.
__global__ void copySharedMem(float *odata, const float *idata)
{
  __shared__ float tile[TILE_DIM * TILE_DIM];

  int x = blockIdx.x * TILE_DIM + threadIdx.x;
  int y = blockIdx.y * TILE_DIM + threadIdx.y;
  int width = gridDim.x * TILE_DIM;

  for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
    tile[(threadIdx.y+j)*TILE_DIM + threadIdx.x] = idata[(y+j)*width + x];

  __syncthreads();

  for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
    odata[(y+j)*width + x] = tile[(threadIdx.y+j)*TILE_DIM + threadIdx.x];
}
*/

// copy kernel using local memory (OpenCL)
// Also used as reference case, demonstrating the effect of using local memory.
__kernel void copySharedMemKernel(__global float *odata, __global const float *idata)
{
  __local float tile[TILE_DIM * TILE_DIM];

  int x = get_group_id(0) * TILE_DIM + get_local_id(0);
  int y = get_group_id(1) * TILE_DIM + get_local_id(1);
  int width = get_num_groups(0) * TILE_DIM;

  for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
    tile[(get_local_id(1)+j)*TILE_DIM + get_local_id(0)] = idata[(y+j)*width + x];

  barrier(CLK_LOCAL_MEM_FENCE);

  for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
    odata[(y+j)*width + x] = tile[(get_local_id(1)+j)*TILE_DIM + get_local_id(0)];
}

/*
// naive transpose (CUDA)
// Simplest transpose; doesn't use shared memory.
// Global memory reads are coalesced but writes are not.
__global__ void transposeNaive(float *odata, const float *idata)
{
  int x = blockIdx.x * TILE_DIM + threadIdx.x;
  int y = blockIdx.y * TILE_DIM + threadIdx.y;
  int width = gridDim.x * TILE_DIM;

  for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
    odata[x*width + (y+j)] = idata[(y+j)*width + x];
}
*/

// naive transpose (OpenCL)
// Simplest transpose; doesn't use local memory.
// Global memory reads are coalesced but writes are not:
// adjacent work-items write elements that are 'width' floats apart.
__kernel void transposeNaiveKernel(__global float *odata, __global const float *idata)
{
  int x = get_group_id(0) * TILE_DIM + get_local_id(0);
  int y = get_group_id(1) * TILE_DIM + get_local_id(1);
  int width = get_num_groups(0) * TILE_DIM;

  for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
    odata[x*width + (y+j)] = idata[(y+j)*width + x];
}
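/*
// Host-side sketch (an addition, not one of the benchmark kernels): a minimal
// CPU reference transpose for validating kernel output. Assumes row-major
// storage; idata is height x width, odata is width x height.
void transposeReference(float *odata, const float *idata,
                        int width, int height)
{
  for (int y = 0; y < height; ++y)
    for (int x = 0; x < width; ++x)
      odata[x * height + y] = idata[y * width + x];
}
*/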
/*
// coalesced transpose (CUDA)
// Uses shared memory to achieve coalescing in both reads and writes.
// Tile width == #banks causes shared memory bank conflicts.
__global__ void transposeCoalesced(float *odata, const float *idata)
{
  __shared__ float tile[TILE_DIM][TILE_DIM];

  int x = blockIdx.x * TILE_DIM + threadIdx.x;
  int y = blockIdx.y * TILE_DIM + threadIdx.y;
  int width = gridDim.x * TILE_DIM;

  for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
    tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x];

  __syncthreads();

  x = blockIdx.y * TILE_DIM + threadIdx.x;  // transpose block offset
  y = blockIdx.x * TILE_DIM + threadIdx.y;

  for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
    odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j];
}
*/

// coalesced transpose (OpenCL)
// Uses local memory to achieve coalescing in both reads and writes.
// Tile width == #banks causes local memory bank conflicts.
__kernel void transposeCoalescedKernel(__global float *odata, __global const float *idata)
{
  __local float tile[TILE_DIM][TILE_DIM];

  int x = get_group_id(0) * TILE_DIM + get_local_id(0);
  int y = get_group_id(1) * TILE_DIM + get_local_id(1);
  int width = get_num_groups(0) * TILE_DIM;

  for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
    tile[get_local_id(1)+j][get_local_id(0)] = idata[(y+j)*width + x];

  barrier(CLK_LOCAL_MEM_FENCE);

  x = get_group_id(1) * TILE_DIM + get_local_id(0);  // transpose block offset
  y = get_group_id(0) * TILE_DIM + get_local_id(1);

  for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
    odata[(y+j)*width + x] = tile[get_local_id(0)][get_local_id(1) + j];
}

// No bank-conflict transpose (OpenCL)
// Same as transposeCoalescedKernel except the first tile dimension is padded
// by one element to avoid local memory bank conflicts.
__kernel void transposeNoBankConflictsKernel(__global float *odata, __global const float *idata)
{
  __local float tile[TILE_DIM][TILE_DIM+1];

  int x = get_group_id(0) * TILE_DIM + get_local_id(0);
  int y = get_group_id(1) * TILE_DIM + get_local_id(1);
  int width = get_num_groups(0) * TILE_DIM;

  for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
    tile[get_local_id(1)+j][get_local_id(0)] = idata[(y+j)*width + x];

  barrier(CLK_LOCAL_MEM_FENCE);

  x = get_group_id(1) * TILE_DIM + get_local_id(0);  // transpose block offset
  y = get_group_id(0) * TILE_DIM + get_local_id(1);

  for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
    odata[(y+j)*width + x] = tile[get_local_id(0)][get_local_id(1) + j];
}
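/*
// Host-side launch sketch (an addition, not part of the kernel source): one
// plausible NDRange configuration for the kernels above, assuming a square
// N x N matrix with N a multiple of TILE_DIM. Each work-group is
// TILE_DIM x BLOCK_ROWS work-items and covers a TILE_DIM x TILE_DIM tile, so
// the global y size is scaled by BLOCK_ROWS/TILE_DIM. 'queue', 'program',
// 'd_odata', and 'd_idata' are assumed to be created elsewhere; error
// handling is reduced to returning the OpenCL status code.
#include <CL/cl.h>

cl_int launchTranspose(cl_command_queue queue, cl_program program,
                       cl_mem d_odata, cl_mem d_idata, size_t N)
{
  cl_int err;
  cl_kernel kernel =
      clCreateKernel(program, "transposeNoBankConflictsKernel", &err);
  if (err != CL_SUCCESS) return err;

  clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_odata);
  clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_idata);

  // N/TILE_DIM groups in each dimension; each work-item handles
  // TILE_DIM/BLOCK_ROWS elements, hence the reduced global y size.
  size_t local[2]  = { TILE_DIM, BLOCK_ROWS };
  size_t global[2] = { N, N * BLOCK_ROWS / TILE_DIM };

  err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global, local,
                               0, NULL, NULL);
  clReleaseKernel(kernel);
  return err;
}
*/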