add bitonic sort, as comparison.
fix stringify.bat for Windows (need to fix Mac/Linux version too)
This commit is contained in:
171
test/OpenCL/BitonicSort/BitonicSort.cl
Normal file
171
test/OpenCL/BitonicSort/BitonicSort.cl
Normal file
@@ -0,0 +1,171 @@
|
||||
MSTRINGIFY(
|
||||
/*
|
||||
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation.
|
||||
* Any use, reproduction, disclosure, or distribution of this software
|
||||
* and related documentation without an express license agreement from
|
||||
* NVIDIA Corporation is strictly prohibited.
|
||||
*
|
||||
* Please refer to the applicable NVIDIA end user license agreement (EULA)
|
||||
* associated with this source code for terms and conditions that govern
|
||||
* your use of this NVIDIA software.
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
|
||||
inline void ComparatorPrivate(int2* keyA, int2* keyB, uint dir)
|
||||
{
|
||||
if((keyA[0].x > keyB[0].x) == dir)
|
||||
{
|
||||
int2 tmp = *keyA;
|
||||
*keyA = *keyB;
|
||||
*keyB = tmp;
|
||||
}
|
||||
}
|
||||
|
||||
inline void ComparatorLocal(__local int2* keyA, __local int2* keyB, uint dir)
|
||||
{
|
||||
if((keyA[0].x > keyB[0].x) == dir)
|
||||
{
|
||||
int2 tmp = *keyA;
|
||||
*keyA = *keyB;
|
||||
*keyB = tmp;
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Monolithic bitonic sort kernel for short arrays fitting into local memory
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
__kernel void kBitonicSortCellIdLocal(__global int2* pKey, uint arrayLength, uint dir GUID_ARG)
|
||||
{
|
||||
__local int2 l_key[1024U];
|
||||
int localSizeLimit = get_local_size(0) * 2;
|
||||
|
||||
//Offset to the beginning of subbatch and load data
|
||||
pKey += get_group_id(0) * localSizeLimit + get_local_id(0);
|
||||
l_key[get_local_id(0) + 0] = pKey[ 0];
|
||||
l_key[get_local_id(0) + (localSizeLimit / 2)] = pKey[(localSizeLimit / 2)];
|
||||
|
||||
for(uint size = 2; size < arrayLength; size <<= 1)
|
||||
{
|
||||
//Bitonic merge
|
||||
uint ddd = dir ^ ( (get_local_id(0) & (size / 2)) != 0 );
|
||||
for(uint stride = size / 2; stride > 0; stride >>= 1)
|
||||
{
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
|
||||
ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], ddd);
|
||||
}
|
||||
}
|
||||
|
||||
//ddd == dir for the last bitonic merge step
|
||||
{
|
||||
for(uint stride = arrayLength / 2; stride > 0; stride >>= 1)
|
||||
{
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
|
||||
ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], dir);
|
||||
}
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
pKey[ 0] = l_key[get_local_id(0) + 0];
|
||||
pKey[(localSizeLimit / 2)] = l_key[get_local_id(0) + (localSizeLimit / 2)];
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Bitonic sort kernel for large arrays (not fitting into local memory)
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//Bottom-level bitonic sort
|
||||
//Almost the same as bitonicSortLocal with the only exception
|
||||
//of even / odd subarrays (of LOCAL_SIZE_LIMIT points) being
|
||||
//sorted in opposite directions
|
||||
__kernel void kBitonicSortCellIdLocal1(__global int2* pKey GUID_ARG)
|
||||
{
|
||||
__local int2 l_key[1024U];
|
||||
uint localSizeLimit = get_local_size(0) * 2;
|
||||
|
||||
//Offset to the beginning of subarray and load data
|
||||
pKey += get_group_id(0) * localSizeLimit + get_local_id(0);
|
||||
l_key[get_local_id(0) + 0] = pKey[ 0];
|
||||
l_key[get_local_id(0) + (localSizeLimit / 2)] = pKey[(localSizeLimit / 2)];
|
||||
|
||||
uint comparatorI = get_global_id(0) & ((localSizeLimit / 2) - 1);
|
||||
|
||||
for(uint size = 2; size < localSizeLimit; size <<= 1)
|
||||
{
|
||||
//Bitonic merge
|
||||
uint ddd = (comparatorI & (size / 2)) != 0;
|
||||
for(uint stride = size / 2; stride > 0; stride >>= 1)
|
||||
{
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
|
||||
ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], ddd);
|
||||
}
|
||||
}
|
||||
|
||||
//Odd / even arrays of localSizeLimit elements
|
||||
//sorted in opposite directions
|
||||
{
|
||||
uint ddd = (get_group_id(0) & 1);
|
||||
for(uint stride = localSizeLimit / 2; stride > 0; stride >>= 1)
|
||||
{
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
|
||||
ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], ddd);
|
||||
}
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
pKey[ 0] = l_key[get_local_id(0) + 0];
|
||||
pKey[(localSizeLimit / 2)] = l_key[get_local_id(0) + (localSizeLimit / 2)];
|
||||
}
|
||||
|
||||
//Bitonic merge iteration for 'stride' >= LOCAL_SIZE_LIMIT
|
||||
__kernel void kBitonicSortCellIdMergeGlobal(__global int2* pKey, uint arrayLength, uint size, uint stride, uint dir GUID_ARG)
|
||||
{
|
||||
uint global_comparatorI = get_global_id(0);
|
||||
uint comparatorI = global_comparatorI & (arrayLength / 2 - 1);
|
||||
|
||||
//Bitonic merge
|
||||
uint ddd = dir ^ ( (comparatorI & (size / 2)) != 0 );
|
||||
uint pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1));
|
||||
|
||||
int2 keyA = pKey[pos + 0];
|
||||
int2 keyB = pKey[pos + stride];
|
||||
|
||||
ComparatorPrivate(&keyA, &keyB, ddd);
|
||||
|
||||
pKey[pos + 0] = keyA;
|
||||
pKey[pos + stride] = keyB;
|
||||
}
|
||||
|
||||
//Combined bitonic merge steps for
|
||||
//'size' > LOCAL_SIZE_LIMIT and 'stride' = [1 .. LOCAL_SIZE_LIMIT / 2]
|
||||
__kernel void kBitonicSortCellIdMergeLocal(__global int2* pKey, uint arrayLength, uint stride, uint size, uint dir GUID_ARG)
|
||||
{
|
||||
__local int2 l_key[1024U];
|
||||
int localSizeLimit = get_local_size(0) * 2;
|
||||
|
||||
pKey += get_group_id(0) * localSizeLimit + get_local_id(0);
|
||||
l_key[get_local_id(0) + 0] = pKey[ 0];
|
||||
l_key[get_local_id(0) + (localSizeLimit / 2)] = pKey[(localSizeLimit / 2)];
|
||||
|
||||
//Bitonic merge
|
||||
uint comparatorI = get_global_id(0) & ((arrayLength / 2) - 1);
|
||||
uint ddd = dir ^ ( (comparatorI & (size / 2)) != 0 );
|
||||
for(; stride > 0; stride >>= 1)
|
||||
{
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
|
||||
ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], ddd);
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
pKey[ 0] = l_key[get_local_id(0) + 0];
|
||||
pKey[(localSizeLimit / 2)] = l_key[get_local_id(0) + (localSizeLimit / 2)];
|
||||
}
|
||||
);
|
||||
Reference in New Issue
Block a user