Files
bullet3/test/OpenCL/BitonicSort/BitonicSort.cl
erwincoumans 92f0938af3 add bitonic sort, as comparison.
fix stringify.bat for Windows (need to fix Mac/Linux version too)
2013-04-30 11:40:09 -07:00

171 lines
6.2 KiB
Common Lisp

MSTRINGIFY(
/*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
*
* NVIDIA Corporation and its licensors retain all intellectual property and
* proprietary rights in and to this software and related documentation.
* Any use, reproduction, disclosure, or distribution of this software
* and related documentation without an express license agreement from
* NVIDIA Corporation is strictly prohibited.
*
* Please refer to the applicable NVIDIA end user license agreement (EULA)
* associated with this source code for terms and conditions that govern
* your use of this NVIDIA software.
*
*/
inline void ComparatorPrivate(int2* keyA, int2* keyB, uint dir)
{
if((keyA[0].x > keyB[0].x) == dir)
{
int2 tmp = *keyA;
*keyA = *keyB;
*keyB = tmp;
}
}
inline void ComparatorLocal(__local int2* keyA, __local int2* keyB, uint dir)
{
if((keyA[0].x > keyB[0].x) == dir)
{
int2 tmp = *keyA;
*keyA = *keyB;
*keyB = tmp;
}
}
////////////////////////////////////////////////////////////////////////////////
// Monolithic bitonic sort kernel for short arrays fitting into local memory
////////////////////////////////////////////////////////////////////////////////
__kernel void kBitonicSortCellIdLocal(__global int2* pKey, uint arrayLength, uint dir GUID_ARG)
{
__local int2 l_key[1024U];
int localSizeLimit = get_local_size(0) * 2;
//Offset to the beginning of subbatch and load data
pKey += get_group_id(0) * localSizeLimit + get_local_id(0);
l_key[get_local_id(0) + 0] = pKey[ 0];
l_key[get_local_id(0) + (localSizeLimit / 2)] = pKey[(localSizeLimit / 2)];
for(uint size = 2; size < arrayLength; size <<= 1)
{
//Bitonic merge
uint ddd = dir ^ ( (get_local_id(0) & (size / 2)) != 0 );
for(uint stride = size / 2; stride > 0; stride >>= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], ddd);
}
}
//ddd == dir for the last bitonic merge step
{
for(uint stride = arrayLength / 2; stride > 0; stride >>= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], dir);
}
}
barrier(CLK_LOCAL_MEM_FENCE);
pKey[ 0] = l_key[get_local_id(0) + 0];
pKey[(localSizeLimit / 2)] = l_key[get_local_id(0) + (localSizeLimit / 2)];
}
////////////////////////////////////////////////////////////////////////////////
// Bitonic sort kernel for large arrays (not fitting into local memory)
////////////////////////////////////////////////////////////////////////////////
//Bottom-level bitonic sort
//Almost the same as bitonicSortLocal with the only exception
//of even / odd subarrays (of LOCAL_SIZE_LIMIT points) being
//sorted in opposite directions
__kernel void kBitonicSortCellIdLocal1(__global int2* pKey GUID_ARG)
{
__local int2 l_key[1024U];
uint localSizeLimit = get_local_size(0) * 2;
//Offset to the beginning of subarray and load data
pKey += get_group_id(0) * localSizeLimit + get_local_id(0);
l_key[get_local_id(0) + 0] = pKey[ 0];
l_key[get_local_id(0) + (localSizeLimit / 2)] = pKey[(localSizeLimit / 2)];
uint comparatorI = get_global_id(0) & ((localSizeLimit / 2) - 1);
for(uint size = 2; size < localSizeLimit; size <<= 1)
{
//Bitonic merge
uint ddd = (comparatorI & (size / 2)) != 0;
for(uint stride = size / 2; stride > 0; stride >>= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], ddd);
}
}
//Odd / even arrays of localSizeLimit elements
//sorted in opposite directions
{
uint ddd = (get_group_id(0) & 1);
for(uint stride = localSizeLimit / 2; stride > 0; stride >>= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], ddd);
}
}
barrier(CLK_LOCAL_MEM_FENCE);
pKey[ 0] = l_key[get_local_id(0) + 0];
pKey[(localSizeLimit / 2)] = l_key[get_local_id(0) + (localSizeLimit / 2)];
}
//Bitonic merge iteration for 'stride' >= LOCAL_SIZE_LIMIT
__kernel void kBitonicSortCellIdMergeGlobal(__global int2* pKey, uint arrayLength, uint size, uint stride, uint dir GUID_ARG)
{
uint global_comparatorI = get_global_id(0);
uint comparatorI = global_comparatorI & (arrayLength / 2 - 1);
//Bitonic merge
uint ddd = dir ^ ( (comparatorI & (size / 2)) != 0 );
uint pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1));
int2 keyA = pKey[pos + 0];
int2 keyB = pKey[pos + stride];
ComparatorPrivate(&keyA, &keyB, ddd);
pKey[pos + 0] = keyA;
pKey[pos + stride] = keyB;
}
//Combined bitonic merge steps for
//'size' > LOCAL_SIZE_LIMIT and 'stride' = [1 .. LOCAL_SIZE_LIMIT / 2]
__kernel void kBitonicSortCellIdMergeLocal(__global int2* pKey, uint arrayLength, uint stride, uint size, uint dir GUID_ARG)
{
__local int2 l_key[1024U];
int localSizeLimit = get_local_size(0) * 2;
pKey += get_group_id(0) * localSizeLimit + get_local_id(0);
l_key[get_local_id(0) + 0] = pKey[ 0];
l_key[get_local_id(0) + (localSizeLimit / 2)] = pKey[(localSizeLimit / 2)];
//Bitonic merge
uint comparatorI = get_global_id(0) & ((arrayLength / 2) - 1);
uint ddd = dir ^ ( (comparatorI & (size / 2)) != 0 );
for(; stride > 0; stride >>= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], ddd);
}
barrier(CLK_LOCAL_MEM_FENCE);
pKey[ 0] = l_key[get_local_id(0) + 0];
pKey[(localSizeLimit / 2)] = l_key[get_local_id(0) + (localSizeLimit / 2)];
}
);