Files
bullet3/test/OpenCL/BitonicSort/b3BitonicSort.cpp

90 lines
4.2 KiB
C++

#include "b3BitonicSort.h"
#include "Bullet3Common/b3Scalar.h"
//Note: logically shared with BitonicSort OpenCL code!
// TODO : get parameter from OpenCL and pass it to kernel (needed for platforms other than NVIDIA)
void bitonicSortNv(cl_mem pKey, int arrayLength, b3BitonicSortInfo& info)
{
if(arrayLength < 2)
return;
//Only power-of-two array lengths are supported so far
info.dir = (info.dir != 0);
cl_int ciErrNum;
size_t localWorkSize, globalWorkSize;
int res = -1;
cl_int clerr=clGetKernelWorkGroupInfo (info.bitonicSortLocal1,info.dev,CL_KERNEL_WORK_GROUP_SIZE,sizeof(size_t),&res,NULL);
if((clerr==CL_SUCCESS)&&(res>0))
info.localSizeLimit=res;
if(arrayLength <= info.localSizeLimit)
{
b3Assert( ( arrayLength) % info.localSizeLimit == 0);
//Launch bitonicSortLocal
ciErrNum = clSetKernelArg(info.bitonicSortLocal, 0, sizeof(cl_mem), (void *)&pKey);
ciErrNum |= clSetKernelArg(info.bitonicSortLocal, 1, sizeof(cl_uint), (void *)&arrayLength);
ciErrNum |= clSetKernelArg(info.bitonicSortLocal, 2, sizeof(cl_uint), (void *)&info.dir);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
localWorkSize = info.localSizeLimit / 2;
globalWorkSize = arrayLength / 2;
ciErrNum = clEnqueueNDRangeKernel(info.m_cqCommandQue, info.bitonicSortLocal, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
}
else
{
//Launch bitonicSortLocal1
ciErrNum = clSetKernelArg(info.bitonicSortLocal1, 0, sizeof(cl_mem), (void *)&pKey);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
localWorkSize = info.localSizeLimit / 2;
globalWorkSize = arrayLength / 2;
ciErrNum = clEnqueueNDRangeKernel(info.m_cqCommandQue, info.bitonicSortLocal1, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
for(unsigned int size = 2 * info.localSizeLimit; size <= arrayLength; size <<= 1)
{
for(unsigned stride = size / 2; stride > 0; stride >>= 1)
{
if(stride >= info.localSizeLimit)
{
//Launch bitonicMergeGlobal
ciErrNum = clSetKernelArg(info.bitonicSortMergeGlobal, 0, sizeof(cl_mem), (void *)&pKey);
ciErrNum |= clSetKernelArg(info.bitonicSortMergeGlobal, 1, sizeof(cl_uint), (void *)&arrayLength);
ciErrNum |= clSetKernelArg(info.bitonicSortMergeGlobal, 2, sizeof(cl_uint), (void *)&size);
ciErrNum |= clSetKernelArg(info.bitonicSortMergeGlobal, 3, sizeof(cl_uint), (void *)&stride);
ciErrNum |= clSetKernelArg(info.bitonicSortMergeGlobal, 4, sizeof(cl_uint), (void *)&info.dir);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
localWorkSize = info.localSizeLimit / 4;
globalWorkSize = arrayLength / 2;
ciErrNum = clEnqueueNDRangeKernel(info.m_cqCommandQue, info.bitonicSortMergeGlobal, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
}
else
{
//Launch bitonicMergeLocal
ciErrNum = clSetKernelArg(info.bitonicSortMergeLocal, 0, sizeof(cl_mem), (void *)&pKey);
ciErrNum |= clSetKernelArg(info.bitonicSortMergeLocal, 1, sizeof(cl_uint), (void *)&arrayLength);
ciErrNum |= clSetKernelArg(info.bitonicSortMergeLocal, 2, sizeof(cl_uint), (void *)&stride);
ciErrNum |= clSetKernelArg(info.bitonicSortMergeLocal, 3, sizeof(cl_uint), (void *)&size);
ciErrNum |= clSetKernelArg(info.bitonicSortMergeLocal, 4, sizeof(cl_uint), (void *)&info.dir);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
localWorkSize = info.localSizeLimit / 2;
globalWorkSize = arrayLength / 2;
ciErrNum = clEnqueueNDRangeKernel(info.m_cqCommandQue, info.bitonicSortMergeLocal, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
break;
}
}
}
}
}