90 lines
4.2 KiB
C++
90 lines
4.2 KiB
C++
|
|
#include "b3BitonicSort.h"
|
|
#include "Bullet3Common/b3Scalar.h"
|
|
|
|
|
|
//Note: logically shared with BitonicSort OpenCL code!
|
|
// TODO : get parameter from OpenCL and pass it to kernel (needed for platforms other than NVIDIA)
|
|
|
|
void bitonicSortNv(cl_mem pKey, int arrayLength, b3BitonicSortInfo& info)
|
|
{
|
|
|
|
if(arrayLength < 2)
|
|
return;
|
|
//Only power-of-two array lengths are supported so far
|
|
info.dir = (info.dir != 0);
|
|
cl_int ciErrNum;
|
|
size_t localWorkSize, globalWorkSize;
|
|
int res = -1;
|
|
|
|
|
|
cl_int clerr=clGetKernelWorkGroupInfo (info.bitonicSortLocal1,info.dev,CL_KERNEL_WORK_GROUP_SIZE,sizeof(size_t),&res,NULL);
|
|
if((clerr==CL_SUCCESS)&&(res>0))
|
|
info.localSizeLimit=res;
|
|
|
|
if(arrayLength <= info.localSizeLimit)
|
|
{
|
|
b3Assert( ( arrayLength) % info.localSizeLimit == 0);
|
|
//Launch bitonicSortLocal
|
|
ciErrNum = clSetKernelArg(info.bitonicSortLocal, 0, sizeof(cl_mem), (void *)&pKey);
|
|
ciErrNum |= clSetKernelArg(info.bitonicSortLocal, 1, sizeof(cl_uint), (void *)&arrayLength);
|
|
ciErrNum |= clSetKernelArg(info.bitonicSortLocal, 2, sizeof(cl_uint), (void *)&info.dir);
|
|
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
|
|
|
localWorkSize = info.localSizeLimit / 2;
|
|
globalWorkSize = arrayLength / 2;
|
|
ciErrNum = clEnqueueNDRangeKernel(info.m_cqCommandQue, info.bitonicSortLocal, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
|
|
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
|
}
|
|
else
|
|
{
|
|
//Launch bitonicSortLocal1
|
|
ciErrNum = clSetKernelArg(info.bitonicSortLocal1, 0, sizeof(cl_mem), (void *)&pKey);
|
|
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
|
|
|
localWorkSize = info.localSizeLimit / 2;
|
|
globalWorkSize = arrayLength / 2;
|
|
ciErrNum = clEnqueueNDRangeKernel(info.m_cqCommandQue, info.bitonicSortLocal1, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
|
|
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
|
|
|
for(unsigned int size = 2 * info.localSizeLimit; size <= arrayLength; size <<= 1)
|
|
{
|
|
for(unsigned stride = size / 2; stride > 0; stride >>= 1)
|
|
{
|
|
if(stride >= info.localSizeLimit)
|
|
{
|
|
//Launch bitonicMergeGlobal
|
|
ciErrNum = clSetKernelArg(info.bitonicSortMergeGlobal, 0, sizeof(cl_mem), (void *)&pKey);
|
|
ciErrNum |= clSetKernelArg(info.bitonicSortMergeGlobal, 1, sizeof(cl_uint), (void *)&arrayLength);
|
|
ciErrNum |= clSetKernelArg(info.bitonicSortMergeGlobal, 2, sizeof(cl_uint), (void *)&size);
|
|
ciErrNum |= clSetKernelArg(info.bitonicSortMergeGlobal, 3, sizeof(cl_uint), (void *)&stride);
|
|
ciErrNum |= clSetKernelArg(info.bitonicSortMergeGlobal, 4, sizeof(cl_uint), (void *)&info.dir);
|
|
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
|
|
|
localWorkSize = info.localSizeLimit / 4;
|
|
globalWorkSize = arrayLength / 2;
|
|
|
|
ciErrNum = clEnqueueNDRangeKernel(info.m_cqCommandQue, info.bitonicSortMergeGlobal, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
|
|
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
|
}
|
|
else
|
|
{
|
|
//Launch bitonicMergeLocal
|
|
ciErrNum = clSetKernelArg(info.bitonicSortMergeLocal, 0, sizeof(cl_mem), (void *)&pKey);
|
|
ciErrNum |= clSetKernelArg(info.bitonicSortMergeLocal, 1, sizeof(cl_uint), (void *)&arrayLength);
|
|
ciErrNum |= clSetKernelArg(info.bitonicSortMergeLocal, 2, sizeof(cl_uint), (void *)&stride);
|
|
ciErrNum |= clSetKernelArg(info.bitonicSortMergeLocal, 3, sizeof(cl_uint), (void *)&size);
|
|
ciErrNum |= clSetKernelArg(info.bitonicSortMergeLocal, 4, sizeof(cl_uint), (void *)&info.dir);
|
|
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
|
|
|
localWorkSize = info.localSizeLimit / 2;
|
|
globalWorkSize = arrayLength / 2;
|
|
|
|
ciErrNum = clEnqueueNDRangeKernel(info.m_cqCommandQue, info.bitonicSortMergeLocal, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
|
|
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
} |