add bitonic sort, as comparison.

fix stringify.bat for Windows (need to fix Mac/Linux version too)
This commit is contained in:
erwincoumans
2013-04-30 11:40:09 -07:00
parent c5f488fe6d
commit 92f0938af3
24 changed files with 1857 additions and 177 deletions

View File

@@ -0,0 +1,171 @@
MSTRINGIFY(
/*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
*
* NVIDIA Corporation and its licensors retain all intellectual property and
* proprietary rights in and to this software and related documentation.
* Any use, reproduction, disclosure, or distribution of this software
* and related documentation without an express license agreement from
* NVIDIA Corporation is strictly prohibited.
*
* Please refer to the applicable NVIDIA end user license agreement (EULA)
* associated with this source code for terms and conditions that govern
* your use of this NVIDIA software.
*
*/
inline void ComparatorPrivate(int2* keyA, int2* keyB, uint dir)
{
if((keyA[0].x > keyB[0].x) == dir)
{
int2 tmp = *keyA;
*keyA = *keyB;
*keyB = tmp;
}
}
inline void ComparatorLocal(__local int2* keyA, __local int2* keyB, uint dir)
{
if((keyA[0].x > keyB[0].x) == dir)
{
int2 tmp = *keyA;
*keyA = *keyB;
*keyB = tmp;
}
}
////////////////////////////////////////////////////////////////////////////////
// Monolithic bitonic sort kernel for short arrays fitting into local memory
////////////////////////////////////////////////////////////////////////////////
__kernel void kBitonicSortCellIdLocal(__global int2* pKey, uint arrayLength, uint dir GUID_ARG)
{
__local int2 l_key[1024U];
int localSizeLimit = get_local_size(0) * 2;
//Offset to the beginning of subbatch and load data
pKey += get_group_id(0) * localSizeLimit + get_local_id(0);
l_key[get_local_id(0) + 0] = pKey[ 0];
l_key[get_local_id(0) + (localSizeLimit / 2)] = pKey[(localSizeLimit / 2)];
for(uint size = 2; size < arrayLength; size <<= 1)
{
//Bitonic merge
uint ddd = dir ^ ( (get_local_id(0) & (size / 2)) != 0 );
for(uint stride = size / 2; stride > 0; stride >>= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], ddd);
}
}
//ddd == dir for the last bitonic merge step
{
for(uint stride = arrayLength / 2; stride > 0; stride >>= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], dir);
}
}
barrier(CLK_LOCAL_MEM_FENCE);
pKey[ 0] = l_key[get_local_id(0) + 0];
pKey[(localSizeLimit / 2)] = l_key[get_local_id(0) + (localSizeLimit / 2)];
}
////////////////////////////////////////////////////////////////////////////////
// Bitonic sort kernel for large arrays (not fitting into local memory)
////////////////////////////////////////////////////////////////////////////////
//Bottom-level bitonic sort
//Almost the same as bitonicSortLocal with the only exception
//of even / odd subarrays (of LOCAL_SIZE_LIMIT points) being
//sorted in opposite directions
__kernel void kBitonicSortCellIdLocal1(__global int2* pKey GUID_ARG)
{
__local int2 l_key[1024U];
uint localSizeLimit = get_local_size(0) * 2;
//Offset to the beginning of subarray and load data
pKey += get_group_id(0) * localSizeLimit + get_local_id(0);
l_key[get_local_id(0) + 0] = pKey[ 0];
l_key[get_local_id(0) + (localSizeLimit / 2)] = pKey[(localSizeLimit / 2)];
uint comparatorI = get_global_id(0) & ((localSizeLimit / 2) - 1);
for(uint size = 2; size < localSizeLimit; size <<= 1)
{
//Bitonic merge
uint ddd = (comparatorI & (size / 2)) != 0;
for(uint stride = size / 2; stride > 0; stride >>= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], ddd);
}
}
//Odd / even arrays of localSizeLimit elements
//sorted in opposite directions
{
uint ddd = (get_group_id(0) & 1);
for(uint stride = localSizeLimit / 2; stride > 0; stride >>= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], ddd);
}
}
barrier(CLK_LOCAL_MEM_FENCE);
pKey[ 0] = l_key[get_local_id(0) + 0];
pKey[(localSizeLimit / 2)] = l_key[get_local_id(0) + (localSizeLimit / 2)];
}
//Bitonic merge iteration for 'stride' >= LOCAL_SIZE_LIMIT
__kernel void kBitonicSortCellIdMergeGlobal(__global int2* pKey, uint arrayLength, uint size, uint stride, uint dir GUID_ARG)
{
uint global_comparatorI = get_global_id(0);
uint comparatorI = global_comparatorI & (arrayLength / 2 - 1);
//Bitonic merge
uint ddd = dir ^ ( (comparatorI & (size / 2)) != 0 );
uint pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1));
int2 keyA = pKey[pos + 0];
int2 keyB = pKey[pos + stride];
ComparatorPrivate(&keyA, &keyB, ddd);
pKey[pos + 0] = keyA;
pKey[pos + stride] = keyB;
}
//Combined bitonic merge steps for
//'size' > LOCAL_SIZE_LIMIT and 'stride' = [1 .. LOCAL_SIZE_LIMIT / 2]
__kernel void kBitonicSortCellIdMergeLocal(__global int2* pKey, uint arrayLength, uint stride, uint size, uint dir GUID_ARG)
{
__local int2 l_key[1024U];
int localSizeLimit = get_local_size(0) * 2;
pKey += get_group_id(0) * localSizeLimit + get_local_id(0);
l_key[get_local_id(0) + 0] = pKey[ 0];
l_key[get_local_id(0) + (localSizeLimit / 2)] = pKey[(localSizeLimit / 2)];
//Bitonic merge
uint comparatorI = get_global_id(0) & ((arrayLength / 2) - 1);
uint ddd = dir ^ ( (comparatorI & (size / 2)) != 0 );
for(; stride > 0; stride >>= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], ddd);
}
barrier(CLK_LOCAL_MEM_FENCE);
pKey[ 0] = l_key[get_local_id(0) + 0];
pKey[(localSizeLimit / 2)] = l_key[get_local_id(0) + (localSizeLimit / 2)];
}
);

View File

@@ -0,0 +1,83 @@
#include "b3BitonicSort.h"
#include "Bullet3Common/b3Scalar.h"
//Note: logically shared with BitonicSort OpenCL code!
// TODO : get parameter from OpenCL and pass it to kernel (needed for platforms other than NVIDIA)
void bitonicSortNv(cl_mem pKey, int arrayLength, b3BitonicSortInfo& info)
{
if(arrayLength < 2)
return;
//Only power-of-two array lengths are supported so far
info.dir = (info.dir != 0);
cl_int ciErrNum;
size_t localWorkSize, globalWorkSize;
if(arrayLength <= info.localSizeLimit)
{
b3Assert( ( arrayLength) % info.localSizeLimit == 0);
//Launch bitonicSortLocal
ciErrNum = clSetKernelArg(info.bitonicSortLocal, 0, sizeof(cl_mem), (void *)&pKey);
ciErrNum |= clSetKernelArg(info.bitonicSortLocal, 1, sizeof(cl_uint), (void *)&arrayLength);
ciErrNum |= clSetKernelArg(info.bitonicSortLocal, 2, sizeof(cl_uint), (void *)&info.dir);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
localWorkSize = info.localSizeLimit / 2;
globalWorkSize = arrayLength / 2;
ciErrNum = clEnqueueNDRangeKernel(info.m_cqCommandQue, info.bitonicSortLocal, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
}
else
{
//Launch bitonicSortLocal1
ciErrNum = clSetKernelArg(info.bitonicSortLocal1, 0, sizeof(cl_mem), (void *)&pKey);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
localWorkSize = info.localSizeLimit / 2;
globalWorkSize = arrayLength / 2;
ciErrNum = clEnqueueNDRangeKernel(info.m_cqCommandQue, info.bitonicSortLocal1, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
for(unsigned int size = 2 * info.localSizeLimit; size <= arrayLength; size <<= 1)
{
for(unsigned stride = size / 2; stride > 0; stride >>= 1)
{
if(stride >= info.localSizeLimit)
{
//Launch bitonicMergeGlobal
ciErrNum = clSetKernelArg(info.bitonicSortMergeGlobal, 0, sizeof(cl_mem), (void *)&pKey);
ciErrNum |= clSetKernelArg(info.bitonicSortMergeGlobal, 1, sizeof(cl_uint), (void *)&arrayLength);
ciErrNum |= clSetKernelArg(info.bitonicSortMergeGlobal, 2, sizeof(cl_uint), (void *)&size);
ciErrNum |= clSetKernelArg(info.bitonicSortMergeGlobal, 3, sizeof(cl_uint), (void *)&stride);
ciErrNum |= clSetKernelArg(info.bitonicSortMergeGlobal, 4, sizeof(cl_uint), (void *)&info.dir);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
localWorkSize = info.localSizeLimit / 4;
globalWorkSize = arrayLength / 2;
ciErrNum = clEnqueueNDRangeKernel(info.m_cqCommandQue, info.bitonicSortMergeGlobal, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
}
else
{
//Launch bitonicMergeLocal
ciErrNum = clSetKernelArg(info.bitonicSortMergeLocal, 0, sizeof(cl_mem), (void *)&pKey);
ciErrNum |= clSetKernelArg(info.bitonicSortMergeLocal, 1, sizeof(cl_uint), (void *)&arrayLength);
ciErrNum |= clSetKernelArg(info.bitonicSortMergeLocal, 2, sizeof(cl_uint), (void *)&stride);
ciErrNum |= clSetKernelArg(info.bitonicSortMergeLocal, 3, sizeof(cl_uint), (void *)&size);
ciErrNum |= clSetKernelArg(info.bitonicSortMergeLocal, 4, sizeof(cl_uint), (void *)&info.dir);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
localWorkSize = info.localSizeLimit / 2;
globalWorkSize = arrayLength / 2;
ciErrNum = clEnqueueNDRangeKernel(info.m_cqCommandQue, info.bitonicSortMergeLocal, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
break;
}
}
}
}
}

View File

@@ -0,0 +1,30 @@
#ifndef B3_BITONIC_SORT_H
#define B3_BITONIC_SORT_H
#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h"
struct b3BitonicSortInfo
{
cl_command_queue m_cqCommandQue;
cl_kernel bitonicSortLocal;
cl_kernel bitonicSortLocal1;
cl_kernel bitonicSortMergeGlobal;
cl_kernel bitonicSortMergeLocal;
unsigned int dir;
unsigned int localSizeLimit;
b3BitonicSortInfo()
{
bitonicSortLocal=0;
bitonicSortLocal1=0;
bitonicSortMergeGlobal=0;
bitonicSortMergeLocal=0;
dir = 1;
localSizeLimit = 1024U;
}
};
void bitonicSortNv(cl_mem pKey, int arrayLength, b3BitonicSortInfo& info);
#endif //B3_BITONIC_SORT_H

View File

@@ -0,0 +1,192 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2011 Advanced Micro Devices, Inc. http://bulletphysics.org
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
///original author: Erwin Coumans
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
#include "Bullet3Common/b3Int2.h"
#include "Bullet3Common/b3Quickprof.h"
#include "b3BitonicSort.h"
#include <stdio.h>
int numSuccess=0;
int numFailed=0;
cl_context g_cxMainContext;
cl_command_queue g_cqCommandQue;
#define MSTRINGIFY(A) #A
static const char* kernelSource=
#include "BitonicSort.cl"
static bool compareFunc(const b3Int2& p, const b3Int2& q)
{
return (p.x < q.x) || ((p.x == q.x) && ((p.y < q.y)));
}
int main(int argc, char* argv[])
{
int ciErrNum = 0;
b3Clock clock;
cl_device_type deviceType = CL_DEVICE_TYPE_ALL;
const char* vendorSDK = b3OpenCLUtils::getSdkVendorName();
printf("This program was compiled using the %s OpenCL SDK\n",vendorSDK);
int numPlatforms = b3OpenCLUtils::getNumPlatforms();
printf("Num Platforms = %d\n", numPlatforms);
for (int i=0;i<numPlatforms;i++)
{
cl_platform_id platform = b3OpenCLUtils::getPlatform(i);
b3OpenCLPlatformInfo platformInfo;
b3OpenCLUtils::getPlatformInfo(platform,&platformInfo);
printf("--------------------------------\n");
printf("Platform info for platform nr %d:\n",i);
printf(" CL_PLATFORM_VENDOR: \t\t\t%s\n",platformInfo.m_platformVendor);
printf(" CL_PLATFORM_NAME: \t\t\t%s\n",platformInfo.m_platformName);
printf(" CL_PLATFORM_VERSION: \t\t\t%s\n",platformInfo.m_platformVersion);
cl_context context = b3OpenCLUtils::createContextFromPlatform(platform,deviceType,&ciErrNum);
int numDevices = b3OpenCLUtils::getNumDevices(context);
printf("Num Devices = %d\n", numDevices);
for (int j=0;j<numDevices;j++)
{
cl_device_id dev = b3OpenCLUtils::getDevice(context,j);
b3OpenCLDeviceInfo devInfo;
b3OpenCLUtils::getDeviceInfo(dev,&devInfo);
printf("m_deviceName = %s\n",devInfo.m_deviceName);
//b3OpenCLUtils::printDeviceInfo(dev);
g_cqCommandQue = clCreateCommandQueue(context, dev, 0, &ciErrNum);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
b3BitonicSortInfo info;
info.bitonicSortLocal = b3OpenCLUtils::compileCLKernelFromString(context,dev,kernelSource,"kBitonicSortCellIdLocal",&ciErrNum,0,"");
oclCHECKERROR(ciErrNum, CL_SUCCESS);
info.bitonicSortLocal1 = b3OpenCLUtils::compileCLKernelFromString(context,dev,kernelSource,"kBitonicSortCellIdLocal1",&ciErrNum,0,"");
oclCHECKERROR(ciErrNum, CL_SUCCESS);
info.bitonicSortMergeGlobal = b3OpenCLUtils::compileCLKernelFromString(context,dev,kernelSource,"kBitonicSortCellIdMergeGlobal",&ciErrNum,0,"");
oclCHECKERROR(ciErrNum, CL_SUCCESS);
info.bitonicSortMergeLocal = b3OpenCLUtils::compileCLKernelFromString(context,dev,kernelSource,"kBitonicSortCellIdMergeLocal",&ciErrNum,0,"");
oclCHECKERROR(ciErrNum, CL_SUCCESS);
info.m_cqCommandQue = g_cqCommandQue;
b3OpenCLArray<b3Int2> keyValuesGPU(context,g_cqCommandQue);
b3AlignedObjectArray<b3Int2> keyValuesCPU;
b3AlignedObjectArray<b3Int2> keyValuesGold;
int numValues = 8*1024*1024;//2048;//1024;
keyValuesCPU.resize(numValues);
for (int i=0;i<numValues;i++)
{
b3Int2 v;
v.x = numValues+1-i;
v.y = i*i;
keyValuesCPU[i] = v;
}
keyValuesGPU.copyFromHost(keyValuesCPU);
keyValuesGPU.copyToHost(keyValuesGold);
keyValuesGold.quickSort(compareFunc);
unsigned int batch = 1;
unsigned int arrayLength = keyValuesGPU.size();
for (int i=0;i<10;i++)
{
keyValuesGPU.copyFromHost(keyValuesCPU);
clFinish(info.m_cqCommandQue);
unsigned long pre=clock.getTimeMilliseconds();
bitonicSortNv(keyValuesGPU.getBufferCL(), arrayLength, info);
clFinish(info.m_cqCommandQue);
unsigned long post=clock.getTimeMilliseconds();
printf("GPU sort took %d ms\n",post-pre);
}
keyValuesGPU.copyToHost(keyValuesCPU);
int success=1;
for (int i=0;i<numValues;i++)
{
if (keyValuesCPU[i].x != keyValuesGold[i].x)
success = 0;
if (keyValuesCPU[i].y != keyValuesGold[i].y)
success = 0;
}
if (success)
{
printf("Correct\n");
numSuccess++;
} else
{
printf("Sort Failed\n");
numFailed++;
}
}
clReleaseContext(context);
}
///Easier method to initialize OpenCL using createContextFromType for a GPU
deviceType = CL_DEVICE_TYPE_GPU;
void* glCtx=0;
void* glDC = 0;
printf("Initialize OpenCL using b3OpenCLUtils::createContextFromType for CL_DEVICE_TYPE_GPU\n");
g_cxMainContext = b3OpenCLUtils::createContextFromType(deviceType, &ciErrNum, glCtx, glDC);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
if (g_cxMainContext)
{
int numDev = b3OpenCLUtils::getNumDevices(g_cxMainContext);
for (int i=0;i<numDev;i++)
{
cl_device_id device;
device = b3OpenCLUtils::getDevice(g_cxMainContext,i);
b3OpenCLDeviceInfo clInfo;
b3OpenCLUtils::getDeviceInfo(device,&clInfo);
b3OpenCLUtils::printDeviceInfo(device);
// create a command-queue
g_cqCommandQue = clCreateCommandQueue(g_cxMainContext, device, 0, &ciErrNum);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
//normally you would create and execute kernels using this command queue
clReleaseCommandQueue(g_cqCommandQue);
}
clReleaseContext(g_cxMainContext);
}
else {
printf("No OpenCL capable GPU found!");
}
printf("numSuccess=%d\n",numSuccess);
printf("numFailed=%d\n",numFailed);
printf("press <Enter>\n");
getchar();
return 0;
}

View File

@@ -0,0 +1,36 @@
function createProject(vendor)
hasCL = findOpenCL(vendor)
if (hasCL) then
project ("Test_BitonicSort_" .. vendor)
initOpenCL(vendor)
language "C++"
kind "ConsoleApp"
targetdir "../../../bin"
includedirs {"../../../src"}
files {
"main.cpp",
"b3BitonicSort.cpp",
"../../../src/Bullet3Common/b3AlignedAllocator.cpp",
"../../../src/Bullet3Common/b3AlignedAllocator.h",
"../../../src/Bullet3Common/b3Quickprof.cpp",
"../../../src/Bullet3Common/b3Quickprof.h",
"../../../src/Bullet3OpenCL/Initialize/b3OpenCLUtils.cpp",
"../../../src/Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
}
end
end
createProject("Apple")
createProject("AMD")
createProject("Intel")
createProject("NVIDIA")

View File

@@ -0,0 +1,378 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#include <stdio.h>
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h"
#include "Bullet3Common/b3CommandLineArgs.h"
#include "Bullet3Common/b3MinMax.h"
int g_nPassed = 0;
int g_nFailed = 0;
bool g_testFailed = 0;
#define TEST_INIT g_testFailed = 0;
#define TEST_ASSERT(x) if( !(x) ){g_testFailed = 1;}
#define TEST_REPORT(testName) printf("[%s] %s\n",(g_testFailed)?"X":"O", testName); if(g_testFailed) g_nFailed++; else g_nPassed++;
#define NEXTMULTIPLEOF(num, alignment) (((num)/(alignment) + (((num)%(alignment)==0)?0:1))*(alignment))
cl_context g_context=0;
cl_device_id g_device=0;
cl_command_queue g_queue =0;
const char* g_deviceName = 0;
void initCL(int preferredDeviceIndex, int preferredPlatformIndex)
{
void* glCtx=0;
void* glDC = 0;
int ciErrNum = 0;
//bound search and radix sort only work on GPU right now (assume 32 or 64 width workgroup without barriers)
cl_device_type deviceType = CL_DEVICE_TYPE_ALL;
g_context = b3OpenCLUtils::createContextFromType(deviceType, &ciErrNum, 0,0,preferredDeviceIndex, preferredPlatformIndex);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
int numDev = b3OpenCLUtils::getNumDevices(g_context);
if (numDev>0)
{
b3OpenCLDeviceInfo info;
g_device= b3OpenCLUtils::getDevice(g_context,0);
g_queue = clCreateCommandQueue(g_context, g_device, 0, &ciErrNum);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
b3OpenCLUtils::printDeviceInfo(g_device);
b3OpenCLUtils::getDeviceInfo(g_device,&info);
g_deviceName = info.m_deviceName;
}
}
void exitCL()
{
clReleaseCommandQueue(g_queue);
clReleaseContext(g_context);
}
inline void fillIntTest()
{
TEST_INIT;
b3FillCL* fillCL = new b3FillCL(g_context,g_device,g_queue);
int maxSize=1024*256;
b3OpenCLArray<int> intBuffer(g_context,g_queue,maxSize);
intBuffer.resize(maxSize);
#define NUM_TESTS 7
int dx = maxSize/NUM_TESTS;
for (int iter=0;iter<NUM_TESTS;iter++)
{
int size = b3Min( 11+dx*iter, maxSize );
int value = 2;
int offset=0;
fillCL->execute(intBuffer,value,size,offset);
b3AlignedObjectArray<int> hostBuf2;
hostBuf2.resize(size);
fillCL->executeHost(hostBuf2,value,size,offset);
b3AlignedObjectArray<int> hostBuf;
intBuffer.copyToHost(hostBuf);
for(int i=0; i<size; i++)
{
TEST_ASSERT( hostBuf[i] == hostBuf2[i] );
TEST_ASSERT( hostBuf[i] == hostBuf2[i] );
}
}
delete fillCL;
TEST_REPORT( "fillIntTest" );
}
__inline
void seedRandom(int seed)
{
srand( seed );
}
template<typename T>
__inline
T getRandom(const T& minV, const T& maxV)
{
float r = (rand()%10000)/10000.f;
T range = maxV - minV;
return (T)(minV + r*range);
}
struct b3SortDataCompare
{
inline bool operator()(const b3SortData& first, const b3SortData& second) const
{
return (first.m_key < second.m_key) || (first.m_key==second.m_key && first.m_value < second.m_value);
}
};
void boundSearchTest( )
{
TEST_INIT;
int maxSize = 1024*256;
int bucketSize = 256;
b3OpenCLArray<b3SortData> srcCL(g_context,g_queue,maxSize);
b3OpenCLArray<unsigned int> upperCL(g_context,g_queue,maxSize);
b3OpenCLArray<unsigned int> lowerCL(g_context,g_queue,maxSize);
b3AlignedObjectArray<b3SortData> srcHost;
b3AlignedObjectArray<unsigned int> upperHost;
b3AlignedObjectArray<unsigned int> lowerHost;
b3AlignedObjectArray<unsigned int> upperHostCompare;
b3AlignedObjectArray<unsigned int> lowerHostCompare;
b3BoundSearchCL* search = new b3BoundSearchCL(g_context,g_device,g_queue, maxSize);
int dx = maxSize/NUM_TESTS;
for(int iter=0; iter<NUM_TESTS; iter++)
{
int size = b3Min( 128+dx*iter, maxSize );
upperHost.resize(bucketSize);
lowerHost.resize(bucketSize);
upperHostCompare.resize(bucketSize);
lowerHostCompare.resize(bucketSize);
srcHost.resize(size);
for(int i=0; i<size; i++)
{
b3SortData v;
// v.m_key = i<2? 0 : 5;
v.m_key = getRandom(0,bucketSize);
v.m_value = i;
srcHost.at(i) = v;
}
srcHost.quickSort(b3SortDataCompare());
srcCL.copyFromHost(srcHost);
{
for(int i=0; i<bucketSize; i++)
{
lowerHost[i] = -1;
lowerHostCompare[i] = -1;
upperHost[i] = -1;
upperHostCompare[i] = -1;
}
upperCL.copyFromHost(upperHost);
lowerCL.copyFromHost(lowerHost);
}
search->execute(srcCL,size,upperCL,bucketSize,b3BoundSearchCL::BOUND_UPPER);
search->execute(srcCL,size,lowerCL,bucketSize,b3BoundSearchCL::BOUND_LOWER);
search->executeHost(srcHost,size,upperHostCompare,bucketSize,b3BoundSearchCL::BOUND_UPPER);
search->executeHost(srcHost,size,lowerHostCompare,bucketSize,b3BoundSearchCL::BOUND_LOWER);
lowerCL.copyToHost(lowerHost);
upperCL.copyToHost(upperHost);
for(int i=0; i<bucketSize; i++)
{
TEST_ASSERT(upperHostCompare[i] == upperHost[i]);
TEST_ASSERT(lowerHostCompare[i] == lowerHost[i]);
}
/*
for(int i=1; i<bucketSize; i++)
{
int lhi_1 = lowerHost[i-1];
int lhi = lowerHost[i];
for(int j=lhi_1; j<lhi; j++)
//for(int j=lowerHost[i-1]; j<lowerHost[i]; j++)
{
TEST_ASSERT( srcHost[j].m_key < i );
}
}
for(int i=0; i<bucketSize; i++)
{
int jMin = (i==0)?0:upperHost[i-1];
for(int j=jMin; j<upperHost[i]; j++)
{
TEST_ASSERT( srcHost[j].m_key <= i );
}
}
*/
for(int i=0; i<bucketSize; i++)
{
int lhi = lowerHost[i];
int uhi = upperHost[i];
for(int j=lhi; j<uhi; j++)
{
if ( srcHost[j].m_key != i )
{
printf("error %d != %d\n",srcHost[j].m_key,i);
}
TEST_ASSERT( srcHost[j].m_key == i );
}
}
}
delete search;
TEST_REPORT( "boundSearchTest" );
}
void prefixScanTest()
{
TEST_INIT;
int maxSize = 1024*256;
b3AlignedObjectArray<unsigned int> buf0Host;
b3AlignedObjectArray<unsigned int> buf1Host;
b3OpenCLArray<unsigned int> buf2CL(g_context,g_queue,maxSize);
b3OpenCLArray<unsigned int> buf3CL(g_context,g_queue,maxSize);
b3PrefixScanCL* scan = new b3PrefixScanCL(g_context,g_device,g_queue,maxSize);
int dx = maxSize/NUM_TESTS;
for(int iter=0; iter<NUM_TESTS; iter++)
{
int size = b3Min( 128+dx*iter, maxSize );
buf0Host.resize(size);
buf1Host.resize(size);
for(int i=0; i<size; i++)
buf0Host[i] = 1;
buf2CL.copyFromHost( buf0Host);
unsigned int sumHost, sumGPU;
scan->executeHost(buf0Host, buf1Host, size, &sumHost );
scan->execute( buf2CL, buf3CL, size, &sumGPU );
buf3CL.copyToHost(buf0Host);
TEST_ASSERT( sumHost == sumGPU );
for(int i=0; i<size; i++)
TEST_ASSERT( buf1Host[i] == buf0Host[i] );
}
delete scan;
TEST_REPORT( "scanTest" );
}
bool radixSortTest()
{
TEST_INIT;
int maxSize = 1024*256;
b3AlignedObjectArray<b3SortData> buf0Host;
buf0Host.resize(maxSize);
b3AlignedObjectArray<b3SortData> buf1Host;
buf1Host.resize(maxSize );
b3OpenCLArray<b3SortData> buf2CL(g_context,g_queue,maxSize);
b3RadixSort32CL* sort = new b3RadixSort32CL(g_context,g_device,g_queue,maxSize);
int dx = maxSize/NUM_TESTS;
for(int iter=0; iter<NUM_TESTS; iter++)
{
int size = b3Min( 128+dx*iter, maxSize-512 );
size = NEXTMULTIPLEOF( size, 512 );//not necessary
buf0Host.resize(size);
for(int i=0; i<size; i++)
{
b3SortData v;
v.m_key = getRandom(0,0xff);
v.m_value = i;
buf0Host[i] = v;
}
buf2CL.copyFromHost( buf0Host);
sort->executeHost( buf0Host);
sort->execute(buf2CL);
buf2CL.copyToHost(buf1Host);
for(int i=0; i<size; i++)
{
TEST_ASSERT( buf0Host[i].m_value == buf1Host[i].m_value && buf0Host[i].m_key == buf1Host[i].m_key );
}
}
delete sort;
TEST_REPORT( "radixSort" );
return g_testFailed;
}
int main(int argc, char** argv)
{
int preferredDeviceIndex = -1;
int preferredPlatformIndex = -1;
b3CommandLineArgs args(argc, argv);
args.GetCmdLineArgument("deviceId", preferredDeviceIndex);
args.GetCmdLineArgument("platformId", preferredPlatformIndex);
initCL(preferredDeviceIndex,preferredPlatformIndex);
fillIntTest();
boundSearchTest();
prefixScanTest();
radixSortTest();
exitCL();
printf("%d tests passed, %d tests failed\n",g_nPassed, g_nFailed);
printf("End, press <enter>\n");
getchar();
}

View File

@@ -0,0 +1,41 @@
function createProject(vendor)
hasCL = findOpenCL(vendor)
if (hasCL) then
project ("Test_OpenCL_Primitives_" .. vendor)
initOpenCL(vendor)
language "C++"
kind "ConsoleApp"
targetdir "../../../bin"
includedirs {".","../../../src"}
files {
"main.cpp",
"../../../src/Bullet3OpenCL/Initialize/b3OpenCLInclude.h",
"../../../src/Bullet3OpenCL/Initialize/b3OpenCLUtils.cpp",
"../../../src/Bullet3OpenCL/Initialize/b3OpenCLUtils.h",
"../../../src/Bullet3OpenCL/ParallelPrimitives/b3FillCL.cpp",
"../../../src/Bullet3OpenCL/ParallelPrimitives/b3FillCL.h",
"../../../src/Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.cpp",
"../../../src/Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.h",
"../../../src/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.cpp",
"../../../src/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h",
"../../../src/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp",
"../../../src/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h",
"../../../src/Bullet3Common/b3AlignedAllocator.cpp",
"../../../src/Bullet3Common/b3AlignedAllocator.h",
"../../../src/Bullet3Common/b3AlignedObjectArray.h",
}
end
end
createProject("AMD")
createProject("Intel")
createProject("NVIDIA")
createProject("Apple")

View File

@@ -0,0 +1,712 @@
/******************************************************************************
* Copyright 2010 Duane Merrill
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may ob3ain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*
*
*
* AUTHORS' REQUEST:
*
* If you use|reference|benchmark this code, please cite our Technical
* Report (http://www.cs.virginia.edu/~dgm4d/papers/RadixSortTR.pdf):
*
* @TechReport{ Merrill:Sorting:2010,
* author = "Duane Merrill and Andrew Grimshaw",
* title = "Revisiting Sorting for GPGPU Stream Architectures",
* year = "2010",
* institution = "University of Virginia, Department of Computer Science",
* address = "Charlottesville, VA, USA",
* number = "CS2010-03"
* }
*
* For more information, see our Google Code project site:
* http://code.google.com/p/back40computing/
*
* Thanks!
******************************************************************************/
/******************************************************************************
* Simple test driver program for *large-problem* radix sorting.
*
* Useful for demonstrating how to integrate radix sorting into
* your application
******************************************************************************/
/******************************************************************************
* Converted from CUDA to OpenCL/DirectCompute by Erwin Coumans
******************************************************************************/
#ifdef _WIN32
#pragma warning (disable:4996)
#endif
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <float.h>
#include <algorithm>
#include <string>
//#include <iostream>
#include <sstream>
/**********************
*
*/
#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "Bullet3Common/b3Quickprof.h"
cl_context g_cxMainContext;
cl_device_id g_device;
cl_command_queue g_cqCommandQueue;
/***********************
*
*/
bool g_verbose;
///Preferred OpenCL device/platform. When < 0 then no preference is used.
///Note that b3OpenCLUtils might still use the preference of using a platform vendor that matches the SDK vendor used to build the application.
///Preferred device/platform take priority over this platform-vendor match
int gPreferredDeviceId = -1;
int gPreferredPlatformId = -1;
/******************************************************************************
* Routines
******************************************************************************/
/**
* Keys-only sorting. Uses the GPU to sort the specified vector of elements for the given
* number of iterations, displaying runtime information.
*
* @param[in] num_elements
* Size in elements of the vector to sort
* @param[in] h_keys
* Vector of keys to sort
* @param[in] iterations
* Number of times to invoke the GPU sorting primitive
* @param[in] cfg
* Config
*/
template <typename K>
void TimedSort(
unsigned int num_elements,
K *h_keys,
unsigned int iterations)
{
printf("Keys only, %d iterations, %d elements\n", iterations, num_elements);
int max_elements = num_elements;
b3AlignedObjectArray<unsigned int> hostData;
hostData.resize(num_elements);
for (int i=0;i<num_elements;i++)
{
hostData[i] = h_keys[i];
}
b3RadixSort32CL sorter(g_cxMainContext,g_device,g_cqCommandQueue);
b3OpenCLArray<unsigned int> gpuData(g_cxMainContext,g_cqCommandQueue);
gpuData.copyFromHost(hostData);
//sorter.executeHost(gpuData);
sorter.execute(gpuData);
b3AlignedObjectArray<unsigned int> hostDataSorted;
gpuData.copyToHost(hostDataSorted);
clFinish(g_cqCommandQueue);
{
//printf("Key-values, %d iterations, %d elements", iterations, num_elements);
// Create sorting enactor
// Perform the timed number of sorting iterations
double elapsed = 0;
float duration = 0;
b3Clock watch;
//warm-start
gpuData.copyFromHost(hostData);
clFinish(g_cqCommandQueue);
sorter.execute(gpuData);
watch.reset();
for (int i = 0; i < iterations; i++)
{
// Move a fresh copy of the problem into device storage
gpuData.copyFromHost(hostData);
clFinish(g_cqCommandQueue);
// Start GPU timing record
double startMs = watch.getTimeMicroseconds()/1e3;
// Call the sorting API routine
sorter.execute(gpuData);
clFinish(g_cqCommandQueue);
double stopMs = watch.getTimeMicroseconds()/1e3;
duration = stopMs - startMs;
// End GPU timing record
elapsed += (double) duration;
printf("duration = %f\n", duration);
}
// Display timing information
double avg_runtime = elapsed / iterations;
// double throughput = ((double) num_elements) / avg_runtime / 1000.0 / 1000.0;
// printf(", %f GPU ms, %f x10^9 elts/sec\n", avg_runtime, throughput);
double throughput = ((double) num_elements) / avg_runtime / 1000.0 ;
printf(", %f GPU ms, %f x10^6 elts/sec\n", avg_runtime, throughput);
gpuData.copyToHost(hostData);
for (int i=0;i<num_elements;i++)
{
h_keys[i] = hostData[i];
}
}
}
/**
* Key-value sorting. Uses the GPU to sort the specified vector of elements for the given
* number of iterations, displaying runtime information.
*
* @param[in] num_elements
* Size in elements of the vector to sort
* @param[in] h_keys
* Vector of keys to sort
* @param[in,out] h_values
* Vector of values to sort
* @param[in] iterations
* Number of times to invoke the GPU sorting primitive
* @param[in] cfg
* Config
*/
template <typename K, typename V>
void TimedSort(
unsigned int num_elements,
K *h_keys,
V *h_values,
unsigned int iterations)
{
printf("Key-values, %d iterations, %d elements\n", iterations, num_elements);
int max_elements = num_elements;
b3AlignedObjectArray<b3SortData> hostData;
hostData.resize(num_elements);
for (int i=0;i<num_elements;i++)
{
hostData[i].m_key = h_keys[i];
hostData[i].m_value = h_values[i];
}
b3RadixSort32CL sorter(g_cxMainContext,g_device,g_cqCommandQueue);
b3OpenCLArray<b3SortData> gpuData(g_cxMainContext,g_cqCommandQueue);
gpuData.copyFromHost(hostData);
//sorter.executeHost(gpuData);
sorter.execute(gpuData);
b3AlignedObjectArray<b3SortData> hostDataSorted;
gpuData.copyToHost(hostDataSorted);
#if 0
for (int i=0;i<num_elements;i++)
{
printf("hostData[%d].m_key = %d\n",i, hostDataSorted[i].m_key);
printf("hostData[%d].m_value = %d\n",i,hostDataSorted[i].m_value);
}
#endif
clFinish(g_cqCommandQueue);
{
//printf("Key-values, %d iterations, %d elements", iterations, num_elements);
// Create sorting enactor
// Perform the timed number of sorting iterations
double elapsed = 0;
float duration = 0;
b3Clock watch;
//warm-start
gpuData.copyFromHost(hostData);
sorter.execute(gpuData);
clFinish(g_cqCommandQueue);
watch.reset();
for (int i = 0; i < iterations; i++)
{
// Move a fresh copy of the problem into device storage
gpuData.copyFromHost(hostData);
clFinish(g_cqCommandQueue);
// Start GPU timing record
double startMs = watch.getTimeMicroseconds()/1e3;
// Call the sorting API routine
sorter.execute(gpuData);
clFinish(g_cqCommandQueue);
double stopMs = watch.getTimeMicroseconds()/1e3;
duration = stopMs - startMs;
// End GPU timing record
elapsed += (double) duration;
printf("duration = %f\n", duration);
}
// Display timing information
double avg_runtime = elapsed / iterations;
// double throughput = ((double) num_elements) / avg_runtime / 1000.0 / 1000.0;
// printf(", %f GPU ms, %f x10^9 elts/sec\n", avg_runtime, throughput);
double throughput = ((double) num_elements) / avg_runtime / 1000.0 ;
printf(", %f GPU ms, %f x10^6 elts/sec\n", avg_runtime, throughput);
gpuData.copyToHost(hostData);
for (int i=0;i<num_elements;i++)
{
h_keys[i] = hostData[i].m_key;
h_values[i] = hostData[i].m_value;
}
}
}
/**
* Generates random 32-bit keys.
*
* We always take the second-order byte from rand() because the higher-order
* bits returned by rand() are commonly considered more uniformly distributed
* than the lower-order bits.
*
* We can decrease the entropy level of keys by adopting the technique
* of Thearling and Smith in which keys are computed from the bitwise AND of
* multiple random samples:
*
* entropy_reduction | Effectively-unique bits per key
* -----------------------------------------------------
* -1 | 0
* 0 | 32
* 1 | 25.95
* 2 | 17.41
* 3 | 10.78
* 4 | 6.42
* ... | ...
*
*/
template <typename K>
void RandomBits(K &key, int entropy_reduction = 0, int lower_key_bits = sizeof(K) * 8)
{
const unsigned int NUM_UCHARS = (sizeof(K) + sizeof(unsigned char) - 1) / sizeof(unsigned char);
unsigned char key_bits[NUM_UCHARS];
do {
for (int j = 0; j < NUM_UCHARS; j++) {
unsigned char quarterword = 0xff;
for (int i = 0; i <= entropy_reduction; i++) {
quarterword &= (rand() >> 7);
}
key_bits[j] = quarterword;
}
if (lower_key_bits < sizeof(K) * 8) {
unsigned long long base = 0;
memcpy(&base, key_bits, sizeof(K));
base &= (1 << lower_key_bits) - 1;
memcpy(key_bits, &base, sizeof(K));
}
memcpy(&key, key_bits, sizeof(K));
} while (key != key); // avoids NaNs when generating random floating point numbers
}
/******************************************************************************
* Templated routines for printing keys/values to the console
******************************************************************************/
template<typename T>
void PrintValue(T val) {
printf("%d", val);
}
template<>
void PrintValue<float>(float val) {
printf("%f", val);
}
template<>
void PrintValue<double>(double val) {
printf("%f", val);
}
template<>
void PrintValue<unsigned char>(unsigned char val) {
printf("%u", val);
}
template<>
void PrintValue<unsigned short>(unsigned short val) {
printf("%u", val);
}
template<>
void PrintValue<unsigned int>(unsigned int val) {
printf("%u", val);
}
template<>
void PrintValue<long>(long val) {
printf("%ld", val);
}
template<>
void PrintValue<unsigned long>(unsigned long val) {
printf("%lu", val);
}
template<>
void PrintValue<long long>(long long val) {
printf("%lld", val);
}
template<>
void PrintValue<unsigned long long>(unsigned long long val) {
printf("%llu", val);
}
/**
* Compares the equivalence of two arrays
*/
template <typename T, typename SizeT>
int CompareResults(T* computed, T* reference, SizeT len, bool verbose = true)
{
printf("\n");
for (SizeT i = 0; i < len; i++) {
if (computed[i] != reference[i]) {
printf("INCORRECT: [%lu]: ", (unsigned long) i);
PrintValue<T>(computed[i]);
printf(" != ");
PrintValue<T>(reference[i]);
if (verbose) {
printf("\nresult[...");
for (size_t j = (i >= 5) ? i - 5 : 0; (j < i + 5) && (j < len); j++) {
PrintValue<T>(computed[j]);
printf(", ");
}
printf("...]");
printf("\nreference[...");
for (size_t j = (i >= 5) ? i - 5 : 0; (j < i + 5) && (j < len); j++) {
PrintValue<T>(reference[j]);
printf(", ");
}
printf("...]");
}
return 1;
}
}
printf("CORRECT\n");
return 0;
}
/**
* Creates an example sorting problem whose keys is a vector of the specified
* number of K elements, values of V elements, and then dispatches the problem
* to the GPU for the given number of iterations, displaying runtime information.
*
* @param[in] iterations
* Number of times to invoke the GPU sorting primitive
* @param[in] num_elements
* Size in elements of the vector to sort
* @param[in] cfg
* Config
*/
template<typename K, typename V>
void TestSort(
unsigned int iterations,
int num_elements,
bool keys_only)
{
// Allocate the sorting problem on the host and fill the keys with random bytes
K *h_keys = NULL;
K *h_reference_keys = NULL;
V *h_values = NULL;
h_keys = (K*) malloc(num_elements * sizeof(K));
h_reference_keys = (K*) malloc(num_elements * sizeof(K));
if (!keys_only) h_values = (V*) malloc(num_elements * sizeof(V));
// Use random bits
for (unsigned int i = 0; i < num_elements; ++i) {
RandomBits<K>(h_keys[i], 0);
//h_keys[i] = num_elements-i;
//h_keys[i] = 0xffffffffu-i;
if (!keys_only)
h_values[i] = h_keys[i];//0xffffffffu-i;
h_reference_keys[i] = h_keys[i];
}
// Run the timing test
if (keys_only) {
TimedSort<K>(num_elements, h_keys, iterations);
} else {
TimedSort<K, V>(num_elements, h_keys, h_values, iterations);
}
// cudaThreadSynchronize();
// Display sorted key data
if (g_verbose) {
printf("\n\nKeys:\n");
for (int i = 0; i < num_elements; i++) {
PrintValue<K>(h_keys[i]);
printf(", ");
}
printf("\n\n");
}
// Verify solution
std::sort(h_reference_keys, h_reference_keys + num_elements);
CompareResults<K>(h_keys, h_reference_keys, num_elements, true);
printf("\n");
fflush(stdout);
// Free our allocated host memory
if (h_keys != NULL) free(h_keys);
if (h_values != NULL) free(h_values);
}
/**
* Displays the commandline usage for this tool
*/
void Usage()
{
printf("\ntest_large_problem_sorting [--device=<device index>] [--v] [--i=<num-iterations>] [--n=<num-elements>] [--key-values] [--deviceId=<int>] [--platformId=<int>]\n");
printf("\n");
printf("\t--v\tDisplays sorted results to the console.\n");
printf("\n");
printf("\t--i\tPerforms the sorting operation <num-iterations> times\n");
printf("\t\t\ton the device. Re-copies original input each time. Default = 1\n");
printf("\n");
printf("\t--n\tThe number of elements to comprise the sample problem\n");
printf("\t\t\tDefault = 512\n");
printf("\n");
printf("\t--key-values\tSpecifies that keys are accommodated by value pairings\n");
printf("\n");
}
/******************************************************************************
* Command-line parsing
******************************************************************************/
#include <map>
#include <algorithm>
#include <string>
class b3CommandLineArgs
{
protected:
std::map<std::string, std::string> pairs;
public:
// Constructor
b3CommandLineArgs(int argc, char **argv)
{
using namespace std;
for (int i = 1; i < argc; i++)
{
string arg = argv[i];
if ((arg[0] != '-') || (arg[1] != '-')) {
continue;
}
string::size_type pos;
string key, val;
if ((pos = arg.find( '=')) == string::npos) {
key = string(arg, 2, arg.length() - 2);
val = "";
} else {
key = string(arg, 2, pos - 2);
val = string(arg, pos + 1, arg.length() - 1);
}
pairs[key] = val;
}
}
bool CheckCmdLineFlag(const char* arg_name)
{
using namespace std;
map<string, string>::iterator itr;
if ((itr = pairs.find(arg_name)) != pairs.end()) {
return true;
}
return false;
}
template <typename T>
void GetCmdLineArgument(const char *arg_name, T &val);
int ParsedArgc()
{
return pairs.size();
}
};
template <typename T>
void b3CommandLineArgs::GetCmdLineArgument(const char *arg_name, T &val)
{
using namespace std;
map<string, string>::iterator itr;
if ((itr = pairs.find(arg_name)) != pairs.end()) {
istringstream strstream(itr->second);
strstream >> val;
}
}
template <>
void b3CommandLineArgs::GetCmdLineArgument<char*>(const char* arg_name, char* &val)
{
using namespace std;
map<string, string>::iterator itr;
if ((itr = pairs.find(arg_name)) != pairs.end()) {
string s = itr->second;
val = (char*) malloc(sizeof(char) * (s.length() + 1));
strcpy(val, s.c_str());
} else {
val = NULL;
}
}
/******************************************************************************
* Main
******************************************************************************/
extern bool gDebugSkipLoadingBinary;
int main( int argc, char** argv)
{
//gDebugSkipLoadingBinary = true;
cl_int ciErrNum;
b3CommandLineArgs args(argc,argv);
args.GetCmdLineArgument("deviceId", gPreferredDeviceId);
args.GetCmdLineArgument("platformId", gPreferredPlatformId);
printf("Initialize OpenCL using b3OpenCLUtils_createContextFromType\n");
cl_platform_id platformId;
// g_cxMainContext = b3OpenCLUtils_createContextFromType(CL_DEVICE_TYPE_ALL, &ciErrNum, 0, 0,gPreferredDeviceId,gPreferredPlatformId,&platformId);
g_cxMainContext = b3OpenCLUtils_createContextFromType(CL_DEVICE_TYPE_GPU, &ciErrNum, 0, 0,gPreferredDeviceId,gPreferredPlatformId,&platformId);
//g_cxMainContext = b3OpenCLUtils_createContextFromType(CL_DEVICE_TYPE_CPU, &ciErrNum, 0, 0,gPreferredDeviceId,gPreferredPlatformId,&platformId);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
int numDev = b3OpenCLUtils_getNumDevices(g_cxMainContext);
if (!numDev)
{
printf("error: no OpenCL devices\n");
exit(0);
}
int result;
int devId = 0;
g_device = b3OpenCLUtils_getDevice(g_cxMainContext,devId);
b3OpenCLUtils_printDeviceInfo(g_device);
// create a command-queue
g_cqCommandQueue = clCreateCommandQueue(g_cxMainContext, g_device, 0, &ciErrNum);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
//srand(time(NULL));
srand(0); // presently deterministic
unsigned int num_elements = 8*1024*1024;//4*1024*1024;//4*1024*1024;//257;//8*524288;//2048;//512;//524288;
unsigned int iterations = 10;
bool keys_only = true;
//
// Check command line arguments
//
if (args.CheckCmdLineFlag("help"))
{
Usage();
return 0;
}
args.GetCmdLineArgument("i", iterations);
args.GetCmdLineArgument("n", num_elements);
keys_only = !args.CheckCmdLineFlag("key-values");
g_verbose = args.CheckCmdLineFlag("v");
TestSort<unsigned int, unsigned int>(
iterations,
num_elements,
keys_only);
}

View File

@@ -0,0 +1,40 @@
function createProject(vendor)
hasCL = findOpenCL(vendor)
if (hasCL) then
project ("Test_OpenCL_RadixSortBenchmark_" .. vendor)
initOpenCL(vendor)
language "C++"
kind "ConsoleApp"
targetdir "../../../bin"
includedirs {"..","../../../src"}
-- links {
-- ("OpenCL_lib_parallel_primitives_host_" .. vendor)
-- }
files {
"main.cpp",
"../../../src/Bullet3OpenCL/Initialize/b3OpenCLUtils.cpp",
"../../../src/Bullet3OpenCL/Initialize/b3OpenCLUtils.h",
"../../../src/Bullet3OpenCL/ParallelPrimitives/b3FillCL.cpp",
"../../../src/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.cpp",
"../../../src/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp",
"../../../src/Bullet3Common/b3AlignedAllocator.cpp",
"../../../src/Bullet3Common/b3AlignedAllocator.h",
"../../../src/Bullet3Common/b3AlignedObjectArray.h",
"../../../src/Bullet3Common/b3Quickprof.cpp",
"../../../src/Bullet3Common/b3Quickprof.h",
}
end
end
createProject("AMD")
createProject("Intel")
createProject("NVIDIA")
createProject("Apple")