add bitonic sort, as comparison.

fix stringify.bat for Windows (need to fix Mac/Linux version too)
2013-04-30 11:40:09 -07:00
parent c5f488fe6d
commit 92f0938af3
24 changed files with 1857 additions and 177 deletions
--- a/test/OpenCL/BitonicSort/BitonicSort.cl
+++ b/test/OpenCL/BitonicSort/BitonicSort.cl
@@ -0,0 +1,171 @@
+MSTRINGIFY(
+/*
+ * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual property and 
+ * proprietary rights in and to this software and related documentation. 
+ * Any use, reproduction, disclosure, or distribution of this software 
+ * and related documentation without an express license agreement from
+ * NVIDIA Corporation is strictly prohibited.
+ *
+ * Please refer to the applicable NVIDIA end user license agreement (EULA) 
+ * associated with this source code for terms and conditions that govern 
+ * your use of this NVIDIA software.
+ * 
+ */
+
+
+
+inline void ComparatorPrivate(int2* keyA, int2* keyB, uint dir)
+{
+    //Compare-exchange on a pair held in private memory: the elements trade
+    //places when the ordering of their .x keys (A above B) equals 'dir'
+    const uint aAboveB = ((*keyA).x > (*keyB).x);
+    if(aAboveB != dir)
+        return;
+    const int2 held = *keyB;
+    *keyB = *keyA;
+    *keyA = held;
+}
+
+inline void ComparatorLocal(__local int2* keyA, __local int2* keyB, uint dir)
+{
+    //Compare-exchange on a pair held in __local memory: both elements are read
+    //once and written back swapped when (A.x > B.x) equals 'dir'
+    const int2 a = *keyA;
+    const int2 b = *keyB;
+    if((uint)(a.x > b.x) == dir)
+    {
+        *keyA = b;
+        *keyB = a;
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Monolithic bitonic sort kernel for short arrays fitting into local memory
+////////////////////////////////////////////////////////////////////////////////
+__kernel void kBitonicSortCellIdLocal(__global int2* pKey, uint arrayLength, uint dir GUID_ARG) //in-place sort of int2 elements by .x; GUID_ARG is a macro supplied from outside this file
+{
+    __local int2 l_key[1024U]; //each work-item stages two elements, so the work-group size must not exceed 512
+    int localSizeLimit = get_local_size(0) * 2; //elements processed per work-group
+
+    //Offset to the beginning of subbatch and load data
+    pKey += get_group_id(0) * localSizeLimit + get_local_id(0);
+    l_key[get_local_id(0) +                    0] = pKey[                   0];
+    l_key[get_local_id(0) + (localSizeLimit / 2)] = pKey[(localSizeLimit / 2)];
+
+    for(uint size = 2; size < arrayLength; size <<= 1) //merge sizes 2, 4, ... below arrayLength
+    {
+        //Bitonic merge
+        uint ddd = dir ^ ( (get_local_id(0) & (size / 2)) != 0 ); //direction flips for every other group of size / 2 work-items
+        for(uint stride = size / 2; stride > 0; stride >>= 1)
+        {
+            barrier(CLK_LOCAL_MEM_FENCE); //previous pass (or the initial load) must be complete before l_key is read
+            uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1)); //first element of this work-item's pair; the partner is at pos + stride
+            ComparatorLocal(&l_key[pos +      0], &l_key[pos + stride], ddd);
+        }
+    }
+
+    //ddd == dir for the last bitonic merge step
+    {
+        for(uint stride = arrayLength / 2; stride > 0; stride >>= 1)
+        {
+            barrier(CLK_LOCAL_MEM_FENCE);
+            uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
+            ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], dir);
+        }
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE); //all compare-exchanges must be finished before the write-back
+    pKey[                   0] = l_key[get_local_id(0) +                    0];
+    pKey[(localSizeLimit / 2)] = l_key[get_local_id(0) + (localSizeLimit / 2)];
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Bitonic sort kernel for large arrays (not fitting into local memory)
+////////////////////////////////////////////////////////////////////////////////
+//Bottom-level bitonic sort
+//Almost the same as bitonicSortLocal with the only exception
+//of even / odd subarrays (of LOCAL_SIZE_LIMIT points) being
+//sorted in opposite directions
+__kernel void kBitonicSortCellIdLocal1(__global int2* pKey GUID_ARG) //in-place, keyed on .x; GUID_ARG is a macro supplied from outside this file
+{
+    __local int2 l_key[1024U]; //each work-item stages two elements, so the work-group size must not exceed 512
+    uint localSizeLimit = get_local_size(0) * 2; //elements processed per work-group
+
+    //Offset to the beginning of subarray and load data
+    pKey += get_group_id(0) * localSizeLimit + get_local_id(0);
+    l_key[get_local_id(0) +                    0] = pKey[                   0];
+    l_key[get_local_id(0) + (localSizeLimit / 2)] = pKey[(localSizeLimit / 2)];
+
+    uint comparatorI = get_global_id(0) & ((localSizeLimit / 2) - 1); //global id masked with (work-group size - 1)
+
+    for(uint size = 2; size < localSizeLimit; size <<= 1)
+    {
+        //Bitonic merge
+        uint ddd = (comparatorI & (size / 2)) != 0; //direction flips for every other group of size / 2 comparators
+        for(uint stride = size / 2; stride > 0; stride >>= 1)
+        {
+            barrier(CLK_LOCAL_MEM_FENCE); //previous pass (or the initial load) must be complete before l_key is read
+            uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1)); //first element of this work-item's pair; the partner is at pos + stride
+            ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], ddd);
+        }
+    }
+
+    //Odd / even arrays of localSizeLimit elements
+    //sorted in opposite directions
+    {
+        uint ddd = (get_group_id(0) & 1); //1 for odd work-groups, 0 for even ones
+        for(uint stride = localSizeLimit / 2; stride > 0; stride >>= 1)
+        {
+            barrier(CLK_LOCAL_MEM_FENCE);
+            uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
+            ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], ddd);
+        }
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE); //all compare-exchanges must be finished before the write-back
+    pKey[                   0] = l_key[get_local_id(0) +                    0];
+    pKey[(localSizeLimit / 2)] = l_key[get_local_id(0) + (localSizeLimit / 2)];
+}
+
+//Bitonic merge iteration for 'stride' >= LOCAL_SIZE_LIMIT
+//Each work-item performs a single compare-exchange directly on global memory.
+__kernel void kBitonicSortCellIdMergeGlobal(__global int2* pKey, uint arrayLength, uint size, uint stride, uint dir GUID_ARG)
+{
+    const uint gid = get_global_id(0);
+
+    //Comparator index masked with (arrayLength / 2 - 1)
+    const uint maskedComparator = gid & (arrayLength / 2 - 1);
+
+    //Direction flips for every other group of size / 2 comparators
+    const uint swapWhen = dir ^ ( (maskedComparator & (size / 2)) != 0 );
+
+    //Indices of the two elements this work-item compares
+    const uint lo = 2 * gid - (gid & (stride - 1));
+    const uint hi = lo + stride;
+
+    int2 first  = pKey[lo];
+    int2 second = pKey[hi];
+
+    ComparatorPrivate(&first, &second, swapWhen);
+
+    pKey[lo] = first;
+    pKey[hi] = second;
+}
+
+//Combined bitonic merge steps for
+//'size' > LOCAL_SIZE_LIMIT and 'stride' = [1 .. LOCAL_SIZE_LIMIT / 2]
+__kernel void kBitonicSortCellIdMergeLocal(__global int2* pKey, uint arrayLength, uint stride, uint size, uint dir GUID_ARG) //note: 'stride' precedes 'size' here, unlike kBitonicSortCellIdMergeGlobal
+{
+    __local int2 l_key[1024U]; //each work-item stages two elements, so the work-group size must not exceed 512
+    int localSizeLimit = get_local_size(0) * 2; //elements processed per work-group
+
+    pKey += get_group_id(0) * localSizeLimit + get_local_id(0); //this work-item's first element inside its work-group's subarray
+    l_key[get_local_id(0) +                    0] = pKey[                   0];
+    l_key[get_local_id(0) + (localSizeLimit / 2)] = pKey[(localSizeLimit / 2)];
+
+    //Bitonic merge
+    uint comparatorI = get_global_id(0) & ((arrayLength / 2) - 1);
+    uint         ddd = dir ^ ( (comparatorI & (size / 2)) != 0 ); //direction flips for every other group of size / 2 comparators
+    for(; stride > 0; stride >>= 1) //runs all remaining strides, from the one passed in down to 1
+    {
+        barrier(CLK_LOCAL_MEM_FENCE); //previous pass (or the initial load) must be complete before l_key is read
+        uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1)); //first element of this work-item's pair; the partner is at pos + stride
+        ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], ddd);
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE); //all compare-exchanges must be finished before the write-back
+    pKey[                   0] = l_key[get_local_id(0) +                    0];
+    pKey[(localSizeLimit / 2)] = l_key[get_local_id(0) + (localSizeLimit / 2)];
+}
+);
--- a/test/OpenCL/BitonicSort/b3BitonicSort.cpp
+++ b/test/OpenCL/BitonicSort/b3BitonicSort.cpp
@@ -0,0 +1,83 @@
+
+#include "b3BitonicSort.h"
+#include "Bullet3Common/b3Scalar.h"
+
+
+//Note: logically shared with BitonicSort OpenCL code!
+// TODO : get parameter from OpenCL and pass it to kernel (needed for platforms other than NVIDIA)
+
+//Sorts the int2 elements of the OpenCL buffer 'pKey' in place by their .x
+//component, launching the bitonic sort kernels stored in 'info'.
+//  pKey        : buffer holding at least 'arrayLength' int2 elements
+//  arrayLength : element count; must be a power of two, values below 2 are a no-op
+//  info        : command queue, kernels, sort direction and local size limit;
+//                info.dir is normalized to 0 or 1 as a side effect
+//Every OpenCL status code is handed to oclCHECKERROR.
+void bitonicSortNv(cl_mem pKey, int arrayLength, b3BitonicSortInfo& info)
+{
+    if(arrayLength < 2)
+        return;
+
+    //Only power-of-two array lengths are supported so far
+    b3Assert( (arrayLength & (arrayLength - 1)) == 0 );
+    //Unsigned copy for comparisons against the unsigned sizes below
+    const unsigned int length = (unsigned int)arrayLength;
+
+    info.dir = (info.dir != 0);
+    cl_int ciErrNum;
+    size_t localWorkSize, globalWorkSize;
+
+    if(length <= info.localSizeLimit)
+    {
+        //Launch bitonicSortLocal as a single work-group. The kernel takes its
+        //sub-array size from the work-group size, so one work-group of
+        //arrayLength / 2 items also covers arrays shorter than localSizeLimit
+        //(the former fixed size of localSizeLimit / 2 exceeded the global size
+        //for such arrays, which is not a valid launch).
+        ciErrNum  = clSetKernelArg(info.bitonicSortLocal, 0,   sizeof(cl_mem), (void *)&pKey);
+        ciErrNum |= clSetKernelArg(info.bitonicSortLocal, 1,  sizeof(cl_uint), (void *)&arrayLength);
+        ciErrNum |= clSetKernelArg(info.bitonicSortLocal, 2,  sizeof(cl_uint), (void *)&info.dir);
+        oclCHECKERROR(ciErrNum, CL_SUCCESS);
+
+        localWorkSize  = length / 2;
+        globalWorkSize = length / 2;
+        ciErrNum = clEnqueueNDRangeKernel(info.m_cqCommandQue, info.bitonicSortLocal, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
+        oclCHECKERROR(ciErrNum, CL_SUCCESS);
+    }
+    else
+    {
+        //Launch bitonicSortLocal1: sorts every subarray of localSizeLimit
+        //elements, alternating the direction between neighbouring subarrays
+        ciErrNum  = clSetKernelArg(info.bitonicSortLocal1, 0,  sizeof(cl_mem), (void *)&pKey);
+        oclCHECKERROR(ciErrNum, CL_SUCCESS);
+
+        localWorkSize  = info.localSizeLimit / 2;
+        globalWorkSize = length / 2;
+        ciErrNum = clEnqueueNDRangeKernel(info.m_cqCommandQue, info.bitonicSortLocal1, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
+        oclCHECKERROR(ciErrNum, CL_SUCCESS);
+
+        //Merge the sorted subarrays into ever larger ones
+        for(unsigned int size = 2 * info.localSizeLimit; size <= length; size <<= 1)
+        {
+            for(unsigned stride = size / 2; stride > 0; stride >>= 1)
+            {
+                if(stride >= info.localSizeLimit)
+                {
+                    //Launch bitonicMergeGlobal: one pass for this stride only
+                    ciErrNum  = clSetKernelArg(info.bitonicSortMergeGlobal, 0,  sizeof(cl_mem), (void *)&pKey);
+                    ciErrNum |= clSetKernelArg(info.bitonicSortMergeGlobal, 1, sizeof(cl_uint), (void *)&arrayLength);
+                    ciErrNum |= clSetKernelArg(info.bitonicSortMergeGlobal, 2, sizeof(cl_uint), (void *)&size);
+                    ciErrNum |= clSetKernelArg(info.bitonicSortMergeGlobal, 3, sizeof(cl_uint), (void *)&stride);
+                    ciErrNum |= clSetKernelArg(info.bitonicSortMergeGlobal, 4, sizeof(cl_uint), (void *)&info.dir);
+                    oclCHECKERROR(ciErrNum, CL_SUCCESS);
+
+                    localWorkSize  = info.localSizeLimit / 4;
+                    globalWorkSize = length / 2;
+
+                    ciErrNum = clEnqueueNDRangeKernel(info.m_cqCommandQue, info.bitonicSortMergeGlobal, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
+                    oclCHECKERROR(ciErrNum, CL_SUCCESS);
+                }
+                else
+                {
+                    //Launch bitonicMergeLocal: the kernel loops over this and all
+                    //smaller strides itself, hence the break below.
+                    //Argument 2 is 'stride' and argument 3 is 'size' for this kernel.
+                    ciErrNum  = clSetKernelArg(info.bitonicSortMergeLocal, 0,  sizeof(cl_mem), (void *)&pKey);
+                    ciErrNum |= clSetKernelArg(info.bitonicSortMergeLocal, 1, sizeof(cl_uint), (void *)&arrayLength);
+                    ciErrNum |= clSetKernelArg(info.bitonicSortMergeLocal, 2, sizeof(cl_uint), (void *)&stride);
+                    ciErrNum |= clSetKernelArg(info.bitonicSortMergeLocal, 3, sizeof(cl_uint), (void *)&size);
+                    ciErrNum |= clSetKernelArg(info.bitonicSortMergeLocal, 4, sizeof(cl_uint), (void *)&info.dir);
+                    oclCHECKERROR(ciErrNum, CL_SUCCESS);
+
+                    localWorkSize  = info.localSizeLimit / 2;
+                    globalWorkSize = length / 2;
+
+                    ciErrNum = clEnqueueNDRangeKernel(info.m_cqCommandQue, info.bitonicSortMergeLocal, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
+                    oclCHECKERROR(ciErrNum, CL_SUCCESS);
+                    break;
+                }
+            }
+        }
+    }
+}
--- a/test/OpenCL/BitonicSort/b3BitonicSort.h
+++ b/test/OpenCL/BitonicSort/b3BitonicSort.h
@@ -0,0 +1,30 @@
+#ifndef B3_BITONIC_SORT_H
+#define B3_BITONIC_SORT_H
+
+#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h"
+
+//Parameters and OpenCL objects consumed by bitonicSortNv.
+struct b3BitonicSortInfo
+{
+	cl_command_queue m_cqCommandQue;	//queue the kernels are enqueued on
+	cl_kernel bitonicSortLocal;			//used when the whole array fits in one work-group
+	cl_kernel bitonicSortLocal1;		//bottom-level sort of each subarray for larger arrays
+	cl_kernel bitonicSortMergeGlobal;	//merge pass for strides >= localSizeLimit
+	cl_kernel bitonicSortMergeLocal;	//merge passes for strides < localSizeLimit
+	unsigned int dir;					//sort direction flag; bitonicSortNv normalizes it to 0 or 1
+	unsigned int localSizeLimit;		//elements per work-group (the work-group size is half of it)
+
+	//All handles start out null; the caller assigns the queue and the kernels.
+	b3BitonicSortInfo()
+	{
+		m_cqCommandQue=0;	//was left uninitialized before
+		bitonicSortLocal=0;
+		bitonicSortLocal1=0;
+		bitonicSortMergeGlobal=0;
+		bitonicSortMergeLocal=0;
+		dir = 1;
+		localSizeLimit = 1024U;
+	}
+};
+
+
+void bitonicSortNv(cl_mem pKey, int arrayLength, b3BitonicSortInfo& info);
+
+#endif //B3_BITONIC_SORT_H
--- a/test/OpenCL/BitonicSort/main.cpp
+++ b/test/OpenCL/BitonicSort/main.cpp
@@ -0,0 +1,192 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2011 Advanced Micro Devices, Inc.  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+///original author: Erwin Coumans
+
+#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
+#include "Bullet3Common/b3Int2.h"
+#include "Bullet3Common/b3Quickprof.h"
+
+#include "b3BitonicSort.h"
+
+#include <stdio.h>
+
+int numSuccess=0;
+int numFailed=0;
+
+cl_context			g_cxMainContext;
+cl_command_queue	g_cqCommandQue;
+
+#define MSTRINGIFY(A) #A
+static const char* kernelSource= 
+#include "BitonicSort.cl"
+
+
+
+
+//Strict weak ordering for b3Int2: by x first, by y when the x values tie.
+static bool compareFunc(const b3Int2& p, const b3Int2& q)
+{
+	if (p.x != q.x)
+		return p.x < q.x;
+	return p.y < q.y;
+}
+
+//Runs the bitonic sort on every device of every OpenCL platform, checks each
+//result against a CPU sort, then creates and releases a GPU-only context as a
+//minimal initialization example.
+//Always returns 0; the pass / fail counts are printed before waiting for <Enter>.
+int main(int argc, char* argv[])
+{
+	int ciErrNum = 0;
+
+	b3Clock clock;
+	
+
+	cl_device_type deviceType = CL_DEVICE_TYPE_ALL;
+	const char* vendorSDK = b3OpenCLUtils::getSdkVendorName();
+
+	printf("This program was compiled using the %s OpenCL SDK\n",vendorSDK);
+	int numPlatforms = b3OpenCLUtils::getNumPlatforms();
+	printf("Num Platforms = %d\n", numPlatforms);
+
+	for (int i=0;i<numPlatforms;i++)
+	{
+		cl_platform_id platform = b3OpenCLUtils::getPlatform(i);
+		b3OpenCLPlatformInfo platformInfo;
+		b3OpenCLUtils::getPlatformInfo(platform,&platformInfo);
+		printf("--------------------------------\n");
+		printf("Platform info for platform nr %d:\n",i);
+		printf("  CL_PLATFORM_VENDOR: \t\t\t%s\n",platformInfo.m_platformVendor);
+		printf("  CL_PLATFORM_NAME: \t\t\t%s\n",platformInfo.m_platformName);
+		printf("  CL_PLATFORM_VERSION: \t\t\t%s\n",platformInfo.m_platformVersion);
+		
+		cl_context context = b3OpenCLUtils::createContextFromPlatform(platform,deviceType,&ciErrNum);
+		
+		int numDevices = b3OpenCLUtils::getNumDevices(context);
+		printf("Num Devices = %d\n", numDevices);
+		for (int j=0;j<numDevices;j++)
+		{
+			cl_device_id dev = b3OpenCLUtils::getDevice(context,j);
+			b3OpenCLDeviceInfo devInfo;
+			b3OpenCLUtils::getDeviceInfo(dev,&devInfo);
+			printf("m_deviceName = %s\n",devInfo.m_deviceName);
+			//b3OpenCLUtils::printDeviceInfo(dev);
+
+			g_cqCommandQue = clCreateCommandQueue(context, dev, 0, &ciErrNum);
+			oclCHECKERROR(ciErrNum, CL_SUCCESS);
+
+			b3BitonicSortInfo info;
+			
+			info.bitonicSortLocal = b3OpenCLUtils::compileCLKernelFromString(context,dev,kernelSource,"kBitonicSortCellIdLocal",&ciErrNum,0,"");
+			oclCHECKERROR(ciErrNum, CL_SUCCESS);
+			info.bitonicSortLocal1 = b3OpenCLUtils::compileCLKernelFromString(context,dev,kernelSource,"kBitonicSortCellIdLocal1",&ciErrNum,0,"");
+			oclCHECKERROR(ciErrNum, CL_SUCCESS);
+			info.bitonicSortMergeGlobal = b3OpenCLUtils::compileCLKernelFromString(context,dev,kernelSource,"kBitonicSortCellIdMergeGlobal",&ciErrNum,0,"");
+			oclCHECKERROR(ciErrNum, CL_SUCCESS);
+			info.bitonicSortMergeLocal = b3OpenCLUtils::compileCLKernelFromString(context,dev,kernelSource,"kBitonicSortCellIdMergeLocal",&ciErrNum,0,"");
+			oclCHECKERROR(ciErrNum, CL_SUCCESS);
+			info.m_cqCommandQue = g_cqCommandQue;
+
+			//Inner scope: the host and device arrays are destroyed before the
+			//kernels and the command queue are released below.
+			{
+				b3OpenCLArray<b3Int2> keyValuesGPU(context,g_cqCommandQue);
+				b3AlignedObjectArray<b3Int2> keyValuesCPU;
+				b3AlignedObjectArray<b3Int2> keyValuesGold;
+				int numValues = 8*1024*1024;//2048;//1024;
+				keyValuesCPU.resize(numValues);
+				//Keys in descending order, all distinct
+				for (int k=0;k<numValues;k++)
+				{
+					b3Int2 v;
+					v.x = numValues+1-k;
+					//unsigned multiply: k*k does not fit in a signed int for large k
+					v.y = (int)((unsigned int)k*(unsigned int)k);
+					keyValuesCPU[k] = v;
+				}
+				//Reference result: the same data sorted on the CPU
+				keyValuesGPU.copyFromHost(keyValuesCPU);
+				keyValuesGPU.copyToHost(keyValuesGold);
+				keyValuesGold.quickSort(compareFunc);
+
+				unsigned int arrayLength = keyValuesGPU.size();
+
+				//Time ten sorts, each starting from the unsorted input
+				for (int iter=0;iter<10;iter++)
+				{
+					keyValuesGPU.copyFromHost(keyValuesCPU);
+					clFinish(info.m_cqCommandQue);
+					unsigned long pre=clock.getTimeMilliseconds();
+					bitonicSortNv(keyValuesGPU.getBufferCL(), arrayLength, info);
+					clFinish(info.m_cqCommandQue);
+					unsigned long post=clock.getTimeMilliseconds();
+					//%lu matches the unsigned long difference
+					printf("GPU sort took %lu ms\n",post-pre);
+				}
+				keyValuesGPU.copyToHost(keyValuesCPU);
+				int success=1;
+				for (int k=0;k<numValues;k++)
+				{
+					if (keyValuesCPU[k].x != keyValuesGold[k].x)
+						success = 0;
+					if (keyValuesCPU[k].y != keyValuesGold[k].y)
+						success = 0;
+				}
+				if (success)
+				{
+					printf("Correct\n");
+					numSuccess++;
+				} else
+				{
+					printf("Sort Failed\n");
+					numFailed++;
+				}
+			}
+
+			//Release the per-device OpenCL objects (they were leaked before)
+			clReleaseKernel(info.bitonicSortLocal);
+			clReleaseKernel(info.bitonicSortLocal1);
+			clReleaseKernel(info.bitonicSortMergeGlobal);
+			clReleaseKernel(info.bitonicSortMergeLocal);
+			clReleaseCommandQueue(g_cqCommandQue);
+		}
+
+		clReleaseContext(context);
+	}
+
+	///Easier method to initialize OpenCL using createContextFromType for a GPU
+	deviceType = CL_DEVICE_TYPE_GPU;
+	
+	void* glCtx=0;
+	void* glDC = 0;
+	printf("Initialize OpenCL using b3OpenCLUtils::createContextFromType for CL_DEVICE_TYPE_GPU\n");
+	g_cxMainContext = b3OpenCLUtils::createContextFromType(deviceType, &ciErrNum, glCtx, glDC);
+	oclCHECKERROR(ciErrNum, CL_SUCCESS);
+
+	if (g_cxMainContext)
+	{
+		int numDev = b3OpenCLUtils::getNumDevices(g_cxMainContext);
+
+		for (int i=0;i<numDev;i++)
+		{
+			cl_device_id		device;
+			device = b3OpenCLUtils::getDevice(g_cxMainContext,i);
+			b3OpenCLDeviceInfo clInfo;
+			b3OpenCLUtils::getDeviceInfo(device,&clInfo);
+			b3OpenCLUtils::printDeviceInfo(device);
+			// create a command-queue
+			g_cqCommandQue = clCreateCommandQueue(g_cxMainContext, device, 0, &ciErrNum);
+			oclCHECKERROR(ciErrNum, CL_SUCCESS);
+			//normally you would create and execute kernels using this command queue
+
+
+			clReleaseCommandQueue(g_cqCommandQue);
+		}
+
+		clReleaseContext(g_cxMainContext);
+
+	}
+	else {
+		//newline added so the counters below start on their own line
+		printf("No OpenCL capable GPU found!\n");
+	}
+
+	printf("numSuccess=%d\n",numSuccess);
+	printf("numFailed=%d\n",numFailed);
+
+	printf("press <Enter>\n");
+	getchar();
+	return 0;
+}
--- a/test/OpenCL/BitonicSort/premake4.lua
+++ b/test/OpenCL/BitonicSort/premake4.lua
@@ -0,0 +1,36 @@
+-- Declares the console project "Test_BitonicSort_<vendor>" when findOpenCL
+-- reports an OpenCL SDK for that vendor; does nothing otherwise.
+function createProject(vendor)
+
+	hasCL = findOpenCL(vendor)
+	if (not hasCL) then
+		return
+	end
+
+	project ("Test_BitonicSort_" .. vendor)
+
+	initOpenCL(vendor)
+
+	language "C++"
+
+	kind "ConsoleApp"
+	targetdir "../../../bin"
+
+	includedirs {"../../../src"}
+
+	files {
+		"main.cpp",
+		"b3BitonicSort.cpp",
+		"../../../src/Bullet3Common/b3AlignedAllocator.cpp",
+		"../../../src/Bullet3Common/b3AlignedAllocator.h",
+		"../../../src/Bullet3Common/b3Quickprof.cpp",
+		"../../../src/Bullet3Common/b3Quickprof.h",
+		"../../../src/Bullet3OpenCL/Initialize/b3OpenCLUtils.cpp",
+		"../../../src/Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
+	}
+end
+
+-- One project per vendor, in the same order as before
+for _, vendor in ipairs({"Apple", "AMD", "Intel", "NVIDIA"}) do
+	createProject(vendor)
+end
--- a/test/OpenCL/ParallelPrimitives/main.cpp
+++ b/test/OpenCL/ParallelPrimitives/main.cpp
@@ -0,0 +1,378 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#include <stdio.h>
+#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h"
+#include "Bullet3Common/b3CommandLineArgs.h"
+#include "Bullet3Common/b3MinMax.h"
+
+int g_nPassed = 0;
+int g_nFailed = 0;
+bool g_testFailed = 0;
+
+#define TEST_INIT g_testFailed = 0;
+#define TEST_ASSERT(x) if( !(x) ){g_testFailed = 1;}
+#define TEST_REPORT(testName) printf("[%s] %s\n",(g_testFailed)?"X":"O", testName); if(g_testFailed) g_nFailed++; else g_nPassed++;
+#define NEXTMULTIPLEOF(num, alignment) (((num)/(alignment) + (((num)%(alignment)==0)?0:1))*(alignment))
+
+cl_context g_context=0;
+cl_device_id g_device=0;
+cl_command_queue g_queue =0;
+const char* g_deviceName = 0;
+
+//Creates the global OpenCL context, picks its first device and creates the
+//command queue used by all tests.
+//  preferredDeviceIndex, preferredPlatformIndex : forwarded unchanged to
+//  b3OpenCLUtils::createContextFromType
+//g_context is always assigned; g_device, g_queue and g_deviceName are assigned
+//only when the context reports at least one device.
+void initCL(int preferredDeviceIndex, int preferredPlatformIndex)
+{
+	int ciErrNum = 0;
+	//bound search and radix sort only work on GPU right now (assume 32 or 64 width workgroup without barriers)
+
+	cl_device_type deviceType = CL_DEVICE_TYPE_ALL;
+
+	g_context = b3OpenCLUtils::createContextFromType(deviceType, &ciErrNum, 0,0,preferredDeviceIndex, preferredPlatformIndex);
+	oclCHECKERROR(ciErrNum, CL_SUCCESS);
+	int numDev = b3OpenCLUtils::getNumDevices(g_context);
+	if (numDev>0)
+	{
+		//static: g_deviceName keeps pointing into this object after initCL
+		//returns, so it must not live on the stack
+		static b3OpenCLDeviceInfo info;
+		g_device= b3OpenCLUtils::getDevice(g_context,0);
+		g_queue = clCreateCommandQueue(g_context, g_device, 0, &ciErrNum);
+		oclCHECKERROR(ciErrNum, CL_SUCCESS);
+        b3OpenCLUtils::printDeviceInfo(g_device);
+		b3OpenCLUtils::getDeviceInfo(g_device,&info);
+		g_deviceName = info.m_deviceName;
+	}
+}
+
+void exitCL() //releases the OpenCL objects that initCL stored in g_queue and g_context
+{
+	clReleaseCommandQueue(g_queue);
+	clReleaseContext(g_context);
+}
+
+
+inline void fillIntTest() //compares b3FillCL::execute on an OpenCL buffer with b3FillCL::executeHost for several sizes
+{
+	TEST_INIT;
+
+	b3FillCL* fillCL = new b3FillCL(g_context,g_device,g_queue);
+	int maxSize=1024*256;
+	b3OpenCLArray<int> intBuffer(g_context,g_queue,maxSize);
+	intBuffer.resize(maxSize);
+	
+#define NUM_TESTS 7 //iterations per test; also used by the test functions further down in this file
+
+	int dx = maxSize/NUM_TESTS;
+	for (int iter=0;iter<NUM_TESTS;iter++)
+	{
+		int size = b3Min( 11+dx*iter, maxSize ); //grows by dx per iteration, capped at maxSize
+
+		int value = 2;
+		
+
+		int offset=0;
+		fillCL->execute(intBuffer,value,size,offset); //fill on the OpenCL buffer
+
+		b3AlignedObjectArray<int> hostBuf2;
+		hostBuf2.resize(size);
+		fillCL->executeHost(hostBuf2,value,size,offset); //same fill on a host array, used as the reference
+
+		b3AlignedObjectArray<int> hostBuf;
+		intBuffer.copyToHost(hostBuf);
+
+		for(int i=0; i<size; i++) //only the first 'size' entries are compared
+		{
+				TEST_ASSERT( hostBuf[i] == hostBuf2[i] );
+				TEST_ASSERT( hostBuf[i] == hostBuf2[i] ); //NOTE(review): exact duplicate of the previous check
+		}
+	}
+
+	
+
+	delete fillCL;
+
+	TEST_REPORT( "fillIntTest" );
+}
+
+
+__inline
+void seedRandom(int seed) //seeds the C library generator that getRandom reads through rand()
+{
+	srand( seed );
+}
+
+//Returns a pseudo-random value of type T in [minV, maxV): minV plus a
+//fraction of (maxV - minV) taken from rand(), with a resolution of 1/10000.
+template<typename T>
+__inline
+T getRandom(const T& minV, const T& maxV)
+{
+	const float fraction = (rand()%10000)/10000.f;
+	const T span = maxV - minV;
+	return (T)(minV + fraction*span);
+}
+
+//Ordering functor for b3SortData: by m_key first, by m_value when the keys tie.
+struct b3SortDataCompare
+{
+	inline bool operator()(const b3SortData& first, const b3SortData& second) const
+	{
+		if (first.m_key != second.m_key)
+			return first.m_key < second.m_key;
+		return first.m_value < second.m_value;
+	}
+};
+
+
+void boundSearchTest( ) //compares b3BoundSearchCL::execute with executeHost on sorted random keys, then checks the reported ranges
+{
+	TEST_INIT;
+
+	int maxSize = 1024*256;
+	int bucketSize = 256; //number of distinct key values and of result entries
+
+	b3OpenCLArray<b3SortData> srcCL(g_context,g_queue,maxSize);
+	b3OpenCLArray<unsigned int> upperCL(g_context,g_queue,maxSize);
+	b3OpenCLArray<unsigned int> lowerCL(g_context,g_queue,maxSize);
+	
+	b3AlignedObjectArray<b3SortData> srcHost;
+	b3AlignedObjectArray<unsigned int> upperHost;
+	b3AlignedObjectArray<unsigned int> lowerHost;
+	b3AlignedObjectArray<unsigned int> upperHostCompare;
+	b3AlignedObjectArray<unsigned int> lowerHostCompare;
+	
+	b3BoundSearchCL* search = new b3BoundSearchCL(g_context,g_device,g_queue, maxSize);
+
+
+	int dx = maxSize/NUM_TESTS;
+	for(int iter=0; iter<NUM_TESTS; iter++)
+	{
+		
+		int size = b3Min( 128+dx*iter, maxSize ); //grows by dx per iteration, capped at maxSize
+
+		upperHost.resize(bucketSize);
+		lowerHost.resize(bucketSize);
+		upperHostCompare.resize(bucketSize);
+		lowerHostCompare.resize(bucketSize);
+
+		srcHost.resize(size);
+
+		for(int i=0; i<size; i++) 
+		{
+			b3SortData v;
+//			v.m_key = i<2? 0 : 5;
+			v.m_key = getRandom(0,bucketSize); //random key in [0, bucketSize)
+
+			v.m_value = i;
+			srcHost.at(i) = v;
+		}
+
+		srcHost.quickSort(b3SortDataCompare()); //the input is sorted by key before it is uploaded
+		srcCL.copyFromHost(srcHost);
+
+		{
+			
+			for(int i=0; i<bucketSize; i++) 
+			{
+				lowerHost[i] = -1; //-1 converts to the largest unsigned value
+				lowerHostCompare[i] = -1;
+				upperHost[i] = -1;
+				upperHostCompare[i] = -1;
+			}
+			upperCL.copyFromHost(upperHost);
+			lowerCL.copyFromHost(lowerHost);
+		}
+
+		search->execute(srcCL,size,upperCL,bucketSize,b3BoundSearchCL::BOUND_UPPER); //search on the OpenCL buffers
+		search->execute(srcCL,size,lowerCL,bucketSize,b3BoundSearchCL::BOUND_LOWER);
+
+		search->executeHost(srcHost,size,upperHostCompare,bucketSize,b3BoundSearchCL::BOUND_UPPER); //same search on the host arrays, used as the reference
+		search->executeHost(srcHost,size,lowerHostCompare,bucketSize,b3BoundSearchCL::BOUND_LOWER);
+
+		lowerCL.copyToHost(lowerHost);
+		upperCL.copyToHost(upperHost);
+		for(int i=0; i<bucketSize; i++)
+		{
+			TEST_ASSERT(upperHostCompare[i] == upperHost[i]);
+			TEST_ASSERT(lowerHostCompare[i] == lowerHost[i]);
+		}
+		/*
+		for(int i=1; i<bucketSize; i++)
+		{
+			int lhi_1 = lowerHost[i-1];
+			int lhi = lowerHost[i];
+
+			for(int j=lhi_1; j<lhi; j++)
+			//for(int j=lowerHost[i-1]; j<lowerHost[i]; j++)
+			{
+				TEST_ASSERT( srcHost[j].m_key < i );
+			}
+		}
+
+		for(int i=0; i<bucketSize; i++)
+		{
+			int jMin = (i==0)?0:upperHost[i-1];
+			for(int j=jMin; j<upperHost[i]; j++)
+			{
+				TEST_ASSERT( srcHost[j].m_key <= i );
+			}
+		}
+		*/
+
+
+		for(int i=0; i<bucketSize; i++) //every element in [lowerHost[i], upperHost[i]) must carry key i
+		{
+			int lhi = lowerHost[i];
+			int uhi = upperHost[i];
+
+			for(int j=lhi; j<uhi; j++)
+			{
+				if ( srcHost[j].m_key != i )
+				{
+					printf("error %d != %d\n",srcHost[j].m_key,i);
+				}
+				TEST_ASSERT( srcHost[j].m_key == i );
+			}
+		}
+
+	}
+
+	delete search;
+
+	TEST_REPORT( "boundSearchTest" );
+}
+
+
+void prefixScanTest() //compares b3PrefixScanCL::execute with executeHost on arrays of ones
+{
+	TEST_INIT;
+
+	int maxSize = 1024*256;
+
+	b3AlignedObjectArray<unsigned int> buf0Host;
+	b3AlignedObjectArray<unsigned int> buf1Host;
+
+	b3OpenCLArray<unsigned int> buf2CL(g_context,g_queue,maxSize);
+	b3OpenCLArray<unsigned int> buf3CL(g_context,g_queue,maxSize);
+	
+	
+	b3PrefixScanCL* scan = new b3PrefixScanCL(g_context,g_device,g_queue,maxSize);
+		
+	int dx = maxSize/NUM_TESTS;
+	for(int iter=0; iter<NUM_TESTS; iter++)
+	{
+		int size = b3Min( 128+dx*iter, maxSize ); //grows by dx per iteration, capped at maxSize
+		buf0Host.resize(size);
+		buf1Host.resize(size);
+
+		for(int i=0; i<size; i++) 
+			buf0Host[i] = 1;
+		
+		buf2CL.copyFromHost( buf0Host);
+	
+		unsigned int sumHost, sumGPU;
+
+		scan->executeHost(buf0Host, buf1Host, size, &sumHost ); //reference result in buf1Host / sumHost
+		scan->execute( buf2CL, buf3CL, size, &sumGPU ); //result on the OpenCL buffers in buf3CL / sumGPU
+
+		buf3CL.copyToHost(buf0Host); //buf0Host is reused to hold the downloaded result
+		
+		TEST_ASSERT( sumHost == sumGPU );
+		for(int i=0; i<size; i++) 
+			TEST_ASSERT( buf1Host[i] == buf0Host[i] );
+	}
+
+	delete scan;
+
+	TEST_REPORT( "scanTest" );
+}
+
+
+bool radixSortTest() //compares b3RadixSort32CL::execute with executeHost; returns g_testFailed, i.e. true when a check failed
+{
+	TEST_INIT;
+	
+	int maxSize = 1024*256;
+
+	b3AlignedObjectArray<b3SortData> buf0Host;
+	buf0Host.resize(maxSize);
+	b3AlignedObjectArray<b3SortData> buf1Host;
+	buf1Host.resize(maxSize );
+	b3OpenCLArray<b3SortData> buf2CL(g_context,g_queue,maxSize);
+
+	b3RadixSort32CL* sort = new b3RadixSort32CL(g_context,g_device,g_queue,maxSize);
+
+	int dx = maxSize/NUM_TESTS;
+	for(int iter=0; iter<NUM_TESTS; iter++)
+	{
+		int size = b3Min( 128+dx*iter, maxSize-512 ); //capped 512 below maxSize so the rounding on the next line stays within maxSize
+		size = NEXTMULTIPLEOF( size, 512 );//not necessary
+		
+		buf0Host.resize(size);
+
+		for(int i=0; i<size; i++)
+		{
+			b3SortData v;
+			v.m_key = getRandom(0,0xff); //random key in [0, 0xff)
+			v.m_value = i;
+			buf0Host[i] = v;
+		}
+
+		buf2CL.copyFromHost( buf0Host); //upload the unsorted data before the host copy is sorted
+		
+
+		sort->executeHost( buf0Host); //reference: sort of the host array
+		sort->execute(buf2CL); //sort of the OpenCL buffer
+
+		buf2CL.copyToHost(buf1Host);
+				
+		for(int i=0; i<size; i++) 
+		{
+			TEST_ASSERT( buf0Host[i].m_value == buf1Host[i].m_value && buf0Host[i].m_key == buf1Host[i].m_key );
+		}
+	}
+
+	delete sort;
+
+	TEST_REPORT( "radixSort" );
+
+	return g_testFailed;
+}
+
+
+//Entry point: reads the optional deviceId / platformId arguments, runs the
+//four primitive tests and prints the pass / fail totals.
+int main(int argc, char** argv)
+{
+	//Initial values; presumably left untouched when the argument is absent
+	int deviceIdx = -1;
+	int platformIdx = -1;
+
+	b3CommandLineArgs cmdLine(argc, argv);
+	cmdLine.GetCmdLineArgument("deviceId", deviceIdx);
+	cmdLine.GetCmdLineArgument("platformId", platformIdx);
+
+	initCL(deviceIdx, platformIdx);
+	fillIntTest();
+	boundSearchTest();
+	prefixScanTest();
+	radixSortTest();
+	exitCL();
+
+	printf("%d tests passed, %d tests failed\n",g_nPassed, g_nFailed);
+	printf("End, press <enter>\n");
+	getchar();
+	return 0;
+}
--- a/test/OpenCL/ParallelPrimitives/premake4.lua
+++ b/test/OpenCL/ParallelPrimitives/premake4.lua
@@ -0,0 +1,41 @@
+-- Declares the console project "Test_OpenCL_Primitives_<vendor>" when
+-- findOpenCL reports an OpenCL SDK for that vendor; does nothing otherwise.
+function createProject(vendor)
+	hasCL = findOpenCL(vendor)
+	if (not hasCL) then
+		return
+	end
+
+	project ("Test_OpenCL_Primitives_" .. vendor)
+
+	initOpenCL(vendor)
+
+	language "C++"
+
+	kind "ConsoleApp"
+	targetdir "../../../bin"
+	includedirs {".","../../../src"}
+
+	files {
+		"main.cpp",
+		"../../../src/Bullet3OpenCL/Initialize/b3OpenCLInclude.h",
+		"../../../src/Bullet3OpenCL/Initialize/b3OpenCLUtils.cpp",
+		"../../../src/Bullet3OpenCL/Initialize/b3OpenCLUtils.h",
+		"../../../src/Bullet3OpenCL/ParallelPrimitives/b3FillCL.cpp",
+		"../../../src/Bullet3OpenCL/ParallelPrimitives/b3FillCL.h",
+		"../../../src/Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.cpp",
+		"../../../src/Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.h",
+		"../../../src/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.cpp",
+		"../../../src/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h",
+		"../../../src/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp",
+		"../../../src/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h",
+		"../../../src/Bullet3Common/b3AlignedAllocator.cpp",
+		"../../../src/Bullet3Common/b3AlignedAllocator.h",
+		"../../../src/Bullet3Common/b3AlignedObjectArray.h",
+	}
+end
+
+-- One project per vendor, in the same order as before
+for _, vendor in ipairs({"AMD", "Intel", "NVIDIA", "Apple"}) do
+	createProject(vendor)
+end
--- a/test/OpenCL/RadixSortBenchmark/main.cpp
+++ b/test/OpenCL/RadixSortBenchmark/main.cpp
@@ -0,0 +1,712 @@
+/******************************************************************************
+ * Copyright 2010 Duane Merrill
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. 
+ * 
+ * 
+ * 
+ * 
+ * AUTHORS' REQUEST: 
+ * 
+ * 		If you use|reference|benchmark this code, please cite our Technical 
+ * 		Report (http://www.cs.virginia.edu/~dgm4d/papers/RadixSortTR.pdf):
+ * 
+ *		@TechReport{ Merrill:Sorting:2010,
+ *        	author = "Duane Merrill and Andrew Grimshaw",
+ *        	title = "Revisiting Sorting for GPGPU Stream Architectures",
+ *        	year = "2010",
+ *        	institution = "University of Virginia, Department of Computer Science",
+ *        	address = "Charlottesville, VA, USA",
+ *        	number = "CS2010-03"
+ *		}
+ * 
+ * For more information, see our Google Code project site: 
+ * http://code.google.com/p/back40computing/
+ * 
+ * Thanks!
+ ******************************************************************************/
+
+/******************************************************************************
+ * Simple test driver program for *large-problem* radix sorting.
+ *
+ * Useful for demonstrating how to integrate radix sorting into 
+ * your application 
+ ******************************************************************************/
+
+/******************************************************************************
+ * Converted from CUDA to OpenCL/DirectCompute by Erwin Coumans
+ ******************************************************************************/
+#ifdef _WIN32
+#pragma warning (disable:4996)
+#endif
+#include <stdlib.h> 
+#include <stdio.h> 
+#include <string.h> 
+#include <math.h> 
+#include <float.h>
+#include <algorithm>
+#include <string>
+
+
+//#include <iostream>
+#include <sstream>
+/**********************
+*
+*/
+
+#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
+#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
+#include "Bullet3Common/b3Quickprof.h"
+
+// OpenCL context, device and command queue shared by the routines in this file.
+// main() assigns all three before any sort is run.
+cl_context g_cxMainContext;
+cl_device_id g_device;
+cl_command_queue g_cqCommandQueue;
+
+/***********************
+*
+*/
+
+// When true, TestSort() prints the sorted keys; main() sets it from the --v flag.
+bool g_verbose;
+///Preferred OpenCL device/platform. When < 0 then no preference is used. 
+///Note that b3OpenCLUtils might still use the preference of using a platform vendor that matches the SDK vendor used to build the application.
+///Preferred device/platform take priority over this platform-vendor match
+// main() overwrites these from the --deviceId / --platformId options when they are given.
+int gPreferredDeviceId = -1;
+int gPreferredPlatformId = -1;
+
+
+
+/******************************************************************************
+ * Routines
+ ******************************************************************************/
+
+
+/**
+ * Keys-only sorting.  Uses the GPU to sort the specified vector of elements for the given 
+ * number of iterations, displaying runtime information.
+ *
+ * The keys are staged through an unsigned int host array, so each key is
+ * converted to unsigned int on the way in and back to K on the way out.
+ *
+ * @param[in] 		num_elements 
+ * 		Size in elements of the vector to sort
+ * @param[in,out] 	h_keys 
+ * 		Vector of keys to sort; overwritten with the data read back from
+ * 		the device after the last sort
+ * @param[in] 		iterations  
+ * 		Number of timed invocations of the GPU sorting primitive
+ */
+template <typename K>
+void TimedSort(
+	unsigned int num_elements, 
+	K *h_keys,
+	unsigned int iterations)
+{
+	// Both counters are unsigned, hence %u.
+	printf("Keys only, %u iterations, %u elements\n", iterations, num_elements);
+
+	// Signed copy of the size so the index loops do not mix signed and unsigned operands.
+	const int count = (int) num_elements;
+
+	b3AlignedObjectArray<unsigned int> hostData;
+	hostData.resize(num_elements);
+	for (int i = 0; i < count; i++)
+	{
+		hostData[i] = h_keys[i];
+	}
+
+	b3RadixSort32CL sorter(g_cxMainContext,g_device,g_cqCommandQueue);
+	b3OpenCLArray<unsigned int> gpuData(g_cxMainContext,g_cqCommandQueue);
+
+	// Untimed warm-up sort; clFinish makes sure it has completed before timing starts.
+	gpuData.copyFromHost(hostData);
+	sorter.execute(gpuData);
+	clFinish(g_cqCommandQueue);
+
+	// Perform the timed number of sorting iterations
+	double elapsed = 0;
+	b3Clock watch;
+	watch.reset();
+
+	for (unsigned int i = 0; i < iterations; i++) 
+	{
+		// Move a fresh copy of the problem into device storage (not timed)
+		gpuData.copyFromHost(hostData);
+		clFinish(g_cqCommandQueue);
+
+		// Time only the sort itself, from submission until the queue has drained
+		double startMs = watch.getTimeMicroseconds()/1e3;
+		sorter.execute(gpuData);
+		clFinish(g_cqCommandQueue);
+		double stopMs = watch.getTimeMicroseconds()/1e3;
+
+		double duration = stopMs - startMs;
+		elapsed += duration;
+		printf("duration = %f\n", duration);
+	}
+
+	// Display timing information. Zero iterations (or a zero average) reports 0
+	// instead of dividing by zero.
+	double avg_runtime = (iterations > 0) ? (elapsed / iterations) : 0.0;
+	// elements / milliseconds / 1000 == millions of elements per second
+	double throughput = (avg_runtime > 0.0) ? (((double) num_elements) / avg_runtime / 1000.0) : 0.0;
+	printf(", %f GPU ms, %f x10^6 elts/sec\n", 	avg_runtime,	throughput);
+
+	// Hand the device contents back to the caller
+	gpuData.copyToHost(hostData);
+	for (int i = 0; i < count; i++)
+	{
+		h_keys[i] = hostData[i];
+	}
+}
+
+/**
+ * Key-value sorting.  Uses the GPU to sort the specified vector of elements for the given 
+ * number of iterations, displaying runtime information.
+ *
+ * Keys and values are packed into b3SortData records (m_key / m_value) for
+ * the device and unpacked again afterwards.
+ *
+ * @param[in] 		num_elements 
+ * 		Size in elements of the vector to sort
+ * @param[in,out] 	h_keys 
+ * 		Vector of keys to sort; overwritten with the keys read back from
+ * 		the device after the last sort
+ * @param[in,out] 	h_values  
+ * 		Vector of values to sort; overwritten with the values read back
+ * 		from the device after the last sort
+ * @param[in] 		iterations  
+ * 		Number of timed invocations of the GPU sorting primitive
+ */
+template <typename K, typename V>
+void TimedSort(
+	unsigned int num_elements, 
+	K *h_keys,
+	V *h_values, 
+	unsigned int iterations) 
+{
+	// Both counters are unsigned, hence %u.
+	printf("Key-values, %u iterations, %u elements\n", iterations, num_elements);
+
+	// Signed copy of the size so the index loops do not mix signed and unsigned operands.
+	const int count = (int) num_elements;
+
+	b3AlignedObjectArray<b3SortData> hostData;
+	hostData.resize(num_elements);
+	for (int i = 0; i < count; i++)
+	{
+		hostData[i].m_key = h_keys[i];
+		hostData[i].m_value = h_values[i];
+	}
+
+	b3RadixSort32CL sorter(g_cxMainContext,g_device,g_cqCommandQueue);
+	b3OpenCLArray<b3SortData> gpuData(g_cxMainContext,g_cqCommandQueue);
+
+	// Untimed warm-up sort; clFinish makes sure it has completed before timing starts.
+	gpuData.copyFromHost(hostData);
+	sorter.execute(gpuData);
+	clFinish(g_cqCommandQueue);
+
+	// Perform the timed number of sorting iterations
+	double elapsed = 0;
+	b3Clock watch;
+	watch.reset();
+
+	for (unsigned int i = 0; i < iterations; i++) 
+	{
+		// Move a fresh copy of the problem into device storage (not timed)
+		gpuData.copyFromHost(hostData);
+		clFinish(g_cqCommandQueue);
+
+		// Time only the sort itself, from submission until the queue has drained
+		double startMs = watch.getTimeMicroseconds()/1e3;
+		sorter.execute(gpuData);
+		clFinish(g_cqCommandQueue);
+		double stopMs = watch.getTimeMicroseconds()/1e3;
+
+		double duration = stopMs - startMs;
+		elapsed += duration;
+		printf("duration = %f\n", duration);
+	}
+
+	// Display timing information. Zero iterations (or a zero average) reports 0
+	// instead of dividing by zero.
+	double avg_runtime = (iterations > 0) ? (elapsed / iterations) : 0.0;
+	// elements / milliseconds / 1000 == millions of elements per second
+	double throughput = (avg_runtime > 0.0) ? (((double) num_elements) / avg_runtime / 1000.0) : 0.0;
+	printf(", %f GPU ms, %f x10^6 elts/sec\n", 	avg_runtime,	throughput);
+
+	// Hand the device contents back to the caller
+	gpuData.copyToHost(hostData);
+	for (int i = 0; i < count; i++)
+	{
+		h_keys[i] = hostData[i].m_key;
+		h_values[i] = hostData[i].m_value;
+	}
+}
+
+
+
+/**
+ * Generates random 32-bit keys.
+ * 
+ * We always take the second-order byte from rand() because the higher-order 
+ * bits returned by rand() are commonly considered more uniformly distributed
+ * than the lower-order bits.
+ * 
+ * We can decrease the entropy level of keys by adopting the technique 
+ * of Thearling and Smith in which keys are computed from the bitwise AND of 
+ * multiple random samples: 
+ * 
+ * entropy_reduction	| Effectively-unique bits per key
+ * -----------------------------------------------------
+ * -1					| 0
+ * 0					| 32
+ * 1					| 25.95
+ * 2					| 17.41
+ * 3					| 10.78
+ * 4					| 6.42
+ * ...					| ...
+ * 
+ */
+template <typename K>
+void RandomBits(K &key, int entropy_reduction = 0, int lower_key_bits = sizeof(K) * 8)
+{
+	const unsigned int NUM_UCHARS = (sizeof(K) + sizeof(unsigned char) - 1) / sizeof(unsigned char);
+	unsigned char key_bits[NUM_UCHARS];
+	
+	do {
+	
+		// Build the key one byte at a time. Each byte is the AND of
+		// (entropy_reduction + 1) samples, each sample being bits 7..14 of rand().
+		for (unsigned int j = 0; j < NUM_UCHARS; j++) {
+			unsigned char quarterword = 0xff;
+			for (int i = 0; i <= entropy_reduction; i++) {
+				quarterword &= (rand() >> 7);
+			}
+			key_bits[j] = quarterword;
+		}
+		
+		// Optionally keep only the lowest lower_key_bits bits. Negative or
+		// full-width requests leave the key untouched, and keys wider than the
+		// 64-bit scratch word are skipped so the memcpy below cannot overrun it.
+		if (lower_key_bits >= 0 &&
+			(size_t) lower_key_bits < sizeof(K) * 8 &&
+			sizeof(K) <= sizeof(unsigned long long)) {
+			unsigned long long base = 0;
+			memcpy(&base, key_bits, sizeof(K));
+			// 1ULL keeps the shift 64 bits wide; a plain int 1 is undefined
+			// once lower_key_bits reaches the width of int.
+			base &= (1ULL << lower_key_bits) - 1;
+			memcpy(key_bits, &base, sizeof(K));
+		}
+		
+		memcpy(&key, key_bits, sizeof(K));
+		
+	} while (key != key);		// avoids NaNs when generating random floating point numbers 
+}
+
+
+/******************************************************************************
+ * Templated routines for printing keys/values to the console 
+ ******************************************************************************/
+
+// Generic fallback: prints the value with the "%d" conversion.
+template<typename T> 
+void PrintValue(T val)
+{
+	printf("%d", val);
+}
+
+// Floating-point keys use "%f".
+template<>
+void PrintValue<float>(float val)
+{
+	printf("%f", val);
+}
+
+template<>
+void PrintValue<double>(double val)
+{
+	printf("%f", val);
+}
+
+// Narrow unsigned types are widened explicitly to match the "%u" conversion.
+template<>
+void PrintValue<unsigned char>(unsigned char val)
+{
+	const unsigned int widened = val;
+	printf("%u", widened);
+}
+
+template<>
+void PrintValue<unsigned short>(unsigned short val)
+{
+	const unsigned int widened = val;
+	printf("%u", widened);
+}
+
+template<>
+void PrintValue<unsigned int>(unsigned int val)
+{
+	printf("%u", val);
+}
+
+// long and long long variants carry the matching length modifier.
+template<>
+void PrintValue<long>(long val)
+{
+	printf("%ld", val);
+}
+
+template<>
+void PrintValue<unsigned long>(unsigned long val)
+{
+	printf("%lu", val);
+}
+
+template<>
+void PrintValue<long long>(long long val)
+{
+	printf("%lld", val);
+}
+
+template<>
+void PrintValue<unsigned long long>(unsigned long long val)
+{
+	printf("%llu", val);
+}
+
+
+
+/**
+ * Checks two arrays for element-wise equality and reports the outcome on stdout.
+ *
+ * Scanning stops at the first differing index: that index and both values are
+ * printed and, when verbose is set, so are the neighbouring elements of each
+ * array. Returns 1 if a mismatch was found, 0 if all len elements agree.
+ */
+template <typename T, typename SizeT>
+int CompareResults(T* computed, T* reference, SizeT len, bool verbose = true)
+{
+	printf("\n");
+
+	for (SizeT i = 0; i < len; i++)
+	{
+		// Matching element: keep scanning.
+		if (!(computed[i] != reference[i]))
+		{
+			continue;
+		}
+
+		printf("INCORRECT: [%lu]: ", (unsigned long) i);
+		PrintValue<T>(computed[i]);
+		printf(" != ");
+		PrintValue<T>(reference[i]);
+
+		if (verbose)
+		{
+			// Both dumps cover indices [i - 5, i + 5), clipped to the array bounds.
+			const size_t window_begin = (i >= 5) ? i - 5 : 0;
+
+			printf("\nresult[...");
+			for (size_t k = window_begin; (k < i + 5) && (k < len); k++)
+			{
+				PrintValue<T>(computed[k]);
+				printf(", ");
+			}
+			printf("...]");
+
+			printf("\nreference[...");
+			for (size_t k = window_begin; (k < i + 5) && (k < len); k++)
+			{
+				PrintValue<T>(reference[k]);
+				printf(", ");
+			}
+			printf("...]");
+		}
+
+		return 1;
+	}
+
+	printf("CORRECT\n");
+	return 0;
+}
+
+/**
+ * Creates an example sorting problem whose keys is a vector of the specified 
+ * number of K elements, values of V elements, and then dispatches the problem 
+ * to the GPU for the given number of iterations, displaying runtime information.
+ *
+ * After the timed runs the keys are checked against a std::sort of the
+ * original input; the values are not verified.
+ *
+ * @param[in] 		iterations  
+ * 		Number of times to invoke the GPU sorting primitive
+ * @param[in] 		num_elements 
+ * 		Size in elements of the vector to sort
+ * @param[in] 		keys_only 
+ * 		When true only keys are sorted; otherwise each key is paired
+ * 		with a value that is initialised from the key itself
+ */
+template<typename K, typename V>
+void TestSort(
+	unsigned int iterations,
+	int num_elements,
+	bool keys_only)
+{
+	if (num_elements < 0) {
+		printf("Invalid number of elements: %d\n", num_elements);
+		return;
+	}
+
+    // Allocate the sorting problem on the host and fill the keys with random bytes
+	K *h_keys = (K*) malloc(num_elements * sizeof(K));
+	K *h_reference_keys = (K*) malloc(num_elements * sizeof(K));
+	V *h_values = keys_only ? NULL : (V*) malloc(num_elements * sizeof(V));
+
+	// malloc(0) may legitimately return NULL, so only a non-empty problem can fail here.
+	if (num_elements > 0 && (!h_keys || !h_reference_keys || (!keys_only && !h_values))) {
+		printf("Failed to allocate host memory for %d elements\n", num_elements);
+		free(h_keys);
+		free(h_reference_keys);
+		free(h_values);
+		return;
+	}
+
+	// Use random bits
+	for (int i = 0; i < num_elements; ++i) {
+		RandomBits<K>(h_keys[i], 0);
+		if (!keys_only)
+			h_values[i] = h_keys[i];
+
+		// Untouched copy of the input, sorted on the host below as the reference
+		h_reference_keys[i] = h_keys[i];
+	}
+
+    // Run the timing test 
+	if (keys_only) {
+		TimedSort<K>(num_elements, h_keys, iterations);
+	} else {
+		TimedSort<K, V>(num_elements, h_keys, h_values, iterations);
+	}
+
+	// Display sorted key data
+	if (g_verbose) {
+		printf("\n\nKeys:\n");
+		for (int i = 0; i < num_elements; i++) {	
+			PrintValue<K>(h_keys[i]);
+			printf(", ");
+		}
+		printf("\n\n");
+	}	
+	
+    // Verify solution
+	std::sort(h_reference_keys, h_reference_keys + num_elements);	
+	CompareResults<K>(h_keys, h_reference_keys, num_elements, true);
+	printf("\n");
+	fflush(stdout);
+
+	// Free our allocated host memory (free(NULL) is a no-op)
+	free(h_keys);
+	free(h_reference_keys);
+	free(h_values);
+}
+
+
+
+/**
+ * Displays the commandline usage for this tool.
+ *
+ * The options and defaults listed here mirror what main() parses and the
+ * initial values it assigns; keep the two in sync.
+ */
+void Usage() 
+{
+	printf("\ntest_large_problem_sorting [--help] [--v] [--i=<num-iterations>] [--n=<num-elements>] [--key-values] [--deviceId=<int>] [--platformId=<int>]\n"); 
+	printf("\n");
+	printf("\t--help\tDisplays this message and exits.\n");
+	printf("\n");
+	printf("\t--v\tDisplays sorted results to the console.\n");
+	printf("\n");
+	printf("\t--i\tPerforms the sorting operation <num-iterations> times\n");
+	printf("\t\t\ton the device. Re-copies original input each time. Default = 10\n");
+	printf("\n");
+	printf("\t--n\tThe number of elements to comprise the sample problem\n");
+	printf("\t\t\tDefault = 8388608 (8*1024*1024)\n");
+	printf("\n");
+	printf("\t--key-values\tSpecifies that keys are accompanied by value pairings\n");
+	printf("\n");
+	printf("\t--deviceId\tPreferred OpenCL device index. Default = -1 (no preference)\n");
+	printf("\n");
+	printf("\t--platformId\tPreferred OpenCL platform index. Default = -1 (no preference)\n");
+	printf("\n");
+}
+
+
+/******************************************************************************
+ * Command-line parsing
+ ******************************************************************************/
+#include <map>
+#include <algorithm>
+#include <string>
+
+/**
+ * Minimal command-line parser for "--name" and "--name=value" tokens.
+ *
+ * Every argv entry after argv[0] that starts with "--" is stored in a map from
+ * name to value text; entries without '=' get an empty value. All other
+ * entries are ignored. A name given more than once keeps its last value.
+ */
+class b3CommandLineArgs
+{
+protected:
+
+	// Option name (without the leading "--") -> value text ("" when no '=' was given)
+	std::map<std::string, std::string> pairs;
+
+public:
+
+	// Constructor: parses argv[1] .. argv[argc - 1]
+	b3CommandLineArgs(int argc, char **argv)
+	{
+		using namespace std;
+
+		for (int i = 1; i < argc; i++)
+		{
+			string arg = argv[i];
+
+			// compare() tolerates arguments shorter than two characters,
+			// whereas indexing arg[0] / arg[1] would read out of range.
+			if (arg.compare(0, 2, "--") != 0) {
+				continue;
+			}
+
+			const string::size_type pos = arg.find('=');
+			if (pos == string::npos) {
+				// "--name": flag without a value
+				pairs[arg.substr(2)] = "";
+			} else {
+				// "--name=value": split at the first '='
+				pairs[arg.substr(2, pos - 2)] = arg.substr(pos + 1);
+			}
+		}
+	}
+
+	// Returns true when "--arg_name" (with or without a value) was present.
+	bool CheckCmdLineFlag(const char* arg_name) const
+	{
+		return pairs.find(arg_name) != pairs.end();
+	}
+
+	// Parses the value of "--arg_name" into val; defined after the class.
+	template <typename T>
+	void GetCmdLineArgument(const char *arg_name, T &val);
+
+	// Number of distinct "--" options that were recognised.
+	int ParsedArgc() const
+	{
+		return (int) pairs.size();
+	}
+};
+
+/**
+ * Reads the value of "--arg_name" into val using stream extraction.
+ *
+ * val is left untouched when the option is absent or when its text cannot be
+ * parsed as a T, so the caller's default survives bad input.
+ */
+template <typename T>
+void b3CommandLineArgs::GetCmdLineArgument(const char *arg_name, T &val)
+{
+	using namespace std;
+	map<string, string>::iterator itr = pairs.find(arg_name);
+	if (itr == pairs.end()) {
+		return;
+	}
+
+	// Extract into a copy: since C++11 a failed numeric extraction writes 0
+	// into its target, which would silently replace the caller's default.
+	T parsed = val;
+	istringstream strstream(itr->second);
+	if (strstream >> parsed) {
+		val = parsed;
+	}
+}
+
+/**
+ * C-string specialization: val receives a malloc'ed copy of the option's value
+ * text, which the caller must free(). val is set to NULL when the option is
+ * absent or the allocation fails.
+ */
+template <>
+void b3CommandLineArgs::GetCmdLineArgument<char*>(const char* arg_name, char* &val)
+{
+	using namespace std;
+	val = NULL;
+
+	map<string, string>::iterator itr = pairs.find(arg_name);
+	if (itr == pairs.end()) {
+		return;
+	}
+
+	const string &s = itr->second;
+	char *copy = (char*) malloc(sizeof(char) * (s.length() + 1));
+	// Never hand a NULL destination to strcpy
+	if (copy != NULL) {
+		strcpy(copy, s.c_str());
+		val = copy;
+	}
+}
+
+
+
+
+
+/******************************************************************************
+ * Main
+ ******************************************************************************/
+
+extern bool gDebugSkipLoadingBinary;
+
+/**
+ * Benchmark entry point: parses the command line, sets up an OpenCL GPU
+ * context and command queue, then runs TestSort on unsigned int keys (and
+ * values, with --key-values).
+ *
+ * Returns 0 on success or after printing the usage text, 1 when the context
+ * reports no devices.
+ */
+int main( int argc, char** argv) 
+{
+	b3CommandLineArgs args(argc,argv);
+
+	// Answer --help before touching OpenCL so the usage text is available
+	// even on machines where context creation would fail.
+	if (args.CheckCmdLineFlag("help"))
+	{
+		Usage();
+		return 0;
+	}
+
+	args.GetCmdLineArgument("deviceId", gPreferredDeviceId);
+	args.GetCmdLineArgument("platformId", gPreferredPlatformId);
+
+	printf("Initialize OpenCL using b3OpenCLUtils_createContextFromType\n");
+	cl_int ciErrNum = CL_SUCCESS;
+	cl_platform_id platformId;
+	// Swap CL_DEVICE_TYPE_GPU for CL_DEVICE_TYPE_ALL or CL_DEVICE_TYPE_CPU to benchmark other device classes.
+	g_cxMainContext = b3OpenCLUtils_createContextFromType(CL_DEVICE_TYPE_GPU, &ciErrNum, 0, 0,gPreferredDeviceId,gPreferredPlatformId,&platformId);
+	oclCHECKERROR(ciErrNum, CL_SUCCESS);
+
+	int numDev = b3OpenCLUtils_getNumDevices(g_cxMainContext);
+	if (!numDev)
+	{
+		printf("error: no OpenCL devices\n");
+		return 1;	// report the failure to the caller instead of exiting with 0
+	}
+
+	// Always benchmark the first device of the context
+	int devId = 0;
+	g_device = b3OpenCLUtils_getDevice(g_cxMainContext,devId);
+	b3OpenCLUtils_printDeviceInfo(g_device);
+
+	// create a command-queue
+	g_cqCommandQueue = clCreateCommandQueue(g_cxMainContext, g_device, 0, &ciErrNum);
+	oclCHECKERROR(ciErrNum, CL_SUCCESS);
+
+	srand(0);				// presently deterministic
+
+	// Defaults, overridable with --n and --i
+	unsigned int num_elements 					= 8*1024*1024;
+	unsigned int iterations  					= 10;
+	args.GetCmdLineArgument("i", iterations);
+	args.GetCmdLineArgument("n", num_elements);
+
+	bool keys_only = !args.CheckCmdLineFlag("key-values");
+	g_verbose = args.CheckCmdLineFlag("v");
+
+	TestSort<unsigned int, unsigned int>(
+			iterations,
+			num_elements, 
+			keys_only);
+
+	// The sorter and device arrays live inside TestSort/TimedSort and are
+	// gone by now, so the queue and context can be released.
+	clReleaseCommandQueue(g_cqCommandQueue);
+	clReleaseContext(g_cxMainContext);
+
+	return 0;
+}
--- a/test/OpenCL/RadixSortBenchmark/premake4.lua
+++ b/test/OpenCL/RadixSortBenchmark/premake4.lua
@@ -0,0 +1,40 @@
+-- Declares the console project "Test_OpenCL_RadixSortBenchmark_<vendor>".
+-- Nothing is declared when findOpenCL(vendor) returns a false value.
+function createProject(vendor)
+	-- hasCL is deliberately left as a global assignment, as in the original script.
+	hasCL = findOpenCL(vendor)
+	if (not hasCL) then
+		return
+	end
+
+	project ("Test_OpenCL_RadixSortBenchmark_" .. vendor)
+
+	initOpenCL(vendor)
+
+	language "C++"
+	kind "ConsoleApp"
+	targetdir "../../../bin"
+	includedirs {"..","../../../src"}
+
+--	links {
+--		("OpenCL_lib_parallel_primitives_host_" .. vendor)
+--	}
+
+	-- Benchmark driver plus the library sources it is built against.
+	files {
+		"main.cpp",
+		"../../../src/Bullet3OpenCL/Initialize/b3OpenCLUtils.cpp",
+		"../../../src/Bullet3OpenCL/Initialize/b3OpenCLUtils.h",
+		"../../../src/Bullet3OpenCL/ParallelPrimitives/b3FillCL.cpp",
+		"../../../src/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.cpp",
+		"../../../src/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp",
+		"../../../src/Bullet3Common/b3AlignedAllocator.cpp",
+		"../../../src/Bullet3Common/b3AlignedAllocator.h",
+		"../../../src/Bullet3Common/b3AlignedObjectArray.h",
+		"../../../src/Bullet3Common/b3Quickprof.cpp",
+		"../../../src/Bullet3Common/b3Quickprof.h",
+	}
+end
+
+-- One project per vendor, in the same order as before.
+for _, vendor in ipairs({"AMD", "Intel", "NVIDIA", "Apple"}) do
+	createProject(vendor)
+end