From 92f0938af30c2731ee7245068286f1c12ffeff01 Mon Sep 17 00:00:00 2001 From: erwincoumans Date: Tue, 30 Apr 2013 11:40:09 -0700 Subject: [PATCH] add bitonic sort, as comparison. fix stringify.bat for Windows (need to fix Mac/Linux version too) --- build/premake4.lua | 4 +- build/stringify.bat | 45 +- .../kernels/sapFastKernels.h | 12 +- .../BroadphaseCollision/kernels/sapKernels.h | 32 +- .../kernels/bvhTraversal.h | 44 +- .../kernels/primitiveContacts.h | 48 +- .../kernels/satClipHullContacts.h | 62 +- .../NarrowphaseCollision/kernels/satKernels.h | 64 +- .../RigidBody/kernels/batchingKernelsNew.h | 2 +- .../RigidBody/kernels/integrateKernel.h | 6 +- .../RigidBody/kernels/solveContact.h | 4 +- .../RigidBody/kernels/solveFriction.h | 6 +- .../RigidBody/kernels/solverSetup.h | 6 +- .../RigidBody/kernels/solverUtils.h | 8 +- .../RigidBody/kernels/updateAabbsKernel.h | 8 +- test/OpenCL/BitonicSort/BitonicSort.cl | 171 +++++ test/OpenCL/BitonicSort/b3BitonicSort.cpp | 83 ++ test/OpenCL/BitonicSort/b3BitonicSort.h | 30 + test/OpenCL/BitonicSort/main.cpp | 192 +++++ test/OpenCL/BitonicSort/premake4.lua | 36 + test/OpenCL/ParallelPrimitives/main.cpp | 378 ++++++++++ test/OpenCL/ParallelPrimitives/premake4.lua | 41 + test/OpenCL/RadixSortBenchmark/main.cpp | 712 ++++++++++++++++++ test/OpenCL/RadixSortBenchmark/premake4.lua | 40 + 24 files changed, 1857 insertions(+), 177 deletions(-) create mode 100644 test/OpenCL/BitonicSort/BitonicSort.cl create mode 100644 test/OpenCL/BitonicSort/b3BitonicSort.cpp create mode 100644 test/OpenCL/BitonicSort/b3BitonicSort.h create mode 100644 test/OpenCL/BitonicSort/main.cpp create mode 100644 test/OpenCL/BitonicSort/premake4.lua create mode 100644 test/OpenCL/ParallelPrimitives/main.cpp create mode 100644 test/OpenCL/ParallelPrimitives/premake4.lua create mode 100644 test/OpenCL/RadixSortBenchmark/main.cpp create mode 100644 test/OpenCL/RadixSortBenchmark/premake4.lua diff --git a/build/premake4.lua b/build/premake4.lua index dccf53c15..fe7e6daaf 100644 --- a/build/premake4.lua +++ b/build/premake4.lua @@ -101,7 +101,9 @@ include "../test/OpenCL/BasicInitialize" -- include "../test/OpenCL/BroadphaseCollision" -- include "../test/OpenCL/NarrowphaseCollision" --- include "../test/OpenCL/ParallelPrimitives" + include "../test/OpenCL/ParallelPrimitives" + include "../test/OpenCL/RadixSortBenchmark" + include "../test/OpenCL/BitonicSort" include "../src/Bullet3Dynamics" include "../src/Bullet3Common" diff --git a/build/stringify.bat b/build/stringify.bat index d121c8ff0..b5abdad6c 100644 --- a/build/stringify.bat +++ b/build/stringify.bat @@ -1,35 +1,30 @@ -@echo off +rem @echo off -premake4 --file=stringifyKernel.lua --kernelfile="../opencl/vector_add/VectorAddKernels.cl" --headerfile="../opencl/vector_add/VectorAddKernels.h" --stringname="vectorAddCL" stringify -premake4 --file=stringifyKernel.lua --kernelfile="../opencl/parallel_primitives/kernels/RadixSort32Kernels.cl" --headerfile="../opencl/parallel_primitives/kernels/RadixSort32KernelsCL.h" --stringname="radixSort32KernelsCL" stringify -premake4 --file=stringifyKernel.lua --kernelfile="../opencl/parallel_primitives/kernels/BoundSearchKernels.cl" --headerfile="../opencl/parallel_primitives/kernels/BoundSearchKernelsCL.h" --stringname="boundSearchKernelsCL" stringify -premake4 --file=stringifyKernel.lua --kernelfile="../opencl/parallel_primitives/kernels/PrefixScanKernels.cl" --headerfile="../opencl/parallel_primitives/kernels/PrefixScanKernelsCL.h" --stringname="prefixScanKernelsCL" stringify -premake4 --file=stringifyKernel.lua --kernelfile="../opencl/parallel_primitives/kernels/FillKernels.cl" --headerfile="../opencl/parallel_primitives/kernels/FillKernelsCL.h" --stringname="fillKernelsCL" stringify +premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32Kernels.cl" --headerfile="../src/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32KernelsCL.h" --stringname="radixSort32KernelsCL" stringify +premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernels.cl" --headerfile="../src/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernelsCL.h" --stringname="boundSearchKernelsCL" stringify +premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernels.cl" --headerfile="../src/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsCL.h" --stringname="prefixScanKernelsCL" stringify +premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernels.cl" --headerfile="../src/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernelsCL.h" --stringname="fillKernelsCL" stringify -premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_broadphase/kernels/sap.cl" --headerfile="../opencl/gpu_broadphase/kernels/sapKernels.h" --stringname="sapCL" stringify -premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_broadphase/kernels/sapFast.cl" --headerfile="../opencl/gpu_broadphase/kernels/sapFastKernels.h" --stringname="sapFastCL" stringify +premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl" --headerfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h" --stringname="sapCL" stringify +premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFast.cl" --headerfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFastKernels.h" --stringname="sapFastCL" stringify -premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_narrowphase/kernels/sat.cl" --headerfile="../opencl/gpu_narrowphase/kernels/satKernels.h" --stringname="satKernelsCL" stringify -premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_narrowphase/kernels/satClipHullContacts.cl" --headerfile="../opencl/gpu_narrowphase/kernels/satClipHullContacts.h" --stringname="satClipKernelsCL" stringify -premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_narrowphase/kernels/primitiveContacts.cl" --headerfile="../opencl/gpu_narrowphase/kernels/primitiveContacts.h" --stringname="primitiveContactsKernelsCL" stringify -premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_narrowphase/kernels/bvhTraversal.cl" --headerfile="../opencl/gpu_narrowphase/kernels/bvhTraversal.h" --stringname="bvhTraversalKernelCL" stringify - - -premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_rigidbody/kernels/integrateKernel.cl" --headerfile="../opencl/gpu_rigidbody/kernels/integrateKernel.h" --stringname="integrateKernelCL" stringify -premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_rigidbody/kernels/updateAabbsKernel.cl" --headerfile="../opencl/gpu_rigidbody/kernels/updateAabbsKernel.h" --stringname="updateAabbsKernelCL" stringify -premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_rigidbody/kernels/solverSetup.cl" --headerfile="../opencl/gpu_rigidbody/kernels/solverSetup.h" --stringname="solverSetupCL" stringify -premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_rigidbody/kernels/solverSetup2.cl" --headerfile="../opencl/gpu_rigidbody/kernels/solverSetup2.h" --stringname="solverSetup2CL" stringify -premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_rigidbody/kernels/batchingKernels.cl" --headerfile="../opencl/gpu_rigidbody/kernels/batchingKernels.h" --stringname="batchingKernelsCL" stringify -premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_rigidbody/kernels/batchingKernelsNew.cl" --headerfile="../opencl/gpu_rigidbody/kernels/batchingKernelsNew.h" --stringname="batchingKernelsNewCL" stringify -premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_rigidbody/kernels/solverUtils.cl" --headerfile="../opencl/gpu_rigidbody/kernels/solverUtils.h" --stringname="solverUtilsCL" stringify - - -premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_rigidbody/kernels/solveContact.cl" --headerfile="../opencl/gpu_rigidbody/kernels/solveContact.h" --stringname="solveContactCL" stringify -premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_rigidbody/kernels/solveFriction.cl" --headerfile="../opencl/gpu_rigidbody/kernels/solveFriction.h" --stringname="solveFrictionCL" stringify +premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/NarrowphaseCollision/kernels/sat.cl" --headerfile="../src/Bullet3OpenCL/NarrowphaseCollision/kernels/satKernels.h" --stringname="satKernelsCL" stringify +premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/NarrowphaseCollision/kernels/satClipHullContacts.cl" --headerfile="../src/Bullet3OpenCL/NarrowphaseCollision/kernels/satClipHullContacts.h" --stringname="satClipKernelsCL" stringify +premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/NarrowphaseCollision/kernels/primitiveContacts.cl" --headerfile="../src/Bullet3OpenCL/NarrowphaseCollision/kernels/primitiveContacts.h" --stringname="primitiveContactsKernelsCL" stringify +premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/NarrowphaseCollision/kernels/bvhTraversal.cl" --headerfile="../src/Bullet3OpenCL/NarrowphaseCollision/kernels/bvhTraversal.h" --stringname="bvhTraversalKernelCL" stringify +premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/RigidBody//kernels/integrateKernel.cl" --headerfile="../src/Bullet3OpenCL/RigidBody//kernels/integrateKernel.h" --stringname="integrateKernelCL" stringify +premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/RigidBody//kernels/updateAabbsKernel.cl" --headerfile="../src/Bullet3OpenCL/RigidBody//kernels/updateAabbsKernel.h" --stringname="updateAabbsKernelCL" stringify +premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/RigidBody//kernels/solverSetup.cl" --headerfile="../src/Bullet3OpenCL/RigidBody//kernels/solverSetup.h" --stringname="solverSetupCL" stringify +premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/RigidBody//kernels/solverSetup2.cl" --headerfile="../src/Bullet3OpenCL/RigidBody//kernels/solverSetup2.h" --stringname="solverSetup2CL" stringify +premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/RigidBody//kernels/batchingKernels.cl" --headerfile="../src/Bullet3OpenCL/RigidBody//kernels/batchingKernels.h" --stringname="batchingKernelsCL" stringify +premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/RigidBody//kernels/batchingKernelsNew.cl" --headerfile="../src/Bullet3OpenCL/RigidBody//kernels/batchingKernelsNew.h" --stringname="batchingKernelsNewCL" stringify +premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/RigidBody//kernels/solverUtils.cl" --headerfile="../src/Bullet3OpenCL/RigidBody//kernels/solverUtils.h" --stringname="solverUtilsCL" stringify +premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/RigidBody//kernels/solveContact.cl" --headerfile="../src/Bullet3OpenCL/RigidBody//kernels/solveContact.h" --stringname="solveContactCL" stringify +premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/RigidBody//kernels/solveFriction.cl" --headerfile="../src/Bullet3OpenCL/RigidBody//kernels/solveFriction.h" --stringname="solveFrictionCL" stringify pause \ No newline at end of file diff --git a/src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFastKernels.h b/src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFastKernels.h index 758ae91cc..b9986a74c 100644 --- a/src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFastKernels.h +++ b/src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFastKernels.h @@ -30,12 +30,12 @@ static const char* sapFastCL= \ " float m_maxElems[4];\n" " int m_maxIndices[4];\n" " };\n" -"} b3AabbCL;\n" +"} btAabbCL;\n" "\n" "\n" "/// conservative test for overlap between two aabbs\n" -"bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, __local const b3AabbCL* aabb2);\n" -"bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, __local const b3AabbCL* aabb2)\n" +"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);\n" +"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)\n" "{\n" "//skip pairs between static (mass=0) objects\n" " if ((aabb1->m_maxIndices[3]==0) && (aabb2->m_maxIndices[3] == 0))\n" @@ -50,18 +50,18 @@ static const char* sapFastCL= \ "\n" "\n" "//computePairsKernelBatchWrite\n" -"__kernel void computePairsKernel( __global const b3AabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n" +"__kernel void computePairsKernel( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n" "{\n" " int i = get_global_id(0);\n" " int localId = get_local_id(0);\n" "\n" " __local int numActiveWgItems[1];\n" " __local int breakRequest[1];\n" -" __local b3AabbCL localAabbs[128];// = aabbs[i];\n" +" __local btAabbCL localAabbs[128];// = aabbs[i];\n" " \n" " int2 myPairs[64];\n" " \n" -" b3AabbCL myAabb;\n" +" btAabbCL myAabb;\n" " \n" " myAabb = (im_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n" @@ -43,8 +43,8 @@ static const char* sapCL= \ " overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n" " return overlap;\n" "}\n" -"bool TestAabbAgainstAabb2GlobalGlobal(__global const b3AabbCL* aabb1, __global const b3AabbCL* aabb2);\n" -"bool TestAabbAgainstAabb2GlobalGlobal(__global const b3AabbCL* aabb1, __global const b3AabbCL* aabb2)\n" +"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n" +"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n" "{\n" " bool overlap = true;\n" " overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n" @@ -53,8 +53,8 @@ static const char* sapCL= \ " return overlap;\n" "}\n" "\n" -"bool TestAabbAgainstAabb2Global(const b3AabbCL* aabb1, __global const b3AabbCL* aabb2);\n" -"bool TestAabbAgainstAabb2Global(const b3AabbCL* aabb1, __global const b3AabbCL* aabb2)\n" +"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n" +"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n" "{\n" " bool overlap = true;\n" " overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n" @@ -64,7 +64,7 @@ static const char* sapCL= \ "}\n" "\n" "\n" -"__kernel void computePairsKernelTwoArrays( __global const b3AabbCL* unsortedAabbs, __global const b3AabbCL* sortedAabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numUnsortedAabbs, int numSortedAabbs, int axis, int maxPairs)\n" +"__kernel void computePairsKernelTwoArrays( __global const btAabbCL* unsortedAabbs, __global const btAabbCL* sortedAabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numUnsortedAabbs, int numSortedAabbs, int axis, int maxPairs)\n" "{\n" " int i = get_global_id(0);\n" " if (i>=numUnsortedAabbs)\n" @@ -89,7 +89,7 @@ static const char* sapCL= \ " }\n" "}\n" "\n" -"__kernel void computePairsKernelOriginal( __global const b3AabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n" +"__kernel void computePairsKernelOriginal( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n" "{\n" " int i = get_global_id(0);\n" " if (i>=numObjects)\n" @@ -117,7 +117,7 @@ static const char* sapCL= \ "\n" "\n" "\n" -"__kernel void computePairsKernelBarrier( __global const b3AabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n" +"__kernel void computePairsKernelBarrier( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n" "{\n" " int i = get_global_id(0);\n" " int localId = get_local_id(0);\n" @@ -181,16 +181,16 @@ static const char* sapCL= \ "}\n" "\n" "\n" -"__kernel void computePairsKernelLocalSharedMemory( __global const b3AabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n" +"__kernel void computePairsKernelLocalSharedMemory( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n" "{\n" " int i = get_global_id(0);\n" " int localId = get_local_id(0);\n" "\n" " __local int numActiveWgItems[1];\n" " __local int breakRequest[1];\n" -" __local b3AabbCL localAabbs[128];// = aabbs[i];\n" +" __local btAabbCL localAabbs[128];// = aabbs[i];\n" " \n" -" b3AabbCL myAabb;\n" +" btAabbCL myAabb;\n" " \n" " myAabb = (i=numObjects)\n" @@ -300,7 +300,7 @@ static const char* sapCL= \ "}\n" "\n" "\n" -"__kernel void flipFloatKernel( __global const b3AabbCL* aabbs, volatile __global int2* sortData, int numObjects, int axis)\n" +"__kernel void flipFloatKernel( __global const btAabbCL* aabbs, volatile __global int2* sortData, int numObjects, int axis)\n" "{\n" " int i = get_global_id(0);\n" " if (i>=numObjects)\n" @@ -312,7 +312,7 @@ static const char* sapCL= \ "}\n" "\n" "\n" -"__kernel void scatterKernel( __global const b3AabbCL* aabbs, volatile __global const int2* sortData, __global b3AabbCL* sortedAabbs, int numObjects)\n" +"__kernel void scatterKernel( __global const btAabbCL* aabbs, volatile __global const int2* sortData, __global btAabbCL* sortedAabbs, int numObjects)\n" "{\n" " int i = get_global_id(0);\n" " if (i>=numObjects)\n" diff --git a/src/Bullet3OpenCL/NarrowphaseCollision/kernels/bvhTraversal.h b/src/Bullet3OpenCL/NarrowphaseCollision/kernels/bvhTraversal.h index 7d6ba62b7..6640917cb 100644 --- a/src/Bullet3OpenCL/NarrowphaseCollision/kernels/bvhTraversal.h +++ b/src/Bullet3OpenCL/NarrowphaseCollision/kernels/bvhTraversal.h @@ -1,6 +1,6 @@ //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project static const char* bvhTraversalKernelCL= \ -"//keep this enum in sync with the CPU version (in b3Collidable.h)\n" +"//keep this enum in sync with the CPU version (in btCollidable.h)\n" "//written by Erwin Coumans\n" "\n" "#define SHAPE_CONVEX_HULL 3\n" @@ -13,7 +13,7 @@ static const char* bvhTraversalKernelCL= \ "\n" "#define MAX_NUM_PARTS_IN_BITS 10\n" "\n" -"///b3QuantizedBvhNode is a compressed aabb node, 16 bytes.\n" +"///btQuantizedBvhNode is a compressed aabb node, 16 bytes.\n" "///Node can be used for leafnode or internal node. Leafnodes can point to 32-bit triangle index (non-negative range).\n" "typedef struct\n" "{\n" @@ -22,7 +22,7 @@ static const char* bvhTraversalKernelCL= \ " unsigned short int m_quantizedAabbMax[3];\n" " //4 bytes\n" " int m_escapeIndexOrTriangleIndex;\n" -"} b3QuantizedBvhNode;\n" +"} btQuantizedBvhNode;\n" "\n" "typedef struct\n" "{\n" @@ -44,12 +44,12 @@ static const char* bvhTraversalKernelCL= \ " }\n" " int getEscapeIndex() const\n" " {\n" -" b3Assert(!isLeafNode());\n" +" btAssert(!isLeafNode());\n" " return -m_escapeIndexOrTriangleIndex;\n" " }\n" " int getTriangleIndex() const\n" " {\n" -" b3Assert(isLeafNode());\n" +" btAssert(isLeafNode());\n" " unsigned int x=0;\n" " unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS);\n" " // Get only the lower bits where the triangle index is stored\n" @@ -57,13 +57,13 @@ static const char* bvhTraversalKernelCL= \ " }\n" " int getPartId() const\n" " {\n" -" b3Assert(isLeafNode());\n" +" btAssert(isLeafNode());\n" " // Get only the highest bits where the part index is stored\n" " return (m_escapeIndexOrTriangleIndex>>(31-MAX_NUM_PARTS_IN_BITS));\n" " }\n" "*/\n" "\n" -"int getTriangleIndex(const b3QuantizedBvhNode* rootNode)\n" +"int getTriangleIndex(const btQuantizedBvhNode* rootNode)\n" "{\n" " unsigned int x=0;\n" " unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS);\n" @@ -71,13 +71,13 @@ static const char* bvhTraversalKernelCL= \ " return (rootNode->m_escapeIndexOrTriangleIndex&~(y));\n" "}\n" "\n" -"int isLeaf(const b3QuantizedBvhNode* rootNode)\n" +"int isLeaf(const btQuantizedBvhNode* rootNode)\n" "{\n" " //skipindex is negative (internal node), triangleindex >=0 (leafnode)\n" " return (rootNode->m_escapeIndexOrTriangleIndex >= 0)? 1 : 0;\n" "}\n" " \n" -"int getEscapeIndex(const b3QuantizedBvhNode* rootNode)\n" +"int getEscapeIndex(const btQuantizedBvhNode* rootNode)\n" "{\n" " return -rootNode->m_escapeIndexOrTriangleIndex;\n" "}\n" @@ -92,9 +92,9 @@ static const char* bvhTraversalKernelCL= \ " //4 bytes\n" " int m_subtreeSize;\n" " int m_padding[3];\n" -"} b3BvhSubtreeInfo;\n" +"} btBvhSubtreeInfo;\n" "\n" -"///keep this in sync with b3Collidable.h\n" +"///keep this in sync with btCollidable.h\n" "typedef struct\n" "{\n" " int m_numChildShapes;\n" @@ -102,7 +102,7 @@ static const char* bvhTraversalKernelCL= \ " int m_shapeType;\n" " int m_shapeIndex;\n" " \n" -"} b3CollidableGpu;\n" +"} btCollidableGpu;\n" "\n" "typedef struct\n" "{\n" @@ -112,7 +112,7 @@ static const char* bvhTraversalKernelCL= \ " int m_unused0;\n" " int m_unused1;\n" " int m_unused2;\n" -"} b3GpuChildShape;\n" +"} btGpuChildShape;\n" "\n" "\n" "typedef struct\n" @@ -142,7 +142,7 @@ static const char* bvhTraversalKernelCL= \ " float m_maxElems[4];\n" " int m_maxIndices[4];\n" " };\n" -"} b3AabbCL;\n" +"} btAabbCL;\n" "\n" "\n" "int testQuantizedAabbAgainstQuantizedAabb(\n" @@ -196,12 +196,12 @@ static const char* bvhTraversalKernelCL= \ "// work-in-progress\n" "__kernel void bvhTraversalKernel( __global const int2* pairs, \n" " __global const BodyData* rigidBodies, \n" -" __global const b3CollidableGpu* collidables,\n" -" __global b3AabbCL* aabbs,\n" +" __global const btCollidableGpu* collidables,\n" +" __global btAabbCL* aabbs,\n" " __global int4* concavePairsOut,\n" " __global volatile int* numConcavePairsOut,\n" -" __global const b3BvhSubtreeInfo* subtreeHeadersRoot,\n" -" __global const b3QuantizedBvhNode* quantizedNodesRoot,\n" +" __global const btBvhSubtreeInfo* subtreeHeadersRoot,\n" +" __global const btQuantizedBvhNode* quantizedNodesRoot,\n" " __global const b3BvhInfo* bvhInfos,\n" " int numPairs,\n" " int maxNumConcavePairsCapacity)\n" @@ -238,8 +238,8 @@ static const char* bvhTraversalKernelCL= \ " float4 bvhAabbMax = bvhInfo.m_aabbMax;\n" " float4 bvhQuantization = bvhInfo.m_quantization;\n" " int numSubtreeHeaders = bvhInfo.m_numSubTrees;\n" -" __global const b3BvhSubtreeInfo* subtreeHeaders = &subtreeHeadersRoot[bvhInfo.m_subTreeOffset];\n" -" __global const b3QuantizedBvhNode* quantizedNodes = &quantizedNodesRoot[bvhInfo.m_nodeOffset];\n" +" __global const btBvhSubtreeInfo* subtreeHeaders = &subtreeHeadersRoot[bvhInfo.m_subTreeOffset];\n" +" __global const btQuantizedBvhNode* quantizedNodes = &quantizedNodesRoot[bvhInfo.m_nodeOffset];\n" " \n" "\n" " unsigned short int quantizedQueryAabbMin[3];\n" @@ -249,7 +249,7 @@ static const char* bvhTraversalKernelCL= \ " \n" " for (int i=0;im_faceOffset+closestFaceA];\n" +" btGpuFace polyA = faces[hullA->m_faceOffset+closestFaceA];\n" "\n" " // clip polygon to back of planes of all faces of hull A that are adjacent to witness face\n" " int numVerticesA = polyA.m_numIndices;\n" @@ -416,7 +416,7 @@ static const char* satClipKernelsCL= \ " //clipFace(*pVtxIn, *pVtxOut,planeNormalWS,planeEqWS);\n" " numVertsOut = clipFace(pVtxIn, numVertsIn, planeNormalWS,planeEqWS, pVtxOut);\n" "\n" -" //b3Swap(pVtxIn,pVtxOut);\n" +" //btSwap(pVtxIn,pVtxOut);\n" " float4* tmp = pVtxOut;\n" " pVtxOut = pVtxIn;\n" " pVtxIn = tmp;\n" @@ -458,10 +458,10 @@ static const char* satClipKernelsCL= \ " float4* worldVertsB2, int capacityWorldVertsB2,\n" " const float minDist, float maxDist,\n" " const float4* verticesA,\n" -" const b3GpuFace* facesA,\n" +" const btGpuFace* facesA,\n" " const int* indicesA,\n" " __global const float4* verticesB,\n" -" __global const b3GpuFace* facesB,\n" +" __global const btGpuFace* facesB,\n" " __global const int* indicesB,\n" " float4* contactsOut,\n" " int contactCapacity)\n" @@ -496,7 +496,7 @@ static const char* satClipKernelsCL= \ " if (closestFaceA<0)\n" " return numContactsOut;\n" "\n" -" b3GpuFace polyA = facesA[hullA->m_faceOffset+closestFaceA];\n" +" btGpuFace polyA = facesA[hullA->m_faceOffset+closestFaceA];\n" "\n" " // clip polygon to back of planes of all faces of hull A that are adjacent to witness face\n" " int numVerticesA = polyA.m_numIndices;\n" @@ -520,7 +520,7 @@ static const char* satClipKernelsCL= \ " //clipFace(*pVtxIn, *pVtxOut,planeNormalWS,planeEqWS);\n" " numVertsOut = clipFace(pVtxIn, numVertsIn, planeNormalWS,planeEqWS, pVtxOut);\n" "\n" -" //b3Swap(pVtxIn,pVtxOut);\n" +" //btSwap(pVtxIn,pVtxOut);\n" " float4* tmp = pVtxOut;\n" " pVtxOut = pVtxIn;\n" " pVtxIn = tmp;\n" @@ -561,7 +561,7 @@ static const char* satClipKernelsCL= \ " float4* worldVertsB1, float4* worldVertsB2, int capacityWorldVerts,\n" " const float minDist, float maxDist,\n" " __global const float4* vertices,\n" -" __global const b3GpuFace* faces,\n" +" __global const btGpuFace* faces,\n" " __global const int* indices,\n" " float4* localContactsOut,\n" " int localContactCapacity)\n" @@ -589,7 +589,7 @@ static const char* satClipKernelsCL= \ " }\n" "\n" " {\n" -" const b3GpuFace polyB = faces[hullB->m_faceOffset+closestFaceB];\n" +" const btGpuFace polyB = faces[hullB->m_faceOffset+closestFaceB];\n" " const int numVertices = polyB.m_numIndices;\n" " for(int e0=0;e0m_faceOffset+closestFaceB];\n" +" const btGpuFace polyB = facesB[hullB->m_faceOffset+closestFaceB];\n" " const int numVertices = polyB.m_numIndices;\n" " for(int e0=0;e0m_faceOffset+closestFaceB];\n" +" const btGpuFace polyB = faces[hullB->m_faceOffset+closestFaceB];\n" " const int numVertices = polyB.m_numIndices;\n" " for(int e0=0;e0 B3_GPU_ANGULAR_MOTION_THRESHOLD)\n" +" if(fAngle*timeStep > BT_GPU_ANGULAR_MOTION_THRESHOLD)\n" " {\n" -" fAngle = B3_GPU_ANGULAR_MOTION_THRESHOLD / timeStep;\n" +" fAngle = BT_GPU_ANGULAR_MOTION_THRESHOLD / timeStep;\n" " }\n" " if(fAngle < 0.001f)\n" " {\n" diff --git a/src/Bullet3OpenCL/RigidBody/kernels/solveContact.h b/src/Bullet3OpenCL/RigidBody/kernels/solveContact.h index ccb4daf32..b758f43d8 100644 --- a/src/Bullet3OpenCL/RigidBody/kernels/solveContact.h +++ b/src/Bullet3OpenCL/RigidBody/kernels/solveContact.h @@ -313,8 +313,8 @@ static const char* solveContactCL= \ " }\n" "}\n" "\n" -"void b3PlaneSpace1 (const float4* n, float4* p, float4* q);\n" -" void b3PlaneSpace1 (const float4* n, float4* p, float4* q)\n" +"void btPlaneSpace1 (const float4* n, float4* p, float4* q);\n" +" void btPlaneSpace1 (const float4* n, float4* p, float4* q)\n" "{\n" " if (fabs(n[0].z) > 0.70710678f) {\n" " // choose p in y-z plane\n" diff --git a/src/Bullet3OpenCL/RigidBody/kernels/solveFriction.h b/src/Bullet3OpenCL/RigidBody/kernels/solveFriction.h index b890bb85c..9d6de6ccc 100644 --- a/src/Bullet3OpenCL/RigidBody/kernels/solveFriction.h +++ b/src/Bullet3OpenCL/RigidBody/kernels/solveFriction.h @@ -265,8 +265,8 @@ static const char* solveFrictionCL= \ " float jmj3 = dot3F4(mtMul3(angular1,*invInertia1), angular1);\n" " return -1.f/(jmj0+jmj1+jmj2+jmj3);\n" "}\n" -"void b3PlaneSpace1 (const float4* n, float4* p, float4* q);\n" -" void b3PlaneSpace1 (const float4* n, float4* p, float4* q)\n" +"void btPlaneSpace1 (const float4* n, float4* p, float4* q);\n" +" void btPlaneSpace1 (const float4* n, float4* p, float4* q)\n" "{\n" " if (fabs(n[0].z) > 0.70710678f) {\n" " // choose p in y-z plane\n" @@ -347,7 +347,7 @@ static const char* solveFrictionCL= \ " float4 n = -cs->m_linear;\n" " \n" " float4 tangent[2];\n" -" b3PlaneSpace1(&n,&tangent[0],&tangent[1]);\n" +" btPlaneSpace1(&n,&tangent[0],&tangent[1]);\n" " float4 angular0, angular1, linear;\n" " float4 r0 = center - posA;\n" " float4 r1 = center - posB;\n" diff --git a/src/Bullet3OpenCL/RigidBody/kernels/solverSetup.h b/src/Bullet3OpenCL/RigidBody/kernels/solverSetup.h index ad6ba0e58..83371897b 100644 --- a/src/Bullet3OpenCL/RigidBody/kernels/solverSetup.h +++ b/src/Bullet3OpenCL/RigidBody/kernels/solverSetup.h @@ -489,8 +489,8 @@ static const char* solverSetupCL= \ "} ConstBufferSSD;\n" "\n" "\n" -"void b3PlaneSpace1 (float4 n, float4* p, float4* q);\n" -" void b3PlaneSpace1 (float4 n, float4* p, float4* q)\n" +"void btPlaneSpace1 (float4 n, float4* p, float4* q);\n" +" void btPlaneSpace1 (float4 n, float4* p, float4* q)\n" "{\n" " if (fabs(n.z) > 0.70710678f) {\n" " // choose p in y-z plane\n" @@ -577,7 +577,7 @@ static const char* solverSetupCL= \ " center /= (float)src->m_worldNormal.w;\n" "\n" " float4 tangent[2];\n" -" b3PlaneSpace1(src->m_worldNormal,&tangent[0],&tangent[1]);\n" +" btPlaneSpace1(src->m_worldNormal,&tangent[0],&tangent[1]);\n" " \n" " float4 r[2];\n" " r[0] = center - posA;\n" diff --git a/src/Bullet3OpenCL/RigidBody/kernels/solverUtils.h b/src/Bullet3OpenCL/RigidBody/kernels/solverUtils.h index 7b85f6c56..91726f36e 100644 --- a/src/Bullet3OpenCL/RigidBody/kernels/solverUtils.h +++ b/src/Bullet3OpenCL/RigidBody/kernels/solverUtils.h @@ -488,8 +488,8 @@ static const char* solverUtilsCL= \ "}\n" "\n" "\n" -"void b3PlaneSpace1 (float4 n, float4* p, float4* q);\n" -" void b3PlaneSpace1 (float4 n, float4* p, float4* q)\n" +"void btPlaneSpace1 (float4 n, float4* p, float4* q);\n" +" void btPlaneSpace1 (float4 n, float4* p, float4* q)\n" "{\n" " if (fabs(n.z) > 0.70710678f) {\n" " // choose p in y-z plane\n" @@ -739,7 +739,7 @@ static const char* solverUtilsCL= \ " float4 n = -cs->m_linear;\n" " \n" " float4 tangent[2];\n" -" b3PlaneSpace1(n,&tangent[0],&tangent[1]);\n" +" btPlaneSpace1(n,&tangent[0],&tangent[1]);\n" " float4 angular0, angular1, linear;\n" " float4 r0 = center - posA;\n" " float4 r1 = center - posB;\n" @@ -896,7 +896,7 @@ static const char* solverUtilsCL= \ " center /= (float)src->m_worldNormal.w;\n" "\n" " float4 tangent[2];\n" -" b3PlaneSpace1(src->m_worldNormal,&tangent[0],&tangent[1]);\n" +" btPlaneSpace1(src->m_worldNormal,&tangent[0],&tangent[1]);\n" " \n" " float4 r[2];\n" " r[0] = center - posA;\n" diff --git a/src/Bullet3OpenCL/RigidBody/kernels/updateAabbsKernel.h b/src/Bullet3OpenCL/RigidBody/kernels/updateAabbsKernel.h index 8b7f7f42b..43db8bb56 100644 --- a/src/Bullet3OpenCL/RigidBody/kernels/updateAabbsKernel.h +++ b/src/Bullet3OpenCL/RigidBody/kernels/updateAabbsKernel.h @@ -120,7 +120,7 @@ static const char* updateAabbsKernelCL= \ " float fy;\n" " float fz;\n" " int uw;\n" -"} b3AABBCL;\n" +"} btAABBCL;\n" "\n" "__inline\n" "Matrix3x3 mtTranspose(Matrix3x3 m)\n" @@ -156,7 +156,7 @@ static const char* updateAabbsKernelCL= \ "}\n" "\n" "\n" -"__kernel void initializeGpuAabbsFull( const int numNodes, __global Body* gBodies,__global Collidable* collidables, __global b3AABBCL* plocalShapeAABB, __global b3AABBCL* pAABB)\n" +"__kernel void initializeGpuAabbsFull( const int numNodes, __global Body* gBodies,__global Collidable* collidables, __global btAABBCL* plocalShapeAABB, __global btAABBCL* pAABB)\n" "{\n" " int nodeID = get_global_id(0);\n" " \n" @@ -171,8 +171,8 @@ static const char* updateAabbsKernelCL= \ " \n" " if (shapeIndex>=0)\n" " {\n" -" b3AABBCL minAabb = plocalShapeAABB[collidableIndex*2];\n" -" b3AABBCL maxAabb = plocalShapeAABB[collidableIndex*2+1];\n" +" btAABBCL minAabb = plocalShapeAABB[collidableIndex*2];\n" +" btAABBCL maxAabb = plocalShapeAABB[collidableIndex*2+1];\n" " \n" " float4 halfExtents = ((float4)(maxAabb.fx - minAabb.fx,maxAabb.fy - minAabb.fy,maxAabb.fz - minAabb.fz,0.f))*0.5f;\n" " float4 localCenter = ((float4)(maxAabb.fx + minAabb.fx,maxAabb.fy + minAabb.fy,maxAabb.fz + minAabb.fz,0.f))*0.5f;\n" diff --git a/test/OpenCL/BitonicSort/BitonicSort.cl b/test/OpenCL/BitonicSort/BitonicSort.cl new file mode 100644 index 000000000..dce9a5435 --- /dev/null +++ b/test/OpenCL/BitonicSort/BitonicSort.cl @@ -0,0 +1,171 @@ +MSTRINGIFY( +/* + * Copyright 1993-2009 NVIDIA Corporation. All rights reserved. + * + * NVIDIA Corporation and its licensors retain all intellectual property and + * proprietary rights in and to this software and related documentation. + * Any use, reproduction, disclosure, or distribution of this software + * and related documentation without an express license agreement from + * NVIDIA Corporation is strictly prohibited. + * + * Please refer to the applicable NVIDIA end user license agreement (EULA) + * associated with this source code for terms and conditions that govern + * your use of this NVIDIA software. + * + */ + + + +inline void ComparatorPrivate(int2* keyA, int2* keyB, uint dir) +{ + if((keyA[0].x > keyB[0].x) == dir) + { + int2 tmp = *keyA; + *keyA = *keyB; + *keyB = tmp; + } +} + +inline void ComparatorLocal(__local int2* keyA, __local int2* keyB, uint dir) +{ + if((keyA[0].x > keyB[0].x) == dir) + { + int2 tmp = *keyA; + *keyA = *keyB; + *keyB = tmp; + } +} + +//////////////////////////////////////////////////////////////////////////////// +// Monolithic bitonic sort kernel for short arrays fitting into local memory +//////////////////////////////////////////////////////////////////////////////// +__kernel void kBitonicSortCellIdLocal(__global int2* pKey, uint arrayLength, uint dir GUID_ARG) +{ + __local int2 l_key[1024U]; + int localSizeLimit = get_local_size(0) * 2; + + //Offset to the beginning of subbatch and load data + pKey += get_group_id(0) * localSizeLimit + get_local_id(0); + l_key[get_local_id(0) + 0] = pKey[ 0]; + l_key[get_local_id(0) + (localSizeLimit / 2)] = pKey[(localSizeLimit / 2)]; + + for(uint size = 2; size < arrayLength; size <<= 1) + { + //Bitonic merge + uint ddd = dir ^ ( (get_local_id(0) & (size / 2)) != 0 ); + for(uint stride = size / 2; stride > 0; stride >>= 1) + { + barrier(CLK_LOCAL_MEM_FENCE); + uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1)); + ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], ddd); + } + } + + //ddd == dir for the last bitonic merge step + { + for(uint stride = arrayLength / 2; stride > 0; stride >>= 1) + { + barrier(CLK_LOCAL_MEM_FENCE); + uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1)); + ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], dir); + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + pKey[ 0] = l_key[get_local_id(0) + 0]; + pKey[(localSizeLimit / 2)] = l_key[get_local_id(0) + (localSizeLimit / 2)]; +} + +//////////////////////////////////////////////////////////////////////////////// +// Bitonic sort kernel for large arrays (not fitting into local memory) +//////////////////////////////////////////////////////////////////////////////// +//Bottom-level bitonic sort +//Almost the same as bitonicSortLocal with the only exception +//of even / odd subarrays (of LOCAL_SIZE_LIMIT points) being +//sorted in opposite directions +__kernel void kBitonicSortCellIdLocal1(__global int2* pKey GUID_ARG) +{ + __local int2 l_key[1024U]; + uint localSizeLimit = get_local_size(0) * 2; + + //Offset to the beginning of subarray and load data + pKey += get_group_id(0) * localSizeLimit + get_local_id(0); + l_key[get_local_id(0) + 0] = pKey[ 0]; + l_key[get_local_id(0) + (localSizeLimit / 2)] = pKey[(localSizeLimit / 2)]; + + uint comparatorI = get_global_id(0) & ((localSizeLimit / 2) - 1); + + for(uint size = 2; size < localSizeLimit; size <<= 1) + { + //Bitonic merge + uint ddd = (comparatorI & (size / 2)) != 0; + for(uint stride = size / 2; stride > 0; stride >>= 1) + { + barrier(CLK_LOCAL_MEM_FENCE); + uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1)); + ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], ddd); + } + } + + //Odd / even arrays of localSizeLimit elements + //sorted in opposite directions + { + uint ddd = (get_group_id(0) & 1); + for(uint stride = localSizeLimit / 2; stride > 0; stride >>= 1) + { + barrier(CLK_LOCAL_MEM_FENCE); + uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1)); + ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], ddd); + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + pKey[ 0] = l_key[get_local_id(0) + 0]; + pKey[(localSizeLimit / 2)] = l_key[get_local_id(0) + (localSizeLimit / 2)]; +} + +//Bitonic merge iteration for 'stride' >= LOCAL_SIZE_LIMIT +__kernel void kBitonicSortCellIdMergeGlobal(__global int2* pKey, uint arrayLength, uint size, uint stride, uint dir GUID_ARG) +{ + uint global_comparatorI = get_global_id(0); + uint comparatorI = global_comparatorI & (arrayLength / 2 - 1); + + //Bitonic merge + uint ddd = dir ^ ( (comparatorI & (size / 2)) != 0 ); + uint pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1)); + + int2 keyA = pKey[pos + 0]; + int2 keyB = pKey[pos + stride]; + + ComparatorPrivate(&keyA, &keyB, ddd); + + pKey[pos + 0] = keyA; + pKey[pos + stride] = keyB; +} + +//Combined bitonic merge steps for +//'size' > LOCAL_SIZE_LIMIT and 'stride' = [1 .. LOCAL_SIZE_LIMIT / 2] +__kernel void kBitonicSortCellIdMergeLocal(__global int2* pKey, uint arrayLength, uint stride, uint size, uint dir GUID_ARG) +{ + __local int2 l_key[1024U]; + int localSizeLimit = get_local_size(0) * 2; + + pKey += get_group_id(0) * localSizeLimit + get_local_id(0); + l_key[get_local_id(0) + 0] = pKey[ 0]; + l_key[get_local_id(0) + (localSizeLimit / 2)] = pKey[(localSizeLimit / 2)]; + + //Bitonic merge + uint comparatorI = get_global_id(0) & ((arrayLength / 2) - 1); + uint ddd = dir ^ ( (comparatorI & (size / 2)) != 0 ); + for(; stride > 0; stride >>= 1) + { + barrier(CLK_LOCAL_MEM_FENCE); + uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1)); + ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], ddd); + } + + barrier(CLK_LOCAL_MEM_FENCE); + pKey[ 0] = l_key[get_local_id(0) + 0]; + pKey[(localSizeLimit / 2)] = l_key[get_local_id(0) + (localSizeLimit / 2)]; +} +); \ No newline at end of file diff --git a/test/OpenCL/BitonicSort/b3BitonicSort.cpp b/test/OpenCL/BitonicSort/b3BitonicSort.cpp new file mode 100644 index 000000000..83e2cf0ff --- /dev/null +++ b/test/OpenCL/BitonicSort/b3BitonicSort.cpp @@ -0,0 +1,83 @@ + +#include "b3BitonicSort.h" +#include "Bullet3Common/b3Scalar.h" + + +//Note: logically shared with BitonicSort OpenCL code! +// TODO : get parameter from OpenCL and pass it to kernel (needed for platforms other than NVIDIA) + +void bitonicSortNv(cl_mem pKey, int arrayLength, b3BitonicSortInfo& info) +{ + + if(arrayLength < 2) + return; + //Only power-of-two array lengths are supported so far + info.dir = (info.dir != 0); + cl_int ciErrNum; + size_t localWorkSize, globalWorkSize; + if(arrayLength <= info.localSizeLimit) + { + b3Assert( ( arrayLength) % info.localSizeLimit == 0); + //Launch bitonicSortLocal + ciErrNum = clSetKernelArg(info.bitonicSortLocal, 0, sizeof(cl_mem), (void *)&pKey); + ciErrNum |= clSetKernelArg(info.bitonicSortLocal, 1, sizeof(cl_uint), (void *)&arrayLength); + ciErrNum |= clSetKernelArg(info.bitonicSortLocal, 2, sizeof(cl_uint), (void *)&info.dir); + oclCHECKERROR(ciErrNum, CL_SUCCESS); + + localWorkSize = info.localSizeLimit / 2; + globalWorkSize = arrayLength / 2; + ciErrNum = clEnqueueNDRangeKernel(info.m_cqCommandQue, info.bitonicSortLocal, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL); + oclCHECKERROR(ciErrNum, CL_SUCCESS); + } + else + { + //Launch bitonicSortLocal1 + ciErrNum = clSetKernelArg(info.bitonicSortLocal1, 0, sizeof(cl_mem), (void *)&pKey); + oclCHECKERROR(ciErrNum, CL_SUCCESS); + + localWorkSize = info.localSizeLimit / 2; + globalWorkSize = arrayLength / 2; + ciErrNum = clEnqueueNDRangeKernel(info.m_cqCommandQue, info.bitonicSortLocal1, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL); + oclCHECKERROR(ciErrNum, CL_SUCCESS); + + for(unsigned int size = 2 * info.localSizeLimit; size <= arrayLength; size <<= 1) + { + for(unsigned stride = size / 2; stride > 0; stride >>= 1) + { + if(stride >= info.localSizeLimit) + { + //Launch bitonicMergeGlobal + ciErrNum = clSetKernelArg(info.bitonicSortMergeGlobal, 0, sizeof(cl_mem), (void *)&pKey); + ciErrNum |= clSetKernelArg(info.bitonicSortMergeGlobal, 1, sizeof(cl_uint), (void *)&arrayLength); + ciErrNum |= clSetKernelArg(info.bitonicSortMergeGlobal, 2, sizeof(cl_uint), (void *)&size); + ciErrNum |= clSetKernelArg(info.bitonicSortMergeGlobal, 3, sizeof(cl_uint), (void *)&stride); + ciErrNum |= clSetKernelArg(info.bitonicSortMergeGlobal, 4, sizeof(cl_uint), (void *)&info.dir); + oclCHECKERROR(ciErrNum, CL_SUCCESS); + + localWorkSize = info.localSizeLimit / 4; + globalWorkSize = arrayLength / 2; + + ciErrNum = clEnqueueNDRangeKernel(info.m_cqCommandQue, info.bitonicSortMergeGlobal, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL); + oclCHECKERROR(ciErrNum, CL_SUCCESS); + } + else + { + //Launch bitonicMergeLocal + ciErrNum = clSetKernelArg(info.bitonicSortMergeLocal, 0, sizeof(cl_mem), (void *)&pKey); + ciErrNum |= clSetKernelArg(info.bitonicSortMergeLocal, 1, sizeof(cl_uint), (void *)&arrayLength); + ciErrNum |= clSetKernelArg(info.bitonicSortMergeLocal, 2, sizeof(cl_uint), (void *)&stride); + ciErrNum |= clSetKernelArg(info.bitonicSortMergeLocal, 3, sizeof(cl_uint), (void *)&size); + ciErrNum |= clSetKernelArg(info.bitonicSortMergeLocal, 4, sizeof(cl_uint), (void *)&info.dir); + oclCHECKERROR(ciErrNum, CL_SUCCESS); + + localWorkSize = info.localSizeLimit / 2; + globalWorkSize = arrayLength / 2; + + ciErrNum = clEnqueueNDRangeKernel(info.m_cqCommandQue, info.bitonicSortMergeLocal, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL); + oclCHECKERROR(ciErrNum, CL_SUCCESS); + break; + } + } + } + } +} \ No newline at end of file diff --git a/test/OpenCL/BitonicSort/b3BitonicSort.h b/test/OpenCL/BitonicSort/b3BitonicSort.h new file mode 100644 index 000000000..ebd4ecb1e --- /dev/null +++ b/test/OpenCL/BitonicSort/b3BitonicSort.h @@ -0,0 +1,30 @@ +#ifndef B3_BITONIC_SORT_H +#define B3_BITONIC_SORT_H + +#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h" + +struct b3BitonicSortInfo +{ + cl_command_queue m_cqCommandQue; + cl_kernel bitonicSortLocal; + cl_kernel bitonicSortLocal1; + cl_kernel bitonicSortMergeGlobal; + cl_kernel bitonicSortMergeLocal; + unsigned int dir; + unsigned int localSizeLimit; + + b3BitonicSortInfo() + { + bitonicSortLocal=0; + bitonicSortLocal1=0; + bitonicSortMergeGlobal=0; + bitonicSortMergeLocal=0; + dir = 1; + localSizeLimit = 1024U; + } +}; + + +void bitonicSortNv(cl_mem pKey, int arrayLength, b3BitonicSortInfo& info); + +#endif //B3_BITONIC_SORT_H diff --git a/test/OpenCL/BitonicSort/main.cpp b/test/OpenCL/BitonicSort/main.cpp new file mode 100644 index 000000000..ca5a19e6a --- /dev/null +++ b/test/OpenCL/BitonicSort/main.cpp @@ -0,0 +1,192 @@ +/* +Bullet Continuous Collision Detection and Physics Library +Copyright (c) 2011 Advanced Micro Devices, Inc. http://bulletphysics.org + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ + +///original author: Erwin Coumans + +#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h" +#include "Bullet3Common/b3Int2.h" +#include "Bullet3Common/b3Quickprof.h" + +#include "b3BitonicSort.h" + +#include + +int numSuccess=0; +int numFailed=0; + +cl_context g_cxMainContext; +cl_command_queue g_cqCommandQue; + +#define MSTRINGIFY(A) #A +static const char* kernelSource= +#include "BitonicSort.cl" + + + + +static bool compareFunc(const b3Int2& p, const b3Int2& q) +{ + return (p.x < q.x) || ((p.x == q.x) && ((p.y < q.y))); +} + +int main(int argc, char* argv[]) +{ + int ciErrNum = 0; + + b3Clock clock; + + + cl_device_type deviceType = CL_DEVICE_TYPE_ALL; + const char* vendorSDK = b3OpenCLUtils::getSdkVendorName(); + + printf("This program was compiled using the %s OpenCL SDK\n",vendorSDK); + int numPlatforms = b3OpenCLUtils::getNumPlatforms(); + printf("Num Platforms = %d\n", numPlatforms); + + for (int i=0;i keyValuesGPU(context,g_cqCommandQue); + b3AlignedObjectArray keyValuesCPU; + b3AlignedObjectArray keyValuesGold; + int numValues = 8*1024*1024;//2048;//1024; + keyValuesCPU.resize(numValues); + for (int i=0;i\n"); + getchar(); + return 0; +} \ No newline at end of file diff --git a/test/OpenCL/BitonicSort/premake4.lua b/test/OpenCL/BitonicSort/premake4.lua new file mode 100644 index 000000000..df9a4bba1 --- /dev/null +++ b/test/OpenCL/BitonicSort/premake4.lua @@ -0,0 +1,36 @@ +function createProject(vendor) + + hasCL = findOpenCL(vendor) + + if (hasCL) then + + project ("Test_BitonicSort_" .. vendor) + + initOpenCL(vendor) + + language "C++" + + + kind "ConsoleApp" + targetdir "../../../bin" + + includedirs {"../../../src"} + + files { + "main.cpp", + "b3BitonicSort.cpp", + "../../../src/Bullet3Common/b3AlignedAllocator.cpp", + "../../../src/Bullet3Common/b3AlignedAllocator.h", + "../../../src/Bullet3Common/b3Quickprof.cpp", + "../../../src/Bullet3Common/b3Quickprof.h", + "../../../src/Bullet3OpenCL/Initialize/b3OpenCLUtils.cpp", + "../../../src/Bullet3OpenCL/Initialize/b3OpenCLUtils.h" + } + + end +end + +createProject("Apple") +createProject("AMD") +createProject("Intel") +createProject("NVIDIA") diff --git a/test/OpenCL/ParallelPrimitives/main.cpp b/test/OpenCL/ParallelPrimitives/main.cpp new file mode 100644 index 000000000..088e045cb --- /dev/null +++ b/test/OpenCL/ParallelPrimitives/main.cpp @@ -0,0 +1,378 @@ +/* +Copyright (c) 2012 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ + + +#include +#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h" +#include "Bullet3Common/b3CommandLineArgs.h" +#include "Bullet3Common/b3MinMax.h" + +int g_nPassed = 0; +int g_nFailed = 0; +bool g_testFailed = 0; + +#define TEST_INIT g_testFailed = 0; +#define TEST_ASSERT(x) if( !(x) ){g_testFailed = 1;} +#define TEST_REPORT(testName) printf("[%s] %s\n",(g_testFailed)?"X":"O", testName); if(g_testFailed) g_nFailed++; else g_nPassed++; +#define NEXTMULTIPLEOF(num, alignment) (((num)/(alignment) + (((num)%(alignment)==0)?0:1))*(alignment)) + +cl_context g_context=0; +cl_device_id g_device=0; +cl_command_queue g_queue =0; +const char* g_deviceName = 0; + +void initCL(int preferredDeviceIndex, int preferredPlatformIndex) +{ + void* glCtx=0; + void* glDC = 0; + int ciErrNum = 0; + //bound search and radix sort only work on GPU right now (assume 32 or 64 width workgroup without barriers) + + cl_device_type deviceType = CL_DEVICE_TYPE_ALL; + + g_context = b3OpenCLUtils::createContextFromType(deviceType, &ciErrNum, 0,0,preferredDeviceIndex, preferredPlatformIndex); + oclCHECKERROR(ciErrNum, CL_SUCCESS); + int numDev = b3OpenCLUtils::getNumDevices(g_context); + if (numDev>0) + { + b3OpenCLDeviceInfo info; + g_device= b3OpenCLUtils::getDevice(g_context,0); + g_queue = clCreateCommandQueue(g_context, g_device, 0, &ciErrNum); + oclCHECKERROR(ciErrNum, CL_SUCCESS); + b3OpenCLUtils::printDeviceInfo(g_device); + b3OpenCLUtils::getDeviceInfo(g_device,&info); + g_deviceName = info.m_deviceName; + } +} + +void exitCL() +{ + clReleaseCommandQueue(g_queue); + clReleaseContext(g_context); +} + + +inline void fillIntTest() +{ + TEST_INIT; + + b3FillCL* fillCL = new b3FillCL(g_context,g_device,g_queue); + int maxSize=1024*256; + b3OpenCLArray intBuffer(g_context,g_queue,maxSize); + intBuffer.resize(maxSize); + +#define NUM_TESTS 7 + + int dx = maxSize/NUM_TESTS; + for (int iter=0;iterexecute(intBuffer,value,size,offset); + + b3AlignedObjectArray hostBuf2; + hostBuf2.resize(size); + fillCL->executeHost(hostBuf2,value,size,offset); + + b3AlignedObjectArray hostBuf; + intBuffer.copyToHost(hostBuf); + + for(int i=0; i +__inline +T getRandom(const T& minV, const T& maxV) +{ + float r = (rand()%10000)/10000.f; + T range = maxV - minV; + return (T)(minV + r*range); +} + +struct b3SortDataCompare +{ + inline bool operator()(const b3SortData& first, const b3SortData& second) const + { + return (first.m_key < second.m_key) || (first.m_key==second.m_key && first.m_value < second.m_value); + } +}; + + +void boundSearchTest( ) +{ + TEST_INIT; + + int maxSize = 1024*256; + int bucketSize = 256; + + b3OpenCLArray srcCL(g_context,g_queue,maxSize); + b3OpenCLArray upperCL(g_context,g_queue,maxSize); + b3OpenCLArray lowerCL(g_context,g_queue,maxSize); + + b3AlignedObjectArray srcHost; + b3AlignedObjectArray upperHost; + b3AlignedObjectArray lowerHost; + b3AlignedObjectArray upperHostCompare; + b3AlignedObjectArray lowerHostCompare; + + b3BoundSearchCL* search = new b3BoundSearchCL(g_context,g_device,g_queue, maxSize); + + + int dx = maxSize/NUM_TESTS; + for(int iter=0; iterexecute(srcCL,size,upperCL,bucketSize,b3BoundSearchCL::BOUND_UPPER); + search->execute(srcCL,size,lowerCL,bucketSize,b3BoundSearchCL::BOUND_LOWER); + + search->executeHost(srcHost,size,upperHostCompare,bucketSize,b3BoundSearchCL::BOUND_UPPER); + search->executeHost(srcHost,size,lowerHostCompare,bucketSize,b3BoundSearchCL::BOUND_LOWER); + + lowerCL.copyToHost(lowerHost); + upperCL.copyToHost(upperHost); + for(int i=0; i buf0Host; + b3AlignedObjectArray buf1Host; + + b3OpenCLArray buf2CL(g_context,g_queue,maxSize); + b3OpenCLArray buf3CL(g_context,g_queue,maxSize); + + + b3PrefixScanCL* scan = new b3PrefixScanCL(g_context,g_device,g_queue,maxSize); + + int dx = maxSize/NUM_TESTS; + for(int iter=0; iterexecuteHost(buf0Host, buf1Host, size, &sumHost ); + scan->execute( buf2CL, buf3CL, size, &sumGPU ); + + buf3CL.copyToHost(buf0Host); + + TEST_ASSERT( sumHost == sumGPU ); + for(int i=0; i buf0Host; + buf0Host.resize(maxSize); + b3AlignedObjectArray buf1Host; + buf1Host.resize(maxSize ); + b3OpenCLArray buf2CL(g_context,g_queue,maxSize); + + b3RadixSort32CL* sort = new b3RadixSort32CL(g_context,g_device,g_queue,maxSize); + + int dx = maxSize/NUM_TESTS; + for(int iter=0; iterexecuteHost( buf0Host); + sort->execute(buf2CL); + + buf2CL.copyToHost(buf1Host); + + for(int i=0; i\n"); + getchar(); +} diff --git a/test/OpenCL/ParallelPrimitives/premake4.lua b/test/OpenCL/ParallelPrimitives/premake4.lua new file mode 100644 index 000000000..257a4bd3c --- /dev/null +++ b/test/OpenCL/ParallelPrimitives/premake4.lua @@ -0,0 +1,41 @@ +function createProject(vendor) + hasCL = findOpenCL(vendor) + + if (hasCL) then + + project ("Test_OpenCL_Primitives_" .. vendor) + + initOpenCL(vendor) + + language "C++" + + kind "ConsoleApp" + targetdir "../../../bin" + includedirs {".","../../../src"} + + + files { + "main.cpp", + "../../../src/Bullet3OpenCL/Initialize/b3OpenCLInclude.h", + "../../../src/Bullet3OpenCL/Initialize/b3OpenCLUtils.cpp", + "../../../src/Bullet3OpenCL/Initialize/b3OpenCLUtils.h", + "../../../src/Bullet3OpenCL/ParallelPrimitives/b3FillCL.cpp", + "../../../src/Bullet3OpenCL/ParallelPrimitives/b3FillCL.h", + "../../../src/Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.cpp", + "../../../src/Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.h", + "../../../src/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.cpp", + "../../../src/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h", + "../../../src/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp", + "../../../src/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h", + "../../../src/Bullet3Common/b3AlignedAllocator.cpp", + "../../../src/Bullet3Common/b3AlignedAllocator.h", + "../../../src/Bullet3Common/b3AlignedObjectArray.h", + } + + end +end + +createProject("AMD") +createProject("Intel") +createProject("NVIDIA") +createProject("Apple") \ No newline at end of file diff --git a/test/OpenCL/RadixSortBenchmark/main.cpp b/test/OpenCL/RadixSortBenchmark/main.cpp new file mode 100644 index 000000000..207f28f0c --- /dev/null +++ b/test/OpenCL/RadixSortBenchmark/main.cpp @@ -0,0 +1,712 @@ +/****************************************************************************** + * Copyright 2010 Duane Merrill + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may ob3ain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * + * + * + * AUTHORS' REQUEST: + * + * If you use|reference|benchmark this code, please cite our Technical + * Report (http://www.cs.virginia.edu/~dgm4d/papers/RadixSortTR.pdf): + * + * @TechReport{ Merrill:Sorting:2010, + * author = "Duane Merrill and Andrew Grimshaw", + * title = "Revisiting Sorting for GPGPU Stream Architectures", + * year = "2010", + * institution = "University of Virginia, Department of Computer Science", + * address = "Charlottesville, VA, USA", + * number = "CS2010-03" + * } + * + * For more information, see our Google Code project site: + * http://code.google.com/p/back40computing/ + * + * Thanks! + ******************************************************************************/ + +/****************************************************************************** + * Simple test driver program for *large-problem* radix sorting. + * + * Useful for demonstrating how to integrate radix sorting into + * your application + ******************************************************************************/ + +/****************************************************************************** + * Converted from CUDA to OpenCL/DirectCompute by Erwin Coumans + ******************************************************************************/ +#ifdef _WIN32 +#pragma warning (disable:4996) +#endif +#include +#include +#include +#include +#include +#include +#include + + +//#include +#include +/********************** +* +*/ + +#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h" +#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h" +#include "Bullet3Common/b3Quickprof.h" + +cl_context g_cxMainContext; +cl_device_id g_device; +cl_command_queue g_cqCommandQueue; + +/*********************** +* +*/ + +bool g_verbose; +///Preferred OpenCL device/platform. When < 0 then no preference is used. +///Note that b3OpenCLUtils might still use the preference of using a platform vendor that matches the SDK vendor used to build the application. +///Preferred device/platform take priority over this platform-vendor match +int gPreferredDeviceId = -1; +int gPreferredPlatformId = -1; + + + +/****************************************************************************** + * Routines + ******************************************************************************/ + + +/** + * Keys-only sorting. Uses the GPU to sort the specified vector of elements for the given + * number of iterations, displaying runtime information. + * + * @param[in] num_elements + * Size in elements of the vector to sort + * @param[in] h_keys + * Vector of keys to sort + * @param[in] iterations + * Number of times to invoke the GPU sorting primitive + * @param[in] cfg + * Config + */ +template +void TimedSort( + unsigned int num_elements, + K *h_keys, + unsigned int iterations) +{ + printf("Keys only, %d iterations, %d elements\n", iterations, num_elements); + + int max_elements = num_elements; + b3AlignedObjectArray hostData; + hostData.resize(num_elements); + for (int i=0;i gpuData(g_cxMainContext,g_cqCommandQueue); + gpuData.copyFromHost(hostData); + //sorter.executeHost(gpuData); + sorter.execute(gpuData); + + b3AlignedObjectArray hostDataSorted; + gpuData.copyToHost(hostDataSorted); + + clFinish(g_cqCommandQueue); + + { + //printf("Key-values, %d iterations, %d elements", iterations, num_elements); + + // Create sorting enactor + + // Perform the timed number of sorting iterations + double elapsed = 0; + float duration = 0; + b3Clock watch; + + //warm-start + gpuData.copyFromHost(hostData); + clFinish(g_cqCommandQueue); + sorter.execute(gpuData); + + watch.reset(); + + + for (int i = 0; i < iterations; i++) + { + + + + // Move a fresh copy of the problem into device storage + gpuData.copyFromHost(hostData); + clFinish(g_cqCommandQueue); + + // Start GPU timing record + double startMs = watch.getTimeMicroseconds()/1e3; + + // Call the sorting API routine + sorter.execute(gpuData); + + + + clFinish(g_cqCommandQueue); + + double stopMs = watch.getTimeMicroseconds()/1e3; + + duration = stopMs - startMs; + + // End GPU timing record + elapsed += (double) duration; + printf("duration = %f\n", duration); + } + + // Display timing information + double avg_runtime = elapsed / iterations; + // double throughput = ((double) num_elements) / avg_runtime / 1000.0 / 1000.0; + // printf(", %f GPU ms, %f x10^9 elts/sec\n", avg_runtime, throughput); + double throughput = ((double) num_elements) / avg_runtime / 1000.0 ; + printf(", %f GPU ms, %f x10^6 elts/sec\n", avg_runtime, throughput); + + gpuData.copyToHost(hostData); + for (int i=0;i +void TimedSort( + unsigned int num_elements, + K *h_keys, + V *h_values, + unsigned int iterations) +{ + + printf("Key-values, %d iterations, %d elements\n", iterations, num_elements); + + int max_elements = num_elements; + b3AlignedObjectArray hostData; + hostData.resize(num_elements); + for (int i=0;i gpuData(g_cxMainContext,g_cqCommandQueue); + gpuData.copyFromHost(hostData); + //sorter.executeHost(gpuData); + sorter.execute(gpuData); + + b3AlignedObjectArray hostDataSorted; + gpuData.copyToHost(hostDataSorted); +#if 0 + for (int i=0;i +void RandomBits(K &key, int entropy_reduction = 0, int lower_key_bits = sizeof(K) * 8) +{ + const unsigned int NUM_UCHARS = (sizeof(K) + sizeof(unsigned char) - 1) / sizeof(unsigned char); + unsigned char key_bits[NUM_UCHARS]; + + do { + + for (int j = 0; j < NUM_UCHARS; j++) { + unsigned char quarterword = 0xff; + for (int i = 0; i <= entropy_reduction; i++) { + quarterword &= (rand() >> 7); + } + key_bits[j] = quarterword; + } + + if (lower_key_bits < sizeof(K) * 8) { + unsigned long long base = 0; + memcpy(&base, key_bits, sizeof(K)); + base &= (1 << lower_key_bits) - 1; + memcpy(key_bits, &base, sizeof(K)); + } + + memcpy(&key, key_bits, sizeof(K)); + + } while (key != key); // avoids NaNs when generating random floating point numbers +} + + +/****************************************************************************** + * Templated routines for printing keys/values to the console + ******************************************************************************/ + +template +void PrintValue(T val) { + printf("%d", val); +} + +template<> +void PrintValue(float val) { + printf("%f", val); +} + +template<> +void PrintValue(double val) { + printf("%f", val); +} + +template<> +void PrintValue(unsigned char val) { + printf("%u", val); +} + +template<> +void PrintValue(unsigned short val) { + printf("%u", val); +} + +template<> +void PrintValue(unsigned int val) { + printf("%u", val); +} + +template<> +void PrintValue(long val) { + printf("%ld", val); +} + +template<> +void PrintValue(unsigned long val) { + printf("%lu", val); +} + +template<> +void PrintValue(long long val) { + printf("%lld", val); +} + +template<> +void PrintValue(unsigned long long val) { + printf("%llu", val); +} + + + +/** + * Compares the equivalence of two arrays + */ +template +int CompareResults(T* computed, T* reference, SizeT len, bool verbose = true) +{ + printf("\n"); + for (SizeT i = 0; i < len; i++) { + + if (computed[i] != reference[i]) { + printf("INCORRECT: [%lu]: ", (unsigned long) i); + PrintValue(computed[i]); + printf(" != "); + PrintValue(reference[i]); + + if (verbose) { + printf("\nresult[..."); + for (size_t j = (i >= 5) ? i - 5 : 0; (j < i + 5) && (j < len); j++) { + PrintValue(computed[j]); + printf(", "); + } + printf("...]"); + printf("\nreference[..."); + for (size_t j = (i >= 5) ? i - 5 : 0; (j < i + 5) && (j < len); j++) { + PrintValue(reference[j]); + printf(", "); + } + printf("...]"); + } + + return 1; + } + } + + printf("CORRECT\n"); + return 0; +} + +/** + * Creates an example sorting problem whose keys is a vector of the specified + * number of K elements, values of V elements, and then dispatches the problem + * to the GPU for the given number of iterations, displaying runtime information. + * + * @param[in] iterations + * Number of times to invoke the GPU sorting primitive + * @param[in] num_elements + * Size in elements of the vector to sort + * @param[in] cfg + * Config + */ +template +void TestSort( + unsigned int iterations, + int num_elements, + bool keys_only) +{ + // Allocate the sorting problem on the host and fill the keys with random bytes + + K *h_keys = NULL; + K *h_reference_keys = NULL; + V *h_values = NULL; + h_keys = (K*) malloc(num_elements * sizeof(K)); + h_reference_keys = (K*) malloc(num_elements * sizeof(K)); + if (!keys_only) h_values = (V*) malloc(num_elements * sizeof(V)); + + + // Use random bits + for (unsigned int i = 0; i < num_elements; ++i) { + RandomBits(h_keys[i], 0); + //h_keys[i] = num_elements-i; + //h_keys[i] = 0xffffffffu-i; + if (!keys_only) + h_values[i] = h_keys[i];//0xffffffffu-i; + + h_reference_keys[i] = h_keys[i]; + } + + // Run the timing test + if (keys_only) { + TimedSort(num_elements, h_keys, iterations); + } else { + TimedSort(num_elements, h_keys, h_values, iterations); + } + +// cudaThreadSynchronize(); + + // Display sorted key data + if (g_verbose) { + printf("\n\nKeys:\n"); + for (int i = 0; i < num_elements; i++) { + PrintValue(h_keys[i]); + printf(", "); + } + printf("\n\n"); + } + + // Verify solution + std::sort(h_reference_keys, h_reference_keys + num_elements); + CompareResults(h_keys, h_reference_keys, num_elements, true); + printf("\n"); + fflush(stdout); + + // Free our allocated host memory + if (h_keys != NULL) free(h_keys); + if (h_values != NULL) free(h_values); +} + + + +/** + * Displays the commandline usage for this tool + */ +void Usage() +{ + printf("\ntest_large_problem_sorting [--device=] [--v] [--i=] [--n=] [--key-values] [--deviceId=] [--platformId=]\n"); + printf("\n"); + printf("\t--v\tDisplays sorted results to the console.\n"); + printf("\n"); + printf("\t--i\tPerforms the sorting operation times\n"); + printf("\t\t\ton the device. Re-copies original input each time. Default = 1\n"); + printf("\n"); + printf("\t--n\tThe number of elements to comprise the sample problem\n"); + printf("\t\t\tDefault = 512\n"); + printf("\n"); + printf("\t--key-values\tSpecifies that keys are accommodated by value pairings\n"); + printf("\n"); +} + + +/****************************************************************************** + * Command-line parsing + ******************************************************************************/ +#include +#include +#include + +class b3CommandLineArgs +{ +protected: + + std::map pairs; + +public: + + // Constructor + b3CommandLineArgs(int argc, char **argv) + { + using namespace std; + + for (int i = 1; i < argc; i++) + { + string arg = argv[i]; + + if ((arg[0] != '-') || (arg[1] != '-')) { + continue; + } + + string::size_type pos; + string key, val; + if ((pos = arg.find( '=')) == string::npos) { + key = string(arg, 2, arg.length() - 2); + val = ""; + } else { + key = string(arg, 2, pos - 2); + val = string(arg, pos + 1, arg.length() - 1); + } + pairs[key] = val; + } + } + + bool CheckCmdLineFlag(const char* arg_name) + { + using namespace std; + map::iterator itr; + if ((itr = pairs.find(arg_name)) != pairs.end()) { + return true; + } + return false; + } + + template + void GetCmdLineArgument(const char *arg_name, T &val); + + int ParsedArgc() + { + return pairs.size(); + } +}; + +template +void b3CommandLineArgs::GetCmdLineArgument(const char *arg_name, T &val) +{ + using namespace std; + map::iterator itr; + if ((itr = pairs.find(arg_name)) != pairs.end()) { + istringstream strstream(itr->second); + strstream >> val; + } +} + +template <> +void b3CommandLineArgs::GetCmdLineArgument(const char* arg_name, char* &val) +{ + using namespace std; + map::iterator itr; + if ((itr = pairs.find(arg_name)) != pairs.end()) { + + string s = itr->second; + val = (char*) malloc(sizeof(char) * (s.length() + 1)); + strcpy(val, s.c_str()); + + } else { + val = NULL; + } +} + + + + + +/****************************************************************************** + * Main + ******************************************************************************/ + +extern bool gDebugSkipLoadingBinary; + +int main( int argc, char** argv) +{ + //gDebugSkipLoadingBinary = true; + + cl_int ciErrNum; + b3CommandLineArgs args(argc,argv); + + args.GetCmdLineArgument("deviceId", gPreferredDeviceId); + args.GetCmdLineArgument("platformId", gPreferredPlatformId); + + printf("Initialize OpenCL using b3OpenCLUtils_createContextFromType\n"); + cl_platform_id platformId; +// g_cxMainContext = b3OpenCLUtils_createContextFromType(CL_DEVICE_TYPE_ALL, &ciErrNum, 0, 0,gPreferredDeviceId,gPreferredPlatformId,&platformId); + g_cxMainContext = b3OpenCLUtils_createContextFromType(CL_DEVICE_TYPE_GPU, &ciErrNum, 0, 0,gPreferredDeviceId,gPreferredPlatformId,&platformId); + //g_cxMainContext = b3OpenCLUtils_createContextFromType(CL_DEVICE_TYPE_CPU, &ciErrNum, 0, 0,gPreferredDeviceId,gPreferredPlatformId,&platformId); + + oclCHECKERROR(ciErrNum, CL_SUCCESS); + + int numDev = b3OpenCLUtils_getNumDevices(g_cxMainContext); + + if (!numDev) + { + printf("error: no OpenCL devices\n"); + exit(0); + } + int result; + int devId = 0; + g_device = b3OpenCLUtils_getDevice(g_cxMainContext,devId); + b3OpenCLUtils_printDeviceInfo(g_device); + // create a command-queue + g_cqCommandQueue = clCreateCommandQueue(g_cxMainContext, g_device, 0, &ciErrNum); + oclCHECKERROR(ciErrNum, CL_SUCCESS); + + + + //srand(time(NULL)); + srand(0); // presently deterministic + + unsigned int num_elements = 8*1024*1024;//4*1024*1024;//4*1024*1024;//257;//8*524288;//2048;//512;//524288; + unsigned int iterations = 10; + bool keys_only = true; + + // + // Check command line arguments + // + + + + if (args.CheckCmdLineFlag("help")) + { + Usage(); + return 0; + } + + args.GetCmdLineArgument("i", iterations); + args.GetCmdLineArgument("n", num_elements); + + + + keys_only = !args.CheckCmdLineFlag("key-values"); + g_verbose = args.CheckCmdLineFlag("v"); + + + + TestSort( + iterations, + num_elements, + keys_only); + + +} \ No newline at end of file diff --git a/test/OpenCL/RadixSortBenchmark/premake4.lua b/test/OpenCL/RadixSortBenchmark/premake4.lua new file mode 100644 index 000000000..361e20ae4 --- /dev/null +++ b/test/OpenCL/RadixSortBenchmark/premake4.lua @@ -0,0 +1,40 @@ +function createProject(vendor) + hasCL = findOpenCL(vendor) + + if (hasCL) then + + project ("Test_OpenCL_RadixSortBenchmark_" .. vendor) + + initOpenCL(vendor) + + language "C++" + + kind "ConsoleApp" + targetdir "../../../bin" + includedirs {"..","../../../src"} + +-- links { +-- ("OpenCL_lib_parallel_primitives_host_" .. vendor) +-- } + + files { + "main.cpp", + "../../../src/Bullet3OpenCL/Initialize/b3OpenCLUtils.cpp", + "../../../src/Bullet3OpenCL/Initialize/b3OpenCLUtils.h", + "../../../src/Bullet3OpenCL/ParallelPrimitives/b3FillCL.cpp", + "../../../src/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.cpp", + "../../../src/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp", + "../../../src/Bullet3Common/b3AlignedAllocator.cpp", + "../../../src/Bullet3Common/b3AlignedAllocator.h", + "../../../src/Bullet3Common/b3AlignedObjectArray.h", + "../../../src/Bullet3Common/b3Quickprof.cpp", + "../../../src/Bullet3Common/b3Quickprof.h", + } + + end +end + +createProject("AMD") +createProject("Intel") +createProject("NVIDIA") +createProject("Apple") \ No newline at end of file