From 477a7f9e39120d88171cc473b2a35da2f7d5a220 Mon Sep 17 00:00:00 2001 From: erwin coumans Date: Tue, 12 Mar 2013 13:47:13 -0700 Subject: [PATCH] add gpu_broadphase with basic test --- build/premake4.lua | 1 + build/stringify.bat | 5 +- .../host/btGpuSapBroadphase.cpp | 485 ++++++++++++++++++ .../gpu_broadphase/host/btGpuSapBroadphase.h | 74 +++ opencl/gpu_broadphase/kernels/sap.cl | 320 ++++++++++++ opencl/gpu_broadphase/kernels/sapFast.cl | 161 ++++++ .../gpu_broadphase/kernels/sapFastKernels.h | 164 ++++++ opencl/gpu_broadphase/kernels/sapKernels.h | 324 ++++++++++++ opencl/gpu_broadphase/test/main.cpp | 123 +++++ opencl/gpu_broadphase/test/premake4.lua | 46 ++ .../benchmark/test_large_problem_sorting.cpp | 4 +- 11 files changed, 1705 insertions(+), 2 deletions(-) create mode 100644 opencl/gpu_broadphase/host/btGpuSapBroadphase.cpp create mode 100644 opencl/gpu_broadphase/host/btGpuSapBroadphase.h create mode 100644 opencl/gpu_broadphase/kernels/sap.cl create mode 100644 opencl/gpu_broadphase/kernels/sapFast.cl create mode 100644 opencl/gpu_broadphase/kernels/sapFastKernels.h create mode 100644 opencl/gpu_broadphase/kernels/sapKernels.h create mode 100644 opencl/gpu_broadphase/test/main.cpp create mode 100644 opencl/gpu_broadphase/test/premake4.lua diff --git a/build/premake4.lua b/build/premake4.lua index 16a50c9c0..78bdeb312 100644 --- a/build/premake4.lua +++ b/build/premake4.lua @@ -91,6 +91,7 @@ include "../opencl/parallel_primitives/benchmark" include "../opencl/lds_bank_conflict" include "../opencl/reduce" + include "../opencl/gpu_broadphase/test" end \ No newline at end of file diff --git a/build/stringify.bat b/build/stringify.bat index 890f27d9d..820661d04 100644 --- a/build/stringify.bat +++ b/build/stringify.bat @@ -3,11 +3,14 @@ premake4 --file=stringifyKernel.lua --kernelfile="../opencl/vector_add/VectorAddKernels.cl" --headerfile="../opencl/vector_add/VectorAddKernels.h" --stringname="vectorAddCL" stringify - premake4 --file=stringifyKernel.lua --kernelfile="../opencl/parallel_primitives/kernels/RadixSort32Kernels.cl" --headerfile="../opencl/parallel_primitives/kernels/RadixSort32KernelsCL.h" --stringname="radixSort32KernelsCL" stringify premake4 --file=stringifyKernel.lua --kernelfile="../opencl/parallel_primitives/kernels/BoundSearchKernels.cl" --headerfile="../opencl/parallel_primitives/kernels/BoundSearchKernelsCL.h" --stringname="boundSearchKernelsCL" stringify premake4 --file=stringifyKernel.lua --kernelfile="../opencl/parallel_primitives/kernels/PrefixScanKernels.cl" --headerfile="../opencl/parallel_primitives/kernels/PrefixScanKernelsCL.h" --stringname="prefixScanKernelsCL" stringify premake4 --file=stringifyKernel.lua --kernelfile="../opencl/parallel_primitives/kernels/FillKernels.cl" --headerfile="../opencl/parallel_primitives/kernels/FillKernelsCL.h" --stringname="fillKernelsCL" stringify +premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_broadphase/kernels/sap.cl" --headerfile="../opencl/gpu_broadphase/kernels/sapKernels.h" --stringname="sapCL" stringify +premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_broadphase/kernels/sapFast.cl" --headerfile="../opencl/gpu_broadphase/kernels/sapFastKernels.h" --stringname="sapFastCL" stringify + + pause \ No newline at end of file diff --git a/opencl/gpu_broadphase/host/btGpuSapBroadphase.cpp b/opencl/gpu_broadphase/host/btGpuSapBroadphase.cpp new file mode 100644 index 000000000..2d16011f9 --- /dev/null +++ b/opencl/gpu_broadphase/host/btGpuSapBroadphase.cpp @@ -0,0 +1,485 @@ + +#include "btGpuSapBroadphase.h" +#include "parallel_primitives/host/btVector3.h" +#include "parallel_primitives/host/btLauncherCL.h" +#include "parallel_primitives/host/btQuickprof.h" +#include "../basic_initialize/btOpenCLUtils.h" + +#include "../kernels/integrateKernel.h" + +#include "../kernels/sapKernels.h" +#include "../kernels/sapFastKernels.h" +#include "parallel_primitives/host/btMinMax.h" + + +btGpuSapBroadphase::btGpuSapBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q ) +:m_context(ctx), +m_device(device), +m_queue(q), +m_allAabbsGPU(ctx,q), +m_smallAabbsGPU(ctx,q), +m_largeAabbsGPU(ctx,q), +m_overlappingPairs(ctx,q), +m_gpuSmallSortData(ctx,q), +m_gpuSmallSortedAabbs(ctx,q) +{ + const char* sapSrc = sapCL; + const char* sapFastSrc = sapFastCL; + + cl_int errNum=0; + + cl_program sapProg = btOpenCLUtils::compileCLProgramFromString(m_context,m_device,sapSrc,&errNum,"","opencl/gpu_broadphase/kernels/sap.cl"); + btAssert(errNum==CL_SUCCESS); + cl_program sapFastProg = btOpenCLUtils::compileCLProgramFromString(m_context,m_device,sapFastSrc,&errNum,"","opencl/gpu_broadphase/kernels/sapFast.cl"); + btAssert(errNum==CL_SUCCESS); + + + //m_sapKernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelOriginal",&errNum,sapProg ); + //m_sapKernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelBarrier",&errNum,sapProg ); + //m_sapKernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelLocalSharedMemory",&errNum,sapProg ); + + + m_sap2Kernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelTwoArrays",&errNum,sapProg ); + btAssert(errNum==CL_SUCCESS); + +#if 0 + + m_sapKernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelOriginal",&errNum,sapProg ); + btAssert(errNum==CL_SUCCESS); +#else +#ifndef __APPLE__ + m_sapKernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,sapFastSrc, "computePairsKernel",&errNum,sapFastProg ); + btAssert(errNum==CL_SUCCESS); +#else + m_sapKernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelLocalSharedMemory",&errNum,sapProg ); + btAssert(errNum==CL_SUCCESS); +#endif +#endif + + m_flipFloatKernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "flipFloatKernel",&errNum,sapProg ); + + m_copyAabbsKernel= btOpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "copyAabbsKernel",&errNum,sapProg ); + + m_scatterKernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "scatterKernel",&errNum,sapProg ); + + m_sorter = new btRadixSort32CL(m_context,m_device,m_queue); +} + +btGpuSapBroadphase::~btGpuSapBroadphase() +{ + delete m_sorter; + clReleaseKernel(m_scatterKernel); + clReleaseKernel(m_flipFloatKernel); + clReleaseKernel(m_copyAabbsKernel); + clReleaseKernel(m_sapKernel); + clReleaseKernel(m_sap2Kernel); + +} + +/// conservative test for overlap between two aabbs +static bool TestAabbAgainstAabb2(const btVector3 &aabbMin1, const btVector3 &aabbMax1, + const btVector3 &aabbMin2, const btVector3 &aabbMax2) +{ + bool overlap = true; + overlap = (aabbMin1.getX() > aabbMax2.getX() || aabbMax1.getX() < aabbMin2.getX()) ? false : overlap; + overlap = (aabbMin1.getZ() > aabbMax2.getZ() || aabbMax1.getZ() < aabbMin2.getZ()) ? false : overlap; + overlap = (aabbMin1.getY() > aabbMax2.getY() || aabbMax1.getY() < aabbMin2.getY()) ? false : overlap; + return overlap; +} + +void btGpuSapBroadphase::calculateOverlappingPairs(bool forceHost) +{ + int axis = 0;//todo on GPU for now hardcode + + btAssert(m_allAabbsCPU.size() == m_allAabbsGPU.size()); + + + if (forceHost) + { + + btAlignedObjectArray allHostAabbs; + m_allAabbsGPU.copyToHost(allHostAabbs); + + { + int numSmallAabbs = m_smallAabbsCPU.size(); + for (int j=0;j hostPairs; + + { + int numSmallAabbs = m_smallAabbsCPU.size(); + for (int i=0;i allHostAabbs; + m_allAabbsGPU.copyToHost(allHostAabbs); + + m_smallAabbsGPU.copyToHost(m_smallAabbsCPU); + { + int numSmallAabbs = m_smallAabbsCPU.size(); + for (int j=0;j allHostAabbs; + m_allAabbsGPU.copyToHost(allHostAabbs); + + m_largeAabbsGPU.copyToHost(m_largeAabbsCPU); + { + int numLargeAabbs = m_largeAabbsCPU.size(); + for (int j=0;jexecute(m_gpuSmallSortData); + clFinish(m_queue); + } + + m_gpuSmallSortedAabbs.resize(numSmallAabbs); + if (numSmallAabbs) + { + BT_PROFILE("scatterKernel"); + btBufferInfoCL bInfo[] = { btBufferInfoCL( m_smallAabbsGPU.getBufferCL(), true ), btBufferInfoCL( m_gpuSmallSortData.getBufferCL(),true),btBufferInfoCL(m_gpuSmallSortedAabbs.getBufferCL())}; + btLauncherCL launcher(m_queue, m_scatterKernel ); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) ); + launcher.setConst( numSmallAabbs); + int num = numSmallAabbs; + launcher.launch1D( num); + clFinish(m_queue); + + } + + + int maxPairsPerBody = 64; + int maxPairs = maxPairsPerBody * numSmallAabbs;//todo + m_overlappingPairs.resize(maxPairs); + + btOpenCLArray pairCount(m_context, m_queue); + pairCount.push_back(0); + int numPairs=0; + + { + int numLargeAabbs = m_largeAabbsGPU.size(); + if (numLargeAabbs && numSmallAabbs) + { + BT_PROFILE("sap2Kernel"); + btBufferInfoCL bInfo[] = { btBufferInfoCL( m_largeAabbsGPU.getBufferCL() ),btBufferInfoCL( m_gpuSmallSortedAabbs.getBufferCL() ), btBufferInfoCL( m_overlappingPairs.getBufferCL() ), btBufferInfoCL(pairCount.getBufferCL())}; + btLauncherCL launcher(m_queue, m_sap2Kernel); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) ); + launcher.setConst( numLargeAabbs ); + launcher.setConst( numSmallAabbs); + launcher.setConst( axis ); + launcher.setConst( maxPairs ); +//@todo: use actual maximum work item sizes of the device instead of hardcoded values + launcher.launch2D( numLargeAabbs, numSmallAabbs,4,64); + + numPairs = pairCount.at(0); + if (numPairs >maxPairs) + numPairs =maxPairs; + + } + } + if (m_gpuSmallSortedAabbs.size()) + { + BT_PROFILE("sapKernel"); + btBufferInfoCL bInfo[] = { btBufferInfoCL( m_gpuSmallSortedAabbs.getBufferCL() ), btBufferInfoCL( m_overlappingPairs.getBufferCL() ), btBufferInfoCL(pairCount.getBufferCL())}; + btLauncherCL launcher(m_queue, m_sapKernel); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) ); + launcher.setConst( numSmallAabbs ); + launcher.setConst( axis ); + launcher.setConst( maxPairs ); + + + int num = numSmallAabbs; +#if 0 + int buffSize = launcher.getSerializationBufferSize(); + unsigned char* buf = new unsigned char[buffSize+sizeof(int)]; + for (int i=0;imaxPairs) + numPairs = maxPairs; + } + +#else + int numPairs = 0; + + + btLauncherCL launcher(m_queue, m_sapKernel); + + const char* fileName = "m_sapKernelArgs.bin"; + FILE* f = fopen(fileName,"rb"); + if (f) + { + int sizeInBytes=0; + if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET)) + { + printf("error, cannot get file size\n"); + exit(0); + } + + unsigned char* buf = (unsigned char*) malloc(sizeInBytes); + fread(buf,sizeInBytes,1,f); + int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes,m_context); + int num = *(int*)&buf[serializedBytes]; + launcher.launch1D( num); + + btOpenCLArray pairCount(m_context, m_queue); + int numElements = launcher.m_arrays[2]->size()/sizeof(int); + pairCount.setFromOpenCLBuffer(launcher.m_arrays[2]->getBufferCL(),numElements); + numPairs = pairCount.at(0); + //printf("overlapping pairs = %d\n",numPairs); + btAlignedObjectArray hostOoverlappingPairs; + btOpenCLArray tmpGpuPairs(m_context,m_queue); + tmpGpuPairs.setFromOpenCLBuffer(launcher.m_arrays[1]->getBufferCL(),numPairs ); + + tmpGpuPairs.copyToHost(hostOoverlappingPairs); + m_overlappingPairs.copyFromHost(hostOoverlappingPairs); + //printf("hello %d\n", m_overlappingPairs.size()); + free(buf); + fclose(f); + + } else { + printf("error: cannot find file %s\n",fileName); + } + + clFinish(m_queue); + + +#endif + + + m_overlappingPairs.resize(numPairs); + + }//BT_PROFILE("GPU_RADIX SORT"); + +} + +void btGpuSapBroadphase::writeAabbsToGpu() +{ + m_allAabbsGPU.copyFromHost(m_allAabbsCPU);//might not be necessary, the 'setupGpuAabbsFull' already takes care of this + m_smallAabbsGPU.copyFromHost(m_smallAabbsCPU); + m_largeAabbsGPU.copyFromHost(m_largeAabbsCPU); + +} + +void btGpuSapBroadphase::createLargeProxy(const btVector3& aabbMin, const btVector3& aabbMax, int userPtr ,short int collisionFilterGroup,short int collisionFilterMask) +{ + int index = userPtr; + btSapAabb aabb; + for (int i=0;i<4;i++) + { + aabb.m_min[i] = aabbMin[i]; + aabb.m_max[i] = aabbMax[i]; + } + aabb.m_minIndices[3] = index; + aabb.m_signedMaxIndices[3] = m_allAabbsCPU.size(); + m_largeAabbsCPU.push_back(aabb); + m_allAabbsCPU.push_back(aabb); +} + +void btGpuSapBroadphase::createProxy(const btVector3& aabbMin, const btVector3& aabbMax, int userPtr ,short int collisionFilterGroup,short int collisionFilterMask) +{ + int index = userPtr; + btSapAabb aabb; + for (int i=0;i<4;i++) + { + aabb.m_min[i] = aabbMin[i]; + aabb.m_max[i] = aabbMax[i]; + } + aabb.m_minIndices[3] = index; + aabb.m_signedMaxIndices[3] = m_allAabbsCPU.size(); + m_smallAabbsCPU.push_back(aabb); + m_allAabbsCPU.push_back(aabb); +} + +cl_mem btGpuSapBroadphase::getAabbBuffer() +{ + return m_allAabbsGPU.getBufferCL(); +} + +int btGpuSapBroadphase::getNumOverlap() +{ + return m_overlappingPairs.size(); +} +cl_mem btGpuSapBroadphase::getOverlappingPairBuffer() +{ + return m_overlappingPairs.getBufferCL(); +} \ No newline at end of file diff --git a/opencl/gpu_broadphase/host/btGpuSapBroadphase.h b/opencl/gpu_broadphase/host/btGpuSapBroadphase.h new file mode 100644 index 000000000..e7483726d --- /dev/null +++ b/opencl/gpu_broadphase/host/btGpuSapBroadphase.h @@ -0,0 +1,74 @@ +#ifndef BT_GPU_SAP_BROADPHASE_H +#define BT_GPU_SAP_BROADPHASE_H + +#include "parallel_primitives/host/btOpenCLArray.h" +#include "parallel_primitives/host/btFillCL.h" //btInt2 +class btVector3; +#include "parallel_primitives/host/btRadixSort32CL.h" + +struct btSapAabb +{ + union + { + float m_min[4]; + int m_minIndices[4]; + }; + union + { + float m_max[4]; + int m_signedMaxIndices[4]; + //unsigned int m_unsignedMaxIndices[4]; + }; +}; + + + +class btGpuSapBroadphase +{ + + cl_context m_context; + cl_device_id m_device; + cl_command_queue m_queue; + cl_kernel m_flipFloatKernel; + cl_kernel m_scatterKernel ; + cl_kernel m_copyAabbsKernel; + cl_kernel m_sapKernel; + cl_kernel m_sap2Kernel; + + class btRadixSort32CL* m_sorter; + + public: + + btOpenCLArray m_allAabbsGPU; + btAlignedObjectArray m_allAabbsCPU; + + btOpenCLArray m_smallAabbsGPU; + btAlignedObjectArray m_smallAabbsCPU; + + btOpenCLArray m_largeAabbsGPU; + btAlignedObjectArray m_largeAabbsCPU; + + btOpenCLArray m_overlappingPairs; + + //temporary gpu work memory + btOpenCLArray m_gpuSmallSortData; + btOpenCLArray m_gpuSmallSortedAabbs; + + + btGpuSapBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q ); + virtual ~btGpuSapBroadphase(); + + void calculateOverlappingPairs(bool forceHost=false); + + void createProxy(const btVector3& aabbMin, const btVector3& aabbMax, int userPtr ,short int collisionFilterGroup,short int collisionFilterMask); + void createLargeProxy(const btVector3& aabbMin, const btVector3& aabbMax, int userPtr ,short int collisionFilterGroup,short int collisionFilterMask); + + //call writeAabbsToGpu after done making all changes (createProxy etc) + void writeAabbsToGpu(); + + cl_mem getAabbBuffer(); + int getNumOverlap(); + cl_mem getOverlappingPairBuffer(); +}; + +#endif //BT_GPU_SAP_BROADPHASE_H \ No newline at end of file diff --git a/opencl/gpu_broadphase/kernels/sap.cl b/opencl/gpu_broadphase/kernels/sap.cl new file mode 100644 index 000000000..a744fd347 --- /dev/null +++ b/opencl/gpu_broadphase/kernels/sap.cl @@ -0,0 +1,320 @@ +/* +Copyright (c) 2012 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Erwin Coumans + + +typedef struct +{ + union + { + float4 m_min; + float m_minElems[4]; + int m_minIndices[4]; + }; + union + { + float4 m_max; + float m_maxElems[4]; + int m_maxIndices[4]; + }; +} btAabbCL; + + +/// conservative test for overlap between two aabbs +bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2); +bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2) +{ + bool overlap = true; + overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap; + overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap; + overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap; + return overlap; +} +bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2); +bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2) +{ + bool overlap = true; + overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap; + overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap; + overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap; + return overlap; +} + +bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2); +bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2) +{ + bool overlap = true; + overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap; + overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap; + overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap; + return overlap; +} + + +__kernel void computePairsKernelTwoArrays( __global const btAabbCL* unsortedAabbs, __global const btAabbCL* sortedAabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numUnsortedAabbs, int numSortedAabbs, int axis, int maxPairs) +{ + int i = get_global_id(0); + if (i>=numUnsortedAabbs) + return; + + int j = get_global_id(1); + if (j>=numSortedAabbs) + return; + + if (TestAabbAgainstAabb2GlobalGlobal(&unsortedAabbs[i],&sortedAabbs[j])) + { + int2 myPair; + + myPair.x = unsortedAabbs[i].m_minIndices[3]; + myPair.y = sortedAabbs[j].m_minIndices[3]; + + int curPair = atomic_inc (pairCount); + if (curPair=numObjects) + return; + for (int j=i+1;j=numObjects && !localBreak) + { + atomic_inc(breakRequest); + localBreak = 1; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (!localBreak) + { + if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j])) + { + int2 myPair; + myPair.x = aabbs[i].m_minIndices[3]; + myPair.y = aabbs[j].m_minIndices[3]; + int curPair = atomic_inc (pairCount); + if (curPair=numObjects && !localBreak) + { + atomic_inc(breakRequest); + localBreak = 1; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (!localBreak) + { + if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1])) + { + int2 myPair; + myPair.x = myAabb.m_minIndices[3]; + myPair.y = localAabbs[localCount+localId+1].m_minIndices[3]; + int curPair = atomic_inc (pairCount); + if (curPair> 31) | 0x80000000; + return f ^ mask; +} +float IFloatFlip(unsigned int f); +float IFloatFlip(unsigned int f) +{ + unsigned int mask = ((f >> 31) - 1) | 0x80000000; + unsigned int fl = f ^ mask; + return *(float*)&fl; +} + + + + +__kernel void copyAabbsKernel( __global const btAabbCL* allAabbs, __global btAabbCL* destAabbs, int numObjects) +{ + int i = get_global_id(0); + if (i>=numObjects) + return; + int src = destAabbs[i].m_maxIndices[3]; + destAabbs[i] = allAabbs[src]; + destAabbs[i].m_maxIndices[3] = src; +} + + +__kernel void flipFloatKernel( __global const btAabbCL* aabbs, volatile __global int2* sortData, int numObjects, int axis) +{ + int i = get_global_id(0); + if (i>=numObjects) + return; + + sortData[i].x = FloatFlip(aabbs[i].m_minElems[axis]); + sortData[i].y = i; + +} + + +__kernel void scatterKernel( __global const btAabbCL* aabbs, volatile __global const int2* sortData, __global btAabbCL* sortedAabbs, int numObjects) +{ + int i = get_global_id(0); + if (i>=numObjects) + return; + + sortedAabbs[i] = aabbs[sortData[i].y]; +} diff --git a/opencl/gpu_broadphase/kernels/sapFast.cl b/opencl/gpu_broadphase/kernels/sapFast.cl new file mode 100644 index 000000000..4bf018384 --- /dev/null +++ b/opencl/gpu_broadphase/kernels/sapFast.cl @@ -0,0 +1,161 @@ +/* +Copyright (c) 2012 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Erwin Coumans + + +typedef struct +{ + union + { + float4 m_min; + float m_minElems[4]; + int m_minIndices[4]; + }; + union + { + float4 m_max; + float m_maxElems[4]; + int m_maxIndices[4]; + }; +} btAabbCL; + + +/// conservative test for overlap between two aabbs +bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2); +bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2) +{ +//skip pairs between static (mass=0) objects + if ((aabb1->m_maxIndices[3]==0) && (aabb2->m_maxIndices[3] == 0)) + return false; + + bool overlap = true; + overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap; + overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap; + overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap; + return overlap; +} + + +//computePairsKernelBatchWrite +__kernel void computePairsKernel( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs) +{ + int i = get_global_id(0); + int localId = get_local_id(0); + + __local int numActiveWgItems[1]; + __local int breakRequest[1]; + __local btAabbCL localAabbs[128];// = aabbs[i]; + + int2 myPairs[64]; + + btAabbCL myAabb; + + myAabb = (i=numObjects && !localBreak) + { + atomic_inc(breakRequest); + localBreak = 1; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (!localBreak) + { + if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1])) + { + int2 myPair; + myPair.x = myAabb.m_minIndices[3]; + myPair.y = localAabbs[localCount+localId+1].m_minIndices[3]; + myPairs[curNumPairs] = myPair; + curNumPairs++; + if (curNumPairs==64) + { + int curPair = atomic_add(pairCount,curNumPairs); + //avoid a buffer overrun + if ((curPair+curNumPairs)0) + { + //avoid a buffer overrun + int curPair = atomic_add(pairCount,curNumPairs); + if ((curPair+curNumPairs)m_maxIndices[3]==0) && (aabb2->m_maxIndices[3] == 0))\n" +" return false;\n" +" \n" +" bool overlap = true;\n" +" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n" +" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n" +" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n" +" return overlap;\n" +"}\n" +"\n" +"\n" +"//computePairsKernelBatchWrite\n" +"__kernel void computePairsKernel( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n" +"{\n" +" int i = get_global_id(0);\n" +" int localId = get_local_id(0);\n" +"\n" +" __local int numActiveWgItems[1];\n" +" __local int breakRequest[1];\n" +" __local btAabbCL localAabbs[128];// = aabbs[i];\n" +" \n" +" int2 myPairs[64];\n" +" \n" +" btAabbCL myAabb;\n" +" \n" +" myAabb = (i=numObjects && !localBreak)\n" +" {\n" +" atomic_inc(breakRequest);\n" +" localBreak = 1;\n" +" }\n" +" barrier(CLK_LOCAL_MEM_FENCE);\n" +" \n" +" if (!localBreak)\n" +" {\n" +" if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))\n" +" {\n" +" int2 myPair;\n" +" myPair.x = myAabb.m_minIndices[3];\n" +" myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];\n" +" myPairs[curNumPairs] = myPair;\n" +" curNumPairs++;\n" +" if (curNumPairs==64)\n" +" {\n" +" int curPair = atomic_add(pairCount,curNumPairs);\n" +" //avoid a buffer overrun\n" +" if ((curPair+curNumPairs)0)\n" +" {\n" +" //avoid a buffer overrun\n" +" int curPair = atomic_add(pairCount,curNumPairs);\n" +" if ((curPair+curNumPairs)m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n" +" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n" +" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n" +" return overlap;\n" +"}\n" +"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n" +"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n" +"{\n" +" bool overlap = true;\n" +" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n" +" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n" +" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n" +" return overlap;\n" +"}\n" +"\n" +"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n" +"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n" +"{\n" +" bool overlap = true;\n" +" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n" +" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n" +" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n" +" return overlap;\n" +"}\n" +"\n" +"\n" +"__kernel void computePairsKernelTwoArrays( __global const btAabbCL* unsortedAabbs, __global const btAabbCL* sortedAabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numUnsortedAabbs, int numSortedAabbs, int axis, int maxPairs)\n" +"{\n" +" int i = get_global_id(0);\n" +" if (i>=numUnsortedAabbs)\n" +" return;\n" +"\n" +" int j = get_global_id(1);\n" +" if (j>=numSortedAabbs)\n" +" return;\n" +"\n" +" if (TestAabbAgainstAabb2GlobalGlobal(&unsortedAabbs[i],&sortedAabbs[j]))\n" +" {\n" +" int2 myPair;\n" +" \n" +" myPair.x = unsortedAabbs[i].m_minIndices[3];\n" +" myPair.y = sortedAabbs[j].m_minIndices[3];\n" +"\n" +" int curPair = atomic_inc (pairCount);\n" +" if (curPair=numObjects)\n" +" return;\n" +" for (int j=i+1;j=numObjects && !localBreak)\n" +" {\n" +" atomic_inc(breakRequest);\n" +" localBreak = 1;\n" +" }\n" +" barrier(CLK_LOCAL_MEM_FENCE);\n" +" \n" +" if (!localBreak)\n" +" {\n" +" if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n" +" {\n" +" int2 myPair;\n" +" myPair.x = aabbs[i].m_minIndices[3];\n" +" myPair.y = aabbs[j].m_minIndices[3];\n" +" int curPair = atomic_inc (pairCount);\n" +" if (curPair=numObjects && !localBreak)\n" +" {\n" +" atomic_inc(breakRequest);\n" +" localBreak = 1;\n" +" }\n" +" barrier(CLK_LOCAL_MEM_FENCE);\n" +" \n" +" if (!localBreak)\n" +" {\n" +" if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))\n" +" {\n" +" int2 myPair;\n" +" myPair.x = myAabb.m_minIndices[3];\n" +" myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];\n" +" int curPair = atomic_inc (pairCount);\n" +" if (curPair> 31) | 0x80000000;\n" +" return f ^ mask;\n" +"}\n" +"float IFloatFlip(unsigned int f);\n" +"float IFloatFlip(unsigned int f)\n" +"{\n" +" unsigned int mask = ((f >> 31) - 1) | 0x80000000;\n" +" unsigned int fl = f ^ mask;\n" +" return *(float*)&fl;\n" +"}\n" +"\n" +"\n" +"\n" +"\n" +"__kernel void copyAabbsKernel( __global const btAabbCL* allAabbs, __global btAabbCL* destAabbs, int numObjects)\n" +"{\n" +" int i = get_global_id(0);\n" +" if (i>=numObjects)\n" +" return;\n" +" int src = destAabbs[i].m_maxIndices[3];\n" +" destAabbs[i] = allAabbs[src];\n" +" destAabbs[i].m_maxIndices[3] = src;\n" +"}\n" +"\n" +"\n" +"__kernel void flipFloatKernel( __global const btAabbCL* aabbs, volatile __global int2* sortData, int numObjects, int axis)\n" +"{\n" +" int i = get_global_id(0);\n" +" if (i>=numObjects)\n" +" return;\n" +" \n" +" sortData[i].x = FloatFlip(aabbs[i].m_minElems[axis]);\n" +" sortData[i].y = i;\n" +" \n" +"}\n" +"\n" +"\n" +"__kernel void scatterKernel( __global const btAabbCL* aabbs, volatile __global const int2* sortData, __global btAabbCL* sortedAabbs, int numObjects)\n" +"{\n" +" int i = get_global_id(0);\n" +" if (i>=numObjects)\n" +" return;\n" +"\n" +" sortedAabbs[i] = aabbs[sortData[i].y];\n" +"}\n" +"\n" +; diff --git a/opencl/gpu_broadphase/test/main.cpp b/opencl/gpu_broadphase/test/main.cpp new file mode 100644 index 000000000..b86dd8fe4 --- /dev/null +++ b/opencl/gpu_broadphase/test/main.cpp @@ -0,0 +1,123 @@ +/* +Copyright (c) 2012 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ + + +#include +#include "../basic_initialize/btOpenCLUtils.h" +#include "../host/btGpuSapBroadphase.h" +#include "parallel_primitives/host/btVector3.h" +#include "parallel_primitives/host/btFillCL.h" +#include "parallel_primitives/host/btBoundSearchCL.h" +#include "parallel_primitives/host/btRadixSort32CL.h" +#include "parallel_primitives/host/btPrefixScanCL.h" +#include "parallel_primitives/host/CommandLineArgs.h" + +#include "parallel_primitives/host/btMinMax.h" +int g_nPassed = 0; +int g_nFailed = 0; +bool g_testFailed = 0; + +#define TEST_INIT g_testFailed = 0; +#define TEST_ASSERT(x) if( !(x) ){g_testFailed = 1;} +#define TEST_REPORT(testName) printf("[%s] %s\n",(g_testFailed)?"X":"O", testName); if(g_testFailed) g_nFailed++; else g_nPassed++; +#define NEXTMULTIPLEOF(num, alignment) (((num)/(alignment) + (((num)%(alignment)==0)?0:1))*(alignment)) + +cl_context g_context=0; +cl_device_id g_device=0; +cl_command_queue g_queue =0; +const char* g_deviceName = 0; + +void initCL(int preferredDeviceIndex, int preferredPlatformIndex) +{ + void* glCtx=0; + void* glDC = 0; + int ciErrNum = 0; + //bound search and radix sort only work on GPU right now (assume 32 or 64 width workgroup without barriers) + + cl_device_type deviceType = CL_DEVICE_TYPE_ALL; + + g_context = btOpenCLUtils::createContextFromType(deviceType, &ciErrNum, 0,0,preferredDeviceIndex, preferredPlatformIndex); + oclCHECKERROR(ciErrNum, CL_SUCCESS); + int numDev = btOpenCLUtils::getNumDevices(g_context); + if (numDev>0) + { + btOpenCLDeviceInfo info; + g_device= btOpenCLUtils::getDevice(g_context,0); + g_queue = clCreateCommandQueue(g_context, g_device, 0, &ciErrNum); + oclCHECKERROR(ciErrNum, CL_SUCCESS); + btOpenCLUtils::printDeviceInfo(g_device); + btOpenCLUtils::getDeviceInfo(g_device,&info); + g_deviceName = info.m_deviceName; + } +} + +void exitCL() +{ + clReleaseCommandQueue(g_queue); + clReleaseContext(g_context); +} + + +inline void broadphaseTest() +{ + TEST_INIT; + + btGpuSapBroadphase* sap = new btGpuSapBroadphase(g_context,g_device,g_queue); + int group=1; + int mask=1; + btVector3 aabbMin(0,0,0); + btVector3 aabbMax(1,1,1); + int usrPtr = 1; + sap->createProxy(aabbMin,aabbMax,usrPtr,group,mask); + +// aabbMin.setValue(2,2,2); +// aabbMax.setValue(3,3,3); + usrPtr = 2; + sap->createProxy(aabbMin,aabbMax,usrPtr,group,mask); + sap->writeAabbsToGpu(); + + sap->calculateOverlappingPairs(); + + int numOverlap = sap->getNumOverlap(); + cl_mem buf = sap->getOverlappingPairBuffer(); + + TEST_ASSERT(numOverlap==1); + + delete sap; + + TEST_REPORT( "broadphaseTest" ); +} + +int main(int argc, char** argv) +{ + int preferredDeviceIndex = -1; + int preferredPlatformIndex = -1; + + CommandLineArgs args(argc, argv); + args.GetCmdLineArgument("deviceId", preferredDeviceIndex); + args.GetCmdLineArgument("platformId", preferredPlatformIndex); + + initCL(preferredDeviceIndex,preferredPlatformIndex); + + + broadphaseTest(); + + printf("%d tests passed, %d tests failed\n",g_nPassed, g_nFailed); + printf("End, press \n"); + getchar(); + + exitCL(); + +} + diff --git a/opencl/gpu_broadphase/test/premake4.lua b/opencl/gpu_broadphase/test/premake4.lua new file mode 100644 index 000000000..2420bf9ce --- /dev/null +++ b/opencl/gpu_broadphase/test/premake4.lua @@ -0,0 +1,46 @@ +function createProject(vendor) + hasCL = findOpenCL(vendor) + + if (hasCL) then + + project ("OpenCL_broadphase_test_" .. vendor) + + initOpenCL(vendor) + + language "C++" + + kind "ConsoleApp" + targetdir "../../../bin" + includedirs {"..","../.."} + + + files { + "main.cpp", + "../../basic_initialize/btOpenCLInclude.h", + "../../basic_initialize/btOpenCLUtils.cpp", + "../../basic_initialize/btOpenCLUtils.h", + "../host/btGpuSapBroadphase.cpp", + "../host/btGpuSapBroadphase.h", + "../../parallel_primitives/host/btFillCL.cpp", + "../../parallel_primitives/host/btFillCL.h", + "../../parallel_primitives/host/btBoundSearchCL.cpp", + "../../parallel_primitives/host/btBoundSearchCL.h", + "../../parallel_primitives/host/btPrefixScanCL.cpp", + "../../parallel_primitives/host/btPrefixScanCL.h", + "../../parallel_primitives/host/btRadixSort32CL.cpp", + "../../parallel_primitives/host/btRadixSort32CL.h", + "../../parallel_primitives/host/btAlignedAllocator.cpp", + "../../parallel_primitives/host/btAlignedAllocator.h", + "../../parallel_primitives/host/btAlignedObjectArray.h", + "../../parallel_primitives/host/btQuickprof.cpp", + "../../parallel_primitives/host/btQuickprof.h", + + } + + end +end + +createProject("AMD") +createProject("Intel") +createProject("NVIDIA") +createProject("Apple") \ No newline at end of file diff --git a/opencl/parallel_primitives/benchmark/test_large_problem_sorting.cpp b/opencl/parallel_primitives/benchmark/test_large_problem_sorting.cpp index b3629c3f8..05ad99162 100644 --- a/opencl/parallel_primitives/benchmark/test_large_problem_sorting.cpp +++ b/opencl/parallel_primitives/benchmark/test_large_problem_sorting.cpp @@ -652,6 +652,8 @@ int main( int argc, char** argv) printf("Initialize OpenCL using btOpenCLUtils_createContextFromType\n"); cl_platform_id platformId; g_cxMainContext = btOpenCLUtils_createContextFromType(CL_DEVICE_TYPE_ALL, &ciErrNum, 0, 0,gPreferredDeviceId,gPreferredPlatformId,&platformId); +// g_cxMainContext = btOpenCLUtils_createContextFromType(CL_DEVICE_TYPE_GPU, &ciErrNum, 0, 0,gPreferredDeviceId,gPreferredPlatformId,&platformId); + oclCHECKERROR(ciErrNum, CL_SUCCESS); int numDev = btOpenCLUtils_getNumDevices(g_cxMainContext); @@ -674,7 +676,7 @@ int main( int argc, char** argv) //srand(time(NULL)); srand(0); // presently deterministic - unsigned int num_elements = 32*1024*1024;//4*1024*1024;//4*1024*1024;//257;//8*524288;//2048;//512;//524288; + unsigned int num_elements = 8*1024*1024;//4*1024*1024;//4*1024*1024;//257;//8*524288;//2048;//512;//524288; unsigned int iterations = 10; bool keys_only = true;