diff --git a/Demos3/GpuDemos/GpuDemo.h b/Demos3/GpuDemos/GpuDemo.h index 9095383f6..03d6d7ef3 100644 --- a/Demos3/GpuDemos/GpuDemo.h +++ b/Demos3/GpuDemos/GpuDemo.h @@ -37,7 +37,7 @@ public: class b3gWindowInterface* m_window; class GwenUserInterface* m_gui; - + ConstructionInfo() :useOpenCL(true), preferredOpenCLPlatformIndex(-1), diff --git a/build3/stringify.bat b/build3/stringify.bat index 670db8ed5..5680be14c 100644 --- a/build3/stringify.bat +++ b/build3/stringify.bat @@ -5,6 +5,8 @@ rem @echo off premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32Kernels.cl" --headerfile="../src/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32KernelsCL.h" --stringname="radixSort32KernelsCL" stringify premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernels.cl" --headerfile="../src/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernelsCL.h" --stringname="boundSearchKernelsCL" stringify premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernels.cl" --headerfile="../src/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsCL.h" --stringname="prefixScanKernelsCL" stringify +premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanFloat4Kernels.cl" --headerfile="../src/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsFloat4CL.h" --stringname="prefixScanKernelsFloat4CL" stringify + premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernels.cl" --headerfile="../src/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernelsCL.h" --stringname="fillKernelsCL" stringify premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl" --headerfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h" --stringname="sapCL" stringify diff --git a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp index 22756c685..6678008f3 100644 --- a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp +++ b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp @@ -2,6 +2,8 @@ #include "b3GpuSapBroadphase.h" #include "Bullet3Common/b3Vector3.h" #include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.h" + #include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h" #include "kernels/sapKernels.h" @@ -21,6 +23,9 @@ m_largeAabbsGPU(ctx,q), m_overlappingPairs(ctx,q), m_gpuSmallSortData(ctx,q), m_gpuSmallSortedAabbs(ctx,q), +m_sum(ctx,q), +m_sum2(ctx,q), +m_dst(ctx,q), m_currentBuffer(-1) { const char* sapSrc = sapCL; @@ -33,7 +38,7 @@ m_currentBuffer(-1) cl_program sapFastProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,sapFastSrc,&errNum,"",B3_BROADPHASE_SAPFAST_PATH); b3Assert(errNum==CL_SUCCESS); - + m_prefixScanFloat4 = new b3PrefixScanFloat4CL(m_context,m_device,m_queue); //m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelOriginal",&errNum,sapProg ); //m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelBarrier",&errNum,sapProg ); //m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelLocalSharedMemory",&errNum,sapProg ); @@ -42,6 +47,10 @@ m_currentBuffer(-1) m_sap2Kernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelTwoArrays",&errNum,sapProg ); b3Assert(errNum==CL_SUCCESS); + m_prepareSumVarianceKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "prepareSumVarianceKernel",&errNum,sapProg ); + b3Assert(errNum==CL_SUCCESS); + + #if 0 m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelOriginal",&errNum,sapProg ); @@ -68,11 +77,14 @@ m_currentBuffer(-1) b3GpuSapBroadphase::~b3GpuSapBroadphase() { delete m_sorter; + delete m_prefixScanFloat4; + clReleaseKernel(m_scatterKernel); clReleaseKernel(m_flipFloatKernel); clReleaseKernel(m_copyAabbsKernel); clReleaseKernel(m_sapKernel); clReleaseKernel(m_sap2Kernel); + clReleaseKernel(m_prepareSumVarianceKernel); } @@ -155,22 +167,26 @@ void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap() } + + + void b3GpuSapBroadphase::calculateOverlappingPairsHost(int maxPairs) { //test //if (m_currentBuffer>=0) // calculateOverlappingPairsHostIncremental3Sap(); - int axis=0; - b3Assert(m_allAabbsCPU.size() == m_allAabbsGPU.size()); - + m_allAabbsGPU.copyToHost(m_allAabbsCPU); + + + + //m_data->m_broadphaseSap->calculateOverlappingPairs(m_data->m_config.m_maxBroadphasePairs); - m_allAabbsGPU.copyToHost(m_allAabbsCPU); - + int numSmallAabbs = m_smallAabbsCPU.size(); { - int numSmallAabbs = m_smallAabbsCPU.size(); + for (int j=0;j v[0]) + axis = 1; + if(v[2] > v[axis]) + axis = 2; + } + + { int numLargeAabbs = m_largeAabbsCPU.size(); for (int j=0;jexecute(m_sum,m_dst,numSmallAabbs+1,&s); + m_prefixScanFloat4->execute(m_sum2,m_dst,numSmallAabbs+1,&s2); + + b3Vector3 v = s2 - (s*s) / (float)numSmallAabbs; + + if(v[1] > v[0]) + axis = 1; + if(v[2] > v[axis]) + axis = 2; + } + + if (syncOnHost) { B3_PROFILE("Synchronize m_largeAabbsGPU (CPU/slow)"); @@ -360,7 +436,7 @@ void b3GpuSapBroadphase::calculateOverlappingPairs(int maxPairs) - B3_PROFILE("GPU SAP"); + int numSmallAabbs = m_smallAabbsGPU.size(); m_gpuSmallSortData.resize(numSmallAabbs); diff --git a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h index cf5709435..177117ad0 100644 --- a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h +++ b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h @@ -21,6 +21,7 @@ class b3GpuSapBroadphase cl_kernel m_copyAabbsKernel; cl_kernel m_sapKernel; cl_kernel m_sap2Kernel; + cl_kernel m_prepareSumVarianceKernel; class b3RadixSort32CL* m_sorter; @@ -33,6 +34,10 @@ class b3GpuSapBroadphase b3OpenCLArray m_allAabbsGPU; b3AlignedObjectArray m_allAabbsCPU; + b3OpenCLArray m_sum; + b3OpenCLArray m_sum2; + b3OpenCLArray m_dst; + b3OpenCLArray m_smallAabbsGPU; b3AlignedObjectArray m_smallAabbsCPU; @@ -45,6 +50,7 @@ class b3GpuSapBroadphase b3OpenCLArray m_gpuSmallSortData; b3OpenCLArray m_gpuSmallSortedAabbs; + class b3PrefixScanFloat4CL* m_prefixScanFloat4; b3GpuSapBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q ); virtual ~b3GpuSapBroadphase(); diff --git a/src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl b/src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl index a744fd347..4d94f9cdf 100644 --- a/src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl +++ b/src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl @@ -318,3 +318,16 @@ __kernel void scatterKernel( __global const btAabbCL* aabbs, volatile __global sortedAabbs[i] = aabbs[sortData[i].y]; } + + + +__kernel void prepareSumVarianceKernel( __global const btAabbCL* aabbs, __global float4* sum, __global float4* sum2,int numAabbs) +{ + int i = get_global_id(0); + if (i>numAabbs) + return; + float4 s; + s = (aabbs[i].m_max+aabbs[i].m_min)*0.5f; + sum[i]=s; + sum2[i]=s*s; +} diff --git a/src/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h b/src/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h index 30a28d16f..172ed6d67 100644 --- a/src/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h +++ b/src/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h @@ -321,4 +321,17 @@ static const char* sapCL= \ " sortedAabbs[i] = aabbs[sortData[i].y];\n" "}\n" "\n" +"\n" +"\n" +"__kernel void prepareSumVarianceKernel( __global const btAabbCL* aabbs, __global float4* sum, __global float4* sum2,int numAabbs)\n" +"{\n" +" int i = get_global_id(0);\n" +" if (i>numAabbs)\n" +" return;\n" +" float4 s;\n" +" s = (aabbs[i].m_max+aabbs[i].m_min)*0.5f;\n" +" sum[i]=s;\n" +" sum2[i]=s*s; \n" +"}\n" +"\n" ; diff --git a/src/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.cpp b/src/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.cpp new file mode 100644 index 000000000..06ac95d93 --- /dev/null +++ b/src/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.cpp @@ -0,0 +1,126 @@ +#include "b3PrefixScanFloat4CL.h" +#include "b3FillCL.h" +#define B3_PREFIXSCAN_FLOAT4_PROG_PATH "src/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanFloat4Kernels.cl" + +#include "b3LauncherCL.h" +#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h" +#include "kernels/PrefixScanKernelsFloat4CL.h" + +b3PrefixScanFloat4CL::b3PrefixScanFloat4CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int size) +:m_commandQueue(queue) +{ + const char* scanKernelSource = prefixScanKernelsFloat4CL; + cl_int pErrNum; + char* additionalMacros=0; + + m_workBuffer = new b3OpenCLArray(ctx,queue,size); + cl_program scanProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, scanKernelSource, &pErrNum,additionalMacros, B3_PREFIXSCAN_FLOAT4_PROG_PATH); + b3Assert(scanProg); + + m_localScanKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "LocalScanKernel", &pErrNum, scanProg,additionalMacros ); + b3Assert(m_localScanKernel ); + m_blockSumKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "TopLevelScanKernel", &pErrNum, scanProg,additionalMacros ); + b3Assert(m_blockSumKernel ); + m_propagationKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "AddOffsetKernel", &pErrNum, scanProg,additionalMacros ); + b3Assert(m_propagationKernel ); +} + + +b3PrefixScanFloat4CL::~b3PrefixScanFloat4CL() +{ + delete m_workBuffer; + clReleaseKernel(m_localScanKernel); + clReleaseKernel(m_blockSumKernel); + clReleaseKernel(m_propagationKernel); +} + +template +T b3NextPowerOf2(T n) +{ + n -= 1; + for(int i=0; i>i); + return n+1; +} + +void b3PrefixScanFloat4CL::execute(b3OpenCLArray& src, b3OpenCLArray& dst, int n, b3Vector3* sum) +{ + +// b3Assert( data->m_option == EXCLUSIVE ); + const unsigned int numBlocks = (const unsigned int)( (n+BLOCK_SIZE*2-1)/(BLOCK_SIZE*2) ); + + dst.resize(src.size()); + m_workBuffer->resize(src.size()); + + b3Int4 constBuffer; + constBuffer.x = n; + constBuffer.y = numBlocks; + constBuffer.z = (int)b3NextPowerOf2( numBlocks ); + + b3OpenCLArray* srcNative = &src; + b3OpenCLArray* dstNative = &dst; + + { + b3BufferInfoCL bInfo[] = { b3BufferInfoCL( dstNative->getBufferCL() ), b3BufferInfoCL( srcNative->getBufferCL() ), b3BufferInfoCL( m_workBuffer->getBufferCL() ) }; + + b3LauncherCL launcher( m_commandQueue, m_localScanKernel ); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( constBuffer ); + launcher.launch1D( numBlocks*BLOCK_SIZE, BLOCK_SIZE ); + } + + { + b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_workBuffer->getBufferCL() ) }; + + b3LauncherCL launcher( m_commandQueue, m_blockSumKernel ); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( constBuffer ); + launcher.launch1D( BLOCK_SIZE, BLOCK_SIZE ); + } + + + if( numBlocks > 1 ) + { + b3BufferInfoCL bInfo[] = { b3BufferInfoCL( dstNative->getBufferCL() ), b3BufferInfoCL( m_workBuffer->getBufferCL() ) }; + b3LauncherCL launcher( m_commandQueue, m_propagationKernel ); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( constBuffer ); + launcher.launch1D( (numBlocks-1)*BLOCK_SIZE, BLOCK_SIZE ); + } + + + if( sum ) + { + clFinish(m_commandQueue); + dstNative->copyToHostPointer(sum,1,n-1,true); + } + +} + + +void b3PrefixScanFloat4CL::executeHost(b3AlignedObjectArray& src, b3AlignedObjectArray& dst, int n, b3Vector3* sum) +{ + b3Vector3 s(0,0,0); + //if( data->m_option == EXCLUSIVE ) + { + for(int i=0; i* m_workBuffer; + + + public: + + b3PrefixScanFloat4CL(cl_context ctx, cl_device_id device, cl_command_queue queue,int size=0); + + virtual ~b3PrefixScanFloat4CL(); + + void execute(b3OpenCLArray& src, b3OpenCLArray& dst, int n, b3Vector3* sum = 0); + void executeHost(b3AlignedObjectArray& src, b3AlignedObjectArray& dst, int n, b3Vector3* sum); +}; + +#endif //B3_PREFIX_SCAN_CL_H diff --git a/src/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanFloat4Kernels.cl b/src/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanFloat4Kernels.cl new file mode 100644 index 000000000..c9da79854 --- /dev/null +++ b/src/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanFloat4Kernels.cl @@ -0,0 +1,154 @@ +/* +Copyright (c) 2012 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Takahiro Harada + + +typedef unsigned int u32; +#define GET_GROUP_IDX get_group_id(0) +#define GET_LOCAL_IDX get_local_id(0) +#define GET_GLOBAL_IDX get_global_id(0) +#define GET_GROUP_SIZE get_local_size(0) +#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE) + +// takahiro end +#define WG_SIZE 128 +#define m_numElems x +#define m_numBlocks y +#define m_numScanBlocks z + +/*typedef struct +{ + uint m_numElems; + uint m_numBlocks; + uint m_numScanBlocks; + uint m_padding[1]; +} ConstBuffer; +*/ + +float4 ScanExclusiveFloat4(__local float4* data, u32 n, int lIdx, int lSize) +{ + float4 blocksum; + int offset = 1; + for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1) + { + GROUP_LDS_BARRIER; + for(int iIdx=lIdx; iIdx>= 1; + for(int nActive=1; nActive>=1 ) + { + GROUP_LDS_BARRIER; + for( int iIdx = lIdx; iIdx>1; nActive>0; nActive>>=1, offset<<=1)\n" +" {\n" +" GROUP_LDS_BARRIER;\n" +" for(int iIdx=lIdx; iIdx>= 1;\n" +" for(int nActive=1; nActive>=1 )\n" +" {\n" +" GROUP_LDS_BARRIER;\n" +" for( int iIdx = lIdx; iIdxm_broadphaseSap->calculateOverlappingPairs(m_data->m_config.m_maxBroadphasePairs); + //m_data->m_broadphaseSap->calculateOverlappingPairsHost(m_data->m_config.m_maxBroadphasePairs); + numPairs = m_data->m_broadphaseSap->getNumOverlap(); } }