For OSX: cd build ./premake_osx xcode4 for iOS: cd build ./ios_build.sh ./ios_run.sh Also integrated the branches/StackAllocation to make it easier to multi-thread collision detection in the near future. It avoids changing the btCollisionObject while performing collision detection. As this is a large patch, some stuff might be temporarily broken, I'll keep an eye out on issues.
696 lines
22 KiB
C++
696 lines
22 KiB
C++
/*
|
|
Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
|
|
Copyright (C) 2006 - 2009 Sony Computer Entertainment Inc.
|
|
|
|
This software is provided 'as-is', without any express or implied warranty.
|
|
In no event will the authors be held liable for any damages arising from the use of this software.
|
|
Permission is granted to anyone to use this software for any purpose,
|
|
including commercial applications, and to alter it and redistribute it freely,
|
|
subject to the following restrictions:
|
|
|
|
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
|
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
|
3. This notice may not be removed or altered from any source distribution.
|
|
*/
|
|
|
|
|
|
#include "LinearMath/btAlignedAllocator.h"
|
|
#include "LinearMath/btQuickprof.h"
|
|
#include "BulletCollision/BroadphaseCollision/btOverlappingPairCache.h"
|
|
#include "../basic_initialize/btOpenCLUtils.h"
|
|
|
|
#include "bt3dGridBroadphaseOCL.h"
|
|
|
|
#include <stdio.h>
|
|
#include <string.h>
|
|
#include "Adl/Adl.h"
|
|
#include <AdlPrimitives/Scan/PrefixScan.h>
|
|
#include <AdlPrimitives/Sort/RadixSort32.h>
|
|
#include <AdlPrimitives/Sort/RadixSort.h>
|
|
|
|
#define ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
|
|
|
|
#define GRID_OCL_PATH "..\\..\\opencl\\3dGridBroadphase\\Shared\\bt3dGridBroadphaseOCL.cl"
|
|
|
|
|
|
#define MSTRINGIFY(A) #A
|
|
|
|
static const char* spProgramSource =
|
|
#include "bt3dGridBroadphaseOCL.cl"
|
|
|
|
adl::PrefixScan<adl::TYPE_CL>::Data* gData1=0;
|
|
adl::Buffer<unsigned int>* m_srcClBuffer=0;
|
|
|
|
struct MySortData
|
|
{
|
|
int key;
|
|
int value;
|
|
};
|
|
|
|
adl::RadixSort32<adl::TYPE_CL>::Data* dataC = 0;
|
|
adl::RadixSort<adl::TYPE_HOST>::Data* dataHost = 0;
|
|
|
|
|
|
static unsigned int infElem = 0x2fffffff;
|
|
|
|
static unsigned int zeroEl = 0;
|
|
static unsigned int minusOne= -1;
|
|
|
|
|
|
bt3dGridBroadphaseOCL::bt3dGridBroadphaseOCL( btOverlappingPairCache* overlappingPairCache,
|
|
const btVector3& cellSize,
|
|
int gridSizeX, int gridSizeY, int gridSizeZ,
|
|
int maxSmallProxies, int maxLargeProxies, int maxPairsPerSmallProxy,
|
|
btScalar maxSmallProxySize,
|
|
int maxSmallProxiesPerCell,
|
|
cl_context context, cl_device_id device, cl_command_queue queue,
|
|
adl::DeviceCL* deviceCL
|
|
) :
|
|
btGpu3DGridBroadphase(overlappingPairCache, cellSize, gridSizeX, gridSizeY, gridSizeZ, maxSmallProxies, maxLargeProxies, maxPairsPerSmallProxy, maxSmallProxySize, maxSmallProxiesPerCell)
|
|
{
|
|
|
|
|
|
initCL(context, device, queue);
|
|
allocateBuffers();
|
|
|
|
prefillBuffers();
|
|
|
|
initKernels();
|
|
|
|
//create an Adl device host and OpenCL device
|
|
|
|
adl::DeviceUtils::Config cfg;
|
|
m_deviceHost = adl::DeviceUtils::allocate( adl::TYPE_HOST, cfg );
|
|
m_ownsDevice = false;
|
|
if (!deviceCL)
|
|
{
|
|
m_ownsDevice = true;
|
|
deviceCL = new adl::DeviceCL;
|
|
deviceCL->m_context = context;
|
|
deviceCL->m_deviceIdx = device;
|
|
deviceCL->m_commandQueue = queue;
|
|
deviceCL->m_kernelManager = new adl::KernelManager;
|
|
}
|
|
|
|
m_deviceCL = deviceCL;
|
|
|
|
int minSize = 256*1024;
|
|
int maxSortBuffer = maxSmallProxies < minSize ? minSize :maxSmallProxies;
|
|
|
|
m_srcClBuffer = new adl::Buffer<unsigned int> (m_deviceCL,maxSmallProxies+2);
|
|
m_srcClBuffer->write(&zeroEl,1,0);
|
|
|
|
//m_srcClBuffer->write(&infElem,maxSmallProxies,0);
|
|
m_srcClBuffer->write(&infElem,1,maxSmallProxies);
|
|
m_srcClBuffer->write(&zeroEl,1,maxSmallProxies+1);
|
|
m_deviceCL->waitForCompletion();
|
|
|
|
gData1 = adl::PrefixScan<adl::TYPE_CL>::allocate( m_deviceCL, maxSortBuffer+2,adl::PrefixScanBase::EXCLUSIVE );
|
|
dataHost = adl::RadixSort<adl::TYPE_HOST>::allocate( m_deviceHost, maxSmallProxies+2 );
|
|
dataC = adl::RadixSort32<adl::TYPE_CL>::allocate( m_deviceCL, maxSortBuffer+2 );
|
|
|
|
}
|
|
|
|
|
|
|
|
bt3dGridBroadphaseOCL::~bt3dGridBroadphaseOCL()
|
|
{
|
|
//btSimpleBroadphase will free memory of btSortedOverlappingPairCache, because m_ownsPairCache
|
|
assert(m_bInitialized);
|
|
adl::RadixSort<adl::TYPE_HOST>::deallocate(dataHost);
|
|
adl::PrefixScan<adl::TYPE_CL>::deallocate(gData1);
|
|
adl::RadixSort32<adl::TYPE_CL>::deallocate(dataC);
|
|
adl::DeviceUtils::deallocate(m_deviceHost);
|
|
delete m_srcClBuffer;
|
|
if (m_ownsDevice)
|
|
{
|
|
delete m_deviceCL->m_kernelManager;
|
|
delete m_deviceCL;
|
|
}
|
|
}
|
|
|
|
#ifdef CL_PLATFORM_MINI_CL
|
|
// there is a problem with MSVC9 : static constructors are not called if variables defined in library and are not used
|
|
// looks like it is because of optimization
|
|
// probably this will happen with other compilers as well
|
|
// so to make it robust, register kernels again (it is safe)
|
|
#define MINICL_DECLARE(a) extern "C" void a();
|
|
MINICL_DECLARE(kCalcHashAABB)
|
|
MINICL_DECLARE(kClearCellStart)
|
|
MINICL_DECLARE(kFindCellStart)
|
|
MINICL_DECLARE(kFindOverlappingPairs)
|
|
MINICL_DECLARE(kFindPairsLarge)
|
|
MINICL_DECLARE(kComputePairCacheChanges)
|
|
MINICL_DECLARE(kSqueezeOverlappingPairBuff)
|
|
#undef MINICL_DECLARE
|
|
#endif
|
|
|
|
void bt3dGridBroadphaseOCL::initCL(cl_context context, cl_device_id device, cl_command_queue queue)
|
|
{
|
|
|
|
#ifdef CL_PLATFORM_MINI_CL
|
|
// call constructors here
|
|
MINICL_REGISTER(kCalcHashAABB)
|
|
MINICL_REGISTER(kClearCellStart)
|
|
MINICL_REGISTER(kFindCellStart)
|
|
MINICL_REGISTER(kFindOverlappingPairs)
|
|
MINICL_REGISTER(kFindPairsLarge)
|
|
MINICL_REGISTER(kComputePairCacheChanges)
|
|
MINICL_REGISTER(kSqueezeOverlappingPairBuff)
|
|
#endif
|
|
|
|
cl_int ciErrNum;
|
|
|
|
btAssert(context);
|
|
m_cxMainContext = context;
|
|
btAssert(device);
|
|
m_cdDevice = device;
|
|
btAssert(queue);
|
|
m_cqCommandQue = queue;
|
|
|
|
//adl::Kernel kern = m_deviceCL->getKernel(fileName,funcName,options,src);
|
|
|
|
m_cpProgram = btOpenCLUtils::compileCLProgramFromString(m_cxMainContext,m_cdDevice,spProgramSource, &ciErrNum,"-DGUID_ARG=""""",GRID_OCL_PATH);
|
|
|
|
printf("OK\n");
|
|
}
|
|
|
|
|
|
void bt3dGridBroadphaseOCL::initKernels()
|
|
{
|
|
initKernel(GRID3DOCL_KERNEL_CALC_HASH_AABB, "kCalcHashAABB");
|
|
setKernelArg(GRID3DOCL_KERNEL_CALC_HASH_AABB, 1, sizeof(cl_mem),(void*)&m_dAABB);
|
|
setKernelArg(GRID3DOCL_KERNEL_CALC_HASH_AABB, 2, sizeof(cl_mem),(void*)&m_dBodiesHash);
|
|
setKernelArg(GRID3DOCL_KERNEL_CALC_HASH_AABB, 3, sizeof(cl_mem),(void*)&m_dBpParams);
|
|
|
|
initKernel(GRID3DOCL_KERNEL_CLEAR_CELL_START, "kClearCellStart");
|
|
setKernelArg(GRID3DOCL_KERNEL_CLEAR_CELL_START, 1, sizeof(cl_mem),(void*)&m_dCellStart);
|
|
|
|
initKernel(GRID3DOCL_KERNEL_FIND_CELL_START, "kFindCellStart");
|
|
setKernelArg(GRID3DOCL_KERNEL_FIND_CELL_START, 1, sizeof(cl_mem),(void*)&m_dBodiesHash);
|
|
setKernelArg(GRID3DOCL_KERNEL_FIND_CELL_START, 2, sizeof(cl_mem),(void*)&m_dCellStart);
|
|
|
|
initKernel(GRID3DOCL_KERNEL_FIND_OVERLAPPING_PAIRS, "kFindOverlappingPairs");
|
|
setKernelArg(GRID3DOCL_KERNEL_FIND_OVERLAPPING_PAIRS, 1, sizeof(cl_mem),(void*)&m_dAABB);
|
|
setKernelArg(GRID3DOCL_KERNEL_FIND_OVERLAPPING_PAIRS, 2, sizeof(cl_mem),(void*)&m_dBodiesHash);
|
|
setKernelArg(GRID3DOCL_KERNEL_FIND_OVERLAPPING_PAIRS, 3, sizeof(cl_mem),(void*)&m_dCellStart);
|
|
setKernelArg(GRID3DOCL_KERNEL_FIND_OVERLAPPING_PAIRS, 4, sizeof(cl_mem),(void*)&m_dPairBuff);
|
|
setKernelArg(GRID3DOCL_KERNEL_FIND_OVERLAPPING_PAIRS, 5, sizeof(cl_mem),(void*)&m_dPairBuffStartCurr);
|
|
setKernelArg(GRID3DOCL_KERNEL_FIND_OVERLAPPING_PAIRS, 6, sizeof(cl_mem),(void*)&m_dBpParams);
|
|
|
|
initKernel(GRID3DOCL_KERNEL_FIND_PAIRS_LARGE, "kFindPairsLarge");
|
|
setKernelArg(GRID3DOCL_KERNEL_FIND_PAIRS_LARGE, 1, sizeof(cl_mem),(void*)&m_dAABB);
|
|
setKernelArg(GRID3DOCL_KERNEL_FIND_PAIRS_LARGE, 2, sizeof(cl_mem),(void*)&m_dBodiesHash);
|
|
setKernelArg(GRID3DOCL_KERNEL_FIND_PAIRS_LARGE, 3, sizeof(cl_mem),(void*)&m_dCellStart);
|
|
setKernelArg(GRID3DOCL_KERNEL_FIND_PAIRS_LARGE, 4, sizeof(cl_mem),(void*)&m_dPairBuff);
|
|
setKernelArg(GRID3DOCL_KERNEL_FIND_PAIRS_LARGE, 5, sizeof(cl_mem),(void*)&m_dPairBuffStartCurr);
|
|
|
|
initKernel(GRID3DOCL_KERNEL_COMPUTE_CACHE_CHANGES, "kComputePairCacheChanges");
|
|
setKernelArg(GRID3DOCL_KERNEL_COMPUTE_CACHE_CHANGES, 1, sizeof(cl_mem),(void*)&m_dPairBuff);
|
|
setKernelArg(GRID3DOCL_KERNEL_COMPUTE_CACHE_CHANGES, 2, sizeof(cl_mem),(void*)&m_dPairBuffStartCurr);
|
|
setKernelArg(GRID3DOCL_KERNEL_COMPUTE_CACHE_CHANGES, 3, sizeof(cl_mem),(void*)&m_dPairScanChanged);
|
|
setKernelArg(GRID3DOCL_KERNEL_COMPUTE_CACHE_CHANGES, 4, sizeof(cl_mem),(void*)&m_dAABB);
|
|
|
|
initKernel(GRID3DOCL_KERNEL_SQUEEZE_PAIR_BUFF, "kSqueezeOverlappingPairBuff");
|
|
setKernelArg(GRID3DOCL_KERNEL_SQUEEZE_PAIR_BUFF, 1, sizeof(cl_mem),(void*)&m_dPairBuff);
|
|
setKernelArg(GRID3DOCL_KERNEL_SQUEEZE_PAIR_BUFF, 2, sizeof(cl_mem),(void*)&m_dPairBuffStartCurr);
|
|
setKernelArg(GRID3DOCL_KERNEL_SQUEEZE_PAIR_BUFF, 3, sizeof(cl_mem),(void*)&m_dPairScanChanged);
|
|
setKernelArg(GRID3DOCL_KERNEL_SQUEEZE_PAIR_BUFF, 4, sizeof(cl_mem),(void*)&m_dPairsChanged);
|
|
setKernelArg(GRID3DOCL_KERNEL_SQUEEZE_PAIR_BUFF, 5, sizeof(cl_mem),(void*)&m_dAABB);
|
|
|
|
}
|
|
|
|
|
|
void bt3dGridBroadphaseOCL::allocateBuffers()
|
|
{
|
|
cl_int ciErrNum;
|
|
unsigned int memSize;
|
|
// current version of bitonic sort works for power of 2 arrays only, so ...
|
|
m_hashSize = 1;
|
|
for(int bit = 1; bit < 32; bit++)
|
|
{
|
|
if(m_hashSize >= m_maxHandles)
|
|
{
|
|
break;
|
|
}
|
|
m_hashSize <<= 1;
|
|
}
|
|
memSize = m_hashSize * 2 * sizeof(unsigned int);
|
|
if (memSize < 1024*1024)
|
|
memSize = 1024*1024;
|
|
|
|
m_dBodiesHash = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, memSize, NULL, &ciErrNum);
|
|
GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
|
|
|
|
memSize = m_numCells * sizeof(unsigned int);
|
|
m_dCellStart = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, memSize, NULL, &ciErrNum);
|
|
GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
|
|
|
|
memSize = m_maxHandles * m_maxPairsPerBody * sizeof(unsigned int);
|
|
m_dPairBuff = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, memSize, NULL, &ciErrNum);
|
|
GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
|
|
|
|
memSize = (m_maxHandles * 2 + 1) * sizeof(unsigned int);
|
|
m_dPairBuffStartCurr = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, memSize, NULL, &ciErrNum);
|
|
GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
|
|
|
|
unsigned int numAABB = m_maxHandles + m_maxLargeHandles;
|
|
memSize = numAABB * sizeof(float) * 4 * 2;
|
|
m_dAABB = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, memSize, NULL, &ciErrNum);
|
|
GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
|
|
|
|
memSize = (m_maxHandles + 2) * sizeof(unsigned int);
|
|
m_dPairScanChanged = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, memSize, NULL, &ciErrNum);
|
|
GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
|
|
|
|
memSize = m_maxHandles * m_maxPairsPerBody * sizeof(unsigned int);
|
|
m_dPairsChanged = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, memSize, NULL, &ciErrNum);
|
|
GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
|
|
|
|
|
|
memSize = 3 * 4 * sizeof(float);
|
|
m_dBpParams = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, memSize, NULL, &ciErrNum);
|
|
GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
|
|
}
|
|
|
|
void bt3dGridBroadphaseOCL::prefillBuffers()
|
|
{
|
|
memset(m_hBodiesHash, 0xFF, m_maxHandles*2*sizeof(unsigned int));
|
|
copyArrayToDevice(m_dBodiesHash, m_hBodiesHash, m_maxHandles * 2 * sizeof(unsigned int));
|
|
// now fill the rest (bitonic sorting works with size == pow of 2)
|
|
int remainder = m_hashSize - m_maxHandles;
|
|
if(remainder)
|
|
{
|
|
copyArrayToDevice(m_dBodiesHash, m_hBodiesHash, remainder * 2 * sizeof(unsigned int), m_maxHandles * 2 * sizeof(unsigned int), 0);
|
|
}
|
|
copyArrayToDevice(m_dPairBuffStartCurr, m_hPairBuffStartCurr, (m_maxHandles * 2 + 1) * sizeof(unsigned int));
|
|
memset(m_hPairBuff, 0x00, m_maxHandles * m_maxPairsPerBody * sizeof(unsigned int));
|
|
copyArrayToDevice(m_dPairBuff, m_hPairBuff, m_maxHandles * m_maxPairsPerBody * sizeof(unsigned int));
|
|
}
|
|
|
|
|
|
void bt3dGridBroadphaseOCL::initKernel(int kernelId, char* pName)
|
|
{
|
|
|
|
cl_int ciErrNum;
|
|
cl_kernel kernel = clCreateKernel(m_cpProgram, pName, &ciErrNum);
|
|
GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
|
|
size_t wgSize;
|
|
ciErrNum = clGetKernelWorkGroupInfo(kernel, m_cdDevice, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wgSize, NULL);
|
|
GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
|
|
m_kernels[kernelId].m_Id = kernelId;
|
|
m_kernels[kernelId].m_kernel = kernel;
|
|
m_kernels[kernelId].m_name = pName;
|
|
m_kernels[kernelId].m_workgroupSize = (int)wgSize;
|
|
return;
|
|
}
|
|
|
|
void bt3dGridBroadphaseOCL::runKernelWithWorkgroupSize(int kernelId, int globalSize)
|
|
{
|
|
if(globalSize <= 0)
|
|
{
|
|
return;
|
|
}
|
|
cl_kernel kernelFunc = m_kernels[kernelId].m_kernel;
|
|
cl_int ciErrNum = clSetKernelArg(kernelFunc, 0, sizeof(int), (void*)&globalSize);
|
|
GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
|
|
int workgroupSize = btMin(64,m_kernels[kernelId].m_workgroupSize);
|
|
|
|
if(workgroupSize <= 0)
|
|
{ // let OpenCL library calculate workgroup size
|
|
size_t globalWorkSize[2];
|
|
globalWorkSize[0] = globalSize;
|
|
globalWorkSize[1] = 1;
|
|
ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue, kernelFunc, 1, NULL, globalWorkSize, NULL, 0,0,0 );
|
|
}
|
|
else
|
|
{
|
|
size_t localWorkSize[2], globalWorkSize[2];
|
|
//workgroupSize = btMin(workgroupSize, globalSize);
|
|
int num_t = globalSize / workgroupSize;
|
|
int num_g = num_t * workgroupSize;
|
|
if(num_g < globalSize)
|
|
{
|
|
num_t++;
|
|
}
|
|
localWorkSize[0] = workgroupSize;
|
|
globalWorkSize[0] = num_t * workgroupSize;
|
|
localWorkSize[1] = 1;
|
|
globalWorkSize[1] = 1;
|
|
ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue, kernelFunc, 1, NULL, globalWorkSize, localWorkSize, 0,0,0 );
|
|
}
|
|
GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
|
|
ciErrNum = clFlush(m_cqCommandQue);
|
|
GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
|
|
}
|
|
|
|
|
|
void bt3dGridBroadphaseOCL::setKernelArg(int kernelId, int argNum, int argSize, void* argPtr)
|
|
{
|
|
cl_int ciErrNum;
|
|
ciErrNum = clSetKernelArg(m_kernels[kernelId].m_kernel, argNum, argSize, argPtr);
|
|
GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
|
|
}
|
|
|
|
|
|
void bt3dGridBroadphaseOCL::copyArrayToDevice(cl_mem device, const void* host, unsigned int size, int devOffs, int hostOffs)
|
|
{
|
|
if (size)
|
|
{
|
|
cl_int ciErrNum;
|
|
char* pHost = (char*)host + hostOffs;
|
|
ciErrNum = clEnqueueWriteBuffer(m_cqCommandQue, device, CL_TRUE, devOffs, size, pHost, 0, NULL, NULL);
|
|
GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
|
|
}
|
|
}
|
|
|
|
void bt3dGridBroadphaseOCL::copyArrayFromDevice(void* host, const cl_mem device, unsigned int size, int hostOffs, int devOffs)
|
|
{
|
|
if (size)
|
|
{
|
|
cl_int ciErrNum;
|
|
char* pHost = (char*)host + hostOffs;
|
|
ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, device, CL_TRUE, devOffs, size, pHost, 0, NULL, NULL);
|
|
GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
|
|
}
|
|
}
|
|
|
|
|
|
|
|
//
|
|
// overrides
|
|
//
|
|
|
|
|
|
void bt3dGridBroadphaseOCL::prepareAABB()
|
|
{
|
|
btGpu3DGridBroadphase::prepareAABB();
|
|
copyArrayToDevice(m_dAABB, m_hAABB, sizeof(bt3DGrid3F1U) * 2 * (m_numHandles + m_numLargeHandles));
|
|
return;
|
|
}
|
|
|
|
void bt3dGridBroadphaseOCL::setParameters(bt3DGridBroadphaseParams* hostParams)
|
|
{
|
|
btGpu3DGridBroadphase::setParameters(hostParams);
|
|
struct btParamsBpOCL
|
|
{
|
|
float m_invCellSize[4];
|
|
int m_gridSize[4];
|
|
};
|
|
btParamsBpOCL hParams;
|
|
hParams.m_invCellSize[0] = m_params.m_invCellSizeX;
|
|
hParams.m_invCellSize[1] = m_params.m_invCellSizeY;
|
|
hParams.m_invCellSize[2] = m_params.m_invCellSizeZ;
|
|
hParams.m_invCellSize[3] = 0.f;
|
|
hParams.m_gridSize[0] = m_params.m_gridSizeX;
|
|
hParams.m_gridSize[1] = m_params.m_gridSizeY;
|
|
hParams.m_gridSize[2] = m_params.m_gridSizeZ;
|
|
hParams.m_gridSize[3] = m_params.m_maxBodiesPerCell;
|
|
copyArrayToDevice(m_dBpParams, &hParams, sizeof(btParamsBpOCL));
|
|
return;
|
|
}
|
|
|
|
|
|
void bt3dGridBroadphaseOCL::calcHashAABB()
|
|
{
|
|
BT_PROFILE("calcHashAABB");
|
|
#if 1
|
|
runKernelWithWorkgroupSize(GRID3DOCL_KERNEL_CALC_HASH_AABB, m_numHandles);
|
|
#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
|
|
clFinish(m_cqCommandQue);
|
|
#endif //ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
|
|
|
|
#else
|
|
btGpu3DGridBroadphase::calcHashAABB();
|
|
#endif
|
|
|
|
return;
|
|
}
|
|
|
|
|
|
void bt3dGridBroadphaseOCL::sortHash()
|
|
{
|
|
BT_PROFILE("sortHash");
|
|
#ifdef CL_PLATFORM_MINI_CL
|
|
//copyArrayFromDevice(m_hBodiesHash, m_dBodiesHash, m_numHandles * 2 * sizeof(unsigned int));
|
|
btGpu3DGridBroadphase::sortHash();
|
|
copyArrayToDevice(m_dBodiesHash, m_hBodiesHash, m_numHandles * 2 * sizeof(unsigned int));
|
|
#else
|
|
|
|
//#define USE_HOST
|
|
#ifdef USE_HOST
|
|
copyArrayFromDevice(m_hBodiesHash, m_dBodiesHash, m_numHandles * 2 * sizeof(unsigned int));
|
|
//adl::Buffer<unsigned int> keysIn,keysOut,valuesIn,valuesOut;
|
|
///adl::RadixSort32<adl::TYPE_CL>::execute(dataC,keysIn,keysOut,valuesIn,valuesOut,m_numHandles);
|
|
adl::HostBuffer<adl::SortData> inoutHost;
|
|
inoutHost.m_device = m_deviceHost;
|
|
inoutHost.m_ptr = (adl::SortData*)m_hBodiesHash;
|
|
inoutHost.m_size = m_numHandles;
|
|
adl::RadixSort<adl::TYPE_HOST>::execute(dataHost, inoutHost,m_numHandles);
|
|
copyArrayToDevice(m_dBodiesHash, m_hBodiesHash, m_numHandles * 2 * sizeof(unsigned int));
|
|
#else
|
|
{
|
|
clFinish(m_cqCommandQue);
|
|
BT_PROFILE("RadixSort32::execute");
|
|
adl::Buffer<adl::SortData> inout;
|
|
inout.m_device = this->m_deviceCL;
|
|
inout.m_size = m_numHandles;
|
|
inout.m_ptr = (adl::SortData*)m_dBodiesHash;
|
|
int actualHandles = m_numHandles;
|
|
int dataAlignment = adl::RadixSort32<adl::TYPE_CL>::DATA_ALIGNMENT;
|
|
|
|
if (actualHandles%dataAlignment)
|
|
{
|
|
actualHandles += dataAlignment-(actualHandles%dataAlignment);
|
|
}
|
|
|
|
adl::RadixSort32<adl::TYPE_CL>::execute(dataC,inout, actualHandles);
|
|
#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
|
|
clFinish(m_cqCommandQue);
|
|
#endif //ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
|
|
}
|
|
{
|
|
//BT_PROFILE("copyArrayFromDevice");
|
|
//copyArrayFromDevice(m_hBodiesHash, m_dBodiesHash, m_numHandles * 2 * sizeof(unsigned int));
|
|
#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
|
|
clFinish(m_cqCommandQue);
|
|
#endif //ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
|
|
}
|
|
|
|
|
|
#endif //USE_HOST
|
|
#endif
|
|
|
|
return;
|
|
}
|
|
|
|
|
|
|
|
void bt3dGridBroadphaseOCL::findCellStart()
|
|
{
|
|
#if 1
|
|
BT_PROFILE("findCellStart");
|
|
|
|
#if defined(CL_PLATFORM_MINI_CL)
|
|
btGpu3DGridBroadphase::findCellStart();
|
|
copyArrayToDevice(m_dCellStart, m_hCellStart, m_numCells * sizeof(unsigned int));
|
|
#else
|
|
runKernelWithWorkgroupSize(GRID3DOCL_KERNEL_CLEAR_CELL_START, m_numCells);
|
|
runKernelWithWorkgroupSize(GRID3DOCL_KERNEL_FIND_CELL_START, m_numHandles);
|
|
#endif
|
|
#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
|
|
clFinish(m_cqCommandQue);
|
|
#endif
|
|
|
|
#else
|
|
btGpu3DGridBroadphase::findCellStart();
|
|
#endif
|
|
|
|
return;
|
|
}
|
|
|
|
|
|
|
|
void bt3dGridBroadphaseOCL::findOverlappingPairs()
|
|
{
|
|
#if 1
|
|
BT_PROFILE("findOverlappingPairs");
|
|
runKernelWithWorkgroupSize(GRID3DOCL_KERNEL_FIND_OVERLAPPING_PAIRS, m_numHandles);
|
|
#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
|
|
clFinish(m_cqCommandQue);
|
|
#endif
|
|
|
|
#else
|
|
btGpu3DGridBroadphase::findOverlappingPairs();
|
|
copyArrayToDevice(m_dPairBuffStartCurr, m_hPairBuffStartCurr, (m_maxHandles * 2 + 1) * sizeof(unsigned int));
|
|
copyArrayToDevice(m_dPairBuff, m_hPairBuff, m_maxHandles * m_maxPairsPerBody * sizeof(unsigned int));
|
|
#endif
|
|
return;
|
|
}
|
|
|
|
|
|
void bt3dGridBroadphaseOCL::findPairsLarge()
|
|
{
|
|
BT_PROFILE("findPairsLarge");
|
|
#if 1
|
|
if(m_numLargeHandles)
|
|
{
|
|
setKernelArg(GRID3DOCL_KERNEL_FIND_PAIRS_LARGE, 6, sizeof(int),(void*)&m_numLargeHandles);
|
|
runKernelWithWorkgroupSize(GRID3DOCL_KERNEL_FIND_PAIRS_LARGE, m_numHandles);
|
|
}
|
|
#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
|
|
clFinish(m_cqCommandQue);
|
|
#endif
|
|
|
|
#else
|
|
btGpu3DGridBroadphase::findPairsLarge();
|
|
#endif
|
|
return;
|
|
}
|
|
|
|
|
|
|
|
void bt3dGridBroadphaseOCL::computePairCacheChanges()
|
|
{
|
|
BT_PROFILE("computePairCacheChanges");
|
|
#if 1
|
|
runKernelWithWorkgroupSize(GRID3DOCL_KERNEL_COMPUTE_CACHE_CHANGES, m_numHandles);
|
|
#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
|
|
clFinish(m_cqCommandQue);
|
|
#endif
|
|
copyArrayFromDevice( m_hPairScanChanged,m_dPairScanChanged, sizeof(unsigned int)*(m_numHandles + 2));
|
|
|
|
#else
|
|
btGpu3DGridBroadphase::computePairCacheChanges();
|
|
copyArrayToDevice(m_dPairScanChanged, m_hPairScanChanged, sizeof(unsigned int)*(m_numHandles + 2));
|
|
|
|
|
|
#endif
|
|
return;
|
|
}
|
|
|
|
|
|
|
|
|
|
extern cl_device_type deviceType;
|
|
|
|
void bt3dGridBroadphaseOCL::scanOverlappingPairBuff(bool copyToCpu)
|
|
{
|
|
|
|
//Intel/CPU version doesn't handlel Adl scan well
|
|
#if 0
|
|
{
|
|
copyArrayFromDevice(m_hPairScanChanged, m_dPairScanChanged, sizeof(unsigned int)*(m_numHandles + 2));
|
|
btGpu3DGridBroadphase::scanOverlappingPairBuff();
|
|
copyArrayToDevice(m_dPairScanChanged, m_hPairScanChanged, sizeof(unsigned int)*(m_numHandles + 2));
|
|
m_numPrefixSum = m_hPairScanChanged[m_numHandles+1];
|
|
clFinish(m_cqCommandQue);
|
|
//memset(m_hPairScanChanged,0,sizeof(int)*m_maxHandles + 2);
|
|
}
|
|
#else
|
|
{
|
|
|
|
// copyArrayFromDevice(m_hPairScanChanged, m_dPairScanChanged, sizeof(unsigned int)*(m_numHandles + 2));
|
|
// btGpu3DGridBroadphase::scanOverlappingPairBuff();
|
|
|
|
adl::Buffer<unsigned int> destBuffer;
|
|
|
|
{
|
|
BT_PROFILE("copy GPU->GPU");
|
|
|
|
destBuffer.m_ptr = (unsigned int*)m_dPairScanChanged;
|
|
destBuffer.m_device = m_deviceCL;
|
|
destBuffer.m_size = sizeof(unsigned int)*(m_numHandles+2);
|
|
m_deviceCL->copy(m_srcClBuffer, &destBuffer,m_numHandles,1,1);
|
|
|
|
#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
|
|
clFinish(m_cqCommandQue);
|
|
#endif
|
|
|
|
}
|
|
|
|
{
|
|
BT_PROFILE("PrefixScan");
|
|
|
|
adl::PrefixScan<adl::TYPE_CL>::execute(gData1,*m_srcClBuffer,destBuffer, m_numHandles+2,&m_numPrefixSum);
|
|
|
|
#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
|
|
clFinish(m_cqCommandQue);
|
|
#endif
|
|
//if (m_numPrefixSum>0x1000)
|
|
// {
|
|
// printf("error m_numPrefixSum==%d\n",m_numPrefixSum);
|
|
// }
|
|
|
|
}
|
|
|
|
#if 0
|
|
unsigned int* verifyhPairScanChanged = new unsigned int[m_maxHandles + 2];
|
|
memset(verifyhPairScanChanged,0,sizeof(int)*m_maxHandles + 2);
|
|
|
|
copyArrayFromDevice(verifyhPairScanChanged, m_dPairScanChanged, sizeof(unsigned int)*(m_numHandles + 2));
|
|
clFinish(m_cqCommandQue);
|
|
|
|
/*for (int i=0;i<m_numHandles+2;i++)
|
|
{
|
|
if (verifyhPairScanChanged[i] != m_hPairScanChanged[i])
|
|
{
|
|
printf("hello!\n");
|
|
}
|
|
}
|
|
*/
|
|
|
|
#endif
|
|
|
|
|
|
if (1)
|
|
{
|
|
|
|
//the data
|
|
if (copyToCpu)
|
|
{
|
|
BT_PROFILE("copy GPU -> CPU");
|
|
copyArrayFromDevice(m_hPairScanChanged, m_dPairScanChanged, sizeof(unsigned int)*(m_numHandles + 2));
|
|
#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
|
|
clFinish(m_cqCommandQue);
|
|
#endif
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
#endif
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
void bt3dGridBroadphaseOCL::squeezeOverlappingPairBuff()
|
|
{
|
|
BT_PROFILE("btCuda_squeezeOverlappingPairBuff");
|
|
#if 1
|
|
runKernelWithWorkgroupSize(GRID3DOCL_KERNEL_SQUEEZE_PAIR_BUFF, m_numHandles);
|
|
// btCuda_squeezeOverlappingPairBuff(m_dPairBuff, m_dPairBuffStartCurr, m_dPairScanChanged, m_dPairsChanged, m_dAABB, m_numHandles);
|
|
|
|
//copyArrayFromDevice(m_hPairsChanged, m_dPairsChanged, sizeof(unsigned int) * m_numPrefixSum);//m_hPairScanChanged[m_numHandles+1]); //gSum
|
|
#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
|
|
clFinish(m_cqCommandQue);
|
|
#endif
|
|
|
|
#else
|
|
btGpu3DGridBroadphase::squeezeOverlappingPairBuff();
|
|
#endif
|
|
return;
|
|
}
|
|
|
|
|
|
|
|
void bt3dGridBroadphaseOCL::resetPool(btDispatcher* dispatcher)
|
|
{
|
|
btGpu3DGridBroadphase::resetPool(dispatcher);
|
|
prefillBuffers();
|
|
}
|
|
|
|
|