Add the GPU rigid body pipeline from https://github.com/erwincoumans/experiments as a Bullet 3.x preview for Bullet 2.80

This commit is contained in:
erwin.coumans
2012-03-05 00:54:32 +00:00
parent 73c4646b40
commit 571af41cf6
257 changed files with 55106 additions and 0 deletions

View File

@@ -0,0 +1,29 @@
-- Premake script for the static library that holds the 3D-grid OpenCL broadphase.
-- The project is only declared when findOpenCL_AMD() returns a truthy value
-- (presumably: the AMD OpenCL SDK was located -- TODO confirm in the shared premake helpers).
hasCL = findOpenCL_AMD()
if (hasCL) then
project "OpenCL_bt3dGridBroadphase_AMD"
-- initOpenCL_AMD() is defined elsewhere; presumably it adds the AMD OpenCL
-- include/library settings to this project -- TODO confirm.
initOpenCL_AMD()
language "C++"
kind "StaticLib"
targetdir "../../../bin"
libdirs {"../../../rendering/GlutGlewWindows"}
-- Include roots: the shared broadphase sources, the Bullet src tree and the primitives directory.
includedirs {
-- "../../../rendering/GlutGlewWindows",
"../../../opencl/3dGridBroadphase/Shared",
"../../../../../src",
"../../primitives"
}
-- All sources come from the sibling Shared directory.
files {
"../Shared/*.cpp",
"../Shared/*.h"
}
end

View File

@@ -0,0 +1,23 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2003-2007 Erwin Coumans http://bulletphysics.com
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#include <MiniCL/cl_MiniCL_Defs.h>
// MiniCL build of the broadphase kernels.
// MSTRINGIFY is defined as a pass-through here, so including the .cl file pastes the
// kernel source into this translation unit as ordinary code with C linkage
// (the OpenCL build instead defines MSTRINGIFY(A) as #A to obtain a string literal).
extern "C"
{
#define MSTRINGIFY(A) A
#include "bt3dGridBroadphaseOCL.cl"
#undef MSTRINGIFY
}

View File

@@ -0,0 +1,349 @@
MSTRINGIFY(
// Maps an integer cell coordinate to a linear cell index.
// pParams[1] is reinterpreted as int4 and supplies the grid dimensions in x/y/z.
// Each coordinate is wrapped with a bit mask (dimension - 1) which only acts as a
// modulo when the dimensions are powers of two.
int getPosHash(int4 gridPos, __global float4* pParams)
{
	const int4 dims = *((__global int4*)(pParams + 1));
	const int cx = gridPos.x & (dims.x - 1);
	const int cy = gridPos.y & (dims.y - 1);
	const int cz = gridPos.z & (dims.z - 1);
	// row-major layout: x varies fastest then y then z
	return (cz * dims.y + cy) * dims.x + cx;
}
// Converts a world-space position into a wrapped integer cell coordinate.
// pParams[0] holds the per-axis scale applied to the position (the host uploads the
// inverse cell size there) and pParams[1] holds the integer grid dimensions.
// The w component of the result is left unset.
int4 getGridPos(float4 worldPos, __global float4* pParams)
{
	const float4 invCellSize = pParams[0];
	const int4 dims = *((__global int4*)(pParams + 1));
	int4 cell;
	cell.x = ((int)floor(worldPos.x * invCellSize.x)) & (dims.x - 1);
	cell.y = ((int)floor(worldPos.y * invCellSize.y)) & (dims.y - 1);
	cell.z = ((int)floor(worldPos.z * invCellSize.z)) & (dims.z - 1);
	return cell;
}
// Computes the grid hash of every body from the centre of its AABB.
// pAABB stores two float4 per body: min at 2*i and max at 2*i+1.
// pHash[i] receives the pair (cell hash / body index) so the list can be sorted by cell.
// Work-items with an id at or beyond numObjects do nothing.
__kernel void kCalcHashAABB(int numObjects, __global float4* pAABB, __global int2* pHash, __global float4* pParams GUID_ARG)
{
	const int bodyIdx = get_global_id(0);
	if(bodyIdx < numObjects)
	{
		const float4 lo = pAABB[bodyIdx*2];
		const float4 hi = pAABB[bodyIdx*2 + 1];
		float4 centre;
		centre.x = (lo.x + hi.x) * 0.5f;
		centre.y = (lo.y + hi.y) * 0.5f;
		centre.z = (lo.z + hi.z) * 0.5f;
		centre.w = 0.f;
		// cell that contains the centre and its linear index
		const int4 cell = getGridPos(centre, pParams);
		int2 entry;
		entry.x = getPosHash(cell, pParams);
		entry.y = bodyIdx;
		pHash[bodyIdx] = entry;
	}
}
// Marks every grid cell as empty by writing -1 into its start slot.
// Work-items with an id at or beyond numCells do nothing.
__kernel void kClearCellStart(	int numCells,
								__global int* pCellStart GUID_ARG)
{
	const int cell = get_global_id(0);
	if(cell < numCells)
	{
		pCellStart[cell] = -1;
	}
}
// For the hash list sorted by cell this records in cellStart[hash] the index of the
// first entry of every cell. An entry starts a cell when its hash differs from the
// hash of the entry before it.
// Each work-item publishes its own hash in local memory at slot local_id+1 and the
// first work-item of a group also loads the hash of the entry preceding the group
// into slot 0. The local array has 513 slots which bounds the usable work-group size.
__kernel void kFindCellStart(int numObjects, __global int2* pHash, __global int* cellStart GUID_ARG)
{
	__local int sharedHash[513];
	const int gid = get_global_id(0);
	const int lid = get_local_id(0);
	const int inRange = (gid < numObjects);
	int2 entry;
	if(inRange)
	{
		entry = pHash[gid];
		sharedHash[lid + 1] = entry.x;
		if((lid == 0) && (gid > 0))
		{
			sharedHash[0] = pHash[gid - 1].x;
		}
	}
	// every work-item of the group reaches the barrier even when out of range
	barrier(CLK_LOCAL_MEM_FENCE);
	if(inRange)
	{
		// sharedHash[lid] is the hash of the previous entry
		if((gid == 0) || (entry.x != sharedHash[lid]))
		{
			cellStart[entry.x] = gid;
		}
	}
}
// Returns non-zero when the two boxes overlap or touch on all three axes.
// Only x/y/z are compared and the w components are ignored.
int testAABBOverlap(float4 min0, float4 max0, float4 min1, float4 max1)
{
	const int overlapX = (min0.x <= max1.x) && (min1.x <= max0.x);
	const int overlapY = (min0.y <= max1.y) && (min1.y <= max0.y);
	const int overlapZ = (min0.z <= max1.z) && (min1.z <= max0.z);
	return overlapX && overlapY && overlapZ;
}
// Tests the body stored at sorted position index against the bodies of one grid cell
// and records overlaps in the per-body pair list of that body.
//   gridPos            cell to scan (wrapped by getPosHash)
//   pHash              sorted list of (cell hash / body index)
//   pCellStart         first sorted index per cell or -1 for an empty cell
//   pAABB              two float4 per body with the handle index stored in the bits of min.w
//   pPairBuff          pair lists of all bodies
//   pPairBuffStartCurr per handle: x = start of its list and y = number of entries in use
//   pParams            grid parameters where the w of the dimensions is the per-cell body limit
// Entry flags used here: 0x40000000 marks an old pair that was found again and
// 0x20000000 marks a pair that was added by this pass.
void findPairsInCell(	int numObjects,
int4 gridPos,
int index,
__global int2* pHash,
__global int* pCellStart,
__global float4* pAABB,
__global int* pPairBuff,
__global int2* pPairBuffStartCurr,
__global float4* pParams)
{
int4 pGridDim = *((__global int4*)(pParams + 1));
int maxBodiesPerCell = pGridDim.w;
int gridHash = getPosHash(gridPos, pParams);
// get start of bucket for this cell
int bucketStart = pCellStart[gridHash];
if (bucketStart == -1)
{
return; // cell empty
}
// iterate over bodies in this cell
int2 sortedData = pHash[index];
int unsorted_indx = sortedData.y;
float4 min0 = pAABB[unsorted_indx*2 + 0];
float4 max0 = pAABB[unsorted_indx*2 + 1];
int handleIndex = as_int(min0.w);
int2 start_curr = pPairBuffStartCurr[handleIndex];
int start = start_curr.x;
int curr = start_curr.y;
// the list of this body ends where the list of the next handle starts
int2 start_curr_next = pPairBuffStartCurr[handleIndex+1];
int curr_max = start_curr_next.x - start - 1;
// scan at most maxBodiesPerCell entries and never past the end of the sorted list
int bucketEnd = bucketStart + maxBodiesPerCell;
bucketEnd = (bucketEnd > numObjects) ? numObjects : bucketEnd;
for(int index2 = bucketStart; index2 < bucketEnd; index2++)
{
int2 cellData = pHash[index2];
if (cellData.x != gridHash)
{
break; // no longer in same bucket
}
int unsorted_indx2 = cellData.y;
// only lower body indices are tested so a pair is recorded once and never with itself
if (unsorted_indx2 < unsorted_indx) // check not colliding with self
{
float4 min1 = pAABB[unsorted_indx2*2 + 0];
float4 max1 = pAABB[unsorted_indx2*2 + 1];
if(testAABBOverlap(min0, max0, min1, max1))
{
int handleIndex2 = as_int(min1.w);
int k;
// look for the partner in the existing list (flags masked off)
for(k = 0; k < curr; k++)
{
int old_pair = pPairBuff[start+k] & (~0x60000000);
if(old_pair == handleIndex2)
{
pPairBuff[start+k] |= 0x40000000;
break;
}
}
// not found: append as a new pair unless the list is full
if(k == curr)
{
if(curr >= curr_max)
{ // not a good solution, but let's avoid crash
break;
}
pPairBuff[start+curr] = handleIndex2 | 0x20000000;
curr++;
}
}
}
}
// write back the updated entry count for this handle
int2 newStartCurr;
newStartCurr.x = start;
newStartCurr.y = curr;
pPairBuffStartCurr[handleIndex] = newStartCurr;
return;
}
// For every body this scans the 3x3x3 block of cells around the cell that holds the
// centre of its AABB and lets findPairsInCell record the overlaps.
// index is a position in the sorted hash list and the body index is read from pHash.
// Work-items with an id at or beyond numObjects do nothing.
__kernel void kFindOverlappingPairs(	int numObjects,
										__global float4* pAABB,
										__global int2* pHash,
										__global int* pCellStart,
										__global int* pPairBuff,
										__global int2* pPairBuffStartCurr,
										__global float4* pParams GUID_ARG)
{
	const int index = get_global_id(0);
	if(index < numObjects)
	{
		const int bodyIdx = pHash[index].y;
		const float4 lo = pAABB[bodyIdx*2 + 0];
		const float4 hi = pAABB[bodyIdx*2 + 1];
		float4 centre;
		centre.x = (lo.x + hi.x) * 0.5f;
		centre.y = (lo.y + hi.y) * 0.5f;
		centre.z = (lo.z + hi.z) * 0.5f;
		// home cell of this body
		const int4 home = getGridPos(centre, pParams);
		int4 cell;
		// visit neighbours with z outermost and x innermost
		for(int dz = -1; dz <= 1; dz++)
		{
			cell.z = home.z + dz;
			for(int dy = -1; dy <= 1; dy++)
			{
				cell.y = home.y + dy;
				for(int dx = -1; dx <= 1; dx++)
				{
					cell.x = home.x + dx;
					findPairsInCell(numObjects, cell, index, pHash, pCellStart, pAABB, pPairBuff, pPairBuffStartCurr, pParams);
				}
			}
		}
	}
}
// Tests every small body against each large body and records overlaps in the pair
// list of the small body.
//   numObjects         number of small bodies (one work-item each)
//   pAABB              two float4 per body and the large bodies follow the small ones
//   pHash              sorted list of (cell hash / body index)
//   pCellStart         not used by this kernel
//   pPairBuff          pair lists of all bodies
//   pPairBuffStartCurr per handle: x = start of its list and y = number of entries in use
//   numLarge           number of large bodies
// Entry flags: 0x40000000 marks an old pair found again and 0x20000000 marks a new pair.
// The capacity check now happens before the write (same order as findPairsInCell) so a
// full list is never written to.
__kernel void kFindPairsLarge(	int numObjects,
								__global float4* pAABB,
								__global int2* pHash,
								__global int* pCellStart,
								__global int* pPairBuff,
								__global int2* pPairBuffStartCurr,
								uint numLarge GUID_ARG)
{
	int index = get_global_id(0);
	if(index >= numObjects)
	{
		return;
	}
	int2 sortedData = pHash[index];
	int unsorted_indx = sortedData.y;
	float4 min0 = pAABB[unsorted_indx*2 + 0];
	float4 max0 = pAABB[unsorted_indx*2 + 1];
	int handleIndex = as_int(min0.w);
	int2 start_curr = pPairBuffStartCurr[handleIndex];
	int start = start_curr.x;
	int curr = start_curr.y;
	// the list of this body ends where the list of the next handle starts
	int2 start_curr_next = pPairBuffStartCurr[handleIndex+1];
	int curr_max = start_curr_next.x - start - 1;
	for(uint i = 0; i < numLarge; i++)
	{
		// large bodies are stored after the numObjects small ones
		int indx2 = numObjects + i;
		float4 min1 = pAABB[indx2*2 + 0];
		float4 max1 = pAABB[indx2*2 + 1];
		if(testAABBOverlap(min0, max0, min1, max1))
		{
			int k;
			int handleIndex2 = as_int(min1.w);
			// look for the partner in the existing list (flags masked off)
			for(k = 0; k < curr; k++)
			{
				int old_pair = pPairBuff[start+k] & (~0x60000000);
				if(old_pair == handleIndex2)
				{
					pPairBuff[start+k] |= 0x40000000;
					break;
				}
			}
			if(k == curr)
			{
				if(curr >= curr_max)
				{ // not a good solution, but let's avoid crash
					break;
				}
				pPairBuff[start+curr] = handleIndex2 | 0x20000000;
				curr++;
			}
		}
	}
	// write back the updated entry count for this handle
	int2 newStartCurr;
	newStartCurr.x = start;
	newStartCurr.y = curr;
	pPairBuffStartCurr[handleIndex] = newStartCurr;
	return;
}
// Counts for every body the entries of its pair list that do not carry the
// found-again flag 0x40000000. Those are the pairs added in this pass and the old
// pairs that were not found again.
// The count is stored at pPairScan[index+1] as input for the prefix scan.
// Work-items with an id at or beyond numObjects do nothing.
__kernel void kComputePairCacheChanges(	int numObjects,
										__global int* pPairBuff,
										__global int2* pPairBuffStartCurr,
										__global int* pPairScan,
										__global float4* pAABB GUID_ARG)
{
	const int index = get_global_id(0);
	if(index < numObjects)
	{
		// the handle index is stored in the bits of min.w
		const float4 lo = pAABB[index * 2];
		const int handleIndex = as_int(lo.w);
		const int2 range = pPairBuffStartCurr[handleIndex];
		const int first = range.x;
		const int count = range.y;
		int numChanged = 0;
		for(int k = 0; k < count; k++)
		{
			if((pPairBuff[first + k] & 0x40000000) == 0)
			{
				numChanged++;
			}
		}
		pPairScan[index+1] = numChanged;
	}
}
// Per body this does two things in one pass over its pair list:
//  1. every entry without the found-again flag 0x40000000 is copied (flags included)
//     to pPairOut starting at the offset pPairScan[index+1]
//  2. every entry that carries a flag (found again or new) is kept and stored back
//     without flags at the front of the list while unflagged entries are dropped
// The new entry count is written back to pPairBuffStartCurr.
// Work-items with an id at or beyond numObjects do nothing.
__kernel void kSqueezeOverlappingPairBuff(	int numObjects,
											__global int* pPairBuff,
											__global int2* pPairBuffStartCurr,
											__global int* pPairScan,
											__global int* pPairOut,
											__global float4* pAABB GUID_ARG)
{
	const int index = get_global_id(0);
	if(index < numObjects)
	{
		const float4 lo = pAABB[index * 2];
		const int handleIndex = as_int(lo.w);
		const int2 range = pPairBuffStartCurr[handleIndex];
		const int first = range.x;
		const int count = range.y;
		__global int* pChanged = pPairOut + pPairScan[index+1];
		int kept = 0;
		for(int k = 0; k < count; k++)
		{
			const int entry = pPairBuff[first + k];
			if((entry & 0x40000000) == 0)
			{
				*pChanged = entry;
				pChanged++;
			}
			if((entry & 0x60000000) != 0)
			{
				// kept never exceeds k so this only overwrites entries already read
				pPairBuff[first + kept] = entry & (~0x60000000);
				kept++;
			}
		}
		int2 updated;
		updated.x = first;
		updated.y = kept;
		pPairBuffStartCurr[handleIndex] = updated;
	}
}
);

View File

@@ -0,0 +1,697 @@
/*
Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
Copyright (C) 2006 - 2009 Sony Computer Entertainment Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#include "LinearMath/btAlignedAllocator.h"
#include "LinearMath/btQuickprof.h"
#include "BulletCollision/BroadphaseCollision/btOverlappingPairCache.h"
#include "../basic_initialize/btOpenCLUtils.h"
#include "bt3dGridBroadphaseOCL.h"
#include <stdio.h>
#include <string.h>
#include "Adl/Adl.h"
#include <AdlPrimitives/Scan/PrefixScan.h>
#include <AdlPrimitives/Sort/RadixSort32.h>
#include <AdlPrimitives/Sort/RadixSort.h>
// When defined, the broadphase stages below call clFinish() after enqueueing their work.
#define ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
// File name handed to btOpenCLUtils::compileCLProgramFromString() together with the source string.
#define GRID_OCL_PATH "..\\..\\opencl\\3dGridBroadphase\\Shared\\bt3dGridBroadphaseOCL.cl"
// Turns the MSTRINGIFY(...) wrapper inside the .cl file into a string literal.
#define MSTRINGIFY(A) #A
// Kernel source text; the terminating semicolon comes from the end of the included file.
static const char* spProgramSource =
#include "bt3dGridBroadphaseOCL.cl"
// Adl helper objects. They are file-scope (not per instance): the constructor assigns
// them and the destructor frees them.
adl::PrefixScan<adl::TYPE_CL>::Data* gData1=0;
// Source buffer for the prefix scan (despite the m_ prefix this is a global, not a member).
adl::Buffer<unsigned int>* m_srcClBuffer=0;
// NOTE(review): not referenced in the visible part of this file.
struct MySortData
{
int key;
int value;
};
adl::RadixSort32<adl::TYPE_CL>::Data* dataC = 0;
adl::RadixSort<adl::TYPE_HOST>::Data* dataHost = 0;
// Values the constructor writes into m_srcClBuffer (zeroEl at both ends, infElem at index maxSmallProxies).
static unsigned int infElem = 0x2fffffff;
static unsigned int zeroEl = 0;
// NOTE(review): not referenced in the visible part of this file.
static unsigned int minusOne= -1;
/// Builds the OpenCL broadphase on top of btGpu3DGridBroadphase.
/// The grid and proxy limits are forwarded unchanged to the base class.
/// @param context,device,queue  OpenCL objects supplied by the caller; they are stored, not created here.
/// @param deviceCL              optional Adl device. When null, an adl::DeviceCL wrapping
///                              context/device/queue is created and owned by this object.
bt3dGridBroadphaseOCL::bt3dGridBroadphaseOCL(	btOverlappingPairCache* overlappingPairCache,
												const btVector3& cellSize,
												int gridSizeX, int gridSizeY, int gridSizeZ,
												int maxSmallProxies, int maxLargeProxies, int maxPairsPerSmallProxy,
												btScalar maxSmallProxySize,
												int maxSmallProxiesPerCell,
												cl_context context, cl_device_id device, cl_command_queue queue,
												adl::DeviceCL* deviceCL
												) :
	btGpu3DGridBroadphase(overlappingPairCache, cellSize, gridSizeX, gridSizeY, gridSizeZ, maxSmallProxies, maxLargeProxies, maxPairsPerSmallProxy, maxSmallProxySize, maxSmallProxiesPerCell)
{
	// OpenCL side: program, device buffers, their initial contents, then the kernels
	initCL(context, device, queue);
	allocateBuffers();
	prefillBuffers();
	initKernels();

	// Adl host device
	adl::DeviceUtils::Config hostConfig;
	m_deviceHost = adl::DeviceUtils::allocate( adl::TYPE_HOST, hostConfig );

	// Adl OpenCL device: use the caller's, or wrap the given context/device/queue
	m_ownsDevice = false;
	if (deviceCL == 0)
	{
		m_ownsDevice = true;
		adl::DeviceCL* ownedDevice = new adl::DeviceCL;
		ownedDevice->m_context = context;
		ownedDevice->m_deviceIdx = device;
		ownedDevice->m_commandQueue = queue;
		ownedDevice->m_kernelManager = new adl::KernelManager;
		deviceCL = ownedDevice;
	}
	m_deviceCL = deviceCL;

	// the scan/sort helpers are sized for at least 256K elements
	const int minSortBuffer = 256*1024;
	int maxSortBuffer = maxSmallProxies;
	if (maxSortBuffer < minSortBuffer)
	{
		maxSortBuffer = minSortBuffer;
	}

	// scan source buffer: zero at index 0, infElem at maxSmallProxies, zero at maxSmallProxies+1
	m_srcClBuffer = new adl::Buffer<unsigned int> (m_deviceCL, maxSmallProxies+2);
	m_srcClBuffer->write(&zeroEl, 1, 0);
	m_srcClBuffer->write(&infElem, 1, maxSmallProxies);
	m_srcClBuffer->write(&zeroEl, 1, maxSmallProxies+1);
	m_deviceCL->waitForCompletion();

	gData1 = adl::PrefixScan<adl::TYPE_CL>::allocate( m_deviceCL, maxSortBuffer+2, adl::PrefixScanBase::EXCLUSIVE );
	dataHost = adl::RadixSort<adl::TYPE_HOST>::allocate( m_deviceHost, maxSmallProxies+2 );
	dataC = adl::RadixSort32<adl::TYPE_CL>::allocate( m_deviceCL, maxSortBuffer+2 );
}
/// Frees the Adl helper data and releases the OpenCL objects created by this class
/// (kernels from initKernel(), the program from initCL(), buffers from allocateBuffers()).
/// The context, device and command queue were supplied by the caller and are not released.
bt3dGridBroadphaseOCL::~bt3dGridBroadphaseOCL()
{
	//btSimpleBroadphase will free memory of btSortedOverlappingPairCache, because m_ownsPairCache
	assert(m_bInitialized);

	adl::RadixSort<adl::TYPE_HOST>::deallocate(dataHost);
	adl::PrefixScan<adl::TYPE_CL>::deallocate(gData1);
	adl::RadixSort32<adl::TYPE_CL>::deallocate(dataC);
	adl::DeviceUtils::deallocate(m_deviceHost);
	delete m_srcClBuffer;

	// Kernels: initKernels() creates one per GRID3DOCL_KERNEL_* id.
	// NOTE(review): assumes m_kernels has GRID3DOCL_KERNEL_TOTAL entries -- confirm in the header.
	for (int kernelId = 0; kernelId < GRID3DOCL_KERNEL_TOTAL; kernelId++)
	{
		if (m_kernels[kernelId].m_kernel)
		{
			clReleaseKernel(m_kernels[kernelId].m_kernel);
			m_kernels[kernelId].m_kernel = 0;
		}
	}
	if (m_cpProgram)
	{
		clReleaseProgram(m_cpProgram);
		m_cpProgram = 0;
	}

	// Device buffers created in allocateBuffers()
	cl_mem* deviceBuffers[] =
	{
		&m_dBodiesHash, &m_dCellStart, &m_dPairBuff, &m_dPairBuffStartCurr, &m_dAABB,
		&m_dPairScanChanged, &m_dPairsChanged, &m_dPairsContiguous, &m_dBpParams
	};
	const int numDeviceBuffers = (int)(sizeof(deviceBuffers) / sizeof(deviceBuffers[0]));
	for (int i = 0; i < numDeviceBuffers; i++)
	{
		if (*deviceBuffers[i])
		{
			clReleaseMemObject(*deviceBuffers[i]);
			*deviceBuffers[i] = 0;
		}
	}

	// the Adl device is only deleted when the constructor created it
	if (m_ownsDevice)
	{
		delete m_deviceCL->m_kernelManager;
		delete m_deviceCL;
	}
}
#ifdef CL_PLATFORM_MINI_CL
// MiniCL only: forward-declare the kernel entry points so that initCL() can register
// them explicitly.
// There is a problem with MSVC9: static constructors are not called when the variables
// are defined in a library and are not used (this looks like an optimization).
// The same will probably happen with other compilers as well,
// so to make it robust the kernels are registered again (this is safe).
#define MINICL_DECLARE(a) extern "C" void a();
MINICL_DECLARE(kCalcHashAABB)
MINICL_DECLARE(kClearCellStart)
MINICL_DECLARE(kFindCellStart)
MINICL_DECLARE(kFindOverlappingPairs)
MINICL_DECLARE(kFindPairsLarge)
MINICL_DECLARE(kComputePairCacheChanges)
MINICL_DECLARE(kSqueezeOverlappingPairBuff)
#undef MINICL_DECLARE
#endif
/// Stores the caller's OpenCL context, device and queue and builds the kernel program
/// from the embedded source string.
/// Under MiniCL the kernels are registered first.
/// A failed build is now reported instead of printing "OK" unconditionally.
void bt3dGridBroadphaseOCL::initCL(cl_context context, cl_device_id device, cl_command_queue queue)
{
#ifdef CL_PLATFORM_MINI_CL
	// call constructors here
	MINICL_REGISTER(kCalcHashAABB)
	MINICL_REGISTER(kClearCellStart)
	MINICL_REGISTER(kFindCellStart)
	MINICL_REGISTER(kFindOverlappingPairs)
	MINICL_REGISTER(kFindPairsLarge)
	MINICL_REGISTER(kComputePairCacheChanges)
	MINICL_REGISTER(kSqueezeOverlappingPairBuff)
#endif

	// initialised so the diagnostic below never prints an indeterminate value
	cl_int ciErrNum = CL_SUCCESS;

	btAssert(context);
	m_cxMainContext = context;
	btAssert(device);
	m_cdDevice = device;
	btAssert(queue);
	m_cqCommandQue = queue;

	//adl::Kernel kern = m_deviceCL->getKernel(fileName,funcName,options,src);

	// GUID_ARG is defined empty: the kernels take no extra trailing argument
	m_cpProgram = btOpenCLUtils::compileCLProgramFromString(m_cxMainContext,m_cdDevice,spProgramSource, &ciErrNum,"-DGUID_ARG=""""",GRID_OCL_PATH);

	// NOTE(review): presumably the helper returns a null program when the build fails -- confirm in btOpenCLUtils
	if (!m_cpProgram)
	{
		printf("3D GRID OCL Error : could not build %s (error %d)\n", GRID_OCL_PATH, (int)ciErrNum);
		btAssert(m_cpProgram);
		return;
	}
	printf("OK\n");
}
/// Creates all broadphase kernels and binds their buffer arguments.
/// Argument 0 of every kernel is the element count; it is set per launch by
/// runKernelWithWorkgroupSize(). Argument 6 of kFindPairsLarge is set in findPairsLarge().
void bt3dGridBroadphaseOCL::initKernels()
{
	// every argument bound here is a cl_mem handle
	const int memArgSize = sizeof(cl_mem);

	// kCalcHashAABB(numObjects, pAABB, pHash, pParams)
	initKernel(GRID3DOCL_KERNEL_CALC_HASH_AABB, "kCalcHashAABB");
	setKernelArg(GRID3DOCL_KERNEL_CALC_HASH_AABB, 1, memArgSize, (void*)&m_dAABB);
	setKernelArg(GRID3DOCL_KERNEL_CALC_HASH_AABB, 2, memArgSize, (void*)&m_dBodiesHash);
	setKernelArg(GRID3DOCL_KERNEL_CALC_HASH_AABB, 3, memArgSize, (void*)&m_dBpParams);

	// kClearCellStart(numCells, pCellStart)
	initKernel(GRID3DOCL_KERNEL_CLEAR_CELL_START, "kClearCellStart");
	setKernelArg(GRID3DOCL_KERNEL_CLEAR_CELL_START, 1, memArgSize, (void*)&m_dCellStart);

	// kFindCellStart(numObjects, pHash, cellStart)
	initKernel(GRID3DOCL_KERNEL_FIND_CELL_START, "kFindCellStart");
	setKernelArg(GRID3DOCL_KERNEL_FIND_CELL_START, 1, memArgSize, (void*)&m_dBodiesHash);
	setKernelArg(GRID3DOCL_KERNEL_FIND_CELL_START, 2, memArgSize, (void*)&m_dCellStart);

	// kFindOverlappingPairs(numObjects, pAABB, pHash, pCellStart, pPairBuff, pPairBuffStartCurr, pParams)
	initKernel(GRID3DOCL_KERNEL_FIND_OVERLAPPING_PAIRS, "kFindOverlappingPairs");
	setKernelArg(GRID3DOCL_KERNEL_FIND_OVERLAPPING_PAIRS, 1, memArgSize, (void*)&m_dAABB);
	setKernelArg(GRID3DOCL_KERNEL_FIND_OVERLAPPING_PAIRS, 2, memArgSize, (void*)&m_dBodiesHash);
	setKernelArg(GRID3DOCL_KERNEL_FIND_OVERLAPPING_PAIRS, 3, memArgSize, (void*)&m_dCellStart);
	setKernelArg(GRID3DOCL_KERNEL_FIND_OVERLAPPING_PAIRS, 4, memArgSize, (void*)&m_dPairBuff);
	setKernelArg(GRID3DOCL_KERNEL_FIND_OVERLAPPING_PAIRS, 5, memArgSize, (void*)&m_dPairBuffStartCurr);
	setKernelArg(GRID3DOCL_KERNEL_FIND_OVERLAPPING_PAIRS, 6, memArgSize, (void*)&m_dBpParams);

	// kFindPairsLarge(numObjects, pAABB, pHash, pCellStart, pPairBuff, pPairBuffStartCurr, numLarge)
	initKernel(GRID3DOCL_KERNEL_FIND_PAIRS_LARGE, "kFindPairsLarge");
	setKernelArg(GRID3DOCL_KERNEL_FIND_PAIRS_LARGE, 1, memArgSize, (void*)&m_dAABB);
	setKernelArg(GRID3DOCL_KERNEL_FIND_PAIRS_LARGE, 2, memArgSize, (void*)&m_dBodiesHash);
	setKernelArg(GRID3DOCL_KERNEL_FIND_PAIRS_LARGE, 3, memArgSize, (void*)&m_dCellStart);
	setKernelArg(GRID3DOCL_KERNEL_FIND_PAIRS_LARGE, 4, memArgSize, (void*)&m_dPairBuff);
	setKernelArg(GRID3DOCL_KERNEL_FIND_PAIRS_LARGE, 5, memArgSize, (void*)&m_dPairBuffStartCurr);

	// kComputePairCacheChanges(numObjects, pPairBuff, pPairBuffStartCurr, pPairScan, pAABB)
	initKernel(GRID3DOCL_KERNEL_COMPUTE_CACHE_CHANGES, "kComputePairCacheChanges");
	setKernelArg(GRID3DOCL_KERNEL_COMPUTE_CACHE_CHANGES, 1, memArgSize, (void*)&m_dPairBuff);
	setKernelArg(GRID3DOCL_KERNEL_COMPUTE_CACHE_CHANGES, 2, memArgSize, (void*)&m_dPairBuffStartCurr);
	setKernelArg(GRID3DOCL_KERNEL_COMPUTE_CACHE_CHANGES, 3, memArgSize, (void*)&m_dPairScanChanged);
	setKernelArg(GRID3DOCL_KERNEL_COMPUTE_CACHE_CHANGES, 4, memArgSize, (void*)&m_dAABB);

	// kSqueezeOverlappingPairBuff(numObjects, pPairBuff, pPairBuffStartCurr, pPairScan, pPairOut, pAABB)
	initKernel(GRID3DOCL_KERNEL_SQUEEZE_PAIR_BUFF, "kSqueezeOverlappingPairBuff");
	setKernelArg(GRID3DOCL_KERNEL_SQUEEZE_PAIR_BUFF, 1, memArgSize, (void*)&m_dPairBuff);
	setKernelArg(GRID3DOCL_KERNEL_SQUEEZE_PAIR_BUFF, 2, memArgSize, (void*)&m_dPairBuffStartCurr);
	setKernelArg(GRID3DOCL_KERNEL_SQUEEZE_PAIR_BUFF, 3, memArgSize, (void*)&m_dPairScanChanged);
	setKernelArg(GRID3DOCL_KERNEL_SQUEEZE_PAIR_BUFF, 4, memArgSize, (void*)&m_dPairsChanged);
	setKernelArg(GRID3DOCL_KERNEL_SQUEEZE_PAIR_BUFF, 5, memArgSize, (void*)&m_dAABB);
}
/// Creates the device buffers used by the kernels. Every creation is checked with
/// GRID3DOCL_CHECKERROR.
/// Also computes m_hashSize, the smallest power of two that is >= m_maxHandles.
void bt3dGridBroadphaseOCL::allocateBuffers()
{
	cl_int ciErrNum;
	unsigned int memSize;

	// current version of bitonic sort works for power of 2 arrays only, so ...
	m_hashSize = 1;
	for(int bit = 1; bit < 32; bit++)
	{
		if(m_hashSize >= m_maxHandles)
		{
			break;
		}
		m_hashSize <<= 1;
	}

	// (hash, body index) pairs; never smaller than 1 MB
	memSize = m_hashSize * 2 * sizeof(unsigned int);
	if (memSize < 1024*1024)
		memSize = 1024*1024;
	m_dBodiesHash = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, memSize, NULL, &ciErrNum);
	GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);

	// one start index per grid cell
	memSize = m_numCells * sizeof(unsigned int);
	m_dCellStart = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, memSize, NULL, &ciErrNum);
	GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);

	// pair lists: m_maxPairsPerBody entries per handle
	memSize = m_maxHandles * m_maxPairsPerBody * sizeof(unsigned int);
	m_dPairBuff = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, memSize, NULL, &ciErrNum);
	GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);

	// (start, curr) per handle plus a terminating start value. The kernels load
	// pPairBuffStartCurr[handleIndex+1] as a whole int2, so the terminating entry is padded
	// to a full int2 (m_maxHandles * 2 + 2 values) to keep that load inside the buffer.
	// The host side still uploads only m_maxHandles * 2 + 1 values; the padding is never used.
	memSize = (m_maxHandles * 2 + 2) * sizeof(unsigned int);
	m_dPairBuffStartCurr = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, memSize, NULL, &ciErrNum);
	GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);

	// two float4 (min, max) per small and large proxy
	unsigned int numAABB = m_maxHandles + m_maxLargeHandles;
	memSize = numAABB * sizeof(float) * 4 * 2;
	m_dAABB = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, memSize, NULL, &ciErrNum);
	GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);

	// per-body change counts with one extra slot at each end for the scan
	memSize = (m_maxHandles + 2) * sizeof(unsigned int);
	m_dPairScanChanged = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, memSize, NULL, &ciErrNum);
	GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);

	// changed pairs and contiguous pairs share the same size
	memSize = m_maxHandles * m_maxPairsPerBody * sizeof(unsigned int);
	m_dPairsChanged = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, memSize, NULL, &ciErrNum);
	GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
	m_dPairsContiguous = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, memSize, NULL, &ciErrNum);
	GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);

	// grid parameters: room for three float4 (setParameters() uploads two)
	memSize = 3 * 4 * sizeof(float);
	m_dBpParams = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, memSize, NULL, &ciErrNum);
	GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
}
/// Uploads the initial contents of the hash, pair-start and pair buffers.
/// The hash buffer is filled with 0xFF bytes over all m_hashSize entries; the pair
/// buffer is zeroed; the pair-start table is uploaded as it currently is on the host.
void bt3dGridBroadphaseOCL::prefillBuffers()
{
	const size_t hashBytes = m_maxHandles * 2 * sizeof(unsigned int);
	memset(m_hBodiesHash, 0xFF, hashBytes);
	copyArrayToDevice(m_dBodiesHash, m_hBodiesHash, hashBytes);

	// now fill the rest (bitonic sorting works with size == pow of 2)
	// the host array is reused as source, written behind the first m_maxHandles entries
	int remainder = m_hashSize - m_maxHandles;
	if(remainder != 0)
	{
		copyArrayToDevice(m_dBodiesHash, m_hBodiesHash, remainder * 2 * sizeof(unsigned int), hashBytes, 0);
	}

	copyArrayToDevice(m_dPairBuffStartCurr, m_hPairBuffStartCurr, (m_maxHandles * 2 + 1) * sizeof(unsigned int));

	const size_t pairBytes = m_maxHandles * m_maxPairsPerBody * sizeof(unsigned int);
	memset(m_hPairBuff, 0x00, pairBytes);
	copyArrayToDevice(m_dPairBuff, m_hPairBuff, pairBytes);
}
void bt3dGridBroadphaseOCL::initKernel(int kernelId, char* pName)
{
cl_int ciErrNum;
cl_kernel kernel = clCreateKernel(m_cpProgram, pName, &ciErrNum);
GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
size_t wgSize;
ciErrNum = clGetKernelWorkGroupInfo(kernel, m_cdDevice, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wgSize, NULL);
GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
m_kernels[kernelId].m_Id = kernelId;
m_kernels[kernelId].m_kernel = kernel;
m_kernels[kernelId].m_name = pName;
m_kernels[kernelId].m_workgroupSize = (int)wgSize;
return;
}
/// Launches kernel kernelId over globalSize elements and flushes the queue.
/// globalSize is passed to the kernel as argument 0. The work-group size is the
/// kernel's reported size capped at 64, and the global size is rounded up to a
/// multiple of it (the kernels ignore the surplus work-items). If no positive
/// work-group size is known, the OpenCL runtime chooses one.
/// Nothing happens for globalSize <= 0.
void bt3dGridBroadphaseOCL::runKernelWithWorkgroupSize(int kernelId, int globalSize)
{
	if(globalSize <= 0)
	{
		return;
	}
	cl_kernel kernelFunc = m_kernels[kernelId].m_kernel;
	cl_int ciErrNum = clSetKernelArg(kernelFunc, 0, sizeof(int), (void*)&globalSize);
	GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);

	const int workgroupSize = btMin(64,m_kernels[kernelId].m_workgroupSize);

	size_t globalWorkSize[2];
	size_t localWorkSize[2];
	size_t* pLocalWorkSize = NULL;	// NULL: let OpenCL library calculate workgroup size
	globalWorkSize[0] = globalSize;
	globalWorkSize[1] = 1;
	if(workgroupSize > 0)
	{
		// number of groups, rounded up
		int numGroups = globalSize / workgroupSize;
		if(numGroups * workgroupSize < globalSize)
		{
			numGroups++;
		}
		localWorkSize[0] = workgroupSize;
		localWorkSize[1] = 1;
		globalWorkSize[0] = numGroups * workgroupSize;
		pLocalWorkSize = localWorkSize;
	}
	ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue, kernelFunc, 1, NULL, globalWorkSize, pLocalWorkSize, 0,0,0 );
	GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);

	ciErrNum = clFlush(m_cqCommandQue);
	GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
}
void bt3dGridBroadphaseOCL::setKernelArg(int kernelId, int argNum, int argSize, void* argPtr)
{
cl_int ciErrNum;
ciErrNum = clSetKernelArg(m_kernels[kernelId].m_kernel, argNum, argSize, argPtr);
GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
}
/// Blocking upload of size bytes from host + hostOffs into the device buffer at byte
/// offset devOffs. A size of zero is a no-op.
void bt3dGridBroadphaseOCL::copyArrayToDevice(cl_mem device, const void* host, unsigned int size, int devOffs, int hostOffs)
{
	if (size == 0)
	{
		return;
	}
	char* source = (char*)host + hostOffs;
	const cl_int status = clEnqueueWriteBuffer(m_cqCommandQue, device, CL_TRUE, devOffs, size, source, 0, NULL, NULL);
	GRID3DOCL_CHECKERROR(status, CL_SUCCESS);
}
void bt3dGridBroadphaseOCL::copyArrayFromDevice(void* host, const cl_mem device, unsigned int size, int hostOffs, int devOffs)
{
if (size)
{
cl_int ciErrNum;
char* pHost = (char*)host + hostOffs;
ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, device, CL_TRUE, devOffs, size, pHost, 0, NULL, NULL);
GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
}
}
//
// overrides
//
/// Lets the base class fill the host AABB array, then uploads the AABBs of all small
/// and large handles (two bt3DGrid3F1U per handle) to the device.
void bt3dGridBroadphaseOCL::prepareAABB()
{
btGpu3DGridBroadphase::prepareAABB();
copyArrayToDevice(m_dAABB, m_hAABB, sizeof(bt3DGrid3F1U) * 2 * (m_numHandles + m_numLargeHandles));
return;
}
/// Forwards the parameters to the base class and uploads the subset the kernels read:
/// four floats of inverse cell size (w = 0) followed by four ints holding the grid
/// dimensions, with the per-cell body limit in the fourth slot.
void bt3dGridBroadphaseOCL::setParameters(bt3DGridBroadphaseParams* hostParams)
{
	btGpu3DGridBroadphase::setParameters(hostParams);

	// layout must match what the kernels read from pParams[0] and pParams[1]
	struct btParamsBpOCL
	{
		float m_invCellSize[4];
		int m_gridSize[4];
	};
	btParamsBpOCL deviceParams;

	float* invCellSize = deviceParams.m_invCellSize;
	invCellSize[0] = m_params.m_invCellSizeX;
	invCellSize[1] = m_params.m_invCellSizeY;
	invCellSize[2] = m_params.m_invCellSizeZ;
	invCellSize[3] = 0.f;

	int* gridSize = deviceParams.m_gridSize;
	gridSize[0] = m_params.m_gridSizeX;
	gridSize[1] = m_params.m_gridSizeY;
	gridSize[2] = m_params.m_gridSizeZ;
	gridSize[3] = m_params.m_maxBodiesPerCell;

	copyArrayToDevice(m_dBpParams, &deviceParams, sizeof(btParamsBpOCL));
}
/// Runs the kCalcHashAABB kernel over all small handles.
/// The disabled #else branch would call the base-class implementation instead.
void bt3dGridBroadphaseOCL::calcHashAABB()
{
BT_PROFILE("calcHashAABB");
#if 1
runKernelWithWorkgroupSize(GRID3DOCL_KERNEL_CALC_HASH_AABB, m_numHandles);
#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
// wait for the kernel so the profile zone covers its execution
clFinish(m_cqCommandQue);
#endif //ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
#else
btGpu3DGridBroadphase::calcHashAABB();
#endif
return;
}
/// Sorts the (hash, body index) pairs in m_dBodiesHash.
/// Three compile-time paths exist:
///  - MiniCL: base-class sort on the host array, then upload;
///  - USE_HOST (currently commented out): download, Adl host radix sort, upload;
///  - default: Adl RadixSort32 directly on the device buffer.
void bt3dGridBroadphaseOCL::sortHash()
{
BT_PROFILE("sortHash");
#ifdef CL_PLATFORM_MINI_CL
// NOTE(review): the download below is commented out, so the host array is sorted as it is -- confirm this is intended
//copyArrayFromDevice(m_hBodiesHash, m_dBodiesHash, m_numHandles * 2 * sizeof(unsigned int));
btGpu3DGridBroadphase::sortHash();
copyArrayToDevice(m_dBodiesHash, m_hBodiesHash, m_numHandles * 2 * sizeof(unsigned int));
#else
//#define USE_HOST
#ifdef USE_HOST
copyArrayFromDevice(m_hBodiesHash, m_dBodiesHash, m_numHandles * 2 * sizeof(unsigned int));
//adl::Buffer<unsigned int> keysIn,keysOut,valuesIn,valuesOut;
///adl::RadixSort32<adl::TYPE_CL>::execute(dataC,keysIn,keysOut,valuesIn,valuesOut,m_numHandles);
// wrap the host array in an Adl buffer without copying
adl::HostBuffer<adl::SortData> inoutHost;
inoutHost.m_device = m_deviceHost;
inoutHost.m_ptr = (adl::SortData*)m_hBodiesHash;
inoutHost.m_size = m_numHandles;
adl::RadixSort<adl::TYPE_HOST>::execute(dataHost, inoutHost,m_numHandles);
copyArrayToDevice(m_dBodiesHash, m_hBodiesHash, m_numHandles * 2 * sizeof(unsigned int));
#else
{
clFinish(m_cqCommandQue);
BT_PROFILE("RadixSort32::execute");
// wrap the existing device buffer in an Adl buffer without copying
adl::Buffer<adl::SortData> inout;
inout.m_device = this->m_deviceCL;
inout.m_size = m_numHandles;
inout.m_ptr = (adl::SortData*)m_dBodiesHash;
// round the element count up to a multiple of the sorter's DATA_ALIGNMENT
// NOTE(review): entries past m_numHandles therefore take part in the sort; presumably they
// still hold the 0xFF padding written by prefillBuffers() -- confirm
int actualHandles = m_numHandles;
int dataAlignment = adl::RadixSort32<adl::TYPE_CL>::DATA_ALIGNMENT;
if (actualHandles%dataAlignment)
{
actualHandles += dataAlignment-(actualHandles%dataAlignment);
}
adl::RadixSort32<adl::TYPE_CL>::execute(dataC,inout, actualHandles);
#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
clFinish(m_cqCommandQue);
#endif //ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
}
{
//BT_PROFILE("copyArrayFromDevice");
//copyArrayFromDevice(m_hBodiesHash, m_dBodiesHash, m_numHandles * 2 * sizeof(unsigned int));
#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
clFinish(m_cqCommandQue);
#endif //ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
}
#endif //USE_HOST
#endif
return;
}
/// Builds the per-cell start table from the sorted hash list.
/// MiniCL: computed by the base class on the host and uploaded.
/// Otherwise: kClearCellStart over all cells, then kFindCellStart over all small handles.
void bt3dGridBroadphaseOCL::findCellStart()
{
#if 1
BT_PROFILE("findCellStart");
#if defined(CL_PLATFORM_MINI_CL)
btGpu3DGridBroadphase::findCellStart();
copyArrayToDevice(m_dCellStart, m_hCellStart, m_numCells * sizeof(unsigned int));
#else
runKernelWithWorkgroupSize(GRID3DOCL_KERNEL_CLEAR_CELL_START, m_numCells);
runKernelWithWorkgroupSize(GRID3DOCL_KERNEL_FIND_CELL_START, m_numHandles);
#endif
#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
clFinish(m_cqCommandQue);
#endif
#else
btGpu3DGridBroadphase::findCellStart();
#endif
return;
}
/// Runs the kFindOverlappingPairs kernel over all small handles.
/// The disabled #else branch would run the base-class version on the host and upload
/// the pair-start table and the pair buffer.
void bt3dGridBroadphaseOCL::findOverlappingPairs()
{
#if 1
BT_PROFILE("findOverlappingPairs");
runKernelWithWorkgroupSize(GRID3DOCL_KERNEL_FIND_OVERLAPPING_PAIRS, m_numHandles);
#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
clFinish(m_cqCommandQue);
#endif
#else
btGpu3DGridBroadphase::findOverlappingPairs();
copyArrayToDevice(m_dPairBuffStartCurr, m_hPairBuffStartCurr, (m_maxHandles * 2 + 1) * sizeof(unsigned int));
copyArrayToDevice(m_dPairBuff, m_hPairBuff, m_maxHandles * m_maxPairsPerBody * sizeof(unsigned int));
#endif
return;
}
/// Runs the kFindPairsLarge kernel over all small handles when at least one large
/// handle exists. The number of large handles is passed as kernel argument 6.
void bt3dGridBroadphaseOCL::findPairsLarge()
{
BT_PROFILE("findPairsLarge");
#if 1
if(m_numLargeHandles)
{
// argument 6 is the only non-buffer argument; it is set on every call
setKernelArg(GRID3DOCL_KERNEL_FIND_PAIRS_LARGE, 6, sizeof(int),(void*)&m_numLargeHandles);
runKernelWithWorkgroupSize(GRID3DOCL_KERNEL_FIND_PAIRS_LARGE, m_numHandles);
}
#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
clFinish(m_cqCommandQue);
#endif
#else
btGpu3DGridBroadphase::findPairsLarge();
#endif
return;
}
/// Runs the kComputePairCacheChanges kernel over all small handles and downloads the
/// per-body change counts (m_numHandles + 2 values) into m_hPairScanChanged.
void bt3dGridBroadphaseOCL::computePairCacheChanges()
{
BT_PROFILE("computePairCacheChanges");
#if 1
runKernelWithWorkgroupSize(GRID3DOCL_KERNEL_COMPUTE_CACHE_CHANGES, m_numHandles);
#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
clFinish(m_cqCommandQue);
#endif
copyArrayFromDevice( m_hPairScanChanged,m_dPairScanChanged, sizeof(unsigned int)*(m_numHandles + 2));
#else
btGpu3DGridBroadphase::computePairCacheChanges();
copyArrayToDevice(m_dPairScanChanged, m_hPairScanChanged, sizeof(unsigned int)*(m_numHandles + 2));
#endif
return;
}
// Defined in another translation unit; not referenced in the visible part of this file.
extern cl_device_type deviceType;
/// Prefix-scans the per-body change counts in m_dPairScanChanged on the device with
/// Adl and stores the total in m_numPrefixSum.
/// @param copyToCpu  when true, the scanned values are also downloaded into m_hPairScanChanged.
void bt3dGridBroadphaseOCL::scanOverlappingPairBuff(bool copyToCpu)
{
//Intel/CPU version doesn't handle Adl scan well
#if 0
{
copyArrayFromDevice(m_hPairScanChanged, m_dPairScanChanged, sizeof(unsigned int)*(m_numHandles + 2));
btGpu3DGridBroadphase::scanOverlappingPairBuff();
copyArrayToDevice(m_dPairScanChanged, m_hPairScanChanged, sizeof(unsigned int)*(m_numHandles + 2));
m_numPrefixSum = m_hPairScanChanged[m_numHandles+1];
clFinish(m_cqCommandQue);
//memset(m_hPairScanChanged,0,sizeof(int)*m_maxHandles + 2);
}
#else
{
// copyArrayFromDevice(m_hPairScanChanged, m_dPairScanChanged, sizeof(unsigned int)*(m_numHandles + 2));
// btGpu3DGridBroadphase::scanOverlappingPairBuff();
// wrap m_dPairScanChanged in an Adl buffer without copying
adl::Buffer<unsigned int> destBuffer;
{
BT_PROFILE("copy GPU->GPU");
destBuffer.m_ptr = (unsigned int*)m_dPairScanChanged;
destBuffer.m_device = m_deviceCL;
// NOTE(review): m_size is set to a byte count here, while sortHash() sets it to an element count -- confirm which one Adl expects
destBuffer.m_size = sizeof(unsigned int)*(m_numHandles+2);
// NOTE(review): presumably copies m_numHandles counts from destBuffer into m_srcClBuffer, both at offset 1 -- confirm the argument order of DeviceCL::copy
m_deviceCL->copy(m_srcClBuffer, &destBuffer,m_numHandles,1,1);
#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
clFinish(m_cqCommandQue);
#endif
}
{
BT_PROFILE("PrefixScan");
// scan m_srcClBuffer into destBuffer (gData1 was allocated as an exclusive scan); the total goes to m_numPrefixSum
adl::PrefixScan<adl::TYPE_CL>::execute(gData1,*m_srcClBuffer,destBuffer, m_numHandles+2,&m_numPrefixSum);
#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
clFinish(m_cqCommandQue);
#endif
//if (m_numPrefixSum>0x1000)
// {
// printf("error m_numPrefixSum==%d\n",m_numPrefixSum);
// }
}
#if 0
unsigned int* verifyhPairScanChanged = new unsigned int[m_maxHandles + 2];
memset(verifyhPairScanChanged,0,sizeof(int)*m_maxHandles + 2);
copyArrayFromDevice(verifyhPairScanChanged, m_dPairScanChanged, sizeof(unsigned int)*(m_numHandles + 2));
clFinish(m_cqCommandQue);
/*for (int i=0;i<m_numHandles+2;i++)
{
if (verifyhPairScanChanged[i] != m_hPairScanChanged[i])
{
printf("hello!\n");
}
}
*/
#endif
if (1)
{
//the data
if (copyToCpu)
{
BT_PROFILE("copy GPU -> CPU");
copyArrayFromDevice(m_hPairScanChanged, m_dPairScanChanged, sizeof(unsigned int)*(m_numHandles + 2));
#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
clFinish(m_cqCommandQue);
#endif
}
}
}
#endif
}
/// Runs the kSqueezeOverlappingPairBuff kernel over all small handles. The changed
/// pairs stay on the device (the download below is commented out).
void bt3dGridBroadphaseOCL::squeezeOverlappingPairBuff()
{
BT_PROFILE("btCuda_squeezeOverlappingPairBuff");
#if 1
runKernelWithWorkgroupSize(GRID3DOCL_KERNEL_SQUEEZE_PAIR_BUFF, m_numHandles);
// btCuda_squeezeOverlappingPairBuff(m_dPairBuff, m_dPairBuffStartCurr, m_dPairScanChanged, m_dPairsChanged, m_dAABB, m_numHandles);
//copyArrayFromDevice(m_hPairsChanged, m_dPairsChanged, sizeof(unsigned int) * m_numPrefixSum);//m_hPairScanChanged[m_numHandles+1]); //gSum
#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
clFinish(m_cqCommandQue);
#endif
#else
btGpu3DGridBroadphase::squeezeOverlappingPairBuff();
#endif
return;
}
/// Resets the base-class pool, then re-uploads the initial device buffer contents so
/// the device state matches the reset host state.
void bt3dGridBroadphaseOCL::resetPool(btDispatcher* dispatcher)
{
btGpu3DGridBroadphase::resetPool(dispatcher);
prefillBuffers();
}

View File

@@ -0,0 +1,146 @@
/*
Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
Copyright (C) 2006 - 2009 Sony Computer Entertainment Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#ifndef BT3DGRIDBROADPHASEOCL_H
#define BT3DGRIDBROADPHASEOCL_H
#ifdef __APPLE__
#ifdef USE_MINICL
#include <MiniCL/cl.h>
#else
#include <MiniCL/cl.h>
#endif
//CL_PLATFORM_MINI_CL could be defined in build system
#else
//#include <GL/glew.h>
// standard utility and system includes
#ifdef USE_MINICL
#include <MiniCL/cl.h>
#else
#include <CL/cl.h>
#endif
// Extra CL/GL include
//#include <CL/cl_gl.h>
#endif //__APPLE__
// Forward declarations of the adl device types that this header only uses through pointers.
namespace adl
{
	struct Device;
	struct DeviceCL;
}
#include "BulletCollision/BroadphaseCollision/btSimpleBroadphase.h"
#include "btGpu3DGridBroadphaseSharedTypes.h"
#include "btGpu3DGridBroadphase.h"
// Compares (a) against (b); on mismatch prints the value of (a) and trips btAssert.
// NOTE(review): expands to a bare `if` statement and evaluates (a) more than once,
// so avoid arguments with side effects and take care when an `else` follows the macro.
#define GRID3DOCL_CHECKERROR(a, b) if((a)!=(b)) { printf("3D GRID OCL Error : %d\n", (a)); btAssert((a) == (b)); }
// Identifiers of the broadphase kernels. GRID3DOCL_KERNEL_TOTAL is the number of
// kernels and is used as the size of the m_kernels table.
enum
{
	GRID3DOCL_KERNEL_CALC_HASH_AABB = 0,
	GRID3DOCL_KERNEL_CLEAR_CELL_START = 1,
	GRID3DOCL_KERNEL_FIND_CELL_START = 2,
	GRID3DOCL_KERNEL_FIND_OVERLAPPING_PAIRS = 3,
	GRID3DOCL_KERNEL_FIND_PAIRS_LARGE = 4,
	GRID3DOCL_KERNEL_COMPUTE_CACHE_CHANGES = 5,
	GRID3DOCL_KERNEL_SQUEEZE_PAIR_BUFF = 6,
	GRID3DOCL_KERNEL_TOTAL = 7
};
// Per-kernel bookkeeping record; the class below keeps one per GRID3DOCL_KERNEL_* id.
struct bt3dGridOCLKernelInfo
{
	int			m_Id;				// kernel identifier (presumably a GRID3DOCL_KERNEL_* value)
	cl_kernel	m_kernel;			// OpenCL kernel object
	char*		m_name;				// kernel name string
	int			m_workgroupSize;	// work-group size used when launching this kernel
};
///The bt3dGridBroadphaseOCL uses OpenCL-capable GPU to compute overlapping pairs
/// OpenCL flavour of btGpu3DGridBroadphase: the virtual pipeline stages declared by the
/// base class are overridden here, and the class keeps cl_mem buffers next to the
/// host arrays owned by the base class.
class bt3dGridBroadphaseOCL : public btGpu3DGridBroadphase
{
protected:
	int						m_hashSize;

	// OpenCL objects
	cl_context				m_cxMainContext;
	cl_device_id			m_cdDevice;
	cl_command_queue		m_cqCommandQue;
	cl_program				m_cpProgram;

	// kernel table, GRID3DOCL_KERNEL_TOTAL entries
	bt3dGridOCLKernelInfo	m_kernels[GRID3DOCL_KERNEL_TOTAL];

	// data buffers
	cl_mem					m_dBodiesHash;
	cl_mem					m_dCellStart;
	cl_mem					m_dPairBuff;
	cl_mem					m_dPairBuffStartCurr;
public:
	// AABB buffer, publicly accessible
	cl_mem					m_dAABB;
protected:
	cl_mem					m_dPairScanChanged;
	cl_mem					m_dPairsChanged;
	cl_mem					m_dPairsContiguous;
	cl_mem					m_dBpParams;

	// adl devices
	adl::Device*			m_deviceHost;
	adl::DeviceCL*			m_deviceCL;
	bool					m_ownsDevice;
public:
	// total written by the prefix scan in scanOverlappingPairBuff()
	unsigned int			m_numPrefixSum;

	/// The grid and proxy-limit arguments mirror the btGpu3DGridBroadphase constructor;
	/// context, device, queue and deviceCL are optional.
	bt3dGridBroadphaseOCL(btOverlappingPairCache* overlappingPairCache,
		const btVector3& cellSize,
		int gridSizeX, int gridSizeY, int gridSizeZ,
		int maxSmallProxies, int maxLargeProxies, int maxPairsPerSmallProxy,
		btScalar maxSmallProxySize,
		int maxSmallProxiesPerCell = 8,
		cl_context context = NULL,
		cl_device_id device = NULL,
		cl_command_queue queue = NULL,
		adl::DeviceCL* deviceCL = 0
		);
	virtual ~bt3dGridBroadphaseOCL();

protected:
	// setup helpers
	void initCL(cl_context context, cl_device_id device, cl_command_queue queue);
	void initKernels();
	void allocateBuffers();
	void prefillBuffers();
	void initKernel(int kernelId, char* pName);

	// buffer and kernel utilities
	void allocateArray(void** devPtr, unsigned int size);
	void freeArray(void* devPtr);
	void runKernelWithWorkgroupSize(int kernelId, int globalSize);
	void setKernelArg(int kernelId, int argNum, int argSize, void* argPtr);
	void copyArrayToDevice(cl_mem device, const void* host, unsigned int size, int devOffs = 0, int hostOffs = 0);
	void copyArrayFromDevice(void* host, const cl_mem device, unsigned int size, int hostOffs = 0, int devOffs = 0);

	// overrides
	virtual void setParameters(bt3DGridBroadphaseParams* hostParams);
	virtual void prepareAABB();
	virtual void calcHashAABB();
	virtual void sortHash();
	virtual void findCellStart();
	virtual void findOverlappingPairs();
	virtual void findPairsLarge();
	virtual void computePairCacheChanges();
	virtual void scanOverlappingPairBuff(bool copyToCpu=true);
	virtual void squeezeOverlappingPairBuff();
	virtual void resetPool(btDispatcher* dispatcher);
};
#endif //BT3DGRIDBROADPHASEOCL_H

View File

@@ -0,0 +1,626 @@
/*
Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
Copyright (C) 2006, 2009 Sony Computer Entertainment Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
///The 3 following lines include the CPU implementation of the kernels, keep them in this order.
#include "btGpuDefines.h"
#include "btGpuUtilsSharedDefs.h"
#include "btGpuUtilsSharedCode.h"
#include "LinearMath/btAlignedAllocator.h"
#include "LinearMath/btQuickprof.h"
#include "BulletCollision/BroadphaseCollision/btOverlappingPairCache.h"
#include "btGpuDefines.h"
#include "btGpuUtilsSharedDefs.h"
#include "btGpu3DGridBroadphaseSharedDefs.h"
#include "btGpu3DGridBroadphase.h"
#include <string.h> //for memset
#include <stdio.h>
// File-scope copy of the grid parameters; btGpu3DGridBroadphase::setParameters() overwrites it.
// NOTE(review): presumably what the shared kernel code reads through BT_GPU_params — confirm in btGpuDefines.h.
static bt3DGridBroadphaseParams s3DGridBroadphaseParams;
/// Builds the broadphase with its own btHashedOverlappingPairCache (allocated 16-byte
/// aligned and handed to btSimpleBroadphase), then sets up the grid through _initialize().
btGpu3DGridBroadphase::btGpu3DGridBroadphase(const btVector3& cellSize,
		int gridSizeX, int gridSizeY, int gridSizeZ,
		int maxSmallProxies, int maxLargeProxies, int maxPairsPerBody,
		btScalar maxSmallProxySize,
		int maxBodiesPerCell)
	: btSimpleBroadphase(maxSmallProxies,
		new (btAlignedAlloc(sizeof(btHashedOverlappingPairCache), 16)) btHashedOverlappingPairCache)
	, m_bInitialized(false)
	, m_numBodies(0)
{
	_initialize(cellSize,
		gridSizeX, gridSizeY, gridSizeZ,
		maxSmallProxies, maxLargeProxies, maxPairsPerBody,
		maxSmallProxySize,
		maxBodiesPerCell);
}
/// Builds the broadphase around a caller-supplied pair cache (passed to btSimpleBroadphase),
/// then sets up the grid through _initialize().
btGpu3DGridBroadphase::btGpu3DGridBroadphase(btOverlappingPairCache* overlappingPairCache,
		const btVector3& cellSize,
		int gridSizeX, int gridSizeY, int gridSizeZ,
		int maxSmallProxies, int maxLargeProxies, int maxPairsPerBody,
		btScalar maxSmallProxySize,
		int maxBodiesPerCell)
	: btSimpleBroadphase(maxSmallProxies, overlappingPairCache)
	, m_bInitialized(false)
	, m_numBodies(0)
{
	_initialize(cellSize,
		gridSizeX, gridSizeY, gridSizeZ,
		maxSmallProxies, maxLargeProxies, maxPairsPerBody,
		maxSmallProxySize,
		maxBodiesPerCell);
}
/// Releases everything allocated by _initialize().
/// The pair cache itself is not released here; that is left to btSimpleBroadphase.
btGpu3DGridBroadphase::~btGpu3DGridBroadphase()
{
	assert(m_bInitialized);
	_finalize();
}
/// Returns the largest power of two not exceeding val, i.e. 2^n with 2^(n+1) > val >= 2^n.
/// Bits are tested from 2^30 downwards; when none of the bits 30..1 is set the result is 1.
int btGpu3DGridBroadphase::getFloorPowOfTwo(int val)
{
	int bit = 0x40000000;
	// Walk down until a set bit is found or only the lowest bit is left.
	while ((bit > 1) && !(bit & val))
	{
		bit >>= 1;
	}
	return bit;
}
void btGpu3DGridBroadphase::_initialize( const btVector3& cellSize,
int gridSizeX, int gridSizeY, int gridSizeZ,
int maxSmallProxies, int maxLargeProxies, int maxPairsPerBody,
btScalar maxSmallProxySize,
int maxBodiesPerCell)
{
// set various paramerers
m_ownsPairCache = true;
m_params.m_gridSizeX = getFloorPowOfTwo(gridSizeX);
m_params.m_gridSizeY = getFloorPowOfTwo(gridSizeY);
m_params.m_gridSizeZ = getFloorPowOfTwo(gridSizeZ);
m_params.m_numCells = m_params.m_gridSizeX * m_params.m_gridSizeY * m_params.m_gridSizeZ;
m_numCells = m_params.m_numCells;
m_params.m_invCellSizeX = btScalar(1.f) / cellSize[0];
m_params.m_invCellSizeY = btScalar(1.f) / cellSize[1];
m_params.m_invCellSizeZ = btScalar(1.f) / cellSize[2];
m_maxRadius = maxSmallProxySize * btScalar(0.5f);
m_params.m_numBodies = m_numBodies;
m_params.m_maxBodiesPerCell = maxBodiesPerCell;
m_numLargeHandles = 0;
m_maxLargeHandles = maxLargeProxies;
m_maxPairsPerBody = maxPairsPerBody;
m_LastLargeHandleIndex = -1;
assert(!m_bInitialized);
// allocate host storage
m_hBodiesHash = new unsigned int[m_maxHandles * 2];
memset(m_hBodiesHash, 0x00, m_maxHandles*2*sizeof(unsigned int));
m_hCellStart = new unsigned int[m_params.m_numCells];
memset(m_hCellStart, 0x00, m_params.m_numCells * sizeof(unsigned int));
m_hPairBuffStartCurr = new unsigned int[m_maxHandles * 2 + 2];
// --------------- for now, init with m_maxPairsPerBody for each body
m_hPairBuffStartCurr[0] = 0;
m_hPairBuffStartCurr[1] = 0;
for(int i = 1; i <= m_maxHandles; i++)
{
m_hPairBuffStartCurr[i * 2] = m_hPairBuffStartCurr[(i-1) * 2] + m_maxPairsPerBody;
m_hPairBuffStartCurr[i * 2 + 1] = 0;
}
//----------------
unsigned int numAABB = m_maxHandles + m_maxLargeHandles;
m_hAABB = new bt3DGrid3F1U[numAABB * 2]; // AABB Min & Max
m_hPairBuff = new unsigned int[m_maxHandles * m_maxPairsPerBody];
memset(m_hPairBuff, 0x00, m_maxHandles * m_maxPairsPerBody * sizeof(unsigned int)); // needed?
m_hPairScanChanged = new unsigned int[m_maxHandles + 2];
memset(m_hPairScanChanged,0,sizeof(int)*m_maxHandles + 2);
m_hPairsChanged = new unsigned int[m_maxHandles * m_maxPairsPerBody];
memset(m_hPairsChanged,0,sizeof(int)*(m_maxHandles * m_maxPairsPerBody));
m_hAllOverlappingPairs= new MyUint2[m_maxHandles * m_maxPairsPerBody];
memset(m_hAllOverlappingPairs,0,sizeof(MyUint2)*(m_maxHandles * m_maxPairsPerBody));
// large proxies
// allocate handles buffer and put all handles on free list
m_pLargeHandlesRawPtr = btAlignedAlloc(sizeof(btSimpleBroadphaseProxy) * m_maxLargeHandles, 16);
m_pLargeHandles = new(m_pLargeHandlesRawPtr) btSimpleBroadphaseProxy[m_maxLargeHandles];
m_firstFreeLargeHandle = 0;
{
for (int i = m_firstFreeLargeHandle; i < m_maxLargeHandles; i++)
{
m_pLargeHandles[i].SetNextFree(i + 1);
m_pLargeHandles[i].m_uniqueId = m_maxHandles+2+i;
}
m_pLargeHandles[m_maxLargeHandles - 1].SetNextFree(0);
}
// debug data
m_numPairsAdded = 0;
m_numOverflows = 0;
m_bInitialized = true;
}
/// Frees every array allocated by _initialize() and marks the object as uninitialized.
void btGpu3DGridBroadphase::_finalize()
{
	assert(m_bInitialized);

	// small-proxy working arrays
	delete[] m_hBodiesHash;
	delete[] m_hCellStart;
	delete[] m_hPairBuffStartCurr;
	delete[] m_hAABB;
	delete[] m_hPairBuff;
	delete[] m_hPairScanChanged;
	delete[] m_hPairsChanged;
	delete[] m_hAllOverlappingPairs;

	// large-proxy pool (raw aligned block)
	btAlignedFree(m_pLargeHandlesRawPtr);

	m_bInitialized = false;
}
/// Runs one broadphase update. After the btSimpleBroadphase pass, the virtual pipeline
/// stages are executed in a fixed order, each inside its own profiling scope, and the
/// results are merged into the pair cache. With no small proxies only the large/large
/// test is performed.
void btGpu3DGridBroadphase::calculateOverlappingPairs(btDispatcher* dispatcher)
{
	btSimpleBroadphase::calculateOverlappingPairs(dispatcher);

	if (m_numHandles <= 0)
	{
		// no small proxies: only large proxies can form pairs
		BT_PROFILE("addLarge2LargePairsToCache");
		addLarge2LargePairsToCache(dispatcher);
		return;
	}

	{	// update constants
		BT_PROFILE("setParameters");
		setParameters(&m_params);
	}
	{	// prepare AABB array
		BT_PROFILE("prepareAABB");
		prepareAABB();
	}
	{	// calculate hash
		BT_PROFILE("calcHashAABB");
		calcHashAABB();
	}
	{	// sort bodies based on hash
		BT_PROFILE("sortHash");
		sortHash();
	}
	{	// find start of each cell
		BT_PROFILE("findCellStart");
		findCellStart();
	}
	{	// small/small pairs
		BT_PROFILE("findOverlappingPairs");
		findOverlappingPairs();
	}
	{	// small/large pairs
		BT_PROFILE("findPairsLarge");
		findPairsLarge();
	}
	{	// per-body change counts
		BT_PROFILE("computePairCacheChanges");
		computePairCacheChanges();
	}
	{	// prefix sum of the change counts
		BT_PROFILE("scanOverlappingPairBuff");
		scanOverlappingPairBuff();
	}
	{	// compact the changed pairs
		BT_PROFILE("squeezeOverlappingPairBuff");
		squeezeOverlappingPairBuff();
	}
	{	// add pairs to CPU cache
		BT_PROFILE("addPairsToCache");
		addPairsToCache(dispatcher);
	}
	{	// find and add large/large pairs to CPU cache
		BT_PROFILE("addLarge2LargePairsToCache");
		addLarge2LargePairsToCache(dispatcher);
	}
}
/// Applies the changed-pair list to the pair cache. For body i the entries
/// m_hPairsChanged[scan[i+1] .. scan[i+2]) are visited (scan = m_hPairScanChanged);
/// entries carrying BT_3DGRID_PAIR_NEW_FLG are added, all others are removed.
void btGpu3DGridBroadphase::addPairsToCache(btDispatcher* dispatcher)
{
	m_numPairsAdded = 0;
	m_numPairsRemoved = 0;

	for (int body = 0; body < m_numHandles; ++body)
	{
		const unsigned int first = m_hPairScanChanged[body + 1];
		const unsigned int count = m_hPairScanChanged[body + 2] - first;
		if (count == 0)
		{
			continue;
		}

		const unsigned int* changed = m_hPairsChanged + first;
		// the min-corner entry of the AABB array carries the handle index of the body
		const unsigned int handle0 = m_hAABB[body * 2].uw;
		btSimpleBroadphaseProxy* proxy0 = &m_pHandles[handle0];

		for (unsigned int k = 0; k < count; ++k)
		{
			const unsigned int entry = changed[k];
			unsigned int handle1 = entry & (~BT_3DGRID_PAIR_ANY_FLG);

			// indices at or above m_maxHandles address the large-proxy pool
			btSimpleBroadphaseProxy* proxy1;
			if (handle1 >= (unsigned int)m_maxHandles)
			{
				handle1 -= m_maxHandles;
				btAssert(handle1 < (unsigned int)m_maxLargeHandles);
				proxy1 = &m_pLargeHandles[handle1];
			}
			else
			{
				proxy1 = &m_pHandles[handle1];
			}

			if (entry & BT_3DGRID_PAIR_NEW_FLG)
			{
				m_pairCache->addOverlappingPair(proxy0, proxy1);
				m_numPairsAdded++;
			}
			else
			{
				m_pairCache->removeOverlappingPair(proxy0, proxy1, dispatcher);
				m_numPairsRemoved++;
			}
		}
	}
}
/// Creates a proxy for the given AABB. Proxies that fit the "small" criterion are
/// delegated to btSimpleBroadphase; large ones are constructed in the large-proxy pool.
/// Returns 0 when the large-proxy pool is exhausted.
btBroadphaseProxy* btGpu3DGridBroadphase::createProxy(const btVector3& aabbMin, const btVector3& aabbMax, int shapeType, void* userPtr, short int collisionFilterGroup, short int collisionFilterMask, btDispatcher* dispatcher, void* multiSapProxy)
{
	if (!isLargeProxy(aabbMin, aabbMax))
	{
		return btSimpleBroadphase::createProxy(aabbMin, aabbMax, shapeType, userPtr, collisionFilterGroup, collisionFilterMask, dispatcher, multiSapProxy);
	}

	if (m_numLargeHandles >= m_maxLargeHandles)
	{
		///you have to increase the cell size, so 'large' proxies become 'small' proxies (fitting a cell)
		btAssert(0);
		return 0; //should never happen, but don't let the game crash ;-)
	}

	btAssert((aabbMin[0] <= aabbMax[0]) && (aabbMin[1] <= aabbMax[1]) && (aabbMin[2] <= aabbMax[2]));

	// take a slot from the free list and construct the proxy in place
	const int slot = allocLargeHandle();
	btBroadphaseProxy* largeProxy = new (&m_pLargeHandles[slot]) btSimpleBroadphaseProxy(aabbMin, aabbMax, shapeType, userPtr, collisionFilterGroup, collisionFilterMask, multiSapProxy);
	return largeProxy;
}
/// Destroys a proxy. Small proxies go through btSimpleBroadphase; large proxies are
/// returned to the large-handle free list and their pairs are removed from the pair cache.
void btGpu3DGridBroadphase::destroyProxy(btBroadphaseProxy* proxy, btDispatcher* dispatcher)
{
	if (!isLargeProxy(proxy))
	{
		btSimpleBroadphase::destroyProxy(proxy, dispatcher);
		return;
	}

	freeLargeHandle(static_cast<btSimpleBroadphaseProxy*>(proxy));
	m_pairCache->removeOverlappingPairsContainingProxy(proxy, dispatcher);
}
/// Re-creates the initial layout of m_hPairBuffStartCurr: body i starts at
/// i * m_maxPairsPerBody and has a current pair count of zero. The dispatcher is not used.
void btGpu3DGridBroadphase::resetPool(btDispatcher* dispatcher)
{
	m_hPairBuffStartCurr[0] = 0;
	m_hPairBuffStartCurr[1] = 0;

	// each entry is a (start, current) couple; the start grows by one slice per body
	unsigned int* entry = m_hPairBuffStartCurr + 2;
	for (int body = 1; body <= m_maxHandles; ++body, entry += 2)
	{
		entry[0] = entry[-2] + m_maxPairsPerBody;
		entry[1] = 0;
	}
}
bool btGpu3DGridBroadphase::isLargeProxy(const btVector3& aabbMin, const btVector3& aabbMax)
{
btVector3 diag = aabbMax - aabbMin;
///use the bounding sphere radius of this bounding box, to include rotation
btScalar radius = diag.length() * btScalar(0.5f);
return (radius > m_maxRadius);
}
/// Classifies an existing proxy by its unique id: ids of large proxies start at
/// m_maxHandles + 2 (assigned in _initialize()).
bool btGpu3DGridBroadphase::isLargeProxy(btBroadphaseProxy* proxy)
{
	const int firstLargeUid = m_maxHandles + 2;
	return proxy->getUid() >= firstLargeUid;
}
/// Brute-force test of every pair of large-proxy slots up to m_LastLargeHandleIndex.
/// Overlapping pairs missing from the cache are added; cached pairs that no longer
/// overlap are removed.
void btGpu3DGridBroadphase::addLarge2LargePairsToCache(btDispatcher* dispatcher)
{
	if (m_numLargeHandles <= 0)
	{
		return;
	}

	int lastVisited = -1;
	for (int a = 0; a <= m_LastLargeHandleIndex; ++a)
	{
		btSimpleBroadphaseProxy* proxyA = &m_pLargeHandles[a];
		lastVisited = a;

		for (int b = a + 1; b <= m_LastLargeHandleIndex; ++b)
		{
			btSimpleBroadphaseProxy* proxyB = &m_pLargeHandles[b];
			btAssert(proxyA != proxyB);

			btSimpleBroadphaseProxy* simpleA = getSimpleProxyFromProxy(proxyA);
			btSimpleBroadphaseProxy* simpleB = getSimpleProxyFromProxy(proxyB);

			const bool overlapping = aabbOverlap(simpleA, simpleB);
			const bool cached = (m_pairCache->findPair(proxyA, proxyB) != 0);

			if (overlapping && !cached)
			{
				m_pairCache->addOverlappingPair(proxyA, proxyB);
			}
			else if (!overlapping && cached)
			{
				m_pairCache->removeOverlappingPair(proxyA, proxyB, dispatcher);
			}
		}
	}

	m_LastLargeHandleIndex = lastVisited;
}
/// Ray query: small proxies are handled by btSimpleBroadphase, then every large-proxy
/// slot up to m_LastLargeHandleIndex is passed to the callback without an AABB test.
void btGpu3DGridBroadphase::rayTest(const btVector3& rayFrom, const btVector3& rayTo, btBroadphaseRayCallback& rayCallback)
{
	btSimpleBroadphase::rayTest(rayFrom, rayTo, rayCallback);

	for (int slot = 0; slot <= m_LastLargeHandleIndex; ++slot)
	{
		rayCallback.process(&m_pLargeHandles[slot]);
	}
}
//
// overrides for CPU version
//
void btGpu3DGridBroadphase::prepareAABB()
{
BT_PROFILE("prepareAABB");
bt3DGrid3F1U* pBB = m_hAABB;
int i;
int new_largest_index = -1;
unsigned int num_small = 0;
for(i = 0; i <= m_LastHandleIndex; i++)
{
btSimpleBroadphaseProxy* proxy0 = &m_pHandles[i];
new_largest_index = i;
pBB->fx = proxy0->m_aabbMin.getX();
pBB->fy = proxy0->m_aabbMin.getY();
pBB->fz = proxy0->m_aabbMin.getZ();
pBB->uw = i;
pBB++;
pBB->fx = proxy0->m_aabbMax.getX();
pBB->fy = proxy0->m_aabbMax.getY();
pBB->fz = proxy0->m_aabbMax.getZ();
pBB->uw = num_small;
pBB++;
num_small++;
}
m_LastHandleIndex = new_largest_index;
new_largest_index = -1;
unsigned int num_large = 0;
for(i = 0; i <= m_LastLargeHandleIndex; i++)
{
btSimpleBroadphaseProxy* proxy0 = &m_pLargeHandles[i];
new_largest_index = i;
pBB->fx = proxy0->m_aabbMin.getX();
pBB->fy = proxy0->m_aabbMin.getY();
pBB->fz = proxy0->m_aabbMin.getZ();
pBB->uw = i + m_maxHandles;
pBB++;
pBB->fx = proxy0->m_aabbMax.getX();
pBB->fy = proxy0->m_aabbMax.getY();
pBB->fz = proxy0->m_aabbMax.getZ();
pBB->uw = num_large + m_maxHandles;
pBB++;
num_large++;
}
m_LastLargeHandleIndex = new_largest_index;
// paranoid checks
btAssert(num_small == m_numHandles);
btAssert(num_large == m_numLargeHandles);
return;
}
/// Copies *hostParams into the file-scope s3DGridBroadphaseParams instance.
void btGpu3DGridBroadphase::setParameters(bt3DGridBroadphaseParams* hostParams)
{
	s3DGridBroadphaseParams = *hostParams;
}
/// CPU stage: forwards the AABB array, the hash array and the small-proxy count to btGpu_calcHashAABB.
void btGpu3DGridBroadphase::calcHashAABB()
{
	BT_PROFILE("bt3DGrid_calcHashAABB");
	btGpu_calcHashAABB(m_hAABB,
		m_hBodiesHash,
		m_numHandles);
}
/// CPU stage: sorts the (hash, index) couples stored in m_hBodiesHash by hash value.
/// The comparison used by the quick sort places larger hashes first.
/// Fewer than two bodies need no sorting; returning early also keeps the quick sort
/// from being entered with hi == -1, where it would read the element before the array.
void btGpu3DGridBroadphase::sortHash()
{
	// view of one m_hBodiesHash entry: two consecutive unsigned ints
	class bt3DGridHashKey
	{
	public:
		unsigned int hash;
		unsigned int index;
		// in-place recursive quick sort of pData[lo..hi] (inclusive bounds, lo <= hi required)
		void quickSort(bt3DGridHashKey* pData, int lo, int hi)
		{
			int i=lo, j=hi;
			bt3DGridHashKey x = pData[(lo+hi)/2]; // pivot, taken by value
			do
			{
				while(pData[i].hash > x.hash) i++;
				while(x.hash > pData[j].hash) j--;
				if(i <= j)
				{
					bt3DGridHashKey t = pData[i];
					pData[i] = pData[j];
					pData[j] = t;
					i++; j--;
				}
			} while(i <= j);
			if(lo < j) pData->quickSort(pData, lo, j);
			if(i < hi) pData->quickSort(pData, i, hi);
		}
	};
	BT_PROFILE("bt3DGrid_sortHash");
	if(m_numHandles < 2)
	{
		return; // empty or single-element input is already sorted
	}
	bt3DGridHashKey* pHash = (bt3DGridHashKey*)m_hBodiesHash;
	pHash->quickSort(pHash, 0, m_numHandles - 1);
	return;
}
/// CPU stage: forwards the hash array, the cell-start array, the small-proxy count and
/// the number of grid cells to btGpu_findCellStart.
void btGpu3DGridBroadphase::findCellStart()
{
	BT_PROFILE("bt3DGrid_findCellStart");
	btGpu_findCellStart(m_hBodiesHash,
		m_hCellStart,
		m_numHandles,
		m_params.m_numCells);
}
/// CPU stage (small/small pairs): hands the host arrays and the small-proxy count to
/// btGpu_findOverlappingPairs.
void btGpu3DGridBroadphase::findOverlappingPairs()
{
	BT_PROFILE("bt3DGrid_findOverlappingPairs");
	btGpu_findOverlappingPairs(m_hAABB,
		m_hBodiesHash,
		m_hCellStart,
		m_hPairBuff,
		m_hPairBuffStartCurr,
		m_numHandles);
}
/// CPU stage (small/large pairs): hands the host arrays plus the small and large proxy
/// counts to btGpu_findPairsLarge.
void btGpu3DGridBroadphase::findPairsLarge()
{
	BT_PROFILE("bt3DGrid_findPairsLarge");
	btGpu_findPairsLarge(m_hAABB,
		m_hBodiesHash,
		m_hCellStart,
		m_hPairBuff,
		m_hPairBuffStartCurr,
		m_numHandles,
		m_numLargeHandles);
}
/// CPU stage: hands the pair buffer, its start/current table, the scan array, the AABB
/// array and the small-proxy count to btGpu_computePairCacheChanges.
void btGpu3DGridBroadphase::computePairCacheChanges()
{
	BT_PROFILE("bt3DGrid_computePairCacheChanges");
	btGpu_computePairCacheChanges(m_hPairBuff,
		m_hPairBuffStartCurr,
		m_hPairScanChanged,
		m_hAABB,
		m_numHandles);
}
/// CPU stage: converts m_hPairScanChanged in place into an exclusive prefix sum over
/// elements 0 .. m_numHandles+1, with element 0 forced to zero beforehand.
/// copyToCpu is not used by this host implementation.
void btGpu3DGridBroadphase::scanOverlappingPairBuff(bool copyToCpu)
{
	BT_PROFILE("bt3DGrid_scanOverlappingPairBuff");

	// element 0 is forced to zero, so it contributes nothing and the scan can start at 1
	m_hPairScanChanged[0] = 0;

	unsigned int running = 0;
	const int last = m_numHandles + 1;
	for (int slot = 1; slot <= last; ++slot)
	{
		const unsigned int count = m_hPairScanChanged[slot];
		m_hPairScanChanged[slot] = running;
		running += count;
	}
}
/// CPU stage: calls btGpu_squeezeOverlappingPairBuff with m_hAllOverlappingPairs
/// (viewed as a flat unsigned int array) as the output buffer.
void btGpu3DGridBroadphase::squeezeOverlappingPairBuff()
{
	BT_PROFILE("bt3DGrid_squeezeOverlappingPairBuff");
	unsigned int* squeezed = (unsigned int*)m_hAllOverlappingPairs;
	btGpu_squeezeOverlappingPairBuff(m_hPairBuff,
		m_hPairBuffStartCurr,
		m_hPairScanChanged,
		squeezed,
		m_hAABB,
		m_numHandles);
}
#include "btGpu3DGridBroadphaseSharedCode.h"

View File

@@ -0,0 +1,154 @@
/*
Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
Copyright (C) 2006, 2009 Sony Computer Entertainment Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//----------------------------------------------------------------------------------------
#ifndef BTGPU3DGRIDBROADPHASE_H
#define BTGPU3DGRIDBROADPHASE_H
//----------------------------------------------------------------------------------------
#include "BulletCollision/BroadphaseCollision/btSimpleBroadphase.h"
#include "btGpu3DGridBroadphaseSharedTypes.h"
// Pair of two integers; element type of the array returned by getOverlap().
struct MyUint2
{
	int x;	// first component
	int y;	// second component
};
//----------------------------------------------------------------------------------------
///The btGpu3DGridBroadphase uses GPU-style code compiled for CPU to compute overlapping pairs
/// Uniform-grid broadphase written as GPU-style code compiled for the CPU.
/// "Small" proxies live in the btSimpleBroadphase handle pool and are processed through
/// the virtual pipeline stages below; proxies that are too large for a cell are kept in
/// a separate pool (m_pLargeHandles) with its own free list.
class btGpu3DGridBroadphase : public btSimpleBroadphase
{
protected:
	bool			m_bInitialized;		// set by _initialize(), cleared by _finalize()
	unsigned int	m_numBodies;
	unsigned int	m_numCells;
	unsigned int	m_maxPairsPerBody;
	unsigned int	m_maxBodiesPerCell;
	bt3DGridBroadphaseParams m_params;
	btScalar		m_maxRadius;		// bounding radius above which a proxy is "large"
	// CPU data
	unsigned int*	m_hBodiesHash;			// 2 uints per small proxy
	unsigned int*	m_hCellStart;			// one entry per grid cell
	unsigned int*	m_hPairBuffStartCurr;	// 2 uints per small proxy, plus one trailing couple
	bt3DGrid3F1U*	m_hAABB;				// min and max entry per proxy (small first, then large)
	unsigned int*	m_hPairBuff;
	unsigned int*	m_hPairScanChanged;
	unsigned int*	m_hPairsChanged;
	MyUint2*		m_hAllOverlappingPairs;
	// large proxies
	int		m_numLargeHandles;
	int		m_maxLargeHandles;
	int		m_LastLargeHandleIndex;
	btSimpleBroadphaseProxy* m_pLargeHandles;
	void*	m_pLargeHandlesRawPtr;
	int		m_firstFreeLargeHandle;

	/// Pops the head of the large-handle free list and returns its index.
	/// The caller must make sure the pool is not exhausted.
	int allocLargeHandle()
	{
		btAssert(m_numLargeHandles < m_maxLargeHandles);
		int freeLargeHandle = m_firstFreeLargeHandle;
		m_firstFreeLargeHandle = m_pLargeHandles[freeLargeHandle].GetNextFree();
		m_numLargeHandles++;
		if(freeLargeHandle > m_LastLargeHandleIndex)
		{
			m_LastLargeHandleIndex = freeLargeHandle;
		}
		return freeLargeHandle;
	}

	/// Pushes a large proxy back onto the free list and clears its client object.
	/// proxy must point into m_pLargeHandles.
	void freeLargeHandle(btSimpleBroadphaseProxy* proxy)
	{
		int handle = int(proxy - m_pLargeHandles);
		// the index addresses the large pool, so it is bounded by m_maxLargeHandles
		// (the check used to compare against the small-proxy capacity m_maxHandles)
		btAssert((handle >= 0) && (handle < m_maxLargeHandles));
		if(handle == m_LastLargeHandleIndex)
		{
			m_LastLargeHandleIndex--;
		}
		proxy->SetNextFree(m_firstFreeLargeHandle);
		m_firstFreeLargeHandle = handle;
		proxy->m_clientObject = 0;
		m_numLargeHandles--;
	}

	/// True when the bounding-sphere radius of the box exceeds m_maxRadius.
	bool isLargeProxy(const btVector3& aabbMin, const btVector3& aabbMax);
	/// True when the proxy's unique id lies in the large-proxy id range.
	bool isLargeProxy(btBroadphaseProxy* proxy);
	// debug
	unsigned int m_numPairsAdded;
	unsigned int m_numPairsRemoved;
	unsigned int m_numOverflows;
	//
public:
	/// Value of the scan array at index m_numHandles+1.
	virtual int getNumOverlap()
	{
		return m_hPairScanChanged[m_numHandles+1];
	}
	/// Host array that squeezeOverlappingPairBuff() writes into.
	virtual MyUint2* getOverlap()
	{
		return m_hAllOverlappingPairs;
	}

	// NOTE : for better results gridSizeX, gridSizeY and gridSizeZ should be powers of 2
	// (other values are rounded down to a power of two)
	btGpu3DGridBroadphase(const btVector3& cellSize,
		int gridSizeX, int gridSizeY, int gridSizeZ,
		int maxSmallProxies, int maxLargeProxies, int maxPairsPerBody,
		btScalar maxSmallProxySize,
		int maxBodiesPerCell = 8);
	btGpu3DGridBroadphase(	btOverlappingPairCache* overlappingPairCache,
		const btVector3& cellSize,
		int gridSizeX, int gridSizeY, int gridSizeZ,
		int maxSmallProxies, int maxLargeProxies, int maxPairsPerBody,
		btScalar maxSmallProxySize,
		int maxBodiesPerCell = 8);
	virtual ~btGpu3DGridBroadphase();
	virtual void	calculateOverlappingPairs(btDispatcher* dispatcher);
	virtual btBroadphaseProxy*	createProxy(const btVector3& aabbMin, const btVector3& aabbMax,int shapeType,void* userPtr ,short int collisionFilterGroup,short int collisionFilterMask, btDispatcher* dispatcher,void* multiSapProxy);
	virtual void	destroyProxy(btBroadphaseProxy* proxy,btDispatcher* dispatcher);
	virtual void	rayTest(const btVector3& rayFrom,const btVector3& rayTo, btBroadphaseRayCallback& rayCallback);
	virtual void	resetPool(btDispatcher* dispatcher);
	static int		getFloorPowOfTwo(int val); // returns 2^n : 2^(n+1) > val >= 2^n
protected:
	void _initialize(	const btVector3& cellSize,
		int gridSizeX, int gridSizeY, int gridSizeZ,
		int maxSmallProxies, int maxLargeProxies, int maxPairsPerBody,
		btScalar maxSmallProxySize,
		int maxBodiesPerCell);
	void _finalize();
	void addPairsToCache(btDispatcher* dispatcher);
	void addLarge2LargePairsToCache(btDispatcher* dispatcher);
	// overrides for CPU version (pipeline stages, called in this order by calculateOverlappingPairs)
	virtual void setParameters(bt3DGridBroadphaseParams* hostParams);
	virtual void prepareAABB();
	virtual void calcHashAABB();
	virtual void sortHash();
	virtual void findCellStart();
	virtual void findOverlappingPairs();
	virtual void findPairsLarge();
	virtual void computePairCacheChanges();
	virtual void scanOverlappingPairBuff(bool copyToCpu=true);
	virtual void squeezeOverlappingPairBuff();
};
//----------------------------------------------------------------------------------------
#endif //BTGPU3DGRIDBROADPHASE_H
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------

View File

@@ -0,0 +1,428 @@
/*
Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
Copyright (C) 2006, 2009 Sony Computer Entertainment Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
// K E R N E L F U N C T I O N S
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
// Maps a position to integer cell coordinates: each component is scaled by the inverse
// cell size, floored, and masked with (gridSize - 1).
BT_GPU___device__ int3 bt3DGrid_calcGridPos(float4 p)
{
	const int cellX = (int)floor(p.x * BT_GPU_params.m_invCellSizeX);
	const int cellY = (int)floor(p.y * BT_GPU_params.m_invCellSizeY);
	const int cellZ = (int)floor(p.z * BT_GPU_params.m_invCellSizeZ);

	int3 cell;
	cell.x = cellX & (BT_GPU_params.m_gridSizeX - 1);
	cell.y = cellY & (BT_GPU_params.m_gridSizeY - 1);
	cell.z = cellZ & (BT_GPU_params.m_gridSizeZ - 1);
	return cell;
} // bt3DGrid_calcGridPos()
//----------------------------------------------------------------------------------------
// Calculates the linear cell address for grid coordinates. Each coordinate is first
// masked with (gridSize - 1), so out-of-range values wrap around instead of clamping;
// the address is (z * sizeY * sizeX) + (y * sizeX) + x.
BT_GPU___device__ uint bt3DGrid_calcGridHash(int3 gridPos)
{
	gridPos.x &= (BT_GPU_params.m_gridSizeX - 1);
	gridPos.y &= (BT_GPU_params.m_gridSizeY - 1);
	gridPos.z &= (BT_GPU_params.m_gridSizeZ - 1);
	return BT_GPU___mul24(BT_GPU___mul24(gridPos.z, BT_GPU_params.m_gridSizeY), BT_GPU_params.m_gridSizeX)
		+ BT_GPU___mul24(gridPos.y, BT_GPU_params.m_gridSizeX)
		+ gridPos.x;
} // bt3DGrid_calcGridHash()
//----------------------------------------------------------------------------------------
// One thread per body: hashes the centre of the body's AABB into a grid cell and stores
// the couple (cell hash, body index) in pHash. Threads beyond numBodies do nothing.
BT_GPU___global__ void calcHashAABBD(bt3DGrid3F1U* pAABB, uint2* pHash, uint numBodies)
{
	int body = BT_GPU___mul24(BT_GPU_blockIdx.x, BT_GPU_blockDim.x) + BT_GPU_threadIdx.x;
	if(body < (int)numBodies)
	{
		// min corner at 2*body, max corner at 2*body + 1
		bt3DGrid3F1U lo = pAABB[body*2];
		bt3DGrid3F1U hi = pAABB[body*2 + 1];

		// AABB centre
		float4 centre;
		centre.x = (lo.fx + hi.fx) * 0.5f;
		centre.y = (lo.fy + hi.fy) * 0.5f;
		centre.z = (lo.fz + hi.fz) * 0.5f;

		// get address in grid
		int3 cell = bt3DGrid_calcGridPos(centre);
		uint cellHash = bt3DGrid_calcGridHash(cell);

		// store grid hash and body index
		pHash[body] = BT_GPU_make_uint2(cellHash, body);
	}
} // calcHashAABBD()
//----------------------------------------------------------------------------------------
// One thread per entry of the hash array pHash: when an entry's hash (.x) differs from
// the hash of the entry before it (or the entry is the very first one), its position is
// written to cellStart[hash]. cellStart entries of hashes that never occur are not touched here.
BT_GPU___global__ void findCellStartD(uint2* pHash, uint* cellStart, uint numBodies)
{
int index = BT_GPU___mul24(BT_GPU_blockIdx.x, BT_GPU_blockDim.x) + BT_GPU_threadIdx.x;
// NOTE(review): threads leaving here never reach the barrier further down — confirm this
// is acceptable on the target for a partially filled last block.
if(index >= (int)numBodies)
{
return;
}
uint2 sortedData = pHash[index];
// Load hash data into shared memory so that we can look
// at neighboring body's hash value without loading
// two hash values per thread
// Slot t+1 holds the hash of thread t; slot 0 holds the hash preceding the block.
// NOTE(review): the size 257 presumably assumes at most 256 threads per block — confirm against the launch code.
BT_GPU___shared__ uint sharedHash[257];
sharedHash[BT_GPU_threadIdx.x+1] = sortedData.x;
if((index > 0) && (BT_GPU_threadIdx.x == 0))
{
// first thread in block must load neighbor body hash
volatile uint2 prevData = pHash[index-1];
sharedHash[0] = prevData.x;
}
// all shared slots must be written before any thread reads its predecessor's slot
BT_GPU___syncthreads();
// sharedHash[threadIdx.x] is the hash of the previous entry (see slot layout above)
if((index == 0) || (sortedData.x != sharedHash[BT_GPU_threadIdx.x]))
{
cellStart[sortedData.x] = index;
}
} // findCellStartD()
//----------------------------------------------------------------------------------------
// Returns non-zero when the two boxes (min0,max0) and (min1,max1) overlap on all three
// axes; touching boxes count as overlapping. Only the fx/fy/fz fields are compared.
BT_GPU___device__ uint cudaTestAABBOverlap(bt3DGrid3F1U min0, bt3DGrid3F1U max0, bt3DGrid3F1U min1, bt3DGrid3F1U max1)
{
	const bool overlapX = (min0.fx <= max1.fx) && (min1.fx <= max0.fx);
	const bool overlapY = (min0.fy <= max1.fy) && (min1.fy <= max0.fy);
	const bool overlapZ = (min0.fz <= max1.fz) && (min1.fz <= max0.fz);
	return overlapX && overlapY && overlapZ;
} // cudaTestAABBOverlap()
//----------------------------------------------------------------------------------------
// Tests the body at position `index` of the hash array against the bodies hashed into
// the cell gridPos and records overlaps in that body's slice of pPairBuff.
//   gridPos            cell to examine
//   index              position of the body in pHash
//   pHash              (cell hash, body index) couples
//   pCellStart         first pHash position per cell, 0xffffffff for an empty cell
//   pAABB              two entries (min, max) per body
//   pPairBuff          pair buffer, one slice per handle
//   pPairBuffStartCurr (slice start, current pair count) per handle
//   numBodies          number of entries in pHash
BT_GPU___device__ void findPairsInCell( int3 gridPos,
uint index,
uint2* pHash,
uint* pCellStart,
bt3DGrid3F1U* pAABB,
uint* pPairBuff,
uint2* pPairBuffStartCurr,
uint numBodies)
{
uint gridHash = bt3DGrid_calcGridHash(gridPos);
// get start of bucket for this cell
uint bucketStart = pCellStart[gridHash];
if (bucketStart == 0xffffffff)
{
return; // cell empty
}
// iterate over bodies in this cell
uint2 sortedData = pHash[index];
uint unsorted_indx = sortedData.y;
bt3DGrid3F1U min0 = BT_GPU_FETCH(pAABB, unsorted_indx*2);
bt3DGrid3F1U max0 = BT_GPU_FETCH(pAABB, unsorted_indx*2 + 1);
// the min entry carries the handle index, which selects the pair buffer slice
uint handleIndex = min0.uw;
uint2 start_curr = pPairBuffStartCurr[handleIndex];
uint start = start_curr.x;
uint curr = start_curr.y;
// the slice ends where the next handle's slice starts; one slot is kept in reserve
uint2 start_curr_next = pPairBuffStartCurr[handleIndex+1];
uint curr_max = start_curr_next.x - start - 1;
// never scan more than m_maxBodiesPerCell entries, nor past the end of pHash
uint bucketEnd = bucketStart + BT_GPU_params.m_maxBodiesPerCell;
bucketEnd = (bucketEnd > numBodies) ? numBodies : bucketEnd;
for(uint index2 = bucketStart; index2 < bucketEnd; index2++)
{
uint2 cellData = pHash[index2];
if (cellData.x != gridHash)
{
break; // no longer in same bucket
}
uint unsorted_indx2 = cellData.y;
// only bodies with a lower index are tested: this skips the body itself and
// records each pair from the higher-indexed body only
if (unsorted_indx2 < unsorted_indx) // check not colliding with self
{
bt3DGrid3F1U min1 = BT_GPU_FETCH(pAABB, unsorted_indx2*2);
bt3DGrid3F1U max1 = BT_GPU_FETCH(pAABB, unsorted_indx2*2 + 1);
if(cudaTestAABBOverlap(min0, max0, min1, max1))
{
uint handleIndex2 = min1.uw;
uint k;
// look for the partner among the pairs already stored for this body
for(k = 0; k < curr; k++)
{
uint old_pair = pPairBuff[start+k] & (~BT_3DGRID_PAIR_ANY_FLG);
if(old_pair == handleIndex2)
{
// already known: mark it as found again
pPairBuff[start+k] |= BT_3DGRID_PAIR_FOUND_FLG;
break;
}
}
// k == curr means the partner was not stored yet: append it flagged as new
if(k == curr)
{
if(curr >= curr_max)
{ // not a good solution, but let's avoid crash
break;
}
pPairBuff[start+curr] = handleIndex2 | BT_3DGRID_PAIR_NEW_FLG;
curr++;
}
}
}
}
// write back the updated pair count for this handle
pPairBuffStartCurr[handleIndex] = BT_GPU_make_uint2(start, curr);
return;
} // findPairsInCell()
//----------------------------------------------------------------------------------------
BT_GPU___global__ void findOverlappingPairsD(	bt3DGrid3F1U* pAABB, uint2* pHash, uint* pCellStart,
												uint* pPairBuff, uint2* pPairBuffStartCurr, uint numBodies)
{
	// One thread per pHash entry: locate the cell containing the centre of the body's AABB
	// and test the body against that cell and its 26 neighbours.
	int index = BT_GPU___mul24(BT_GPU_blockIdx.x, BT_GPU_blockDim.x) + BT_GPU_threadIdx.x;
	if(index >= (int)numBodies)
	{
		return;
	}
	uint bodyIdx = pHash[index].y;
	bt3DGrid3F1U lo = BT_GPU_FETCH(pAABB, bodyIdx*2);
	bt3DGrid3F1U hi = BT_GPU_FETCH(pAABB, bodyIdx*2 + 1);
	// centre of the AABB (w is not used and is left unset)
	float4 centre;
	centre.x = (lo.fx + hi.fx) * 0.5f;
	centre.y = (lo.fy + hi.fy) * 0.5f;
	centre.z = (lo.fz + hi.fz) * 0.5f;
	int3 centreCell = bt3DGrid_calcGridPos(centre);
	// Visit the 3x3x3 block of cells around centreCell; x varies fastest, then y, then z.
	for(int cell = 0; cell < 27; cell++)
	{
		int dx = (cell % 3) - 1;
		int dy = ((cell / 3) % 3) - 1;
		int dz = (cell / 9) - 1;
		findPairsInCell(centreCell + BT_GPU_make_int3(dx, dy, dz), index, pHash, pCellStart, pAABB, pPairBuff, pPairBuffStartCurr, numBodies);
	}
} // findOverlappingPairsD()
//----------------------------------------------------------------------------------------
// Tests every regular body (one thread per pHash entry) against the "large" bodies, whose
// AABBs are stored after the regular ones in pAABB (body indices numBodies .. numBodies+numLarge-1),
// and records overlaps in the regular body's slice of pPairBuff.
//   pAABB              - two entries per body: min corner at 2*i, max corner at 2*i+1
//   pHash              - (grid hash, body index) entries; only the body index is used here
//   pCellStart         - not used by this kernel
//   pPairBuff          - pair storage; each entry is a handle index plus BT_3DGRID_PAIR_* flag bits
//   pPairBuffStartCurr - per handle: x = start offset into pPairBuff, y = number of entries in use
BT_GPU___global__ void findPairsLargeD(	bt3DGrid3F1U* pAABB, uint2* pHash, uint* pCellStart, uint* pPairBuff,
										uint2* pPairBuffStartCurr, uint numBodies, uint numLarge)
{
	int index = BT_GPU___mul24(BT_GPU_blockIdx.x, BT_GPU_blockDim.x) + BT_GPU_threadIdx.x;
	if(index >= (int)numBodies)
	{
		return;
	}
	uint2 sortedData = pHash[index];
	uint unsorted_indx = sortedData.y;
	bt3DGrid3F1U min0 = BT_GPU_FETCH(pAABB, unsorted_indx*2);
	bt3DGrid3F1U max0 = BT_GPU_FETCH(pAABB, unsorted_indx*2 + 1);
	// the w component of the min corner carries the handle index of the body
	uint handleIndex = min0.uw;
	uint2 start_curr = pPairBuffStartCurr[handleIndex];
	uint start = start_curr.x;
	uint curr = start_curr.y;
	// capacity of this body's slice: distance to the start of the next handle's slice, minus one
	uint2 start_curr_next = pPairBuffStartCurr[handleIndex+1];
	uint curr_max = start_curr_next.x - start - 1;
	for(uint i = 0; i < numLarge; i++)
	{
		uint indx2 = numBodies + i;
		bt3DGrid3F1U min1 = BT_GPU_FETCH(pAABB, indx2*2);
		bt3DGrid3F1U max1 = BT_GPU_FETCH(pAABB, indx2*2 + 1);
		if(cudaTestAABBOverlap(min0, max0, min1, max1))
		{
			uint k;
			uint handleIndex2 = min1.uw;
			// look for the pair among the entries already stored (flag bits masked off)
			for(k = 0; k < curr; k++)
			{
				uint old_pair = pPairBuff[start+k] & (~BT_3DGRID_PAIR_ANY_FLG);
				if(old_pair == handleIndex2)
				{
					pPairBuff[start+k] |= BT_3DGRID_PAIR_FOUND_FLG;
					break;
				}
			}
			// k == curr means the search above found nothing: append as a new pair
			if(k == curr)
			{
				// Check the capacity BEFORE storing (same order as findPairsInCell), so that
				// nothing is written once the slice is full.
				if(curr >= curr_max)
				{ // not a good solution, but let's avoid crash
					break;
				}
				pPairBuff[start+curr] = handleIndex2 | BT_3DGRID_PAIR_NEW_FLG;
				curr++;
			}
		}
	}
	// write back the (possibly grown) entry count
	pPairBuffStartCurr[handleIndex] = BT_GPU_make_uint2(start, curr);
	return;
} // findPairsLargeD()
//----------------------------------------------------------------------------------------
BT_GPU___global__ void computePairCacheChangesD(uint* pPairBuff, uint2* pPairBuffStartCurr,
												uint* pPairScan, bt3DGrid3F1U* pAABB, uint numBodies)
{
	// One thread per body: count the entries of the body's pair slice that carry at least one
	// of the BT_3DGRID_PAIR_* flags and store the count at pPairScan[index+1].
	int index = BT_GPU___mul24(BT_GPU_blockIdx.x, BT_GPU_blockDim.x) + BT_GPU_threadIdx.x;
	if(index >= (int)numBodies)
	{
		return;
	}
	// the w component of the body's min corner is its handle index
	uint handleIndex = pAABB[index * 2].uw;
	uint2 slice = pPairBuffStartCurr[handleIndex];	// x = start offset, y = entries in use
	uint* entries = pPairBuff + slice.x;
	uint flagged = 0;
	for(uint k = 0; k < slice.y; k++)
	{
		// (an earlier variant counted entries WITHOUT the FOUND flag instead)
		if(entries[k] & BT_3DGRID_PAIR_ANY_FLG)
		{
			flagged++;
		}
	}
	pPairScan[index+1] = flagged;
} // computePairCacheChangesD()
//----------------------------------------------------------------------------------------
// One thread per body: walks the body's slice of pPairBuff and
//  - writes every flagged entry to pPairOut as (handle of this body, handle of the other body),
//  - compacts the slice in place so that only flagged entries remain, with their flags cleared,
//  - stores the new entry count back into pPairBuffStartCurr.
// Entries carrying neither flag are dropped from the slice.
// NOTE(review): pPairScan[index+1] is used as this body's output offset into pPairOut, so it
// presumably has been turned into a running sum after computePairCacheChangesD — confirm with the caller.
BT_GPU___global__ void squeezeOverlappingPairBuffD(uint* pPairBuff, uint2* pPairBuffStartCurr, uint* pPairScan,
uint2* pPairOut, bt3DGrid3F1U* pAABB, uint numBodies)
{
int index = BT_GPU___mul24(BT_GPU_blockIdx.x, BT_GPU_blockDim.x) + BT_GPU_threadIdx.x;
if(index >= (int)numBodies)
{
return;
}
bt3DGrid3F1U bbMin = pAABB[index * 2];
// the w component of the min corner carries the handle index of the body
uint handleIndex = bbMin.uw;
uint2 start_curr = pPairBuffStartCurr[handleIndex];
uint start = start_curr.x;
uint curr = start_curr.y;
// pInp reads the slice, pOut2 writes the compacted slice (it never runs ahead of pInp)
uint* pInp = pPairBuff + start;
uint2* pOut = pPairOut + pPairScan[index+1];
uint* pOut2 = pInp;
uint num = 0;
for(uint k = 0; k < curr; k++, pInp++)
{
// emit the flagged entry to the output pair list
if((*pInp) & BT_3DGRID_PAIR_ANY_FLG)
//if(!((*pInp) & BT_3DGRID_PAIR_FOUND_FLG))
{
pOut->x = handleIndex;
pOut->y = (*pInp) & (~BT_3DGRID_PAIR_ANY_FLG);
pOut++;
}
// keep the flagged entry in the slice, stripped of its flags
if((*pInp) & BT_3DGRID_PAIR_ANY_FLG)
{
*pOut2 = (*pInp) & (~BT_3DGRID_PAIR_ANY_FLG);
pOut2++;
num++;
}
}
pPairBuffStartCurr[handleIndex] = BT_GPU_make_uint2(start, num);
} // squeezeOverlappingPairBuffD()
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
// E N D O F K E R N E L F U N C T I O N S
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
extern "C"
{
//----------------------------------------------------------------------------------------
void BT_GPU_PREF(calcHashAABB)(bt3DGrid3F1U* pAABB, unsigned int* hash, unsigned int numBodies)
{
	// one thread per body, at most 256 threads per block
	int blockCount = 0;
	int threadsPerBlock = 0;
	BT_GPU_PREF(computeGridSize)(numBodies, 256, blockCount, threadsPerBlock);
	// the kernel treats the hash buffer as (grid hash, body index) pairs
	uint2* hashPairs = (uint2*)hash;
	BT_GPU_EXECKERNEL(blockCount, threadsPerBlock, calcHashAABBD, (pAABB, hashPairs, numBodies));
	// report a failed kernel invocation
	BT_GPU_CHECK_ERROR("calcHashAABBD kernel execution failed");
} // calcHashAABB()
//----------------------------------------------------------------------------------------
void BT_GPU_PREF(findCellStart)(unsigned int* hash, unsigned int* cellStart, unsigned int numBodies, unsigned int numCells)
{
	// one thread per body, at most 256 threads per block
	int blockCount = 0;
	int threadsPerBlock = 0;
	BT_GPU_PREF(computeGridSize)(numBodies, 256, blockCount, threadsPerBlock);
	// mark every cell as empty (0xffffffff) before the kernel fills in the occupied ones
	BT_GPU_SAFE_CALL(BT_GPU_Memset(cellStart, 0xffffffff, numCells*sizeof(uint)));
	uint2* hashPairs = (uint2*)hash;
	BT_GPU_EXECKERNEL(blockCount, threadsPerBlock, findCellStartD, (hashPairs, (uint*)cellStart, numBodies));
	BT_GPU_CHECK_ERROR("Kernel execution failed: findCellStartD");
} // findCellStart()
//----------------------------------------------------------------------------------------
void BT_GPU_PREF(findOverlappingPairs)(bt3DGrid3F1U* pAABB, unsigned int* pHash, unsigned int* pCellStart, unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int numBodies)
{
	// one thread per body, at most 64 threads per block
	int blockCount = 0;
	int threadsPerBlock = 0;
#if B_CUDA_USE_TEX
	// two AABB entries (min, max) per body
	BT_GPU_SAFE_CALL(cudaBindTexture(0, pAABBTex, pAABB, numBodies * 2 * sizeof(bt3DGrid3F1U)));
#endif
	BT_GPU_PREF(computeGridSize)(numBodies, 64, blockCount, threadsPerBlock);
	uint2* hashPairs = (uint2*)pHash;
	uint2* pairRanges = (uint2*)pPairBuffStartCurr;
	BT_GPU_EXECKERNEL(blockCount, threadsPerBlock, findOverlappingPairsD, (pAABB,hashPairs,(uint*)pCellStart,(uint*)pPairBuff,pairRanges,numBodies));
	BT_GPU_CHECK_ERROR("Kernel execution failed: bt_CudaFindOverlappingPairsD");
#if B_CUDA_USE_TEX
	BT_GPU_SAFE_CALL(cudaUnbindTexture(pAABBTex));
#endif
} // findOverlappingPairs()
//----------------------------------------------------------------------------------------
void BT_GPU_PREF(findPairsLarge)(bt3DGrid3F1U* pAABB, unsigned int* pHash, unsigned int* pCellStart, unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int numBodies, unsigned int numLarge)
{
	// one thread per regular body, at most 64 threads per block
	int blockCount = 0;
	int threadsPerBlock = 0;
#if B_CUDA_USE_TEX
	// two AABB entries (min, max) per body, regular and large bodies together
	BT_GPU_SAFE_CALL(cudaBindTexture(0, pAABBTex, pAABB, (numBodies+numLarge) * 2 * sizeof(bt3DGrid3F1U)));
#endif
	BT_GPU_PREF(computeGridSize)(numBodies, 64, blockCount, threadsPerBlock);
	uint2* hashPairs = (uint2*)pHash;
	uint2* pairRanges = (uint2*)pPairBuffStartCurr;
	BT_GPU_EXECKERNEL(blockCount, threadsPerBlock, findPairsLargeD, (pAABB,hashPairs,(uint*)pCellStart,(uint*)pPairBuff,pairRanges,numBodies,numLarge));
	BT_GPU_CHECK_ERROR("Kernel execution failed: btCuda_findPairsLargeD");
#if B_CUDA_USE_TEX
	BT_GPU_SAFE_CALL(cudaUnbindTexture(pAABBTex));
#endif
} // findPairsLarge()
//----------------------------------------------------------------------------------------
void BT_GPU_PREF(computePairCacheChanges)(unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int* pPairScan, bt3DGrid3F1U* pAABB, unsigned int numBodies)
{
	// one thread per body, at most 256 threads per block
	int blockCount = 0;
	int threadsPerBlock = 0;
	BT_GPU_PREF(computeGridSize)(numBodies, 256, blockCount, threadsPerBlock);
	uint2* pairRanges = (uint2*)pPairBuffStartCurr;
	BT_GPU_EXECKERNEL(blockCount, threadsPerBlock, computePairCacheChangesD, ((uint*)pPairBuff,pairRanges,(uint*)pPairScan,pAABB,numBodies));
	BT_GPU_CHECK_ERROR("Kernel execution failed: btCudaComputePairCacheChangesD");
} // computePairCacheChanges()
//----------------------------------------------------------------------------------------
void BT_GPU_PREF(squeezeOverlappingPairBuff)(unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int* pPairScan, unsigned int* pPairOut, bt3DGrid3F1U* pAABB, unsigned int numBodies)
{
	// one thread per body, at most 256 threads per block
	int blockCount = 0;
	int threadsPerBlock = 0;
	BT_GPU_PREF(computeGridSize)(numBodies, 256, blockCount, threadsPerBlock);
	uint2* pairRanges = (uint2*)pPairBuffStartCurr;
	uint2* pairsOut = (uint2*)pPairOut;
	BT_GPU_EXECKERNEL(blockCount, threadsPerBlock, squeezeOverlappingPairBuffD, ((uint*)pPairBuff,pairRanges,(uint*)pPairScan,pairsOut,pAABB,numBodies));
	BT_GPU_CHECK_ERROR("Kernel execution failed: btCudaSqueezeOverlappingPairBuffD");
} // squeezeOverlappingPairBuff()
//------------------------------------------------------------------------------------------------
} // extern "C"
//------------------------------------------------------------------------------------------------
//------------------------------------------------------------------------------------------------
//------------------------------------------------------------------------------------------------

View File

@@ -0,0 +1,61 @@
/*
Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
Copyright (C) 2006, 2009 Sony Computer Entertainment Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//----------------------------------------------------------------------------------------
// Shared definitions for GPU-based 3D Grid collision detection broadphase
//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
// Keep this file free from Bullet headers
// it is included into both CUDA and CPU code
//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
//----------------------------------------------------------------------------------------
#ifndef BTGPU3DGRIDBROADPHASESHAREDDEFS_H
#define BTGPU3DGRIDBROADPHASESHAREDDEFS_H
//----------------------------------------------------------------------------------------
#include "btGpu3DGridBroadphaseSharedTypes.h"
//----------------------------------------------------------------------------------------
// Host-side entry points of the 3D grid broadphase. BT_GPU_PREF() prepends the back-end
// specific prefix to each name. Every function launches the kernel of the same name
// (with a trailing D) with one thread per body.
extern "C"
{
//----------------------------------------------------------------------------------------
// hash is handed to the kernel as an array of uint2 (grid hash, body index)
void BT_GPU_PREF(calcHashAABB)(bt3DGrid3F1U* pAABB, unsigned int* hash, unsigned int numBodies);
// fills numCells entries of cellStart with 0xffffffff, then records the first hash entry of each occupied cell
void BT_GPU_PREF(findCellStart)(unsigned int* hash, unsigned int* cellStart, unsigned int numBodies, unsigned int numCells);
// pHash and pPairBuffStartCurr are handed to the kernel as arrays of uint2
void BT_GPU_PREF(findOverlappingPairs)(bt3DGrid3F1U* pAABB, unsigned int* pHash, unsigned int* pCellStart, unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int numBodies);
// tests the numBodies regular bodies against the numLarge bodies whose AABBs follow them in pAABB
void BT_GPU_PREF(findPairsLarge)(bt3DGrid3F1U* pAABB, unsigned int* pHash, unsigned int* pCellStart, unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int numBodies, unsigned int numLarge);
// stores, per body, the number of flagged pair entries at pPairScan[body+1]
void BT_GPU_PREF(computePairCacheChanges)(unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int* pPairScan, bt3DGrid3F1U* pAABB, unsigned int numBodies);
// pPairOut is handed to the kernel as an array of uint2 (one handle pair per entry)
void BT_GPU_PREF(squeezeOverlappingPairBuff)(unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int* pPairScan, unsigned int* pPairOut, bt3DGrid3F1U* pAABB, unsigned int numBodies);
//----------------------------------------------------------------------------------------
} // extern "C"
//----------------------------------------------------------------------------------------
#endif // BTGPU3DGRIDBROADPHASESHAREDDEFS_H

View File

@@ -0,0 +1,64 @@
/*
Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
Copyright (C) 2006, 2009 Sony Computer Entertainment Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//----------------------------------------------------------------------------------------
// Shared definitions for GPU-based 3D Grid collision detection broadphase
//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
// Keep this file free from Bullet headers
// it is included into both CUDA and CPU code
//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
//----------------------------------------------------------------------------------------
#ifndef BTGPU3DGRIDBROADPHASESHAREDTYPES_H
#define BTGPU3DGRIDBROADPHASESHAREDTYPES_H
//----------------------------------------------------------------------------------------
// Flag bits kept in each pair-buffer entry next to the handle index of the other body;
// the kernels recover the handle index with  entry & ~BT_3DGRID_PAIR_ANY_FLG.
// FOUND: an entry that was already in the buffer overlaps again
#define BT_3DGRID_PAIR_FOUND_FLG (0x40000000)
// NEW: the entry has just been appended to the buffer
#define BT_3DGRID_PAIR_NEW_FLG (0x20000000)
// mask covering both flags
#define BT_3DGRID_PAIR_ANY_FLG (BT_3DGRID_PAIR_FOUND_FLG | BT_3DGRID_PAIR_NEW_FLG)
//----------------------------------------------------------------------------------------
// Parameter block of the 3D grid broadphase (the kernels read it through BT_GPU_params).
// NOTE(review): except for m_maxBodiesPerCell the field meanings below are inferred from the
// names only — confirm against the code that fills the struct.
struct bt3DGridBroadphaseParams
{
// presumably the number of cells along each axis
unsigned int m_gridSizeX;
unsigned int m_gridSizeY;
unsigned int m_gridSizeZ;
// presumably the total number of cells
unsigned int m_numCells;
// presumably 1 / cell size along each axis
float m_invCellSizeX;
float m_invCellSizeY;
float m_invCellSizeZ;
unsigned int m_numBodies;
// upper bound on how many hash entries findPairsInCell scans for one cell
unsigned int m_maxBodiesPerCell;
};
//----------------------------------------------------------------------------------------
// Three floats plus one unsigned int. The broadphase kernels store each AABB as two
// consecutive entries of this type: the min corner at index 2*i and the max corner at 2*i+1.
struct bt3DGrid3F1U
{
// corner coordinates
float fx;
float fy;
float fz;
// in a min-corner entry: handle index of the body (the kernels do not read it from max-corner entries)
unsigned int uw;
};
//----------------------------------------------------------------------------------------
#endif // BTGPU3DGRIDBROADPHASESHAREDTYPES_H

View File

@@ -0,0 +1,211 @@
/*
Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
Copyright (C) 2006, 2009 Sony Computer Entertainment Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
// definitions for "GPU on CPU" code
#ifndef BT_GPU_DEFINES_H
#define BT_GPU_DEFINES_H
// Plain C++ stand-ins for the GPU built-in scalar/vector types used by the shared kernel code.
typedef unsigned int uint;
// two signed integers
struct int2
{
int x, y;
};
// two unsigned integers
struct uint2
{
unsigned int x, y;
};
// three signed integers
struct int3
{
int x, y, z;
};
// three unsigned integers
struct uint3
{
unsigned int x, y, z;
};
// four floats
struct float4
{
float x, y, z, w;
};
// three floats
struct float3
{
float x, y, z;
};
// GPU qualifiers and intrinsics mapped to plain C++ for the "GPU on CPU" build.
// device and kernel functions become ordinary inline functions
#define BT_GPU___device__ inline
// data qualifiers expand to nothing
#define BT_GPU___devdata__
#define BT_GPU___constant__
// NOTE: function-like macros, each argument may be evaluated twice
#define BT_GPU_max(a, b) ((a) > (b) ? (a) : (b))
#define BT_GPU_min(a, b) ((a) < (b) ? (a) : (b))
// name of the bt3DGridBroadphaseParams instance read by the kernels
#define BT_GPU_params s3DGridBroadphaseParams
// plain multiplication replaces the 24-bit multiply intrinsic
#define BT_GPU___mul24(a, b) ((a)*(b))
#define BT_GPU___global__ inline
// "shared" arrays become function-local statics
#define BT_GPU___shared__ static
// expands to nothing: BT_GPU_EXECKERNEL runs the emulated threads one after another
#define BT_GPU___syncthreads()
#define CUDART_PI_F SIMD_PI
// Constructors for the vector stand-in types; each BT_GPU_make_* macro forwards to the
// helper defined right above it.

// (x, y) -> uint2
static inline uint2 bt3dGrid_make_uint2(unsigned int x, unsigned int y)
{
	uint2 v = { x, y };
	return v;
}
#define BT_GPU_make_uint2(x, y) bt3dGrid_make_uint2(x, y)
// (x, y, z) -> int3
static inline int3 bt3dGrid_make_int3(int x, int y, int z)
{
	int3 v = { x, y, z };
	return v;
}
#define BT_GPU_make_int3(x, y, z) bt3dGrid_make_int3(x, y, z)
// (x, y, z) -> float3
static inline float3 bt3dGrid_make_float3(float x, float y, float z)
{
	float3 v = { x, y, z };
	return v;
}
#define BT_GPU_make_float3(x, y, z) bt3dGrid_make_float3(x, y, z)
// float4 -> float3, dropping w
static inline float3 bt3dGrid_make_float34(float4 f)
{
	float3 v = { f.x, f.y, f.z };
	return v;
}
#define BT_GPU_make_float34(f) bt3dGrid_make_float34(f)
// scalar -> float3 with all components equal
static inline float3 bt3dGrid_make_float31(float f)
{
	float3 v = { f, f, f };
	return v;
}
#define BT_GPU_make_float31(x) bt3dGrid_make_float31(x)
// (float3, w) -> float4
static inline float4 bt3dGrid_make_float42(float3 v, float f)
{
	float4 out = { v.x, v.y, v.z, f };
	return out;
}
#define BT_GPU_make_float42(a, b) bt3dGrid_make_float42(a, b)
// (a, b, c, d) -> float4
static inline float4 bt3dGrid_make_float44(float a, float b, float c, float d)
{
	float4 out = { a, b, c, d };
	return out;
}
#define BT_GPU_make_float44(a, b, c, d) bt3dGrid_make_float44(a, b, c, d)
// Component-wise sum of two int3 values.
inline int3 operator+(int3 a, int3 b)
{
	return bt3dGrid_make_int3(a.x + b.x, a.y + b.y, a.z + b.z);
}
// Component-wise sum of two float4 values.
inline float4 operator+(const float4& a, const float4& b)
{
	float4 r; r.x = a.x+b.x; r.y = a.y+b.y; r.z = a.z+b.z; r.w = a.w+b.w; return r;
}
// Scales all four components by fact.
inline float4 operator*(const float4& a, float fact)
{
	float4 r; r.x = a.x*fact; r.y = a.y*fact; r.z = a.z*fact; r.w = a.w*fact; return r;
}
// Scalar on the left-hand side. The vector is taken by const reference (it is only read),
// so const objects and temporaries are accepted just like in operator*(const float4&, float).
inline float4 operator*(float fact, const float4& a)
{
	return (a * fact);
}
// In-place scaling; returns the modified left operand.
inline float4& operator*=(float4& a, float fact)
{
	a = fact * a;
	return a;
}
// In-place component-wise addition; returns the modified left operand.
inline float4& operator+=(float4& a, const float4& b)
{
	a = a + b;
	return a;
}
// Component-wise sum of two float3 values.
inline float3 operator+(const float3& a, const float3& b)
{
	float3 sum = { a.x + b.x, a.y + b.y, a.z + b.z };
	return sum;
}
// Component-wise difference of two float3 values.
inline float3 operator-(const float3& a, const float3& b)
{
	float3 diff = { a.x - b.x, a.y - b.y, a.z - b.z };
	return diff;
}
static inline float bt3dGrid_dot(float3& a, float3& b)
{
return a.x*b.x+a.y*b.y+a.z*b.z;
}
#define BT_GPU_dot(a,b) bt3dGrid_dot(a,b)
static inline float bt3dGrid_dot4(float4& a, float4& b)
{
return a.x*b.x+a.y*b.y+a.z*b.z+a.w*b.w;
}
#define BT_GPU_dot4(a,b) bt3dGrid_dot4(a,b)
static inline float3 bt3dGrid_cross(const float3& a, const float3& b)
{
float3 r; r.x = a.y*b.z-a.z*b.y; r.y = -a.x*b.z+a.z*b.x; r.z = a.x*b.y-a.y*b.x; return r;
}
#define BT_GPU_cross(a,b) bt3dGrid_cross(a,b)
// Scales all three components by fact.
inline float3 operator*(const float3& a, float fact)
{
	float3 scaled = { a.x * fact, a.y * fact, a.z * fact };
	return scaled;
}
// In-place component-wise addition; returns the modified left operand.
inline float3& operator+=(float3& a, const float3& b)
{
	a.x += b.x;
	a.y += b.y;
	a.z += b.z;
	return a;
}
// In-place component-wise subtraction; returns the modified left operand.
inline float3& operator-=(float3& a, const float3& b)
{
	a.x -= b.x;
	a.y -= b.y;
	a.z -= b.z;
	return a;
}
// In-place scaling; returns the modified left operand.
inline float3& operator*=(float3& a, float fact)
{
	a.x *= fact;
	a.y *= fact;
	a.z *= fact;
	return a;
}
// Unary minus: negates every component.
inline float3 operator-(const float3& v)
{
	float3 negated = { -v.x, -v.y, -v.z };
	return negated;
}
// Memory fetches are plain array accesses in the CPU build.
#define BT_GPU_FETCH(a, b) a[b]
#define BT_GPU_FETCH4(a, b) a[b]
// Prefix added to the names of the host-side entry points.
#define BT_GPU_PREF(func) btGpu_##func
// No error wrapping: the call is evaluated as is.
#define BT_GPU_SAFE_CALL(func) func
// Memory helpers map to the C library.
#define BT_GPU_Memset memset
#define BT_GPU_MemcpyToSymbol(a, b, c) memcpy(&a, b, c)
// Texture binding expands to nothing.
#define BT_GPU_BindTexture(a, b, c, d)
#define BT_GPU_UnbindTexture(a)
// Emulated block/thread indices; only the x components are written by BT_GPU_EXECKERNEL.
// NOTE: being static in a header, every translation unit including it gets its own copy.
static uint2 s_blockIdx, s_blockDim, s_threadIdx;
#define BT_GPU_blockIdx s_blockIdx
#define BT_GPU_blockDim s_blockDim
#define BT_GPU_threadIdx s_threadIdx
// Runs kfunc once per (block, thread) combination, sequentially: blocks in the outer loop,
// threads in the inner loop. args is the parenthesised argument list of the kernel.
// NOTE: expands to a bare { } block and declares the loop variables nb and nt.
#define BT_GPU_EXECKERNEL(numb, numt, kfunc, args) {s_blockDim.x=numt;for(int nb=0;nb<numb;nb++){s_blockIdx.x=nb;for(int nt=0;nt<numt;nt++){s_threadIdx.x=nt;kfunc args;}}}
// Error checking expands to nothing.
#define BT_GPU_CHECK_ERROR(s)
#endif //BT_GPU_DEFINES_H

View File

@@ -0,0 +1,55 @@
/*
Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
Copyright (C) 2006, 2009 Sony Computer Entertainment Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//----------------------------------------------------------------------------------------
// Shared code for GPU-based utilities
//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
// Keep this file free from Bullet headers
// will be compiled by both CPU and CUDA compilers
// file with definitions of BT_GPU_xxx should be included first
//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
//----------------------------------------------------------------------------------------
#include "btGpuUtilsSharedDefs.h"
//----------------------------------------------------------------------------------------
extern "C"
{
//----------------------------------------------------------------------------------------
// Integer division a / b; when the division leaves a remainder the quotient is increased by one.
// b must not be zero.
int BT_GPU_PREF(iDivUp)(int a, int b)
{
	int quotient = a / b;
	if(a % b != 0)
	{
		quotient++;
	}
	return quotient;
} // iDivUp()
//----------------------------------------------------------------------------------------
// Compute grid and thread block size for a given number of elements.
//   n          - number of elements (one thread per element)
//   blockSize  - maximum number of threads per block
//   numBlocks  - out: number of blocks needed to cover n elements
//   numThreads - out: threads per block, min(blockSize, n)
// When there is nothing to launch (n <= 0 or blockSize <= 0) both outputs are set to 0
// instead of dividing by zero.
// NOTE(review): a real GPU back end may reject a launch with zero blocks — callers may
// prefer to skip the launch entirely in that case.
void BT_GPU_PREF(computeGridSize)(int n, int blockSize, int &numBlocks, int &numThreads)
{
	numThreads = BT_GPU_min(blockSize, n);
	if(numThreads <= 0)
	{
		numThreads = 0;
		numBlocks = 0;
		return;
	}
	numBlocks = BT_GPU_PREF(iDivUp)(n, numThreads);
} // computeGridSize()
//----------------------------------------------------------------------------------------
} // extern "C"

View File

@@ -0,0 +1,52 @@
/*
Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
Copyright (C) 2006, 2007 Sony Computer Entertainment Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
// Shared definitions for GPU-based utilities
//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
// Keep this file free from Bullet headers
// it is included into both CUDA and CPU code
// file with definitions of BT_GPU_xxx should be included first
//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
#ifndef BTGPUUTILSDHAREDDEFS_H
#define BTGPUUTILSDHAREDDEFS_H
// Host-side utility entry points. BT_GPU_PREF() prepends the back-end specific prefix.
// NOTE(review): only iDivUp and computeGridSize have an implementation in the shared code
// header; the remaining functions are merely declared here and the comments on them are
// inferred from their names — confirm against the back end that implements them.
extern "C"
{
//Round a / b to nearest higher integer value
int BT_GPU_PREF(iDivUp)(int a, int b);
// compute grid and thread block size for a given number of elements
void BT_GPU_PREF(computeGridSize)(int n, int blockSize, int &numBlocks, int &numThreads);
// presumably device memory allocation / release
void BT_GPU_PREF(allocateArray)(void** devPtr, unsigned int size);
void BT_GPU_PREF(freeArray)(void* devPtr);
// presumably transfers of size bytes between host and device memory
void BT_GPU_PREF(copyArrayFromDevice)(void* host, const void* device, unsigned int size);
void BT_GPU_PREF(copyArrayToDevice)(void* device, const void* host, unsigned int size);
// presumably OpenGL buffer object interop; vbo is the buffer object id
// (here the whole signature is passed to BT_GPU_PREF, which still yields the prefixed name)
void BT_GPU_PREF(registerGLBufferObject(unsigned int vbo));
void* BT_GPU_PREF(mapGLBufferObject(unsigned int vbo));
void BT_GPU_PREF(unmapGLBufferObject(unsigned int vbo));
} // extern "C"
#endif // BTGPUUTILSDHAREDDEFS_H

View File

@@ -0,0 +1,5 @@
-- Pull in the per-vendor premake scripts; only the AMD one is enabled at the moment.
include "AMD"
-- include "Intel"
-- include "NVIDIA"

View File

@@ -0,0 +1,23 @@
-- Premake script for a console application built from main.cpp and btOpenCLUtils,
-- set up for the AMD OpenCL SDK. findOpenCL_AMD() and initOpenCL_AMD() are defined elsewhere;
-- the project is only declared when findOpenCL_AMD() returns a true value.
hasCL = findOpenCL_AMD()
if (hasCL) then
-- NOTE(review): "intialize" looks like a typo for "initialize"; the project name is kept as is
project "OpenCL_intialize_AMD"
initOpenCL_AMD()
language "C++"
kind "ConsoleApp"
-- output directory for the built binary
targetdir "../../../bin"
-- includedirs {"..","../../../../include/gpu_research"}
files {
"../main.cpp",
"../btOpenCLUtils.cpp",
"../btOpenCLUtils.h"
}
end

View File

@@ -0,0 +1,23 @@
-- Premake script for a console application built from main.cpp and btOpenCLUtils,
-- set up for the Intel OpenCL SDK. findOpenCL_Intel() and initOpenCL_Intel() are defined elsewhere;
-- the project is only declared when findOpenCL_Intel() returns a true value.
hasCL = findOpenCL_Intel()
if (hasCL) then
-- NOTE(review): "intialize" looks like a typo for "initialize"; the project name is kept as is
project "OpenCL_intialize_Intel"
initOpenCL_Intel()
language "C++"
kind "ConsoleApp"
-- output directory for the built binary
targetdir "../../../bin"
-- includedirs {"..","../../../../include/gpu_research"}
files {
"../main.cpp",
"../btOpenCLUtils.cpp",
"../btOpenCLUtils.h"
}
end

View File

@@ -0,0 +1,23 @@
-- Premake script for a console application built from main.cpp and btOpenCLUtils,
-- set up for the NVIDIA OpenCL SDK. findOpenCL_NVIDIA() and initOpenCL_NVIDIA() are defined elsewhere;
-- the project is only declared when findOpenCL_NVIDIA() returns a true value.
hasCL = findOpenCL_NVIDIA()
if (hasCL) then
-- NOTE(review): "intialize" looks like a typo for "initialize"; the project name is kept as is
project "OpenCL_intialize_NVIDIA"
initOpenCL_NVIDIA()
language "C++"
kind "ConsoleApp"
-- output directory for the built binary
targetdir "../../../bin"
-- includedirs {"..","../../../../include/gpu_research"}
files {
"../main.cpp",
"../btOpenCLUtils.cpp",
"../btOpenCLUtils.h"
}
end

View File

@@ -0,0 +1,43 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2011 Advanced Micro Devices, Inc. http://bulletphysics.org
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#ifndef BT_OPENCL_INCLUDE_H
#define BT_OPENCL_INCLUDE_H
#ifdef __APPLE__
#ifdef USE_MINICL
#include <MiniCL/cl.h>
#else
#include <OpenCL/cl.h>
#endif
#else
#ifdef USE_MINICL
#include <MiniCL/cl.h>
#else
#include <CL/cl.h>
#ifdef _WIN32
#include "CL/cl_gl.h"
#endif //_WIN32
#endif
#endif //__APPLE__
#include <assert.h>
#include <stdio.h>
// Prints the value of a and asserts when a differs from b (used as oclCHECKERROR(err, CL_SUCCESS)).
// NOTE: a is evaluated more than once, and the macro expands to a bare if statement, so do not
// pass expressions with side effects and do not use it directly in front of an else.
#define oclCHECKERROR(a, b) if((a)!=(b)) { printf("OCL Error : %d\n", (a)); assert((a) == (b)); }
#endif //BT_OPENCL_INCLUDE_H

View File

@@ -0,0 +1,731 @@
/*
Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
Copyright (C) 2006 - 2011 Sony Computer Entertainment Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//original author: Roman Ponomarev
//cleanup by Erwin Coumans
#include <string.h>
#include "btOpenCLUtils.h"
#include <stdio.h>
#include <stdlib.h>
#define BT_MAX_CL_DEVICES 16 //who needs 16 devices?
#ifdef _WIN32
#include <Windows.h>
#include <assert.h>
#define btAssert assert
#endif
// Preferred platform vendor, selected at compile time by the CL_PLATFORM_* define of the
// OpenCL SDK in use. The pointer refers to a string literal, hence const char*.
static const char* spPlatformVendor =
#if defined(CL_PLATFORM_MINI_CL)
"MiniCL, SCEA";
#elif defined(CL_PLATFORM_AMD)
"Advanced Micro Devices, Inc.";
#elif defined(CL_PLATFORM_NVIDIA)
"NVIDIA Corporation";
#elif defined(CL_PLATFORM_INTEL)
"Intel(R) Corporation";
#else
"Unknown Vendor";
#endif
#ifndef CL_PLATFORM_MINI_CL
#ifdef _WIN32
#include "CL/cl_gl.h"
#endif //_WIN32
#endif
// Returns the number of OpenCL platforms reported by clGetPlatformIDs.
// When the query fails and pErrNum is not NULL, the error code is stored in *pErrNum;
// *pErrNum is left untouched on success.
int btOpenCLUtils::getNumPlatforms(cl_int* pErrNum)
{
	cl_uint platformCount = 0;
	const cl_int status = clGetPlatformIDs(0, NULL, &platformCount);
	const bool failed = (status != CL_SUCCESS);
	if(failed && (pErrNum != NULL))
	{
		*pErrNum = status;
	}
	return platformCount;
}
// Returns the vendor string chosen at compile time (spPlatformVendor).
const char* btOpenCLUtils::getSdkVendorName()
{
return spPlatformVendor;
}
// Returns the OpenCL platform with the given index, or 0 when the index is out of range
// or a query fails.
//   platformIndex - zero-based index into the list returned by clGetPlatformIDs
//   pErrNum       - optional; receives the OpenCL error code when a query fails
//                   (left untouched otherwise)
cl_platform_id btOpenCLUtils::getPlatform(int platformIndex, cl_int* pErrNum)
{
	cl_platform_id platform = 0;
	cl_uint numPlatforms = 0;
	// first query: number of available platforms
	cl_int ciErrNum = clGetPlatformIDs(0, NULL, &numPlatforms);
	if(ciErrNum != CL_SUCCESS)
	{
		if(pErrNum != NULL)
			*pErrNum = ciErrNum;
		return platform;
	}
	if (platformIndex>=0 && (cl_uint)platformIndex<numPlatforms)
	{
		// second query: the platform ids themselves
		cl_platform_id* platforms = new cl_platform_id[numPlatforms];
		ciErrNum = clGetPlatformIDs(numPlatforms, platforms, NULL);
		if(ciErrNum == CL_SUCCESS)
		{
			platform = platforms[platformIndex];
		}
		else if(pErrNum != NULL)
		{
			*pErrNum = ciErrNum;
		}
		// release the temporary list on every path
		delete[] platforms;
	}
	return platform;
}
///Fills platformInfo with the vendor, name and version strings of the given platform.
///Each query result is passed to oclCHECKERROR.
void btOpenCLUtils::getPlatformInfo(cl_platform_id platform, btOpenCLPlatformInfo& platformInfo)
{
	cl_int status = clGetPlatformInfo(platform, CL_PLATFORM_VENDOR, BT_MAX_STRING_LENGTH, platformInfo.m_platformVendor, NULL);
	oclCHECKERROR(status,CL_SUCCESS);

	status = clGetPlatformInfo(platform, CL_PLATFORM_NAME, BT_MAX_STRING_LENGTH, platformInfo.m_platformName, NULL);
	oclCHECKERROR(status,CL_SUCCESS);

	status = clGetPlatformInfo(platform, CL_PLATFORM_VERSION, BT_MAX_STRING_LENGTH, platformInfo.m_platformVersion, NULL);
	oclCHECKERROR(status,CL_SUCCESS);
}
///Creates an OpenCL context on the given platform.
///@param platform platform to use; when NULL no context properties are passed to clCreateContext
///@param deviceType device type filter passed to clGetDeviceIDs
///@param pErrNum optional; receives the last OpenCL status code
///@param pGLContext optional GL context; when set, devices are tried one by one until a context can be created
///@param pGLDC optional GL device context; GL sharing properties are only set when both pGLContext and pGLDC are given
///@param preferredDeviceIndex when in range (and no GL context is given) the context is created for that single device,
///       otherwise for all devices found
///@param preferredPlatformIndex not used by this function
///@return the context, or 0 on failure
cl_context btOpenCLUtils::createContextFromPlatform(cl_platform_id platform, cl_device_type deviceType, cl_int* pErrNum, void* pGLContext, void* pGLDC, int preferredDeviceIndex, int preferredPlatformIndex)
{
	(void)preferredPlatformIndex;

	cl_context retContext = 0;
	cl_int ciErrNum = CL_SUCCESS;

	/*
	* If we could find our platform, use it. Otherwise pass a NULL and get whatever the
	* implementation thinks we should be using.
	*/
	cl_context_properties cps[7] = {0,0,0,0,0,0,0};
	cps[0] = CL_CONTEXT_PLATFORM;
	cps[1] = (cl_context_properties)platform;
	if (pGLContext && pGLDC)
	{
		cps[2] = CL_GL_CONTEXT_KHR;
		cps[3] = (cl_context_properties)pGLContext;
		cps[4] = CL_WGL_HDC_KHR;
		cps[5] = (cl_context_properties)pGLDC;
	}

	cl_device_id devices[BT_MAX_CL_DEVICES];
	cl_uint num_devices = 0;
	ciErrNum = clGetDeviceIDs(platform, deviceType, BT_MAX_CL_DEVICES, devices, &num_devices);
	if (ciErrNum != CL_SUCCESS || num_devices == 0)
	{
		//without this check a failed query would leave the device count/list unusable
		//and the code below would read past the devices array
		if (pErrNum != NULL)
			*pErrNum = (ciErrNum != CL_SUCCESS) ? ciErrNum : CL_DEVICE_NOT_FOUND;
		return 0;
	}
	//clGetDeviceIDs reports the number of available devices, which can exceed the array capacity
	if (num_devices > BT_MAX_CL_DEVICES)
		num_devices = BT_MAX_CL_DEVICES;

	cl_context_properties* cprops = (NULL == platform) ? NULL : cps;

	if (pGLContext)
	{
		//search for the GPU that relates to the OpenCL context
		for (cl_uint i=0;i<num_devices;i++)
		{
			retContext = clCreateContext(cprops,1,&devices[i],NULL,NULL,&ciErrNum);
			if (ciErrNum==CL_SUCCESS)
				break;
		}
	}
	else
	{
		if (preferredDeviceIndex>=0 && (cl_uint)preferredDeviceIndex<num_devices)
		{
			//create a context of the preferred device index
			retContext = clCreateContext(cprops,1,&devices[preferredDeviceIndex],NULL,NULL,&ciErrNum);
		} else
		{
			//create a context of all devices
			retContext = clCreateContext(cprops,num_devices,devices,NULL,NULL,&ciErrNum);
		}
	}

	if(pErrNum != NULL)
	{
		*pErrNum = ciErrNum;
	}
	return retContext;
}
///Creates an OpenCL context for the given device type, trying the available platforms in turn.
///The platform tried first is preferredPlatformIndex when it is a valid index, otherwise the first
///platform whose CL_PLATFORM_VENDOR equals the compile-time SDK vendor string; remaining platforms
///are tried in enumeration order until createContextFromPlatform succeeds.
///@param pErrNum optional; receives the OpenCL error code of a failed query or of the context creation
///@return the context, or 0 when no platform could provide one
cl_context btOpenCLUtils::createContextFromType(cl_device_type deviceType, cl_int* pErrNum, void* pGLContext, void* pGLDC , int preferredDeviceIndex, int preferredPlatformIndex)
{
	cl_uint numPlatforms = 0;
	cl_context retContext = 0;

	cl_int ciErrNum = clGetPlatformIDs(0, NULL, &numPlatforms);
	if(ciErrNum != CL_SUCCESS)
	{
		if(pErrNum != NULL) *pErrNum = ciErrNum;
		return NULL;
	}

	if(numPlatforms > 0)
	{
		cl_platform_id* platforms = new cl_platform_id[numPlatforms];
		ciErrNum = clGetPlatformIDs(numPlatforms, platforms, NULL);
		if(ciErrNum != CL_SUCCESS)
		{
			if(pErrNum != NULL) *pErrNum = ciErrNum;
			delete[] platforms;
			return NULL;
		}

		//index of the platform to try first; numPlatforms means "no preference found"
		cl_uint firstChoice = numPlatforms;
		if (preferredPlatformIndex>=0 && (cl_uint)preferredPlatformIndex<numPlatforms)
		{
			//an explicit, valid index always wins over the SDK vendor match
			firstChoice = (cl_uint)preferredPlatformIndex;
		} else
		{
			for (cl_uint i = 0; i < numPlatforms; ++i)
			{
				char pbuf[128];
				ciErrNum = clGetPlatformInfo(	platforms[i],
												CL_PLATFORM_VENDOR,
												sizeof(pbuf),
												pbuf,
												NULL);
				if(ciErrNum != CL_SUCCESS)
				{
					if(pErrNum != NULL) *pErrNum = ciErrNum;
					delete[] platforms;
					return NULL;
				}
				if(!strcmp(pbuf, spPlatformVendor))
				{
					firstChoice = i;
					break;
				}
			}
		}

		if (firstChoice < numPlatforms)
		{
			//move the preferred platform to the front so it is tried first
			cl_platform_id tmpPlatform = platforms[0];
			platforms[0] = platforms[firstChoice];
			platforms[firstChoice] = tmpPlatform;
		}

		for (cl_uint i = 0; i < numPlatforms; ++i)
		{
			cl_platform_id platform = platforms[i];
			assert(platform);

			retContext = btOpenCLUtils::createContextFromPlatform(platform,deviceType,pErrNum,pGLContext,pGLDC,preferredDeviceIndex);

			if (retContext)
			{
//				printf("OpenCL platform details:\n");
				btOpenCLPlatformInfo platformInfo;

				btOpenCLUtils::getPlatformInfo(platform, platformInfo);

				printf("  CL_PLATFORM_VENDOR: \t\t\t%s\n",platformInfo.m_platformVendor);
				printf("  CL_PLATFORM_NAME: \t\t\t%s\n",platformInfo.m_platformName);
				printf("  CL_PLATFORM_VERSION: \t\t\t%s\n",platformInfo.m_platformVersion);

				break;
			}
		}

		delete[] platforms;
	}
	return retContext;
}
//////////////////////////////////////////////////////////////////////////////
//! Gets the id of the nth device from the context
//!
//! @return the id or -1 when out of range, or when the device list cannot be queried
//! @param cxMainContext         OpenCL context
//! @param deviceIndex           zero-based index of the device of interest
//////////////////////////////////////////////////////////////////////////////
cl_device_id btOpenCLUtils::getDevice(cl_context cxMainContext, int deviceIndex)
{
	const cl_device_id invalidDevice = (cl_device_id)-1;

	// get the size of the list of devices associated with context
	size_t szParmDataBytes = 0;
	if (clGetContextInfo(cxMainContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes) != CL_SUCCESS)
	{
		return invalidDevice;
	}

	// valid indices are [0..numDevices); an index equal to the count is already out of range
	const size_t numDevices = szParmDataBytes / sizeof(cl_device_id);
	if (deviceIndex < 0 || (size_t)deviceIndex >= numDevices)
	{
		return invalidDevice;
	}

	cl_device_id* cdDevices = (cl_device_id*) malloc(szParmDataBytes);
	if (cdDevices == NULL)
	{
		return invalidDevice;
	}

	cl_device_id device = invalidDevice;
	if (clGetContextInfo(cxMainContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL) == CL_SUCCESS)
	{
		device = cdDevices[deviceIndex];
	}
	free(cdDevices);

	return device;
}
///Returns the number of devices associated with the context,
///or 0 when the device list cannot be queried (for example for an invalid context).
int btOpenCLUtils::getNumDevices(cl_context cxMainContext)
{
	size_t szParamDataBytes = 0;
	if (clGetContextInfo(cxMainContext, CL_CONTEXT_DEVICES, 0, NULL, &szParamDataBytes) != CL_SUCCESS)
	{
		//the size is not written on failure, so do not derive a count from it
		return 0;
	}
	return (int)(szParamDataBytes / sizeof(cl_device_id));
}
///Queries the device through getDeviceInfo and prints its properties to stdout.
///Fields declared as size_t in btOpenCLDeviceInfo are cast to unsigned int before printing,
///because they are printed with %u.
void btOpenCLUtils::printDeviceInfo(cl_device_id device)
{
	btOpenCLDeviceInfo info;
	//start from a known state so a query that fails never leaves an unterminated string to print
	memset(&info, 0, sizeof(info));
	getDeviceInfo(device,info);

	printf("  CL_DEVICE_NAME: \t\t\t%s\n", info.m_deviceName);
	printf("  CL_DEVICE_VENDOR: \t\t\t%s\n", info.m_deviceVendor);
	printf("  CL_DRIVER_VERSION: \t\t\t%s\n", info.m_driverVersion);

	if( info.m_deviceType & CL_DEVICE_TYPE_CPU )
		printf("  CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_CPU");
	if( info.m_deviceType & CL_DEVICE_TYPE_GPU )
		printf("  CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_GPU");
	if( info.m_deviceType & CL_DEVICE_TYPE_ACCELERATOR )
		printf("  CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_ACCELERATOR");
	if( info.m_deviceType & CL_DEVICE_TYPE_DEFAULT )
		printf("  CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_DEFAULT");

	printf("  CL_DEVICE_MAX_COMPUTE_UNITS:\t\t%u\n", info.m_computeUnits);
	printf("  CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:\t%u\n", (unsigned int)info.m_workitemDims);
	printf("  CL_DEVICE_MAX_WORK_ITEM_SIZES:\t%u / %u / %u \n", (unsigned int)info.m_workItemSize[0], (unsigned int)info.m_workItemSize[1], (unsigned int)info.m_workItemSize[2]);
	printf("  CL_DEVICE_MAX_WORK_GROUP_SIZE:\t%u\n", (unsigned int)info.m_workgroupSize);
	printf("  CL_DEVICE_MAX_CLOCK_FREQUENCY:\t%u MHz\n", info.m_clockFrequency);
	printf("  CL_DEVICE_ADDRESS_BITS:\t\t%u\n", info.m_addressBits);
	printf("  CL_DEVICE_MAX_MEM_ALLOC_SIZE:\t\t%u MByte\n", (unsigned int)(info.m_maxMemAllocSize/ (1024 * 1024)));
	printf("  CL_DEVICE_GLOBAL_MEM_SIZE:\t\t%u MByte\n", (unsigned int)(info.m_globalMemSize/ (1024 * 1024)));
	printf("  CL_DEVICE_ERROR_CORRECTION_SUPPORT:\t%s\n", info.m_errorCorrectionSupport== CL_TRUE ? "yes" : "no");
	printf("  CL_DEVICE_LOCAL_MEM_TYPE:\t\t%s\n", info.m_localMemType == 1 ? "local" : "global");
	printf("  CL_DEVICE_LOCAL_MEM_SIZE:\t\t%u KByte\n", (unsigned int)(info.m_localMemSize / 1024));
	printf("  CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE:\t%u KByte\n", (unsigned int)(info.m_constantBufferSize / 1024));

	if( info.m_queueProperties  & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE )
		printf("  CL_DEVICE_QUEUE_PROPERTIES:\t\t%s\n", "CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE");
	if( info.m_queueProperties & CL_QUEUE_PROFILING_ENABLE )
		printf("  CL_DEVICE_QUEUE_PROPERTIES:\t\t%s\n", "CL_QUEUE_PROFILING_ENABLE");

	printf("  CL_DEVICE_IMAGE_SUPPORT:\t\t%u\n", info.m_imageSupport);
	printf("  CL_DEVICE_MAX_READ_IMAGE_ARGS:\t%u\n", info.m_maxReadImageArgs);
	printf("  CL_DEVICE_MAX_WRITE_IMAGE_ARGS:\t%u\n", info.m_maxWriteImageArgs);

	printf("\n  CL_DEVICE_IMAGE <dim>");
	printf("\t\t\t2D_MAX_WIDTH\t %u\n", (unsigned int)info.m_image2dMaxWidth);
	printf("\t\t\t\t\t2D_MAX_HEIGHT\t %u\n", (unsigned int)info.m_image2dMaxHeight);
	printf("\t\t\t\t\t3D_MAX_WIDTH\t %u\n", (unsigned int)info.m_image3dMaxWidth);
	printf("\t\t\t\t\t3D_MAX_HEIGHT\t %u\n", (unsigned int)info.m_image3dMaxHeight);
	printf("\t\t\t\t\t3D_MAX_DEPTH\t %u\n", (unsigned int)info.m_image3dMaxDepth);

	//m_deviceExtensions is an array, so test its first character rather than its address
	if (info.m_deviceExtensions[0] != 0)
		printf("\n  CL_DEVICE_EXTENSIONS:%s\n",info.m_deviceExtensions);
	else
		printf("  CL_DEVICE_EXTENSIONS: None\n");

	printf("  CL_DEVICE_PREFERRED_VECTOR_WIDTH_<t>\t");
	printf("CHAR %u, SHORT %u, INT %u,LONG %u, FLOAT %u, DOUBLE %u\n\n\n",
		info.m_vecWidthChar, info.m_vecWidthShort, info.m_vecWidthInt, info.m_vecWidthLong,info.m_vecWidthFloat, info.m_vecWidthDouble);
}
///Fills info with the properties of the device, one clGetDeviceInfo query per field.
///info is zeroed first, so a field whose query fails reads as 0 / an empty string.
///This matters in particular for string properties longer than BT_MAX_STRING_LENGTH:
///the query then fails and nothing is written to the buffer.
void btOpenCLUtils::getDeviceInfo(cl_device_id device, btOpenCLDeviceInfo& info)
{
	memset(&info, 0, sizeof(info));

	// CL_DEVICE_NAME
	clGetDeviceInfo(device, CL_DEVICE_NAME, BT_MAX_STRING_LENGTH, &info.m_deviceName, NULL);

	// CL_DEVICE_VENDOR
	clGetDeviceInfo(device, CL_DEVICE_VENDOR, BT_MAX_STRING_LENGTH, &info.m_deviceVendor, NULL);

	// CL_DRIVER_VERSION
	clGetDeviceInfo(device, CL_DRIVER_VERSION, BT_MAX_STRING_LENGTH, &info.m_driverVersion, NULL);

	// CL_DEVICE_INFO
	clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(cl_device_type), &info.m_deviceType, NULL);

	// CL_DEVICE_MAX_COMPUTE_UNITS
	clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(info.m_computeUnits), &info.m_computeUnits, NULL);

	// CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS
	// OpenCL returns this property as a cl_uint, while the struct field is a size_t:
	// query into a cl_uint and convert, so no bytes of the wider field are left unwritten.
	cl_uint workItemDims = 0;
	clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(workItemDims), &workItemDims, NULL);
	info.m_workitemDims = workItemDims;

	// CL_DEVICE_MAX_WORK_ITEM_SIZES
	clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(info.m_workItemSize), &info.m_workItemSize, NULL);

	// CL_DEVICE_MAX_WORK_GROUP_SIZE
	clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(info.m_workgroupSize), &info.m_workgroupSize, NULL);

	// CL_DEVICE_MAX_CLOCK_FREQUENCY
	clGetDeviceInfo(device, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(info.m_clockFrequency), &info.m_clockFrequency, NULL);

	// CL_DEVICE_ADDRESS_BITS
	clGetDeviceInfo(device, CL_DEVICE_ADDRESS_BITS, sizeof(info.m_addressBits), &info.m_addressBits, NULL);

	// CL_DEVICE_MAX_MEM_ALLOC_SIZE
	clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(info.m_maxMemAllocSize), &info.m_maxMemAllocSize, NULL);

	// CL_DEVICE_GLOBAL_MEM_SIZE
	clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(info.m_globalMemSize), &info.m_globalMemSize, NULL);

	// CL_DEVICE_ERROR_CORRECTION_SUPPORT
	clGetDeviceInfo(device, CL_DEVICE_ERROR_CORRECTION_SUPPORT, sizeof(info.m_errorCorrectionSupport), &info.m_errorCorrectionSupport, NULL);

	// CL_DEVICE_LOCAL_MEM_TYPE
	clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_TYPE, sizeof(info.m_localMemType), &info.m_localMemType, NULL);

	// CL_DEVICE_LOCAL_MEM_SIZE
	clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(info.m_localMemSize), &info.m_localMemSize, NULL);

	// CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE
	clGetDeviceInfo(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(info.m_constantBufferSize), &info.m_constantBufferSize, NULL);

	// CL_DEVICE_QUEUE_PROPERTIES
	clGetDeviceInfo(device, CL_DEVICE_QUEUE_PROPERTIES, sizeof(info.m_queueProperties), &info.m_queueProperties, NULL);

	// CL_DEVICE_IMAGE_SUPPORT
	clGetDeviceInfo(device, CL_DEVICE_IMAGE_SUPPORT, sizeof(info.m_imageSupport), &info.m_imageSupport, NULL);

	// CL_DEVICE_MAX_READ_IMAGE_ARGS
	clGetDeviceInfo(device, CL_DEVICE_MAX_READ_IMAGE_ARGS, sizeof(info.m_maxReadImageArgs), &info.m_maxReadImageArgs, NULL);

	// CL_DEVICE_MAX_WRITE_IMAGE_ARGS
	clGetDeviceInfo(device, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, sizeof(info.m_maxWriteImageArgs), &info.m_maxWriteImageArgs, NULL);

	// CL_DEVICE_IMAGE2D_MAX_WIDTH, CL_DEVICE_IMAGE2D_MAX_HEIGHT, CL_DEVICE_IMAGE3D_MAX_WIDTH, CL_DEVICE_IMAGE3D_MAX_HEIGHT, CL_DEVICE_IMAGE3D_MAX_DEPTH
	clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(size_t), &info.m_image2dMaxWidth, NULL);
	clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(size_t), &info.m_image2dMaxHeight, NULL);
	clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_WIDTH, sizeof(size_t), &info.m_image3dMaxWidth, NULL);
	clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_HEIGHT, sizeof(size_t), &info.m_image3dMaxHeight, NULL);
	clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_DEPTH, sizeof(size_t), &info.m_image3dMaxDepth, NULL);

	// CL_DEVICE_EXTENSIONS: the space separated list of extension names
	clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, BT_MAX_STRING_LENGTH, &info.m_deviceExtensions, NULL);

	// CL_DEVICE_PREFERRED_VECTOR_WIDTH_<type>
	clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, sizeof(cl_uint), &info.m_vecWidthChar, NULL);
	clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, sizeof(cl_uint), &info.m_vecWidthShort, NULL);
	clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof(cl_uint), &info.m_vecWidthInt, NULL);
	clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, sizeof(cl_uint), &info.m_vecWidthLong, NULL);
	clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, sizeof(cl_uint), &info.m_vecWidthFloat, NULL);
	clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, sizeof(cl_uint), &info.m_vecWidthDouble, NULL);
}
//Returns a pointer into name just past the last occurrence of pattern,
//or name itself when pattern does not occur (or is empty).
//Used to drop the directory part of a path: strip2("dir/file.cl", "/") yields "file.cl".
static const char* strip2(const char* name, const char* pattern)
{
	size_t const patlen = strlen(pattern);
	const char* oriptr = name;
	const char* patloc;

	// an empty pattern matches everywhere without advancing, which would loop forever
	if (patlen == 0)
	{
		return name;
	}

	// skip past every occurrence of the pattern; oriptr ends up after the last one
	while ((patloc = strstr(oriptr, pattern)) != NULL)
	{
		oriptr = patloc + patlen;
	}
	return oriptr;
}
///Builds an OpenCL program for one device, optionally using a binary cache on disk.
///
///When clFileNameForCaching is given, the cache file is "cache/<file>.<device name>.<driver version>.bin".
///On Windows the cached binary is loaded when it is at least as new as the source file
///(in release builds also when the source file cannot be opened). Otherwise, or when loading fails,
///the program is compiled from kernelSource and, if caching is requested and the program is
///associated with exactly one device, its binary is written to the cache file.
///
///@param kernelSource null terminated OpenCL C source
///@param pErrNum optional; receives the OpenCL error code when compiling from source fails
///@param additionalMacros extra build options appended to the default flags; a null pointer is treated as ""
///@param clFileNameForCaching path of the source file, used for the cache file name and the timestamp check; 0 disables caching
///@return the built program, or 0 on failure
cl_program btOpenCLUtils::compileCLProgramFromString(cl_context clContext, cl_device_id device, const char* kernelSource, cl_int* pErrNum, const char* additionalMacros , const char* clFileNameForCaching)
{
	cl_program m_cpProgram=0;
	cl_int status;

	char binaryFileName[522];
	binaryFileName[0] = '\0';

	//the macro string is handed to strlen/sprintf below, so a null pointer must not get that far
	if (!additionalMacros)
	{
		additionalMacros = "";
	}

	if (clFileNameForCaching)
	{
		char deviceName[256];
		char driverVersion[256];
		deviceName[0] = '\0';
		driverVersion[0] = '\0';
		clGetDeviceInfo(device, CL_DEVICE_NAME, 256, &deviceName, NULL);
		clGetDeviceInfo(device, CL_DRIVER_VERSION, 256, &driverVersion, NULL);

		//keep only the file name: strip everything up to the last backslash, then up to the last slash
		const char* strippedName = strip2(clFileNameForCaching,"\\");
		strippedName = strip2(strippedName,"/");

#ifdef _WIN32
		sprintf_s(binaryFileName,"cache/%s.%s.%s.bin",strippedName, deviceName,driverVersion );
#else
		snprintf(binaryFileName,sizeof(binaryFileName),"cache/%s.%s.%s.bin",strippedName, deviceName,driverVersion );
#endif
		//printf("searching for %s\n", binaryFileName);

#ifdef _WIN32
		bool fileUpToDate = false;
		bool binaryFileValid=false;
		FILETIME modtimeBinary;

		CreateDirectory("cache",0);
		{
			//step 1: fetch the modification time of the cached binary, if it exists
			HANDLE binaryFileHandle = CreateFile(binaryFileName,GENERIC_READ,0,0,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,0);
			if (binaryFileHandle ==INVALID_HANDLE_VALUE)
			{
				DWORD errorCode;
				errorCode = GetLastError();
				switch (errorCode)
				{
				case ERROR_FILE_NOT_FOUND:
					{
						printf("\nCached file not found %s\n", binaryFileName);
						break;
					}
				case ERROR_PATH_NOT_FOUND:
					{
						printf("\nCached file path not found %s\n", binaryFileName);
						break;
					}
				default:
					{
						printf("\nFailed reading cached file with errorCode = %d\n", (int)errorCode);
					}
				}
			} else
			{
				if (GetFileTime(binaryFileHandle, NULL, NULL, &modtimeBinary)==0)
				{
					DWORD errorCode;
					errorCode = GetLastError();
					printf("\nGetFileTime errorCode = %d\n", (int)errorCode);
				} else
				{
					binaryFileValid = true;
				}
				CloseHandle(binaryFileHandle);
			}

			//step 2: the binary is usable when it is not older than the source file
			if (binaryFileValid)
			{
				HANDLE srcFileHandle = CreateFile(clFileNameForCaching,GENERIC_READ,0,0,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,0);
				if (srcFileHandle!=INVALID_HANDLE_VALUE)
				{
					FILETIME modtimeSrc;
					if (GetFileTime(srcFileHandle, NULL, NULL, &modtimeSrc)==0)
					{
						//without a valid source timestamp there is nothing to compare: rebuild from source
						DWORD errorCode;
						errorCode = GetLastError();
						printf("\nGetFileTime errorCode = %d\n", (int)errorCode);
					}
					else if (  ( modtimeSrc.dwHighDateTime < modtimeBinary.dwHighDateTime)
						||(( modtimeSrc.dwHighDateTime == modtimeBinary.dwHighDateTime)&&(modtimeSrc.dwLowDateTime <= modtimeBinary.dwLowDateTime)))
					{
						fileUpToDate=true;
					} else
					{
						printf("\nCached binary file out-of-date (%s)\n",binaryFileName);
					}
					CloseHandle(srcFileHandle);
				}
				else
				{
#ifdef _DEBUG
					DWORD errorCode;
					errorCode = GetLastError();
					switch (errorCode)
					{
					case ERROR_FILE_NOT_FOUND:
						{
							printf("\nSrc file not found %s\n", clFileNameForCaching);
							break;
						}
					case ERROR_PATH_NOT_FOUND:
						{
							printf("\nSrc path not found %s\n", clFileNameForCaching);
							break;
						}
					default:
						{
							printf("\nnSrc file reading errorCode = %d\n", (int)errorCode);
						}
					}

					//we should make sure the src file exists so we can verify the timestamp with binary
					assert(0);
#else
					//if we cannot find the source, assume it is OK in release builds
					fileUpToDate = true;
#endif
				}
			}
		}

		//step 3: load and build the cached binary
		if( fileUpToDate)
		{
			FILE* file = fopen(binaryFileName, "rb");
			if (file)
			{
				fseek( file, 0L, SEEK_END );
				const long fileSize = ftell( file );
				rewind( file );

				if (fileSize > 0)
				{
					size_t binarySize = (size_t)fileSize;
					char* binary = new char[binarySize];
					const size_t bytesRead = fread( binary, sizeof(char), binarySize, file );
					fclose( file );

					//only hand a completely read binary to OpenCL
					if (bytesRead == binarySize)
					{
						m_cpProgram = clCreateProgramWithBinary( clContext, 1,&device, &binarySize, (const unsigned char**)&binary, 0, &status );
						btAssert( status == CL_SUCCESS );
						if (status == CL_SUCCESS)
						{
							status = clBuildProgram( m_cpProgram, 1, &device, additionalMacros, 0, 0 );
							btAssert( status == CL_SUCCESS );
						}

						if( status != CL_SUCCESS )
						{
							if (m_cpProgram)
							{
								size_t ret_val_size = 0;
								clGetProgramBuildInfo(m_cpProgram, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
								char* build_log = new char[ret_val_size+1];
								clGetProgramBuildInfo(m_cpProgram, device, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
								build_log[ret_val_size] = '\0';
								printf("%s\n", build_log);
								delete[] build_log;
								clReleaseProgram(m_cpProgram);
							}
							btAssert(0);
							//fall back to compiling from source below
							m_cpProgram = 0;
						}
					}
					delete[] binary;
				} else
				{
					fclose( file );
				}
			}
		}
#endif //_WIN32
	}

	if (!m_cpProgram)
	{
		cl_int localErrNum;
		size_t program_length = strlen(kernelSource);

		m_cpProgram = clCreateProgramWithSource(clContext, 1, (const char**)&kernelSource, &program_length, &localErrNum);
		if (localErrNum!= CL_SUCCESS)
		{
			if (pErrNum)
				*pErrNum = localErrNum;
			return 0;
		}

		// Build the program with 'mad' Optimization option
#ifdef MAC
		const char* flags = "-cl-mad-enable -DMAC -DGUID_ARG";
#else
		//const char* flags = "-DGUID_ARG= -fno-alias";
		const char* flags = "-DGUID_ARG= ";
#endif

		char* compileFlags = new char[strlen(additionalMacros) + strlen(flags) + 5];
		sprintf(compileFlags, "%s %s", flags, additionalMacros);
		localErrNum = clBuildProgram(m_cpProgram, 1, &device, compileFlags, NULL, NULL);
		//the option string is not needed after the build call, whatever its outcome
		delete [] compileFlags;

		if (localErrNum!= CL_SUCCESS)
		{
			size_t ret_val_size = 0;
			clGetProgramBuildInfo(m_cpProgram, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
			char* build_log = new char[ret_val_size+1];
			clGetProgramBuildInfo(m_cpProgram, device, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);

			// to be carefully, terminate with \0
			// there's no information in the reference whether the string is 0 terminated or not
			build_log[ret_val_size] = '\0';

			printf("Error in clBuildProgram, Line %u in file %s, Log: \n%s\n !!!\n\n", __LINE__, __FILE__, build_log);
			delete[] build_log;
			//the caller only gets 0 back, so the failed program has to be released here
			clReleaseProgram(m_cpProgram);
			if (pErrNum)
				*pErrNum = localErrNum;
			return 0;
		}

		if( clFileNameForCaching )
		{	//	write to binary
			cl_uint numAssociatedDevices = 0;
			status = clGetProgramInfo( m_cpProgram, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &numAssociatedDevices, 0 );
			btAssert( status == CL_SUCCESS );
			if (status == CL_SUCCESS && numAssociatedDevices==1)
			{
				size_t binarySize = 0;
				status = clGetProgramInfo( m_cpProgram, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binarySize, 0 );
				btAssert( status == CL_SUCCESS );

				//a size of 0 means no binary is available for the device: do not create an empty cache file
				if (status == CL_SUCCESS && binarySize > 0)
				{
					char* binary = new char[binarySize];

					status = clGetProgramInfo( m_cpProgram, CL_PROGRAM_BINARIES, sizeof(char*), &binary, 0 );
					btAssert( status == CL_SUCCESS );

					if (status == CL_SUCCESS)
					{
						FILE* file = fopen(binaryFileName, "wb");
						if (file)
						{
							const size_t bytesWritten = fwrite( binary, sizeof(char), binarySize, file );
							fclose( file );
							if (bytesWritten != binarySize)
							{
								//a truncated binary must not be picked up as a valid cache entry later
								remove(binaryFileName);
								printf("cannot write file %s\n", binaryFileName);
							}
						} else
						{
							printf("cannot write file %s\n", binaryFileName);
						}
					}

					delete [] binary;
				}
			}
		}
	}

	return m_cpProgram;
}
///Creates the kernel named kernelName.
///When prog is given the kernel is created from that program and kernelSource is not used;
///otherwise kernelSource is compiled with compileCLProgramFromString first and the temporary
///program is released again before returning.
///@param pErrNum optional; receives CL_SUCCESS or the OpenCL error code of the failing step
///@return the kernel, or 0 on failure
cl_kernel btOpenCLUtils::compileCLKernelFromString(cl_context clContext, cl_device_id device, const char* kernelSource, const char* kernelName, cl_int* pErrNum, cl_program prog, const char* additionalMacros )
{
	printf("compiling kernel %s ",kernelName);
	cl_kernel kernel;
	cl_int localErrNum = CL_SUCCESS;

	cl_program m_cpProgram = prog;
	if (!m_cpProgram)
	{
		m_cpProgram = compileCLProgramFromString(clContext,device,kernelSource,pErrNum, additionalMacros);
		if (!m_cpProgram)
		{
			//the compile step already reported its error through pErrNum; keep that code
			return 0;
		}
	}

	// Create the kernel
	kernel = clCreateKernel(m_cpProgram, kernelName, &localErrNum);

	//a program compiled here is owned by this function: release it on success and on failure
	if (!prog)
	{
		clReleaseProgram(m_cpProgram);
	}

	if (localErrNum != CL_SUCCESS)
	{
		printf("Error in clCreateKernel, Line %u in file %s, cannot find kernel function %s !!!\n\n", __LINE__, __FILE__, kernelName);
		if (pErrNum)
			*pErrNum = localErrNum;
		return 0;
	}

	printf("ready. \n");

	if (pErrNum)
		*pErrNum = CL_SUCCESS;
	return kernel;
}

View File

@@ -0,0 +1,104 @@
/*
Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
Copyright (C) 2006 - 2011 Sony Computer Entertainment Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//original author: Roman Ponomarev
//cleanup by Erwin Coumans
#ifndef BT_OPENCL_UTILS_H
#define BT_OPENCL_UTILS_H
#include "btOpenCLInclude.h"
#define BT_MAX_STRING_LENGTH 1024
///Properties of one OpenCL device, as filled in by btOpenCLUtils::getDeviceInfo.
///Each member holds the result of the clGetDeviceInfo query named in its comment.
struct btOpenCLDeviceInfo
{
//string properties, each stored in a fixed buffer of BT_MAX_STRING_LENGTH bytes
char m_deviceName[BT_MAX_STRING_LENGTH]; //CL_DEVICE_NAME
char m_deviceVendor[BT_MAX_STRING_LENGTH]; //CL_DEVICE_VENDOR
char m_driverVersion[BT_MAX_STRING_LENGTH]; //CL_DRIVER_VERSION
char m_deviceExtensions[BT_MAX_STRING_LENGTH]; //CL_DEVICE_EXTENSIONS
cl_device_type m_deviceType; //CL_DEVICE_TYPE
cl_uint m_computeUnits; //CL_DEVICE_MAX_COMPUTE_UNITS
//NOTE(review): the OpenCL specification lists CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS as a cl_uint,
//while this field is a size_t - confirm that the query fills the whole field.
size_t m_workitemDims; //CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS
size_t m_workItemSize[3]; //CL_DEVICE_MAX_WORK_ITEM_SIZES
size_t m_image2dMaxWidth; //CL_DEVICE_IMAGE2D_MAX_WIDTH
size_t m_image2dMaxHeight; //CL_DEVICE_IMAGE2D_MAX_HEIGHT
size_t m_image3dMaxWidth; //CL_DEVICE_IMAGE3D_MAX_WIDTH
size_t m_image3dMaxHeight; //CL_DEVICE_IMAGE3D_MAX_HEIGHT
size_t m_image3dMaxDepth; //CL_DEVICE_IMAGE3D_MAX_DEPTH
size_t m_workgroupSize; //CL_DEVICE_MAX_WORK_GROUP_SIZE
cl_uint m_clockFrequency; //CL_DEVICE_MAX_CLOCK_FREQUENCY
cl_ulong m_constantBufferSize; //CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE
cl_ulong m_localMemSize; //CL_DEVICE_LOCAL_MEM_SIZE
cl_ulong m_globalMemSize; //CL_DEVICE_GLOBAL_MEM_SIZE
cl_bool m_errorCorrectionSupport; //CL_DEVICE_ERROR_CORRECTION_SUPPORT
cl_device_local_mem_type m_localMemType; //CL_DEVICE_LOCAL_MEM_TYPE
cl_uint m_maxReadImageArgs; //CL_DEVICE_MAX_READ_IMAGE_ARGS
cl_uint m_maxWriteImageArgs; //CL_DEVICE_MAX_WRITE_IMAGE_ARGS
cl_uint m_addressBits; //CL_DEVICE_ADDRESS_BITS
cl_ulong m_maxMemAllocSize; //CL_DEVICE_MAX_MEM_ALLOC_SIZE
cl_command_queue_properties m_queueProperties; //CL_DEVICE_QUEUE_PROPERTIES
cl_bool m_imageSupport; //CL_DEVICE_IMAGE_SUPPORT
//CL_DEVICE_PREFERRED_VECTOR_WIDTH_<type>, one member per element type
cl_uint m_vecWidthChar;
cl_uint m_vecWidthShort;
cl_uint m_vecWidthInt;
cl_uint m_vecWidthLong;
cl_uint m_vecWidthFloat;
cl_uint m_vecWidthDouble;
};
///Strings describing one OpenCL platform, as filled in by btOpenCLUtils::getPlatformInfo.
struct btOpenCLPlatformInfo
{
char m_platformVendor[BT_MAX_STRING_LENGTH]; //CL_PLATFORM_VENDOR
char m_platformName[BT_MAX_STRING_LENGTH]; //CL_PLATFORM_NAME
char m_platformVersion[BT_MAX_STRING_LENGTH]; //CL_PLATFORM_VERSION
};
///Static helper functions to create OpenCL contexts, query platforms and devices,
///and compile programs and kernels from source strings.
class btOpenCLUtils
{
public:
/// CL Context optionally takes a GL context. This is a generic type because we don't really want this code
/// to have to understand GL types. It is a HGLRC in _WIN32 or a GLXContext otherwise.
/// Tries the available platforms until a context for deviceType can be created; returns 0 on failure.
static cl_context createContextFromType(cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx = 0, void* pGLDC = 0, int preferredDeviceIndex = -1, int preferredPlatformIndex= - 1);
///number of devices associated with the context
static int getNumDevices(cl_context cxMainContext);
///get the nr'th device of the context; returns (cl_device_id)-1 when nr is out of range
static cl_device_id getDevice(cl_context cxMainContext, int nr);
///fill info with the properties of the device (one clGetDeviceInfo query per member)
static void getDeviceInfo(cl_device_id device, btOpenCLDeviceInfo& info);
///print the properties of the device to stdout
static void printDeviceInfo(cl_device_id device);
///create the kernel kernelName, from prog when given, otherwise by compiling kernelSource; returns 0 on failure
static cl_kernel compileCLKernelFromString( cl_context clContext,cl_device_id device, const char* kernelSource, const char* kernelName, cl_int* pErrNum=0, cl_program prog=0,const char* additionalMacros = "" );
//optional
///build a program from kernelSource; when srcFileNameForCaching is given the implementation
///keeps a binary of the built program in a "cache" directory. Returns 0 on failure.
static cl_program compileCLProgramFromString( cl_context clContext,cl_device_id device, const char* kernelSource, cl_int* pErrNum=0,const char* additionalMacros = "" , const char* srcFileNameForCaching=0);
//the following optional APIs provide access using specific platform information
///number of OpenCL platforms reported by clGetPlatformIDs
static int getNumPlatforms(cl_int* pErrNum=0);
///get the nr'th platform, where nr is in the range [0..getNumPlatforms)
static cl_platform_id getPlatform(int nr, cl_int* pErrNum=0);
///fill platformInfo with the vendor, name and version strings of the platform
static void getPlatformInfo(cl_platform_id platform, btOpenCLPlatformInfo& platformInfo);
///vendor name of the OpenCL SDK selected at compile time through the CL_PLATFORM_* macros
static const char* getSdkVendorName();
///create a context on a specific platform; when preferredDeviceIndex is a valid device index
///(and no GL context is given) only that device is used, otherwise all devices of deviceType
static cl_context createContextFromPlatform(cl_platform_id platform, cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx = 0, void* pGLDC = 0,int preferredDeviceIndex = -1, int preferredPlatformIndex= -1);
};
#endif // BT_OPENCL_UTILS_H

View File

@@ -0,0 +1,92 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2011 Advanced Micro Devices, Inc. http://bulletphysics.org
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
///original author: Erwin Coumans
#include "btOpenCLUtils.h"
#include <stdio.h>
///OpenCL context created in main() through btOpenCLUtils::createContextFromType
cl_context g_cxMainContext;
///command queue created (and released again) in main() for each device of g_cxMainContext
cl_command_queue g_cqCommandQue;
int main(int argc, char* argv[])
{
int ciErrNum = 0;
cl_device_type deviceType = CL_DEVICE_TYPE_ALL;
const char* vendorSDK = btOpenCLUtils::getSdkVendorName();
printf("This program was compiled using the %s OpenCL SDK\n",vendorSDK);
int numPlatforms = btOpenCLUtils::getNumPlatforms();
printf("Num Platforms = %d\n", numPlatforms);
for (int i=0;i<numPlatforms;i++)
{
cl_platform_id platform = btOpenCLUtils::getPlatform(i);
btOpenCLPlatformInfo platformInfo;
btOpenCLUtils::getPlatformInfo(platform,platformInfo);
printf("--------------------------------\n");
printf("Platform info for platform nr %d:\n",i);
printf(" CL_PLATFORM_VENDOR: \t\t\t%s\n",platformInfo.m_platformVendor);
printf(" CL_PLATFORM_NAME: \t\t\t%s\n",platformInfo.m_platformName);
printf(" CL_PLATFORM_VERSION: \t\t\t%s\n",platformInfo.m_platformVersion);
cl_context context = btOpenCLUtils::createContextFromPlatform(platform,deviceType,&ciErrNum);
int numDevices = btOpenCLUtils::getNumDevices(context);
printf("Num Devices = %d\n", numDevices);
for (int j=0;j<numDevices;j++)
{
cl_device_id dev = btOpenCLUtils::getDevice(context,j);
btOpenCLDeviceInfo devInfo;
btOpenCLUtils::getDeviceInfo(dev,devInfo);
btOpenCLUtils::printDeviceInfo(dev);
}
clReleaseContext(context);
}
///Easier method to initialize OpenCL using createContextFromType for a GPU
deviceType = CL_DEVICE_TYPE_GPU;
void* glCtx=0;
void* glDC = 0;
printf("Initialize OpenCL using btOpenCLUtils::createContextFromType for CL_DEVICE_TYPE_GPU\n");
g_cxMainContext = btOpenCLUtils::createContextFromType(deviceType, &ciErrNum, glCtx, glDC);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
int numDev = btOpenCLUtils::getNumDevices(g_cxMainContext);
for (int i=0;i<numDev;i++)
{
cl_device_id device;
device = btOpenCLUtils::getDevice(g_cxMainContext,i);
btOpenCLDeviceInfo clInfo;
btOpenCLUtils::getDeviceInfo(device,clInfo);
btOpenCLUtils::printDeviceInfo(device);
// create a command-queue
g_cqCommandQue = clCreateCommandQueue(g_cxMainContext, device, 0, &ciErrNum);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
//normally you would create and execute kernels using this command queue
clReleaseCommandQueue(g_cqCommandQue);
}
clReleaseContext(g_cxMainContext);
return 0;
}

View File

@@ -0,0 +1,4 @@
-- Pull in the build script of each vendor-specific sub directory.
include "AMD"
include "Intel"
include "NVIDIA"

View File

@@ -0,0 +1,49 @@
-- Premake project for the broadphase benchmark, built against the AMD OpenCL SDK.
-- findOpenCL_AMD, initOpenCL_AMD, initOpenGL, initGlut and initGlew are helpers defined
-- outside this script; the project is only declared when findOpenCL_AMD reports success.
hasCL = findOpenCL_AMD()
if (hasCL) then
project "OpenCL_broadphase_benchmark_AMD"
initOpenCL_AMD()
language "C++"
kind "ConsoleApp"
targetdir "../../../bin"
initOpenGL()
initGlut()
initGlew()
-- header search paths, relative to this script
includedirs {
"../../../rendering/BulletMath",
"../../primitives",
"../../../../../src"
}
-- benchmark sources, the shared 3d grid broadphase, the Bullet source files it needs,
-- and the OpenCL / OpenGL interop utilities
files {
"../main.cpp",
"../findPairsOpenCL.cpp",
"../findPairsOpenCL.h",
"../btGridBroadphaseCL.cpp",
"../btGridBroadphaseCL.h",
"../../3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.cpp",
"../../3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.h",
"../../3dGridBroadphase/Shared/btGpu3DGridBroadphase.cpp",
"../../3dGridBroadphase/Shared/btGpu3DGridBroadphase.h",
"../../../../../src/LinearMath/btAlignedAllocator.cpp",
"../../../../../src/LinearMath/btQuickprof.cpp",
"../../../../../src/LinearMath/btQuickprof.h",
"../../../../../src/BulletCollision/BroadphaseCollision/btBroadphaseProxy.cpp",
"../../../../../src/BulletCollision/BroadphaseCollision/btOverlappingPairCache.cpp",
"../../../../../src/BulletCollision/BroadphaseCollision/btSimpleBroadphase.cpp",
"../../basic_initialize/btOpenCLUtils.cpp",
"../../basic_initialize/btOpenCLUtils.h",
"../../opengl_interop/btOpenCLGLInteropBuffer.cpp",
"../../opengl_interop/btOpenCLGLInteropBuffer.h",
"../../opengl_interop/btStopwatch.cpp",
"../../opengl_interop/btStopwatch.h"
}
end

View File

@@ -0,0 +1,49 @@
-- Premake project for the broadphase benchmark, built against the Intel OpenCL SDK.
-- findOpenCL_Intel, initOpenCL_Intel, initOpenGL, initGlut and initGlew are helpers defined
-- outside this script; the project is only declared when findOpenCL_Intel reports success.
hasCL = findOpenCL_Intel()
if (hasCL) then
project "OpenCL_broadphase_benchmark_Intel"
initOpenCL_Intel()
language "C++"
kind "ConsoleApp"
targetdir "../../../bin"
initOpenGL()
initGlut()
initGlew()
-- header search paths, relative to this script
includedirs {
"../../../rendering/BulletMath",
"../../primitives",
"../../../../../src"
}
-- benchmark sources, the shared 3d grid broadphase, the Bullet source files it needs,
-- and the OpenCL / OpenGL interop utilities
files {
"../main.cpp",
"../findPairsOpenCL.cpp",
"../findPairsOpenCL.h",
"../btGridBroadphaseCL.cpp",
"../btGridBroadphaseCL.h",
"../../3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.cpp",
"../../3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.h",
"../../3dGridBroadphase/Shared/btGpu3DGridBroadphase.cpp",
"../../3dGridBroadphase/Shared/btGpu3DGridBroadphase.h",
"../../../../../src/LinearMath/btAlignedAllocator.cpp",
"../../../../../src/LinearMath/btQuickprof.cpp",
"../../../../../src/LinearMath/btQuickprof.h",
"../../../../../src/BulletCollision/BroadphaseCollision/btBroadphaseProxy.cpp",
"../../../../../src/BulletCollision/BroadphaseCollision/btOverlappingPairCache.cpp",
"../../../../../src/BulletCollision/BroadphaseCollision/btSimpleBroadphase.cpp",
"../../basic_initialize/btOpenCLUtils.cpp",
"../../basic_initialize/btOpenCLUtils.h",
"../../opengl_interop/btOpenCLGLInteropBuffer.cpp",
"../../opengl_interop/btOpenCLGLInteropBuffer.h",
"../../opengl_interop/btStopwatch.cpp",
"../../opengl_interop/btStopwatch.h"
}
end

View File

@@ -0,0 +1,49 @@
-- Premake project for the broadphase benchmark, built against the NVIDIA OpenCL SDK.
-- findOpenCL_NVIDIA, initOpenCL_NVIDIA, initOpenGL, initGlut and initGlew are helpers defined
-- outside this script; the project is only declared when findOpenCL_NVIDIA reports success.
hasCL = findOpenCL_NVIDIA()
if (hasCL) then
project "OpenCL_broadphase_benchmark_NVIDIA"
initOpenCL_NVIDIA()
language "C++"
kind "ConsoleApp"
targetdir "../../../bin"
initOpenGL()
initGlut()
initGlew()
-- header search paths, relative to this script
includedirs {
"../../../rendering/BulletMath",
"../../primitives",
"../../../../../src"
}
-- benchmark sources, the shared 3d grid broadphase, the Bullet source files it needs,
-- and the OpenCL / OpenGL interop utilities
files {
"../main.cpp",
"../findPairsOpenCL.cpp",
"../findPairsOpenCL.h",
"../btGridBroadphaseCL.cpp",
"../btGridBroadphaseCL.h",
"../../3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.cpp",
"../../3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.h",
"../../3dGridBroadphase/Shared/btGpu3DGridBroadphase.cpp",
"../../3dGridBroadphase/Shared/btGpu3DGridBroadphase.h",
"../../../../../src/LinearMath/btAlignedAllocator.cpp",
"../../../../../src/LinearMath/btQuickprof.cpp",
"../../../../../src/LinearMath/btQuickprof.h",
"../../../../../src/BulletCollision/BroadphaseCollision/btBroadphaseProxy.cpp",
"../../../../../src/BulletCollision/BroadphaseCollision/btOverlappingPairCache.cpp",
"../../../../../src/BulletCollision/BroadphaseCollision/btSimpleBroadphase.cpp",
"../../basic_initialize/btOpenCLUtils.cpp",
"../../basic_initialize/btOpenCLUtils.h",
"../../opengl_interop/btOpenCLGLInteropBuffer.cpp",
"../../opengl_interop/btOpenCLGLInteropBuffer.h",
"../../opengl_interop/btStopwatch.cpp",
"../../opengl_interop/btStopwatch.h"
}
end

View File

@@ -0,0 +1,335 @@
MSTRINGIFY(
// 3x3 matrix stored as three float4 rows; the helpers in this file set the w lane of each row to 0.
typedef struct
{
float4 m_row[3];
} Matrix3x3;
// shorthand for unsigned int; used for the index and type fields of Body
typedef unsigned int u32;
// State of one rigid body as read and written by the kernels in this file.
// NOTE(review): presumably this layout has to match a host side structure - confirm before changing it.
typedef struct
{
float4 m_pos;
float4 m_quat;
float4 m_linVel;
float4 m_angVel;
u32 m_shapeIdx;
u32 m_shapeType;
// an inverse mass of 0 makes setupBodiesKernel skip the body
float m_invMass;
float m_restituitionCoeff;
float m_frictionCoeff;
} Body;
// Inverse inertia data of one body: two 3x3 matrices.
// NOTE(review): going by the names, m_initInvInertia is presumably the initial inverse inertia
// and m_invInertia the current one - confirm against the kernels using them.
typedef struct
{
Matrix3x3 m_invInertia;
Matrix3x3 m_initInvInertia;
} Shape;
// Builds a 3x3 matrix from the quaternion quat (x, y, z, w).
// Each entry is the absolute value (fabs) of the corresponding term of the usual
// quaternion to rotation matrix conversion; the w lane of every row is set to 0.
// NOTE(review): because of the fabs calls the result differs from the rotation matrix
// whenever one of its terms is negative - confirm that callers expect this.
__inline
Matrix3x3 qtGetRotationMatrix(float4 quat)
{
// squared x, y and z components; the w lane is not used
float4 quat2 = (float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);
Matrix3x3 out;
out.m_row[0].x=fabs(1-2*quat2.y-2*quat2.z);
out.m_row[0].y=fabs(2*quat.x*quat.y-2*quat.w*quat.z);
out.m_row[0].z=fabs(2*quat.x*quat.z+2*quat.w*quat.y);
out.m_row[0].w = 0.f;
out.m_row[1].x=fabs(2*quat.x*quat.y+2*quat.w*quat.z);
out.m_row[1].y=fabs(1-2*quat2.x-2*quat2.z);
out.m_row[1].z=fabs(2*quat.y*quat.z-2*quat.w*quat.x);
out.m_row[1].w = 0.f;
out.m_row[2].x=fabs(2*quat.x*quat.z-2*quat.w*quat.y);
out.m_row[2].y=fabs(2*quat.y*quat.z+2*quat.w*quat.x);
out.m_row[2].z=fabs(1-2*quat2.x-2*quat2.y);
out.m_row[2].w = 0.f;
return out;
}
// Three floats followed by one unsigned int (16 bytes in total).
// NOTE(review): presumably one corner of an axis aligned bounding box with an index or
// flag in uw - the uses are outside this part of the file, confirm there.
typedef struct
{
float fx;
float fy;
float fz;
unsigned int uw;
} btAABBCL;
// Returns the transpose of the upper 3x3 part of m; the w lane of every
// result row is 0.
__inline
Matrix3x3 mtTranspose(Matrix3x3 m)
{
	float4 r0 = m.m_row[0];
	float4 r1 = m.m_row[1];
	float4 r2 = m.m_row[2];

	// each output row gathers one column of the input
	Matrix3x3 t;
	t.m_row[0] = (float4)(r0.x, r1.x, r2.x, 0.f);
	t.m_row[1] = (float4)(r0.y, r1.y, r2.y, 0.f);
	t.m_row[2] = (float4)(r0.z, r1.z, r2.z, 0.f);
	return t;
}
// Three component dot product of two float4 values; the w lanes are ignored.
__inline
float dot3F4(float4 a, float4 b)
{
	// a and b are by-value copies, so clearing w is invisible to the caller
	a.w = 0.f;
	b.w = 0.f;
	return dot(a, b);
}
// Returns the 3x3 matrix product a * b; the w lane of every result row is 0.
__inline
Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b)
{
	// rows of bT are the columns of b, so each result element is a row/row dot product
	Matrix3x3 bT = mtTranspose( b );

	// The w lanes of a are cleared ahead of the loop; the original author
	// noted that clearing them inside the loop did not run.
	a.m_row[0].w = 0.f;
	a.m_row[1].w = 0.f;
	a.m_row[2].w = 0.f;

	Matrix3x3 product;
	for(int row=0; row<3; row++)
	{
		float4 lhs = a.m_row[row];
		product.m_row[row] = (float4)(dot3F4(lhs,bT.m_row[0]), dot3F4(lhs,bT.m_row[1]), dot3F4(lhs,bT.m_row[2]), 0.f);
	}
	return product;
}
//apply gravity
//update world inverse inertia tensor
//copy velocity from arrays to bodies
//copy transforms from buffer to bodies

// Signed rotation matrix of quaternion quat (x, y, z, w); the w lane of each
// row is 0. qtGetRotationMatrix in this file returns the element-wise ABSOLUTE
// value of this matrix, which is what an AABB computation needs but is not a
// rotation, so the inertia update below must not use it.
__inline
Matrix3x3 qtGetSignedRotationMatrix(float4 quat)
{
	float xx = quat.x*quat.x;
	float yy = quat.y*quat.y;
	float zz = quat.z*quat.z;
	float xy = 2*quat.x*quat.y;
	float xz = 2*quat.x*quat.z;
	float yz = 2*quat.y*quat.z;
	float wx = 2*quat.w*quat.x;
	float wy = 2*quat.w*quat.y;
	float wz = 2*quat.w*quat.z;

	Matrix3x3 rot;
	rot.m_row[0] = (float4)(1-2*yy-2*zz, xy-wz, xz+wy, 0.f);
	rot.m_row[1] = (float4)(xy+wz, 1-2*xx-2*zz, yz-wx, 0.f);
	rot.m_row[2] = (float4)(xz-wy, yz+wx, 1-2*xx-2*yy, 0.f);
	return rot;
}

// One work item per body (items with nodeID >= numNodes do nothing).
//  startOffset    offset of the position array inside g_vertexBuffer; it is divided by 4 before being
//                 used as a float4 index
//  numNodes       number of bodies
//  g_vertexBuffer positions at [startOffset/4 + i] and orientations at [startOffset/4 + numNodes + i]
//  linVel         per-body linear velocity; gravity is added in place for bodies with non-zero inverse mass
//  pAngVel        per-body angular velocity (read only here)
//  gBodies        per-body state; receives position, orientation and velocities
//  bodyInertias   per-body inertia; m_invInertia is recomputed as R * m_initInvInertia * R^T
// Bodies whose m_invMass is exactly 0 only get their body velocities zeroed.
__kernel void
setupBodiesKernel( const int startOffset, const int numNodes, __global float4 *g_vertexBuffer,
__global float4 *linVel,
__global float4 *pAngVel,
__global Body* gBodies, __global Shape* bodyInertias
)
{
	int nodeID = get_global_id(0);
	if( nodeID >= numNodes )
		return;

	// hardcoded simulation step
	const float timeStep = 0.0166666f;

	float inverseMass = gBodies[nodeID].m_invMass;
	if (inverseMass != 0.f)
	{
		float4 position = g_vertexBuffer[nodeID + startOffset/4];
		float4 orientation = g_vertexBuffer[nodeID + startOffset/4+numNodes];

		// hardcoded gravity along -y
		float4 gravityAcceleration = (float4)(0.f,-9.8f,0.f,0.f);
		linVel[nodeID] += gravityAcceleration * timeStep;

		gBodies[nodeID].m_pos = position;
		gBodies[nodeID].m_quat = orientation;
		gBodies[nodeID].m_linVel = (float4)(linVel[nodeID].xyz,0.f);
		gBodies[nodeID].m_angVel = (float4)(pAngVel[nodeID].xyz,0.f);

		// world inverse inertia = R * localInverseInertia * R^T with the SIGNED rotation matrix
		Matrix3x3 m = qtGetSignedRotationMatrix( orientation);
		Matrix3x3 mT = mtTranspose( m );
		Matrix3x3 tmp = mtMul(m, bodyInertias[nodeID].m_initInvInertia);
		bodyInertias[nodeID].m_invInertia = mtMul(tmp, mT);
	}
	else
	{
		gBodies[nodeID].m_linVel = (float4)(0.f,0.f,0.f,0.f);
		gBodies[nodeID].m_angVel = (float4)(0.f,0.f,0.f,0.f);
	}
}
// Copies the body velocities back into the flat velocity arrays.
// One work item per body; bodies whose m_invMass is exactly 0 are skipped.
// startOffset, g_vertexBuffer and bodyInertias are accepted but not used.
__kernel void
copyVelocitiesKernel( const int startOffset, const int numNodes, __global float4 *g_vertexBuffer,
__global float4 *linVel,
__global float4 *pAngVel,
__global Body* gBodies, __global Shape* bodyInertias
)
{
	int nodeID = get_global_id(0);
	if( nodeID >= numNodes )
		return;

	if (gBodies[nodeID].m_invMass == 0.f)
		return;

	// only xyz is carried over; w is forced to 0
	float4 bodyLinVel = gBodies[nodeID].m_linVel;
	float4 bodyAngVel = gBodies[nodeID].m_angVel;
	linVel[nodeID] = (float4)(bodyLinVel.xyz,0.f);
	pAngVel[nodeID] = (float4)(bodyAngVel.xyz,0.f);
}
// Writes a world space AABB for every body, assuming each body is a box with
// the hardcoded half extents (1.01, 1.01, 1.01), and paints the body green.
// g_vertexBuffer is read at [startOffset/4 + i] (position) and
// [startOffset/4 + numNodes + i] (orientation); the colour slot
// [startOffset/4 + 2*numNodes + i] is overwritten.
// pAABB receives the minimum corner at [2*i] and the maximum corner at [2*i+1],
// both tagged with the body index.
__kernel void
initializeGpuAabbsSimple( const int startOffset, const int numNodes, __global float4 *g_vertexBuffer, __global btAABBCL* pAABB)
{
	int nodeID = get_global_id(0);
	if( nodeID >= numNodes )
		return;

	int posIdx = nodeID + startOffset/4;
	int ornIdx = posIdx + numNodes;
	int colIdx = ornIdx + numNodes;

	float4 position = g_vertexBuffer[posIdx];
	float4 orientation = g_vertexBuffer[ornIdx];
	g_vertexBuffer[colIdx] = (float4)(.4f,1.f,.4f,1.f);

	// rotating the half extents with the absolute rotation matrix gives the
	// half size of the enclosing axis aligned box
	float4 halfExtents = (float4)(1.01f,1.01f,1.01f,0.f);
	Matrix3x3 abs_b = qtGetRotationMatrix(orientation);
	float4 extent = (float4)(
		dot(abs_b.m_row[0],halfExtents),
		dot(abs_b.m_row[1],halfExtents),
		dot(abs_b.m_row[2],halfExtents),
		0.f);

	__global btAABBCL* lo = pAABB + nodeID*2;
	__global btAABBCL* hi = lo + 1;

	lo->fx = position.x-extent.x;
	lo->fy = position.y-extent.y;
	lo->fz = position.z-extent.z;
	lo->uw = nodeID;

	hi->fx = position.x+extent.x;
	hi->fy = position.y+extent.y;
	hi->fz = position.z+extent.z;
	hi->uw = nodeID;
}
// Writes a world space AABB for every body using the local AABB of the body's
// shape, and paints the body green.
// plocalShapeAABB holds a min/max pair per shape at [2*shape] and [2*shape+1];
// only the SIZE of that local box is used and the result is centred on the
// body position.
// Bodies whose m_shapeIdx is negative when read as int get no AABB written.
// pAABB receives the minimum corner at [2*i] and the maximum corner at [2*i+1],
// both tagged with the body index.
__kernel void
initializeGpuAabbsFull( const int startOffset, const int numNodes, __global float4 *g_vertexBuffer, __global Body* gBodies, __global btAABBCL* plocalShapeAABB, __global btAABBCL* pAABB)
{
	int nodeID = get_global_id(0);
	if( nodeID >= numNodes )
		return;

	int posIdx = nodeID + startOffset/4;
	int ornIdx = posIdx + numNodes;
	int colIdx = ornIdx + numNodes;

	float4 position = g_vertexBuffer[posIdx];
	float4 orientation = g_vertexBuffer[ornIdx];
	g_vertexBuffer[colIdx] = (float4)(.4f,1.f,.4f,1.f);

	int shapeIndex = gBodies[nodeID].m_shapeIdx;
	if (shapeIndex < 0)
		return;

	btAABBCL localMin = plocalShapeAABB[shapeIndex*2];
	btAABBCL localMax = plocalShapeAABB[shapeIndex*2+1];
	float4 halfExtents = ((float4)(localMax.fx - localMin.fx,localMax.fy - localMin.fy,localMax.fz - localMin.fz,0.f))*0.5f;

	// rotate the half extents with the absolute rotation matrix
	Matrix3x3 abs_b = qtGetRotationMatrix(orientation);
	float4 extent = (float4)(
		dot(abs_b.m_row[0],halfExtents),
		dot(abs_b.m_row[1],halfExtents),
		dot(abs_b.m_row[2],halfExtents),
		0.f);

	__global btAABBCL* lo = pAABB + nodeID*2;
	__global btAABBCL* hi = lo + 1;

	lo->fx = position.x-extent.x;
	lo->fy = position.y-extent.y;
	lo->fz = position.z-extent.z;
	lo->uw = nodeID;

	hi->fx = position.x+extent.x;
	hi->fy = position.y+extent.y;
	hi->fz = position.z+extent.z;
	hi->uw = nodeID;
}
// Paints both bodies of every overlapping pair red.
// One work item per pair (items with id >= numOverlap do nothing); the colour
// of body i lives at g_vertexBuffer[startOffset/4 + 2*numNodes + i].
__kernel void
broadphaseColorKernel( const int startOffset, const int numNodes, __global float4 *g_vertexBuffer, __global int2* pOverlappingPairs, const int numOverlap)
{
	int pairID = get_global_id(0);
	if( pairID >= numOverlap )
		return;

	const float4 red = (float4)(1.f,0.4f,0.4f,1.f);
	int2 pair = pOverlappingPairs[pairID];
	g_vertexBuffer[pair.x + startOffset/4+numNodes+numNodes] = red;
	g_vertexBuffer[pair.y + startOffset/4+numNodes+numNodes] = red;
}
// Brute force proximity colouring: every work item compares its body position
// against all other bodies and writes a colour for its own body.
//  blue             some other body has exactly the same xyz position
//  red * overlap    otherwise, when at least one other body is closer than
//                   sqrt(7) (squared distance below 7); overlap grows by 0.25
//                   per such neighbour
//  green            otherwise
// Positions are read from g_vertexBuffer[startOffset/4 + i]; colours are
// written to g_vertexBuffer[startOffset/4 + 2*numNodes + i].
__kernel void
broadphaseKernel( const int startOffset, const int numNodes, __global float4 *g_vertexBuffer)
{
	int nodeID = get_global_id(0);
	if( nodeID >= numNodes )
		return;

	const float4 red = (float4)(1.f,0.f,0.f,0.f);
	const float4 green = (float4)(0.f,1.f,0.f,0.f);
	const float4 blue = (float4)(0.f,0.f,1.f,0.f);

	int colorIdx = nodeID + startOffset/4+numNodes+numNodes;
	float4 position = g_vertexBuffer[nodeID + startOffset/4];

	// default colour; replaced below once all neighbours have been visited
	g_vertexBuffer[colorIdx] = green;

	float overlap = 0;
	int equal = 0;
	for (int i=0;i<numNodes;i++)
	{
		if (i==nodeID)
			continue;

		float4 otherPosition = g_vertexBuffer[i + startOffset/4];
		if ((otherPosition.x == position.x)&&
			(otherPosition.y == position.y)&&
			(otherPosition.z == position.z))
		{
			equal = 1;
		}

		float dx = otherPosition.x - position.x;
		float dy = otherPosition.y - position.y;
		float dz = otherPosition.z - position.z;
		float distsqr = (dx*dx)+(dy*dy)+(dz*dz);
		if (distsqr<7.f)
		{
			overlap += 0.25f;
		}
	}

	if (equal)
	{
		g_vertexBuffer[colorIdx] = blue;
	}
	else if (overlap>0.f)
	{
		g_vertexBuffer[colorIdx] = red*overlap;
	}
	else
	{
		g_vertexBuffer[colorIdx] = green;
	}
}
);

View File

@@ -0,0 +1,231 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Roman Ponomarev, Erwin Coumans
// Kernel file name passed as the first argument of adl::DeviceCL::getKernel
// in the constructor below. With RELEASE_ME defined it is a bare file name,
// otherwise a relative, backslash separated path.
// NOTE(review): the two branches differ in whether the ".cl" extension is
// included - confirm which form getKernel expects.
#ifdef RELEASE_ME
#define COMPUTE_AABB_KERNEL_PATH "computeAabbKernelOCL.cl"
#else
#define COMPUTE_AABB_KERNEL_PATH "..\\..\\opencl\\broadphase_benchmark\\computeAabbKernelOCL"
#endif
#include "btGridBroadphaseCl.h"
#include "LinearMath/btQuickprof.h"
#include "Adl/Adl.h"
#include "AdlPrimitives/Math/Math.h"
#include "Adl/AdlKernel.h"
#include "../basic_initialize/btOpenCLUtils.h"
// MSTRINGIFY turns its argument into a string literal. computeAabbKernelOCL.cl
// wraps its whole content in MSTRINGIFY( ... ); so including it here embeds
// the kernel source text as the initializer of spComputeAabbSource (the
// terminating semicolon comes from the included file).
#define MSTRINGIFY(A) #A
static const char* spComputeAabbSource=
#include "computeAabbKernelOCL.cl"
// Host side AABB record: minimum corner plus index followed by maximum corner
// plus index. The field order matches btAabbCL in computeAabbKernelOCL.cl.
// It is not referenced by the code visible in this file.
struct btTmpAabb
{
float minfx;
float minfy;
float minfz;
unsigned int index0;
float maxfx;
float maxfy;
float maxfz;
unsigned int index1;
} ;
// Forwards every argument to bt3dGridBroadphaseOCL, then
//  - looks up the computeAabb, countOverlappingpairs and squeezePairCaches kernels,
//  - allocates the one element constant buffer m_aabbConstBuffer,
//  - creates the device buffer m_dAllOverlappingPairs with room for
//    m_maxHandles * m_maxPairsPerBody pairs and uploads zeros into it.
btGridBroadphaseCl::btGridBroadphaseCl( btOverlappingPairCache* overlappingPairCache,
const btVector3& cellSize,
int gridSizeX, int gridSizeY, int gridSizeZ,
int maxSmallProxies, int maxLargeProxies, int maxPairsPerSmallProxy,
btScalar maxSmallProxySize,
int maxSmallProxiesPerCell,
cl_context context,
cl_device_id device,
cl_command_queue queue,
adl::DeviceCL* deviceCL)
:bt3dGridBroadphaseOCL(overlappingPairCache,cellSize,
gridSizeX, gridSizeY, gridSizeZ,
maxSmallProxies, maxLargeProxies, maxPairsPerSmallProxy,
maxSmallProxySize,maxSmallProxiesPerCell,
context,device,queue,deviceCL)
{
	m_computeAabbKernel = m_deviceCL->getKernel(COMPUTE_AABB_KERNEL_PATH,"computeAabb","",spComputeAabbSource);
	m_countOverlappingPairs = m_deviceCL->getKernel(COMPUTE_AABB_KERNEL_PATH,"countOverlappingpairs","",spComputeAabbSource);
	m_squeezePairCaches = m_deviceCL->getKernel(COMPUTE_AABB_KERNEL_PATH,"squeezePairCaches","",spComputeAabbSource);

	m_aabbConstBuffer = new adl::Buffer<MyAabbConstData >(m_deviceCL,1,adl::BufferBase::BUFFER_CONST);

	// One byte count for allocation, clear and upload so the three can never
	// disagree; the product is formed in size_t to avoid int overflow.
	const size_t pairBytes = (size_t)m_maxHandles * (size_t)m_maxPairsPerBody * sizeof(MyUint2);

	cl_int ciErrNum=0;
	m_dAllOverlappingPairs = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, pairBytes, NULL, &ciErrNum);
	// validate the buffer before it is used as an upload target
	oclCHECKERROR(ciErrNum, CL_SUCCESS);

	memset(m_hAllOverlappingPairs, 0x00, pairBytes);
	copyArrayToDevice(m_dAllOverlappingPairs, m_hAllOverlappingPairs, pairBytes);
}
// Frees the two resources allocated by the constructor of this class: the
// constant buffer and the device side pair buffer.
btGridBroadphaseCl::~btGridBroadphaseCl()
{
	delete m_aabbConstBuffer;
	clReleaseMemObject(m_dAllOverlappingPairs);
}
// Does nothing. The previous body started with an unconditional return, so
// the handle index bookkeeping that followed it (assignments to
// m_LastHandleIndex and m_LastLargeHandleIndex) never executed; that
// unreachable code has been removed without changing behaviour.
// NOTE(review): presumably the AABBs are produced on the device elsewhere -
// confirm before reinstating any host side preparation here.
void btGridBroadphaseCl::prepareAABB(float* positions, int numObjects)
{
	(void)positions;
	(void)numObjects;
}
// Delegates unchanged to the base class implementation.
void btGridBroadphaseCl::calcHashAABB()
{
bt3dGridBroadphaseOCL::calcHashAABB();
}
void btGridBroadphaseCl::calculateOverlappingPairs(float* positions, int numObjects)
{
btDispatcher* dispatcher=0;
// update constants
{
BT_PROFILE("setParameters");
setParameters(&m_params);
}
// prepare AABB array
{
BT_PROFILE("prepareAABB");
prepareAABB(positions, numObjects);
}
// calculate hash
{
BT_PROFILE("calcHashAABB");
calcHashAABB();
}
{
BT_PROFILE("sortHash");
// sort bodies based on hash
sortHash();
}
// find start of each cell
{
BT_PROFILE("findCellStart");
findCellStart();
}
{
BT_PROFILE("findOverlappingPairs");
// findOverlappingPairs (small/small)
findOverlappingPairs();
}
// add pairs to CPU cache
{
BT_PROFILE("computePairCacheChanges");
#if 0
computePairCacheChanges();
#else
int ciErrNum=0;
ciErrNum=clSetKernelArg((cl_kernel)m_countOverlappingPairs->m_kernel, 0, sizeof(int), (void*)&numObjects);
ciErrNum=clSetKernelArg((cl_kernel)m_countOverlappingPairs->m_kernel, 1, sizeof(cl_mem),(void*)&m_dPairBuff);
ciErrNum=clSetKernelArg((cl_kernel)m_countOverlappingPairs->m_kernel, 2, sizeof(cl_mem),(void*)&m_dPairBuffStartCurr);
ciErrNum=clSetKernelArg((cl_kernel)m_countOverlappingPairs->m_kernel, 3, sizeof(cl_mem),(void*)&m_dPairScanChanged);
ciErrNum=clSetKernelArg((cl_kernel)m_countOverlappingPairs->m_kernel, 4, sizeof(cl_mem),(void*)&m_dAABB);
size_t localWorkSize=64;
size_t numWorkItems = localWorkSize*((numObjects+ (localWorkSize)) / localWorkSize);
ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue, (cl_kernel)m_countOverlappingPairs->m_kernel, 1, NULL, &numWorkItems, &localWorkSize, 0,0,0 );
oclCHECKERROR(ciErrNum, CL_SUCCESS);
ciErrNum = clFlush(m_cqCommandQue);
#endif
}
{
BT_PROFILE("scanOverlappingPairBuff");
scanOverlappingPairBuff(false);
}
{
BT_PROFILE("squeezeOverlappingPairBuff");
//#define FORCE_CPU
#ifdef FORCE_CPU
bt3dGridBroadphaseOCL::squeezeOverlappingPairBuff();
copyArrayToDevice(m_dPairsChangedXY, m_hPairsChangedXY, sizeof( MyUint2) * m_numPrefixSum); //gSum
#else
//squeezeOverlappingPairBuff();
int ciErrNum = 0;
ciErrNum=clSetKernelArg((cl_kernel)m_squeezePairCaches->m_kernel, 0, sizeof(int), (void*)&numObjects);
ciErrNum=clSetKernelArg((cl_kernel)m_squeezePairCaches->m_kernel, 1, sizeof(cl_mem),(void*)&m_dPairBuff);
ciErrNum=clSetKernelArg((cl_kernel)m_squeezePairCaches->m_kernel, 2, sizeof(cl_mem),(void*)&m_dPairBuffStartCurr);
ciErrNum=clSetKernelArg((cl_kernel)m_squeezePairCaches->m_kernel, 3, sizeof(cl_mem),(void*)&m_dPairScanChanged);
ciErrNum=clSetKernelArg((cl_kernel)m_squeezePairCaches->m_kernel, 4, sizeof(cl_mem),(void*)&m_dAllOverlappingPairs);
ciErrNum=clSetKernelArg((cl_kernel)m_squeezePairCaches->m_kernel, 5, sizeof(cl_mem),(void*)&m_dAABB);
size_t workGroupSize = 64;
size_t numWorkItems = workGroupSize*((numObjects+ (workGroupSize)) / workGroupSize);
ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue, (cl_kernel)m_squeezePairCaches->m_kernel, 1, NULL, &numWorkItems, &workGroupSize, 0,0,0 );
oclCHECKERROR(ciErrNum, CL_SUCCESS);
// copyArrayFromDevice(m_hAllOverlappingPairs, m_dAllOverlappingPairs, sizeof(unsigned int) * m_numPrefixSum*2); //gSum
// clFinish(m_cqCommandQue);
#endif
}
return;
}

View File

@@ -0,0 +1,73 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Roman Ponomarev, Erwin Coumans
#ifndef GRID_BROADPHASE_CL_H
#define GRID_BROADPHASE_CL_H
#include "../3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.h"
#include "Adl/Adl.h"
#include "Adl/AdlKernel.h"
// Host side element type of m_aabbConstBuffer. It has the same two int fields
// as MyAabbConstDataCL in computeAabbKernelOCL.cl, where numElem is the upper
// bound for the work item id in the computeAabb kernel and bla is unused.
struct MyAabbConstData
{
int bla;
int numElem;
};
// 3D grid broadphase that extends bt3dGridBroadphaseOCL with additional
// kernels obtained through an adl::DeviceCL and a device buffer holding all
// overlapping pairs.
class btGridBroadphaseCl : public bt3dGridBroadphaseOCL
{
protected:
// kernel handles looked up in the constructor
adl::Kernel* m_computeAabbKernel;
adl::Kernel* m_countOverlappingPairs;
adl::Kernel* m_squeezePairCaches;
// one element constant buffer allocated in the constructor, deleted in the destructor
adl::Buffer<MyAabbConstData>* m_aabbConstBuffer;
public:
// device buffer created in the constructor and released in the destructor;
// it is the pair output argument of the squeezePairCaches kernel
cl_mem m_dAllOverlappingPairs;
// all arguments are forwarded to the bt3dGridBroadphaseOCL constructor
btGridBroadphaseCl( btOverlappingPairCache* overlappingPairCache,
const btVector3& cellSize,
int gridSizeX, int gridSizeY, int gridSizeZ,
int maxSmallProxies, int maxLargeProxies, int maxPairsPerSmallProxy,
btScalar maxSmallProxySize,
int maxSmallProxiesPerCell = 4,
cl_context context = NULL,
cl_device_id device = NULL,
cl_command_queue queue = NULL,
adl::DeviceCL* deviceCL=0
);
virtual void prepareAABB(float* positions, int numObjects);
virtual void calcHashAABB();
// runs one complete broadphase pass for numObjects objects
void calculateOverlappingPairs(float* positions, int numObjects);
virtual ~btGridBroadphaseCl();
};
#endif //GRID_BROADPHASE_CL_H

View File

@@ -0,0 +1,112 @@
MSTRINGIFY(
// Constant data passed by value to computeAabb.
typedef struct
{
int bla; // not read by any kernel in this file
int numElem; // number of elements; work items with an id at or above it do nothing
} MyAabbConstDataCL ;
// Axis aligned bounding box: minimum corner and maximum corner, each followed
// by an unsigned index. computeAabb stores the element index in both slots.
typedef struct
{
float minfx;
float minfy;
float minfz;
unsigned int index0;
float maxfx;
float maxfy;
float maxfz;
unsigned int index1;
} btAabbCL;
// Builds a box of half size 1 around every position.
// One work item per element; items with an id at or above cb.numElem do
// nothing. Both index fields of the box receive the element id.
__kernel void computeAabb( __global btAabbCL* aabbs,__global float4* positions, MyAabbConstDataCL cb)
{
	int nodeID = get_global_id(0);
	if( nodeID >= cb.numElem )
		return;

	float4 center = positions[nodeID];
	__global btAabbCL* box = aabbs + nodeID;

	box->minfx = center.x -1.f;
	box->minfy = center.y -1.f;
	box->minfz = center.z -1.f;
	box->index0 = nodeID;

	box->maxfx = center.x +1.f;
	box->maxfy = center.y +1.f;
	box->maxfz = center.z +1.f;
	box->index1 = nodeID;
}
// Counts for every object how many entries of its pair list carry one of the
// flag bits 0x60000000 (new or existing pairs; old non-overlapping pairs are
// ignored) and stores the count at pPairScan[index+1].
// The handle of object index is taken from the bit pattern of the w lane of
// pAABB[2*index]; pPairBuffStartCurr[handle] gives (start, count) of its list
// inside pPairBuff.
__kernel void countOverlappingpairs( int numObjects,
__global int* pPairBuff,
__global int2* pPairBuffStartCurr,
__global int* pPairScan,
__global float4* pAABB )
{
	int index = get_global_id(0);
	if(index < numObjects)
	{
		int handleIndex = as_int(pAABB[index * 2].w);
		int2 range = pPairBuffStartCurr[handleIndex];
		int first = range.x;
		int count = range.y;

		int flagged = 0;
		for(int k = 0; k < count; k++)
		{
			if(pPairBuff[first + k] & 0x60000000)
			{
				flagged++;
			}
		}
		pPairScan[index+1] = flagged;
	}
}
// Compacts the pair list of every object and emits the surviving pairs.
// For each entry of the object's list that carries one of the flag bits
// 0x60000000:
//  - the pair (handle, entry without flag bits) is appended to pPairOut,
//    starting at slot pPairScan[index+1];
//  - the entry without flag bits is moved to the front of the object's own
//    list inside pPairBuff.
// Finally pPairBuffStartCurr[handle] is updated to (start, number kept).
// The handle of object index is taken from the bit pattern of the w lane of
// pAABB[2*index].
__kernel void squeezePairCaches( int numObjects,
__global int* pPairBuff,
__global int2* pPairBuffStartCurr,
__global int* pPairScan,
__global int2* pPairOut,
__global float4* pAABB )
{
	int index = get_global_id(0);
	if(index >= numObjects)
	{
		return;
	}

	int handleIndex = as_int(pAABB[index * 2].w);
	int2 range = pPairBuffStartCurr[handleIndex];
	int start = range.x;
	int count = range.y;
	int outBase = pPairScan[index+1];

	int kept = 0;
	for(int k = 0; k < count; k++)
	{
		int entry = pPairBuff[start + k];
		if(entry & 0x60000000)
		{
			int other = entry & (~0x60000000);
			pPairOut[outBase + kept] = (int2)(handleIndex, other);
			// kept never exceeds k, so this in-place write cannot clobber an unread entry
			pPairBuff[start + kept] = other;
			kept++;
		}
	}

	pPairBuffStartCurr[handleIndex] = (int2)(start, kept);
}
);

View File

@@ -0,0 +1,204 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Roman Ponomarev, Erwin Coumans
#include "findPairsOpenCL.h"
#include "../basic_initialize/btOpenCLUtils.h"
// MSTRINGIFY turns its argument into a string literal. broadphaseKernel.cl
// wraps its whole content in MSTRINGIFY( ... ); so including it here embeds
// the kernel source text as the initializer of broadphaseKernelString.
// The pointer is const because it refers to a string literal.
#define MSTRINGIFY(A) #A
static const char* broadphaseKernelString =
#include "broadphaseKernel.cl"
// Source file path handed to btOpenCLUtils::compileCLProgramFromString as its last argument.
#define GRID_BROADPHASE_PATH "..\\..\\opencl\\broadphase_benchmark\\broadphaseKernel.cl"
// Stores context, device and queue in fpio, compiles the embedded
// broadphaseKernel.cl program once and creates the six kernels used by the
// functions in this file. Every compile step is checked with oclCHECKERROR.
// maxHandles and maxPairsPerBody are accepted for interface compatibility but
// are not used.
void initFindPairs(btFindPairsIO& fpio,cl_context cxMainContext, cl_device_id device, cl_command_queue commandQueue, int maxHandles, int maxPairsPerBody)
{
	(void)maxHandles;
	(void)maxPairsPerBody;

	fpio.m_mainContext = cxMainContext;
	fpio.m_cqCommandQue = commandQueue;
	fpio.m_device = device;

	cl_int pErrNum = CL_SUCCESS;
	cl_program prog = btOpenCLUtils::compileCLProgramFromString(cxMainContext, device, broadphaseKernelString, &pErrNum ,"",GRID_BROADPHASE_PATH);
	oclCHECKERROR(pErrNum, CL_SUCCESS);

	fpio.m_broadphaseBruteForceKernel = btOpenCLUtils::compileCLKernelFromString(cxMainContext,device, broadphaseKernelString, "broadphaseKernel" ,&pErrNum,prog);
	oclCHECKERROR(pErrNum, CL_SUCCESS);
	fpio.m_initializeGpuAabbsKernelSimple = btOpenCLUtils::compileCLKernelFromString(cxMainContext,device, broadphaseKernelString, "initializeGpuAabbsSimple" ,&pErrNum,prog);
	oclCHECKERROR(pErrNum, CL_SUCCESS);
	fpio.m_initializeGpuAabbsKernelFull = btOpenCLUtils::compileCLKernelFromString(cxMainContext,device, broadphaseKernelString, "initializeGpuAabbsFull" ,&pErrNum,prog);
	oclCHECKERROR(pErrNum, CL_SUCCESS);
	fpio.m_broadphaseColorKernel = btOpenCLUtils::compileCLKernelFromString(cxMainContext,device, broadphaseKernelString, "broadphaseColorKernel" ,&pErrNum,prog);
	oclCHECKERROR(pErrNum, CL_SUCCESS);
	fpio.m_setupBodiesKernel = btOpenCLUtils::compileCLKernelFromString(cxMainContext,device, broadphaseKernelString, "setupBodiesKernel" ,&pErrNum,prog);
	oclCHECKERROR(pErrNum, CL_SUCCESS);
	fpio.m_copyVelocitiesKernel = btOpenCLUtils::compileCLKernelFromString(cxMainContext,device, broadphaseKernelString, "copyVelocitiesKernel" ,&pErrNum,prog);
	oclCHECKERROR(pErrNum, CL_SUCCESS);
}
// Launches the brute force broadphaseKernel over all objects in fpio.
// The global work size is rounded up to a multiple of the work group size,
// as the other launch helpers in this file do: OpenCL 1.x rejects a global
// size that is not evenly divisible by the local size, and the kernel itself
// ignores work items at or beyond numObjects.
void findPairsOpenCLBruteForce(btFindPairsIO& fpio)
{
	int ciErrNum = 0;
	int numObjects = fpio.m_numObjects;
	int offset = fpio.m_positionOffset;

	ciErrNum = clSetKernelArg(fpio.m_broadphaseBruteForceKernel, 0, sizeof(int), &offset);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);
	ciErrNum = clSetKernelArg(fpio.m_broadphaseBruteForceKernel, 1, sizeof(int), &numObjects);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);
	ciErrNum = clSetKernelArg(fpio.m_broadphaseBruteForceKernel, 2, sizeof(cl_mem), (void*)&fpio.m_clObjectsBuffer);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);

	size_t workGroupSize = 64;
	size_t numWorkItems = workGroupSize*((numObjects+ (workGroupSize)) / workGroupSize);
	ciErrNum = clEnqueueNDRangeKernel(fpio.m_cqCommandQue, fpio.m_broadphaseBruteForceKernel, 1, NULL, &numWorkItems, &workGroupSize,0 ,0 ,0);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);
}
// Launches initializeGpuAabbsFull: computes the world AABB of every object
// from the body array and the local shape AABBs in fpio.m_dlocalShapeAABB and
// writes the result to fpio.m_dAABB. Every OpenCL call result is checked.
void setupGpuAabbsFull(btFindPairsIO& fpio, cl_mem bodies)
{
	int ciErrNum = 0;
	int numObjects = fpio.m_numObjects;
	int offset = fpio.m_positionOffset;
	cl_kernel kernel = fpio.m_initializeGpuAabbsKernelFull;

	ciErrNum = clSetKernelArg(kernel, 0, sizeof(int), &offset);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);
	ciErrNum = clSetKernelArg(kernel, 1, sizeof(int), &numObjects);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);
	ciErrNum = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&fpio.m_clObjectsBuffer);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);
	ciErrNum = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&bodies);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);
	ciErrNum = clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&fpio.m_dlocalShapeAABB);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);
	ciErrNum = clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&fpio.m_dAABB);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);

	// global size: a non-zero multiple of the work group size; the kernel ignores surplus items
	size_t workGroupSize = 64;
	size_t numWorkItems = workGroupSize*((numObjects+ (workGroupSize)) / workGroupSize);
	ciErrNum = clEnqueueNDRangeKernel(fpio.m_cqCommandQue, kernel, 1, NULL, &numWorkItems, &workGroupSize,0 ,0 ,0);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);
}
// Launches initializeGpuAabbsSimple: computes the world AABB of every object
// with the fixed half extents hardcoded in the kernel and writes the result
// to fpio.m_dAABB. Every OpenCL call result is checked.
void setupGpuAabbsSimple(btFindPairsIO& fpio)
{
	int ciErrNum = 0;
	int numObjects = fpio.m_numObjects;
	int offset = fpio.m_positionOffset;
	cl_kernel kernel = fpio.m_initializeGpuAabbsKernelSimple;

	ciErrNum = clSetKernelArg(kernel, 0, sizeof(int), &offset);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);
	ciErrNum = clSetKernelArg(kernel, 1, sizeof(int), &numObjects);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);
	ciErrNum = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&fpio.m_clObjectsBuffer);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);
	ciErrNum = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&fpio.m_dAABB);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);

	// global size: a non-zero multiple of the work group size; the kernel ignores surplus items
	size_t workGroupSize = 64;
	size_t numWorkItems = workGroupSize*((numObjects+ (workGroupSize)) / workGroupSize);
	ciErrNum = clEnqueueNDRangeKernel(fpio.m_cqCommandQue, kernel, 1, NULL, &numWorkItems, &workGroupSize,0 ,0 ,0);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);
}
// Launches setupBodiesKernel, which applies gravity, copies transforms and
// velocities into the body array and refreshes the world inverse inertia.
// The kernel arguments are always set; the launch is skipped when there are
// no objects. Every OpenCL call result is checked.
void setupBodies(btFindPairsIO& fpio, cl_mem linVelMem, cl_mem angVelMem, cl_mem bodies, cl_mem bodyInertias)
{
	int ciErrNum = 0;
	int numObjects = fpio.m_numObjects;
	int offset = fpio.m_positionOffset;
	cl_kernel kernel = fpio.m_setupBodiesKernel;

	ciErrNum = clSetKernelArg(kernel, 0, sizeof(int), &offset);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);
	ciErrNum = clSetKernelArg(kernel, 1, sizeof(int), &fpio.m_numObjects);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);
	ciErrNum = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&fpio.m_clObjectsBuffer);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);
	ciErrNum = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&linVelMem);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);
	ciErrNum = clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&angVelMem);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);
	ciErrNum = clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&bodies);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);
	ciErrNum = clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*)&bodyInertias);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);

	if (numObjects)
	{
		// global size: a multiple of the work group size; the kernel ignores surplus items
		size_t workGroupSize = 64;
		size_t numWorkItems = workGroupSize*((numObjects+ (workGroupSize)) / workGroupSize);
		ciErrNum = clEnqueueNDRangeKernel(fpio.m_cqCommandQue, kernel, 1, NULL, &numWorkItems, &workGroupSize,0 ,0 ,0);
		oclCHECKERROR(ciErrNum, CL_SUCCESS);
	}
}
// Launches copyVelocitiesKernel, which copies the velocities stored in the
// body array back into linVelMem and angVelMem.
// The kernel arguments are always set; the launch is skipped when there are
// no objects. Every OpenCL call result is checked.
void copyBodyVelocities(btFindPairsIO& fpio, cl_mem linVelMem, cl_mem angVelMem, cl_mem bodies, cl_mem bodyInertias)
{
	int ciErrNum = 0;
	int numObjects = fpio.m_numObjects;
	int offset = fpio.m_positionOffset;
	cl_kernel kernel = fpio.m_copyVelocitiesKernel;

	ciErrNum = clSetKernelArg(kernel, 0, sizeof(int), &offset);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);
	ciErrNum = clSetKernelArg(kernel, 1, sizeof(int), &fpio.m_numObjects);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);
	ciErrNum = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&fpio.m_clObjectsBuffer);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);
	ciErrNum = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&linVelMem);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);
	ciErrNum = clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&angVelMem);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);
	ciErrNum = clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&bodies);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);
	ciErrNum = clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*)&bodyInertias);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);

	if (numObjects)
	{
		// global size: a multiple of the work group size; the kernel ignores surplus items
		size_t workGroupSize = 64;
		size_t numWorkItems = workGroupSize*((numObjects+ (workGroupSize)) / workGroupSize);
		ciErrNum = clEnqueueNDRangeKernel(fpio.m_cqCommandQue, kernel, 1, NULL, &numWorkItems, &workGroupSize,0 ,0 ,0);
		oclCHECKERROR(ciErrNum, CL_SUCCESS);
	}
}
// Launches broadphaseColorKernel, which paints both objects of every pair in
// fpio.m_dAllOverlappingPairs red. One work item per pair; the launch is
// skipped when fpio.m_numOverlap is zero. Every OpenCL call result is checked.
void colorPairsOpenCL(btFindPairsIO& fpio)
{
	int ciErrNum = 0;
	int offset = fpio.m_positionOffset;
	cl_kernel kernel = fpio.m_broadphaseColorKernel;

	ciErrNum = clSetKernelArg(kernel, 0, sizeof(int), &offset);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);
	ciErrNum = clSetKernelArg(kernel, 1, sizeof(int), &fpio.m_numObjects);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);
	ciErrNum = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&fpio.m_clObjectsBuffer);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);
	ciErrNum = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&fpio.m_dAllOverlappingPairs);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);
	ciErrNum = clSetKernelArg(kernel, 4, sizeof(int), &fpio.m_numOverlap);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);

	if (fpio.m_numOverlap)
	{
		// global size: a multiple of the work group size; the kernel ignores surplus items
		size_t workGroupSize = 64;
		size_t numWorkItems = workGroupSize*((fpio.m_numOverlap+ (workGroupSize)) / workGroupSize);
		ciErrNum = clEnqueueNDRangeKernel(fpio.m_cqCommandQue, kernel, 1, NULL, &numWorkItems, &workGroupSize,0 ,0 ,0);
		oclCHECKERROR(ciErrNum, CL_SUCCESS);
	}
}
// Releases the six kernels created by initFindPairs, in the same order as
// before. Other handles stored in fpio are left untouched.
void releaseFindPairs(btFindPairsIO& fpio)
{
	cl_kernel kernels[] =
	{
		fpio.m_initializeGpuAabbsKernelSimple,
		fpio.m_initializeGpuAabbsKernelFull,
		fpio.m_broadphaseColorKernel,
		fpio.m_broadphaseBruteForceKernel,
		fpio.m_setupBodiesKernel,
		fpio.m_copyVelocitiesKernel
	};

	for (size_t i = 0; i < sizeof(kernels)/sizeof(kernels[0]); i++)
	{
		clReleaseKernel(kernels[i]);
	}
}

View File

@@ -0,0 +1,90 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Roman Ponomarev, Erwin Coumans
#ifndef FIND_PAIRS_H
#define FIND_PAIRS_H
#include "../basic_initialize/btOpenCLInclude.h"
// Descriptor bundling an OpenCL kernel handle with an id, a name and a work
// group size. It is not referenced by the declarations below.
struct btKernelInfo
{
int m_Id;
cl_kernel m_kernel;
char* m_name;
int m_workgroupSize;
};
// Shared state for the find-pairs helper functions declared below: OpenCL
// handles, kernels and device buffers. initFindPairs fills in the context,
// device, queue and the first six kernels; the remaining members have to be
// set by the caller.
struct btFindPairsIO
{
int m_numObjects; // number of objects; passed to the kernels and used to size the launches
cl_mem m_clObjectsBuffer; //for memory layout details see main.cpp (todo, make it flexible)
int m_positionOffset;//offset in m_clObjectsBuffer where position array starts
cl_command_queue m_cqCommandQue; // queue on which all kernels are enqueued
// kernels created by initFindPairs and released by releaseFindPairs
cl_kernel m_initializeGpuAabbsKernelSimple;
cl_kernel m_initializeGpuAabbsKernelFull;
cl_kernel m_broadphaseColorKernel;
cl_kernel m_broadphaseBruteForceKernel;
cl_kernel m_setupBodiesKernel;
cl_kernel m_copyVelocitiesKernel;
cl_context m_mainContext;
cl_device_id m_device;
// kernel handles that the functions declared in this header neither create nor release
cl_kernel m_calcHashAabbKernel;
cl_kernel m_clearCellStartKernel;
cl_kernel m_findCellStartKernel;
cl_kernel m_findOverlappingPairsKernel;
cl_kernel m_computePairChangeKernel;
cl_kernel m_squeezePairBuffKernel;
cl_mem m_dAllOverlappingPairs; // pair list read by colorPairsOpenCL
int m_numOverlap; // number of pairs in m_dAllOverlappingPairs
cl_mem m_dBpParams;
cl_mem m_dBodiesHash;
cl_mem m_dCellStart;
cl_mem m_dPairBuff;
cl_mem m_dPairBuffStartCurr;
cl_mem m_dlocalShapeAABB; // per-shape local AABBs read by setupGpuAabbsFull
cl_mem m_dAABB; // world AABBs written by setupGpuAabbsSimple and setupGpuAabbsFull
cl_mem m_dPairScan;
cl_mem m_dPairOut;
};
// Stores context, device and queue in fpio and compiles the kernels used by the functions below.
void initFindPairs(btFindPairsIO& fpio,cl_context cxMainContext, cl_device_id device, cl_command_queue commandQueue, int maxHandles,int maxPairsPerBody = 16);
// Runs the brute force proximity colouring kernel over all objects.
void findPairsOpenCLBruteForce(btFindPairsIO& fpio);
// Writes world AABBs to fpio.m_dAABB using the fixed half extents hardcoded in the kernel.
void setupGpuAabbsSimple(btFindPairsIO& fpio);
// Writes world AABBs to fpio.m_dAABB using the body array and fpio.m_dlocalShapeAABB.
void setupGpuAabbsFull(btFindPairsIO& fpio, cl_mem bodies);
// Colours both objects of every pair in fpio.m_dAllOverlappingPairs.
void colorPairsOpenCL(btFindPairsIO& fpio);
// Runs setupBodiesKernel with the given velocity, body and inertia buffers.
void setupBodies(btFindPairsIO& fpio, cl_mem linVelMem, cl_mem angVelMem, cl_mem bodies, cl_mem bodyInertias);
// Runs copyVelocitiesKernel with the given velocity, body and inertia buffers.
void copyBodyVelocities(btFindPairsIO& fpio, cl_mem linVelMem, cl_mem angVelMem, cl_mem bodies, cl_mem bodyInertias);
// Releases the kernels created by initFindPairs.
void releaseFindPairs(btFindPairsIO& fpio);
#endif //FIND_PAIRS_H

View File

@@ -0,0 +1,116 @@
MSTRINGIFY(
// Quaternion product q1 * q2 with components stored as (x, y, z, w).
float4 quatMult(float4 q1, float4 q2)
{
	return (float4)(
		q1.w * q2.x + q1.x * q2.w + q1.y * q2.z - q1.z * q2.y,
		q1.w * q2.y + q1.y * q2.w + q1.z * q2.x - q1.x * q2.z,
		q1.w * q2.z + q1.z * q2.w + q1.x * q2.y - q1.y * q2.x,
		q1.w * q2.w - q1.x * q2.x - q1.y * q2.y - q1.z * q2.z);
}
// Returns q scaled to unit length. When the length is not greater than zero
// the identity quaternion (0, 0, 0, 1) is returned instead.
float4 quatNorm(float4 q)
{
	float len = native_sqrt(dot(q, q));
	if(!(len > 0.f))
	{
		return (float4)(0.f, 0.f, 0.f, 1.f);
	}
	return q * (1.f / len);
}
// Advances orientation and position of every body by one fixed time step.
// One work item per body (items with an id at or above numNodes do nothing).
//  - pAngVel xyz is damped in place by the factor 0.99
//  - the orientation at g_vertexBuffer[startOffset/4 + numNodes + i] is
//    multiplied from the left by the incremental rotation and renormalised
//  - the position at g_vertexBuffer[startOffset/4 + i] is advanced by
//    linVel * timeStep (all four lanes)
// pBodyTimes is accepted but not used.
__kernel void
integrateTransformsKernel( const int startOffset, const int numNodes, __global float4 *g_vertexBuffer,
__global float4 *linVel,
__global float4 *pAngVel,
__global float* pBodyTimes)
{
	int nodeID = get_global_id(0);
	if( nodeID >= numNodes )
		return;

	const float angularMotionThreshold = (0.25f * 3.14159254f);
	const float timeStep = 0.0166666f;

	//add some hardcoded angular damping
	float4 angvel = pAngVel[nodeID];
	angvel.x *= 0.99f;
	angvel.y *= 0.99f;
	angvel.z *= 0.99f;
	pAngVel[nodeID] = angvel;

	float fAngle = native_sqrt(dot(angvel, angvel));
	//limit the angular motion per step
	if(fAngle*timeStep > angularMotionThreshold)
	{
		fAngle = angularMotionThreshold / timeStep;
	}

	// scale that turns the angular velocity into the vector part of the step quaternion
	float vecScale;
	if(fAngle < 0.001f)
	{
		// Taylor expansion of the sinc-like term, used for very small angles
		vecScale = (0.5f*timeStep-(timeStep*timeStep*timeStep)*0.020833333333f * fAngle * fAngle);
	}
	else
	{
		// sin(0.5 * fAngle * timeStep) / fAngle
		vecScale = ( native_sin(0.5f * fAngle * timeStep) / fAngle);
	}

	float4 dorn = angvel * vecScale;
	dorn.w = native_cos(fAngle * timeStep * 0.5f);

	int ornIdx = nodeID + startOffset/4+numNodes;
	float4 predictedOrn = quatNorm(quatMult(dorn, g_vertexBuffer[ornIdx]));
	g_vertexBuffer[ornIdx] = predictedOrn;

	//linear velocity
	g_vertexBuffer[nodeID + startOffset/4] += linVel[nodeID] * timeStep;
}
/*
 Moves one body per work-item along a fixed curve built from sines and cosines.
 pBodyTimes[nodeID] is advanced by a constant increment and then used as the curve
 parameter. Only x y z of the position at g_vertexBuffer[nodeID + startOffset/4]
 are overwritten. linVel and pAngVel are not used by this kernel.
*/
__kernel void
sineWaveKernel( const int startOffset, const int numNodes, __global float4 *g_vertexBuffer,
						  __global float4 *linVel,
						  __global float4 *pAngVel,
						  __global float* pBodyTimes)
{
	const int nodeID = get_global_id(0);
	if( nodeID >= numNodes )
	{
		return;
	}

	float timeStepPos = 0.000166666;
	float mAmplitude = 166.f;
	const int posIndex = nodeID + startOffset/4;

	/* advance and store the per-body time */
	const float bodyTime = pBodyTimes[nodeID] + timeStepPos;
	pBodyTimes[nodeID] = bodyTime;

	float4 position = g_vertexBuffer[posIndex];
	position.x = native_cos(bodyTime*2.17f)*mAmplitude + native_sin(bodyTime)*mAmplitude*0.5f;
	/* NOTE(review): here the amplitude is inside the sine argument, unlike x and z. Kept as is - confirm it is intended. */
	position.y = native_cos(bodyTime*1.38f)*mAmplitude + native_sin(bodyTime*mAmplitude);
	position.z = native_cos(bodyTime*2.17f)*mAmplitude + native_sin(bodyTime*0.777f)*mAmplitude;
	g_vertexBuffer[posIndex] = position;
}
);

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,5 @@
-- Include the build scripts of all three vendor variants (AMD, Intel, NVIDIA).
include "AMD"
include "Intel"
include "NVIDIA"

View File

@@ -0,0 +1,23 @@
-- Premake project for the global atomics test, AMD OpenCL variant.
-- The project is only defined when findOpenCL_AMD() returns a truthy value.
hasCL = findOpenCL_AMD()
if (hasCL) then
project "OpenCL_global_atomics_AMD"
initOpenCL_AMD()
language "C++"
-- console executable, written to the shared bin directory
kind "ConsoleApp"
targetdir "../../../bin"
-- includedirs {"..","../../../../include/gpu_research"}
-- the test itself plus the shared OpenCL helper
files {
"../main.cpp",
"../../basic_initialize/btOpenCLUtils.cpp",
"../../basic_initialize/btOpenCLUtils.h"
}
end

View File

@@ -0,0 +1,36 @@
/* OpenCL source of the global atomics test kernels, embedded as a C string.
   This text mirrors global_atomics.cl line by line; it looks generated by
   "stringify.py global_atomics.cl globalAtomicsKernelString", so prefer
   regenerating it over editing it by hand. */
static const char* globalAtomicsKernelString= \
"\n"
"\n"
"\n"
"\n"
"//OpenCL 1.1 has atomic_inc build-in (no extension needed)\n"
"//see http://www.khronos.org/registry/cl/sdk/1.1/docs/man/xhtml/atomic_inc.html\n"
"__kernel void globalAtomicKernelOpenCL1_1( volatile __global int* counter)\n"
"{\n"
" atomic_inc(counter);\n"
"}\n"
"\n"
"//OpenCL 1.1 atomic device counters extension, usually faster on current AMD hardware\n"
"//http://www.khronos.org/registry/cl/extensions/ext/cl_ext_atomic_counters_32.txt\n"
"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
"__kernel void counterAtomicKernelExt( counter32_t counter)\n"
"{\n"
" atomic_inc(counter);\n"
"}\n"
"\n"
"\n"
"//OpenCL 1.0 optional extension, using atom_inc\n"
"//see http://www.khronos.org/registry/cl/sdk/1.0/docs/man/xhtml/cl_khr_global_int32_base_atomics.html\n"
"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable //atomic_inc\n"
"__kernel void globalAtomicKernelExt( __global int* counter)\n"
"{\n"
" atom_inc(counter);\n"
"}\n"
"\n"
"\n"
"__kernel void globalAtomicKernelCounters32Broken( __global int* counter)\n"
"{\n"
" (*counter)++;\n"
"}\n"
"\n"
;

View File

@@ -0,0 +1,34 @@
//OpenCL 1.1 has atomic_inc built-in (no extension needed)
//see http://www.khronos.org/registry/cl/sdk/1.1/docs/man/xhtml/atomic_inc.html
//Every work-item atomically increments the single int that counter points to.
__kernel void globalAtomicKernelOpenCL1_1( volatile __global int* counter)
{
atomic_inc(counter);
}
//OpenCL 1.1 atomic device counters extension, usually faster on current AMD hardware
//http://www.khronos.org/registry/cl/extensions/ext/cl_ext_atomic_counters_32.txt
//Every work-item increments the counter32_t device counter once.
//The host program replaces counter32_t by a plain global int pointer when the
//device does not report cl_ext_atomic_counters_32.
#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable
__kernel void counterAtomicKernelExt( counter32_t counter)
{
atomic_inc(counter);
}
//OpenCL 1.0 optional extension, using atom_inc
//see http://www.khronos.org/registry/cl/sdk/1.0/docs/man/xhtml/cl_khr_global_int32_base_atomics.html
//Every work-item atomically increments the single int that counter points to.
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable //atom_inc
__kernel void globalAtomicKernelExt( __global int* counter)
{
atom_inc(counter);
}
//Plain, non-atomic increment of the shared counter.
//NOTE(review): judging by the name this is the deliberately racy variant, which the
//host test is expected to report as broken - confirm before "fixing" it.
__kernel void globalAtomicKernelCounters32Broken( __global int* counter)
{
(*counter)++;
}

View File

@@ -0,0 +1,201 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2011 Advanced Micro Devices, Inc. http://bulletphysics.org
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
///original author: Erwin Coumans
#include "../basic_initialize/btOpenCLUtils.h"
#include <stdio.h>
#include <stdlib.h>
cl_context g_cxMainContext;
cl_command_queue g_cqCommandQue;
cl_kernel g_atomicsKernel;
static const size_t workGroupSize = 128;//todo figure out an appropriate workgroup size suitable for the OpenCL platform/context/device/kernel
#define NUM_OBJECTS 1024
#include "globalAtomicsKernel.h"
char * findAndReplace( char const * const original, char const * const pattern, char const * const replacement);
#include <string.h>
#include <malloc.h>
/// Runs every kernel in globalAtomicsKernelString once on the first GPU device
/// and checks that NUM_OBJECTS work-items incremented a single shared counter
/// exactly NUM_OBJECTS times.
///
/// When the device does not advertise cl_ext_atomic_counters_32, the
/// counter32_t parameter type in the kernel source is replaced by a plain
/// global int pointer so that the source still compiles.
///
/// Returns 0 after the run, or 1 when the patched kernel source could not be
/// allocated. Waits for a key press before starting and before exiting.
int main(int argc, char* argv[])
{
	int ciErrNum = 0;

	printf("press a key to start\n");
	getchar();

	const char* vendorSDK = btOpenCLUtils::getSdkVendorName();
	printf("This program was compiled using the %s OpenCL SDK\n",vendorSDK);

	cl_device_type deviceType = CL_DEVICE_TYPE_GPU;//CL_DEVICE_TYPE_ALL
	void* glCtx=0;
	void* glDC = 0;
	printf("Initialize OpenCL using btOpenCLUtils::createContextFromType for CL_DEVICE_TYPE_GPU\n");
	g_cxMainContext = btOpenCLUtils::createContextFromType(deviceType, &ciErrNum, glCtx, glDC);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);

	int numDev = btOpenCLUtils::getNumDevices(g_cxMainContext);
	if (numDev>0)
	{
		int deviceIndex=0;
		cl_device_id device;
		device = btOpenCLUtils::getDevice(g_cxMainContext,deviceIndex);
		btOpenCLDeviceInfo clInfo;
		btOpenCLUtils::getDeviceInfo(device,clInfo);
		btOpenCLUtils::printDeviceInfo(device);

		// patchedSource owns the heap copy returned by findAndReplace (if any);
		// it is freed once all kernels have been compiled and run.
		char* patchedSource = 0;
		const char* globalAtomicsKernelStringPatched = globalAtomicsKernelString;
		if (!strstr(clInfo.m_deviceExtensions,"cl_ext_atomic_counters_32"))
		{
			patchedSource = findAndReplace(globalAtomicsKernelString,"counter32_t", "volatile __global int*");
			if (!patchedSource)
			{
				printf("out of memory while patching the kernel source\n");
				clReleaseContext(g_cxMainContext);
				return 1;
			}
			globalAtomicsKernelStringPatched = patchedSource;
		}

		// create a command-queue
		g_cqCommandQue = clCreateCommandQueue(g_cxMainContext, device, 0, &ciErrNum);
		oclCHECKERROR(ciErrNum, CL_SUCCESS);

		cl_mem counterBuffer = clCreateBuffer(g_cxMainContext, CL_MEM_READ_WRITE, sizeof(int), NULL, &ciErrNum);
		oclCHECKERROR(ciErrNum, CL_SUCCESS);

		char* kernelMethods[] =
		{
			"globalAtomicKernelOpenCL1_1",
			"counterAtomicKernelExt",
			"globalAtomicKernelExt",
			"globalAtomicKernelCounters32Broken"
		};
		int numKernelMethods = sizeof(kernelMethods)/sizeof(char*);

		for (int i=0;i<numKernelMethods;i++)
		{
			int myCounter = 0;

			//reset the device counter; myCounter stays untouched until the blocking read below
			int deviceOffset=0;
			ciErrNum = clEnqueueWriteBuffer(g_cqCommandQue, counterBuffer,CL_FALSE, deviceOffset, sizeof(int), &myCounter, 0, NULL, NULL);
			oclCHECKERROR(ciErrNum, CL_SUCCESS);

			g_atomicsKernel = btOpenCLUtils::compileCLKernelFromString(g_cxMainContext,device,globalAtomicsKernelStringPatched,kernelMethods[i], &ciErrNum);
			oclCHECKERROR(ciErrNum, CL_SUCCESS);

			ciErrNum = clSetKernelArg(g_atomicsKernel, 0, sizeof(cl_mem),(void*)&counterBuffer);
			oclCHECKERROR(ciErrNum, CL_SUCCESS);

			//round the work size up to a whole number of work groups
			size_t numWorkItems = workGroupSize*((NUM_OBJECTS + (workGroupSize-1)) / workGroupSize);
			ciErrNum = clEnqueueNDRangeKernel(g_cqCommandQue, g_atomicsKernel, 1, NULL, &numWorkItems, &workGroupSize,0 ,0 ,0);
			oclCHECKERROR(ciErrNum, CL_SUCCESS);

			ciErrNum = clFinish(g_cqCommandQue);
			oclCHECKERROR(ciErrNum, CL_SUCCESS);

			//read from counterBuffer
			ciErrNum = clEnqueueReadBuffer(g_cqCommandQue, counterBuffer, CL_TRUE, deviceOffset, sizeof(int), &myCounter, 0, NULL, NULL);
			oclCHECKERROR(ciErrNum, CL_SUCCESS);

			if (myCounter != NUM_OBJECTS)
			{
				printf("%s is broken, expected %d got %d\n",kernelMethods[i],NUM_OBJECTS,myCounter);
			} else
			{
				printf("%s success, got %d\n",kernelMethods[i],myCounter);
			}

			//each iteration compiles its own kernel, so release it here
			if (g_atomicsKernel)
			{
				ciErrNum = clReleaseKernel(g_atomicsKernel);
				oclCHECKERROR(ciErrNum, CL_SUCCESS);
				g_atomicsKernel = 0;
			}
		}

		ciErrNum = clReleaseMemObject(counterBuffer);
		oclCHECKERROR(ciErrNum, CL_SUCCESS);

		ciErrNum = clReleaseCommandQueue(g_cqCommandQue);
		oclCHECKERROR(ciErrNum, CL_SUCCESS);

		free(patchedSource);
	}

	clReleaseContext(g_cxMainContext);

	printf("press a key to end\n");
	getchar();
	return 0;
}
#ifdef _WIN32
#pragma warning( push )
#pragma warning( disable : 4996 )
#endif //_WIN32
#include <string.h>
#include <stdlib.h>
/*
 * Returns a newly allocated copy of `original` in which every non-overlapping
 * occurrence of `pattern` (searched left to right) is replaced by `replacement`.
 *
 * - The caller owns the result and must release it with free().
 * - Returns NULL when any argument is NULL or when the allocation fails.
 * - An empty `pattern` matches nothing: the result is a plain copy of
 *   `original` (searching for "" would otherwise never advance).
 */
char * findAndReplace(
	char const * const original,
	char const * const pattern,
	char const * const replacement
) {
	if (original == NULL || pattern == NULL || replacement == NULL)
	{
		return NULL;
	}

	{
		size_t const replen = strlen(replacement);
		size_t const patlen = strlen(pattern);
		size_t const orilen = strlen(original);

		size_t patcnt = 0;
		const char * oriptr;
		const char * patloc;

		if (patlen != 0)
		{
			// find how many times the pattern occurs in the original string
			for (oriptr = original; (patloc = strstr(oriptr, pattern)) != NULL; oriptr = patloc + patlen)
			{
				patcnt++;
			}
		}

		{
			// allocate memory for the new string; the removed bytes are subtracted first
			// (patcnt * patlen never exceeds orilen) so nothing relies on unsigned wrap-around
			size_t const retlen = orilen - patcnt * patlen + patcnt * replen;
			char * const returned = (char *) malloc( sizeof(char) * (retlen + 1) );

			if (returned != NULL)
			{
				// copy the original string,
				// replacing all the instances of the pattern
				char * retptr = returned;
				oriptr = original;
				if (patlen != 0)
				{
					for (; (patloc = strstr(oriptr, pattern)) != NULL; oriptr = patloc + patlen)
					{
						size_t const skplen = (size_t)(patloc - oriptr);
						// copy the section until the occurrence of the pattern
						memcpy(retptr, oriptr, skplen);
						retptr += skplen;
						// copy the replacement
						memcpy(retptr, replacement, replen);
						retptr += replen;
					}
				}
				// copy the rest of the string, including the terminating NUL
				memcpy(retptr, oriptr, strlen(oriptr) + 1);
			}
			return returned;
		}
	}
}
#ifdef _WIN32
#pragma warning( pop )
#endif //_WIN32

View File

@@ -0,0 +1,4 @@
-- Only the AMD build script is included; the Intel and NVIDIA variants are commented out.
include "AMD"
--include "Intel"
--include "NVIDIA"

View File

@@ -0,0 +1,13 @@
#!/usr/bin/env python
"""Convert a text file (for example an OpenCL kernel) into a C string definition.

Usage: stringify.py INPUT_FILE VARIABLE_NAME

The result is written to standard output in the form

    static const char* VARIABLE_NAME= \\
    "first line\\n"
    "second line\\n"
    ;

Works with both Python 2 and Python 3.
"""
import sys
import os
import shutil


def escape_c_string(text):
    """Escape backslashes and double quotes so text is valid inside a C string literal."""
    # Backslashes first, otherwise the backslashes added for the quotes get doubled.
    return text.replace('\\', '\\\\').replace('"', '\\"')


def stringify_lines(lines, var_name):
    """Return the output lines of the C definition of var_name for the given input lines."""
    out = ['static const char* ' + var_name + '= \\']
    for line in lines:
        out.append('"' + escape_c_string(line.strip('\n')) + '\\n"')
    out.append(';')
    return out


def main(argv):
    """Command line entry point; returns the process exit code."""
    if len(argv) < 3:
        sys.stderr.write('usage: stringify.py INPUT_FILE VARIABLE_NAME\n')
        return 1
    # The with-block closes the input file even when reading fails.
    with open(argv[1]) as fh:
        lines = fh.readlines()
    sys.stdout.write('\n'.join(stringify_lines(lines, argv[2])) + '\n')
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))

View File

@@ -0,0 +1,5 @@
stringify.py global_atomics.cl globalAtomicsKernelString >globalAtomicsKernel.h

View File

@@ -0,0 +1,58 @@
-- Premake project for the GPU rigid body pipeline demo, AMD OpenCL variant.
-- The project is only defined when findOpenCL_AMD() returns a truthy value.
-- NOTE(review): the file list names btGpuNarrowPhaseAndSolver.* (capital P) while the
-- .cpp includes "btGpuNarrowphaseAndSolver.h"; check the casing on case-sensitive file systems.
hasCL = findOpenCL_AMD()
if (hasCL) then
project "OpenCL_gpu_rigidbody_pipeline_AMD"
initOpenCL_AMD()
language "C++"
kind "ConsoleApp"
targetdir "../../../bin"
-- rendering dependencies
initOpenGL()
initGlut()
initGlew()
includedirs {
"../../../rendering/BulletMath",
"../../primitives",
"../../../../../src"
}
-- demo sources plus the Bullet and helper sources that are compiled directly into the executable
files {
"../main.cpp",
"../btConvexUtility.cpp",
"../btConvexUtility.h",
"../btGpuNarrowPhaseAndSolver.cpp",
"../btGpuNarrowPhaseAndSolver.h",
"../../../dynamics/basic_demo/ConvexHeightFieldShape.cpp",
"../../../dynamics/basic_demo/ConvexHeightFieldShape.h",
"../../../../../src/LinearMath/btConvexHullComputer.cpp",
"../../../../../src/LinearMath/btConvexHullComputer.h",
"../../broadphase_benchmark/findPairsOpenCL.cpp",
"../../broadphase_benchmark/findPairsOpenCL.h",
"../../broadphase_benchmark/btGridBroadphaseCL.cpp",
"../../broadphase_benchmark/btGridBroadphaseCL.h",
"../../3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.cpp",
"../../3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.h",
"../../3dGridBroadphase/Shared/btGpu3DGridBroadphase.cpp",
"../../3dGridBroadphase/Shared/btGpu3DGridBroadphase.h",
"../../../../../src/LinearMath/btAlignedAllocator.cpp",
"../../../../../src/LinearMath/btQuickprof.cpp",
"../../../../../src/LinearMath/btQuickprof.h",
"../../../../../src/BulletCollision/BroadphaseCollision/btBroadphaseProxy.cpp",
"../../../../../src/BulletCollision/BroadphaseCollision/btOverlappingPairCache.cpp",
"../../../../../src/BulletCollision/BroadphaseCollision/btSimpleBroadphase.cpp",
"../../basic_initialize/btOpenCLUtils.cpp",
"../../basic_initialize/btOpenCLUtils.h",
"../../opengl_interop/btOpenCLGLInteropBuffer.cpp",
"../../opengl_interop/btOpenCLGLInteropBuffer.h",
"../../opengl_interop/btStopwatch.cpp",
"../../opengl_interop/btStopwatch.h"
}
end

View File

@@ -0,0 +1,91 @@
#ifndef COMMAND_LINE_ARGS_H
#define COMMAND_LINE_ARGS_H
/******************************************************************************
* Command-line parsing
******************************************************************************/
#include <map>
#include <algorithm>
#include <string>
#include <sstream>
/// Minimal parser for "--key" and "--key=value" command line arguments.
/// Arguments that do not start with "--" are ignored; argv[0] is never inspected.
class CommandLineArgs
{
protected:

	/// key -> value; the value is the empty string for arguments without '='
	std::map<std::string, std::string> pairs;

public:

	/// Parses argv[1] .. argv[argc-1]. A key given more than once keeps its last value.
	CommandLineArgs(int argc, char **argv)
	{
		for (int index = 1; index < argc; ++index)
		{
			const std::string arg(argv[index]);
			if (arg.compare(0, 2, "--") != 0)
			{
				continue;	// not an option
			}

			const std::string::size_type equalsPos = arg.find('=');
			if (equalsPos == std::string::npos)
			{
				// bare flag: everything after the two dashes is the key
				pairs[arg.substr(2)] = "";
			}
			else
			{
				// key sits between the dashes and '=', the value is the remainder
				const std::string key = arg.substr(2, equalsPos - 2);
				pairs[key] = arg.substr(equalsPos + 1);
			}
		}
	}

	/// True when "--arg_name" (with or without a value) was present.
	bool CheckCmdLineFlag(const char* arg_name)
	{
		return pairs.find(arg_name) != pairs.end();
	}

	/// Converts the value stored for arg_name into val; defined outside the class.
	template <typename T>
	void GetCmdLineArgument(const char *arg_name, T &val);

	/// Number of distinct keys that were parsed.
	int ParsedArgc()
	{
		return pairs.size();
	}
};
/// Generic lookup: when arg_name was given on the command line, its value is
/// parsed into val with operator>> on a string stream. When the key is absent,
/// val is left untouched.
template <typename T>
void CommandLineArgs::GetCmdLineArgument(const char *arg_name, T &val)
{
	std::map<std::string, std::string>::iterator found = pairs.find(arg_name);
	if (found == pairs.end())
	{
		return;
	}

	std::istringstream parser(found->second);
	parser >> val;
}
/// String lookup: val receives a malloc'ed copy of the value stored for
/// arg_name, or NULL when the key is absent or the allocation fails.
/// The caller owns a non-NULL result and must release it with free().
///
/// Declared inline because a full specialization is an ordinary function:
/// without it, including this header from more than one translation unit
/// produces multiple-definition link errors.
template <>
inline void CommandLineArgs::GetCmdLineArgument<char*>(const char* arg_name, char* &val)
{
	using namespace std;
	map<string, string>::iterator itr;
	if ((itr = pairs.find(arg_name)) != pairs.end()) {

		string s = itr->second;
		val = (char*) malloc(sizeof(char) * (s.length() + 1));
		if (val != NULL) {
			strcpy(val, s.c_str());
		}

	} else {
		val = NULL;
	}
}
#endif //COMMAND_LINE_ARGS_H

View File

@@ -0,0 +1,58 @@
-- Premake project for the GPU rigid body pipeline demo, Intel OpenCL variant.
-- The project is only defined when findOpenCL_Intel() returns a truthy value.
-- NOTE(review): the file list names btGpuNarrowPhaseAndSolver.* (capital P) while the
-- .cpp includes "btGpuNarrowphaseAndSolver.h"; check the casing on case-sensitive file systems.
hasCL = findOpenCL_Intel()
if (hasCL) then
project "OpenCL_gpu_rigidbody_pipeline_Intel"
initOpenCL_Intel()
language "C++"
kind "ConsoleApp"
targetdir "../../../bin"
-- rendering dependencies
initOpenGL()
initGlut()
initGlew()
includedirs {
"../../../rendering/BulletMath",
"../../primitives",
"../../../../../src"
}
-- demo sources plus the Bullet and helper sources that are compiled directly into the executable
files {
"../main.cpp",
"../btConvexUtility.cpp",
"../btConvexUtility.h",
"../btGpuNarrowPhaseAndSolver.cpp",
"../btGpuNarrowPhaseAndSolver.h",
"../../../dynamics/basic_demo/ConvexHeightFieldShape.cpp",
"../../../dynamics/basic_demo/ConvexHeightFieldShape.h",
"../../../../../src/LinearMath/btConvexHullComputer.cpp",
"../../../../../src/LinearMath/btConvexHullComputer.h",
"../../broadphase_benchmark/findPairsOpenCL.cpp",
"../../broadphase_benchmark/findPairsOpenCL.h",
"../../broadphase_benchmark/btGridBroadphaseCL.cpp",
"../../broadphase_benchmark/btGridBroadphaseCL.h",
"../../3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.cpp",
"../../3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.h",
"../../3dGridBroadphase/Shared/btGpu3DGridBroadphase.cpp",
"../../3dGridBroadphase/Shared/btGpu3DGridBroadphase.h",
"../../../../../src/LinearMath/btAlignedAllocator.cpp",
"../../../../../src/LinearMath/btQuickprof.cpp",
"../../../../../src/LinearMath/btQuickprof.h",
"../../../../../src/BulletCollision/BroadphaseCollision/btBroadphaseProxy.cpp",
"../../../../../src/BulletCollision/BroadphaseCollision/btOverlappingPairCache.cpp",
"../../../../../src/BulletCollision/BroadphaseCollision/btSimpleBroadphase.cpp",
"../../basic_initialize/btOpenCLUtils.cpp",
"../../basic_initialize/btOpenCLUtils.h",
"../../opengl_interop/btOpenCLGLInteropBuffer.cpp",
"../../opengl_interop/btOpenCLGLInteropBuffer.h",
"../../opengl_interop/btStopwatch.cpp",
"../../opengl_interop/btStopwatch.h"
}
end

View File

@@ -0,0 +1,57 @@
-- Premake project for the GPU rigid body pipeline demo, NVIDIA OpenCL variant.
-- The project is only defined when findOpenCL_NVIDIA() returns a truthy value.
-- NOTE(review): the file list names btGpuNarrowPhaseAndSolver.* (capital P) while the
-- .cpp includes "btGpuNarrowphaseAndSolver.h"; check the casing on case-sensitive file systems.
hasCL = findOpenCL_NVIDIA()
if (hasCL) then
project "OpenCL_gpu_rigidbody_pipeline_NVIDIA"
initOpenCL_NVIDIA()
language "C++"
kind "ConsoleApp"
targetdir "../../../bin"
-- rendering dependencies
initOpenGL()
initGlut()
initGlew()
includedirs {
"../../../rendering/BulletMath",
"../../primitives",
"../../../../../src"
}
-- demo sources plus the Bullet and helper sources that are compiled directly into the executable
files {
"../main.cpp",
"../btConvexUtility.cpp",
"../btConvexUtility.h",
"../btGpuNarrowPhaseAndSolver.cpp",
"../btGpuNarrowPhaseAndSolver.h",
"../../../dynamics/basic_demo/ConvexHeightFieldShape.cpp",
"../../../dynamics/basic_demo/ConvexHeightFieldShape.h",
"../../../../../src/LinearMath/btConvexHullComputer.cpp",
"../../../../../src/LinearMath/btConvexHullComputer.h",
"../../broadphase_benchmark/findPairsOpenCL.cpp",
"../../broadphase_benchmark/findPairsOpenCL.h",
"../../broadphase_benchmark/btGridBroadphaseCL.cpp",
"../../broadphase_benchmark/btGridBroadphaseCL.h",
"../../3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.cpp",
"../../3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.h",
"../../3dGridBroadphase/Shared/btGpu3DGridBroadphase.cpp",
"../../3dGridBroadphase/Shared/btGpu3DGridBroadphase.h",
"../../../../../src/LinearMath/btAlignedAllocator.cpp",
"../../../../../src/LinearMath/btQuickprof.cpp",
"../../../../../src/LinearMath/btQuickprof.h",
"../../../../../src/BulletCollision/BroadphaseCollision/btBroadphaseProxy.cpp",
"../../../../../src/BulletCollision/BroadphaseCollision/btOverlappingPairCache.cpp",
"../../../../../src/BulletCollision/BroadphaseCollision/btSimpleBroadphase.cpp",
"../../basic_initialize/btOpenCLUtils.cpp",
"../../basic_initialize/btOpenCLUtils.h",
"../../opengl_interop/btOpenCLGLInteropBuffer.cpp",
"../../opengl_interop/btOpenCLGLInteropBuffer.h",
"../../opengl_interop/btStopwatch.cpp",
"../../opengl_interop/btStopwatch.h"
}
end

View File

@@ -0,0 +1,240 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Erwin Coumans
#include "btConvexUtility.h"
#include "LinearMath/btConvexHullComputer.h"
#include "LinearMath/btGrahamScan2dConvexHull.h"
#include "LinearMath/btQuaternion.h"
/// Builds the convex hull of orgVertices and stores its vertices in m_vertices
/// and its faces (vertex indices plus plane equation) in m_faces.
///
/// Faces whose normals are nearly parallel (dot product above 0.999) are grouped.
/// When mergeCoplanarTriangles is true, each group with more than one face is
/// replaced by a single polygon (2D convex hull of the group's vertices), unless
/// that would drop a vertex that is still referenced by a face outside the group.
/// Groups that are not merged contribute their faces unchanged.
///
/// Faces are appended to m_faces; m_vertices is overwritten.
/// Returns false when orgVertices is empty, true otherwise.
bool	btConvexUtility::initializePolyhedralFeatures(const btAlignedObjectArray<btVector3>& orgVertices, bool mergeCoplanarTriangles)
{
	// Nothing to build from; this also avoids indexing element 0 of an empty array below.
	if (orgVertices.size()==0)
	{
		return false;
	}

	btConvexHullComputer conv;
	conv.compute(&orgVertices[0].getX(), sizeof(btVector3),orgVertices.size(),0.f,0.f);

	btAlignedObjectArray<btVector3> faceNormals;
	int numFaces = conv.faces.size();
	faceNormals.resize(numFaces);
	btConvexHullComputer* convexUtil = &conv;

	btAlignedObjectArray<btFace>	tmpFaces;
	tmpFaces.resize(numFaces);

	int numVertices = convexUtil->vertices.size();
	m_vertices.resize(numVertices);
	for (int p=0;p<numVertices;p++)
	{
		m_vertices[p] = convexUtil->vertices[p];
	}

	for (int i=0;i<numFaces;i++)
	{
		int face = convexUtil->faces[i];
		const btConvexHullComputer::Edge*  firstEdge = &convexUtil->edges[face];
		const btConvexHullComputer::Edge*  edge = firstEdge;

		btVector3 edges[3];
		int numEdges = 0;

		// walk the edge loop: collect the vertex indices and remember the first two edge directions
		do
		{
			int src = edge->getSourceVertex();
			tmpFaces[i].m_indices.push_back(src);
			int targ = edge->getTargetVertex();
			btVector3 wa = convexUtil->vertices[src];
			btVector3 wb = convexUtil->vertices[targ];
			btVector3 newEdge = wb-wa;
			newEdge.normalize();
			if (numEdges<2)
				edges[numEdges++] = newEdge;

			edge = edge->getNextEdgeOfFace();
		} while (edge!=firstEdge);

		btScalar planeEq = 1e30f;

		if (numEdges==2)
		{
			// face normal from the first two edges
			faceNormals[i] = edges[0].cross(edges[1]);
			faceNormals[i].normalize();
			tmpFaces[i].m_plane[0] = faceNormals[i].getX();
			tmpFaces[i].m_plane[1] = faceNormals[i].getY();
			tmpFaces[i].m_plane[2] = faceNormals[i].getZ();
			tmpFaces[i].m_plane[3] = planeEq;
		}
		else
		{
			btAssert(0);//degenerate?
			faceNormals[i].setZero();
			// keep the plane defined: the merge pass below reads m_plane[0..2]
			tmpFaces[i].m_plane[0] = 0.f;
			tmpFaces[i].m_plane[1] = 0.f;
			tmpFaces[i].m_plane[2] = 0.f;
			tmpFaces[i].m_plane[3] = planeEq;
		}

		// plane constant: negated minimum projection of the face vertices onto the normal
		for (int v=0;v<tmpFaces[i].m_indices.size();v++)
		{
			btScalar eq = m_vertices[tmpFaces[i].m_indices[v]].dot(faceNormals[i]);
			if (planeEq>eq)
			{
				planeEq=eq;
			}
		}
		tmpFaces[i].m_plane[3] = -planeEq;
	}

	//merge coplanar faces

	btScalar faceWeldThreshold= 0.999f;
	btAlignedObjectArray<int> todoFaces;
	for (int i=0;i<tmpFaces.size();i++)
		todoFaces.push_back(i);

	while (todoFaces.size())
	{
		// take the last pending face as reference and gather all pending faces with a similar normal
		btAlignedObjectArray<int> coplanarFaceGroup;
		int refFace = todoFaces[todoFaces.size()-1];

		coplanarFaceGroup.push_back(refFace);
		btFace& faceA = tmpFaces[refFace];
		todoFaces.pop_back();

		btVector3 faceNormalA(faceA.m_plane[0],faceA.m_plane[1],faceA.m_plane[2]);
		for (int j=todoFaces.size()-1;j>=0;j--)
		{
			int candidate = todoFaces[j];
			btFace& faceB = tmpFaces[candidate];
			btVector3 faceNormalB(faceB.m_plane[0],faceB.m_plane[1],faceB.m_plane[2]);
			if (faceNormalA.dot(faceNormalB)>faceWeldThreshold)
			{
				coplanarFaceGroup.push_back(candidate);
				todoFaces.remove(candidate);
			}
		}

		bool did_merge = false;
		if (mergeCoplanarTriangles && coplanarFaceGroup.size()>1)
		{
			//do the merge: use Graham Scan 2d convex hull

			// project every distinct vertex of the group into the xy plane
			btAlignedObjectArray<GrahamVector2> orgpoints;

			for (int g=0;g<coplanarFaceGroup.size();g++)
			{
				btFace& face = tmpFaces[coplanarFaceGroup[g]];
				btVector3 faceNormal(face.m_plane[0],face.m_plane[1],face.m_plane[2]);
				btVector3 xyPlaneNormal(0,0,1);

				btQuaternion rotationArc = shortestArcQuat(faceNormal,xyPlaneNormal);

				for (int f=0;f<face.m_indices.size();f++)
				{
					int orgIndex = face.m_indices[f];
					btVector3 pt = m_vertices[orgIndex];
					btVector3 rotatedPt =  quatRotate(rotationArc,pt);
					rotatedPt.setZ(0);

					// skip vertices that were already added by another face of the group
					bool found = false;
					for (int o=0;o<orgpoints.size();o++)
					{
						if (orgpoints[o].m_orgIndex == orgIndex)
						{
							found=true;
							break;
						}
					}
					if (!found)
						orgpoints.push_back(GrahamVector2(rotatedPt,orgIndex));
				}
			}

			// the merged face inherits the plane of the reference face
			btFace combinedFace;
			for (int c=0;c<4;c++)
				combinedFace.m_plane[c] = tmpFaces[coplanarFaceGroup[0]].m_plane[c];

			btAlignedObjectArray<GrahamVector2> hull;
			GrahamScanConvexHull2D(orgpoints,hull);

			for (int h=0;h<hull.size();h++)
			{
				combinedFace.m_indices.push_back(hull[h].m_orgIndex);
				for(int k = 0; k < orgpoints.size(); k++) {
					if(orgpoints[k].m_orgIndex == hull[h].m_orgIndex) {
						orgpoints[k].m_orgIndex = -1; // invalidate...
						break;
					}
				}
			}

			// are there rejected vertices?
			bool reject_merge = false;
			for(int r = 0; r < orgpoints.size(); r++) {
				if(orgpoints[r].m_orgIndex == -1)
					continue; // this is in the hull...
				// this vertex is rejected -- is anybody else using this vertex?
				for(int j = 0; j < tmpFaces.size(); j++) {
					btFace& face = tmpFaces[j];
					// is this a face of the current coplanar group?
					bool is_in_current_group = false;
					for(int k = 0; k < coplanarFaceGroup.size(); k++) {
						if(coplanarFaceGroup[k] == j) {
							is_in_current_group = true;
							break;
						}
					}
					if(is_in_current_group) // ignore this face...
						continue;
					// does this face use this rejected vertex?
					for(int v = 0; v < face.m_indices.size(); v++) {
						if(face.m_indices[v] == orgpoints[r].m_orgIndex) {
							// this rejected vertex is used in another face -- reject merge
							reject_merge = true;
							break;
						}
					}
					if(reject_merge)
						break;
				}
				if(reject_merge)
					break;
			}

			if(!reject_merge) {
				// do this merge!
				did_merge = true;
				m_faces.push_back(combinedFace);
			}
		}

		// no merge (disabled, single face or rejected): keep the original faces of the group
		if(!did_merge)
		{
			for (int g=0;g<coplanarFaceGroup.size();g++)
			{
				m_faces.push_back(tmpFaces[coplanarFaceGroup[g]]);
			}
		}
	}

	return true;
}

View File

@@ -0,0 +1,41 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Erwin Coumans
#ifndef _BT_CONVEX_UTILITY_H
#define _BT_CONVEX_UTILITY_H
#include "LinearMath/btAlignedObjectArray.h"
#include "LinearMath/btVector3.h"
/// One polygonal face of a convex polyhedron.
struct btFace
{
// indices into btConvexUtility::m_vertices, in edge-loop order for unmerged faces
btAlignedObjectArray<int> m_indices;
// btAlignedObjectArray<int> m_connectedFaces;
// plane of the face: [0..2] hold the normal, [3] the plane constant.
// Not initialised by this struct; initializePolyhedralFeatures fills it in.
btScalar m_plane[4];
};
/// Holds the vertices and faces of a convex hull computed from a point set.
class btConvexUtility
{
public:
// hull vertices, overwritten by initializePolyhedralFeatures
btAlignedObjectArray<btVector3> m_vertices;
// hull faces, appended to by initializePolyhedralFeatures
btAlignedObjectArray<btFace> m_faces;
// Computes the convex hull of orgVertices and fills m_vertices and m_faces.
// When mergeCoplanarTriangles is true, groups of faces with nearly parallel
// normals are merged into single polygons where possible.
bool initializePolyhedralFeatures(const btAlignedObjectArray<btVector3>& orgVertices, bool mergeCoplanarTriangles);
};
#endif

View File

@@ -0,0 +1,730 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Erwin Coumans
#include "btGpuNarrowphaseAndSolver.h"
//#include "CustomConvexShape.h"
//#include "CustomConvexPairCollision.h"
#include "LinearMath/btQuickprof.h"
//#include "BulletDynamics/Dynamics/btRigidBody.h"
#include "Adl/Adl.h"
#include "../../dynamics/basic_demo/Stubs/AdlMath.h"
#include "../../dynamics/basic_demo/Stubs/AdlContact4.h"
#include "../../dynamics/basic_demo/Stubs/AdlQuaternion.h"
#include "../../dynamics/basic_demo/Stubs/ChNarrowPhase.h"
#include "../../dynamics/basic_demo/Stubs/Solver.h"
#include <AdlPrimitives/Sort/RadixSort32.h>
int gpuBatchContacts = 1;
int numPairsOut =0;
// Host-side arrays with one entry per cell of an N_SPLIT x N_SPLIT grid.
// NOTE(review): presumably the constraint count and start offset of each cell - confirm against the solver.
struct CPUSolveData
{
u32 m_n[adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT];
u32 m_offset[adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT];
};
// Buffer-based counterpart of CPUSolveData: constraint counts and offsets kept in adl buffers.
// NOTE(review): ownership of the two buffers is not visible here - confirm who frees them.
struct ParallelSolveData
{
adl::Buffer<u32>* m_numConstraints;
adl::Buffer<u32>* m_offsets;
};
// Private state of btGpuNarrowphaseAndSolver: devices, CPU/GPU buffers and counters.
// The constructor zero-fills the whole struct with memset before assigning the members,
// so it must stay safe to memset (no members with non-trivial constructors).
struct CustomDispatchData
{
// OpenCL device passed to the constructor, and the host device allocated by it
adl::DeviceCL* m_deviceCL;
adl::Device* m_deviceHost;
// shape storage on the GPU and the matching host-side shape pointers, indexed by shape index
ShapeDataType m_ShapeBuffer;
adl::HostBuffer<ConvexHeightField*>* m_shapePointers;
// pair and contact buffers, each sized MAX_BROADPHASE_COLLISION_CL
adl::HostBuffer<int2>* m_pBufPairsCPU;
adl::Buffer<int2>* m_convexPairsOutGPU;
adl::Buffer<int2>* m_planePairs;
adl::Buffer<Contact4>* m_pBufContactOutGPU;
adl::HostBuffer<Contact4>* m_pBufContactOutCPU;
adl::ChNarrowphase<adl::TYPE_CL>::Data* m_Data;
// rigid body state and inertia, host copy and GPU copy, each sized MAX_CONVEX_BODIES_CL
adl::HostBuffer<RigidBodyBase::Body>* m_bodyBufferCPU;
adl::Buffer<RigidBodyBase::Body>* m_bodyBufferGPU;
// declared as adl::Buffer (not HostBuffer) but allocated on the host device
adl::Buffer<RigidBodyBase::Inertia>* m_inertiaBufferCPU;
adl::Buffer<RigidBodyBase::Inertia>* m_inertiaBufferGPU;
// solver state and constraint storage on the GPU
adl::Solver<adl::TYPE_CL>::Data* m_solverDataGPU;
SolverData m_contactCGPU;
void* m_frictionCGPU;
// number of shapes / rigid bodies registered so far; also the next free index
int m_numAcceleratedShapes;
int m_numAcceleratedRigidBodies;
};
/// Allocates all host and GPU buffers used by the narrowphase and the solver.
/// When deviceCL is null nothing is allocated and m_internalData stays 0.
btGpuNarrowphaseAndSolver::btGpuNarrowphaseAndSolver(adl::DeviceCL* deviceCL)
	:m_internalData(0) ,m_planeBodyIndex(-1)
{
	if (!deviceCL)
	{
		return;
	}

	// start from an all-zero state so that every pointer and counter is defined
	m_internalData = new CustomDispatchData();
	memset(m_internalData,0,sizeof(CustomDispatchData));

	adl::DeviceUtils::Config cfg;
	m_internalData->m_deviceCL = deviceCL;
	m_internalData->m_deviceHost = adl::DeviceUtils::allocate( adl::TYPE_HOST, cfg );

	// short aliases for the two devices the buffers live on
	adl::DeviceCL* const gpu = m_internalData->m_deviceCL;
	adl::Device* const host = m_internalData->m_deviceHost;

	// pair and contact buffers
	m_internalData->m_pBufPairsCPU = new adl::HostBuffer<int2>(host, MAX_BROADPHASE_COLLISION_CL);
	m_internalData->m_convexPairsOutGPU = new adl::Buffer<int2>(gpu,MAX_BROADPHASE_COLLISION_CL);
	m_internalData->m_planePairs = new adl::Buffer<int2>(gpu,MAX_BROADPHASE_COLLISION_CL);
	m_internalData->m_pBufContactOutCPU = new adl::HostBuffer<Contact4>(host, MAX_BROADPHASE_COLLISION_CL);

	// rigid body state, host side
	m_internalData->m_bodyBufferCPU = new adl::HostBuffer<RigidBodyBase::Body>(host, MAX_CONVEX_BODIES_CL);
	m_internalData->m_inertiaBufferCPU = new adl::Buffer<RigidBodyBase::Inertia>(host,MAX_CONVEX_BODIES_CL);

	// contacts, inertia, solver state and rigid body state on the GPU
	m_internalData->m_pBufContactOutGPU = new adl::Buffer<Contact4>(gpu, MAX_BROADPHASE_COLLISION_CL);
	m_internalData->m_inertiaBufferGPU = new adl::Buffer<RigidBodyBase::Inertia>(gpu,MAX_CONVEX_BODIES_CL);
	m_internalData->m_solverDataGPU = adl::Solver<adl::TYPE_CL>::allocate( gpu, MAX_BROADPHASE_COLLISION_CL);
	m_internalData->m_bodyBufferGPU = new adl::Buffer<RigidBodyBase::Body>(gpu, MAX_CONVEX_BODIES_CL);

	// narrowphase state and shape storage
	m_internalData->m_Data = adl::ChNarrowphase<adl::TYPE_CL>::allocate(gpu);
	m_internalData->m_ShapeBuffer = adl::ChNarrowphase<adl::TYPE_CL>::allocateShapeBuffer(gpu, MAX_CONVEX_SHAPES_CL);
	m_internalData->m_shapePointers = new adl::HostBuffer<ConvexHeightField*>(host,MAX_CONVEX_SHAPES_CL);

	m_internalData->m_numAcceleratedShapes = 0;
	m_internalData->m_numAcceleratedRigidBodies = 0;

	// contact and friction constraints
	m_internalData->m_contactCGPU = adl::Solver<adl::TYPE_CL>::allocateConstraint4( gpu, MAX_BROADPHASE_COLLISION_CL);
	m_internalData->m_frictionCGPU = adl::Solver<adl::TYPE_CL>::allocateFrictionConstraint( gpu, MAX_BROADPHASE_COLLISION_CL);
}
/// Stores convexShape in the next free shape slot (host pointer table and GPU
/// shape buffer) and returns the index of that slot.
int btGpuNarrowphaseAndSolver::registerShape(ConvexHeightField* convexShape)
{
	const int shapeIndex = m_internalData->m_numAcceleratedShapes;

	(*m_internalData->m_shapePointers)[shapeIndex] = convexShape;
	adl::ChNarrowphase<adl::TYPE_CL>::setShape(m_internalData->m_ShapeBuffer, convexShape, shapeIndex, 0.01f);

	// the slot is now taken
	++m_internalData->m_numAcceleratedShapes;
	return shapeIndex;
}
/// Returns the OpenCL memory object behind the GPU rigid body buffer.
cl_mem	btGpuNarrowphaseAndSolver::getBodiesGpu()
{
	adl::Buffer<RigidBodyBase::Body>* bodiesGpu = m_internalData->m_bodyBufferGPU;
	return (cl_mem)bodiesGpu->m_ptr;
}
/// Returns the OpenCL memory object behind the GPU inertia buffer.
cl_mem	btGpuNarrowphaseAndSolver::getBodyInertiasGpu()
{
	adl::Buffer<RigidBodyBase::Inertia>* inertiasGpu = m_internalData->m_inertiaBufferGPU;
	return (cl_mem)inertiasGpu->m_ptr;
}
int btGpuNarrowphaseAndSolver::registerRigidBody(int shapeIndex, float mass, const float* position, const float* orientation , bool writeToGpu)
{
assert(m_internalData->m_numAcceleratedRigidBodies< (MAX_CONVEX_BODIES_CL-1));
RigidBodyBase::Body& body = m_internalData->m_bodyBufferCPU->m_ptr[m_internalData->m_numAcceleratedRigidBodies];
float friction = 1.f;
float restitution = 0.f;
body.m_frictionCoeff = friction;
body.m_restituitionCoeff = restitution;
body.m_angVel = make_float4(0.f);
body.m_linVel = make_float4(0.f);
body.m_pos = make_float4(position[0],position[1],position[2],0.f);
body.m_quat = make_float4(orientation[0],orientation[1],orientation[2],orientation[3]);
body.m_shapeIdx = shapeIndex;
if (shapeIndex<0)
{
body.m_shapeType = CollisionShape::SHAPE_PLANE;
m_planeBodyIndex = m_internalData->m_numAcceleratedRigidBodies;
} else
{
body.m_shapeType = CollisionShape::SHAPE_CONVEX_HEIGHT_FIELD;
}
body.m_invMass = mass? 1.f/mass : 0.f;
if (writeToGpu)
m_internalData->m_bodyBufferGPU->write(&body,1,m_internalData->m_numAcceleratedRigidBodies);
RigidBodyBase::Inertia& shapeInfo = m_internalData->m_inertiaBufferCPU->m_ptr[m_internalData->m_numAcceleratedRigidBodies];
if (mass==0.f)
{
shapeInfo.m_initInvInertia = mtZero();
shapeInfo.m_invInertia = mtZero();
} else
{
assert(body.m_shapeIdx>=0);
//approximate using the aabb of the shape
Aabb aabb = (*m_internalData->m_shapePointers)[shapeIndex]->m_aabb;
float4 halfExtents = (aabb.m_max - aabb.m_min);
float4 localInertia;
float lx=2.f*halfExtents.x;
float ly=2.f*halfExtents.y;
float lz=2.f*halfExtents.z;
localInertia = make_float4( (mass/12.0f) * (ly*ly + lz*lz),
(mass/12.0f) * (lx*lx + lz*lz),
(mass/12.0f) * (lx*lx + ly*ly));
float4 invLocalInertia;
invLocalInertia.x = 1.f/localInertia.x;
invLocalInertia.y = 1.f/localInertia.y;
invLocalInertia.z = 1.f/localInertia.z;
invLocalInertia.w = 0.f;
shapeInfo.m_initInvInertia = mtZero();
shapeInfo.m_initInvInertia.m_row[0].x = invLocalInertia.x;
shapeInfo.m_initInvInertia.m_row[1].y = invLocalInertia.y;
shapeInfo.m_initInvInertia.m_row[2].z = invLocalInertia.z;
Matrix3x3 m = qtGetRotationMatrix( body.m_quat);
Matrix3x3 mT = mtTranspose( m );
shapeInfo.m_invInertia = mtMul( mtMul( m, shapeInfo.m_initInvInertia ), mT );
}
if (writeToGpu)
m_internalData->m_inertiaBufferGPU->write(&shapeInfo,1,m_internalData->m_numAcceleratedRigidBodies);
return m_internalData->m_numAcceleratedRigidBodies++;
}
// Uploads every registered body and its inertia entry from the CPU arrays to the GPU buffers.
void btGpuNarrowphaseAndSolver::writeAllBodiesToGpu()
{
	const int bodyCount = m_internalData->m_numAcceleratedRigidBodies;

	m_internalData->m_bodyBufferGPU->write(m_internalData->m_bodyBufferCPU->m_ptr, bodyCount);
	m_internalData->m_inertiaBufferGPU->write(m_internalData->m_inertiaBufferCPU->m_ptr, bodyCount);
}
// Releases every buffer and solver/narrowphase object held by the internal data, then the data itself.
// The release order is the same as it has always been.
btGpuNarrowphaseAndSolver::~btGpuNarrowphaseAndSolver(void)
{
	CustomDispatchData* state = m_internalData;
	if (!state)
	{
		return;
	}

	delete state->m_pBufPairsCPU;
	delete state->m_convexPairsOutGPU;
	delete state->m_planePairs;
	delete state->m_pBufContactOutGPU;
	delete state->m_inertiaBufferGPU;
	delete state->m_pBufContactOutCPU;
	delete state->m_shapePointers;

	adl::ChNarrowphase<adl::TYPE_CL>::deallocateShapeBuffer(state->m_ShapeBuffer);
	delete state->m_inertiaBufferCPU;

	adl::Solver<adl::TYPE_CL>::deallocateConstraint4( state->m_contactCGPU );
	adl::Solver<adl::TYPE_CL>::deallocateFrictionConstraint( state->m_frictionCGPU );
	delete state->m_bodyBufferGPU;

	adl::Solver<adl::TYPE_CL>::deallocate( state->m_solverDataGPU);
	delete state->m_bodyBufferCPU;

	adl::ChNarrowphase<adl::TYPE_CL>::deallocate(state->m_Data);
	adl::DeviceUtils::deallocate(state->m_deviceHost);

	delete state;
}
/**
 * Runs one narrowphase + contact solver step for all registered bodies.
 *
 * Stages, in order:
 *   1. cull the broadphase pairs (ChNarrowphase::culling),
 *   2. generate contacts: the plane body against every other body (when a plane was
 *      registered), then the culled convex pairs,
 *   3. batch the contacts (sort by cell index, count/scan per cell, reorder, batch,
 *      convert to constraints),
 *   4. solve the contact constraints on the GPU (5 iterations).
 *
 * @param broadphasePairs     cl_mem handle wrapped as a buffer of int2 pairs.
 * @param numBroadphasePairs  number of pairs in that buffer.
 *
 * NOTE(review): 'numPairsOut' and 'gpuBatchContacts' are not declared inside this function;
 * they are presumably file-scope variables -- confirm.
 */
void btGpuNarrowphaseAndSolver::computeContactsAndSolver(cl_mem broadphasePairs, int numBroadphasePairs)
{
	BT_PROFILE("computeContactsAndSolver");

	// Everything below dereferences m_internalData; bail out instead of crashing when it is missing.
	if (!m_internalData)
		return;

	int maxBodyIndex = m_internalData->m_numAcceleratedRigidBodies;
	if (!maxBodyIndex)
		return;
	int numOfConvexRBodies = maxBodyIndex;

	adl::ChNarrowphaseBase::Config cfgNP;
	cfgNP.m_collisionMargin = 0.01f;
	int nContactOut = 0;

	//printf("convexPairsOut.m_size = %d\n",m_internalData->m_convexPairsOutGPU->m_size);

	// wrap the raw cl_mem handle so the adl API can consume it
	adl::Buffer<int2> broadphasePairsGPU;
	broadphasePairsGPU.m_ptr = (int2*)broadphasePairs;
	broadphasePairsGPU.m_size = numBroadphasePairs;
	broadphasePairsGPU.m_device = m_internalData->m_deviceCL;

	bool useCulling = true;
	if (useCulling)
	{
		BT_PROFILE("ChNarrowphase::culling");
		adl::DeviceUtils::waitForCompletion(m_internalData->m_deviceCL);

		numPairsOut = adl::ChNarrowphase<adl::TYPE_CL>::culling(
			m_internalData->m_Data,
			&broadphasePairsGPU,
			numBroadphasePairs,
			m_internalData->m_bodyBufferGPU, m_internalData->m_ShapeBuffer,
			m_internalData->m_convexPairsOutGPU,
			cfgNP);
	}

	{
		BT_PROFILE("ChNarrowphase::execute");
		if (useCulling)
		{
			if (m_planeBodyIndex>=0)
			{
				BT_PROFILE("ChNarrowphase:: plane versus convex");
				//todo: get rid of this dynamic allocation
				// pair the plane body with every other registered body
				int2* hostPairs = new int2[m_internalData->m_numAcceleratedRigidBodies-1];
				int index=0;
				for (int i=0;i<m_internalData->m_numAcceleratedRigidBodies;i++)
				{
					if (i!=m_planeBodyIndex)
					{
						hostPairs[index].x = m_planeBodyIndex;
						hostPairs[index].y = i;
						index++;
					}
				}
				assert(m_internalData->m_numAcceleratedRigidBodies-1 == index);
				m_internalData->m_planePairs->write(hostPairs,index);
				// the write must have completed before the host array is released
				adl::DeviceUtils::waitForCompletion(m_internalData->m_deviceCL);
				delete[]hostPairs;

				//convex versus plane
				adl::ChNarrowphase<adl::TYPE_CL>::execute(m_internalData->m_Data, m_internalData->m_planePairs, index, m_internalData->m_bodyBufferGPU, m_internalData->m_ShapeBuffer,
					0,0,m_internalData->m_pBufContactOutGPU, nContactOut, cfgNP);
			}

			//convex versus convex
			adl::ChNarrowphase<adl::TYPE_CL>::execute(m_internalData->m_Data, m_internalData->m_convexPairsOutGPU,numPairsOut, m_internalData->m_bodyBufferGPU, m_internalData->m_ShapeBuffer, m_internalData->m_pBufContactOutGPU, nContactOut, cfgNP);
		} else
		{
			adl::ChNarrowphase<adl::TYPE_CL>::execute(m_internalData->m_Data, &broadphasePairsGPU, numBroadphasePairs, m_internalData->m_bodyBufferGPU, m_internalData->m_ShapeBuffer, m_internalData->m_pBufContactOutGPU, nContactOut, cfgNP);
		}

		adl::DeviceUtils::waitForCompletion(m_internalData->m_deviceCL);
	}

	// no contacts, nothing to solve
	if (!nContactOut)
		return;

	bool useSolver = true;//true;//false;

	if (useSolver)
	{
		float dt=1./60.;
		adl::SolverBase::ConstraintCfg csCfg( dt );
		csCfg.m_enableParallelSolve = true;
		csCfg.m_averageExtent = 0.2f;//@TODO m_averageObjExtent;
		csCfg.m_staticIdx = m_planeBodyIndex;

		bool exposeInternalBatchImplementation=true;

		adl::Solver<adl::TYPE_HOST>::Data* cpuSolverData = 0;

		if (exposeInternalBatchImplementation)
		{
			BT_PROFILE("Batching");

			cpuSolverData = adl::Solver<adl::TYPE_HOST>::allocate( m_internalData->m_deviceHost, nContactOut);

			adl::Buffer<Contact4>* contactsIn = m_internalData->m_pBufContactOutGPU;
			const adl::Buffer<RigidBodyBase::Body>* bodyBuf = m_internalData->m_bodyBufferGPU;
			void* additionalData = m_internalData->m_frictionCGPU;
			const adl::Buffer<RigidBodyBase::Inertia>* shapeBuf = m_internalData->m_inertiaBufferGPU;
			SolverData contactCOut = m_internalData->m_contactCGPU;
			int nContacts = nContactOut;

			bool useCPU=false;

			if (useCPU)
			{
				BT_PROFILE("CPU batch");
				{
					BT_PROFILE("CPU sortContacts2");
					sortContacts2( cpuSolverData, bodyBuf, contactsIn, additionalData, nContacts, csCfg );
				}

				CPUSolveData* dataCPU = (CPUSolveData*)cpuSolverData->m_parallelSolveData;
				{
					BT_PROFILE("CPU batchContacts2");

					adl::Buffer<u32> n; n.setRawPtr( cpuSolverData->m_device, dataCPU->m_n, adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT );
					adl::Buffer<u32> offsets; offsets.setRawPtr( cpuSolverData->m_device, dataCPU->m_offset, adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT );
					batchContacts2( cpuSolverData, contactsIn, nContacts, &n, &offsets, csCfg.m_staticIdx );
				}

				{
					BT_PROFILE("CPU convertToConstraints2");
					convertToConstraints2( cpuSolverData, bodyBuf, shapeBuf, contactsIn, contactCOut, additionalData, nContacts, csCfg );
				}

				{
					BT_PROFILE("CPU -> GPU copy");
					ParallelSolveData* dataGPU = (ParallelSolveData*)m_internalData->m_solverDataGPU->m_parallelSolveData;
					dataGPU->m_numConstraints->write(dataCPU->m_n,adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT);
					dataGPU->m_offsets->write(dataCPU->m_offset,adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT);
					adl::DeviceUtils::waitForCompletion( m_internalData->m_deviceCL);
				}
			}
			else
			{
				BT_PROFILE("GPU batch");

				adl::Solver<adl::TYPE_CL>::Data* data = m_internalData->m_solverDataGPU;

				{
					// (re)allocate the scratch contact buffer when it is missing or too small
					if( data->m_contactBuffer )
					{
						if( data->m_contactBuffer->getSize() < nContacts )
						{
							BT_PROFILE("delete data->m_contactBuffer;");
							delete data->m_contactBuffer;
							data->m_contactBuffer = 0;
						}
					}

					if( data->m_contactBuffer == 0 )
					{
						data->m_contactBuffer = new adl::Buffer<Contact4>( data->m_device, nContacts );
					}

					// these two are used after the mapped scope below has ended
					adl::Buffer<Contact4>* contactNative = contactsIn;
					ParallelSolveData* nativeSolveData = (ParallelSolveData*)data->m_parallelSolveData;

					{
						ADLASSERT( data->m_device->m_type == adl::TYPE_CL );

						adl::Buffer<RigidBodyBase::Body>* bodyNative = adl::BufferUtils::map<adl::TYPE_CL, true>( data->m_device, bodyBuf );
						// deliberately shadows the outer contactNative for the lifetime of the mapping
						adl::Buffer<Contact4>* contactNative = adl::BufferUtils::map<adl::TYPE_CL, true>( data->m_device, contactsIn );

						const int sortAlignment = 512; // todo. get this out of sort
						if( csCfg.m_enableParallelSolve )
						{
							ParallelSolveData* nativeSolveData = (ParallelSolveData*)data->m_parallelSolveData;

							int sortSize = NEXTMULTIPLEOF( nContacts, sortAlignment );

							adl::Buffer<u32>* countsNative = nativeSolveData->m_numConstraints;//BufferUtils::map<TYPE_CL, false>( data->m_device, &countsHost );
							adl::Buffer<u32>* offsetsNative = nativeSolveData->m_offsets;//BufferUtils::map<TYPE_CL, false>( data->m_device, &offsetsHost );

							{	//	2. set cell idx
								BT_PROFILE("GPU set cell idx");
								struct CB
								{
									int m_nContacts;
									int m_staticIdx;
									float m_scale;
									int m_nSplit;
								};

								ADLASSERT( sortSize%64 == 0 );
								CB cdata;
								cdata.m_nContacts = nContacts;
								cdata.m_staticIdx = csCfg.m_staticIdx;
								cdata.m_scale = 1.f/(adl::SolverBase::N_OBJ_PER_SPLIT*csCfg.m_averageExtent);
								cdata.m_nSplit = adl::SolverBase::N_SPLIT;

								adl::Buffer<CB> constBuffer( data->m_device, 1, adl::BufferBase::BUFFER_CONST );
								adl::Launcher::BufferInfo bInfo[] = { adl::Launcher::BufferInfo( contactNative ), adl::Launcher::BufferInfo( bodyNative ), adl::Launcher::BufferInfo( data->m_sortDataBuffer ) };
								adl::Launcher launcher( data->m_device, data->m_setSortDataKernel );
								launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(adl::Launcher::BufferInfo) );
								launcher.setConst( constBuffer, cdata );
								launcher.launch1D( sortSize, 64 );
							}

							bool gpuRadixSort=true;
							if (gpuRadixSort)
							{	//	3. sort by cell idx
								BT_PROFILE("gpuRadixSort");
								adl::RadixSort32<adl::TYPE_CL>::execute( data->m_sort32, *data->m_sortDataBuffer, sortSize );
							} else
							{
								BT_PROFILE("cpu RadixSort");
								// round trip: read keys to the host, sort there, write them back
								adl::HostBuffer<adl::SortData> sortData(m_internalData->m_deviceHost,nContacts);
								data->m_sortDataBuffer->read(sortData.m_ptr,nContacts);
								adl::DeviceUtils::waitForCompletion( m_internalData->m_deviceCL);

								adl::RadixSort<adl::TYPE_HOST>::Data* sData = adl::RadixSort<adl::TYPE_HOST>::allocate( m_internalData->m_deviceHost, nContacts );
								adl::RadixSort<adl::TYPE_HOST>::execute( sData, sortData, nContacts );
								adl::RadixSort<adl::TYPE_HOST>::deallocate( sData );

								data->m_sortDataBuffer->write(sortData.m_ptr,nContacts);
								adl::DeviceUtils::waitForCompletion( m_internalData->m_deviceCL);
							}

							bool gpuBoundSearch=true;
							if (gpuBoundSearch)
							{	//	4. find entries
								BT_PROFILE("gpuBoundSearch");
								adl::BoundSearch<adl::TYPE_CL>::execute( data->m_search, *data->m_sortDataBuffer, nContacts, *countsNative,
									adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT, adl::BoundSearchBase::COUNT );

								adl::PrefixScan<adl::TYPE_CL>::execute( data->m_scan, *countsNative, *offsetsNative,
									adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT );
							} else
							{
								BT_PROFILE("cpuBoundSearch");
								adl::HostBuffer<adl::SortData> sortData(m_internalData->m_deviceHost,nContacts);
								data->m_sortDataBuffer->read(sortData.m_ptr,nContacts);
								adl::DeviceUtils::waitForCompletion( m_internalData->m_deviceCL);

								adl::HostBuffer<u32> n0( m_internalData->m_deviceHost, adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT );
								adl::HostBuffer<u32> offset0( m_internalData->m_deviceHost, adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT );
								for(int i=0; i<adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT; i++)
								{
									n0[i] = 0;
									offset0[i] = 0;
								}

								// histogram of contacts per cell
								for(int i=0; i<nContacts; i++)
								{
									int idx = sortData[i].m_key;
									assert(idx>=0);
									assert(idx<adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT);
									n0[idx]++;
								}

								//	scan (exclusive prefix sum of the counts)
								int sum = 0;
								for(int i=0; i<adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT; i++)
								{
									offset0[i] = sum;
									sum += n0[i];
								}

								countsNative->write(n0.m_ptr,adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT);
								offsetsNative->write(offset0.m_ptr,adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT);
								adl::DeviceUtils::waitForCompletion( data->m_device );
							}

							{	//	5. sort constraints by cellIdx
								{
									BT_PROFILE("gpu m_reorderContactKernel");
									adl::Buffer<int4> constBuffer( data->m_device, 1, adl::BufferBase::BUFFER_CONST );
									int4 cdata; cdata.x = nContacts;
									adl::Launcher::BufferInfo bInfo[] = { adl::Launcher::BufferInfo( contactNative ), adl::Launcher::BufferInfo( data->m_contactBuffer ), adl::Launcher::BufferInfo( data->m_sortDataBuffer ) };
									adl::Launcher launcher( data->m_device, data->m_reorderContactKernel );
									launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(adl::Launcher::BufferInfo) );
									launcher.setConst( constBuffer, cdata );
									launcher.launch1D( nContacts, 64 );
								}
							}
						}

						adl::BufferUtils::unmap<false>( bodyNative, bodyBuf );
						adl::BufferUtils::unmap<false>( contactNative, contactsIn );
					}

					adl::DeviceUtils::waitForCompletion( m_internalData->m_deviceCL);

					{
						// copy the reordered contacts from the scratch buffer back into the contact buffer
						BT_PROFILE("gpu m_copyConstraintKernel");
						adl::Buffer<int4> constBuffer( data->m_device, 1, adl::BufferBase::BUFFER_CONST );
						int4 cdata; cdata.x = nContacts;
						adl::Launcher::BufferInfo bInfo[] = { adl::Launcher::BufferInfo( data->m_contactBuffer ), adl::Launcher::BufferInfo( contactNative ) };
						adl::Launcher launcher( data->m_device, data->m_copyConstraintKernel );
						launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(adl::Launcher::BufferInfo) );
						launcher.setConst( constBuffer, cdata );
						launcher.launch1D( nContacts, 64 );
						adl::DeviceUtils::waitForCompletion( data->m_device );
					}

					bool compareGPU = false;
					if (gpuBatchContacts)
					{
						BT_PROFILE("gpu batchContacts");
						adl::Solver<adl::TYPE_CL>::batchContacts( data, contactNative, nContacts, nativeSolveData->m_numConstraints, nativeSolveData->m_offsets, csCfg.m_staticIdx );
					}
					else
					{
						BT_PROFILE("cpu batchContacts2");
						// NOTE(review): this drops whatever Solver<TYPE_HOST>::allocate stored here;
						// if allocate owns that memory it is leaked -- confirm.
						cpuSolverData->m_parallelSolveData = 0;//
						ParallelSolveData* dataGPU = (ParallelSolveData*)m_internalData->m_solverDataGPU->m_parallelSolveData;
						adl::Buffer<u32> numConstraints(cpuSolverData->m_device,adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT);
						adl::Buffer<u32> offsets(cpuSolverData->m_device,adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT);

						{
							BT_PROFILE("gpu->cpu read m_numConstraints");
							dataGPU->m_numConstraints->read(numConstraints.m_ptr,adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT);
							dataGPU->m_offsets->read(offsets.m_ptr,adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT);
							adl::DeviceUtils::waitForCompletion( data->m_device );
						}

						adl::Buffer<u32> gpunumConstraints(cpuSolverData->m_device,adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT);
						adl::Buffer<u32> gpuoffsets(cpuSolverData->m_device,adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT);

						if (compareGPU)
						{
							// run the GPU batching on copies so its result can be compared with the CPU one
							adl::Buffer<Contact4> contactNativeCopy (data->m_device,contactNative->getSize());
							contactNativeCopy.write(*contactNative,contactNative->getSize());
							adl::DeviceUtils::waitForCompletion( data->m_device );

							adl::Buffer<u32> tmpNumGPU(data->m_device,adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT);
							adl::Buffer<u32> tmpOffsetGPU(data->m_device,adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT);
							tmpNumGPU.write(numConstraints.m_ptr,adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT);
							tmpOffsetGPU.write(offsets.m_ptr,adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT);
							adl::DeviceUtils::waitForCompletion( data->m_device );

							BT_PROFILE("gpu batchContacts");
							//adl::Solver<adl::TYPE_CL>::batchContacts( data, contactNative, nContacts, nativeSolveData->m_numConstraints, nativeSolveData->m_offsets, csCfg.m_staticIdx );
							adl::Solver<adl::TYPE_CL>::batchContacts( data, &contactNativeCopy, nContacts, &tmpNumGPU, &tmpOffsetGPU, csCfg.m_staticIdx );
							adl::DeviceUtils::waitForCompletion( data->m_device );

							//compare now
							tmpNumGPU.read(gpunumConstraints,adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT);
							tmpOffsetGPU.read(gpuoffsets,adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT);
							adl::DeviceUtils::waitForCompletion( data->m_device );
						}

						{
							BT_PROFILE("cpu batchContacts2");
							batchContacts2( cpuSolverData, contactNative, nContacts, &numConstraints, &offsets, csCfg.m_staticIdx );
						}

						if (compareGPU)
						{
							adl::DeviceUtils::waitForCompletion( data->m_device );
							dataGPU->m_numConstraints->write(numConstraints.m_ptr,adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT);
							dataGPU->m_offsets->write(offsets.m_ptr,adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT);
							adl::DeviceUtils::waitForCompletion( data->m_device );

							for (int i=0;i<adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT;i++)
							{
								if (gpunumConstraints.m_ptr[i] != numConstraints.m_ptr[i])
								{
									printf("numConstraints error at %d, expected %d got %d\n",i,numConstraints.m_ptr[i],gpunumConstraints.m_ptr[i]);
								}
								if (gpuoffsets.m_ptr[i] != offsets.m_ptr[i])
								{
									// this message used to say "numConstraints", which made the two failures indistinguishable
									printf("offsets error at %d, expected %d got %d\n",i,offsets.m_ptr[i],gpuoffsets.m_ptr[i]);
								}
							}
						}
					}

					if (1)
					{
						BT_PROFILE("gpu convertToConstraints");
						adl::Solver<adl::TYPE_CL>::convertToConstraints( data, bodyBuf, shapeBuf, contactNative, contactCOut, additionalData, nContacts, csCfg );
						adl::DeviceUtils::waitForCompletion( data->m_device );
					}

					if (compareGPU)
					{
						adl::Buffer<Contact4> contactNativeCPU(cpuSolverData->m_device,contactNative->getSize());
						contactNative->read(contactNativeCPU,nContacts);
						adl::DeviceUtils::waitForCompletion( data->m_device );
						for (int i=0;i<nContacts;i++)
						{
							//if (contactNativeCopyCPU.m_ptr[i].m_frictionCoeffCmp !=45874)// contactNativeCPU.m_ptr[i].m_batchIdx != contactNativeCopyCPU.m_ptr[i].m_batchIdx)
							{
								//if (.m_friction!=45874
								//printf("not matching at %d, expected %d, got %d\n",i,contactNativeCPU.m_ptr[i].m_batchIdx,contactNativeCopyCPU.m_ptr[i].m_batchIdx);
							}
						}
					}
				}
			}
		} else
		{
			BT_PROFILE("GPU reorderConvertToConstraints");
			adl::Solver<adl::TYPE_CL>::reorderConvertToConstraints(
				m_internalData->m_solverDataGPU,
				m_internalData->m_bodyBufferGPU,
				m_internalData->m_inertiaBufferGPU,
				m_internalData->m_pBufContactOutGPU,
				m_internalData->m_contactCGPU,
				m_internalData->m_frictionCGPU,
				nContactOut,
				csCfg );
			adl::DeviceUtils::waitForCompletion( m_internalData->m_deviceCL );
		}

		if (1)
		{
			BT_PROFILE("GPU solveContactConstraint");
			m_internalData->m_solverDataGPU->m_nIterations = 5;

			adl::Solver<adl::TYPE_CL>::solveContactConstraint( m_internalData->m_solverDataGPU,
				m_internalData->m_bodyBufferGPU,
				m_internalData->m_inertiaBufferGPU,
				m_internalData->m_contactCGPU,
				0,
				nContactOut );

			adl::DeviceUtils::waitForCompletion( m_internalData->m_deviceCL );
		}

		if (cpuSolverData)
			adl::Solver<adl::TYPE_HOST>::deallocate( cpuSolverData );

		if (0)
		{
			BT_PROFILE("read body velocities back to CPU");
			//read body updated linear/angular velocities back to CPU
			m_internalData->m_bodyBufferGPU->read(
				m_internalData->m_bodyBufferCPU->m_ptr,numOfConvexRBodies);
			adl::DeviceUtils::waitForCompletion( m_internalData->m_deviceCL );
		}
	}
}

View File

@@ -0,0 +1,72 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Erwin Coumans
#ifndef GPU_NARROWPHASE_SOLVER_H
#define GPU_NARROWPHASE_SOLVER_H
// Capacity limits shared by the GPU pipeline. Every expression is parenthesized so the
// macros expand safely inside larger expressions (e.g. x / MAX_CONVEX_BODIES_CL).
//#define MAX_CONVEX_BODIES_CL (8*1024)
#define MAX_CONVEX_BODIES_CL (128*1024)
#define MAX_PAIRS_PER_BODY_CL 16
#define MAX_CONVEX_SHAPES_CL 8192
#define MAX_BROADPHASE_COLLISION_CL (MAX_CONVEX_BODIES_CL*MAX_PAIRS_PER_BODY_CL)
/*
#define MAX_CONVEX_BODIES_CL 1024
#define MAX_PAIRS_PER_BODY_CL 32
#define MAX_CONVEX_SHAPES_CL 8192
#define MAX_BROADPHASE_COLLISION_CL (MAX_CONVEX_BODIES_CL*MAX_PAIRS_PER_BODY_CL)
*/
// Forward declarations only: this header refers to these types through pointers,
// so their full definitions are not needed here.
namespace adl
{
struct DeviceCL;
};
// Opaque holder of the solver's internal state; only a pointer to it is stored in the class below.
struct CustomDispatchData;
#include "../basic_initialize/btOpenCLInclude.h"
/**
 * Front end of the GPU narrowphase and contact solver.
 *
 * Shapes and rigid bodies are registered up front; computeContactsAndSolver() then
 * consumes the broadphase pairs of a frame. The object owns its internal data
 * (released in the destructor) and is therefore not copyable.
 */
class btGpuNarrowphaseAndSolver
{
protected:
	CustomDispatchData* m_internalData;	// owned; deleted by the destructor
	int m_acceleratedCompanionShapeIndex;
	int m_planeBodyIndex;				// body index of the registered plane (set when a body is registered with a negative shape index)

private:
	// Copying would make two objects delete the same m_internalData; declared but never defined.
	btGpuNarrowphaseAndSolver(const btGpuNarrowphaseAndSolver&);
	btGpuNarrowphaseAndSolver& operator=(const btGpuNarrowphaseAndSolver&);

public:
	btGpuNarrowphaseAndSolver(adl::DeviceCL* deviceCL);
	virtual ~btGpuNarrowphaseAndSolver(void);

	// Registers a convex shape and returns its shape index.
	int registerShape(class ConvexHeightField* convexShape);

	// Registers a rigid body and returns its body index.
	//  shapeIndex  - index returned by registerShape(); a negative value registers the plane body
	//  mass        - 0 gives a zero inverse mass
	//  position    - 3 floats, orientation - 4 floats
	//  writeToGpu  - when false the caller uploads later through writeAllBodiesToGpu()
	int registerRigidBody(int shapeIndex, float mass, const float* position, const float* orientation, bool writeToGpu = true);

	// Uploads all registered bodies and their inertia data to the GPU buffers.
	void writeAllBodiesToGpu();

	//btBroadphasePair* GetPair(btBroadphasePairArray& pairArray, int idxBodyA, int idxBodyB);

	// Generates contacts for the given broadphase pairs and solves them.
	virtual void computeContactsAndSolver(cl_mem broadphasePairs, int numBroadphasePairs);

	// Raw handles of the GPU body / inertia buffers.
	cl_mem getBodiesGpu();
	cl_mem getBodyInertiasGpu();
};
#endif //GPU_NARROWPHASE_SOLVER_H

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,5 @@
-- Pull in the vendor-specific premake scripts for this demo.
include "AMD"
-- The Intel variant is currently switched off.
-- include "Intel"
include "NVIDIA"

View File

@@ -0,0 +1,64 @@
-- Premake project for the GPU rigid body pipeline demo, AMD OpenCL flavour.
-- The project is only generated when findOpenCL_AMD() reports success.
hasCL = findOpenCL_AMD()
if (hasCL) then
project "OpenCL_gpu_rigidbody_pipeline2_AMD"
initOpenCL_AMD()
language "C++"
kind "ConsoleApp"
targetdir "../../../bin"
initOpenGL()
initGlew()
includedirs {
"../../primitives",
"../../../../../src"
}
-- Sources are listed explicitly: the demo itself, the GPU narrowphase/solver,
-- the grid broadphase, and the handful of Bullet source files compiled directly into the demo.
-- NOTE(review): the entries below spell "btGridBroadphaseCL" while CLPhysicsDemo.cpp includes
-- "btGridBroadphaseCl.h"; one of the two spellings will fail on a case-sensitive file system -- confirm.
files {
"../main.cpp",
"../CLPhysicsDemo.cpp",
"../CLPhysicsDemo.h",
"../GLInstancingRenderer.cpp",
"../GLInstancingRenderer.h",
"../GlutRenderer.cpp",
"../GlutRenderer.h",
"../Win32OpenGLRenderManager.cpp",
"../Win32OpenGLRenderManager.h",
"../../gpu_rigidbody_pipeline/btConvexUtility.cpp",
"../../gpu_rigidbody_pipeline/btConvexUtility.h",
"../../gpu_rigidbody_pipeline/btGpuNarrowPhaseAndSolver.cpp",
"../../gpu_rigidbody_pipeline/btGpuNarrowPhaseAndSolver.h",
"../../../dynamics/basic_demo/ConvexHeightFieldShape.cpp",
"../../../dynamics/basic_demo/ConvexHeightFieldShape.h",
"../../../../../src/LinearMath/btConvexHullComputer.cpp",
"../../../../../src/LinearMath/btConvexHullComputer.h",
"../../broadphase_benchmark/findPairsOpenCL.cpp",
"../../broadphase_benchmark/findPairsOpenCL.h",
"../../broadphase_benchmark/btGridBroadphaseCL.cpp",
"../../broadphase_benchmark/btGridBroadphaseCL.h",
"../../3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.cpp",
"../../3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.h",
"../../3dGridBroadphase/Shared/btGpu3DGridBroadphase.cpp",
"../../3dGridBroadphase/Shared/btGpu3DGridBroadphase.h",
"../../../../../src/LinearMath/btAlignedAllocator.cpp",
"../../../../../src/LinearMath/btQuickprof.cpp",
"../../../../../src/LinearMath/btQuickprof.h",
"../../../../../src/BulletCollision/BroadphaseCollision/btBroadphaseProxy.cpp",
"../../../../../src/BulletCollision/BroadphaseCollision/btOverlappingPairCache.cpp",
"../../../../../src/BulletCollision/BroadphaseCollision/btSimpleBroadphase.cpp",
"../../basic_initialize/btOpenCLUtils.cpp",
"../../basic_initialize/btOpenCLUtils.h",
"../../opengl_interop/btOpenCLGLInteropBuffer.cpp",
"../../opengl_interop/btOpenCLGLInteropBuffer.h",
"../../opengl_interop/btStopwatch.cpp",
"../../opengl_interop/btStopwatch.h"
}
end

View File

@@ -0,0 +1,529 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Erwin Coumans
#include "OpenGLInclude.h"
#include "CLPhysicsDemo.h"
#include "LinearMath/btAlignedObjectArray.h"
#include "DemoSettings.h"
#include "../basic_initialize/btOpenCLUtils.h"
#include "../opengl_interop/btOpenCLGLInteropBuffer.h"
#include "../broadphase_benchmark/findPairsOpenCL.h"
#include "LinearMath/btVector3.h"
#include "LinearMath/btQuaternion.h"
#include "LinearMath/btMatrix3x3.h"
#include "../../opencl/gpu_rigidbody_pipeline/btGpuNarrowPhaseAndSolver.h"
#include "../../opencl/gpu_rigidbody_pipeline/btConvexUtility.h"
#include "../../dynamics/basic_demo/ConvexHeightFieldShape.h"
#include "../broadphase_benchmark/btGridBroadphaseCl.h"
#include "LinearMath/btQuickprof.h"
// Stringizes its argument, so that the #include of the OpenCL source below can yield a C string.
// NOTE(review): this presumes integrateKernel.cl wraps its content in MSTRINGIFY( ... ) -- confirm.
#define MSTRINGIFY(A) #A
// Source text of the integrate kernel, embedded at compile time through the #include that follows.
static char* interopKernelString =
#include "../broadphase_benchmark/integrateKernel.cl"
// Path of the same kernel file; passed to btOpenCLUtils::compileCLProgramFromString in CLPhysicsDemo::init.
#define INTEROPKERNEL_SRC_PATH "../../opencl/broadphase_benchmark/integrateKernel.cl"
// Kernel object created in CLPhysicsDemo::init from "integrateTransformsKernel".
cl_kernel g_integrateTransformsKernel;
// Gates the OpenCL part of CLPhysicsDemo::stepSimulation.
bool runOpenCLKernels = true;
// Created in CLPhysicsDemo::init, deleted in CLPhysicsDemo::cleanup.
btGpuNarrowphaseAndSolver* narrowphaseAndSolver = 0;
// Most recently registered convex shape (assigned in registerCollisionShape, deleted in cleanup).
ConvexHeightField* s_convexHeightField = 0 ;
// Created in CLPhysicsDemo::setupInterop, deleted in cleanup.
btOpenCLGLInteropBuffer* g_interopBuffer = 0;
// Defined in another translation unit.
extern GLuint cube_vbo;
extern int VBOsize;
// OpenCL view of the vertex buffer and the host pointer used when it is mapped (see stepSimulation).
cl_mem clBuffer=0;
char* hostPtr=0;
cl_bool blocking= CL_TRUE;
btFindPairsIO gFpIO;
// OpenCL context, command queue and device, filled in by InitCL.
cl_context g_cxMainContext;
cl_command_queue g_cqCommandQue;
cl_device_id g_device;
// Raw handles of the velocity / body-time buffers, assigned in CLPhysicsDemo::init.
cl_mem gLinVelMem=0;
cl_mem gAngVelMem=0;
cl_mem gBodyTimes=0;
#include <Adl/Adl.h>
// adl device wrapper around the context/queue/device above; created in init, deleted in cleanup.
adl::DeviceCL* g_deviceCL=0;
// Host-side mirror of btAABBCL -- the two layouts must be kept in sync.
// Three float coordinates followed by one unsigned word.
struct btAABBHost
{
	float fx, fy, fz;
	unsigned int uw;
};
struct InternalData
{
adl::Buffer<btVector3>* m_linVelBuf;
adl::Buffer<btVector3>* m_angVelBuf;
adl::Buffer<float>* m_bodyTimes;
bool m_useInterop;
btGridBroadphaseCl* m_Broadphase;
adl::Buffer<btAABBHost>* m_localShapeAABB;
btVector3* m_linVelHost;
btVector3* m_angVelHost;
float* m_bodyTimesHost;
InternalData():m_linVelBuf(0),m_angVelBuf(0),m_bodyTimes(0),m_useInterop(0),m_Broadphase(0)
{
m_linVelHost= new btVector3[MAX_CONVEX_BODIES_CL];
m_angVelHost = new btVector3[MAX_CONVEX_BODIES_CL];
m_bodyTimesHost = new float[MAX_CONVEX_BODIES_CL];
}
~InternalData()
{
delete[] m_linVelHost;
delete[] m_angVelHost;
delete[] m_bodyTimesHost;
}
};
void InitCL(int preferredDeviceIndex, int preferredPlatformIndex, bool useInterop)
{
void* glCtx=0;
void* glDC = 0;
#ifdef _WIN32
glCtx = wglGetCurrentContext();
#else //!_WIN32
GLXContext glCtx = glXGetCurrentContext();
#endif //!_WIN32
glDC = wglGetCurrentDC();
int ciErrNum = 0;
#ifdef CL_PLATFORM_INTEL
cl_device_type deviceType = CL_DEVICE_TYPE_ALL;
#else
cl_device_type deviceType = CL_DEVICE_TYPE_GPU;
#endif
if (useInterop)
{
g_cxMainContext = btOpenCLUtils::createContextFromType(deviceType, &ciErrNum, glCtx, glDC);
} else
{
g_cxMainContext = btOpenCLUtils::createContextFromType(deviceType, &ciErrNum, 0,0,preferredDeviceIndex, preferredPlatformIndex);
}
oclCHECKERROR(ciErrNum, CL_SUCCESS);
int numDev = btOpenCLUtils::getNumDevices(g_cxMainContext);
if (numDev>0)
{
g_device= btOpenCLUtils::getDevice(g_cxMainContext,0);
btOpenCLDeviceInfo clInfo;
btOpenCLUtils::getDeviceInfo(g_device,clInfo);
btOpenCLUtils::printDeviceInfo(g_device);
g_cqCommandQue = clCreateCommandQueue(g_cxMainContext, g_device, 0, &ciErrNum);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
}
}
// Starts with no shapes and no instances and allocates the private data block.
// The renderer argument is not used by this constructor.
CLPhysicsDemo::CLPhysicsDemo(Win32OpenGLWindow* renderer)
{
	(void)renderer;

	m_data = new InternalData;
	m_numPhysicsInstances = 0;
	m_numCollisionShapes = 0;
}
// Intentionally empty: nothing is released here. Resources are released by cleanup().
CLPhysicsDemo::~CLPhysicsDemo()
{
}
// Uploads all registered bodies to the GPU; does nothing while no solver exists.
void CLPhysicsDemo::writeBodiesToGpu()
{
	if (!narrowphaseAndSolver)
	{
		return;
	}
	narrowphaseAndSolver->writeAllBodiesToGpu();
}
/**
 * Builds a convex shape from a strided vertex array, registers it with the GPU
 * narrowphase and stores its local AABB in the per-shape AABB buffer.
 *
 * @param vertices       first vertex; each vertex starts with three floats (x,y,z).
 * @param strideInBytes  distance in bytes between consecutive vertices.
 * @param numVertices    number of vertices.
 * @param scaling        three floats, applied per axis to every vertex.
 * @return the shape index, or -1 when no narrowphase/solver exists.
 */
int CLPhysicsDemo::registerCollisionShape(const float* vertices, int strideInBytes, int numVertices, const float* scaling)
{
	btAlignedObjectArray<btVector3> verts;

	// walk the vertex data byte-wise so any stride works; the input is only read
	const unsigned char* vts = (const unsigned char*) vertices;
	for (int i=0;i<numVertices;i++)
	{
		const float* vertex = (const float*) &vts[i*strideInBytes];
		verts.push_back(btVector3(vertex[0]*scaling[0],vertex[1]*scaling[1],vertex[2]*scaling[2]));
	}

	btConvexUtility util;
	bool merge = true;
	util.initializePolyhedralFeatures(verts,merge);

	// one plane equation per face
	int numFaces= util.m_faces.size();
	float4* eqn = new float4[numFaces];
	for (int i=0;i<numFaces;i++)
	{
		eqn[i].x = util.m_faces[i].m_plane[0];
		eqn[i].y = util.m_faces[i].m_plane[1];
		eqn[i].z = util.m_faces[i].m_plane[2];
		eqn[i].w = util.m_faces[i].m_plane[3];
	}
	printf("numFaces = %d\n", numFaces);

	// NOTE(review): the previous s_convexHeightField is overwritten without being deleted;
	// the solver keeps the shape pointer it was given, so it cannot simply be deleted here.
	s_convexHeightField = new ConvexHeightField(eqn,numFaces);

	int shapeIndex=-1;
	if (narrowphaseAndSolver)
		shapeIndex = narrowphaseAndSolver->registerShape(s_convexHeightField);

	if (shapeIndex>=0)
	{
		// each shape occupies two consecutive entries: min at 2*index, max at 2*index+1
		const int maxSlot = shapeIndex*2+1;
		if (maxSlot < (int)m_data->m_localShapeAABB->getSize())
		{
			btAABBHost aabbMin, aabbMax;
			aabbMin.fx = s_convexHeightField->m_aabb.m_min.x;
			aabbMin.fy = s_convexHeightField->m_aabb.m_min.y;
			aabbMin.fz= s_convexHeightField->m_aabb.m_min.z;
			aabbMin.uw = shapeIndex;

			aabbMax.fx = s_convexHeightField->m_aabb.m_max.x;
			aabbMax.fy = s_convexHeightField->m_aabb.m_max.y;
			aabbMax.fz= s_convexHeightField->m_aabb.m_max.z;
			aabbMax.uw = shapeIndex;

			m_data->m_localShapeAABB->write(&aabbMin,1,shapeIndex*2);
			m_data->m_localShapeAABB->write(&aabbMax,1,shapeIndex*2+1);
			adl::DeviceUtils::waitForCompletion( g_deviceCL );
		} else
		{
			// writing would run past the end of the device buffer
			printf("registerCollisionShape: local AABB buffer too small for shape %d\n", shapeIndex);
		}
	}

	m_numCollisionShapes++;
	delete[] eqn;
	return shapeIndex;
}
/**
 * Registers one rigid body instance: a broadphase proxy (only for non-negative shape
 * indices) and a body in the GPU narrowphase/solver.
 *
 * @param mass                 body mass, forwarded to registerRigidBody.
 * @param position             three floats (x,y,z).
 * @param orientation          four floats, forwarded to registerRigidBody.
 * @param collisionShapeIndex  index returned by registerCollisionShape.
 * @param userPointer          stored with the broadphase proxy.
 * @return the body index, or -1 when no narrowphase/solver exists.
 */
int CLPhysicsDemo::registerPhysicsInstance(float mass, const float* position, const float* orientation, int collisionShapeIndex, void* userPointer)
{
	// unit-sized box centred on the body position
	// (the x coordinate used to be repeated for all three axes)
	btVector3 aabbMin(position[0],position[1],position[2]);
	btVector3 aabbMax = aabbMin;
	aabbMin -= btVector3(1.f,1.f,1.f);
	aabbMax += btVector3(1.f,1.f,1.f);

	if (collisionShapeIndex>=0)
	{
		m_data->m_Broadphase->createProxy(aabbMin,aabbMax,collisionShapeIndex,userPointer,1,1,0,0);//m_dispatcher);
	}

	// bodies are not uploaded one by one here; see writeBodiesToGpu()
	bool writeToGpu = false;
	int bodyIndex = -1;

	if (narrowphaseAndSolver)
		bodyIndex = narrowphaseAndSolver->registerRigidBody(collisionShapeIndex,mass,position,orientation,writeToGpu);

	m_numPhysicsInstances++;
	return bodyIndex;
}
/**
 * Sets up OpenCL, the device buffers, the narrowphase/solver, the grid broadphase and
 * the integrate kernel.
 *
 * @param preferredDevice    NOTE(review): currently not forwarded -- InitCL is called with -1.
 * @param preferredPlatform  NOTE(review): currently not forwarded -- InitCL is called with -1.
 * @param useInterop         forwarded to InitCL.
 */
void CLPhysicsDemo::init(int preferredDevice, int preferredPlatform, bool useInterop)
{
	InitCL(-1,-1,useInterop);

#define CUSTOM_CL_INITIALIZATION
#ifdef CUSTOM_CL_INITIALIZATION
	// wrap the context/queue/device created by InitCL instead of letting adl create its own
	g_deviceCL = new adl::DeviceCL();
	g_deviceCL->m_deviceIdx = g_device;
	g_deviceCL->m_context = g_cxMainContext;
	g_deviceCL->m_commandQueue = g_cqCommandQue;
	g_deviceCL->m_kernelManager = new adl::KernelManager;
#else
	DeviceUtils::Config cfg;
	cfg.m_type = DeviceUtils::Config::DEVICE_CPU;
	g_deviceCL = DeviceUtils::allocate( TYPE_CL, cfg );
#endif

	//adl::Solver<adl::TYPE_CL>::allocate(g_deviceCL->allocate(
	m_data->m_linVelBuf = new adl::Buffer<btVector3>(g_deviceCL,MAX_CONVEX_BODIES_CL);
	m_data->m_angVelBuf = new adl::Buffer<btVector3>(g_deviceCL,MAX_CONVEX_BODIES_CL);
	m_data->m_bodyTimes = new adl::Buffer<float>(g_deviceCL,MAX_CONVEX_BODIES_CL);

	// registerCollisionShape stores two entries (min and max) per shape,
	// so the buffer needs twice the shape capacity
	m_data->m_localShapeAABB = new adl::Buffer<btAABBHost>(g_deviceCL,2*MAX_CONVEX_SHAPES_CL);

	gLinVelMem = (cl_mem)m_data->m_linVelBuf->m_ptr;
	gAngVelMem = (cl_mem)m_data->m_angVelBuf->m_ptr;
	gBodyTimes = (cl_mem)m_data->m_bodyTimes->m_ptr;

	narrowphaseAndSolver = new btGpuNarrowphaseAndSolver(g_deviceCL);

	int maxObjects = btMax(256,MAX_CONVEX_BODIES_CL);
	int maxPairsSmallProxy = 32;
	btOverlappingPairCache* overlappingPairCache=0;

	m_data->m_Broadphase = new btGridBroadphaseCl(overlappingPairCache,btVector3(4.f, 4.f, 4.f), 128, 128, 128,maxObjects, maxObjects, maxPairsSmallProxy, 100.f, 128,
		g_cxMainContext ,g_device,g_cqCommandQue, g_deviceCL);

	// build the integrate kernel from the embedded source string
	cl_program prog = btOpenCLUtils::compileCLProgramFromString(g_cxMainContext,g_device,interopKernelString,0,"",INTEROPKERNEL_SRC_PATH);
	g_integrateTransformsKernel = btOpenCLUtils::compileCLKernelFromString(g_cxMainContext, g_device,interopKernelString, "integrateTransformsKernel" ,0,prog);

	initFindPairs(gFpIO, g_cxMainContext, g_device, g_cqCommandQue, MAX_CONVEX_BODIES_CL);
}
// Uploads the complete host-side velocity and body-time arrays, then waits for the device.
void CLPhysicsDemo::writeVelocitiesToGpu()
{
	const int entryCount = MAX_CONVEX_BODIES_CL;

	m_data->m_linVelBuf->write(m_data->m_linVelHost, entryCount);
	m_data->m_angVelBuf->write(m_data->m_angVelHost, entryCount);
	m_data->m_bodyTimes->write(m_data->m_bodyTimesHost, entryCount);

	adl::DeviceUtils::waitForCompletion( g_deviceCL );
}
void CLPhysicsDemo::setupInterop()
{
m_data->m_useInterop = true;
g_interopBuffer = new btOpenCLGLInteropBuffer(g_cxMainContext,g_cqCommandQue,cube_vbo);
clFinish(g_cqCommandQue);
}
// Releases everything created by init()/setupInterop()/registerCollisionShape().
// Every released global is reset to 0, so a stale pointer can neither be used
// (writeBodiesToGpu tests narrowphaseAndSolver) nor deleted twice by a second call.
void CLPhysicsDemo::cleanup()
{
	delete narrowphaseAndSolver;
	narrowphaseAndSolver = 0;

	if (m_data)
	{
		delete m_data->m_linVelBuf;
		delete m_data->m_angVelBuf;
		delete m_data->m_bodyTimes;
		delete m_data->m_localShapeAABB;
		delete m_data->m_Broadphase;
		delete m_data;
		m_data=0;
	}

	// g_deviceCL stays 0 when init() was never called
	if (g_deviceCL)
	{
		delete g_deviceCL->m_kernelManager;
		delete g_deviceCL;
		g_deviceCL=0;
	}

	delete g_interopBuffer;
	g_interopBuffer = 0;

	delete s_convexHeightField;
	s_convexHeightField = 0;
}
/// Advances the simulation by one step.
///
/// Sequence:
///  1. Make the instance buffer (cube_vbo) visible to OpenCL, either by
///     acquiring the shared GL object (interop mode) or by mapping the VBO and
///     copying it into a plain CL buffer.
///  2. If kernels are enabled and there is at least one physics instance:
///     set up AABBs, run the grid broadphase, colour the pairs, run the
///     narrowphase + solver, copy velocities back and integrate transforms.
///  3. Hand the buffer back to OpenGL (release, or read back + unmap).
///
/// Every OpenCL return code is checked with oclCHECKERROR. The original
/// overwrote the result of each clSetKernelArg call without checking it.
void CLPhysicsDemo::stepSimulation()
{
	BT_PROFILE("simulationLoop");

	{
		BT_PROFILE("glFinish");
		glFinish();
	}

	cl_int ciErrNum = CL_SUCCESS;

	if (m_data->m_useInterop)
	{
		clBuffer = g_interopBuffer->getCLBUffer();
		BT_PROFILE("clEnqueueAcquireGLObjects");
		ciErrNum = clEnqueueAcquireGLObjects(g_cqCommandQue, 1, &clBuffer, 0, 0, NULL);
		adl::DeviceUtils::waitForCompletion( g_deviceCL );
	}
	else
	{
		glBindBuffer(GL_ARRAY_BUFFER, cube_vbo);
		glFlush();

		BT_PROFILE("glMapBuffer and clEnqueueWriteBuffer");

		blocking = CL_TRUE;
		hostPtr = (char*)glMapBuffer( GL_ARRAY_BUFFER, GL_READ_WRITE);//GL_WRITE_ONLY

		// The staging CL buffer is created lazily on the first step and then reused.
		if (!clBuffer)
		{
			clBuffer = clCreateBuffer(g_cxMainContext, CL_MEM_READ_WRITE, VBOsize, 0, &ciErrNum);
		}
		adl::DeviceUtils::waitForCompletion( g_deviceCL );
		oclCHECKERROR(ciErrNum, CL_SUCCESS);

		ciErrNum = clEnqueueWriteBuffer(g_cqCommandQue,
		                                clBuffer,
		                                blocking,
		                                0,
		                                VBOsize,
		                                hostPtr, 0, 0, 0);
		adl::DeviceUtils::waitForCompletion( g_deviceCL );
	}
	oclCHECKERROR(ciErrNum, CL_SUCCESS);

	if (runOpenCLKernels && m_numPhysicsInstances)
	{
		gFpIO.m_numObjects = m_numPhysicsInstances;
		gFpIO.m_positionOffset = SHAPE_VERTEX_BUFFER_SIZE/4;
		gFpIO.m_clObjectsBuffer = clBuffer;
		gFpIO.m_dAABB = m_data->m_Broadphase->m_dAABB;
		gFpIO.m_dlocalShapeAABB = (cl_mem)m_data->m_localShapeAABB->m_ptr;
		gFpIO.m_numOverlap = 0;

		{
			BT_PROFILE("setupGpuAabbs");
			setupGpuAabbsFull(gFpIO, narrowphaseAndSolver->getBodiesGpu() );
		}

		{
			BT_PROFILE("calculateOverlappingPairs");
			m_data->m_Broadphase->calculateOverlappingPairs(0, m_numPhysicsInstances);
			gFpIO.m_dAllOverlappingPairs = m_data->m_Broadphase->m_dAllOverlappingPairs;
			gFpIO.m_numOverlap = m_data->m_Broadphase->m_numPrefixSum;
		}

		//printf("gFpIO.m_numOverlap = %d\n",gFpIO.m_numOverlap );
		if (gFpIO.m_numOverlap>=0 && gFpIO.m_numOverlap<MAX_BROADPHASE_COLLISION_CL)
		{
			colorPairsOpenCL(gFpIO);

			{
				//BT_PROFILE("setupBodies");
				if (narrowphaseAndSolver)
					setupBodies(gFpIO, gLinVelMem, gAngVelMem, narrowphaseAndSolver->getBodiesGpu(), narrowphaseAndSolver->getBodyInertiasGpu());
			}

			if (gFpIO.m_numOverlap)
			{
				BT_PROFILE("computeContactsAndSolver");
				if (narrowphaseAndSolver)
					narrowphaseAndSolver->computeContactsAndSolver(gFpIO.m_dAllOverlappingPairs, gFpIO.m_numOverlap);
			}

			{
				BT_PROFILE("copyBodyVelocities");
				if (narrowphaseAndSolver)
					copyBodyVelocities(gFpIO, gLinVelMem, gAngVelMem, narrowphaseAndSolver->getBodiesGpu(), narrowphaseAndSolver->getBodyInertiasGpu());
			}
		}
		else
		{
			printf("error, gFpIO.m_numOverlap = %d\n",gFpIO.m_numOverlap);
			btAssert(0);
		}

		{
			BT_PROFILE("integrateTransforms");
			if (runOpenCLKernels)
			{
				int numObjects = m_numPhysicsInstances;
				int offset = SHAPE_VERTEX_BUFFER_SIZE/4;

				// Check each argument on its own so a bad binding is reported at its source.
				ciErrNum = clSetKernelArg(g_integrateTransformsKernel, 0, sizeof(int), &offset);
				oclCHECKERROR(ciErrNum, CL_SUCCESS);
				ciErrNum = clSetKernelArg(g_integrateTransformsKernel, 1, sizeof(int), &numObjects);
				oclCHECKERROR(ciErrNum, CL_SUCCESS);
				ciErrNum = clSetKernelArg(g_integrateTransformsKernel, 2, sizeof(cl_mem), (void*)&clBuffer );
				oclCHECKERROR(ciErrNum, CL_SUCCESS);
				ciErrNum = clSetKernelArg(g_integrateTransformsKernel, 3, sizeof(cl_mem), (void*)&gLinVelMem);
				oclCHECKERROR(ciErrNum, CL_SUCCESS);
				ciErrNum = clSetKernelArg(g_integrateTransformsKernel, 4, sizeof(cl_mem), (void*)&gAngVelMem);
				oclCHECKERROR(ciErrNum, CL_SUCCESS);
				ciErrNum = clSetKernelArg(g_integrateTransformsKernel, 5, sizeof(cl_mem), (void*)&gBodyTimes);
				oclCHECKERROR(ciErrNum, CL_SUCCESS);

				// Round the global size up to the next multiple of the work-group size.
				// The previous "+ workGroupSize" form always launched a spare work-group
				// when the instance count was already a multiple of 64.
				const size_t workGroupSize = 64;
				const size_t numWorkItems = workGroupSize*(((size_t)numObjects + workGroupSize - 1) / workGroupSize);

				ciErrNum = clEnqueueNDRangeKernel(g_cqCommandQue, g_integrateTransformsKernel, 1, NULL, &numWorkItems, &workGroupSize, 0, 0, 0);
				oclCHECKERROR(ciErrNum, CL_SUCCESS);
			}
		}
	}

	if (m_data->m_useInterop)
	{
		BT_PROFILE("clEnqueueReleaseGLObjects");
		ciErrNum = clEnqueueReleaseGLObjects(g_cqCommandQue, 1, &clBuffer, 0, 0, 0);
		adl::DeviceUtils::waitForCompletion( g_deviceCL );
	}
	else
	{
		BT_PROFILE("clEnqueueReadBuffer clReleaseMemObject and glUnmapBuffer");
		ciErrNum = clEnqueueReadBuffer(g_cqCommandQue,
		                               clBuffer,
		                               blocking,
		                               0,
		                               VBOsize,
		                               hostPtr, 0, 0, 0);
		// clBuffer is deliberately kept alive; it is reused on the next step.
		//clReleaseMemObject(clBuffer);
		adl::DeviceUtils::waitForCompletion( g_deviceCL );
		glUnmapBuffer( GL_ARRAY_BUFFER);
		glFlush();
	}
	oclCHECKERROR(ciErrNum, CL_SUCCESS);

	if (runOpenCLKernels)
	{
		BT_PROFILE("clFinish");
		clFinish(g_cqCommandQue);
	}
}

View File

@@ -0,0 +1,53 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Erwin Coumans
#ifndef CL_PHYSICS_DEMO_H
#define CL_PHYSICS_DEMO_H
class Win32OpenGLWindow;
// Front end of the OpenCL rigid body demo. The private state lives behind
// m_data (struct InternalData, defined in the implementation file).
struct CLPhysicsDemo
{
// Window/renderer handed to the constructor (presumably not owned here; confirm in the .cpp).
Win32OpenGLWindow* m_renderer;
int m_numCollisionShapes;
// Number of physics instances; stepSimulation() passes it to the broadphase and
// the integrate kernel as the object count and skips the kernels when it is zero.
int m_numPhysicsInstances;
struct InternalData* m_data;
CLPhysicsDemo(Win32OpenGLWindow* renderer);
virtual ~CLPhysicsDemo();
//btOpenCLGLInteropBuffer* m_interopBuffer;
void init(int preferredDevice, int preferredPlatform, bool useInterop);
// Enables CL/GL interop: wraps the renderer's vertex buffer so stepSimulation()
// can acquire/release it instead of copying it.
void setupInterop();
int registerCollisionShape(const float* vertices, int strideInBytes, int numVertices, const float* scaling);
int registerPhysicsInstance(float mass, const float* position, const float* orientation, int collisionShapeIndex, void* userPointer);
// Uploads the host linear/angular velocity and body-time arrays to the device.
void writeVelocitiesToGpu();
void writeBodiesToGpu();
// Deletes the solver, the device buffers, the device and the interop buffer.
void cleanup();
// Runs one simulation step: broadphase, narrowphase + solver, integration.
void stepSimulation();
};
#endif//CL_PHYSICS_DEMO_H

View File

@@ -0,0 +1,24 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Erwin Coumans
#ifndef DEMO_SETTINGS_H
#define DEMO_SETTINGS_H
/* Size in bytes of the region at the start of the shared vertex buffer that
   holds shape vertex data. Parenthesized so the macro expands safely inside
   any expression (e.g. `x / SHAPE_VERTEX_BUFFER_SIZE`). */
#define SHAPE_VERTEX_BUFFER_SIZE (1024*1024)
/* The per-instance data (positions, orientations, colors, scales) starts at
   this byte offset in the shared vertex buffer. */
#define SHAPE_BUFFER_SIZE (SHAPE_VERTEX_BUFFER_SIZE)
#endif //DEMO_SETTINGS_H

View File

@@ -0,0 +1,861 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Erwin Coumans
#include "OpenGLInclude.h"
#include "GLInstancingRenderer.h"
#include <string.h>
#include "DemoSettings.h"
#include <stdio.h>
#include <assert.h>
#include "LinearMath/btVector3.h"
#include "LinearMath/btQuaternion.h"
#include "LinearMath/btQuickprof.h"
#include "LinearMath/btMatrix3x3.h"
#include "../../opencl/gpu_rigidbody_pipeline/btGpuNarrowphaseAndSolver.h"//for MAX_CONVEX_BODIES_CL
/// Per-shape render record: the GL objects for one registered shape plus the
/// bookkeeping that locates its vertices and instances in the shared buffer.
struct btGraphicsInstance
{
	GLuint m_cube_vao;             ///< vertex array object created in registerShape()
	GLuint m_index_vbo;            ///< element array buffer holding this shape's indices
	int    m_numIndices;
	int    m_numVertices;
	int    m_numGraphicsInstances; ///< number of instances registered for this shape
	int    m_instanceOffset;       ///< index of this shape's first instance in the instance arrays
	int    m_vertexArrayOffset;    ///< index of this shape's first vertex in the shape vertex region

	/// Handles and counts start at -1; instance count and both offsets start at 0.
	btGraphicsInstance()
		: m_cube_vao(-1),
		  m_index_vbo(-1),
		  m_numIndices(-1),
		  m_numVertices(-1),
		  m_numGraphicsInstances(0),
		  m_instanceOffset(0),
		  m_vertexArrayOffset(0)
	{
	}
};
// Projection selector read by updateCamera(): true uses glOrtho, false uses
// glFrustum + gluLookAt.
bool m_ortho = false;
// Screen dimensions; updateCamera() derives the aspect ratio from them.
int m_glutScreenWidth = 1024;
int m_glutScreenHeight = 768;
// Defined in another translation unit; not referenced in the visible part of this file.
extern int gShapeIndex;
btVector3 m_cameraPosition(0,0,0);//will be overridden by a position computed from azi/ele
// Point the camera looks at; updateCamera() also offsets the eye position by it.
btVector3 m_cameraTargetPosition(30,-5,-20);
// Distance from the target to the eye along the forward axis.
btScalar m_cameraDistance = 95;
btVector3 m_cameraUp(0,1,0);
// Camera azimuth and elevation in degrees (updateCamera() converts them to radians).
float m_azi=95.f;
float m_ele=15.f;
// Total size in bytes of cube_vbo; assigned in GLInstancingRenderer::InitShaders().
int VBOsize =0;
/// Host-side staging arrays for the per-instance attributes. They are allocated
/// in the GLInstancingRenderer constructor, filled by registerGraphicsInstance(),
/// copied into the vertex buffer by writeTransforms() and released by
/// CleanupShaders().
struct InternalDataRenderer
{
	GLfloat* m_instance_positions_ptr;  ///< 4 floats per instance (x,y,z,1)
	GLfloat* m_instance_quaternion_ptr; ///< 4 floats per instance (x,y,z,w)
	GLfloat* m_instance_colors_ptr;     ///< 4 floats per instance (r,g,b,a)
	GLfloat* m_instance_scale_ptr;      ///< 3 floats per instance (x,y,z)

	/// All arrays start out null.
	InternalDataRenderer()
		: m_instance_positions_ptr(0),
		  m_instance_quaternion_ptr(0),
		  m_instance_colors_ptr(0),
		  m_instance_scale_ptr(0)
	{
	}
};
// Program object returned by gltLoadShaderPair() in InitShaders().
static GLuint instancingShader; // The instancing renderer
// Vertex buffer holding the shape vertices followed by the per-instance data.
// It has external linkage: the physics demo binds/maps the same buffer.
GLuint cube_vbo;
// Texture created once in myinit().
static GLuint m_texturehandle;
// NOTE(review): not referenced in the visible part of this file.
static bool done = false;
// Uniform locations looked up in InitShaders().
static GLint angle_loc = 0;
static GLint ModelViewMatrix;
static GLint ProjectionMatrix;
/// Allocates the host-side staging arrays, each sized for MAX_CONVEX_BODIES_CL
/// instances. The arrays are released by CleanupShaders(), not by the destructor.
GLInstancingRenderer::GLInstancingRenderer()
{
	const int maxInstances = MAX_CONVEX_BODIES_CL;

	m_data = new InternalDataRenderer;

	// Position, orientation and colour use 4 floats per instance; scale uses 3.
	m_data->m_instance_positions_ptr  = new GLfloat[maxInstances*4];
	m_data->m_instance_quaternion_ptr = new GLfloat[maxInstances*4];
	m_data->m_instance_colors_ptr     = new GLfloat[maxInstances*4];
	m_data->m_instance_scale_ptr      = new GLfloat[maxInstances*3];
}
/// Frees the per-shape records created by registerShape() and the internal
/// data holder.
///
/// The staging arrays inside m_data are NOT freed here: CleanupShaders() owns
/// that step and must be called before destruction.
GLInstancingRenderer::~GLInstancingRenderer()
{
	// registerShape() allocates each record with new and nothing else frees it.
	for (int i = 0; i < m_graphicsInstances.size(); i++)
	{
		delete m_graphicsInstances[i];
	}
	m_graphicsInstances.clear();

	delete m_data;
	m_data = 0;
}
// Location of the "Diffuse" sampler uniform, looked up in InitShaders().
static GLint uniform_texture_diffuse = 0;
//used for dynamic loading from disk (default switched off)
#define MAX_SHADER_LENGTH 8192
// Scratch buffer for shader text loaded from disk.
// NOTE(review): not referenced in the visible part of this file.
static GLubyte shaderText[MAX_SHADER_LENGTH];
// GLSL 3.30 vertex shader for instanced drawing.
// Inputs 0, 3, 4 (position, uvcoords, vertexnormal) are per vertex; inputs 1, 2,
// 5, 6 (instance position, quaternion, colour, scale) advance once per instance
// (see the glVertexAttribDivisor calls in RenderScene()).
// For each vertex it scales the local position, rotates it by the instance
// quaternion, adds the instance position and applies ProjectionMatrix *
// ModelViewMatrix. The normal is rotated by the same quaternion and passed on
// together with a fixed light direction and ambient term.
static const char* vertexShader= \
"#version 330\n"
"precision highp float;\n"
"\n"
"\n"
"\n"
"layout (location = 0) in vec4 position;\n"
"layout (location = 1) in vec4 instance_position;\n"
"layout (location = 2) in vec4 instance_quaternion;\n"
"layout (location = 3) in vec2 uvcoords;\n"
"layout (location = 4) in vec3 vertexnormal;\n"
"layout (location = 5) in vec4 instance_color;\n"
"layout (location = 6) in vec3 instance_scale;\n"
"\n"
"\n"
"uniform float angle = 0.0;\n"
"uniform mat4 ModelViewMatrix;\n"
"uniform mat4 ProjectionMatrix;\n"
"\n"
"out Fragment\n"
"{\n"
" vec4 color;\n"
"} fragment;\n"
"\n"
"out Vert\n"
"{\n"
" vec2 texcoord;\n"
"} vert;\n"
"\n"
"\n"
"vec4 quatMul ( in vec4 q1, in vec4 q2 )\n"
"{\n"
" vec3 im = q1.w * q2.xyz + q1.xyz * q2.w + cross ( q1.xyz, q2.xyz );\n"
" vec4 dt = q1 * q2;\n"
" float re = dot ( dt, vec4 ( -1.0, -1.0, -1.0, 1.0 ) );\n"
" return vec4 ( im, re );\n"
"}\n"
"\n"
"vec4 quatFromAxisAngle(vec4 axis, in float angle)\n"
"{\n"
" float cah = cos(angle*0.5);\n"
" float sah = sin(angle*0.5);\n"
" float d = inversesqrt(dot(axis,axis));\n"
" vec4 q = vec4(axis.x*sah*d,axis.y*sah*d,axis.z*sah*d,cah);\n"
" return q;\n"
"}\n"
"//\n"
"// vector rotation via quaternion\n"
"//\n"
"vec4 quatRotate3 ( in vec3 p, in vec4 q )\n"
"{\n"
" vec4 temp = quatMul ( q, vec4 ( p, 0.0 ) );\n"
" return quatMul ( temp, vec4 ( -q.x, -q.y, -q.z, q.w ) );\n"
"}\n"
"vec4 quatRotate ( in vec4 p, in vec4 q )\n"
"{\n"
" vec4 temp = quatMul ( q, p );\n"
" return quatMul ( temp, vec4 ( -q.x, -q.y, -q.z, q.w ) );\n"
"}\n"
"\n"
"out vec3 lightDir,normal,ambient;\n"
"\n"
"void main(void)\n"
"{\n"
" vec4 q = instance_quaternion;\n"
" ambient = vec3(0.3,.3,0.3);\n"
" \n"
" \n"
" vec4 local_normal = (quatRotate3( vertexnormal,q));\n"
" vec3 light_pos = vec3(-0.8,1,-0.6);\n"
" normal = local_normal.xyz;\n"//normalize(ModelViewMatrix * local_normal).xyz;\n"
"\n"
" lightDir = normalize(light_pos);//gl_LightSource[0].position.xyz));\n"
"// lightDir = normalize(vec3(gl_LightSource[0].position));\n"
" \n"
" vec4 axis = vec4(1,1,1,0);\n"
" vec4 localcoord = quatRotate3( position.xyz*instance_scale,q);\n"
" vec4 vertexPos = ProjectionMatrix * ModelViewMatrix *(instance_position+localcoord);\n"
"\n"
" gl_Position = vertexPos;\n"
" \n"
" fragment.color = instance_color;\n"
" vert.texcoord = uvcoords;\n"
"}\n"
;
// GLSL 3.30 fragment shader paired with vertexShader.
// main() multiplies the instance colour by the "Diffuse" texture sample and
// shades it with a diffuse term (dot of light direction and normal, floored at
// 0.2) plus the ambient colour passed from the vertex stage.
// main_textured() is an unused alternative that outputs the raw texture sample.
static const char* fragmentShader= \
"#version 330\n"
"precision highp float;\n"
"\n"
"in Fragment\n"
"{\n"
" vec4 color;\n"
"} fragment;\n"
"\n"
"in Vert\n"
"{\n"
" vec2 texcoord;\n"
"} vert;\n"
"\n"
"uniform sampler2D Diffuse;\n"
"\n"
"in vec3 lightDir,normal,ambient;\n"
"\n"
"out vec4 color;\n"
"\n"
"void main_textured(void)\n"
"{\n"
" color = texture2D(Diffuse,vert.texcoord);//fragment.color;\n"
"}\n"
"\n"
"void main(void)\n"
"{\n"
" vec4 texel = fragment.color*texture2D(Diffuse,vert.texcoord);//fragment.color;\n"
" vec3 ct,cf;\n"
" float intensity,at,af;\n"
" intensity = max(dot(lightDir,normalize(normal)),.2);\n"
" cf = intensity*vec3(1.0,1.0,1.0)+ambient;"
" af = 1.0;\n"
" \n"
" ct = texel.rgb;\n"
" at = texel.a;\n"
" \n"
" color = vec4(ct * cf, at * af); \n"
"}\n"
;
// Load the shader from the source text
void gltLoadShaderSrc(const char *szShaderSrc, GLuint shader)
{
GLchar *fsStringPtr[1];
fsStringPtr[0] = (GLchar *)szShaderSrc;
glShaderSource(shader, 1, (const GLchar **)fsStringPtr, NULL);
}
GLuint gltLoadShaderPair(const char *szVertexProg, const char *szFragmentProg)
{
// Temporary Shader objects
GLuint hVertexShader;
GLuint hFragmentShader;
GLuint hReturn = 0;
GLint testVal;
// Create shader objects
hVertexShader = glCreateShader(GL_VERTEX_SHADER);
hFragmentShader = glCreateShader(GL_FRAGMENT_SHADER);
gltLoadShaderSrc(vertexShader, hVertexShader);
gltLoadShaderSrc(fragmentShader, hFragmentShader);
// Compile them
glCompileShader(hVertexShader);
glCompileShader(hFragmentShader);
// Check for errors
glGetShaderiv(hVertexShader, GL_COMPILE_STATUS, &testVal);
if(testVal == GL_FALSE)
{
char temp[256] = "";
glGetShaderInfoLog( hVertexShader, 256, NULL, temp);
fprintf( stderr, "Compile failed:\n%s\n", temp);
assert(0);
exit(0);
glDeleteShader(hVertexShader);
glDeleteShader(hFragmentShader);
return (GLuint)NULL;
}
glGetShaderiv(hFragmentShader, GL_COMPILE_STATUS, &testVal);
if(testVal == GL_FALSE)
{
char temp[256] = "";
glGetShaderInfoLog( hFragmentShader, 256, NULL, temp);
fprintf( stderr, "Compile failed:\n%s\n", temp);
assert(0);
exit(0);
glDeleteShader(hVertexShader);
glDeleteShader(hFragmentShader);
return (GLuint)NULL;
}
// Link them - assuming it works...
hReturn = glCreateProgram();
glAttachShader(hReturn, hVertexShader);
glAttachShader(hReturn, hFragmentShader);
glLinkProgram(hReturn);
// These are no longer needed
glDeleteShader(hVertexShader);
glDeleteShader(hFragmentShader);
// Make sure link worked too
glGetProgramiv(hReturn, GL_LINK_STATUS, &testVal);
if(testVal == GL_FALSE)
{
glDeleteProgram(hReturn);
return (GLuint)NULL;
}
return hReturn;
}
void GLInstancingRenderer::writeTransforms()
{
glBindBuffer(GL_ARRAY_BUFFER, cube_vbo);
glFlush();
char* orgBase = (char*)glMapBuffer( GL_ARRAY_BUFFER,GL_READ_WRITE);
int totalNumInstances= 0;
for (int k=0;k<m_graphicsInstances.size();k++)
{
btGraphicsInstance* gfxObj = m_graphicsInstances[k];
totalNumInstances+=gfxObj->m_numGraphicsInstances;
}
for (int k=0;k<m_graphicsInstances.size();k++)
{
//int k=0;
btGraphicsInstance* gfxObj = m_graphicsInstances[k];
int POSITION_BUFFER_SIZE = (totalNumInstances*sizeof(float)*4);
int ORIENTATION_BUFFER_SIZE = (totalNumInstances*sizeof(float)*4);
int COLOR_BUFFER_SIZE = (totalNumInstances*sizeof(float)*4);
int SCALE_BUFFER_SIZE = (totalNumInstances*sizeof(float)*3);
char* base = orgBase;
float* positions = (float*)(base+SHAPE_BUFFER_SIZE);
float* orientations = (float*)(base+SHAPE_BUFFER_SIZE + POSITION_BUFFER_SIZE);
float* colors= (float*)(base+SHAPE_BUFFER_SIZE + POSITION_BUFFER_SIZE+ORIENTATION_BUFFER_SIZE);
float* scaling= (float*)(base+SHAPE_BUFFER_SIZE + POSITION_BUFFER_SIZE+ORIENTATION_BUFFER_SIZE+COLOR_BUFFER_SIZE);
static int offset=0;
//offset++;
for (int i=0;i<gfxObj->m_numGraphicsInstances;i++)
{
int srcIndex=i+gfxObj->m_instanceOffset;
positions[srcIndex*4] = m_data->m_instance_positions_ptr[srcIndex*4];
positions[srcIndex*4+1] = m_data->m_instance_positions_ptr[srcIndex*4+1];
positions[srcIndex*4+2] = m_data->m_instance_positions_ptr[srcIndex*4+2];
positions[srcIndex*4+3] = m_data->m_instance_positions_ptr[srcIndex*4+3];
orientations[srcIndex*4]=m_data->m_instance_quaternion_ptr[srcIndex*4];
orientations[srcIndex*4+1]=m_data->m_instance_quaternion_ptr[srcIndex*4+1];
orientations[srcIndex*4+2]=m_data->m_instance_quaternion_ptr[srcIndex*4+2];
orientations[srcIndex*4+3]=m_data->m_instance_quaternion_ptr[srcIndex*4+3];
colors[srcIndex*4]=m_data->m_instance_colors_ptr[srcIndex*4];
colors[srcIndex*4+1]=m_data->m_instance_colors_ptr[srcIndex*4+1];
colors[srcIndex*4+2]=m_data->m_instance_colors_ptr[srcIndex*4+2];
colors[srcIndex*4+3]=m_data->m_instance_colors_ptr[srcIndex*4+3];
scaling[srcIndex*3]=m_data->m_instance_scale_ptr[srcIndex*3];
scaling[srcIndex*3+1]=m_data->m_instance_scale_ptr[srcIndex*3+1];
scaling[srcIndex*3+2]=m_data->m_instance_scale_ptr[srcIndex*3+2];
}
}
glUnmapBuffer( GL_ARRAY_BUFFER);
//if this glFinish is removed, the animation is not always working/blocks
//@todo: figure out why
glFlush();
}
int GLInstancingRenderer::registerGraphicsInstance(int shapeIndex, const float* position, const float* quaternion, const float* color, const float* scaling)
{
btGraphicsInstance* gfxObj = m_graphicsInstances[shapeIndex];
int index = gfxObj->m_numGraphicsInstances + gfxObj->m_instanceOffset;
m_data->m_instance_positions_ptr[index*4]=position[0];
m_data->m_instance_positions_ptr[index*4+1]=position[1];
m_data->m_instance_positions_ptr[index*4+2]=position[2];
m_data->m_instance_positions_ptr[index*4+3]=1;
m_data->m_instance_quaternion_ptr[index*4]=quaternion[0];
m_data->m_instance_quaternion_ptr[index*4+1]=quaternion[1];
m_data->m_instance_quaternion_ptr[index*4+2]=quaternion[2];
m_data->m_instance_quaternion_ptr[index*4+3]=quaternion[3];
m_data->m_instance_colors_ptr[index*4]=color[0];
m_data->m_instance_colors_ptr[index*4+1]=color[1];
m_data->m_instance_colors_ptr[index*4+2]=color[2];
m_data->m_instance_colors_ptr[index*4+3]=color[3];
m_data->m_instance_scale_ptr[index*3] = scaling[0];
m_data->m_instance_scale_ptr[index*3+1] = scaling[1];
m_data->m_instance_scale_ptr[index*3+2] = scaling[2];
gfxObj->m_numGraphicsInstances++;
return gfxObj->m_numGraphicsInstances;
}
int GLInstancingRenderer::registerShape(const float* vertices, int numvertices, const int* indices, int numIndices)
{
btGraphicsInstance* gfxObj = new btGraphicsInstance;
if (m_graphicsInstances.size())
{
btGraphicsInstance* prevObj = m_graphicsInstances[m_graphicsInstances.size()-1];
gfxObj->m_instanceOffset = prevObj->m_instanceOffset + prevObj->m_numGraphicsInstances;
gfxObj->m_vertexArrayOffset = prevObj->m_vertexArrayOffset + prevObj->m_numVertices;
} else
{
gfxObj->m_instanceOffset = 0;
}
m_graphicsInstances.push_back(gfxObj);
gfxObj->m_numIndices = numIndices;
gfxObj->m_numVertices = numvertices;
glBindBuffer(GL_ARRAY_BUFFER, cube_vbo);
char* dest= (char*)glMapBuffer( GL_ARRAY_BUFFER,GL_WRITE_ONLY);//GL_WRITE_ONLY
int vertexStrideInBytes = 9*sizeof(float);
int sz = numvertices*vertexStrideInBytes;
memcpy(dest+vertexStrideInBytes*gfxObj->m_vertexArrayOffset,vertices,sz);
glUnmapBuffer( GL_ARRAY_BUFFER);
glGenBuffers(1, &gfxObj->m_index_vbo);
glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, gfxObj->m_index_vbo);
int indexBufferSizeInBytes = gfxObj->m_numIndices*sizeof(int);
glBufferData(GL_ELEMENT_ARRAY_BUFFER, indexBufferSizeInBytes, NULL, GL_STATIC_DRAW);
glBufferSubData(GL_ELEMENT_ARRAY_BUFFER,0,indexBufferSizeInBytes,indices);
glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0);
glGenVertexArrays(1, &gfxObj->m_cube_vao);
glBindVertexArray(gfxObj->m_cube_vao);
glBindBuffer(GL_ARRAY_BUFFER, cube_vbo);
glBindVertexArray(0);
glBindBuffer(GL_ARRAY_BUFFER,0);
glBindVertexArray(0);
return m_graphicsInstances.size()-1;
}
void GLInstancingRenderer::InitShaders()
{
int POSITION_BUFFER_SIZE = (MAX_CONVEX_BODIES_CL*sizeof(float)*4);
int ORIENTATION_BUFFER_SIZE = (MAX_CONVEX_BODIES_CL*sizeof(float)*4);
int COLOR_BUFFER_SIZE = (MAX_CONVEX_BODIES_CL*sizeof(float)*4);
int SCALE_BUFFER_SIZE = (MAX_CONVEX_BODIES_CL*sizeof(float)*3);
instancingShader = gltLoadShaderPair(vertexShader,fragmentShader);
glLinkProgram(instancingShader);
glUseProgram(instancingShader);
angle_loc = glGetUniformLocation(instancingShader, "angle");
ModelViewMatrix = glGetUniformLocation(instancingShader, "ModelViewMatrix");
ProjectionMatrix = glGetUniformLocation(instancingShader, "ProjectionMatrix");
uniform_texture_diffuse = glGetUniformLocation(instancingShader, "Diffuse");
GLuint offset = 0;
glGenBuffers(1, &cube_vbo);
glBindBuffer(GL_ARRAY_BUFFER, cube_vbo);
int size = SHAPE_BUFFER_SIZE + POSITION_BUFFER_SIZE+ORIENTATION_BUFFER_SIZE+COLOR_BUFFER_SIZE+SCALE_BUFFER_SIZE;
VBOsize = size;
glBufferData(GL_ARRAY_BUFFER, size, 0, GL_DYNAMIC_DRAW);//GL_STATIC_DRAW);
glBindBuffer(GL_ARRAY_BUFFER,0);
glBindVertexArray(0);
glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0);
}
void myinit()
{
GLint err = glGetError();
// GLfloat light_ambient[] = { btScalar(0.2), btScalar(0.2), btScalar(0.2), btScalar(1.0) };
GLfloat light_ambient[] = { btScalar(1.0), btScalar(1.2), btScalar(0.2), btScalar(1.0) };
GLfloat light_diffuse[] = { btScalar(1.0), btScalar(1.0), btScalar(1.0), btScalar(1.0) };
GLfloat light_specular[] = { btScalar(1.0), btScalar(1.0), btScalar(1.0), btScalar(1.0 )};
/* light_position is NOT default value */
GLfloat light_position0[] = { btScalar(10000.0), btScalar(10000.0), btScalar(10000.0), btScalar(0.0 )};
GLfloat light_position1[] = { btScalar(-1.0), btScalar(-10.0), btScalar(-1.0), btScalar(0.0) };
glLightfv(GL_LIGHT0, GL_AMBIENT, light_ambient);
glLightfv(GL_LIGHT0, GL_DIFFUSE, light_diffuse);
glLightfv(GL_LIGHT0, GL_SPECULAR, light_specular);
glLightfv(GL_LIGHT0, GL_POSITION, light_position0);
glLightfv(GL_LIGHT1, GL_AMBIENT, light_ambient);
glLightfv(GL_LIGHT1, GL_DIFFUSE, light_diffuse);
glLightfv(GL_LIGHT1, GL_SPECULAR, light_specular);
glLightfv(GL_LIGHT1, GL_POSITION, light_position1);
glEnable(GL_LIGHTING);
glEnable(GL_LIGHT0);
glEnable(GL_LIGHT1);
// glShadeModel(GL_FLAT);//GL_SMOOTH);
glShadeModel(GL_SMOOTH);
glEnable(GL_DEPTH_TEST);
glDepthFunc(GL_LESS);
glClearColor(float(0.7),float(0.7),float(0.7),float(0));
glEnable(GL_LIGHTING);
glEnable(GL_LIGHT0);
static bool m_textureenabled = true;
static bool m_textureinitialized = false;
if(m_textureenabled)
{
if(!m_textureinitialized)
{
glActiveTexture(GL_TEXTURE0);
GLubyte* image=new GLubyte[256*256*3];
for(int y=0;y<256;++y)
{
const int t=y>>5;
GLubyte* pi=image+y*256*3;
for(int x=0;x<256;++x)
{
if (x<2||y<2||x>253||y>253)
{
pi[0]=0;
pi[1]=0;
pi[2]=0;
} else
{
pi[0]=255;
pi[1]=255;
pi[2]=255;
}
/*
const int s=x>>5;
const GLubyte b=180;
GLubyte c=b+((s+t&1)&1)*(255-b);
pi[0]=c;
pi[1]=c;
pi[2]=c;
*/
pi+=3;
}
}
glGenTextures(1,(GLuint*)&m_texturehandle);
glBindTexture(GL_TEXTURE_2D,m_texturehandle);
glTexEnvf(GL_TEXTURE_ENV,GL_TEXTURE_ENV_MODE,GL_MODULATE);
glTexParameterf(GL_TEXTURE_2D,GL_TEXTURE_MIN_FILTER,GL_LINEAR_MIPMAP_LINEAR);
glTexParameterf(GL_TEXTURE_2D,GL_TEXTURE_MAG_FILTER,GL_LINEAR);
glTexParameterf(GL_TEXTURE_2D,GL_TEXTURE_WRAP_S,GL_REPEAT);
glTexParameterf(GL_TEXTURE_2D,GL_TEXTURE_WRAP_T,GL_REPEAT);
gluBuild2DMipmaps(GL_TEXTURE_2D,3,256,256,GL_RGB,GL_UNSIGNED_BYTE,image);
delete[] image;
m_textureinitialized=true;
}
// glMatrixMode(GL_TEXTURE);
// glLoadIdentity();
// glMatrixMode(GL_MODELVIEW);
glEnable(GL_TEXTURE_2D);
glBindTexture(GL_TEXTURE_2D,m_texturehandle);
} else
{
glDisable(GL_TEXTURE_2D);
}
glEnable(GL_COLOR_MATERIAL);
err = glGetError();
assert(err==GL_NO_ERROR);
// glEnable(GL_CULL_FACE);
// glCullFace(GL_BACK);
}
void updateCamera()
{
btVector3 m_cameraUp(0,1,0);
int m_forwardAxis=2;
glMatrixMode(GL_PROJECTION);
glLoadIdentity();
//m_azi+=0.0f;
btScalar rele = m_ele * btScalar(0.01745329251994329547);// rads per deg
btScalar razi = m_azi * btScalar(0.01745329251994329547);// rads per deg
btQuaternion rot(m_cameraUp,razi);
btVector3 eyePos(0,0,0);
eyePos[m_forwardAxis] = -m_cameraDistance;
btVector3 forward(eyePos[0],eyePos[1],eyePos[2]);
if (forward.length2() < SIMD_EPSILON)
{
forward.setValue(1.f,0.f,0.f);
}
btVector3 right = m_cameraUp.cross(forward);
btQuaternion roll(right,-rele);
eyePos = btMatrix3x3(rot) * btMatrix3x3(roll) * eyePos;
m_cameraPosition[0] = eyePos.getX();
m_cameraPosition[1] = eyePos.getY();
m_cameraPosition[2] = eyePos.getZ();
m_cameraPosition += m_cameraTargetPosition;
float m_frustumZNear=1;
float m_frustumZFar=1000;
if (m_glutScreenWidth == 0 && m_glutScreenHeight == 0)
return;
float aspect;
btVector3 extents;
if (m_glutScreenWidth > m_glutScreenHeight)
{
aspect = m_glutScreenWidth / (float)m_glutScreenHeight;
extents.setValue(aspect * 1.0f, 1.0f,0);
} else
{
aspect = m_glutScreenHeight / (float)m_glutScreenWidth;
extents.setValue(1.0f, aspect*1.f,0);
}
if (m_ortho)
{
// reset matrix
glLoadIdentity();
extents *= m_cameraDistance;
btVector3 lower = m_cameraTargetPosition - extents;
btVector3 upper = m_cameraTargetPosition + extents;
glOrtho(lower.getX(), upper.getX(), lower.getY(), upper.getY(),-1000,1000);
glMatrixMode(GL_MODELVIEW);
glLoadIdentity();
} else
{
if (m_glutScreenWidth > m_glutScreenHeight)
{
glFrustum (-aspect * m_frustumZNear, aspect * m_frustumZNear, -m_frustumZNear, m_frustumZNear, m_frustumZNear, m_frustumZFar);
} else
{
glFrustum (-aspect * m_frustumZNear, aspect * m_frustumZNear, -m_frustumZNear, m_frustumZNear, m_frustumZNear, m_frustumZFar);
}
glMatrixMode(GL_MODELVIEW);
glLoadIdentity();
gluLookAt(m_cameraPosition[0], m_cameraPosition[1], m_cameraPosition[2],
m_cameraTargetPosition[0], m_cameraTargetPosition[1], m_cameraTargetPosition[2],
m_cameraUp.getX(),m_cameraUp.getY(),m_cameraUp.getZ());
}
}
void GLInstancingRenderer::RenderScene(void)
{
BT_PROFILE("GlutDisplayFunc");
myinit();
updateCamera();
//render coordinate system
glBegin(GL_LINES);
glColor3f(1,0,0);
glVertex3f(0,0,0);
glVertex3f(1,0,0);
glColor3f(0,1,0);
glVertex3f(0,0,0);
glVertex3f(0,1,0);
glColor3f(0,0,1);
glVertex3f(0,0,0);
glVertex3f(0,0,1);
glEnd();
//do a finish, to make sure timings are clean
// glFinish();
// glBindBuffer(GL_ARRAY_BUFFER, 0);
glBindBuffer(GL_ARRAY_BUFFER, cube_vbo);
glFlush();
//updatePos();
// simulationLoop();
//useCPU = true;
int totalNumInstances = 0;
for (int i=0;i<m_graphicsInstances.size();i++)
{
totalNumInstances+=m_graphicsInstances[i]->m_numGraphicsInstances;
}
int curOffset = 0;
for (int i=0;i<m_graphicsInstances.size();i++)
{
btGraphicsInstance* gfxObj = m_graphicsInstances[i];
int myOffset = gfxObj->m_instanceOffset*4*sizeof(float);
int POSITION_BUFFER_SIZE = (totalNumInstances*sizeof(float)*4);
int ORIENTATION_BUFFER_SIZE = (totalNumInstances*sizeof(float)*4);
int COLOR_BUFFER_SIZE = (totalNumInstances*sizeof(float)*4);
int SCALE_BUFFER_SIZE = (totalNumInstances*sizeof(float)*3);
glBindVertexArray(gfxObj->m_cube_vao);
int vertexStride = 9*sizeof(float);
int vertexBase = gfxObj->m_vertexArrayOffset*vertexStride;
glVertexAttribPointer(0, 4, GL_FLOAT, GL_FALSE, 9*sizeof(float), (GLvoid*)vertexBase);
glVertexAttribPointer(1, 4, GL_FLOAT, GL_FALSE, 0, (GLvoid *)(curOffset*4*sizeof(float)+SHAPE_BUFFER_SIZE));
glVertexAttribPointer(2, 4, GL_FLOAT, GL_FALSE, 0, (GLvoid *)(curOffset*4*sizeof(float)+SHAPE_BUFFER_SIZE+POSITION_BUFFER_SIZE));
int uvoffset = 7*sizeof(float)+vertexBase;
int normaloffset = 4*sizeof(float)+vertexBase;
glVertexAttribPointer(3, 2, GL_FLOAT, GL_FALSE, 9*sizeof(float), (GLvoid *)uvoffset);
glVertexAttribPointer(4, 3, GL_FLOAT, GL_FALSE, 9*sizeof(float), (GLvoid *)normaloffset);
glVertexAttribPointer(5, 4, GL_FLOAT, GL_FALSE, 0, (GLvoid *)(curOffset*4*sizeof(float)+SHAPE_BUFFER_SIZE+POSITION_BUFFER_SIZE+ORIENTATION_BUFFER_SIZE));
glVertexAttribPointer(6, 3, GL_FLOAT, GL_FALSE, 0, (GLvoid *)(curOffset*3*sizeof(float)+SHAPE_BUFFER_SIZE+POSITION_BUFFER_SIZE+ORIENTATION_BUFFER_SIZE+COLOR_BUFFER_SIZE));
glEnableVertexAttribArray(0);
glEnableVertexAttribArray(1);
glEnableVertexAttribArray(2);
glEnableVertexAttribArray(3);
glEnableVertexAttribArray(4);
glEnableVertexAttribArray(5);
glEnableVertexAttribArray(6);
glVertexAttribDivisor(0, 0);
glVertexAttribDivisor(1, 1);
glVertexAttribDivisor(2, 1);
glVertexAttribDivisor(3, 0);
glVertexAttribDivisor(4, 0);
glVertexAttribDivisor(5, 1);
glVertexAttribDivisor(6, 1);
glUseProgram(instancingShader);
glUniform1f(angle_loc, 0);
GLfloat pm[16];
glGetFloatv(GL_PROJECTION_MATRIX, pm);
glUniformMatrix4fv(ProjectionMatrix, 1, false, &pm[0]);
GLfloat mvm[16];
glGetFloatv(GL_MODELVIEW_MATRIX, mvm);
glUniformMatrix4fv(ModelViewMatrix, 1, false, &mvm[0]);
glUniform1i(uniform_texture_diffuse, 0);
glFlush();
if (gfxObj->m_numGraphicsInstances)
{
int indexCount = gfxObj->m_numIndices;
int indexOffset = 0;
glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, gfxObj->m_index_vbo);
{
BT_PROFILE("glDrawElementsInstanced");
glDrawElementsInstanced(GL_TRIANGLES, indexCount, GL_UNSIGNED_INT, (void*)indexOffset, gfxObj->m_numGraphicsInstances);
}
}
curOffset+= gfxObj->m_numGraphicsInstances;
}
glUseProgram(0);
glBindBuffer(GL_ARRAY_BUFFER,0);
glBindVertexArray(0);
GLint err = glGetError();
assert(err==GL_NO_ERROR);
}
/// Releases the host-side staging arrays allocated in the constructor.
///
/// Each pointer is reset to null after it is freed, so a repeated call is a
/// no-op instead of a double free.
void GLInstancingRenderer::CleanupShaders()
{
	if (!m_data)
		return;

	delete [] m_data->m_instance_positions_ptr;
	m_data->m_instance_positions_ptr = 0;

	delete [] m_data->m_instance_quaternion_ptr;
	m_data->m_instance_quaternion_ptr = 0;

	delete [] m_data->m_instance_colors_ptr;
	m_data->m_instance_colors_ptr = 0;

	delete [] m_data->m_instance_scale_ptr;
	m_data->m_instance_scale_ptr = 0;
}

View File

@@ -0,0 +1,45 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Erwin Coumans
#ifndef GL_INSTANCING_RENDERER_H
#define GL_INSTANCING_RENDERER_H
#include "LinearMath/btAlignedObjectArray.h"
// Instanced OpenGL renderer: shapes are registered once, instances of a shape
// are registered with a transform, colour and scale, and RenderScene() draws
// every instance of each shape with a single instanced draw call.
class GLInstancingRenderer
{
// One record per registered shape, in registration order.
btAlignedObjectArray<struct btGraphicsInstance*> m_graphicsInstances;
// Host-side staging arrays for the per-instance attributes.
struct InternalDataRenderer* m_data;
public:
GLInstancingRenderer();
virtual ~GLInstancingRenderer();
// Builds the shader program and allocates the shared vertex buffer.
void InitShaders();
void RenderScene(void);
// Frees the host-side staging arrays.
void CleanupShaders();
///vertices must be 9 floats per vertex: x,y,z,w, nx,ny,nz, u,v
///returns the index of the new shape
int registerShape(const float* vertices, int numvertices, const int* indices, int numIndices);
///position x,y,z, quaternion x,y,z,w, color r,g,b,a, scaling x,y,z
///returns the shape's instance count after the insertion
int registerGraphicsInstance(int shapeIndex, const float* position, const float* quaternion, const float* color, const float* scaling);
// Copies the staged per-instance attributes into the vertex buffer.
void writeTransforms();
};
#endif //GL_INSTANCING_RENDERER_H

View File

@@ -0,0 +1,107 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Erwin Coumans
#include <GL/glew.h>
#include "GlutRenderer.h"
#include <stdio.h>
///Storage for the global application pointer. GLUT callbacks are plain C functions without
///a user-data argument, so they reach the active renderer through this pointer; it is
///assigned in the GlutRenderer constructor.
GlutRenderer* GlutRenderer::gDemoApplication;
///Hands control to GLUT by calling glutMainLoop().
///NOTE(review): with classic GLUT this call never returns -- confirm for the GLUT flavour in use.
void GlutRenderer::runMainLoop()
{
glutMainLoop();
}
//Free-function trampolines registered with GLUT in GlutRenderer::initGraphics.
//Each one forwards its arguments to the matching virtual method of the
//renderer stored in GlutRenderer::gDemoApplication.

static void glutKeyboardCallback(unsigned char key, int x, int y)
{
	GlutRenderer::gDemoApplication->keyboardCallback(key, x, y);
}

static void glutKeyboardUpCallback(unsigned char key, int x, int y)
{
	GlutRenderer::gDemoApplication->keyboardUpCallback(key, x, y);
}

static void glutSpecialKeyboardCallback(int key, int x, int y)
{
	GlutRenderer::gDemoApplication->specialKeyboard(key, x, y);
}

static void glutSpecialKeyboardUpCallback(int key, int x, int y)
{
	GlutRenderer::gDemoApplication->specialKeyboardUp(key, x, y);
}

static void glutReshapeCallback(int w, int h)
{
	GlutRenderer::gDemoApplication->resize(w, h);
}

//idle handler: does not touch the renderer, it only asks GLUT for another redraw
static void glutIdleCallback()
{
	glutPostRedisplay();
}

static void glutMouseFuncCallback(int button, int state, int x, int y)
{
	GlutRenderer::gDemoApplication->mouseFunc(button, state, x, y);
}

static void glutMotionFuncCallback(int x, int y)
{
	GlutRenderer::gDemoApplication->mouseMotionFunc(x, y);
}

static void glutDisplayCallback(void)
{
	GlutRenderer::gDemoApplication->displayCallback();
}
///Reshape handler: records the new window size. It does not issue any GL calls itself.
void GlutRenderer::resize(int width, int height)
{
m_glutScreenWidth = width;
m_glutScreenHeight = height;
}
///Mouse button handler; intentionally empty in the base class (virtual, meant to be overridden).
void GlutRenderer::mouseFunc(int button, int state, int x, int y)
{
}
///Mouse motion handler (registered for both passive and active motion in initGraphics);
///intentionally empty in the base class.
void GlutRenderer::mouseMotionFunc(int x,int y)
{
}
void GlutRenderer::renderScene()
{
glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
glutSwapBuffers();
glutPostRedisplay();
GLint err = glGetError();
assert(err==GL_NO_ERROR);
}
///Display handler: advances the scene first, then draws it. Both steps are virtual.
void GlutRenderer::displayCallback()
{
updateScene();
renderScene();
}
///Initializes the GLUT library with the program arguments and publishes this object as
///the target of the global GLUT callbacks.
///NOTE(review): none of the data members (screen size, camera state) are initialized here;
///the screen size is only set by initGraphics()/resize().
GlutRenderer::GlutRenderer(int argc, char* argv[])
{
glutInit(&argc, argv);
gDemoApplication = this;
}
void GlutRenderer::initGraphics(int width, int height)
{
m_glutScreenWidth = width;
m_glutScreenHeight = height;
glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGBA);
glutInitWindowSize(m_glutScreenWidth, m_glutScreenHeight);
glutCreateWindow("GPU rigid body pipeline2");
glutKeyboardFunc(glutKeyboardCallback);
glutKeyboardUpFunc(glutKeyboardUpCallback);
glutSpecialFunc(glutSpecialKeyboardCallback);
glutSpecialUpFunc(glutSpecialKeyboardUpCallback);
glutReshapeFunc(glutReshapeCallback);
glutIdleFunc(glutIdleCallback);
glutMouseFunc(glutMouseFuncCallback);
glutPassiveMotionFunc(glutMotionFuncCallback);
glutMotionFunc(glutMotionFuncCallback);
glutDisplayFunc( glutDisplayCallback );
GLenum err = glewInit();
if (GLEW_OK != err)
{
printf("Error: %s\n", glewGetErrorString(err));
}
glClearColor(0.6f,0.6f,1.f,1.f);
}

View File

@@ -0,0 +1,59 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Erwin Coumans
#ifndef GLUT_RENDERER_H
#define GLUT_RENDERER_H
#include "btGlutInclude.h"
#include "LinearMath/btVector3.h"
///Minimal GLUT application base: owns the window size and camera parameters and exposes
///one virtual hook per GLUT callback. Derive from it and override the hooks you need.
struct GlutRenderer
{
	///target of the free GLUT callback functions; set by the constructor
	static GlutRenderer* gDemoApplication;

	///current window size in pixels, updated by initGraphics() and resize()
	int m_glutScreenWidth;
	int m_glutScreenHeight;

	///camera state; not initialized or used by the base class itself
	btVector3 m_cameraPosition;
	btVector3 m_cameraTargetPosition;
	btScalar m_cameraDistance;
	btVector3 m_cameraUp;
	float m_azimuth;
	float m_elevation;

	GlutRenderer(int argc, char* argv[]);

	///virtual so that derived renderers can be destroyed through a GlutRenderer pointer
	virtual ~GlutRenderer() {}

	virtual void initGraphics(int width, int height);
	virtual void cleanup() {}

	void runMainLoop();

	///per-frame hooks, called in this order by displayCallback()
	virtual void updateScene() {}
	virtual void renderScene();

	///input hooks; the defaults ignore the event
	virtual void keyboardCallback(unsigned char key, int x, int y) {}
	virtual void keyboardUpCallback(unsigned char key, int x, int y) {}
	virtual void specialKeyboard(int key, int x, int y) {}
	virtual void specialKeyboardUp(int key, int x, int y) {}

	virtual void resize(int w, int h);
	virtual void mouseFunc(int button, int state, int x, int y);
	virtual void mouseMotionFunc(int x,int y);
	virtual void displayCallback();
};
#endif //GLUT_RENDERER_H

View File

@@ -0,0 +1,64 @@
-- premake project for the GPU rigid body pipeline demo, NVIDIA OpenCL flavour.
-- The project is only generated when findOpenCL_NVIDIA() reports an SDK.
hasCL = findOpenCL_NVIDIA()
if (hasCL) then
project "OpenCL_gpu_rigidbody_pipeline2_NVIDIA"
initOpenCL_NVIDIA()
language "C++"
kind "ConsoleApp"
targetdir "../../../bin"
-- NOTE(review): GlutRenderer.cpp is compiled below but initGlut() is not called here,
-- unlike the integration projects -- confirm that GLUT still links.
initOpenGL()
initGlew()
includedirs {
"../../primitives",
"../../../bullet2"
}
-- demo sources plus the broadphase, narrowphase/solver, interop and Bullet 2 files they depend on
files {
"../main.cpp",
"../CLPhysicsDemo.cpp",
"../CLPhysicsDemo.h",
"../GLInstancingRenderer.cpp",
"../GLInstancingRenderer.h",
"../GlutRenderer.cpp",
"../GlutRenderer.h",
"../Win32OpenGLRenderManager.cpp",
"../Win32OpenGLRenderManager.h",
"../../gpu_rigidbody_pipeline/btConvexUtility.cpp",
"../../gpu_rigidbody_pipeline/btConvexUtility.h",
"../../gpu_rigidbody_pipeline/btGpuNarrowPhaseAndSolver.cpp",
"../../gpu_rigidbody_pipeline/btGpuNarrowPhaseAndSolver.h",
"../../../dynamics/basic_demo/ConvexHeightFieldShape.cpp",
"../../../dynamics/basic_demo/ConvexHeightFieldShape.h",
"../../../bullet2/LinearMath/btConvexHullComputer.cpp",
"../../../bullet2/LinearMath/btConvexHullComputer.h",
"../../broadphase_benchmark/findPairsOpenCL.cpp",
"../../broadphase_benchmark/findPairsOpenCL.h",
"../../broadphase_benchmark/btGridBroadphaseCL.cpp",
"../../broadphase_benchmark/btGridBroadphaseCL.h",
"../../3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.cpp",
"../../3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.h",
"../../3dGridBroadphase/Shared/btGpu3DGridBroadphase.cpp",
"../../3dGridBroadphase/Shared/btGpu3DGridBroadphase.h",
"../../../bullet2/LinearMath/btAlignedAllocator.cpp",
"../../../bullet2/LinearMath/btQuickprof.cpp",
"../../../bullet2/LinearMath/btQuickprof.h",
"../../../bullet2/BulletCollision/BroadphaseCollision/btBroadphaseProxy.cpp",
"../../../bullet2/BulletCollision/BroadphaseCollision/btOverlappingPairCache.cpp",
"../../../bullet2/BulletCollision/BroadphaseCollision/btSimpleBroadphase.cpp",
"../../basic_initialize/btOpenCLUtils.cpp",
"../../basic_initialize/btOpenCLUtils.h",
"../../opengl_interop/btOpenCLGLInteropBuffer.cpp",
"../../opengl_interop/btOpenCLGLInteropBuffer.h",
"../../opengl_interop/btStopwatch.cpp",
"../../opengl_interop/btStopwatch.h"
}
end

View File

@@ -0,0 +1,41 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Erwin Coumans
#ifndef __OPENGL_INCLUDE_H
#define __OPENGL_INCLUDE_H
#include <GL/glew.h>
//think different
#if defined(__APPLE__) && !defined (VMDMESA)
#include <OpenGL/OpenGL.h>
#include <OpenGL/gl.h>
#include <OpenGL/glu.h>
#else
#ifdef _WINDOWS
#include <windows.h>
#include <GL/gl.h>
#include <GL/glu.h>
#else
#include <GL/gl.h>
#endif //_WINDOWS
#endif //APPLE
#endif //__OPENGL_INCLUDE_H

View File

@@ -0,0 +1,210 @@
#ifndef SHAPE_DATA_H
#define SHAPE_DATA_H
static float barrel_vertices[] = {
0.0f,-0.5f,0.0f, 1.0f, 0.0f,-1.0f,0.0f, 0.5f, 0.5f,
0.282362f,-0.5f,-0.205148f, 1.0f, 0.0f,-1.0f,0.0f, 0.5f, 0.5f,
0.349018f,-0.5f,0.0f, 1.0f, 0.0f,-1.0f,0.0f, 0.5f, 0.5f,
0.107853f,-0.5f,-0.331936f, 1.0f, 0.0f,-1.0f,0.0f, 0.5f, 0.5f,
-0.107853f,-0.5f,-0.331936f, 1.0f, 0.0f,-1.0f,0.0f, 0.5f, 0.5f,
0.107853f,-0.5f,-0.331936f, 1.0f, 0.0f,-1.0f,0.0f, 0.5f, 0.5f,
-0.282362f,-0.5f,-0.205148f, 1.0f, 0.0f,-1.0f,0.0f, 0.5f, 0.5f,
-0.349018f,-0.5f,0.0f, 1.0f, 0.0f,-1.0f,0.0f, 0.5f, 0.5f,
-0.282362f,-0.5f,0.205148f, 1.0f, 0.0f,-1.0f,0.0f, 0.5f, 0.5f,
-0.107853f,-0.5f,0.331936f, 1.0f, 0.0f,-1.0f,0.0f, 0.5f, 0.5f,
0.107853f,-0.5f,0.331936f, 1.0f, 0.0f,-1.0f,0.0f, 0.5f, 0.5f,
0.282362f,-0.5f,0.205148f, 1.0f, 0.0f,-1.0f,0.0f, 0.5f, 0.5f,
0.0f,0.5f,0.0f, 1.0f, 0.0f,1.0f,0.0f, 0.5f, 0.5f,
0.349018f,0.5f,0.0f, 1.0f, 0.0f,1.0f,0.0f, 0.5f, 0.5f,
0.282362f,0.5f,-0.205148f, 1.0f, 0.0f,1.0f,0.0f, 0.5f, 0.5f,
0.107853f,0.5f,-0.331936f, 1.0f, 0.0f,1.0f,0.0f, 0.5f, 0.5f,
0.107853f,0.5f,-0.331936f, 1.0f, 0.0f,1.0f,0.0f, 0.5f, 0.5f,
-0.107853f,0.5f,-0.331936f, 1.0f, 0.0f,1.0f,0.0f, 0.5f, 0.5f,
-0.282362f,0.5f,-0.205148f, 1.0f, 0.0f,1.0f,0.0f, 0.5f, 0.5f,
-0.349018f,0.5f,0.0f, 1.0f, 0.0f,1.0f,0.0f, 0.5f, 0.5f,
-0.282362f,0.5f,0.205148f, 1.0f, 0.0f,1.0f,0.0f, 0.5f, 0.5f,
-0.107853f,0.5f,0.331936f, 1.0f, 0.0f,1.0f,0.0f, 0.5f, 0.5f,
0.107853f,0.5f,0.331936f, 1.0f, 0.0f,1.0f,0.0f, 0.5f, 0.5f,
0.282362f,0.5f,0.205148f, 1.0f, 0.0f,1.0f,0.0f, 0.5f, 0.5f,
0.349018f,-0.5f,0.0f, 1.0f, 0.957307f,-0.289072f,0.0f, 0.5f, 0.5f,
0.404509f,0.0f,-0.293893f, 1.0f, 0.809017f,0.0f,-0.587785f, 0.5f, 0.5f,
0.5f,0.0f,0.0f, 1.0f, 1.0f,0.0f,0.0f, 0.5f, 0.5f,
0.282362f,-0.5f,-0.205148f, 1.0f, 0.774478f,-0.289072f,-0.562691f, 0.5f, 0.5f,
0.154508f,0.0f,-0.475528f, 1.0f, 0.309017f,0.0f,-0.951057f, 0.5f, 0.5f,
0.107853f,-0.5f,-0.331936f, 1.0f, 0.295824f,-0.289072f,-0.910453f, 0.5f, 0.5f,
0.107853f,-0.5f,-0.331936f, 1.0f, 0.295824f,-0.289072f,-0.910453f, 0.5f, 0.5f,
-0.154509f,0.0f,-0.475528f, 1.0f, -0.309017f,0.0f,-0.951057f, 0.5f, 0.5f,
0.154508f,0.0f,-0.475528f, 1.0f, 0.309017f,0.0f,-0.951057f, 0.5f, 0.5f,
-0.107853f,-0.5f,-0.331936f, 1.0f, -0.295824f,-0.289072f,-0.910453f, 0.5f, 0.5f,
-0.404509f,0.0f,-0.293893f, 1.0f, -0.809017f,0.0f,-0.587785f, 0.5f, 0.5f,
-0.282362f,-0.5f,-0.205148f, 1.0f, -0.774478f,-0.289072f,-0.562691f, 0.5f, 0.5f,
-0.5f,0.0f,0.0f, 1.0f, -1.0f,0.0f,0.0f, 0.5f, 0.5f,
-0.349018f,-0.5f,0.0f, 1.0f, -0.957307f,-0.289072f,0.0f, 0.5f, 0.5f,
-0.404508f,0.0f,0.293893f, 1.0f, -0.809017f,0.0f,0.587785f, 0.5f, 0.5f,
-0.282362f,-0.5f,0.205148f, 1.0f, -0.774478f,-0.289072f,0.562691f, 0.5f, 0.5f,
-0.154509f,0.0f,0.475528f, 1.0f, -0.309017f,0.0f,0.951056f, 0.5f, 0.5f,
-0.107853f,-0.5f,0.331936f, 1.0f, -0.295824f,-0.289072f,0.910453f, 0.5f, 0.5f,
0.154509f,0.0f,0.475528f, 1.0f, 0.309017f,0.0f,0.951056f, 0.5f, 0.5f,
0.107853f,-0.5f,0.331936f, 1.0f, 0.295824f,-0.289072f,0.910453f, 0.5f, 0.5f,
0.404509f,0.0f,0.293892f, 1.0f, 0.809017f,0.0f,0.587785f, 0.5f, 0.5f,
0.282362f,-0.5f,0.205148f, 1.0f, 0.774478f,-0.289072f,0.562691f, 0.5f, 0.5f,
0.282362f,0.5f,-0.205148f, 1.0f, 0.774478f,0.289072f,-0.562691f, 0.5f, 0.5f,
0.349018f,0.5f,0.0f, 1.0f, 0.957307f,0.289072f,0.0f, 0.5f, 0.5f,
0.107853f,0.5f,-0.331936f, 1.0f, 0.295824f,0.289072f,-0.910453f, 0.5f, 0.5f,
-0.107853f,0.5f,-0.331936f, 1.0f, -0.295824f,0.289072f,-0.910453f, 0.5f, 0.5f,
0.107853f,0.5f,-0.331936f, 1.0f, 0.295824f,0.289072f,-0.910453f, 0.5f, 0.5f,
-0.282362f,0.5f,-0.205148f, 1.0f, -0.774478f,0.289072f,-0.562691f, 0.5f, 0.5f,
-0.349018f,0.5f,0.0f, 1.0f, -0.957307f,0.289072f,0.0f, 0.5f, 0.5f,
-0.282362f,0.5f,0.205148f, 1.0f, -0.774478f,0.289072f,0.562691f, 0.5f, 0.5f,
-0.107853f,0.5f,0.331936f, 1.0f, -0.295824f,0.289072f,0.910453f, 0.5f, 0.5f,
0.107853f,0.5f,0.331936f, 1.0f, 0.295824f,0.289072f,0.910453f, 0.5f, 0.5f,
0.282362f,0.5f,0.205148f, 1.0f, 0.774478f,0.289072f,0.562691f, 0.5f, 0.5f
};
static int barrel_indices[] = {
0,1,2,
0,3,1,
0,4,5,
0,6,4,
0,7,6,
0,8,7,
0,9,8,
0,10,9,
0,11,10,
0,2,11,
12,13,14,
12,14,15,
12,16,17,
12,17,18,
12,18,19,
12,19,20,
12,20,21,
12,21,22,
12,22,23,
12,23,13,
24,25,26,
24,27,25,
27,28,25,
27,29,28,
30,31,32,
30,33,31,
33,34,31,
33,35,34,
35,36,34,
35,37,36,
37,38,36,
37,39,38,
39,40,38,
39,41,40,
41,42,40,
41,43,42,
43,44,42,
43,45,44,
45,26,44,
45,24,26,
26,46,47,
26,25,46,
25,48,46,
25,28,48,
32,49,50,
32,31,49,
31,51,49,
31,34,51,
34,52,51,
34,36,52,
36,53,52,
36,38,53,
38,54,53,
38,40,54,
40,55,54,
40,42,55,
42,56,55,
42,44,56,
44,47,56,
44,26,47,
};
///position xyz, unused w, normal, uv
static const float cube_vertices[] =
{
-0.5f, -0.5f, 0.5f, 0.0f, 0,0,1, 0,0,//0
0.5f, -0.5f, 0.5f, 0.0f, 0,0,1, 1,0,//1
0.5f, 0.5f, 0.5f, 0.0f, 0,0,1, 1,1,//2
-0.5f, 0.5f, 0.5f, 0.0f, 0,0,1, 0,1 ,//3
-0.5f, -0.5f, -0.5f, 0.5f, 0,0,-1, 0,0,//4
0.5f, -0.5f, -0.5f, 0.5f, 0,0,-1, 1,0,//5
0.5f, 0.5f, -0.5f, 0.5f, 0,0,-1, 1,1,//6
-0.5f, 0.5f, -0.5f, 0.5f, 0,0,-1, 0,1,//7
-0.5f, -0.5f, -0.5f, 0.5f, -1,0,0, 0,0,
-0.5f, 0.5f, -0.5f, 0.5f, -1,0,0, 1,0,
-0.5f, 0.5f, 0.5f, 0.5f, -1,0,0, 1,1,
-0.5f, -0.5f, 0.5f, 0.5f, -1,0,0, 0,1,
0.5f, -0.5f, -0.5f, 0.5f, 1,0,0, 0,0,
0.5f, 0.5f, -0.5f, 0.5f, 1,0,0, 1,0,
0.5f, 0.5f, 0.5f, 0.5f, 1,0,0, 1,1,
0.5f, -0.5f, 0.5f, 0.5f, 1,0,0, 0,1,
-0.5f, -0.5f, -0.5f, 0.5f, 0,-1,0, 0,0,
-0.5f, -0.5f, 0.5f, 0.5f, 0,-1,0, 1,0,
0.5f, -0.5f, 0.5f, 0.5f, 0,-1,0, 1,1,
0.5f,-0.5f, -0.5f, 0.5f, 0,-1,0, 0,1,
-0.5f, 0.5f, -0.5f, 0.5f, 0,1,0, 0,0,
-0.5f, 0.5f, 0.5f, 0.5f, 0,1,0, 1,0,
0.5f, 0.5f, 0.5f, 0.5f, 0,1,0, 1,1,
0.5f,0.5f, -0.5f, 0.5f, 0,1,0, 0,1,
};
///position xyz, unused w, normal, uv
static const float cube_vertices2[] =
{
-1.5f, -0.5f, 0.5f, 0.0f, 0,0,1, 0,0,//0
1.5f, -0.5f, 0.5f, 0.0f, 0,0,1, 1,0,//1
1.5f, 0.5f, 0.5f, 0.0f, 0,0,1, 1,1,//2
-1.5f, 0.5f, 0.5f, 0.0f, 0,0,1, 0,1 ,//3
-1.5f, -0.5f, -0.5f, 0.5f, 0,0,-1, 0,0,//4
1.5f, -0.5f, -0.5f, 0.5f, 0,0,-1, 1,0,//5
1.5f, 0.5f, -0.5f, 0.5f, 0,0,-1, 1,1,//6
-1.5f, 0.5f, -0.5f, 0.5f, 0,0,-1, 0,1,//7
-1.5f, -0.5f, -0.5f, 0.5f, -1,0,0, 0,0,
-1.5f, 0.5f, -0.5f, 0.5f, -1,0,0, 1,0,
-1.5f, 0.5f, 0.5f, 0.5f, -1,0,0, 1,1,
-1.5f, -0.5f, 0.5f, 0.5f, -1,0,0, 0,1,
1.5f, -0.5f, -0.5f, 0.5f, 1,0,0, 0,0,
1.5f, 0.5f, -0.5f, 0.5f, 1,0,0, 1,0,
1.5f, 0.5f, 0.5f, 0.5f, 1,0,0, 1,1,
1.5f, -0.5f, 0.5f, 0.5f, 1,0,0, 0,1,
-1.5f, -0.5f, -0.5f, 0.5f, 0,-1,0, 0,0,
-1.5f, -0.5f, 0.5f, 0.5f, 0,-1,0, 1,0,
1.5f, -0.5f, 0.5f, 0.5f, 0,-1,0, 1,1,
1.5f, -0.5f, -0.5f, 0.5f, 0,-1,0, 0,1,
-1.5f, 0.5f, -0.5f, 0.5f, 0,1,0, 0,0,
-1.5f, 0.5f, 0.5f, 0.5f, 0,1,0, 1,0,
1.5f, 0.5f, 0.5f, 0.5f, 0,1,0, 1,1,
1.5f, 0.5f, -0.5f, 0.5f, 0,1,0, 0,1,
};
///Triangle list for the 24-vertex cubes above: two triangles per face, four vertices per face.
///Face labels follow the normals stored in cube_vertices.
static const int cube_indices[]=
{
0,1,2,0,2,3,//+Z face (vertices 0-3, normal 0,0,1)
4,5,6,4,6,7,//-Z face (vertices 4-7, normal 0,0,-1)
//-X face
8,9,10,8,10,11,
//+X face
12,13,14,12,14,15,
//-Y face
16,17,18,16,18,19,
//+Y face
20,21,22,20,22,23
};
#endif //SHAPE_DATA_H

View File

@@ -0,0 +1,465 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Erwin Coumans
#include "Win32OpenGLRenderManager.h"
#include <windows.h>
#include <GL/gl.h>
///File-local pointer to the window state, used by WndProc (a free function without access to
///the owning object). Set in the Win32OpenGLWindow constructor and cleared in its destructor,
///so it always refers to the most recently constructed window.
static InternalData2* sData = 0;
struct InternalData2
{
HWND m_hWnd;;
int m_width;
int m_height;
HDC m_hDC;
HGLRC m_hRC;
bool m_OpenGLInitialized;
int m_oldScreenWidth;
int m_oldHeight;
int m_oldBitsPerPel;
bool m_quit;
InternalData2()
{
m_hWnd = 0;
m_width = 0;
m_height = 0;
m_hDC = 0;
m_hRC = 0;
m_OpenGLInitialized = false;
m_oldScreenWidth = 0;
m_oldHeight = 0;
m_oldBitsPerPel = 0;
m_quit = false;
}
};
void Win32OpenGLWindow::enableOpenGL()
{
PIXELFORMATDESCRIPTOR pfd;
int format;
// get the device context (DC)
m_data->m_hDC = GetDC( m_data->m_hWnd );
// set the pixel format for the DC
ZeroMemory( &pfd, sizeof( pfd ) );
pfd.nSize = sizeof( pfd );
pfd.nVersion = 1;
pfd.dwFlags = PFD_DRAW_TO_WINDOW | PFD_SUPPORT_OPENGL | PFD_DOUBLEBUFFER;
pfd.iPixelType = PFD_TYPE_RGBA;
pfd.cColorBits = 24;
pfd.cDepthBits = 16;
pfd.cStencilBits = 1;
pfd.iLayerType = PFD_MAIN_PLANE;
format = ChoosePixelFormat( m_data->m_hDC, &pfd );
SetPixelFormat( m_data->m_hDC, format, &pfd );
// create and enable the render context (RC)
m_data->m_hRC = wglCreateContext( m_data->m_hDC );
wglMakeCurrent( m_data->m_hDC, m_data->m_hRC );
m_data->m_OpenGLInitialized = true;
}
void Win32OpenGLWindow::disableOpenGL()
{
m_data->m_OpenGLInitialized = false;
wglMakeCurrent( NULL, NULL );
wglDeleteContext( m_data->m_hRC );
ReleaseDC( m_data->m_hWnd, m_data->m_hDC );
}
///Drains the thread message queue without blocking.
///
///Every pending message is translated and dispatched (which ends up in WndProc). When
///WM_QUIT is encountered the quit flag is raised and pumping stops. The previous
///implementation handled a single message per call, so input and resize messages could
///pile up when more than one message arrived per rendered frame.
void Win32OpenGLWindow::pumpMessage()
{
	MSG msg;

	// PeekMessage returns immediately when the queue is empty
	while ( PeekMessage( &msg, NULL, 0, 0, PM_REMOVE ) )
	{
		if ( msg.message == WM_QUIT )
		{
			m_data->m_quit = true;
			break;
		}

		TranslateMessage( &msg );
		DispatchMessage( &msg );
	}
}
///Window procedure of the demo window class registered in Win32OpenGLWindow::init.
///
///Handles painting (validates the window only, rendering happens in the main loop),
///suppresses background erasing, posts WM_QUIT on destroy or when 'Q'/Escape is pressed
///and tracks size changes. Everything else goes to DefWindowProc.
///
///Size changes are written to the window state behind sData. That pointer is null when no
///Win32OpenGLWindow exists (for example after the owning object has been destroyed), so it is
///checked before use; the GL viewport is only touched once enableOpenGL() has created
///a context.
LRESULT CALLBACK WndProc(HWND hWnd, UINT message, WPARAM wParam, LPARAM lParam)
{
	switch (message)
	{
	case WM_PAINT:
		{
			// nothing is drawn here; Begin/EndPaint just validate the update region
			PAINTSTRUCT ps;
			BeginPaint(hWnd, &ps);
			EndPaint(hWnd, &ps);
		}
		return 0;

	case WM_ERASEBKGND:
		// the GL frame overwrites the whole client area, skip the GDI erase
		return 0;

	case WM_DESTROY:
		PostQuitMessage(0);
		return 0;

	case WM_KEYDOWN:
		if (wParam == 'Q' || wParam == VK_ESCAPE)
		{
			PostQuitMessage(0);
			return 0;
		}
		break;

	case WM_SIZE: // Size Action Has Taken Place
		switch (wParam)
		{
		case SIZE_MINIMIZED:
			// keep the previous size while minimized
			return 0;

		case SIZE_MAXIMIZED:
		case SIZE_RESTORED:
			if (sData)
			{
				sData->m_width = LOWORD (lParam);
				sData->m_height = HIWORD (lParam);
				if (sData->m_OpenGLInitialized)
				{
					glViewport(0, 0, sData->m_width, sData->m_height);
				}
			}
			return 0;

		default:
			break;
		}
		break;

	default:
		break;
	}

	return DefWindowProc(hWnd, message, wParam, lParam);
}
void Win32OpenGLWindow::init(int width,int height, bool fullscreen,int colorBitsPerPixel, void* windowHandle)
{
// get handle to exe file
HINSTANCE hInstance = GetModuleHandle(0);
// create the window if we need to and we do not use the null device
if (!windowHandle)
{
const char* ClassName = "DeviceWin32";
// Register Class
WNDCLASSEX wcex;
wcex.cbSize = sizeof(WNDCLASSEX);
wcex.style = CS_HREDRAW | CS_VREDRAW;
wcex.lpfnWndProc = WndProc;
wcex.cbClsExtra = 0;
wcex.cbWndExtra = 0;
wcex.hInstance = hInstance;
wcex.hIcon = LoadIcon( NULL, IDI_APPLICATION ); //(HICON)LoadImage(hInstance, "bullet_ico.ico", IMAGE_ICON, 0,0, LR_LOADTRANSPARENT);//LR_LOADFROMFILE);
wcex.hCursor = LoadCursor(NULL, IDC_ARROW);
wcex.hbrBackground = (HBRUSH)(COLOR_WINDOW+1);
wcex.lpszMenuName = 0;
wcex.lpszClassName = ClassName;
wcex.hIconSm = 0;
// if there is an icon, load it
wcex.hIcon = (HICON)LoadImage(hInstance, "irrlicht.ico", IMAGE_ICON, 0,0, LR_LOADFROMFILE);
RegisterClassEx(&wcex);
// calculate client size
RECT clientSize;
clientSize.top = 0;
clientSize.left = 0;
clientSize.right = width;
clientSize.bottom = height;
DWORD style = WS_POPUP;
if (!fullscreen)
style = WS_SYSMENU | WS_BORDER | WS_CAPTION | WS_CLIPCHILDREN | WS_CLIPSIBLINGS | WS_MINIMIZEBOX | WS_MAXIMIZEBOX | WS_SIZEBOX;
AdjustWindowRect(&clientSize, style, FALSE);
m_data->m_width = clientSize.right - clientSize.left;
m_data->m_height = clientSize.bottom - clientSize.top;
int windowLeft = (GetSystemMetrics(SM_CXSCREEN) - m_data->m_width) / 2;
int windowTop = (GetSystemMetrics(SM_CYSCREEN) - m_data->m_height) / 2;
if (fullscreen)
{
windowLeft = 0;
windowTop = 0;
}
// create window
m_data->m_hWnd = CreateWindow( ClassName, "", style, windowLeft, windowTop,
m_data->m_width, m_data->m_height, NULL, NULL, hInstance, NULL);
ShowWindow(m_data->m_hWnd, SW_SHOW);
UpdateWindow(m_data->m_hWnd);
MoveWindow(m_data->m_hWnd, windowLeft, windowTop, m_data->m_width, m_data->m_height, TRUE);
}
else if (windowHandle)
{
// attach external window
m_data->m_hWnd = static_cast<HWND>(windowHandle);
RECT r;
GetWindowRect(m_data->m_hWnd, &r);
m_data->m_width = r.right - r.left;
m_data->m_height = r.bottom - r.top;
//sFullScreen = false;
//sExternalWindow = true;
}
if (fullscreen)
{
DEVMODE dm;
memset(&dm, 0, sizeof(dm));
dm.dmSize = sizeof(dm);
// use default values from current setting
EnumDisplaySettings(NULL, ENUM_CURRENT_SETTINGS, &dm);
m_data->m_oldScreenWidth = dm.dmPelsWidth;
m_data->m_oldHeight = dm.dmPelsHeight;
m_data->m_oldBitsPerPel = dm.dmBitsPerPel;
dm.dmPelsWidth = width;
dm.dmPelsHeight = height;
if (colorBitsPerPixel)
{
dm.dmBitsPerPel = colorBitsPerPixel;
}
dm.dmFields = DM_BITSPERPEL | DM_PELSWIDTH | DM_PELSHEIGHT | DM_DISPLAYFREQUENCY;
LONG res = ChangeDisplaySettings(&dm, CDS_FULLSCREEN);
if (res != DISP_CHANGE_SUCCESSFUL)
{ // try again without forcing display frequency
dm.dmFields = DM_BITSPERPEL | DM_PELSWIDTH | DM_PELSHEIGHT;
res = ChangeDisplaySettings(&dm, CDS_FULLSCREEN);
}
}
//VideoDriver = video::createOpenGLDriver(CreationParams, FileSystem, this);
enableOpenGL();
const wchar_t* text= L"OpenCL rigid body demo";
DWORD dwResult;
#ifdef _WIN64
SetWindowTextW(m_data->m_hWnd, text);
#else
SendMessageTimeoutW(m_data->m_hWnd, WM_SETTEXT, 0,
reinterpret_cast<LPARAM>(text),
SMTO_ABORTIFHUNG, 2000, &dwResult);
#endif
}
void Win32OpenGLWindow::switchFullScreen(bool fullscreen,int width,int height,int colorBitsPerPixel)
{
LONG res;
DEVMODE dm;
memset(&dm, 0, sizeof(dm));
dm.dmSize = sizeof(dm);
// use default values from current setting
EnumDisplaySettings(NULL, ENUM_CURRENT_SETTINGS, &dm);
dm.dmFields = DM_BITSPERPEL | DM_PELSWIDTH | DM_PELSHEIGHT | DM_DISPLAYFREQUENCY;
if (fullscreen && !m_data->m_oldScreenWidth)
{
m_data->m_oldScreenWidth = dm.dmPelsWidth;
m_data->m_oldHeight = dm.dmPelsHeight;
m_data->m_oldBitsPerPel = dm.dmBitsPerPel;
if (width && height)
{
dm.dmPelsWidth = width;
dm.dmPelsHeight = height;
} else
{
dm.dmPelsWidth = m_data->m_width;
dm.dmPelsHeight = m_data->m_height;
}
if (colorBitsPerPixel)
{
dm.dmBitsPerPel = colorBitsPerPixel;
}
} else
{
if (m_data->m_oldScreenWidth)
{
dm.dmPelsWidth = m_data->m_oldScreenWidth;
dm.dmPelsHeight= m_data->m_oldHeight;
dm.dmBitsPerPel = m_data->m_oldBitsPerPel;
}
}
if (fullscreen)
{
res = ChangeDisplaySettings(&dm, CDS_FULLSCREEN);
} else
{
res = ChangeDisplaySettings(&dm, 0);
}
}
///Allocates the private window state and publishes it to WndProc through sData.
///No window is created here; that happens in init().
Win32OpenGLWindow::Win32OpenGLWindow()
{
m_data = new InternalData2();
sData = m_data;
}
///Frees the private window state and clears the pointer used by WndProc.
///It does not destroy the window or the GL context; call exit() first.
///NOTE(review): sData is cleared unconditionally, even if it refers to another, later constructed window.
Win32OpenGLWindow::~Win32OpenGLWindow()
{
delete m_data;
sData = 0;
}
///Convenience overload: creates a 640x480 windowed (non-fullscreen) window.
void Win32OpenGLWindow::init()
{
init(640,480,false);
}
///Tears down OpenGL first (the context is bound to the window), then destroys the window.
///DestroyWindow makes WndProc receive WM_DESTROY, which posts WM_QUIT.
void Win32OpenGLWindow::exit()
{
disableOpenGL();
DestroyWindow(this->m_data->m_hWnd);
}
///Begins a frame: pumps window messages, clears colour/depth/stencil, enables depth
///testing and loads a perspective projection whose shorter side spans [-1,1] at the
///near plane (near 1, far 10000). The modelview matrix is reset to identity.
void Win32OpenGLWindow::startRendering()
{
	pumpMessage();

	glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT); //clear buffers
	glEnable(GL_DEPTH_TEST);

	// ratio of the longer window side to the shorter one
	const bool widerThanTall = m_data->m_width > m_data->m_height;
	const float aspect = widerThanTall
		? (float)m_data->m_width / (float)m_data->m_height
		: (float)m_data->m_height / (float)m_data->m_width;

	glMatrixMode(GL_PROJECTION);
	glLoadIdentity();
	if (widerThanTall)
	{
		// stretch the frustum horizontally
		glFrustum (-aspect, aspect, -1.0, 1.0, 1.0, 10000.0);
	}
	else
	{
		// stretch the frustum vertically
		glFrustum (-1.0, 1.0, -aspect, aspect, 1.0, 10000.0);
	}

	glMatrixMode(GL_MODELVIEW);
	glLoadIdentity();
}
///Intentionally empty: this window class draws nothing itself (virtual hook).
void Win32OpenGLWindow::renderAllObjects()
{
}
///Ends a frame by presenting the back buffer of the window device context.
void Win32OpenGLWindow::endRendering()
{
SwapBuffers( m_data->m_hDC );
}
///Stub: no timer is implemented, the function always returns 0.
float Win32OpenGLWindow::getTimeInSeconds()
{
return 0.f;
}
///Stub: the message and its position are ignored.
void Win32OpenGLWindow::setDebugMessage(int x,int y,const char* message)
{
}
///Returns true once pumpMessage() has seen WM_QUIT (window closed, or 'Q'/Escape pressed).
bool Win32OpenGLWindow::requestedExit()
{
return m_data->m_quit;
}

View File

@@ -0,0 +1,70 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Erwin Coumans
#ifndef _WIN32_OPENGL_RENDER_MANAGER_H
#define _WIN32_OPENGL_RENDER_MANAGER_H
///Declares `name` as a pointer to a distinct dummy struct (name##__), giving an opaque
///handle type that cannot be mixed up with other handle types at compile time.
#define RM_DECLARE_HANDLE(name) typedef struct name##__ { int unused; } *name
///handle type declared with the macro above; not referenced elsewhere in this header
RM_DECLARE_HANDLE(RenderObjectHandle);
struct InternalData2;
///Native Win32 window with an attached OpenGL (WGL) context.
///Typical use, as in the demo's main(): init(), then per frame startRendering() /
///draw / endRendering() until requestedExit() returns true, then exit().
class Win32OpenGLWindow
{
protected:
///private Win32/WGL state, allocated in the constructor and freed in the destructor
struct InternalData2* m_data;
///creates the WGL context for the window and makes it current
void enableOpenGL();
///deletes the WGL context and releases the device context
void disableOpenGL();
///non-blocking processing of pending window messages
void pumpMessage();
public:
Win32OpenGLWindow();
virtual ~Win32OpenGLWindow();
virtual void init(); //default implementation uses default settings for width/height/fullscreen
///creates the window (or attaches to windowHandle when non-null), optionally switches to fullscreen, and initializes OpenGL
void init(int width,int height, bool fullscreen=false, int colorBitsPerPixel=0, void* windowHandle=0);
///changes the display mode; width/height of 0 mean "use the current window size"
void switchFullScreen(bool fullscreen,int width=0,int height=0,int colorBitsPerPixel=0);
///destroys the GL context and the window
virtual void exit();
///pumps messages, clears the buffers and sets up the projection for a new frame
virtual void startRendering();
///empty in this class
virtual void renderAllObjects();
///swaps the back buffer to the screen
virtual void endRendering();
///not implemented in this class: always returns 0
virtual float getTimeInSeconds();
///not implemented in this class: does nothing
virtual void setDebugMessage(int x,int y,const char* message);
///true once a quit message has been processed
virtual bool requestedExit();
};
#endif //_WIN32_OPENGL_RENDER_MANAGER_H

View File

@@ -0,0 +1,224 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Erwin Coumans
//
//#include "vld.h"
#include <GL/glew.h>
#include "GLInstancingRenderer.h"
#include "GLInstancingRenderer.h"
#include "../opengl_interop/btOpenCLGLInteropBuffer.h"
#include "Win32OpenGLRenderManager.h"
#include "CLPhysicsDemo.h"
#include "../broadphase_benchmark/btGridBroadphaseCl.h"
#include "../../opencl/gpu_rigidbody_pipeline/btGpuNarrowPhaseAndSolver.h"
#include "ShapeData.h"
#include "LinearMath/btQuickprof.h"
///Scene dimensions: createScene() places NUM_OBJECTS_X * (NUM_OBJECTS_Y/2) * NUM_OBJECTS_Z
///barrels and the same number of cubes.
int NUM_OBJECTS_X = 32;
int NUM_OBJECTS_Y = 24;
int NUM_OBJECTS_Z = 32;
///spacing factors applied to the grid coordinates along each axis
float X_GAP = 2.f;
float Y_GAP = 2.f;
float Z_GAP = 2.f;
///defined in another translation unit; main() prints it once as "numPairsOut (culled)"
extern int numPairsOut;
///Populates the renderer and the physics simulation with the demo scene.
///
///Registers a barrel and a cube shape (graphics and collision), then creates two grids of
///NUM_OBJECTS_X x NUM_OBJECTS_Y/2 x NUM_OBJECTS_Z bodies (barrels first, then cubes) and a
///final static body with collision shape index -1 that is drawn as a flattened red cube.
///Each graphics instance is registered immediately before its physics instance, and every
///physics instance receives a running index as its user pointer.
///Finishes by calling physicsSim.writeBodiesToGpu().
///
///@param renderer    receives the shapes and graphics instances
///@param physicsSim  receives the collision shapes and physics instances
void createScene(GLInstancingRenderer& renderer,CLPhysicsDemo& physicsSim)
{
	//the tables in ShapeData.h store 9 floats per vertex
	int strideInBytes = sizeof(float)*9;

	int barrelShapeIndex = -1;
	int cubeShapeIndex = -1;

	float position[4]={0,0,0,0};
	float orn[4] = {0,0,0,1};
	float color[4] = {1,1,1,1};

	//running instance index, handed to the physics side as user pointer
	int index=0;

#if 1
	{
		int numVertices = sizeof(barrel_vertices)/strideInBytes;
		int numIndices = sizeof(barrel_indices)/sizeof(int);
		barrelShapeIndex = renderer.registerShape(&barrel_vertices[0],numVertices,barrel_indices,numIndices);
	}

	float barrelScaling[4] = {2,2,2,1};
	int barrelCollisionShapeIndex = physicsSim.registerCollisionShape(&barrel_vertices[0],strideInBytes, sizeof(barrel_vertices)/strideInBytes,&barrelScaling[0]);

	for (int i=0;i<NUM_OBJECTS_X;i++)
	{
		for (int j=0;j<(NUM_OBJECTS_Y/2);j++)
		{
			for (int k=0;k<NUM_OBJECTS_Z;k++)
			{
				//the bottom layer (j==0) gets mass 0, all other layers mass 1
				float mass = j? 1.f : 0.f;

				position[0]=(i*X_GAP-NUM_OBJECTS_X/2)+5;
				position[1]=(j*Y_GAP*2-NUM_OBJECTS_Y/2);
				position[2]=(k*Z_GAP-NUM_OBJECTS_Z/2)-NUM_OBJECTS_Z*3;
				position[3] = 1.f;

				renderer.registerGraphicsInstance(barrelShapeIndex,position,orn,color,barrelScaling);

				//go through size_t: casting an int straight to a pointer is a size mismatch on 64-bit targets
				void* ptr = (void*)(size_t) index;
				physicsSim.registerPhysicsInstance(mass, position, orn, barrelCollisionShapeIndex,ptr);

				index++;
			}
		}
	}
#endif

	float cubeScaling[4] = {2,2,2,1};
	int cubeCollisionShapeIndex = physicsSim.registerCollisionShape(&cube_vertices[0],strideInBytes, sizeof(cube_vertices)/strideInBytes,&cubeScaling[0]);

	{
		int numVertices = sizeof(cube_vertices)/strideInBytes;
		int numIndices = sizeof(cube_indices)/sizeof(int);
		cubeShapeIndex = renderer.registerShape(&cube_vertices[0],numVertices,cube_indices,numIndices);
	}

	for (int i=0;i<NUM_OBJECTS_X;i++)
	{
		for (int j=0;j<NUM_OBJECTS_Y/2;j++)
		{
			for (int k=0;k<NUM_OBJECTS_Z;k++)
			{
				float mass = 1.f;//j? 1.f : 0.f;

				//odd layers are shifted by one unit in x and z
				position[0]=(i*X_GAP-NUM_OBJECTS_X/2)+(j&1);
				position[1]=(j*Y_GAP-NUM_OBJECTS_Y/2);
				position[2]=(k*Z_GAP-NUM_OBJECTS_Z/2)+(j&1);
				position[3] = 1.f;

				renderer.registerGraphicsInstance(cubeShapeIndex,position,orn,color,cubeScaling);

				void* ptr = (void*)(size_t) index;
				physicsSim.registerPhysicsInstance(mass, position, orn, cubeCollisionShapeIndex,ptr);

				index++;
			}
		}
	}

	if (1)
	{
		//add some 'special' plane shape
		void* ptr = (void*)(size_t) index;

		position[0] = 0.f;
		position[1] = -NUM_OBJECTS_Y/2-1;
		position[2] = 0.f;
		position[3] = 1.f;

		//collision shape index -1 marks the special shape
		physicsSim.registerPhysicsInstance(0.f,position, orn, -1,ptr);

		//drawn as a very wide, very thin red cube
		color[0] = 1.f;
		color[1] = 0.f;
		color[2] = 0.f;
		cubeScaling[0] = 5000.f;
		cubeScaling[1] = 0.01f;
		cubeScaling[2] = 5000.f;
		renderer.registerGraphicsInstance(cubeShapeIndex,position,orn,color,cubeScaling);
	}

	physicsSim.writeBodiesToGpu();
}
int main(int argc, char* argv[])
{
Win32OpenGLWindow* window = new Win32OpenGLWindow();
window->init(1024,768);
GLenum err = glewInit();
window->startRendering();
window->endRendering();
GLInstancingRenderer render;
CLPhysicsDemo demo(window);
bool useInterop = true;
demo.init(-1,-1,useInterop);
render.InitShaders();
if (useInterop)
demo.setupInterop();
createScene(render, demo);
printf("num objects = %d\n", NUM_OBJECTS_X*NUM_OBJECTS_Y*NUM_OBJECTS_Z);
render.writeTransforms();
while (!window->requestedExit())
{
CProfileManager::Reset();
demo.stepSimulation();
window->startRendering();
render.RenderScene();
window->endRendering();
CProfileManager::Increment_Frame_Counter();
static bool printStats = true;
if (printStats)
{
static int count = 10;
count--;
if (count<0)
{
CProfileManager::dumpAll();
//printf("total broadphase pairs= %d\n", gFpIO.m_numOverlap);
printf("numPairsOut (culled) = %d\n", numPairsOut);
printStats = false;
}
}
}
demo.cleanup();
render.CleanupShaders();
window->exit();
delete window;
return 0;
}

View File

@@ -0,0 +1,5 @@
-- Vendor-specific sub-projects of this demo; only the AMD flavour is currently enabled.
include "AMD"
-- include "Intel"
-- include "NVIDIA"

View File

@@ -0,0 +1,34 @@
-- premake project for the integration demo, AMD OpenCL flavour.
-- The project is only generated when findOpenCL_AMD() reports an SDK.
hasCL = findOpenCL_AMD()
if (hasCL) then
project "OpenCL_integration_AMD"
initOpenCL_AMD()
language "C++"
kind "ConsoleApp"
targetdir "../../../bin"
-- graphics dependencies: OpenGL, GLUT and GLEW
initOpenGL()
initGlut()
initGlew()
includedirs {
"../../../rendering/BulletMath",
"../../primitives"
}
-- demo entry point plus the shared OpenCL utility, GL interop buffer and stopwatch sources
files {
"../main.cpp",
"../../basic_initialize/btOpenCLUtils.cpp",
"../../basic_initialize/btOpenCLUtils.h",
"../../opengl_interop/btOpenCLGLInteropBuffer.cpp",
"../../opengl_interop/btOpenCLGLInteropBuffer.h",
"../../opengl_interop/btStopwatch.cpp",
"../../opengl_interop/btStopwatch.h"
}
end

View File

@@ -0,0 +1,36 @@
-- Defines the "OpenCL_integration_Intel" console application.
-- The project is only generated when findOpenCL_Intel() reports success.
hasCL = findOpenCL_Intel()
if (hasCL) then
project "OpenCL_integration_Intel"
initOpenCL_Intel()
language "C++"
kind "ConsoleApp"
targetdir "../../../bin"
initOpenGL()
initGlut()
initGlew()
includedirs {
"../../../rendering/BulletMath",
"../../primitives"
}
-- Sources: the demo main plus the shared OpenCL init, GL interop buffer and stopwatch helpers.
files {
"../main.cpp",
"../../basic_initialize/btOpenCLUtils.cpp",
"../../basic_initialize/btOpenCLUtils.h",
"../../opengl_interop/btOpenCLGLInteropBuffer.cpp",
"../../opengl_interop/btOpenCLGLInteropBuffer.h",
"../../opengl_interop/btStopwatch.cpp",
"../../opengl_interop/btStopwatch.h"
}
end

View File

@@ -0,0 +1,35 @@
-- Defines the "OpenCL_integration_NVIDIA" console application.
-- The project is only generated when findOpenCL_NVIDIA() reports success.
hasCL = findOpenCL_NVIDIA()
if (hasCL) then
project "OpenCL_integration_NVIDIA"
initOpenCL_NVIDIA()
language "C++"
kind "ConsoleApp"
targetdir "../../../bin"
initOpenGL()
initGlut()
initGlew()
includedirs {
"../../../rendering/BulletMath",
"../../primitives"
}
-- Sources: the demo main plus the shared OpenCL init, GL interop buffer and stopwatch helpers.
files {
"../main.cpp",
"../../basic_initialize/btOpenCLUtils.cpp",
"../../basic_initialize/btOpenCLUtils.h",
"../../opengl_interop/btOpenCLGLInteropBuffer.cpp",
"../../opengl_interop/btOpenCLGLInteropBuffer.h",
"../../opengl_interop/btStopwatch.cpp",
"../../opengl_interop/btStopwatch.h"
}
end

View File

@@ -0,0 +1,73 @@
MSTRINGIFY(
/* Quaternion product q1*q2 with the vector part in x,y,z and the scalar part in w. */
float4 quatMult(float4 q1, float4 q2)
{
	float4 product;
	/* scalar part */
	product.w = q1.w * q2.w - q1.x * q2.x - q1.y * q2.y - q1.z * q2.z;
	/* vector part */
	product.x = q1.w * q2.x + q1.x * q2.w + q1.y * q2.z - q1.z * q2.y;
	product.y = q1.w * q2.y + q1.y * q2.w + q1.z * q2.x - q1.x * q2.z;
	product.z = q1.w * q2.z + q1.z * q2.w + q1.x * q2.y - q1.y * q2.x;
	return product;
}
/* Returns q scaled to unit length; when the length is not greater than zero
   the identity quaternion (0,0,0,1) is returned instead. */
float4 quatNorm(float4 q)
{
	float len = native_sqrt(dot(q, q));
	if(!(len > 0.f))
	{
		/* degenerate input: fall back to the identity rotation */
		q.x = 0.f;
		q.y = 0.f;
		q.z = 0.f;
		q.w = 1.f;
		return q;
	}
	q *= 1.f / len;
	return q;
}
/* Integrates one body per work item over a fixed time step.
   g_vertexBuffer[nodeID + startOffset/4] is advanced by linVel*timeStep, and the
   float4 stored numNodes entries further on is treated as an orientation
   quaternion that is rotated by the angular velocity and re-normalized.
   Work items with nodeID >= numNodes do nothing. */
__kernel void
interopKernel( const int startOffset, const int numNodes, __global float4 *g_vertexBuffer,
	__global float4 *linVel,
	__global float4 *pAngVel)
{
	int nodeID = get_global_id(0);
	/* float-suffixed literals keep the arithmetic in single precision */
	float timeStep = 0.0166666f;
	/* a quarter of pi */
	float BT_GPU_ANGULAR_MOTION_THRESHOLD = (0.25f * 3.14159265f);
	if( nodeID < numNodes )
	{
		g_vertexBuffer[nodeID + startOffset/4] += linVel[nodeID]*timeStep;
		float4 axis;
		float4 angvel = pAngVel[nodeID];
		/* NOTE(review): the dot product covers all four components, so this is the
		   angular speed only if angvel.w is zero - confirm against the caller. */
		float fAngle = native_sqrt(dot(angvel, angvel));
		/* limit the angular motion */
		if(fAngle*timeStep > BT_GPU_ANGULAR_MOTION_THRESHOLD)
		{
			fAngle = BT_GPU_ANGULAR_MOTION_THRESHOLD / timeStep;
		}
		if(fAngle < 0.001f)
		{
			/* small angle: use the Taylor expansion of the sinc-style factor below */
			axis = angvel * (0.5f*timeStep-(timeStep*timeStep*timeStep)*0.020833333333f * fAngle * fAngle);
		}
		else
		{
			/* factor = sin(0.5*fAngle*timeStep) / fAngle */
			axis = angvel * ( native_sin(0.5f * fAngle * timeStep) / fAngle);
		}
		/* incremental rotation: vector part from axis, scalar part from the half angle */
		float4 dorn = axis;
		dorn.w = native_cos(fAngle * timeStep * 0.5f);
		float4 orn0 = g_vertexBuffer[nodeID + startOffset/4+numNodes];
		float4 predictedOrn = quatMult(dorn, orn0);
		predictedOrn = quatNorm(predictedOrn);
		g_vertexBuffer[nodeID + startOffset/4+numNodes]=predictedOrn;
	}
}
);

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,5 @@
-- Pull in the premake script of every vendor sub-directory.
include "AMD"
include "Intel"
include "NVIDIA"

View File

@@ -0,0 +1,33 @@
-- Defines the "OpenCL_GL_interop_AMD" console application.
-- The project is only generated when findOpenCL_AMD() reports success.
hasCL = findOpenCL_AMD()
if (hasCL) then
project "OpenCL_GL_interop_AMD"
initOpenCL_AMD()
language "C++"
kind "ConsoleApp"
targetdir "../../../bin"
initOpenGL()
initGlut()
initGlew()
includedirs {
"../../../rendering/BulletMath"
}
-- Sources: the demo main, the shared OpenCL init helper, and the interop buffer and stopwatch from the parent directory.
files {
"../main.cpp",
"../../basic_initialize/btOpenCLUtils.cpp",
"../../basic_initialize/btOpenCLUtils.h",
"../btOpenCLGLInteropBuffer.cpp",
"../btOpenCLGLInteropBuffer.h",
"../btStopwatch.cpp",
"../btStopwatch.h"
}
end

View File

@@ -0,0 +1,34 @@
-- Defines the "OpenCL_GL_interop_Intel" console application.
-- The project is only generated when findOpenCL_Intel() reports success.
hasCL = findOpenCL_Intel()
if (hasCL) then
project "OpenCL_GL_interop_Intel"
initOpenCL_Intel()
language "C++"
kind "ConsoleApp"
targetdir "../../../bin"
initOpenGL()
initGlut()
initGlew()
includedirs {
"../../../rendering/BulletMath"
}
-- Sources: the demo main, the shared OpenCL init helper, and the interop buffer and stopwatch from the parent directory.
files {
"../main.cpp",
"../../basic_initialize/btOpenCLUtils.cpp",
"../../basic_initialize/btOpenCLUtils.h",
"../btOpenCLGLInteropBuffer.cpp",
"../btOpenCLGLInteropBuffer.h",
"../btStopwatch.cpp",
"../btStopwatch.h"
}
end

View File

@@ -0,0 +1,34 @@
-- Defines the "OpenCL_GL_interop_NVIDIA" console application.
-- The project is only generated when findOpenCL_NVIDIA() reports success.
hasCL = findOpenCL_NVIDIA()
if (hasCL) then
project "OpenCL_GL_interop_NVIDIA"
initOpenCL_NVIDIA()
language "C++"
kind "ConsoleApp"
targetdir "../../../bin"
initOpenGL()
initGlut()
initGlew()
includedirs {
"../../../rendering/BulletMath"
}
-- Sources: the demo main, the shared OpenCL init helper, and the interop buffer and stopwatch from the parent directory.
files {
"../main.cpp",
"../../basic_initialize/btOpenCLUtils.cpp",
"../../basic_initialize/btOpenCLUtils.h",
"../btOpenCLGLInteropBuffer.cpp",
"../btOpenCLGLInteropBuffer.h",
"../btStopwatch.cpp",
"../btStopwatch.h"
}
end

View File

@@ -0,0 +1,60 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2011 Advanced Micro Devices, Inc. http://bulletphysics.org
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
///original author: Erwin Coumans
#include "btOpenCLGLInteropBuffer.h"
/// Stores the context, queue and VBO handle, then wraps the OpenGL VBO in a
/// read/write OpenCL buffer object created from it.
btOpenCLGLInteropBuffer::btOpenCLGLInteropBuffer(cl_context clContext, cl_command_queue commandQueue,GLuint openGLVBO)
	: m_clContext(clContext)
	, m_commandQueue(commandQueue)
	, m_openGLVBO(openGLVBO)
{
	cl_int ciErrNum = CL_SUCCESS;
	// CL_MEM_READ_WRITE: kernels may both read and update the shared buffer.
	m_buffer = clCreateFromGLBuffer(m_clContext, CL_MEM_READ_WRITE, m_openGLVBO, &ciErrNum);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);
}
/// Releases the OpenCL buffer object created in the constructor.
btOpenCLGLInteropBuffer::~btOpenCLGLInteropBuffer()
{
	// Capture the status so the check below tests the actual release result
	// rather than a constant CL_SUCCESS.
	cl_int ciErrNum = clReleaseMemObject (m_buffer);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);
}
/// Enqueues an acquire of the shared GL buffer immediately followed by its
/// release on the command queue; no work is enqueued in between and the call
/// does not wait for the queue to finish.
void btOpenCLGLInteropBuffer::copyCL2GL()
{
	cl_int ciErrNum = clEnqueueAcquireGLObjects(m_commandQueue, 1, &m_buffer, 0, 0, NULL);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);

	// (placeholder: work on the acquired buffer would go here)

	ciErrNum = clEnqueueReleaseGLObjects(m_commandQueue, 1, &m_buffer, 0, 0, 0);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);

	// No clFinish here: callers that need completion must wait themselves.
}
/// Currently a no-op: the body is empty.
void btOpenCLGLInteropBuffer::copyGL2CL()
{
}

View File

@@ -0,0 +1,49 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2011 Advanced Micro Devices, Inc. http://bulletphysics.org
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
///original author: Erwin Coumans
#ifndef BT_OPENCL_GL_INTEROP_BUFFER_H
#define BT_OPENCL_GL_INTEROP_BUFFER_H
#include "btGlutInclude.h"
#include "../basic_initialize/btOpenCLInclude.h"
/// Wraps an OpenGL vertex buffer object as an OpenCL buffer (cl_mem).
/// The cl_mem is created in the constructor and released in the destructor.
class btOpenCLGLInteropBuffer
{
	cl_context m_clContext;
	cl_command_queue m_commandQueue;
	cl_mem m_buffer;
	GLuint m_openGLVBO;

	// Not copyable: the destructor releases m_buffer, so a copy would release
	// the same cl_mem twice. Declared private and intentionally left undefined.
	btOpenCLGLInteropBuffer(const btOpenCLGLInteropBuffer&);
	btOpenCLGLInteropBuffer& operator=(const btOpenCLGLInteropBuffer&);

public:
	btOpenCLGLInteropBuffer(cl_context clContext, cl_command_queue commandQueue,GLuint openGLVBO);
	virtual ~btOpenCLGLInteropBuffer();

	/// Enqueues acquire/release of the shared buffer on the command queue.
	void copyCL2GL();
	/// Currently has an empty implementation.
	void copyGL2CL();

	/// Returns the OpenCL handle of the shared buffer (ownership stays with this object).
	cl_mem getCLBUffer()
	{
		return m_buffer;
	}
};
#endif //BT_OPENCL_GL_INTEROP_BUFFER_H

View File

@@ -0,0 +1,182 @@
/*
Stopwatch for timing and profiling for the Bullet Physics Library, http://bulletphysics.org
Copyright (c) 2003-2011 Erwin Coumans
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#include "btStopwatch.h"
#ifdef __CELLOS_LV2__
#include <sys/sys_time.h>
#include <sys/time_util.h>
#include <stdio.h>
#endif
#if defined (SUNOS) || defined (__SUNOS__)
#include <stdio.h>
#endif
#if defined(WIN32) || defined(_WIN32)
#define BT_USE_WINDOWS_TIMERS
#define WIN32_LEAN_AND_MEAN
#define NOWINRES
#define NOMCX
#define NOIME
#ifdef _XBOX
#include <Xtl.h>
#else //_XBOX
#include <windows.h>
#endif //_XBOX
#include <time.h>
#else //_WIN32
#include <sys/time.h>
#endif //_WIN32
// Smaller of two values. Arguments are parenthesized but may be evaluated twice,
// so do not pass expressions with side effects.
#define mymin(a,b) (((a) < (b)) ? (a) : (b))
/// Platform-specific timer state that btStopwatch keeps behind its m_data pointer.
struct btStopwatchData
{
#ifdef BT_USE_WINDOWS_TIMERS
LARGE_INTEGER mClockFrequency; // performance-counter frequency (QueryPerformanceFrequency)
DWORD mStartTick; // GetTickCount() value taken at reset()
LONGLONG mPrevElapsedTime; // elapsed counter ticks stored by the previous query
LARGE_INTEGER mStartTime; // performance-counter value taken at reset()
#else
#ifdef __CELLOS_LV2__
uint64_t mStartTime; // timebase value taken at reset()
#else
struct timeval mStartTime; // gettimeofday() value taken at reset()
#endif //__CELLOS_LV2__
#endif //BT_USE_WINDOWS_TIMERS
};
/// Allocates the timer state, queries the counter frequency where Windows
/// timers are used, and starts timing from the moment of construction.
btStopwatch::btStopwatch()
	: m_data(new btStopwatchData)
{
#ifdef BT_USE_WINDOWS_TIMERS
	QueryPerformanceFrequency(&m_data->mClockFrequency);
#endif
	reset();
}
/// Frees the timer state allocated by the constructors.
btStopwatch::~btStopwatch()
{
delete m_data;
}
/// Deep copy: the new stopwatch gets its own btStopwatchData holding a copy of
/// the other stopwatch's state, so both continue from the same reference time.
btStopwatch::btStopwatch(const btStopwatch& other)
	: m_data(new btStopwatchData(*other.m_data))
{
}
/// Copies the other stopwatch's timer state into this stopwatch's existing
/// storage; no allocation takes place, so self-assignment is harmless.
btStopwatch& btStopwatch::operator=(const btStopwatch& other)
{
*m_data = *other.m_data;
return *this;
}
/// Restarts the stopwatch by recording the current platform clock reading as
/// the new reference time.
void btStopwatch::reset()
{
#if defined(BT_USE_WINDOWS_TIMERS)
	QueryPerformanceCounter(&m_data->mStartTime);
	m_data->mStartTick = GetTickCount();
	m_data->mPrevElapsedTime = 0;
#elif defined(__CELLOS_LV2__)
	uint64_t timebaseNow;
	SYS_TIMEBASE_GET( timebaseNow );
	m_data->mStartTime = timebaseNow;
#else
	gettimeofday(&m_data->mStartTime, 0);
#endif
}
/// Elapsed time in milliseconds since the last reset() (or since construction),
/// derived from the microsecond reading.
float btStopwatch::getTimeMilliseconds()
{
	const unsigned long int elapsedUs = getTimeMicroseconds();
	return elapsedUs/1000.f;
}
/// Returns the time in us since the last call to reset or since
/// the stopwatch was created.
/// One of three implementations is compiled in: Windows performance counter,
/// Cell timebase register, or gettimeofday().
unsigned long int btStopwatch::getTimeMicroseconds()
{
#ifdef BT_USE_WINDOWS_TIMERS
LARGE_INTEGER currentTime;
QueryPerformanceCounter(&currentTime);
LONGLONG elapsedTime = currentTime.QuadPart - m_data->mStartTime.QuadPart;
// Compute the number of millisecond ticks elapsed.
unsigned long msecTicks = (unsigned long)(1000 * elapsedTime / m_data->mClockFrequency.QuadPart);
// Check for unexpected leaps in the Win32 performance counter.
// (This is caused by unexpected data across the PCI to ISA
// bridge, aka south bridge. See Microsoft KB274323.)
// The performance-counter reading is cross-checked against GetTickCount();
// a disagreement of more than 100 ms triggers the correction below.
unsigned long elapsedTicks = GetTickCount() - m_data->mStartTick;
signed long msecOff = (signed long)(msecTicks - elapsedTicks);
if (msecOff < -100 || msecOff > 100)
{
// Adjust the starting time forwards.
// NOTE(review): the adjustment is chosen via mymin() between the measured
// offset (in counter ticks) and the ticks elapsed since the previous query;
// verify that mymin's definition really yields the smaller of the two.
LONGLONG msecAdjustment = mymin(msecOff *
m_data->mClockFrequency.QuadPart / 1000, elapsedTime -
m_data->mPrevElapsedTime);
m_data->mStartTime.QuadPart += msecAdjustment;
elapsedTime -= msecAdjustment;
}
// Store the current elapsed time for adjustments next time.
m_data->mPrevElapsedTime = elapsedTime;
// Convert to microseconds.
unsigned long usecTicks = (unsigned long)(1000000 * elapsedTime /
m_data->mClockFrequency.QuadPart);
return usecTicks;
#else
#ifdef __CELLOS_LV2__
uint64_t freq=sys_time_get_timebase_frequency();
// Timebase ticks per microsecond.
double dFreq=((double) freq)/ 1000000.0;
typedef uint64_t ClockSize;
ClockSize newTime;
//__asm __volatile__( "mftb %0" : "=r" (newTime) : : "memory");
SYS_TIMEBASE_GET( newTime );
return (unsigned long int)((double(newTime-m_data->mStartTime)) / dFreq);
#else
struct timeval currentTime;
gettimeofday(&currentTime, 0);
// NOTE(review): the multiplication is done in the type of tv_sec; where that
// type is 32 bits wide the product can overflow for long intervals.
return (currentTime.tv_sec - m_data->mStartTime.tv_sec) * 1000000 + (currentTime.tv_usec - m_data->mStartTime.tv_usec);
#endif//__CELLOS_LV2__
#endif//BT_USE_WINDOWS_TIMERS
}

View File

@@ -0,0 +1,45 @@
/*
Stopwatch for timing and profiling for the Bullet Physics Library, http://bulletphysics.org
Copyright (c) 2003-2011 Erwin Coumans
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#ifndef BT_STOPWATCH_H
#define BT_STOPWATCH_H
///The btStopwatch is a portable basic clock that measures real-time, use for profiling etc.
///All platform-specific state lives in the opaque btStopwatchData behind m_data.
class btStopwatch
{
public:
/// Creates the stopwatch and starts timing.
btStopwatch();
/// Copies the timer state of another stopwatch into a new one.
btStopwatch(const btStopwatch& other);
/// Overwrites this stopwatch's timer state with that of another.
btStopwatch& operator=(const btStopwatch& other);
~btStopwatch();
/// Resets the initial reference time.
void reset();
/// Returns the time in ms since the last call to reset or since
/// the btStopwatch was created.
float getTimeMilliseconds();
/// Returns the time in us since the last call to reset or since
/// the btStopwatch was created.
unsigned long int getTimeMicroseconds();
private:
/// Owned, platform-specific timer state (defined in the .cpp file).
struct btStopwatchData* m_data;
};
#endif //BT_STOPWATCH_H

View File

@@ -0,0 +1,13 @@
MSTRINGIFY(
/* Adds 0.01 to the second float of each 4-float element, for numNodes elements
   starting at float index startOffset. Work items with nodeID >= numNodes do
   nothing. */
__kernel void
interopKernel( const int startOffset, const int numNodes, __global float *g_vertexBuffer)
{
	int nodeID = get_global_id(0);
	if( nodeID < numNodes )
	{
		/* float-suffixed literal keeps the addition in single precision */
		g_vertexBuffer[nodeID*4 + startOffset+1] += 0.01f;
	}
}
);

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,5 @@
-- Per-vendor sub-scripts; AMD and Intel are active, NVIDIA is commented out.
include "AMD"
include "Intel"
-- include "NVIDIA"

View File

@@ -0,0 +1,19 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#include <Adl/Adl.h>
//KernelManager* KernelManager::s_kManager = NULL;

View File

@@ -0,0 +1,235 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#ifndef ADL_H
#define ADL_H
#pragma warning( disable : 4996 )
#include <Adl/AdlConfig.h>
#include <Adl/AdlError.h>
#include <algorithm>
// Fallback max/min function-like macros, defined only when no macro of that name exists yet.
// NOTE(review): each argument may be evaluated twice, and because these are
// macros they also rewrite any later min(...)/max(...) call in code that
// includes this header (including std::min/std::max call sites).
#ifndef max
#define max(a,b) (((a) > (b)) ? (a) : (b))
#endif
#ifndef min
#define min(a,b) (((a) < (b)) ? (a) : (b))
#endif
namespace adl
{
/// Backend a Device or Buffer belongs to.
enum DeviceType
{
	TYPE_CL   = 0,	// OpenCL backend
	TYPE_DX11 = 1,	// DirectX 11 backend
	TYPE_HOST = 2,	// host backend
};
struct Device;
struct BufferBase
{
enum BufferType
{
BUFFER,
// for dx
BUFFER_CONST,
BUFFER_STAGING,
BUFFER_APPEND,
BUFFER_RAW,
BUFFER_W_COUNTER,
BUFFER_INDEX,
BUFFER_VERTEX,
// for cl
BUFFER_ZERO_COPY,
};
};
/// Static helpers for creating, destroying and synchronizing Device objects.
class DeviceUtils
{
public:
/// Creation parameters handed to Device::initialize().
struct Config
{
// Note: this nested enum has the same name as adl::DeviceType, so inside
// Config the name DeviceType refers to this GPU/CPU selector.
enum DeviceType
{
DEVICE_GPU,
DEVICE_CPU,
};
// for CL
enum DeviceVendor
{
VD_AMD,
VD_INTEL,
VD_NV,
};
// Defaults: GPU device, index 0, AMD vendor.
Config() : m_type(DEVICE_GPU), m_deviceIdx(0), m_vendor(VD_AMD){}
DeviceType m_type;
int m_deviceIdx;
DeviceVendor m_vendor;
};
/// Number of devices reported for the given backend type.
__inline
static
int getNDevices( DeviceType type );
/// Creates a device of the given backend type and initializes it with cfg.
__inline
static Device* allocate( DeviceType type, Config& cfg );
/// Releases and deletes a device obtained from allocate().
__inline
static void deallocate( Device* deviceData );
/// Forwards to the device's waitForCompletion().
__inline
static void waitForCompletion( const Device* deviceData );
};
//==========================
// DeviceData
//==========================
struct Kernel;
/// Base class of all compute backends. The virtual functions have do-nothing
/// defaults that concrete backends override.
struct Device
{
	typedef DeviceUtils::Config Config;

	Device( DeviceType type ) : m_type( type ), m_memoryUsage(0)
	{
	}

	/// Virtual so that deleting a backend through a Device* (as
	/// DeviceUtils::deallocate does) runs the derived destructor.
	virtual ~Device() {}

	/// Backend-specific context handle; the base implementation returns 0.
	virtual void* getContext() const { return 0; }
	virtual void initialize(const Config& cfg){}
	virtual void release(){}
	virtual void waitForCompletion() const {}
	virtual void getDeviceName( char nameOut[128] ) const {}
	/// The base implementation asserts and returns 0.
	virtual Kernel* getKernel(const char* fileName, const char* funcName, const char* option = NULL, const char* src = NULL, bool cacheKernel = true ) const { ADLASSERT(0); return 0;}
	/// Returns the m_memoryUsage counter.
	virtual unsigned int getUsedMemory() const { return m_memoryUsage; }

	DeviceType m_type;
	unsigned int m_memoryUsage;
};
//==========================
// Buffer
//==========================
template<typename T>
struct HostBuffer;
// overload each deviceDatas
/// Typed buffer bound to one Device. All operations dispatch on the device's
/// m_type to the matching backend implementation.
/// NOTE(review): the destructor deallocates when m_allocated is set and no copy
/// operations are declared, so copying a Buffer looks unsafe - confirm callers never copy.
template<typename T>
struct Buffer : public BufferBase
{
/// Creates an empty, unbound buffer.
__inline
Buffer();
/// Creates a buffer and allocates nElems elements on the device.
__inline
Buffer(const Device* device, int nElems, BufferType type = BUFFER );
__inline
virtual ~Buffer();
/// Binds externally owned storage instead of allocating.
__inline
void setRawPtr( const Device* device, T* ptr, int size, BufferType type = BUFFER );
__inline
void allocate(const Device* device, int nElems, BufferType type = BUFFER );
/// Host memory -> buffer.
__inline
void write(T* hostSrcPtr, int nElems, int dstOffsetNElems = 0);
/// Buffer -> host memory.
__inline
void read(T* hostDstPtr, int nElems, int srcOffsetNElems = 0) const;
/// Buffer -> buffer (src into this).
__inline
void write(Buffer<T>& src, int nElems);
/// Buffer -> buffer (this into dst).
__inline
void read(Buffer<T>& dst, int nElems) const;
// __inline
// Buffer<T>& operator = (const Buffer<T>& buffer);
__inline
int getSize() const { return m_size; }
/// Backend type of the bound device; asserts that a device is bound.
DeviceType getType() const { ADLASSERT( m_device ); return m_device->m_type; }
const Device* m_device;
int m_size;
T* m_ptr;
// for DX11
void* m_uav;
void* m_srv;
bool m_allocated; // todo. move this to a bit
};
class BufferUtils
{
public:
template<DeviceType TYPE, bool COPY, typename T>
__inline
static
typename Buffer<T>* map(const Device* device, const Buffer<T>* in, int copySize = -1);
template<bool COPY, typename T>
__inline
static
void unmap( Buffer<T>* native, const Buffer<T>* orig, int copySize = -1 );
};
//==========================
// HostBuffer
//==========================
struct DeviceHost;
/// Buffer with element access through operator[], for buffers whose storage
/// is directly addressable through m_ptr.
/// Names inherited from the dependent base Buffer<T> are qualified (this->,
/// BufferBase::) so that they are found by standard two-phase name lookup.
template<typename T>
struct HostBuffer : public Buffer<T>
{
	__inline
	HostBuffer():Buffer<T>(){}
	__inline
	HostBuffer(const Device* device, int nElems, BufferBase::BufferType type = BufferBase::BUFFER ) : Buffer<T>(device, nElems, type) {}
//	HostBuffer(const Device* deviceData, T* rawPtr, int nElems);

	/// Unchecked element access.
	__inline
	T& operator[](int idx);
	__inline
	const T& operator[](int idx) const;

	/// Pointer to the first element.
	__inline
	T* begin() { return this->m_ptr; }

	/// Copies the contents of another buffer into this one.
	__inline
	HostBuffer<T>& operator = (const Buffer<T>& device);
};
};
#include <Adl/AdlKernel.h>
#if defined(ADL_ENABLE_CL)
#include <Adl/CL/AdlCL.inl>
#endif
#if defined(ADL_ENABLE_DX11)
#include <Adl/DX11/AdlDX11.inl>
#endif
#include <Adl/Host/AdlHost.inl>
#include <Adl/AdlKernel.inl>
#include <Adl/Adl.inl>
#include <Adl/AdlStopwatch.h>
#include <Adl/Host/AdlStopwatchHost.inl>
#include <Adl/AdlStopwatch.inl>
#endif

View File

@@ -0,0 +1,344 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
namespace adl
{
/// Number of devices available for a backend: asks the CL or DX11 backend when
/// it is compiled in, and reports 1 for every other type.
int DeviceUtils::getNDevices( DeviceType type )
{
#if defined(ADL_ENABLE_CL)
	if( type == TYPE_CL )
		return DeviceCL::getNDevices();
#endif
#if defined(ADL_ENABLE_DX11)
	if( type == TYPE_DX11 )
		return DeviceDX11::getNDevices();
#endif
	return 1;
}
/// Creates and initializes a device for the requested backend.
/// Returns 0 (after asserting in debug builds) when the type is unknown or its
/// backend is not compiled in, instead of calling through an uninitialized pointer.
Device* DeviceUtils::allocate( DeviceType type, Config& cfg )
{
	Device* deviceData = 0;
	switch( type )
	{
#if defined(ADL_ENABLE_CL)
	case TYPE_CL:
		deviceData = new DeviceCL();
		break;
#endif
#if defined(ADL_ENABLE_DX11)
	case TYPE_DX11:
		deviceData = new DeviceDX11();
		break;
#endif
	case TYPE_HOST:
		deviceData = new DeviceHost();
		break;
	default:
		ADLASSERT( 0 );
		break;
	}
	if( deviceData == 0 )
		return 0;
	deviceData->initialize( cfg );
	return deviceData;
}
/// Releases and deletes a device. Asserts that the device reports no used
/// memory, i.e. that its buffers were freed first. deviceData must not be null.
void DeviceUtils::deallocate( Device* deviceData )
{
ADLASSERT( deviceData->getUsedMemory() == 0 );
deviceData->release();
delete deviceData;
}
/// Forwards to the device's own waitForCompletion(). deviceData must not be null.
void DeviceUtils::waitForCompletion( const Device* deviceData )
{
deviceData->waitForCompletion();
}
// Backend dispatch macros. Each expands to a switch that casts a Device pointer
// to the concrete backend class for the given type and invokes `func` on it.
// SELECT_DEVICEDATA uses the member m_device of the enclosing Buffer;
// SELECT_DEVICEDATA1 uses the Device pointer passed as `deviceData`.
// Four variants exist, one per combination of ADL_ENABLE_DX11 / ADL_ENABLE_CL;
// the host backend is always present. An unhandled type hits ADLASSERT(0).
#if defined(ADL_ENABLE_DX11)
#if defined(ADL_ENABLE_CL)
#define SELECT_DEVICEDATA( type, func ) \
switch( type ) \
{ \
case TYPE_CL: ((DeviceCL*)m_device)->func; break; \
case TYPE_DX11: ((DeviceDX11*)m_device)->func; break; \
case TYPE_HOST: ((DeviceHost*)m_device)->func; break; \
default: ADLASSERT(0); break; \
}
#define SELECT_DEVICEDATA1( deviceData, func ) \
switch( deviceData->m_type ) \
{ \
case TYPE_CL: ((DeviceCL*)deviceData)->func; break; \
case TYPE_DX11: ((DeviceDX11*)deviceData)->func; break; \
case TYPE_HOST: ((DeviceHost*)deviceData)->func; break; \
default: ADLASSERT(0); break; \
}
#else
#define SELECT_DEVICEDATA( type, func ) \
switch( type ) \
{ \
case TYPE_DX11: ((DeviceDX11*)m_device)->func; break; \
case TYPE_HOST: ((DeviceHost*)m_device)->func; break; \
default: ADLASSERT(0); break; \
}
#define SELECT_DEVICEDATA1( deviceData, func ) \
switch( deviceData->m_type ) \
{ \
case TYPE_DX11: ((DeviceDX11*)deviceData)->func; break; \
case TYPE_HOST: ((DeviceHost*)deviceData)->func; break; \
default: ADLASSERT(0); break; \
}
#endif
#else
#if defined(ADL_ENABLE_CL)
#define SELECT_DEVICEDATA( type, func ) \
switch( type ) \
{ \
case TYPE_CL: ((DeviceCL*)m_device)->func; break; \
case TYPE_HOST: ((DeviceHost*)m_device)->func; break; \
default: ADLASSERT(0); break; \
}
#define SELECT_DEVICEDATA1( deviceData, func ) \
switch( deviceData->m_type ) \
{ \
case TYPE_CL: ((DeviceCL*)deviceData)->func; break; \
case TYPE_HOST: ((DeviceHost*)deviceData)->func; break; \
default: ADLASSERT(0); break; \
}
#else
#define SELECT_DEVICEDATA( type, func ) \
switch( type ) \
{ \
case TYPE_HOST: ((DeviceHost*)m_device)->func; break; \
default: ADLASSERT(0); break; \
}
#define SELECT_DEVICEDATA1( deviceData, func ) \
switch( deviceData->m_type ) \
{ \
case TYPE_HOST: ((DeviceHost*)deviceData)->func; break; \
default: ADLASSERT(0); break; \
}
#endif
#endif
/// Creates an empty buffer: no device, no storage, nothing to deallocate.
template<typename T>
Buffer<T>::Buffer()
	: m_device( 0 )
	, m_size( 0 )
	, m_ptr( 0 )
	, m_uav( 0 )
	, m_srv( 0 )
	, m_allocated( false )
{
}
/// Creates a buffer of nElems elements on the given device. m_device starts
/// out null because allocate() asserts that the buffer is not bound yet.
template<typename T>
Buffer<T>::Buffer(const Device* deviceData, int nElems, BufferType type )
	: m_device( 0 )
{
	allocate( deviceData, nElems, type );
}
/// Hands the storage back to the owning backend when this buffer allocated it
/// and is still bound to a device, then clears the bookkeeping fields.
template<typename T>
Buffer<T>::~Buffer()
{
	if( m_allocated && m_device )
	{
		SELECT_DEVICEDATA( m_device->m_type, deallocate( this ) );
	}
	m_size = 0;
	m_ptr = 0;
	m_device = 0;
}
/// Binds this (so far unbound) buffer to externally provided storage.
/// Only the plain BUFFER type on non-DX11 devices is supported; m_allocated is
/// not touched here.
template<typename T>
void Buffer<T>::setRawPtr( const Device* device, T* ptr, int size, BufferType type )
{
ADLASSERT( m_device == 0 );
ADLASSERT( type == BUFFER ); // todo. implement
ADLASSERT( device->m_type != TYPE_DX11 ); // todo. implement set srv, uav
m_device = device;
m_ptr = ptr;
m_size = size;
}
/// Binds this (so far unbound) buffer to deviceData and asks that backend to
/// allocate nElems elements. All fields are cleared before the backend call and
/// m_allocated is set afterwards so the destructor will deallocate.
template<typename T>
void Buffer<T>::allocate(const Device* deviceData, int nElems, BufferType type )
{
ADLASSERT( m_device == 0 );
m_device = deviceData;
m_size = 0;
m_ptr = 0;
m_uav = 0;
m_srv = 0;
SELECT_DEVICEDATA( m_device->m_type, allocate( this, nElems, type ) );
m_allocated = true;
}
/// Copies nElems elements from host memory into this buffer, starting at
/// element offsetNElems. Asserts that the range fits within m_size.
template<typename T>
void Buffer<T>::write(T* hostPtr, int nElems, int offsetNElems)
{
ADLASSERT( nElems+offsetNElems <= m_size );
SELECT_DEVICEDATA( m_device->m_type, copy(this, hostPtr, nElems, offsetNElems) );
}
/// Copies nElems elements, starting at element offsetNElems, from this buffer
/// into host memory.
/// NOTE(review): unlike write(), the range is not asserted against m_size here.
template<typename T>
void Buffer<T>::read(T* hostPtr, int nElems, int offsetNElems) const
{
SELECT_DEVICEDATA( m_device->m_type, copy(hostPtr,this, nElems, offsetNElems) );
}
/// Copies nElems elements from src into this buffer via this buffer's backend.
/// Asserts that nElems fits within m_size.
template<typename T>
void Buffer<T>::write(Buffer<T>& src, int nElems)
{
ADLASSERT( nElems <= m_size );
SELECT_DEVICEDATA( m_device->m_type, copy(this, &src, nElems) );
}
/// Copies nElems elements from this buffer into dst via this buffer's backend.
/// NOTE(review): no size assertion is made here.
template<typename T>
void Buffer<T>::read(Buffer<T>& dst, int nElems) const
{
SELECT_DEVICEDATA( m_device->m_type, copy(&dst, this, nElems) );
}
/*
template<typename T>
Buffer<T>& Buffer<T>::operator = ( const Buffer<T>& buffer )
{
// ADLASSERT( buffer.m_size <= m_size );
SELECT_DEVICEDATA( m_device->m_type, copy(this, &buffer, min2( m_size, buffer.m_size) ) );
return *this;
}
*/
template<DeviceType TYPE, bool COPY, typename T>
__inline
static
typename Buffer<T>* BufferUtils::map(const Device* device, const Buffer<T>* in, int copySize)
{
Buffer<T>* native;
ADLASSERT( device->m_type == TYPE );
if( in->getType() == TYPE )
native = (Buffer<T>*)in;
else
{
ADLASSERT( copySize <= in->getSize() );
copySize = (copySize==-1)? in->getSize() : copySize;
native = new Buffer<T>( device, copySize );
if( COPY )
{
if( in->getType() == TYPE_HOST )
native->write( in->m_ptr, copySize );
else if( native->getType() == TYPE_HOST )
{
in->read( native->m_ptr, copySize );
DeviceUtils::waitForCompletion( in->m_device );
}
else
{
T* tmp = new T[copySize];
in->read( tmp, copySize );
DeviceUtils::waitForCompletion( in->m_device );
native->write( tmp, copySize );
DeviceUtils::waitForCompletion( native->m_device );
delete [] tmp;
}
}
}
return native;
}
/// Counterpart of map(). When `native` is a temporary (different from `orig`),
/// copies copySize elements (-1 = all of `orig`) back into `orig` if COPY is
/// true, then deletes `native`. Does nothing when native == orig.
/// (`static` is not repeated here: it is not allowed on an out-of-class member
/// definition.)
template<bool COPY, typename T>
__inline
void BufferUtils::unmap( Buffer<T>* native, const Buffer<T>* orig, int copySize )
{
	if( native != orig )
	{
		if( COPY )
		{
			copySize = (copySize==-1)? orig->getSize() : copySize;
			ADLASSERT( copySize <= orig->getSize() );

			if( orig->getType() == TYPE_HOST )
			{
				// device -> host: read straight into the original's storage
				native->read( orig->m_ptr, copySize );
				DeviceUtils::waitForCompletion( native->m_device );
			}
			else if( native->getType() == TYPE_HOST )
			{
				// host -> device
				Buffer<T>* dst = const_cast<Buffer<T>*>( orig );
				dst->write( native->m_ptr, copySize );
				DeviceUtils::waitForCompletion( dst->m_device );
			}
			else
			{
				// device -> other device: stage through a temporary host array
				T* tmp = new T[copySize];
				native->read( tmp, copySize );
				DeviceUtils::waitForCompletion( native->m_device );
				Buffer<T>* dst = const_cast<Buffer<T>*>( orig );
				dst->write( tmp, copySize );
				DeviceUtils::waitForCompletion( dst->m_device );
				delete [] tmp;
			}
		}
		delete native;
	}
}
/// Unchecked element access. m_ptr lives in the dependent base Buffer<T>, so it
/// is reached through this-> to be found by two-phase name lookup.
template<typename T>
T& HostBuffer<T>::operator[](int idx)
{
	return this->m_ptr[idx];
}
/// Unchecked read-only element access. m_ptr lives in the dependent base
/// Buffer<T>, so it is reached through this-> to be found by two-phase name lookup.
template<typename T>
const T& HostBuffer<T>::operator[](int idx) const
{
	return this->m_ptr[idx];
}
/// Copies all device.m_size elements of `device` into this buffer's storage,
/// using the backend that owns `device`. Asserts that they fit.
/// Inherited members are reached through this-> because the base class
/// Buffer<T> is dependent.
template<typename T>
HostBuffer<T>& HostBuffer<T>::operator = ( const Buffer<T>& device )
{
	ADLASSERT( device.m_size <= this->m_size );
	SELECT_DEVICEDATA1( device.m_device, copy( this->m_ptr, &device, device.m_size ) );
	return *this;
}
#undef SELECT_DEVICEDATA
};

View File

@@ -0,0 +1,27 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
//ADL_ENABLE_CL and ADL_ENABLE_DX11 can be set in the build system using C/C++ preprocessor defines
//#define ADL_ENABLE_CL
//#define ADL_ENABLE_DX11
//#define ADL_CL_FORCE_UNCACHE_KERNEL
#define ADL_CL_DUMP_MEMORY_LOG
//load the kernels from string instead of loading them from file
#define ADL_LOAD_KERNEL_FROM_STRING
#define ADL_DUMP_DX11_ERROR

View File

@@ -0,0 +1,80 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#ifndef ADL_ERROR_H
#define ADL_ERROR_H
#if defined(ADL_DUMP_DX11_ERROR)
#include <windows.h>
#endif
#ifdef _DEBUG
#include <assert.h>
#include <stdarg.h>
#include <stdio.h>
#endif
namespace adl
{
// ADLASSERT(x): with _DEBUG defined, breaks into the debugger when x is false;
// otherwise x is still evaluated but nothing happens.
// NOTE(review): both forms expand to a bare if-statement, so an `else` written
// directly after an ADLASSERT(...) would bind to the macro's if.
#ifdef _DEBUG
#define ADLASSERT(x) if(!(x)){__debugbreak(); }
#else
#define ADLASSERT(x) if(x){}
#endif
// COMPILE_TIME_ASSERT(x): with _DEBUG defined, declares a local array whose
// size is x, so x must be a constant expression; otherwise expands to nothing.
// NOTE(review): presumably relies on the compiler rejecting a zero-sized array
// when x is false - confirm for the compilers in use.
#ifdef _DEBUG
#define COMPILE_TIME_ASSERT(x) {int compileTimeAssertFailed[x]; compileTimeAssertFailed[0];}
#else
#define COMPILE_TIME_ASSERT(x)
#endif
// debugPrintf: printf-style diagnostic output.
// With _DEBUG defined the message is formatted and sent either to the debugger
// (OutputDebugString, when ADL_DUMP_DX11_ERROR is defined) or to stdout via
// vprintf. Without _DEBUG the function is an empty stub.
#ifdef _DEBUG
__inline
void debugPrintf(const char *fmt, ...)
{
va_list arg;
va_start(arg, fmt);
#if defined(ADL_DUMP_DX11_ERROR)
// Formatted into a fixed 10 KB stack buffer.
const int size = 1024*10;
char buf[size];
vsprintf_s( buf, size, fmt, arg );
#ifdef UNICODE
// UNICODE build: convert to wide characters first. The first call only
// queries the required length, the second performs the conversion.
WCHAR wbuf[size];
int sizeWide = MultiByteToWideChar(0,0,buf,-1,wbuf,0);
MultiByteToWideChar(0,0,buf,-1,wbuf,sizeWide);
// swprintf_s( wbuf, 256, L"%s", buf );
OutputDebugString( wbuf );
#else
OutputDebugString( buf );
#endif
#else
vprintf(fmt, arg);
#endif
va_end(arg);
}
#else
__inline
void debugPrintf(const char *fmt, ...)
{
}
#endif
};
#endif

View File

@@ -0,0 +1,142 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#ifndef ADL_KERNEL_H
#define ADL_KERNEL_H
#include <map>
#include <string>
#include <fstream>
namespace adl
{
//==========================
// Kernel
//==========================
//	Backend-neutral handle to a compiled kernel.
struct Kernel
{
//	Backend that created the kernel; used to pick the matching deleter.
DeviceType m_type;
//	Opaque backend object (the OpenCL path stores a cl_kernel here).
void* m_kernel;
};
//==========================
// KernelManager
//==========================
//	Cache of compiled kernels keyed by a string built from the device context, the source
//	file name, the function name and the build options. Owns the Kernel objects it hands out.
class KernelManager
{
public:
typedef std::map<std::string, Kernel*> KMap;
//	Releases every cached kernel.
__inline
~KernelManager();
//	Returns the cached kernel for (device, file, function, option), building it on a miss.
//	src, when non-NULL, is compiled instead of reading fileName from disk.
//	The returned pointer stays owned by the manager.
__inline
// static
Kernel* query(const Device* dd, const char* fileName, const char* funcName, const char* option = NULL, const char* src = NULL,
bool cacheKernel = true);
public:
KMap m_map;
};
//==========================
// Launcher
//==========================
//	Binds buffers/constants to a Kernel and dispatches it on the device's backend.
class Launcher
{
	public:
		//	Type-erased buffer argument plus a read-only flag.
		struct BufferInfo
		{
			//	Default-constructed entries are empty (previously left uninitialized).
			BufferInfo(): m_buffer(0), m_isReadOnly(false){}

			template<typename T>
			BufferInfo(Buffer<T>* buff, bool isReadOnly = false): m_buffer(buff), m_isReadOnly(isReadOnly){}

			void* m_buffer;
			bool m_isReadOnly;
		};

		//	Looks the kernel up through the device (dd->getKernel).
		__inline
		Launcher(const Device* dd, char* fileName, char* funcName, char* option = NULL);

		//	Uses an already-built kernel.
		__inline
		Launcher(const Device* dd, Kernel* kernel);

		//	Binds n buffer arguments.
		__inline
		void setBuffers( BufferInfo* buffInfo, int n );

		//	Binds a constant block; constBuff is the backing buffer, consts the values.
		template<typename T>
		__inline
		void setConst( Buffer<T>& constBuff, const T& consts );

		//	Dispatches numThreads work items; implemented as a 2D launch with Y == 1.
		__inline
		void launch1D( int numThreads, int localSize = 64 );

		__inline
		void launch2D( int numThreadsX, int numThreadsY, int localSizeX = 8, int localSizeY = 8 );

	public:
		enum
		{
			CONST_BUFFER_SIZE = 512,
		};

		const Device* m_deviceData;
		Kernel* m_kernel;
		int m_idx;
		int m_idxRw;
};
//	Compiles a program for backend TYPE (from file, from source, or from a cached binary)
//	and creates Kernel objects from it. m_ptr holds the backend program object.
template<DeviceType TYPE>
class KernelBuilder
{
	public:

		//	Starts with no device and no program (m_deviceData was previously left uninitialized).
		__inline
		KernelBuilder(): m_deviceData(0), m_ptr(0){}

		__inline
		void setFromFile( const Device* deviceData, const char* fileName, const char* option = NULL, bool addExtension = false,
			bool cacheKernel = true);

		__inline
		void setFromSrc( const Device* deviceData, const char* src, const char* option = NULL );

		__inline
		void setFromSrcCached( const Device* deviceData, const char* src, const char* fileName, const char* option );

		//	Creates the kernel named funcName from the program built by one of the setFrom* calls.
		__inline
		void createKernel( const char* funcName, Kernel& kernelOut );

		__inline
		~KernelBuilder();
		//	todo. implement in kernel destructor?
		__inline
		static void deleteKernel( Kernel& kernel );

	private:
		enum
		{
			MAX_PATH_LENGTH = 260,
		};
		const Device* m_deviceData;
#ifdef UNICODE
		wchar_t m_path[MAX_PATH_LENGTH];
#else
		char m_path[MAX_PATH_LENGTH];
#endif
		void* m_ptr;
};
};
#endif //ADL_KERNEL_H

View File

@@ -0,0 +1,223 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#ifdef ADL_ENABLE_CL
#include <Adl/CL/AdlKernelUtilsCL.inl>
#endif
#ifdef ADL_ENABLE_DX11
#include <Adl/DX11/AdlKernelUtilsDX11.inl>
#endif
namespace adl
{
//==========================
// KernelManager
//==========================
//	Returns the kernel `funcName` from `fileName` for device `dd`, building it on a cache miss.
//	- option:      build options, may be NULL; part of the cache key.
//	- src:         when non-NULL, this source string is compiled instead of the file.
//	- cacheKernel: forwarded to the builder to enable its on-disk binary cache.
//	The returned Kernel is owned by this manager.
Kernel* KernelManager::query(const Device* dd, const char* fileName, const char* funcName, const char* option, const char* src,
	bool cacheKernel)
{
	printf("compiling kernel %s",funcName);
	const int charSize = 1024*2;

	//	Source file name with the backend's extension. Starts empty so the key below is
	//	well defined even for an unsupported device type.
	char fullFineName[charSize];
	fullFineName[0] = '\0';
	switch( dd->m_type )
	{
#if defined(ADL_ENABLE_CL)
	//	The case label lives inside the #if so that TYPE_CL no longer falls through into
	//	another backend's branch when CL support is compiled out.
	case TYPE_CL:
		sprintf_s(fullFineName,charSize,"%s.cl", fileName);
		break;
#endif
#if defined(ADL_ENABLE_DX11)
	case TYPE_DX11:
		sprintf_s(fullFineName,charSize,"%s.hlsl", fileName);
		break;
#endif
	default:
		ADLASSERT(0);
		break;
	};

	//	Cache key: context + file + function + options. The context is printed with %p;
	//	casting the pointer to int dropped the upper bits on 64-bit targets, which could
	//	make two contexts share a key.
	char mapName[charSize];
	sprintf_s(mapName, charSize, "%p%s%s%s", (const void*)dd->getContext(), fullFineName, funcName, option ? option : "");

	std::string str(mapName);

	KMap::iterator iter = m_map.find( str );
	if( iter != m_map.end() )
	{
		printf(" ready\n");
		return iter->second;
	}

	Kernel* kernelOut = new Kernel();

	switch( dd->m_type )
	{
#if defined(ADL_ENABLE_CL)
	case TYPE_CL:
		{
			KernelBuilder<TYPE_CL> builder;
			if( src )
			{
				if (cacheKernel)
				{
					builder.setFromSrcCached( dd, src, fileName, option );
				}
				else
				{
					builder.setFromSrc( dd, src, option );
				}
			}
			else
			{
				builder.setFromFile( dd, fileName, option, true, cacheKernel );
			}
			builder.createKernel( funcName, *kernelOut );
		}
		break;
#endif
#if defined(ADL_ENABLE_DX11)
	case TYPE_DX11:
		{
			KernelBuilder<TYPE_DX11> builder;
			if( src )
				builder.setFromSrc( dd, src, option );
			else
				builder.setFromFile( dd, fileName, option, true, cacheKernel );
			builder.createKernel( funcName, *kernelOut );
		}
		break;
#endif
	default:
		ADLASSERT(0);
		break;
	};

	m_map.insert( KMap::value_type(str,kernelOut) );

	printf(" ready\n");
	return kernelOut;
}
//	Releases every cached kernel: the backend object through the matching KernelBuilder,
//	then the Kernel wrapper itself.
KernelManager::~KernelManager()
{
	for(KMap::iterator iter = m_map.begin(); iter != m_map.end(); ++iter)
	{
		Kernel* k = iter->second;
		switch( k->m_type )
		{
#if defined(ADL_ENABLE_CL)
		case TYPE_CL:
			KernelBuilder<TYPE_CL>::deleteKernel( *k );
			break;
#endif
#if defined(ADL_ENABLE_DX11)
		case TYPE_DX11:
			KernelBuilder<TYPE_DX11>::deleteKernel( *k );
			break;
#endif
		default:
			//	Unknown backend: there is no backend object we know how to release.
			ADLASSERT(0);
			break;
		};
		//	The wrapper is always ours to free; it used to leak on the default branch.
		delete k;
	}
}
//==========================
// Launcher
//==========================
//	SELECT_LAUNCHER( type, func ): expands to a switch on `type` that forwards the call
//	expression `func` to the static launcher class of the matching backend
//	(LauncherCL / LauncherDX11). One variant is defined per combination of
//	ADL_ENABLE_DX11 / ADL_ENABLE_CL; backends that are compiled out, and unknown types,
//	end up in the ADLASSERT(0) default branch.
#if defined(ADL_ENABLE_DX11)
#if defined(ADL_ENABLE_CL)
#define SELECT_LAUNCHER( type, func ) \
switch( type ) \
{ \
case TYPE_CL: LauncherCL::func; break; \
case TYPE_DX11: LauncherDX11::func; break; \
default: ADLASSERT(0); break; \
};
#else
#define SELECT_LAUNCHER( type, func ) \
switch( type ) \
{ \
case TYPE_DX11: LauncherDX11::func; break; \
default: ADLASSERT(0); break; \
};
#endif
#else
#if defined(ADL_ENABLE_CL)
#define SELECT_LAUNCHER( type, func ) \
switch( type ) \
{ \
case TYPE_CL: LauncherCL::func; break; \
default: ADLASSERT(0); break; \
};
#else
#define SELECT_LAUNCHER( type, func ) \
switch( type ) \
{ \
default: ADLASSERT(0); break; \
};
#endif
#endif
//	Builds a launcher for the kernel `funcName` in `fileName`, obtained from the device
//	(dd->getKernel). Argument indices start at zero.
Launcher::Launcher(const Device *dd, char *fileName, char *funcName, char *option)
	: m_deviceData( dd )
	, m_kernel( dd->getKernel( fileName, funcName, option ) )
	, m_idx( 0 )
	, m_idxRw( 0 )
{
}
//	Builds a launcher around an already-created kernel. Argument indices start at zero.
Launcher::Launcher(const Device* dd, Kernel* kernel)
	: m_deviceData( dd )
	, m_kernel( kernel )
	, m_idx( 0 )
	, m_idxRw( 0 )
{
}
//	Binds the n buffers in buffInfo by forwarding to the setBuffers of the launcher class
//	that matches the device type.
void Launcher::setBuffers( BufferInfo* buffInfo, int n )
{
SELECT_LAUNCHER( m_deviceData->m_type, setBuffers( this, buffInfo, n ) );
}
//	Binds the constant values `consts`, with `constBuff` passed alongside, by forwarding to
//	the setConst of the launcher class that matches the device type.
template<typename T>
void Launcher::setConst( Buffer<T>& constBuff, const T& consts )
{
SELECT_LAUNCHER( m_deviceData->m_type, setConst( this, constBuff, consts ) );
}
//	1D dispatch, expressed as a 2D launch with a Y extent and Y local size of 1.
void Launcher::launch1D( int numThreads, int localSize )
{
SELECT_LAUNCHER( m_deviceData->m_type, launch2D( this, numThreads, 1, localSize, 1 ) );
}
//	2D dispatch: forwards the thread counts and local sizes to the launch2D of the launcher
//	class that matches the device type.
void Launcher::launch2D( int numThreadsX, int numThreadsY, int localSizeX, int localSizeY )
{
SELECT_LAUNCHER( m_deviceData->m_type, launch2D( this, numThreadsX, numThreadsY, localSizeX, localSizeY ) );
}
#undef SELECT_LAUNCHER
};

View File

@@ -0,0 +1,81 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#include <windows.h>
namespace adl
{
//	Interface of a stopwatch implementation that records up to CAPACITY time stamps.
struct StopwatchBase
{
	__inline
	StopwatchBase(): m_device(0), m_idx(0){}
	//	Only records the device. The previous version called init() from here, but init()
	//	is pure virtual and must not be called from the base-class constructor;
	//	implementations perform their own initialization in init().
	__inline
	StopwatchBase( const Device* deviceData ): m_device(deviceData), m_idx(0){}
	__inline
	virtual ~StopwatchBase(){}

	__inline
	virtual void init( const Device* deviceData ) = 0;
	__inline
	virtual void start() = 0;
	__inline
	virtual void split() = 0;
	__inline
	virtual void stop() = 0;
	//	Elapsed milliseconds of one interval.
	__inline
	virtual float getMs(int index=0) = 0;
	//	Writes interval times into `times`, which can hold `capacity` entries.
	__inline
	virtual void getMs( float* times, int capacity ) = 0;
	//	Number of intervals, computed as m_idx - 1.
	__inline
	int getNIntervals() const{ return m_idx-1;}

	enum
	{
		CAPACITY = 64,
	};

	const Device* m_device;
	int m_idx;
};
struct Stopwatch
{
__inline
Stopwatch( const Device* deviceData = NULL ) { m_impl=0; if(deviceData) init(deviceData);}
__inline
~Stopwatch();
__inline
void init( const Device* deviceData );
__inline
void start(){if(!m_impl) init(0); m_impl->start();}
__inline
void split(){m_impl->split();}
__inline
void stop(){m_impl->stop();}
__inline
float getMs(){ return m_impl->getMs();}
__inline
void getMs( float* times, int capacity ){m_impl->getMs(times, capacity);}
__inline
int getNIntervals() const{return m_impl->getNIntervals();}
StopwatchBase* m_impl;
};
};

View File

@@ -0,0 +1,59 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
namespace adl
{
void Stopwatch::init( const Device* deviceData )
{
ADLASSERT( m_impl == 0 );
if( deviceData )
{
switch( deviceData->m_type )
{
#if defined(ADL_ENABLE_CL)
case TYPE_CL:
m_impl = new StopwatchHost;//StopwatchCL
break;
#endif
#if defined(ADL_ENABLE_DX11)
case TYPE_DX11:
m_impl = new StopwatchHost;//StopwatchDX11;
break;
#endif
case TYPE_HOST:
m_impl = new StopwatchHost;
break;
default:
ADLASSERT(0);
break;
};
}
else
{
m_impl = new StopwatchHost;
}
m_impl->init( deviceData );
}
//	Destroys the implementation, if one was created (deleting a null pointer is a no-op).
Stopwatch::~Stopwatch()
{
	delete m_impl;
}
};

View File

@@ -0,0 +1,384 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#pragma comment(lib,"OpenCL.lib")
#include <CL/cl.h>
#include <CL/cl_ext.h>
#include <CL/cl_platform.h>
namespace adl
{
struct DeviceCL : public Device
{
typedef DeviceUtils::Config Config;
__inline
DeviceCL() : Device( TYPE_CL ), m_kernelManager(0){}
__inline
void* getContext() const { return m_context; }
__inline
void initialize(const Config& cfg);
__inline
void release();
template<typename T>
__inline
void allocate(Buffer<T>* buf, int nElems, BufferBase::BufferType type);
template<typename T>
__inline
void deallocate(Buffer<T>* buf);
template<typename T>
__inline
void copy(Buffer<T>* dst, const Buffer<T>* src, int nElems,int srcOffsetNElems = 0,int dstOffsetNElems = 0);
template<typename T>
__inline
void copy(T* dst, const Buffer<T>* src, int nElems, int srcOffsetNElems = 0);
template<typename T>
__inline
void copy(Buffer<T>* dst, const T* src, int nElems, int dstOffsetNElems = 0);
__inline
void waitForCompletion() const;
__inline
void getDeviceName( char nameOut[128] ) const;
__inline
static
int getNDevices();
__inline
Kernel* getKernel(const char* fileName, const char* funcName, const char* option = NULL, const char* src = NULL, bool cacheKernel = true )const;
enum
{
MAX_NUM_DEVICES = 6,
};
cl_context m_context;
cl_command_queue m_commandQueue;
cl_device_id m_deviceIdx;
KernelManager* m_kernelManager;
};
//===
//===
void DeviceCL::initialize(const Config& cfg)
{
// DeviceUtils::create( cfg, (DeviceCL*)this );
{
// dd = new DeviceCL();
DeviceCL* deviceData = (DeviceCL*)this;
// cl_device_type deviceType = (driverType == DRIVER_HARDWARE)? CL_DEVICE_TYPE_GPU:CL_DEVICE_TYPE_CPU;
cl_device_type deviceType = (cfg.m_type== Config::DEVICE_GPU)? CL_DEVICE_TYPE_GPU: CL_DEVICE_TYPE_CPU;
// int numContextQueuePairsToCreate = 1;
bool enableProfiling = false;
#ifdef _DEBUG
enableProfiling = true;
#endif
cl_int status;
cl_platform_id platform;
{
cl_uint nPlatforms = 0;
status = clGetPlatformIDs(0, NULL, &nPlatforms);
ADLASSERT( status == CL_SUCCESS );
cl_platform_id pIdx[5];
status = clGetPlatformIDs(nPlatforms, pIdx, NULL);
ADLASSERT( status == CL_SUCCESS );
cl_uint atiIdx = -1;
cl_uint intelIdx = -1;
cl_uint nvIdx = -1;
for(cl_uint i=0; i<nPlatforms; i++)
{
char buff[512];
status = clGetPlatformInfo( pIdx[i], CL_PLATFORM_VENDOR, 512, buff, 0 );
ADLASSERT( status == CL_SUCCESS );
//skip the platform if there are no devices available
cl_uint numDevice;
status = clGetDeviceIDs( pIdx[i], deviceType, 0, NULL, &numDevice );
if (numDevice>0)
{
if( strcmp( buff, "NVIDIA Corporation" )==0 ) nvIdx = i;
if( strcmp( buff, "Advanced Micro Devices, Inc." )==0 ) atiIdx = i;
if( strcmp( buff, "Intel(R) Corporation" )==0 ) intelIdx = i;
}
}
if( deviceType == CL_DEVICE_TYPE_GPU )
{
switch( cfg.m_vendor )
{
case DeviceUtils::Config::VD_AMD:
if( atiIdx == -1 && nvIdx != -1 ) goto USE_NV_GPU;
USE_AMD_GPU:
ADLASSERT(atiIdx != -1 );
platform = pIdx[atiIdx];
break;
case DeviceUtils::Config::VD_NV:
if( atiIdx != -1 && nvIdx == -1 ) goto USE_AMD_GPU;
USE_NV_GPU:
ADLASSERT(nvIdx != -1 );
platform = pIdx[nvIdx];
break;
default:
ADLASSERT(0);
break;
};
}
else if( deviceType == CL_DEVICE_TYPE_CPU )
{
switch( cfg.m_vendor )
{
case DeviceUtils::Config::VD_AMD:
ADLASSERT(atiIdx != -1 );
platform = pIdx[atiIdx];
break;
case DeviceUtils::Config::VD_INTEL:
ADLASSERT(intelIdx != -1 );
platform = pIdx[intelIdx];
break;
default:
ADLASSERT(0);
break;
};
}
}
cl_uint numDevice;
status = clGetDeviceIDs( platform, deviceType, 0, NULL, &numDevice );
// ADLASSERT( cfg.m_deviceIdx < (int)numDevice );
debugPrintf("CL: %d %s Devices ", numDevice, (deviceType==CL_DEVICE_TYPE_GPU)? "GPU":"CPU");
// numContextQueuePairsToCreate = min( (int)numDevice, numContextQueuePairsToCreate );
// numContextQueuePairsToCreate = ( (int)numDevice < numContextQueuePairsToCreate )? numDevice : numContextQueuePairsToCreate;
cl_device_id deviceIds[ MAX_NUM_DEVICES ];
status = clGetDeviceIDs( platform, deviceType, numDevice, deviceIds, NULL );
ADLASSERT( status == CL_SUCCESS );
{ int i = min( (int)numDevice-1, cfg.m_deviceIdx );
m_deviceIdx = deviceIds[i];
deviceData->m_context = clCreateContext( NULL, 1, &deviceData->m_deviceIdx, NULL, NULL, &status );
ADLASSERT( status == CL_SUCCESS );
char buff[512];
status = clGetDeviceInfo( deviceData->m_deviceIdx, CL_DEVICE_NAME, sizeof(buff), &buff, NULL );
ADLASSERT( status == CL_SUCCESS );
debugPrintf("[%s]\n", buff);
deviceData->m_commandQueue = clCreateCommandQueue( deviceData->m_context, deviceData->m_deviceIdx, (enableProfiling)?CL_QUEUE_PROFILING_ENABLE:NULL, NULL );
ADLASSERT( status == CL_SUCCESS );
// status = clSetCommandQueueProperty( commandQueue, CL_QUEUE_PROFILING_ENABLE, CL_TRUE, 0 );
// CLASSERT( status == CL_SUCCESS );
if(0)
{
cl_bool image_support;
clGetDeviceInfo(deviceData->m_deviceIdx, CL_DEVICE_IMAGE_SUPPORT, sizeof(image_support), &image_support, NULL);
debugPrintf(" CL_DEVICE_IMAGE_SUPPORT : %s\n", image_support?"Yes":"No");
}
}
}
m_kernelManager = new KernelManager;
}
//	Destroys the kernel cache, then the command queue, then the context, and nulls each
//	handle so that a second release() does not release them again.
void DeviceCL::release()
{
	//	Kernels first: they were created from this context.
	delete m_kernelManager;
	m_kernelManager = 0;

	if( m_commandQueue )
	{
		clReleaseCommandQueue( m_commandQueue );
		m_commandQueue = 0;
	}
	if( m_context )
	{
		clReleaseContext( m_context );
		m_context = 0;
	}
}
//	Allocates device memory for nElems elements of T and stores the cl_mem in buf->m_ptr.
//	- BUFFER_CONST:     no device allocation is made; buf->m_ptr stays null.
//	- BUFFER_ZERO_COPY: read/write memory with CL_MEM_ALLOC_HOST_PTR.
//	- BUFFER_RAW:       CL_MEM_WRITE_ONLY memory.
//	- anything else:    plain read/write memory.
//	On failure buf->m_ptr is null and the memory-usage counter is left unchanged.
template<typename T>
void DeviceCL::allocate(Buffer<T>* buf, int nElems, BufferBase::BufferType type)
{
	buf->m_device = this;
	buf->m_size = nElems;
	buf->m_ptr = 0;

	if( type == BufferBase::BUFFER_CONST ) return;

#if defined(ADL_CL_DUMP_MEMORY_LOG)
	char deviceName[256];
	getDeviceName( deviceName );
	printf( "adlCLMemoryLog	%s : %3.2fMB	Allocation: %3.2fKB ", deviceName, m_memoryUsage/1024.f/1024.f, sizeof(T)*nElems/1024.f );
	fflush( stdout );
#endif

	//	size_t: the byte count of a large buffer does not fit in an int.
	const size_t sz = sizeof(T)*(size_t)nElems;

	cl_mem_flags flags = CL_MEM_READ_WRITE;
	if( type == BufferBase::BUFFER_ZERO_COPY )
		flags = CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR;
	else if( type == BufferBase::BUFFER_RAW )
		flags = CL_MEM_WRITE_ONLY;

	cl_int status = 0;
	buf->m_ptr = (T*)clCreateBuffer( m_context, flags, sz, 0, &status );

	if( status == CL_SUCCESS )
	{
		//	Only successful allocations are counted; deallocate() subtracts only when
		//	m_ptr is non-null, so counting failures made the total drift upwards.
		m_memoryUsage += buf->m_size*sizeof(T);
	}
	else
	{
		buf->m_ptr = 0;
	}
#if defined(ADL_CL_DUMP_MEMORY_LOG)
	printf( "%s\n", (status==CL_SUCCESS)? "Succeed": "Failed" );
	fflush( stdout );
#endif
	ADLASSERT( status == CL_SUCCESS );
}
//	Releases the cl_mem behind buf (if any), subtracts its size from the memory-usage
//	counter and resets buf to the empty state.
template<typename T>
void DeviceCL::deallocate(Buffer<T>* buf)
{
	if( buf->m_ptr != 0 )
	{
		const cl_mem mem = (cl_mem)buf->m_ptr;
		m_memoryUsage -= buf->m_size*sizeof(T);
		clReleaseMemObject( mem );
	}

	buf->m_ptr = 0;
	buf->m_size = 0;
	buf->m_device = 0;
}
//	Copies nElems elements from src to dst.
//	- both buffers on CL devices: enqueues a device-side copy honouring both offsets.
//	- one side on the host: delegates to Buffer::write / Buffer::read with nElems only.
//	  NOTE(review): the offsets are not passed on in these two branches.
//	Any other combination asserts.
template<typename T>
void DeviceCL::copy(Buffer<T>* dst, const Buffer<T>* src, int nElems,int srcOffsetNElems,int dstOffsetNElems )
{
	const DeviceType dstType = dst->m_device->m_type;
	const DeviceType srcType = src->m_device->m_type;

	if( dstType == TYPE_CL && srcType == TYPE_CL )
	{
		const size_t srcByteOffset = sizeof(T)*srcOffsetNElems;
		const size_t dstByteOffset = sizeof(T)*dstOffsetNElems;
		const size_t nBytes = sizeof(T)*nElems;
		cl_int status = clEnqueueCopyBuffer( m_commandQueue, (cl_mem)src->m_ptr, (cl_mem)dst->m_ptr, srcByteOffset, dstByteOffset, nBytes, 0, 0, 0 );
		ADLASSERT( status == CL_SUCCESS );
		return;
	}

	if( srcType == TYPE_HOST )
	{
		ADLASSERT( dst->getType() == TYPE_CL );
		dst->write( src->m_ptr, nElems );
		return;
	}

	if( dstType == TYPE_HOST )
	{
		ADLASSERT( src->getType() == TYPE_CL );
		src->read( dst->m_ptr, nElems );
		return;
	}

	ADLASSERT( 0 );
}
template<typename T>
void DeviceCL::copy(T* dst, const Buffer<T>* src, int nElems, int srcOffsetNElems )
{
cl_int status = 0;
status = clEnqueueReadBuffer( m_commandQueue, (cl_mem)src->m_ptr, 0, sizeof(T)*srcOffsetNElems, sizeof(T)*nElems,
dst, 0,0,0 );
ADLASSERT( status == CL_SUCCESS );
}
//	Host-to-device copy of nElems elements into dst, starting at element dstOffsetNElems.
//	The write is enqueued non-blocking (blocking_write == 0), so src has to stay valid
//	until the queue has completed.
template<typename T>
void DeviceCL::copy(Buffer<T>* dst, const T* src, int nElems, int dstOffsetNElems )
{
	const int nBytes = sizeof(T)*nElems;
	const size_t byteOffset = sizeof(T)*dstOffsetNElems;

	const cl_int status = clEnqueueWriteBuffer( m_commandQueue, (cl_mem)dst->m_ptr, 0, byteOffset, nBytes, src, 0, 0, 0 );
	ADLASSERT( status == CL_SUCCESS );
}
//	Blocks until all commands enqueued on this device's command queue have finished.
void DeviceCL::waitForCompletion() const
{
clFinish( m_commandQueue );
}
int DeviceCL::getNDevices()
{
cl_device_type deviceType = CL_DEVICE_TYPE_GPU;
cl_int status;
cl_platform_id platform;
{
cl_uint nPlatforms = 0;
status = clGetPlatformIDs(0, NULL, &nPlatforms);
ADLASSERT( status == CL_SUCCESS );
cl_platform_id pIdx[5];
status = clGetPlatformIDs(nPlatforms, pIdx, NULL);
ADLASSERT( status == CL_SUCCESS );
cl_uint nvIdx = -1;
cl_uint atiIdx = -1;
for(cl_uint i=0; i<nPlatforms; i++)
{
char buff[512];
status = clGetPlatformInfo( pIdx[i], CL_PLATFORM_VENDOR, 512, buff, 0 );
ADLASSERT( status == CL_SUCCESS );
if( strcmp( buff, "NVIDIA Corporation" )==0 ) nvIdx = i;
if( strcmp( buff, "Advanced Micro Devices, Inc." )==0 ) atiIdx = i;
}
if( deviceType == CL_DEVICE_TYPE_GPU )
{
if( nvIdx != -1 ) platform = pIdx[nvIdx];
else platform = pIdx[atiIdx];
}
else if( deviceType == CL_DEVICE_TYPE_CPU )
{
platform = pIdx[atiIdx];
}
}
cl_uint numDevice;
status = clGetDeviceIDs( platform, deviceType, 0, NULL, &numDevice );
ADLASSERT( status == CL_SUCCESS );
return numDevice;
}
void DeviceCL::getDeviceName( char nameOut[128] ) const
{
cl_int status;
status = clGetDeviceInfo( m_deviceIdx, CL_DEVICE_NAME, sizeof(char)*128, nameOut, NULL );
ADLASSERT( status == CL_SUCCESS );
}
//	Looks up (building on a miss) the kernel funcName through this device's KernelManager;
//	all arguments are forwarded unchanged to KernelManager::query.
//	m_kernelManager is dereferenced without a null check.
Kernel* DeviceCL::getKernel(const char* fileName, const char* funcName, const char* option, const char* src, bool cacheKernel )const
{
return m_kernelManager->query( this, fileName, funcName, option, src, cacheKernel );
}
};

View File

@@ -0,0 +1,541 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
namespace adl
{
//	OpenCL view of a Kernel: exposes the opaque m_kernel slot as a cl_kernel.
struct KernelCL : public Kernel
{
	//	Returns a reference to the stored handle so that callers can both read and assign it.
	cl_kernel& getKernel()
	{
		return reinterpret_cast<cl_kernel&>( m_kernel );
	}
};
//	Returns a pointer to the part of `name` that follows the last occurrence of `pattern`,
//	or `name` itself when the pattern does not occur. Used to drop directory prefixes,
//	e.g. strip("a/b/c", "/") yields "c". The result points into `name`.
//	An empty pattern returns `name` unchanged (it used to loop forever).
static const char* strip(const char* name, const char* pattern)
{
	const size_t patlen = strlen(pattern);
	const char* tail = name;
	const char* hit;

	if (patlen == 0)
	{
		return name;
	}

	//	Advance past every occurrence; `tail` ends up just behind the last one.
	while ((hit = strstr(tail, pattern)) != NULL)
	{
		tail = hit + patlen;
	}
	return tail;
}
static bool isFileUpToDate(const char* binaryFileName,const char* srcFileName)
{
bool fileUpToDate = false;
bool binaryFileValid=false;
FILETIME modtimeBinary;
int nameLength = (int)strlen(binaryFileName)+1;
#ifdef UNICODE
WCHAR* fName = new WCHAR[nameLength];
MultiByteToWideChar(CP_ACP,0,binaryFileName,-1, fName, nameLength);
HANDLE binaryFileHandle = CreateFile(fName,GENERIC_READ,0,0,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,0);
delete [] fName;
#else
HANDLE binaryFileHandle = CreateFile(binaryFileName,GENERIC_READ,0,0,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,0);
#endif
if (binaryFileHandle ==INVALID_HANDLE_VALUE)
{
DWORD errorCode;
errorCode = GetLastError();
switch (errorCode)
{
case ERROR_FILE_NOT_FOUND:
{
debugPrintf("\nCached file not found %s\n", binaryFileName);
break;
}
case ERROR_PATH_NOT_FOUND:
{
debugPrintf("\nCached file path not found %s\n", binaryFileName);
break;
}
default:
{
debugPrintf("\nFailed reading cached file with errorCode = %d\n", errorCode);
}
}
} else
{
if (GetFileTime(binaryFileHandle, NULL, NULL, &modtimeBinary)==0)
{
DWORD errorCode;
errorCode = GetLastError();
debugPrintf("\nGetFileTime errorCode = %d\n", errorCode);
} else
{
binaryFileValid = true;
}
CloseHandle(binaryFileHandle);
}
if (binaryFileValid)
{
#ifdef UNICODE
int nameLength = (int)strlen(srcFileName)+1;
WCHAR* fName = new WCHAR[nameLength];
MultiByteToWideChar(CP_ACP,0,srcFileName,-1, fName, nameLength);
HANDLE srcFileHandle = CreateFile(fName,GENERIC_READ,0,0,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,0);
delete [] fName;
#else
HANDLE srcFileHandle = CreateFile(srcFileName,GENERIC_READ,0,0,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,0);
#endif
if (srcFileHandle!=INVALID_HANDLE_VALUE)
{
FILETIME modtimeSrc;
if (GetFileTime(srcFileHandle, NULL, NULL, &modtimeSrc)==0)
{
DWORD errorCode;
errorCode = GetLastError();
debugPrintf("\nGetFileTime errorCode = %d\n", errorCode);
}
if ( ( modtimeSrc.dwHighDateTime < modtimeBinary.dwHighDateTime)
||(( modtimeSrc.dwHighDateTime == modtimeBinary.dwHighDateTime)&&(modtimeSrc.dwLowDateTime <= modtimeBinary.dwLowDateTime)))
{
fileUpToDate=true;
} else
{
debugPrintf("\nCached binary file found (%s), but out-of-date\n",binaryFileName);
}
CloseHandle(srcFileHandle);
}
else
{
#ifdef _DEBUG
DWORD errorCode;
errorCode = GetLastError();
switch (errorCode)
{
case ERROR_FILE_NOT_FOUND:
{
debugPrintf("\nSrc file not found %s\n", srcFileName);
break;
}
case ERROR_PATH_NOT_FOUND:
{
debugPrintf("\nSrc path not found %s\n", srcFileName);
break;
}
default:
{
debugPrintf("\nnSrc file reading errorCode = %d\n", errorCode);
}
}
ADLASSERT(0);
#else
//if we cannot find the source, assume it is OK in release builds
fileUpToDate = true;
#endif
}
}
return fileUpToDate;
}
template<>
void KernelBuilder<TYPE_CL>::setFromFile( const Device* deviceData, const char* fileName, const char* option, bool addExtension,
bool cacheKernel)
{
m_deviceData = deviceData;
char fileNameWithExtension[256];
if( addExtension )
sprintf_s( fileNameWithExtension, "%s.cl", fileName );
else
sprintf_s( fileNameWithExtension, "%s", fileName );
class File
{
public:
__inline
bool open(const char* fileNameWithExtension)
{
size_t size;
char* str;
// Open file stream
std::fstream f(fileNameWithExtension, (std::fstream::in | std::fstream::binary));
// Check if we have opened file stream
if (f.is_open()) {
size_t sizeFile;
// Find the stream size
f.seekg(0, std::fstream::end);
size = sizeFile = (size_t)f.tellg();
f.seekg(0, std::fstream::beg);
str = new char[size + 1];
if (!str) {
f.close();
return NULL;
}
// Read file
f.read(str, sizeFile);
f.close();
str[size] = '\0';
m_source = str;
delete[] str;
return true;
}
return false;
}
const std::string& getSource() const {return m_source;}
private:
std::string m_source;
};
cl_program& program = (cl_program&)m_ptr;
cl_int status = 0;
bool cacheBinary = cacheKernel;
#if defined(ADL_CL_FORCE_UNCACHE_KERNEL)
cacheBinary = false;
#endif
char binaryFileName[512];
{
char deviceName[256];
deviceData->getDeviceName(deviceName);
char driverVersion[256];
const DeviceCL* dd = (const DeviceCL*) deviceData;
clGetDeviceInfo(dd->m_deviceIdx, CL_DRIVER_VERSION, 256, &driverVersion, NULL);
const char* strippedFileName = strip(fileName,"\\");
strippedFileName = strip(strippedFileName,"/");
sprintf_s(binaryFileName,"cache/%s.%s.%s.bin",strippedFileName, deviceName,driverVersion );
}
bool upToDate = isFileUpToDate(binaryFileName,fileNameWithExtension);
if( cacheBinary && upToDate)
{
FILE* file = fopen(binaryFileName, "rb");
if( file )
{
fseek( file, 0L, SEEK_END );
size_t binarySize = ftell( file );
rewind( file );
char* binary = new char[binarySize];
fread( binary, sizeof(char), binarySize, file );
fclose( file );
if (binarySize)
{
const DeviceCL* dd = (const DeviceCL*) deviceData;
program = clCreateProgramWithBinary( dd->m_context, 1, &dd->m_deviceIdx, &binarySize, (const unsigned char**)&binary, 0, &status );
ADLASSERT( status == CL_SUCCESS );
status = clBuildProgram( program, 1, &dd->m_deviceIdx, option, 0, 0 );
ADLASSERT( status == CL_SUCCESS );
if( status != CL_SUCCESS )
{
char *build_log;
size_t ret_val_size;
clGetProgramBuildInfo(program, dd->m_deviceIdx, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
build_log = new char[ret_val_size+1];
clGetProgramBuildInfo(program, dd->m_deviceIdx, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
build_log[ret_val_size] = '\0';
debugPrintf("%s\n", build_log);
delete build_log;
ADLASSERT(0);
}
}
}
}
if( !m_ptr )
{
File kernelFile;
ADLASSERT( kernelFile.open( fileNameWithExtension ) );
const char* source = kernelFile.getSource().c_str();
setFromSrc( m_deviceData, source, option );
if( cacheBinary )
{ // write to binary
size_t binarySize;
status = clGetProgramInfo( program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binarySize, 0 );
ADLASSERT( status == CL_SUCCESS );
char* binary = new char[binarySize];
status = clGetProgramInfo( program, CL_PROGRAM_BINARIES, sizeof(char*), &binary, 0 );
ADLASSERT( status == CL_SUCCESS );
{
FILE* file = fopen(binaryFileName, "wb");
if (file)
{
fwrite( binary, sizeof(char), binarySize, file );
fclose( file );
}
}
delete [] binary;
}
}
}
template<>
void KernelBuilder<TYPE_CL>::setFromSrcCached( const Device* deviceData, const char* src, const char* fileName, const char* option )
{
m_deviceData = deviceData;
bool cacheBinary = true;
cl_program& program = (cl_program&)m_ptr;
cl_int status = 0;
char binaryFileName[512];
{
char deviceName[256];
deviceData->getDeviceName(deviceName);
char driverVersion[256];
const DeviceCL* dd = (const DeviceCL*) deviceData;
clGetDeviceInfo(dd->m_deviceIdx, CL_DRIVER_VERSION, 256, &driverVersion, NULL);
const char* strippedFileName = strip(fileName,"\\");
strippedFileName = strip(strippedFileName,"/");
sprintf_s(binaryFileName,"cache/%s.%s.%s.bin",strippedFileName, deviceName,driverVersion );
}
char fileNameWithExtension[256];
sprintf_s(fileNameWithExtension,"%s.cl",fileName, ".cl");
bool upToDate = isFileUpToDate(binaryFileName,fileNameWithExtension);
if( cacheBinary )
{
bool fileUpToDate = isFileUpToDate(binaryFileName,fileNameWithExtension);
if( fileUpToDate)
{
FILE* file = fopen(binaryFileName, "rb");
if (file)
{
fseek( file, 0L, SEEK_END );
size_t binarySize = ftell( file );
rewind( file );
char* binary = new char[binarySize];
fread( binary, sizeof(char), binarySize, file );
fclose( file );
const DeviceCL* dd = (const DeviceCL*) deviceData;
program = clCreateProgramWithBinary( dd->m_context, 1, &dd->m_deviceIdx, &binarySize, (const unsigned char**)&binary, 0, &status );
ADLASSERT( status == CL_SUCCESS );
status = clBuildProgram( program, 1, &dd->m_deviceIdx, option, 0, 0 );
ADLASSERT( status == CL_SUCCESS );
if( status != CL_SUCCESS )
{
char *build_log;
size_t ret_val_size;
clGetProgramBuildInfo(program, dd->m_deviceIdx, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
build_log = new char[ret_val_size+1];
clGetProgramBuildInfo(program, dd->m_deviceIdx, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
build_log[ret_val_size] = '\0';
debugPrintf("%s\n", build_log);
delete build_log;
ADLASSERT(0);
}
delete[] binary;
}
}
}
if( !m_ptr )
{
setFromSrc( deviceData, src, option );
if( cacheBinary )
{ // write to binary
cl_uint numAssociatedDevices;
status = clGetProgramInfo( program, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &numAssociatedDevices, 0 );
ADLASSERT( status == CL_SUCCESS );
if (numAssociatedDevices==1)
{
size_t binarySize;
status = clGetProgramInfo( program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binarySize, 0 );
ADLASSERT( status == CL_SUCCESS );
char* binary = new char[binarySize];
status = clGetProgramInfo( program, CL_PROGRAM_BINARIES, sizeof(char*), &binary, 0 );
ADLASSERT( status == CL_SUCCESS );
{
FILE* file = fopen(binaryFileName, "wb");
if (file)
{
fwrite( binary, sizeof(char), binarySize, file );
fclose( file );
}
}
delete [] binary;
}
}
}
}
// Builds an OpenCL program from in-memory source text for the device's
// context and stores the resulting cl_program handle in m_ptr.
//   deviceData : must be a TYPE_CL device (asserted).
//   src        : NUL-terminated OpenCL C source.
//   option     : compiler options forwarded to clBuildProgram.
// On a build failure the compiler log is printed and ADLASSERT(0) fires.
template<>
void KernelBuilder<TYPE_CL>::setFromSrc( const Device* deviceData, const char* src, const char* option )
{
	ADLASSERT( deviceData->m_type == TYPE_CL );
	m_deviceData = deviceData;
	const DeviceCL* dd = (const DeviceCL*) deviceData;

	// m_ptr is reused as the storage slot for the cl_program handle.
	cl_program& program = (cl_program&)m_ptr;
	cl_int status = 0;
	size_t srcSize[] = {strlen( src )};
	program = clCreateProgramWithSource( dd->m_context, 1, &src, srcSize, &status );
	ADLASSERT( status == CL_SUCCESS );

	status = clBuildProgram( program, 1, &dd->m_deviceIdx, option, NULL, NULL );
	if( status != CL_SUCCESS )
	{
		// Query the log size first, then fetch the log itself.
		size_t logSize = 0;
		clGetProgramBuildInfo(program, dd->m_deviceIdx, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
		char* buildLog = new char[logSize+1];
		clGetProgramBuildInfo(program, dd->m_deviceIdx, CL_PROGRAM_BUILD_LOG, logSize, buildLog, NULL);
		buildLog[logSize] = '\0';
		debugPrintf("%s\n", buildLog);
		printf("%s\n", buildLog);
		// Array form of delete to match new[]; freed before the assert so
		// the log is not leaked when the assert aborts or breaks.
		delete [] buildLog;
		ADLASSERT(0);
	}
}
// Drops the reference on the cl_program handle kept in m_ptr.
template<>
KernelBuilder<TYPE_CL>::~KernelBuilder()
{
	clReleaseProgram( (cl_program)m_ptr );
}
// Creates the kernel named funcName from the program held in m_ptr and
// stores it in kernelOut, tagging kernelOut as an OpenCL kernel.
template<>
void KernelBuilder<TYPE_CL>::createKernel( const char* funcName, Kernel& kernelOut )
{
	cl_int status = 0;
	KernelCL* target = (KernelCL*)&kernelOut;

	target->getKernel() = clCreateKernel( (cl_program)m_ptr, funcName, &status );
	ADLASSERT( status == CL_SUCCESS );

	kernelOut.m_type = TYPE_CL;
}
// Releases the OpenCL kernel object wrapped by `kernel`.
template<>
void KernelBuilder<TYPE_CL>::deleteKernel( Kernel& kernel )
{
	KernelCL* wrapped = (KernelCL*)&kernel;
	clReleaseKernel( wrapped->getKernel() );
}
// Static helpers that bind arguments to, and enqueue, the OpenCL kernel
// referenced by a Launcher.
class LauncherCL
{
	public:
		typedef Launcher::BufferInfo BufferInfo;

		// Binds n buffers as consecutive kernel arguments.
		__inline static void setBuffers( Launcher* launcher, BufferInfo* buffInfo, int n );

		// Binds `consts` by value as the next kernel argument.
		template<typename T>
		__inline static void setConst( Launcher* launcher, Buffer<T>& constBuff, const T& consts );

		// Enqueues the kernel over a 2D range.
		__inline static void launch2D( Launcher* launcher, int numThreadsX, int numThreadsY, int localSizeX, int localSizeY );
};
// Passes the cl_mem handle of each of the n buffers to the kernel, using
// launcher->m_idx as the running argument index.
void LauncherCL::setBuffers( Launcher* launcher, BufferInfo* buffInfo, int n )
{
	KernelCL* clKernel = (KernelCL*)launcher->m_kernel;

	int slot = 0;
	while( slot < n )
	{
		// Only m_ptr is read, so the element type of the cast is irrelevant.
		Buffer<int>* buff = (Buffer<int>*)buffInfo[slot].m_buffer;
		cl_int status = clSetKernelArg( clKernel->getKernel(), launcher->m_idx++, sizeof(cl_mem), &buff->m_ptr );
		ADLASSERT( status == CL_SUCCESS );
		++slot;
	}
}
// Passes `consts` by value as the next kernel argument (sizeof(T) bytes).
// constBuff is not used by this implementation.
template<typename T>
void LauncherCL::setConst( Launcher* launcher, Buffer<T>& constBuff, const T& consts )
{
	KernelCL* clKernel = (KernelCL*)launcher->m_kernel;
	const int argBytes = sizeof(T);

	cl_int status = clSetKernelArg( clKernel->getKernel(), launcher->m_idx++, argBytes, &consts );
	ADLASSERT( status == CL_SUCCESS );
}
// Enqueues the launcher's kernel over a 2D NDRange. The global size on each
// axis is the thread count rounded up to a whole number of work-groups,
// with at least one work-group per axis.
void LauncherCL::launch2D( Launcher* launcher, int numThreadsX, int numThreadsY, int localSizeX, int localSizeY )
{
	KernelCL* clKernel = (KernelCL*)launcher->m_kernel;
	const DeviceCL* ddcl = (const DeviceCL*)launcher->m_deviceData;

	size_t lRange[3] = { (size_t)localSizeX, (size_t)localSizeY, 1 };
	size_t gRange[3] = { 1, 1, 1 };
	const int requested[2] = { numThreadsX, numThreadsY };

	for( int axis = 0; axis < 2; ++axis )
	{
		// ceil( requested / local ), clamped to >= 1 group.
		size_t nGroups = requested[axis] / lRange[axis];
		if( requested[axis] % lRange[axis] )
			nGroups += 1;
		if( nGroups < (size_t)1 )
			nGroups = (size_t)1;
		gRange[axis] = nGroups * lRange[axis];
	}

	cl_int status = clEnqueueNDRangeKernel( ddcl->m_commandQueue,
		clKernel->getKernel(), 2, NULL, gRange, lRange, 0,0,0 );
	ADLASSERT( status == CL_SUCCESS );
}
};

View File

@@ -0,0 +1,512 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#include <windows.h>
#include <d3d11.h>
#include <d3dx11.h>
#include <d3dcompiler.h>
#include <DXGI.h>
#pragma comment(lib,"d3dx11.lib")
#pragma comment(lib,"d3d11.lib")
#pragma comment(lib,"DXGI.lib")
namespace adl
{
#define u32 unsigned int
struct DeviceDX11 : public Device
{
typedef DeviceUtils::Config Config;
__inline
DeviceDX11() : Device( TYPE_DX11 ), m_kernelManager(0){}
__inline
void* getContext() const { return m_context; }
__inline
void initialize(const Config& cfg);
__inline
void release();
template<typename T>
__inline
void allocate(Buffer<T>* buf, int nElems, BufferBase::BufferType type);
template<typename T>
__inline
void deallocate(Buffer<T>* buf);
template<typename T>
__inline
void copy(Buffer<T>* dst, const Buffer<T>* src, int nElems);
template<typename T>
__inline
void copy(T* dst, const Buffer<T>* src, int nElems, int srcOffsetNElems = 0);
template<typename T>
__inline
void copy(Buffer<T>* dst, const T* src, int nElems, int dstOffsetNElems = 0);
__inline
void waitForCompletion() const;
__inline
void getDeviceName( char nameOut[128] ) const;
__inline
static
int getNDevices();
__inline
Kernel* getKernel(const char* fileName, const char* funcName, const char* option = NULL, const char* src = NULL, bool cacheKernel = true )const;
ID3D11DeviceContext* m_context;
ID3D11Device* m_device;
IDXGISwapChain* m_swapChain;
KernelManager* m_kernelManager;
};
// Typed view over a Buffer<T> that reinterprets its m_ptr / m_uav / m_srv
// members as the corresponding D3D11 interface pointers.
// Members inherited from the dependent base Buffer<T> are accessed through
// this-> so the code also compiles with conforming two-phase name lookup.
template<typename T>
struct BufferDX11 : public Buffer<T>
{
	ID3D11Buffer* getBuffer() { return (ID3D11Buffer*)this->m_ptr; }
	ID3D11UnorderedAccessView* getUAV() { return (ID3D11UnorderedAccessView*)this->m_uav; }
	ID3D11ShaderResourceView* getSRV() { return (ID3D11ShaderResourceView*)this->m_srv; }

	// Address-of variants, used as output parameters of the Create* calls.
	ID3D11Buffer** getBufferPtr() { return (ID3D11Buffer**)&this->m_ptr; }
	ID3D11UnorderedAccessView** getUAVPtr() { return (ID3D11UnorderedAccessView**)&this->m_uav; }
	ID3D11ShaderResourceView** getSRVPtr() { return (ID3D11ShaderResourceView**)&this->m_srv; }
};
#define SAFE_RELEASE(p) { if(p) { (p)->Release(); (p)=NULL; } }
void DeviceDX11::initialize(const Config& cfg)
{
DeviceDX11* deviceData = this;
HRESULT hr = S_OK;
UINT createDeviceFlg = 0;
#ifdef _DEBUG
createDeviceFlg |= D3D11_CREATE_DEVICE_DEBUG;
#endif
D3D_FEATURE_LEVEL fl[] = {
D3D_FEATURE_LEVEL_11_0,
D3D_FEATURE_LEVEL_10_1,
D3D_FEATURE_LEVEL_10_0
};
typedef HRESULT (WINAPI * LPD3D11CREATEDEVICE)( IDXGIAdapter*, D3D_DRIVER_TYPE, HMODULE, u32, D3D_FEATURE_LEVEL*, UINT, u32, ID3D11Device**, D3D_FEATURE_LEVEL*, ID3D11DeviceContext** );
HMODULE moduleD3D11 = 0;
#ifdef UNICODE
moduleD3D11 = LoadLibrary( L"d3d11.dll" );
#else
moduleD3D11 = LoadLibrary( "d3d11.dll" );
#endif
ADLASSERT( moduleD3D11 );
LPD3D11CREATEDEVICE _DynamicD3D11CreateDevice;
_DynamicD3D11CreateDevice = ( LPD3D11CREATEDEVICE )GetProcAddress( moduleD3D11, "D3D11CreateDevice" );
D3D_DRIVER_TYPE type = D3D_DRIVER_TYPE_HARDWARE;
// http://msdn.microsoft.com/en-us/library/ff476082(v=VS.85).aspx
// If you set the pAdapter parameter to a non-NULL value, you must also set the DriverType parameter to the D3D_DRIVER_TYPE_UNKNOWN value. If you set the pAdapter parameter to a non-NULL value and the DriverType parameter to the D3D_DRIVER_TYPE_HARDWARE value, D3D11CreateDevice returns an HRESULT of E_INVALIDARG.
type = D3D_DRIVER_TYPE_UNKNOWN;
/*
// Create a hardware Direct3D 11 device
hr = _DynamicD3D11CreateDevice( NULL,
type, NULL, createDeviceFlg,
fl, _countof(fl), D3D11_SDK_VERSION, &deviceData->m_device, NULL, &deviceData->m_context );
*/
IDXGIAdapter* adapter = NULL;
{// get adapter of the index
IDXGIFactory* factory = NULL;
int targetAdapterIdx = cfg.m_deviceIdx;//min( cfg.m_deviceIdx, getNDevices()-1 );
CreateDXGIFactory( __uuidof(IDXGIFactory), (void**)&factory );
u32 i = 0;
while( factory->EnumAdapters( i, &adapter ) != DXGI_ERROR_NOT_FOUND )
{
if( i== targetAdapterIdx ) break;
i++;
}
factory->Release();
}
// Create a hardware Direct3D 11 device
hr = D3D11CreateDevice( adapter,
type,
NULL, createDeviceFlg,
fl, _countof(fl), D3D11_SDK_VERSION, &deviceData->m_device, NULL, &deviceData->m_context );
ADLASSERT( hr == S_OK );
// Check if the hardware device supports Compute Shader 4.0
D3D11_FEATURE_DATA_D3D10_X_HARDWARE_OPTIONS hwopts;
deviceData->m_device->CheckFeatureSupport(D3D11_FEATURE_D3D10_X_HARDWARE_OPTIONS, &hwopts, sizeof(hwopts));
if( !hwopts.ComputeShaders_Plus_RawAndStructuredBuffers_Via_Shader_4_x )
{
SAFE_RELEASE( deviceData->m_context );
SAFE_RELEASE( deviceData->m_device );
debugPrintf("DX11 GPU is not present\n");
ADLASSERT( 0 );
}
m_kernelManager = new KernelManager;
}
void DeviceDX11::release()
{
SAFE_RELEASE( m_context );
SAFE_RELEASE( m_device );
if( m_kernelManager ) delete m_kernelManager;
}
// Creates the D3D11 buffer behind `buf` and, depending on `type`, the
// unordered-access and shader-resource views needed to bind it to a compute
// shader.
//   buf    : buffer to fill in; m_device and m_size are set here, and the
//            D3D objects are written through the BufferDX11 accessors.
//   nElems : number of T elements (must be 1 for BUFFER_CONST).
//   type   : buffer kind; BUFFER_ZERO_COPY is rejected by assertion.
// Every Create* call is checked with ADLASSERT( hr == S_OK ).
template<typename T>
void DeviceDX11::allocate(Buffer<T>* buf, int nElems, BufferBase::BufferType type)
{
ADLASSERT( type != BufferBase::BUFFER_ZERO_COPY );
DeviceDX11* deviceData = this;
buf->m_device = deviceData;
buf->m_size = nElems;
BufferDX11<T>* dBuf = (BufferDX11<T>*)buf;
// if( type & BufferBase::BUFFER )
{
HRESULT hr = S_OK;
// Constant buffers take a separate path: a single element, no views.
if( type == BufferBase::BUFFER_CONST )
{
ADLASSERT( nElems == 1 );
D3D11_BUFFER_DESC constant_buffer_desc;
ZeroMemory( &constant_buffer_desc, sizeof(constant_buffer_desc) );
// constant_buffer_desc.ByteWidth = NEXTMULTIPLEOF( sizeof(T), 16 );
// sizeof(T) rounded up to the next multiple of 16 bytes.
constant_buffer_desc.ByteWidth = (((sizeof(T))/(16) + (((sizeof(T))%(16)==0)?0:1))*(16));
// constant_buffer_desc.Usage = D3D11_USAGE_DYNAMIC;
// constant_buffer_desc.BindFlags = D3D11_BIND_CONSTANT_BUFFER;
// constant_buffer_desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE;
constant_buffer_desc.Usage = D3D11_USAGE_DEFAULT;
constant_buffer_desc.BindFlags = D3D11_BIND_CONSTANT_BUFFER;
constant_buffer_desc.CPUAccessFlags = 0;
hr = deviceData->m_device->CreateBuffer( &constant_buffer_desc, NULL, dBuf->getBufferPtr() );
ADLASSERT( hr == S_OK );
return;
}
// All other buffer kinds share one descriptor that is specialised below.
D3D11_BUFFER_DESC buffer_desc;
ZeroMemory(&buffer_desc, sizeof(buffer_desc));
buffer_desc.ByteWidth = nElems * sizeof(T);
if( type != BufferBase::BUFFER_RAW )
{
buffer_desc.StructureByteStride = sizeof(T);
// buffer_desc.CPUAccessFlags = D3D11_CPU_ACCESS_READ;
}
if( type == BufferBase::BUFFER_STAGING )
{
// Staging buffers are CPU-readable and get no bind flags.
buffer_desc.Usage = D3D11_USAGE_STAGING;
buffer_desc.CPUAccessFlags = D3D11_CPU_ACCESS_READ;
}
else if( type == BufferBase::BUFFER_INDEX )
{
buffer_desc.Usage = D3D11_USAGE_DEFAULT;
buffer_desc.BindFlags = D3D11_BIND_INDEX_BUFFER;
}
else if( type == BufferBase::BUFFER_VERTEX )
{
buffer_desc.Usage = D3D11_USAGE_DEFAULT;
buffer_desc.BindFlags = D3D11_BIND_VERTEX_BUFFER;
}
else
{
// Compute buffers: bindable both as UAV and SRV, structured by default.
buffer_desc.Usage = D3D11_USAGE_DEFAULT;
buffer_desc.BindFlags = D3D11_BIND_UNORDERED_ACCESS | D3D11_BIND_SHADER_RESOURCE;
buffer_desc.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_STRUCTURED;
// check this
if(type == BufferBase::BUFFER_RAW)
{
// buffer_desc.BindFlags |= D3D11_BIND_INDEX_BUFFER | D3D11_BIND_VERTEX_BUFFER;
// Raw buffers replace (not extend) the structured flag set above.
buffer_desc.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_ALLOW_RAW_VIEWS | D3D11_RESOURCE_MISC_DRAWINDIRECT_ARGS; // need this to be used for DispatchIndirect
}
}
hr = deviceData->m_device->CreateBuffer(&buffer_desc, NULL, dBuf->getBufferPtr());
ADLASSERT( hr == S_OK );
// Index buffers need no views. Staging and vertex buffers match neither
// branch below, so they also end up without views.
if( type == BufferBase::BUFFER_INDEX ) return;
if( type == BufferBase::BUFFER ||
type == BufferBase::BUFFER_RAW ||
type == BufferBase::BUFFER_W_COUNTER )
{
// Create UAVs for all CS buffers
D3D11_UNORDERED_ACCESS_VIEW_DESC uavbuffer_desc;
ZeroMemory(&uavbuffer_desc, sizeof(uavbuffer_desc));
uavbuffer_desc.ViewDimension = D3D11_UAV_DIMENSION_BUFFER;
if( type == BufferBase::BUFFER_RAW )
{
// Raw views are addressed in 32-bit words, hence ByteWidth / 4.
uavbuffer_desc.Format = DXGI_FORMAT_R32_TYPELESS;
uavbuffer_desc.Buffer.Flags = D3D11_BUFFER_UAV_FLAG_RAW;
uavbuffer_desc.Buffer.NumElements = buffer_desc.ByteWidth / 4;
}
else
{
uavbuffer_desc.Format = DXGI_FORMAT_UNKNOWN;
uavbuffer_desc.Buffer.NumElements = nElems;
}
if( type == BufferBase::BUFFER_W_COUNTER )
{
uavbuffer_desc.Buffer.Flags = D3D11_BUFFER_UAV_FLAG_COUNTER;
}
hr = deviceData->m_device->CreateUnorderedAccessView(dBuf->getBuffer(), &uavbuffer_desc, dBuf->getUAVPtr());
ADLASSERT( hr == S_OK );
// Create SRVs for all CS buffers
D3D11_SHADER_RESOURCE_VIEW_DESC srvbuffer_desc;
ZeroMemory(&srvbuffer_desc, sizeof(srvbuffer_desc));
if( type == BufferBase::BUFFER_RAW )
{
ADLASSERT( sizeof(T) <= 16 );
srvbuffer_desc.Format = DXGI_FORMAT_R32_UINT;
srvbuffer_desc.Buffer.ElementWidth = nElems;
// if ( buffer_desc.MiscFlags & D3D11_RESOURCE_MISC_BUFFER_ALLOW_RAW_VIEWS )
// {
// srvbuffer_desc.Format = DXGI_FORMAT_R32_TYPELESS;
// srvbuffer_desc.BufferEx.Flags = D3D11_BUFFEREX_SRV_FLAG_RAW;
// srvbuffer_desc.BufferEx.NumElements = buffer_desc.ByteWidth / 4;
}
else
{
srvbuffer_desc.Format = DXGI_FORMAT_UNKNOWN;
srvbuffer_desc.Buffer.ElementWidth = nElems;
}
srvbuffer_desc.ViewDimension = D3D11_SRV_DIMENSION_BUFFER;
hr = deviceData->m_device->CreateShaderResourceView(dBuf->getBuffer(), &srvbuffer_desc, dBuf->getSRVPtr());
ADLASSERT( hr == S_OK );
}
else if( type == BufferBase::BUFFER_APPEND )
{
// Append buffers get a UAV only (no SRV is created on this path).
D3D11_UNORDERED_ACCESS_VIEW_DESC desc;
ZeroMemory( &desc, sizeof(desc) );
desc.ViewDimension = D3D11_UAV_DIMENSION_BUFFER;
desc.Buffer.FirstElement = 0;
desc.Buffer.Flags = D3D11_BUFFER_UAV_FLAG_APPEND;
desc.Format = DXGI_FORMAT_UNKNOWN; // Format must be DXGI_FORMAT_UNKNOWN when creating a view of a structured buffer
desc.Buffer.NumElements = buffer_desc.ByteWidth / buffer_desc.StructureByteStride;
hr = deviceData->m_device->CreateUnorderedAccessView( dBuf->getBuffer(), &desc, dBuf->getUAVPtr() );
ADLASSERT( hr == S_OK );
}
}
// else
// {
// ADLASSERT(0);
// }
}
// Releases the D3D11 buffer and its views (whichever exist), clears the
// corresponding members of buf and detaches buf from this device.
template<typename T>
void DeviceDX11::deallocate(Buffer<T>* buf)
{
	BufferDX11<T>* dxBuf = (BufferDX11<T>*)buf;

	if( ID3D11Buffer* resource = dxBuf->getBuffer() )
	{
		resource->Release();
		dxBuf->m_ptr = NULL;
	}

	if( ID3D11UnorderedAccessView* uav = dxBuf->getUAV() )
	{
		uav->Release();
		dxBuf->m_uav = NULL;
	}

	if( ID3D11ShaderResourceView* srv = dxBuf->getSRV() )
	{
		srv->Release();
		dxBuf->m_srv = NULL;
	}

	buf->m_device = 0;
}
// Copies the first nElems elements of src into dst.
//   - both buffers on a DX11 device : GPU-side CopySubresourceRegion
//   - src on the host               : upload through dst->write
//   - dst on the host               : read-back through src->read
// Any other combination asserts.
template<typename T>
void DeviceDX11::copy(Buffer<T>* dst, const Buffer<T>* src, int nElems)
{
	// Both sides must be D3D11 resources for the GPU copy. The original test
	// used '||', which sent host buffers down this path (treating host memory
	// as an ID3D11Buffer) and made the host branches below unreachable.
	if( dst->m_device->m_type == TYPE_DX11 && src->m_device->m_type == TYPE_DX11 )
	{
		DeviceDX11* deviceData = this;
		BufferDX11<T>* dDst = (BufferDX11<T>*)dst;
		BufferDX11<T>* dSrc = (BufferDX11<T>*)src;

		// Source byte range [0, nElems*sizeof(T)) of the 1D buffer.
		D3D11_BOX destRegion;
		destRegion.left = 0*sizeof(T);
		destRegion.front = 0;
		destRegion.top = 0;
		destRegion.bottom = 1;
		destRegion.back = 1;
		destRegion.right = (0+nElems)*sizeof(T);

		deviceData->m_context->CopySubresourceRegion(
			dDst->getBuffer(),
			0, 0, 0, 0,
			dSrc->getBuffer(),
			0,
			&destRegion );
	}
	else if( src->m_device->m_type == TYPE_HOST )
	{
		ADLASSERT( dst->getType() == TYPE_DX11 );
		dst->write( src->m_ptr, nElems );
	}
	else if( dst->m_device->m_type == TYPE_HOST )
	{
		ADLASSERT( src->getType() == TYPE_DX11 );
		src->read( dst->m_ptr, nElems );
	}
	else
	{
		ADLASSERT( 0 );
	}
}
template<typename T>
void DeviceDX11::copy(T* dst, const Buffer<T>* src, int nElems, int srcOffsetNElems)
{
DeviceDX11* deviceData = this;
BufferDX11<T>* dSrc = (BufferDX11<T>*)src;
Buffer<T> sBuf( deviceData, nElems, BufferBase::BUFFER_STAGING );
BufferDX11<T>* dStagingBuf = (BufferDX11<T>*)&sBuf;
ID3D11Buffer *StagingBuffer = dStagingBuf->getBuffer();
D3D11_MAPPED_SUBRESOURCE MappedVelResource = {0};
D3D11_BOX destRegion;
destRegion.left = srcOffsetNElems*sizeof(T);
destRegion.front = 0;
destRegion.top = 0;
destRegion.bottom = 1;
destRegion.back = 1;
destRegion.right = (srcOffsetNElems+nElems)*sizeof(T);
deviceData->m_context->CopySubresourceRegion(
StagingBuffer,
0, 0, 0, 0,
dSrc->getBuffer(),
0,
&destRegion);
deviceData->m_context->Map(StagingBuffer, 0, D3D11_MAP_READ, 0, &MappedVelResource);
memcpy(dst, MappedVelResource.pData, nElems*sizeof(T));
deviceData->m_context->Unmap(StagingBuffer, 0);
}
// Uploads nElems elements from host memory src into the device buffer dst,
// starting at element dstOffsetNElems, via UpdateSubresource.
template<typename T>
void DeviceDX11::copy(Buffer<T>* dst, const T* src, int nElems, int dstOffsetNElems)
{
	BufferDX11<T>* target = (BufferDX11<T>*)dst;

	// Destination byte range inside the 1D buffer.
	D3D11_BOX region;
	region.left = dstOffsetNElems*sizeof(T);
	region.right = (dstOffsetNElems+nElems)*sizeof(T);
	region.top = 0;
	region.bottom = 1;
	region.front = 0;
	region.back = 1;

	this->m_context->UpdateSubresource(target->getBuffer(), 0, &region, src, 0, 0);
}
void DeviceDX11::waitForCompletion() const
{
const DeviceDX11* deviceData = this;
ID3D11Query* syncQuery;
D3D11_QUERY_DESC qDesc;
qDesc.Query = D3D11_QUERY_EVENT;
qDesc.MiscFlags = 0;
deviceData->m_device->CreateQuery( &qDesc, &syncQuery );
deviceData->m_context->End( syncQuery );
while( deviceData->m_context->GetData( syncQuery, 0,0,0 ) == S_FALSE ){}
syncQuery->Release();
}
int DeviceDX11::getNDevices()
{
IDXGIFactory1* factory = NULL;
IDXGIAdapter1* adapter = NULL;
CreateDXGIFactory1( __uuidof(IDXGIFactory1), (void**)&factory );
u32 i = 0;
while( factory->EnumAdapters1( i, &adapter ) != DXGI_ERROR_NOT_FOUND )
{
i++;
}
factory->Release();
return i;
}
void DeviceDX11::getDeviceName( char nameOut[128] ) const
{
IDXGIAdapter* adapter;// = getAdapterFromDevice( this );
{
IDXGIDevice* pDXGIDevice;
ADLASSERT( m_device->QueryInterface(__uuidof(IDXGIDevice), (void **)&pDXGIDevice) == S_OK );
ADLASSERT( pDXGIDevice->GetParent(__uuidof(IDXGIAdapter), (void **)&adapter) == S_OK );
pDXGIDevice->Release();
}
DXGI_ADAPTER_DESC adapterDesc;
adapter->GetDesc( &adapterDesc );
// wcstombs( nameOut, adapterDesc.Description, 128 );
size_t i;
wcstombs_s( &i, nameOut, 128, adapterDesc.Description, 128 );
}
// Forwards the kernel request, unchanged, to the device's kernel manager.
Kernel* DeviceDX11::getKernel(const char* fileName, const char* funcName, const char* option, const char* src, bool cacheKernel ) const
{
	Kernel* found = m_kernelManager->query( this, fileName, funcName, option, src, cacheKernel );
	return found;
}
#undef u32
#undef SAFE_RELEASE
};

View File

@@ -0,0 +1,348 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
namespace adl
{
#define SAFE_RELEASE(p) { if(p) { (p)->Release(); (p)=NULL; } }
// View over a Kernel whose m_kernel member holds an ID3D11ComputeShader.
struct KernelDX11 : public Kernel
{
	// The compute shader itself.
	ID3D11ComputeShader* getKernel()
	{
		return (ID3D11ComputeShader*)m_kernel;
	}

	// Address of the slot holding the shader, for use as an output parameter.
	ID3D11ComputeShader** getKernelPtr()
	{
		return (ID3D11ComputeShader**)&m_kernel;
	}
};
// Resolves the on-disk location of a shader file.
//   strDestPath : receives the resolved path (cchDest characters available).
//   cchDest     : capacity of strDestPath in characters; must be >= 10.
//   strFilename : file name to look for.
// Search order:
//   1. strFilename as given (relative to the current directory)
//   2. <exe dir>/../<exe name without extension>/<strFilename>
// Returns S_OK when a candidate exists, E_INVALIDARG for bad arguments, and
// E_FAIL (with strDestPath set to strFilename, after ADLASSERT(0)) otherwise.
__inline
#ifdef UNICODE
HRESULT FindDXSDKShaderFileCch( __in_ecount(cchDest) WCHAR* strDestPath,
								int cchDest,
								__in LPCWSTR strFilename )
#else
HRESULT FindDXSDKShaderFileCch( __in_ecount(cchDest) CHAR* strDestPath,
								int cchDest,
								__in LPCSTR strFilename )
#endif
{
	if( NULL == strFilename || strFilename[0] == 0 || NULL == strDestPath || cchDest < 10 )
		return E_INVALIDARG;

#ifdef UNICODE
	typedef WCHAR PathChar;
#else
	typedef CHAR PathChar;
#endif

	// Get the exe name, and exe path
	PathChar strExePath[MAX_PATH] = { 0 };
	PathChar strExeName[MAX_PATH] = { 0 };

	GetModuleFileName( NULL, strExePath, MAX_PATH );
	strExePath[MAX_PATH - 1] = 0;

#ifdef UNICODE
	PathChar* strLastSlash = wcsrchr( strExePath, TEXT( '\\' ) );
#else
	PathChar* strLastSlash = strrchr( strExePath, TEXT( '\\' ) );
#endif
	if( strLastSlash )
	{
		// Keep the part after the last backslash as the exe name. The ANSI
		// build previously skipped this copy, leaving strExeName empty.
#ifdef UNICODE
		wcscpy_s( strExeName, MAX_PATH, &strLastSlash[1] );
#else
		strcpy_s( strExeName, MAX_PATH, &strLastSlash[1] );
#endif

		// Chop the exe name from the exe path
		*strLastSlash = 0;

		// Chop the .exe from the exe name
#ifdef UNICODE
		PathChar* strLastDot = wcsrchr( strExeName, TEXT( '.' ) );
#else
		PathChar* strLastDot = strrchr( strExeName, TEXT( '.' ) );
#endif
		if( strLastDot )
			*strLastDot = 0;
	}

	// Candidate 1: the file name as given (current directory).
#ifdef UNICODE
	wcscpy_s( strDestPath, cchDest, strFilename );
#else
	strcpy_s( strDestPath, cchDest, strFilename );
#endif
	if( GetFileAttributes( strDestPath ) != INVALID_FILE_ATTRIBUTES )
		return S_OK;

	// Candidate 2: <exe dir>/../<exe name>/<file name>.
#ifdef UNICODE
	swprintf_s( strDestPath, cchDest, L"%s\\..\\%s\\%s", strExePath, strExeName, strFilename );
#else
	sprintf_s( strDestPath, cchDest, "%s\\..\\%s\\%s", strExePath, strExeName, strFilename );
#endif
	if( GetFileAttributes( strDestPath ) != INVALID_FILE_ATTRIBUTES )
		return S_OK;

	// On failure, return the file as the path but also return an error code
#ifdef UNICODE
	wcscpy_s( strDestPath, cchDest, strFilename );
#else
	strcpy_s( strDestPath, cchDest, strFilename );
#endif

	ADLASSERT( 0 );

	return E_FAIL;
}
// Records the device and resolves the path of the HLSL file to compile
// later in createKernel(); the resolved path is written to m_path.
//   fileName     : shader file name; ".hlsl" is appended when addExtension.
//   option       : not used by this implementation.
//   cacheKernel  : not used by this implementation.
// Asserts when the file cannot be located.
template<>
void KernelBuilder<TYPE_DX11>::setFromFile( const Device* deviceData, const char* fileName, const char* option, bool addExtension,
	bool cacheKernel)
{
	char fileNameWithExtension[256];

	if( addExtension )
		sprintf_s( fileNameWithExtension, "%s.hlsl", fileName );
	else
		sprintf_s( fileNameWithExtension, "%s", fileName );

	m_deviceData = deviceData;

	// Length in characters, including the terminator.
	int nameLength = (int)strlen(fileNameWithExtension)+1;
#ifdef UNICODE
	WCHAR* wfileNameWithExtension = new WCHAR[nameLength];
#else
	CHAR* wfileNameWithExtension = new CHAR[nameLength];
#endif
	// Clear the whole buffer: nameLength is a character count, so scale it
	// by the character size (2 bytes per character in UNICODE builds).
	memset(wfileNameWithExtension,0,nameLength*sizeof(*wfileNameWithExtension));
#ifdef UNICODE
	MultiByteToWideChar(CP_ACP,0,fileNameWithExtension,-1, wfileNameWithExtension, nameLength);
#else
	sprintf_s(wfileNameWithExtension, nameLength, "%s", fileNameWithExtension);
#endif

	HRESULT hr;

	// Finds the correct path for the shader file.
	// This is only required for this sample to be run correctly from within the Sample Browser,
	// in your own projects, these lines could be removed safely
	hr = FindDXSDKShaderFileCch( m_path, MAX_PATH, wfileNameWithExtension );

	delete [] wfileNameWithExtension;

	ADLASSERT( hr == S_OK );
}
// Remembers the device and the in-memory HLSL source; compilation itself
// happens in createKernel(). `option` is not used here.
template<>
void KernelBuilder<TYPE_DX11>::setFromSrc( const Device* deviceData, const char* src, const char* option )
{
	// The character '0' in m_path[0] is the marker createKernel() tests for
	// to choose compile-from-memory over compile-from-file.
	m_path[0] = '0';
	m_ptr = (void*)src;
	m_deviceData = deviceData;
}
// Nothing to release: this builder only stores a path or a source pointer.
template<>
KernelBuilder<TYPE_DX11>::~KernelBuilder() {}
// Compiles the entry point funcName (from the source string or the file
// recorded by setFromSrc / setFromFile) and creates the compute shader in
// kernelOut.
// On a compile or creation failure the compiler message (if any) is
// printed, ADLASSERT fires, and kernelOut's shader is left uncreated;
// kernelOut.m_type is set to TYPE_DX11 in every case.
template<>
void KernelBuilder<TYPE_DX11>::createKernel( const char* funcName, Kernel& kernelOut )
{
	const DeviceDX11* deviceData = (const DeviceDX11*)m_deviceData;

	KernelDX11* dxKernel = (KernelDX11*)&kernelOut;
	HRESULT hr;

	DWORD dwShaderFlags = D3DCOMPILE_ENABLE_STRICTNESS;
#if defined( DEBUG ) || defined( _DEBUG )
	// Set the D3DCOMPILE_DEBUG flag to embed debug information in the shaders.
	// Setting this flag improves the shader debugging experience, but still allows
	// the shaders to be optimized and to run exactly the way they will run in
	// the release configuration of this program.
	dwShaderFlags |= D3DCOMPILE_DEBUG;
#endif

	const D3D_SHADER_MACRO defines[] =
	{
#ifdef USE_STRUCTURED_BUFFERS
		"USE_STRUCTURED_BUFFERS", "1",
#endif

#ifdef TEST_DOUBLE
		"TEST_DOUBLE", "1",
#endif
		NULL, NULL
	};

	// We generally prefer to use the higher CS shader profile when possible as CS 5.0 is better performance on 11-class hardware
	LPCSTR pProfile = ( deviceData->m_device->GetFeatureLevel() >= D3D_FEATURE_LEVEL_11_0 ) ? "cs_5_0" : "cs_4_0";

	ID3DBlob* pErrorBlob = NULL;
	ID3DBlob* pBlob = NULL;
	// m_path[0] == '0' is the marker written by setFromSrc(): m_ptr then
	// points at the HLSL source text.
	if( m_path[0] == '0' )
	{
		char* src = (char*)m_ptr;
		hr = D3DX11CompileFromMemory( src, strlen(src), 0, defines, NULL, funcName, pProfile,
			dwShaderFlags, NULL, NULL, &pBlob, &pErrorBlob, NULL );
	}
	else
	{
		hr = D3DX11CompileFromFile( m_path, defines, NULL, funcName, pProfile,
			dwShaderFlags, NULL, NULL, &pBlob, &pErrorBlob, NULL );
	}

	// The error blob is optional (e.g. file not found produces none), so it
	// must be checked before it is dereferenced.
	if ( FAILED(hr) && pErrorBlob )
	{
		debugPrintf("%s", (char*)pErrorBlob->GetBufferPointer());
	}
	ADLASSERT( hr == S_OK );

	// Only create the shader when there is compiled byte code to create it from.
	if( SUCCEEDED(hr) && pBlob )
	{
		hr = deviceData->m_device->CreateComputeShader( pBlob->GetBufferPointer(), pBlob->GetBufferSize(), NULL,
			dxKernel->getKernelPtr() );
		ADLASSERT( hr == S_OK );

#if defined(DEBUG) || defined(PROFILE)
		// Tag the shader with its entry-point name for graphics debuggers.
		if ( SUCCEEDED(hr) && dxKernel->getKernel() )
			dxKernel->getKernel()->SetPrivateData( WKPDID_D3DDebugObjectName, lstrlenA(funcName), funcName );
#endif
	}

	SAFE_RELEASE( pErrorBlob );
	SAFE_RELEASE( pBlob );

	kernelOut.m_type = TYPE_DX11;
}
// Releases the compute shader held by `kernel`, if any, and clears the slot.
template<>
void KernelBuilder<TYPE_DX11>::deleteKernel( Kernel& kernel )
{
	if( !kernel.m_kernel )
		return;

	KernelDX11* wrapped = (KernelDX11*)&kernel;
	wrapped->getKernel()->Release();
	kernel.m_kernel = NULL;
}
// Static helpers that bind resources to, and dispatch, the D3D11 compute
// shader referenced by a Launcher.
class LauncherDX11
{
	public:
		typedef Launcher::BufferInfo BufferInfo;

		// Binds n buffers as shader resources or unordered-access views.
		__inline static void setBuffers( Launcher* launcher, BufferInfo* buffInfo, int n );

		// Uploads `consts` into constBuff and binds it as a constant buffer.
		template<typename T>
		__inline static void setConst( Launcher* launcher, Buffer<T>& constBuff, const T& consts );

		// Dispatches the shader over a 2D grid of thread groups.
		__inline static void launch2D( Launcher* launcher, int numThreadsX, int numThreadsY, int localSizeX, int localSizeY );
};
// Binds each of the n buffers to the compute stage: read-only buffers go to
// the next SRV slot (launcher->m_idx), all others to the next UAV slot
// (launcher->m_idxRw).
void LauncherDX11::setBuffers( Launcher* launcher, BufferInfo* buffInfo, int n )
{
	const DeviceDX11* dddx = (const DeviceDX11*)launcher->m_deviceData;

	for(int k=0; k<n; ++k)
	{
		// Only the view pointers are used, so the element type is irrelevant.
		BufferDX11<int>* dBuf = (BufferDX11<int>*)buffInfo[k].m_buffer;

		if( !buffInfo[k].m_isReadOnly )
		{
			// todo. cannot initialize append buffer with proper counter value which is the last arg
			dddx->m_context->CSSetUnorderedAccessViews( launcher->m_idxRw++, 1, dBuf->getUAVPtr(), 0 );
			continue;
		}

		dddx->m_context->CSSetShaderResources( launcher->m_idx++, 1, dBuf->getSRVPtr() );
	}
}
// Copies `consts` into the constant buffer constBuff with UpdateSubresource
// and binds that buffer to compute-stage constant-buffer slot 0.
template<typename T>
void LauncherDX11::setConst( Launcher* launcher, Buffer<T>& constBuff, const T& consts )
{
	const DeviceDX11* dddx = (const DeviceDX11*)launcher->m_deviceData;
	BufferDX11<T>* cb = (BufferDX11<T>*)&constBuff;

	// A Map( D3D11_MAP_WRITE_DISCARD ) / memcpy / Unmap upload is the
	// alternative; it is not used here.
	dddx->m_context->UpdateSubresource( cb->getBuffer(), 0, NULL, &consts, 0, 0 );
	dddx->m_context->CSSetConstantBuffers( 0, 1, cb->getBufferPtr() );
}
// Dispatches the launcher's compute shader with enough thread groups to
// cover numThreadsX x numThreadsY threads (at least one group per axis),
// then unbinds the shader and the UAV / SRV slots that were used.
void LauncherDX11::launch2D( Launcher* launcher, int numThreadsX, int numThreadsY, int localSizeX, int localSizeY )
{
	KernelDX11* dxKernel = (KernelDX11*)launcher->m_kernel;
	const DeviceDX11* dddx = (const DeviceDX11*)launcher->m_deviceData;

	dddx->m_context->CSSetShader( dxKernel->getKernel(), NULL, 0 );

	// ceil( threads / groupSize ), clamped to >= 1.
	int groupsX = numThreadsX/localSizeX;
	if( numThreadsX%localSizeX ) groupsX += 1;
	if( groupsX < 1 ) groupsX = 1;

	int groupsY = numThreadsY/localSizeY;
	if( numThreadsY%localSizeY ) groupsY += 1;
	if( groupsY < 1 ) groupsY = 1;

	dddx->m_context->Dispatch( groupsX, groupsY, 1 );

	// set 0 to registers
	dddx->m_context->CSSetShader( NULL, NULL, 0 );

	if( launcher->m_idxRw )
	{
		ID3D11UnorderedAccessView* nullUAVs[ 16 ] = { 0 };
		unsigned int nUAVs = (unsigned int)launcher->m_idxRw;
		if( nUAVs > 16u ) nUAVs = 16u;	// never clear more slots than the array holds
		dddx->m_context->CSSetUnorderedAccessViews( 0, nUAVs, nullUAVs, NULL );
	}

	if( launcher->m_idx )
	{
		ID3D11ShaderResourceView* nullSRVs[ 16 ] = { 0 };
		unsigned int nSRVs = (unsigned int)launcher->m_idx;
		if( nSRVs > 16u ) nSRVs = 16u;
		dddx->m_context->CSSetShaderResources( 0, nSRVs, nullSRVs );
	}
}
#undef SAFE_RELEASE
};

View File

@@ -0,0 +1,131 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
namespace adl
{
struct StopwatchDX11 : public StopwatchBase
{
public:
__inline
StopwatchDX11() : StopwatchBase(){}
__inline
~StopwatchDX11();
__inline
void init( const Device* deviceData );
__inline
void start();
__inline
void split();
__inline
void stop();
__inline
float getMs(int index=0);
__inline
void getMs( float* times, int capacity );
public:
ID3D11Query* m_tQuery[CAPACITY+1];
ID3D11Query* m_fQuery;
UINT64 m_t[CAPACITY];
};
void StopwatchDX11::init( const Device* deviceData )
{
ADLASSERT( deviceData->m_type == TYPE_DX11 );
m_device = deviceData;
{
D3D11_QUERY_DESC qDesc;
qDesc.Query = D3D11_QUERY_TIMESTAMP_DISJOINT;
qDesc.MiscFlags = 0;
((const DeviceDX11*)m_device)->m_device->CreateQuery( &qDesc, &m_fQuery );
}
for(int i=0; i<CAPACITY+1; i++)
{
D3D11_QUERY_DESC qDesc;
qDesc.Query = D3D11_QUERY_TIMESTAMP;
qDesc.MiscFlags = 0;
((const DeviceDX11*)m_device)->m_device->CreateQuery( &qDesc, &m_tQuery[i] );
}
}
// Releases the disjoint query and every timestamp query.
StopwatchDX11::~StopwatchDX11()
{
	m_fQuery->Release();

	int slot = 0;
	while( slot < CAPACITY+1 )
	{
		m_tQuery[slot]->Release();
		++slot;
	}
}
void StopwatchDX11::start()
{
m_idx = 0;
((const DeviceDX11*)m_device)->m_context->Begin( m_fQuery );
((const DeviceDX11*)m_device)->m_context->End( m_tQuery[m_idx++] );
}
void StopwatchDX11::split()
{
if( m_idx < CAPACITY )
((const DeviceDX11*)m_device)->m_context->End( m_tQuery[m_idx++] );
}
void StopwatchDX11::stop()
{
((const DeviceDX11*)m_device)->m_context->End( m_tQuery[m_idx++] );
((const DeviceDX11*)m_device)->m_context->End( m_fQuery );
}
float StopwatchDX11::getMs(int index)
{
D3D11_QUERY_DATA_TIMESTAMP_DISJOINT d;
// m_deviceData->m_context->End( m_fQuery );
while( ((const DeviceDX11*)m_device)->m_context->GetData( m_fQuery, &d,sizeof(D3D11_QUERY_DATA_TIMESTAMP_DISJOINT),0 ) == S_FALSE ) {}
while( ((const DeviceDX11*)m_device)->m_context->GetData( m_tQuery[0], &m_t[index],sizeof(UINT64),0 ) == S_FALSE ){}
while( ((const DeviceDX11*)m_device)->m_context->GetData( m_tQuery[1], &m_t[index+1],sizeof(UINT64),0 ) == S_FALSE ){}
ADLASSERT( d.Disjoint == false );
float elapsedMs = (m_t[index+1] - m_t[index])/(float)d.Frequency*1000;
return elapsedMs;
}
void StopwatchDX11::getMs( float* times, int capacity )
{
ADLASSERT( capacity <= CAPACITY );
D3D11_QUERY_DATA_TIMESTAMP_DISJOINT d;
while( ((const DeviceDX11*)m_device)->m_context->GetData( m_fQuery, &d,sizeof(D3D11_QUERY_DATA_TIMESTAMP_DISJOINT),0 ) == S_FALSE ) {}
for(int i=0; i<m_idx; i++)
{
while( ((const DeviceDX11*)m_device)->m_context->GetData( m_tQuery[i], &m_t[i],sizeof(UINT64),0 ) == S_FALSE ){}
}
ADLASSERT( d.Disjoint == false );
for(int i=0; i<capacity; i++)
{
times[i] = (m_t[i+1] - m_t[i])/(float)d.Frequency*1000;
}
}
};

View File

@@ -0,0 +1,107 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
namespace adl
{
// Device implementation that keeps buffers in ordinary host memory.
// All operations run synchronously on the calling thread.
struct DeviceHost : public Device
{
// Tags this device as TYPE_HOST in the base class.
DeviceHost() : Device( TYPE_HOST ){}
// Nothing to set up for the host device; cfg is ignored.
__inline
void initialize(const Config& cfg);
// Nothing to tear down for the host device.
__inline
void release();
// Binds buf to this device and, unless type is BUFFER_CONST, allocates
// nElems elements of T with new[].
template<typename T>
__inline
void allocate(Buffer<T>* buf, int nElems, BufferBase::BufferType type);
// Frees the storage owned by buf (delete[]).
template<typename T>
__inline
void deallocate(Buffer<T>* buf);
// Copies nElems elements from the start of src to the start of dst.
template<typename T>
__inline
void copy(Buffer<T>* dst, const Buffer<T>* src, int nElems);
// Copies nElems elements out of src, starting at element offsetNElems,
// into the raw array dst.
template<typename T>
__inline
void copy(T* dst, const Buffer<T>* src, int nElems, int offsetNElems = 0);
// Copies nElems elements from the raw array src into dst, starting at
// element offsetNElems of dst.
template<typename T>
__inline
void copy(Buffer<T>* dst, const T* src, int nElems, int offsetNElems = 0);
// No-op: there is no pending work to wait for on the host device.
__inline
void waitForCompletion() const;
};
// Host device needs no initialization; cfg is intentionally unused.
void DeviceHost::initialize(const Config& cfg)
{
}
// Host device holds no resources of its own, so there is nothing to release.
void DeviceHost::release()
{
}
// Attaches buf to this device and gives it host storage for nElems elements.
// Constant buffers (BUFFER_CONST) get no storage: only the device link is set
// and m_ptr / m_size are left as they were.
template<typename T>
void DeviceHost::allocate(Buffer<T>* buf, int nElems, BufferBase::BufferType type)
{
	buf->m_device = this;

	if( type != BufferBase::BUFFER_CONST )
	{
		T* storage = new T[nElems];
		buf->m_ptr = storage;
		ADLASSERT( buf->m_ptr );
		buf->m_size = nElems;
	}
}
// Releases the host storage owned by buf and leaves the buffer empty.
//
// delete[] on a null pointer is a no-op, so no explicit null check is needed.
// The pointer and size are cleared afterwards so that a second deallocate()
// on the same buffer (or any later access) cannot touch freed memory.
template<typename T>
void DeviceHost::deallocate(Buffer<T>* buf)
{
	delete [] buf->m_ptr;
	buf->m_ptr = 0;
	buf->m_size = 0;
}
// Buffer-to-buffer copy of the first nElems elements.
// Forwards to the (Buffer<T>*, const T*) overload using src's raw storage.
template<typename T>
void DeviceHost::copy(Buffer<T>* dst, const Buffer<T>* src, int nElems)
{
	const T* srcElems = src->m_ptr;
	copy( dst, srcElems, nElems );
}
// Reads nElems elements from a host buffer into the raw array dst.
// srcOffsetNElems is the index of the first element of src to read.
template<typename T>
void DeviceHost::copy(T* dst, const Buffer<T>* src, int nElems, int srcOffsetNElems)
{
	ADLASSERT( src->getType() == TYPE_HOST );

	const T* firstElem = src->m_ptr + srcOffsetNElems;
	memcpy( dst, firstElem, nElems*sizeof(T) );
}
// Writes nElems elements from the raw array src into a host buffer.
// dstOffsetNElems is the index of the first element of dst to overwrite.
template<typename T>
void DeviceHost::copy(Buffer<T>* dst, const T* src, int nElems, int dstOffsetNElems)
{
	ADLASSERT( dst->getType() == TYPE_HOST );

	T* firstElem = dst->m_ptr + dstOffsetNElems;
	memcpy( firstElem, src, nElems*sizeof(T) );
}
// No-op: this device queues no work, so there is nothing to wait for.
void DeviceHost::waitForCompletion() const
{
}
};

View File

@@ -0,0 +1,119 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#ifdef _WIN32
#include <windows.h>
#else
#include <sys/time.h>
#endif
namespace adl
{
// CPU-side stopwatch. Records up to CAPACITY timestamps using
// QueryPerformanceCounter on Windows and gettimeofday elsewhere, and reports
// the time between consecutive timestamps in milliseconds.
class StopwatchHost : public StopwatchBase
{
public:
__inline
StopwatchHost();
// Stores the device pointer and captures platform timer state.
__inline
void init( const Device* deviceData );
// Resets the timestamp index and records the first timestamp.
__inline
void start();
// Records the next timestamp.
__inline
void split();
// Records a final timestamp (same as split()).
__inline
void stop();
// Milliseconds between timestamp `index` and timestamp `index+1`.
__inline
float getMs(int index=0);
// Fills times[0..capacity-1] with the recorded intervals; unused slots are 0.
__inline
void getMs( float* times, int capacity );
private:
#ifdef _WIN32
// Performance-counter ticks per second, queried in init().
LARGE_INTEGER m_frequency;
// Recorded performance-counter values.
LARGE_INTEGER m_t[CAPACITY];
#else
// Wall-clock time captured in init().
struct timeval mStartTime;
// Recorded wall-clock timestamps.
timeval m_t[CAPACITY];
#endif
};
// Default-constructs the base; timer state is set up later by init().
__inline
StopwatchHost::StopwatchHost()
: StopwatchBase()
{
}
// Prepares the stopwatch for use: captures the platform timer state
// (counter frequency on Windows, current wall-clock time elsewhere) and
// remembers the device this stopwatch was created for.
__inline
void StopwatchHost::init( const Device* deviceData )
{
#ifdef _WIN32
	QueryPerformanceFrequency( &m_frequency );
#else
	gettimeofday(&mStartTime, 0);
#endif
	m_device = deviceData;
}
// Begins a new measurement: discards earlier timestamps and stores the
// current time in slot 0, leaving m_idx at 1 for the next split().
__inline
void StopwatchHost::start()
{
#ifdef _WIN32
	QueryPerformanceCounter(&m_t[0]);
#else
	gettimeofday(&m_t[0], 0);
#endif
	m_idx = 1;
}
__inline
void StopwatchHost::split()
{
#ifdef _WIN32
QueryPerformanceCounter(&m_t[m_idx++]);
#else
gettimeofday(&m_t[m_idx++], 0);
#endif
}
// Ends the current interval by recording one more timestamp via split().
__inline
void StopwatchHost::stop()
{
split();
}
// Returns the time, in milliseconds, between timestamp `index` and
// timestamp `index+1`.
//
// @param index Interval to report; both timestamps must have been recorded.
// @return Elapsed milliseconds, including the fractional part.
__inline
float StopwatchHost::getMs(int index)
{
#ifdef _WIN32
	return (float)(1000*(m_t[index+1].QuadPart - m_t[index].QuadPart))/m_frequency.QuadPart;
#else
	// Do the arithmetic in floating point. Integer division of the
	// microsecond delta dropped sub-millisecond precision and, because it
	// truncates toward zero, mis-rounded whenever tv_usec wrapped around
	// (negative delta).
	const float secPartMs  = (float)(m_t[index+1].tv_sec  - m_t[index].tv_sec ) * 1000.f;
	const float usecPartMs = (float)(m_t[index+1].tv_usec - m_t[index].tv_usec) / 1000.f;
	return secPartMs + usecPartMs;
#endif
}
// Writes the recorded intervals, in milliseconds, into times[0..capacity-1].
// Slots beyond the number of recorded intervals are set to 0.
//
// @param times    Output array with room for at least `capacity` floats.
// @param capacity Number of entries of `times` to fill.
__inline
void StopwatchHost::getMs(float* times, int capacity)
{
	for(int i=0; i<capacity; i++) times[i] = 0.f;

	// m_idx timestamps delimit m_idx-1 intervals. The bound is computed by
	// hand: an unqualified min() only exists as a <windows.h> macro, so it is
	// unavailable on POSIX builds and when NOMINMAX is defined.
	const int nIntervals = ( capacity < m_idx-1 ) ? capacity : m_idx-1;
	for(int i=0; i<nIntervals; i++)
	{
		times[i] = getMs(i);
	}
}
};

View File

@@ -0,0 +1,73 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#pragma once
#include <Adl/Adl.h>
#include <AdlPrimitives/Math/Math.h>
namespace adl
{
// Device-independent part of the Copy primitive: the launch options shared
// by every Copy<TYPE> instantiation.
class CopyBase
{
public:
// Selects which float4 copy kernel Copy::execute launches. The launch size
// is n/1, n/2 or n/4 respectively, so presumably each work-item copies
// 1, 2 or 4 float4 elements - confirm against the kernel source.
enum Option
{
PER_WI_1,
PER_WI_2,
PER_WI_4,
};
};
// Buffer-to-buffer copy primitive for the device type TYPE.
// Usage: allocate() once per device, execute() as often as needed with the
// returned Data, then deallocate().
template<DeviceType TYPE>
class Copy : public CopyBase
{
public:
typedef Launcher::BufferInfo BufferInfo;
// Per-device state created by allocate(): the device, one kernel handle per
// copy variant, and the constant buffer used to pass the element count.
struct Data
{
const Device* m_device;
Kernel* m_copy1F4Kernel;
Kernel* m_copy2F4Kernel;
Kernel* m_copy4F4Kernel;
Kernel* m_copyF1Kernel;
Kernel* m_copyF2Kernel;
Buffer<int4>* m_constBuffer;
};
// Looks up the copy kernels on deviceData and returns a new Data that the
// caller must pass to deallocate().
static
Data* allocate(const Device* deviceData);
// Frees a Data returned by allocate().
static
void deallocate(Data* data);
// Copies n float4 elements from src to dst with the kernel chosen by option.
static
void execute( Data* data, Buffer<float4>& dst, Buffer<float4>& src, int n, Option option = PER_WI_1);
// Copies n float2 elements from src to dst.
static
void execute( Data* data, Buffer<float2>& dst, Buffer<float2>& src, int n);
// Copies n float elements from src to dst.
static
void execute( Data* data, Buffer<float>& dst, Buffer<float>& src, int n);
};
#include <AdlPrimitives/Copy/CopyHost.inl>
#include <AdlPrimitives/Copy/Copy.inl>
};

View File

@@ -0,0 +1,151 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
// Kernel file location (without extension) and the entry-point names passed
// to Device::getKernel below. All of these macros are #undef'd at the end of
// this file so they do not leak into other primitives.
#define PATH "..\\..\\opencl\\primitives\\AdlPrimitives\\Copy\\CopyKernels"
#define KERNEL0 "Copy1F4Kernel"
#define KERNEL1 "Copy2F4Kernel"
#define KERNEL2 "Copy4F4Kernel"
#define KERNEL3 "CopyF1Kernel"
#define KERNEL4 "CopyF2Kernel"
#include <AdlPrimitives/Copy/CopyKernelsCL.h>
#include <AdlPrimitives/Copy/CopyKernelsDX11.h>
// Creates the per-device state for the Copy primitive.
//
// Fetches the five copy kernels from `device` and creates the one-element
// constant buffer used to pass the element count. When
// ADL_LOAD_KERNEL_FROM_STRING is defined the kernels are built from the
// embedded source strings; otherwise a null source is passed to getKernel
// along with PATH.
//
// The returned Data is heap-allocated; release it with deallocate().
template<DeviceType TYPE>
typename Copy<TYPE>::Data* Copy<TYPE>::allocate( const Device* device )
{
	ADLASSERT( TYPE == device->m_type );

	// Embedded kernel sources, indexed by device type.
#if defined(ADL_LOAD_KERNEL_FROM_STRING)
	const char* embeddedSrc[] = {copyKernelsCL, copyKernelsDX11};
#else
	const char* embeddedSrc[] = {0,0};
#endif
	const char* kernelSrc = embeddedSrc[TYPE];

	Data* state = new Data;
	state->m_device = device;
	state->m_copy1F4Kernel = device->getKernel( PATH, KERNEL0, 0, kernelSrc );
	state->m_copy2F4Kernel = device->getKernel( PATH, KERNEL1, 0, kernelSrc );
	state->m_copy4F4Kernel = device->getKernel( PATH, KERNEL2, 0, kernelSrc );
	state->m_copyF1Kernel = device->getKernel( PATH, KERNEL3, 0, kernelSrc );
	state->m_copyF2Kernel = device->getKernel( PATH, KERNEL4, 0, kernelSrc );
	state->m_constBuffer = new Buffer<int4>( device, 1, BufferBase::BUFFER_CONST );

	return state;
}
// Frees a Data object created by allocate(), including its constant buffer.
// The kernel pointers are not deleted here.
//
// Passing a null pointer is allowed and does nothing, mirroring `delete`.
template<DeviceType TYPE>
void Copy<TYPE>::deallocate( Data* data )
{
	if( !data ) return;

	delete data->m_constBuffer;
	delete data;
}
// Copies n float4 elements from src to dst on the device.
//
// `option` picks the kernel variant and the launch size: PER_WI_1 launches n
// work-items, PER_WI_2 launches n/2 (n must be even) and PER_WI_4 launches
// n/4 (n must be a multiple of 4). An unknown option asserts and launches
// nothing.
template<DeviceType TYPE>
void Copy<TYPE>::execute( Data* data, Buffer<float4>& dst, Buffer<float4>& src, int n, Option option )
{
	ADLASSERT( TYPE == dst.getType() );
	ADLASSERT( TYPE == src.getType() );

	// Choose the kernel and the launch-size divisor for the requested variant.
	Kernel* kernel = 0;
	int launchDivisor = 1;
	switch (option)
	{
	case PER_WI_1:
		kernel = data->m_copy1F4Kernel;
		launchDivisor = 1;
		break;
	case PER_WI_2:
		ADLASSERT( n%2 == 0 );
		kernel = data->m_copy2F4Kernel;
		launchDivisor = 2;
		break;
	case PER_WI_4:
		ADLASSERT( n%4 == 0 );
		kernel = data->m_copy4F4Kernel;
		launchDivisor = 4;
		break;
	default:
		ADLASSERT(0);
		return;
	};

	// Only .x is consumed here: it carries the element count.
	int4 constBuffer;
	constBuffer.x = n;

	// dst is written, src is bound read-only.
	BufferInfo bInfo[] = { BufferInfo( &dst ), BufferInfo( &src, true ) };
	Launcher launcher( data->m_device, kernel );
	launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
	launcher.setConst( *data->m_constBuffer, constBuffer );
	launcher.launch1D( n/launchDivisor );
}
// Copies n float2 elements from src to dst on the device, launching n
// work-items of the float2 copy kernel.
template<DeviceType TYPE>
void Copy<TYPE>::execute( Data* data, Buffer<float2>& dst, Buffer<float2>& src, int n )
{
	ADLASSERT( TYPE == dst.getType() );
	ADLASSERT( TYPE == src.getType() );

	// Only .x is consumed here: it carries the element count.
	int4 params;
	params.x = n;

	// dst is written, src is bound read-only.
	BufferInfo buffers[] = { BufferInfo( &dst ), BufferInfo( &src, true ) };
	const int numBuffers = sizeof(buffers)/sizeof(Launcher::BufferInfo);

	Launcher launcher( data->m_device, data->m_copyF2Kernel );
	launcher.setBuffers( buffers, numBuffers );
	launcher.setConst( *data->m_constBuffer, params );
	launcher.launch1D( n );
}
// Copies n float elements from src to dst on the device, launching n
// work-items of the scalar float copy kernel.
template<DeviceType TYPE>
void Copy<TYPE>::execute( Data* data, Buffer<float>& dst, Buffer<float>& src, int n )
{
	ADLASSERT( TYPE == dst.getType() );
	ADLASSERT( TYPE == src.getType() );

	// Only .x is consumed here: it carries the element count.
	int4 params;
	params.x = n;

	// dst is written, src is bound read-only.
	BufferInfo buffers[] = { BufferInfo( &dst ), BufferInfo( &src, true ) };
	const int numBuffers = sizeof(buffers)/sizeof(Launcher::BufferInfo);

	Launcher launcher( data->m_device, data->m_copyF1Kernel );
	launcher.setBuffers( buffers, numBuffers );
	launcher.setConst( *data->m_constBuffer, params );
	launcher.launch1D( n );
}
#undef PATH
#undef KERNEL0
#undef KERNEL1
#undef KERNEL2
#undef KERNEL3
#undef KERNEL4

Some files were not shown because too many files have changed in this diff Show More