Files
bullet3/Demos/ParticlesOpenCL/ParticlesOCL.cl
erwin.coumans 5b34356c43 Load the ParticlesOCL.cl from memory, using the MSTRINGIFY macro, instead of loading it from disk.
Small cleanup of cmake build for NVIDIA and AMD OpenCL

For AMD Stream SDK use:
INCLUDE_DIRECTORIES( ${AMD_OPENCL_INCLUDES} )
LINK_LIBRARIES( ${CMAKE_ATISTREAMSDK_LIBPATH}/OpenCL.lib )

For NVIDIA CUDA SDK:
INCLUDE_DIRECTORIES( ${NVIDIA_OPENCL_INCLUDES} )
LINK_LIBRARIES( ${NVIDIA_OPENCL_LIBRARIES})
2010-09-09 23:07:23 +00:00

468 lines
15 KiB
Common Lisp

MSTRINGIFY(
/*
Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
Copyright (C) 2006 - 2009 Sony Computer Entertainment Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
int4 getGridPos(float4 worldPos, __global float4* pParams)
{
int4 gridPos;
gridPos.x = (int)floor((worldPos.x - pParams[1].x) / pParams[3].x);
gridPos.y = (int)floor((worldPos.y - pParams[1].y) / pParams[3].y);
gridPos.z = (int)floor((worldPos.z - pParams[1].z) / pParams[3].z);
return gridPos;
}
unsigned int getPosHash(int4 gridPos, __global float4* pParams)
{
int4 gridDim = *((__global int4*)(pParams + 4));
if(gridPos.x < 0) gridPos.x = 0;
if(gridPos.x >= gridDim.x) gridPos.x = gridDim.x - 1;
if(gridPos.y < 0) gridPos.y = 0;
if(gridPos.y >= gridDim.y) gridPos.y = gridDim.y - 1;
if(gridPos.z < 0) gridPos.z = 0;
if(gridPos.z >= gridDim.z) gridPos.z = gridDim.z - 1;
unsigned int hash = gridPos.z * gridDim.y * gridDim.x + gridPos.y * gridDim.x + gridPos.x;
return hash;
}
__kernel void kComputeCellId( int numParticles,
__global float4* pPos,
__global int2* pPosHash,
__global float4* pParams GUID_ARG)
{
int index = get_global_id(0);
if(index >= numParticles)
{
return;
}
float4 pos = pPos[index];
int4 gridPos = getGridPos(pos, pParams);
unsigned int hash = getPosHash(gridPos, pParams);
pPosHash[index].x = hash;
pPosHash[index].y = index;
}
__kernel void kClearCellStart( int numCells,
__global int* pCellStart GUID_ARG)
{
int index = get_global_id(0);
if(index >= numCells)
{
return;
}
pCellStart[index] = -1;
}
__kernel void kFindCellStart( int numParticles,
__global int2* pHash,
__global int* cellStart,
__global float4* pPos,
__global float4* pVel,
__global float4* pSortedPos,
__global float4* pSortedVel GUID_ARG)
{
int index = get_global_id(0);
__local int sharedHash[1025];//maximum workgroup size 1024
int2 sortedData;
if(index < numParticles)
{
sortedData = pHash[index];
// Load hash data into shared memory so that we can look
// at neighboring body's hash value without loading
// two hash values per thread
sharedHash[get_local_id(0) + 1] = sortedData.x;
if((index > 0) && (get_local_id(0) == 0))
{
// first thread in block must load neighbor body hash
sharedHash[0] = pHash[index-1].x;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if(index < numParticles)
{
if((index == 0) || (sortedData.x != sharedHash[get_local_id(0)]))
{
cellStart[sortedData.x] = index;
}
int unsortedIndex = sortedData.y;
float4 pos = pPos[unsortedIndex];
float4 vel = pVel[unsortedIndex];
pSortedPos[index] = pos;
pSortedVel[index] = vel;
}
}
__kernel void kIntegrateMotion( int numParticles,
__global float4* pPos,
__global float4* pVel,
__global float4* pParams,
float timeStep GUID_ARG)
{
int index = get_global_id(0);
if(index >= numParticles)
{
return;
}
float4 pos = pPos[index];
float4 vel = pVel[index];
pos.w = 1.0f;
vel.w = 0.0f;
// apply gravity
float4 gravity = *((__global float4*)(pParams + 0));
float particleRad = pParams[5].x;
float globalDamping = pParams[5].y;
float boundaryDamping = pParams[5].z;
vel += gravity * timeStep;
vel *= globalDamping;
// integrate position
pos += vel * timeStep;
// collide with world boundaries
float4 worldMin = *((__global float4*)(pParams + 1));
float4 worldMax = *((__global float4*)(pParams + 2));
if(pos.x < (worldMin.x + 2*particleRad))
{
pos.x = worldMin.x + 2*particleRad;
vel.x *= boundaryDamping;
}
if(pos.x > (worldMax.x - 2*particleRad))
{
pos.x = worldMax.x - 2*particleRad;
vel.x *= boundaryDamping;
}
if(pos.y < (worldMin.y + 2*particleRad))
{
pos.y = worldMin.y + 2*particleRad;
vel.y *= boundaryDamping;
}
if(pos.y > (worldMax.y - 2*particleRad))
{
pos.y = worldMax.y - 2*particleRad;
vel.y *= boundaryDamping;
}
if(pos.z < (worldMin.z + 2*particleRad))
{
pos.z = worldMin.z + 2*particleRad;
vel.z *= boundaryDamping;
}
if(pos.z > (worldMax.z - 2*particleRad))
{
pos.z = worldMax.z - 2*particleRad;
vel.z *= boundaryDamping;
}
// write back position and velocity
pPos[index] = pos;
pVel[index] = vel;
}
float4 collideTwoParticles(
float4 posA,
float4 posB,
float4 velA,
float4 velB,
float radiusA,
float radiusB,
float spring,
float damping,
float shear,
float attraction
)
{
//Calculate relative position
float4 relPos = posB - posA; relPos.w = 0.f;
float dist = sqrt(relPos.x * relPos.x + relPos.y * relPos.y + relPos.z * relPos.z);
float collideDist = radiusA + radiusB;
float4 force = (float4)0.f;
if(dist < collideDist){
float4 norm = relPos * (1.f / dist); norm.w = 0.f;
//Relative velocity
float4 relVel = velB - velA; relVel.w = 0.f;
//Relative tangential velocity
float relVelDotNorm = relVel.x * norm.x + relVel.y * norm.y + relVel.z * norm.z;
float4 tanVel = relVel - norm * relVelDotNorm; tanVel.w = 0.f;
//Spring force (potential)
float springFactor = -spring * (collideDist - dist);
force = springFactor * norm + damping * relVel + shear * tanVel + attraction * relPos;
force.w = 0.f;
}
return force;
}
__kernel void kCollideParticles(int numParticles,
__global float4* pVel, //output: new velocity
__global const float4* pSortedPos, //input: reordered positions
__global const float4* pSortedVel, //input: reordered velocities
__global const int2 *pPosHash, //input: reordered particle indices
__global const int *pCellStart, //input: cell boundaries
__global float4* pParams GUID_ARG)
{
int index = get_global_id(0);
if(index >= numParticles)
{
return;
}
float4 posA = pSortedPos[index];
float4 velA = pSortedVel[index];
float4 force = (float4)0.f;
float particleRad = pParams[5].x;
float collisionDamping = pParams[5].w;
float spring = pParams[6].x;
float shear = pParams[6].y;
float attraction = pParams[6].z;
int unsortedIndex = pPosHash[index].y;
//Get address in grid
int4 gridPosA = getGridPos(posA, pParams);
//Accumulate surrounding cells
int4 gridPosB;
for(int z = -1; z <= 1; z++)
{
gridPosB.z = gridPosA.z + z;
for(int y = -1; y <= 1; y++)
{
gridPosB.y = gridPosA.y + y;
for(int x = -1; x <= 1; x++)
{
gridPosB.x = gridPosA.x + x;
//Get start particle index for this cell
uint hashB = getPosHash(gridPosB, pParams);
int startI = pCellStart[hashB];
//Skip empty cell
if(startI < 0)
{
continue;
}
//Iterate over particles in this cell
int endI = startI + 32;
if(endI >= numParticles)
endI = numParticles ;
for(int j = startI; j < endI; j++)
{
uint hashC = pPosHash[j].x;
if(hashC != hashB)
{
break;
}
if(j == index)
{
continue;
}
float4 posB = pSortedPos[j];
float4 velB = pSortedVel[j];
//Collide two spheres
force += collideTwoParticles( posA, posB, velA, velB, particleRad, particleRad,
spring, collisionDamping, shear, attraction);
}
}
}
}
//Write new velocity back to original unsorted location
pVel[unsortedIndex] = velA + force;
}
/*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
*
* NVIDIA Corporation and its licensors retain all intellectual property and
* proprietary rights in and to this software and related documentation.
* Any use, reproduction, disclosure, or distribution of this software
* and related documentation without an express license agreement from
* NVIDIA Corporation is strictly prohibited.
*
* Please refer to the applicable NVIDIA end user license agreement (EULA)
* associated with this source code for terms and conditions that govern
* your use of this NVIDIA software.
*
*/
inline void ComparatorPrivate(int2* keyA, int2* keyB, uint dir)
{
if((keyA[0].x > keyB[0].x) == dir)
{
int2 tmp = *keyA;
*keyA = *keyB;
*keyB = tmp;
}
}
inline void ComparatorLocal(__local int2* keyA, __local int2* keyB, uint dir)
{
if((keyA[0].x > keyB[0].x) == dir)
{
int2 tmp = *keyA;
*keyA = *keyB;
*keyB = tmp;
}
}
////////////////////////////////////////////////////////////////////////////////
// Monolithic bitonic sort kernel for short arrays fitting into local memory
////////////////////////////////////////////////////////////////////////////////
__kernel void kBitonicSortCellIdLocal(__global int2* pKey, uint arrayLength, uint dir GUID_ARG)
{
__local int2 l_key[LOCAL_SIZE_MAX];
int localSizeLimit = get_local_size(0) * 2;
//Offset to the beginning of subbatch and load data
pKey += get_group_id(0) * localSizeLimit + get_local_id(0);
l_key[get_local_id(0) + 0] = pKey[ 0];
l_key[get_local_id(0) + (localSizeLimit / 2)] = pKey[(localSizeLimit / 2)];
for(uint size = 2; size < arrayLength; size <<= 1)
{
//Bitonic merge
uint ddd = dir ^ ( (get_local_id(0) & (size / 2)) != 0 );
for(uint stride = size / 2; stride > 0; stride >>= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], ddd);
}
}
//ddd == dir for the last bitonic merge step
{
for(uint stride = arrayLength / 2; stride > 0; stride >>= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], dir);
}
}
barrier(CLK_LOCAL_MEM_FENCE);
pKey[ 0] = l_key[get_local_id(0) + 0];
pKey[(localSizeLimit / 2)] = l_key[get_local_id(0) + (localSizeLimit / 2)];
}
////////////////////////////////////////////////////////////////////////////////
// Bitonic sort kernel for large arrays (not fitting into local memory)
////////////////////////////////////////////////////////////////////////////////
//Bottom-level bitonic sort
//Almost the same as bitonicSortLocal with the only exception
//of even / odd subarrays (of LOCAL_SIZE_LIMIT points) being
//sorted in opposite directions
__kernel void kBitonicSortCellIdLocal1(__global int2* pKey GUID_ARG)
{
__local int2 l_key[LOCAL_SIZE_MAX];
uint localSizeLimit = get_local_size(0) * 2;
//Offset to the beginning of subarray and load data
pKey += get_group_id(0) * localSizeLimit + get_local_id(0);
l_key[get_local_id(0) + 0] = pKey[ 0];
l_key[get_local_id(0) + (localSizeLimit / 2)] = pKey[(localSizeLimit / 2)];
uint comparatorI = get_global_id(0) & ((localSizeLimit / 2) - 1);
for(uint size = 2; size < localSizeLimit; size <<= 1)
{
//Bitonic merge
uint ddd = (comparatorI & (size / 2)) != 0;
for(uint stride = size / 2; stride > 0; stride >>= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], ddd);
}
}
//Odd / even arrays of localSizeLimit elements
//sorted in opposite directions
{
uint ddd = (get_group_id(0) & 1);
for(uint stride = localSizeLimit / 2; stride > 0; stride >>= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], ddd);
}
}
barrier(CLK_LOCAL_MEM_FENCE);
pKey[ 0] = l_key[get_local_id(0) + 0];
pKey[(localSizeLimit / 2)] = l_key[get_local_id(0) + (localSizeLimit / 2)];
}
//Bitonic merge iteration for 'stride' >= LOCAL_SIZE_LIMIT
__kernel void kBitonicSortCellIdMergeGlobal(__global int2* pKey, uint arrayLength, uint size, uint stride, uint dir GUID_ARG)
{
uint global_comparatorI = get_global_id(0);
uint comparatorI = global_comparatorI & (arrayLength / 2 - 1);
//Bitonic merge
uint ddd = dir ^ ( (comparatorI & (size / 2)) != 0 );
uint pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1));
int2 keyA = pKey[pos + 0];
int2 keyB = pKey[pos + stride];
ComparatorPrivate(&keyA, &keyB, ddd);
pKey[pos + 0] = keyA;
pKey[pos + stride] = keyB;
}
//Combined bitonic merge steps for
//'size' > LOCAL_SIZE_LIMIT and 'stride' = [1 .. LOCAL_SIZE_LIMIT / 2]
__kernel void kBitonicSortCellIdMergeLocal(__global int2* pKey, uint arrayLength, uint stride, uint size, uint dir GUID_ARG)
{
__local int2 l_key[LOCAL_SIZE_MAX];
int localSizeLimit = get_local_size(0) * 2;
pKey += get_group_id(0) * localSizeLimit + get_local_id(0);
l_key[get_local_id(0) + 0] = pKey[ 0];
l_key[get_local_id(0) + (localSizeLimit / 2)] = pKey[(localSizeLimit / 2)];
//Bitonic merge
uint comparatorI = get_global_id(0) & ((arrayLength / 2) - 1);
uint ddd = dir ^ ( (comparatorI & (size / 2)) != 0 );
for(; stride > 0; stride >>= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], ddd);
}
barrier(CLK_LOCAL_MEM_FENCE);
pKey[ 0] = l_key[get_local_id(0) + 0];
pKey[(localSizeLimit / 2)] = l_key[get_local_id(0) + (localSizeLimit / 2)];
}
);