reorder files, in preparation for Bullet 3 -> Bullet 2 merge

This commit is contained in:
erwincoumans
2013-04-29 19:04:08 -07:00
parent 55b69201a9
commit 3ac332f3a7
162 changed files with 215 additions and 3070 deletions

View File

@@ -0,0 +1,565 @@
#include "b3GpuSapBroadphase.h"
#include <string.h>
#include "Bullet3Common/b3Vector3.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
#include "Bullet3Common/b3Quickprof.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "kernels/sapKernels.h"
#include "kernels/sapFastKernels.h"
#include "Bullet3Common/b3MinMax.h"
// Source-tree paths of the two broadphase OpenCL programs. Each one is passed as the
// last argument of b3OpenCLUtils::compileCLProgramFromString in the constructor, next
// to the embedded kernel string (sapCL / sapFastCL).
// NOTE(review): presumably b3OpenCLUtils uses the path to locate/cache the .cl file on disk - confirm.
#define B3_BROADPHASE_SAP_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl"
#define B3_BROADPHASE_SAPFAST_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFast.cl"
/// Creates the GPU sweep-and-prune broadphase for the given OpenCL context, device and queue.
/// Compiles the embedded sap / sapFast programs, creates every kernel used by
/// calculateOverlappingPairs() and allocates the radix sorter (freed in the destructor).
/// Every program/kernel creation is checked with b3Assert.
b3GpuSapBroadphase::b3GpuSapBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q )
:m_context(ctx),
m_device(device),
m_queue(q),
m_currentBuffer(-1),//listed in declaration order; negative means init3dSap() has not run yet
m_allAabbsGPU(ctx,q),
m_smallAabbsGPU(ctx,q),
m_largeAabbsGPU(ctx,q),
m_overlappingPairs(ctx,q),
m_gpuSmallSortData(ctx,q),
m_gpuSmallSortedAabbs(ctx,q)
{
	const char* sapSrc = sapCL;
	const char* sapFastSrc = sapFastCL;
	cl_int errNum=0;

	cl_program sapProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,sapSrc,&errNum,"",B3_BROADPHASE_SAP_PATH);
	b3Assert(errNum==CL_SUCCESS);
	cl_program sapFastProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,sapFastSrc,&errNum,"",B3_BROADPHASE_SAPFAST_PATH);
	b3Assert(errNum==CL_SUCCESS);

	//alternative single-array pair kernels, kept for experimentation
	//m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelOriginal",&errNum,sapProg );
	//m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelBarrier",&errNum,sapProg );
	//m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelLocalSharedMemory",&errNum,sapProg );

	//large-versus-small pair kernel
	m_sap2Kernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelTwoArrays",&errNum,sapProg );
	b3Assert(errNum==CL_SUCCESS);

	//small-versus-small pair kernel: batched-write version everywhere except Apple
#if 0
	m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelOriginal",&errNum,sapProg );
	b3Assert(errNum==CL_SUCCESS);
#else
#ifndef __APPLE__
	m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapFastSrc, "computePairsKernel",&errNum,sapFastProg );
	b3Assert(errNum==CL_SUCCESS);
#else
	m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelLocalSharedMemory",&errNum,sapProg );
	b3Assert(errNum==CL_SUCCESS);
#endif
#endif

	//helper kernels; checked like the kernels above so a failed creation is caught here
	//instead of surfacing later as a launch on an invalid kernel
	m_flipFloatKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "flipFloatKernel",&errNum,sapProg );
	b3Assert(errNum==CL_SUCCESS);
	m_copyAabbsKernel= b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "copyAabbsKernel",&errNum,sapProg );
	b3Assert(errNum==CL_SUCCESS);
	m_scatterKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "scatterKernel",&errNum,sapProg );
	b3Assert(errNum==CL_SUCCESS);

	m_sorter = new b3RadixSort32CL(m_context,m_device,m_queue);
}
/// Frees the radix sorter and releases the five kernels created by the constructor.
b3GpuSapBroadphase::~b3GpuSapBroadphase()
{
	delete m_sorter;

	//pair-finding kernels
	clReleaseKernel(m_sapKernel);
	clReleaseKernel(m_sap2Kernel);

	//helper kernels
	clReleaseKernel(m_flipFloatKernel);
	clReleaseKernel(m_copyAabbsKernel);
	clReleaseKernel(m_scatterKernel);
}
/// Conservative overlap test between two axis-aligned boxes given by their min/max corners.
/// Returns false only when the boxes are strictly separated along X, Z or Y;
/// boxes that merely touch are reported as overlapping.
static bool TestAabbAgainstAabb2(const b3Vector3 &aabbMin1, const b3Vector3 &aabbMax1,
								const b3Vector3 &aabbMin2, const b3Vector3 &aabbMax2)
{
	const bool separatedX = aabbMin1.getX() > aabbMax2.getX() || aabbMax1.getX() < aabbMin2.getX();
	const bool separatedZ = aabbMin1.getZ() > aabbMax2.getZ() || aabbMax1.getZ() < aabbMin2.getZ();
	const bool separatedY = aabbMin1.getY() > aabbMax2.getY() || aabbMax1.getY() < aabbMin2.getY();
	return !(separatedX || separatedZ || separatedY);
}
//http://stereopsis.com/radix.html
/// Converts a float into an unsigned radix-sort key: negative values have all bits
/// flipped, non-negative values only have the sign bit flipped, so comparing the keys
/// as unsigned integers orders them like the original floats (see link above).
/// Assumes float and unsigned int are both 32 bits wide.
static unsigned int FloatFlip(float fl)
{
	//copy the bit pattern with memcpy; dereferencing a casted pointer breaks strict aliasing
	unsigned int f = 0;
	memcpy(&f, &fl, sizeof(f));
	//sign bit set -> 0xffffffff, otherwise 0x80000000
	unsigned int mask = -(int)(f >> 31) | 0x80000000;
	return f ^ mask;
}
void b3GpuSapBroadphase::init3dSap()
{
if (m_currentBuffer<0)
{
m_allAabbsGPU.copyToHost(m_allAabbsCPU);
m_currentBuffer = 0;
for (int axis=0;axis<3;axis++)
{
for (int buf=0;buf<2;buf++)
{
int totalNumAabbs = m_allAabbsCPU.size();
m_sortedAxisCPU[axis][buf].resize(totalNumAabbs);
if (buf==m_currentBuffer)
{
for (int i=0;i<totalNumAabbs;i++)
{
m_sortedAxisCPU[axis][buf][i].m_key = FloatFlip(m_allAabbsCPU[i].m_minIndices[axis]);
m_sortedAxisCPU[axis][buf][i].m_value = i;
}
}
}
}
}
}
void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap()
{
b3Assert(m_currentBuffer>=0);
if (m_currentBuffer<0)
return;
m_allAabbsGPU.copyToHost(m_allAabbsCPU);
for (int axis=0;axis<3;axis++)
{
for (int buf=0;buf<2;buf++)
{
b3Assert(m_sortedAxisCPU[axis][buf].size() == m_allAabbsCPU.size());
}
}
m_currentBuffer = 1-m_currentBuffer;
for (int axis=0;axis<3;axis++)
{
int totalNumAabbs = m_allAabbsCPU.size();
for (int i=0;i<totalNumAabbs;i++)
{
m_sortedAxisCPU[axis][m_currentBuffer][i].m_key = FloatFlip(m_allAabbsCPU[i].m_minIndices[axis]);
m_sortedAxisCPU[axis][m_currentBuffer][i].m_value = i;
}
}
}
/// CPU reference implementation of the broadphase (brute force).
/// Downloads all AABBs from the GPU, refreshes the small and large CPU lists from them,
/// tests every small/small and every small/large combination and uploads the resulting
/// pairs to m_overlappingPairs (resized to 0 when there are none).
/// Each pair stores the m_minIndices[3] values of the two AABBs; for small/large pairs
/// x is the large AABB and y the small one.
void b3GpuSapBroadphase::calculateOverlappingPairsHost()
{
	//test
	//if (m_currentBuffer>=0)
	//	calculateOverlappingPairsHostIncremental3Sap();

	b3Assert(m_allAabbsCPU.size() == m_allAabbsGPU.size());
	m_allAabbsGPU.copyToHost(m_allAabbsCPU);

	const int numSmallAabbs = m_smallAabbsCPU.size();
	const int numLargeAabbs = m_largeAabbsCPU.size();

	//sync small aabbs: m_signedMaxIndices[3] is the position inside m_allAabbsCPU and
	//has to be restored after the whole AABB is overwritten
	for (int j=0;j<numSmallAabbs;j++)
	{
		int aabbIndex = m_smallAabbsCPU[j].m_signedMaxIndices[3];
		m_smallAabbsCPU[j] = m_allAabbsCPU[aabbIndex];
		m_smallAabbsCPU[j].m_signedMaxIndices[3] = aabbIndex;
	}

	//sync large aabbs the same way
	for (int j=0;j<numLargeAabbs;j++)
	{
		int aabbIndex = m_largeAabbsCPU[j].m_signedMaxIndices[3];
		m_largeAabbsCPU[j] = m_allAabbsCPU[aabbIndex];
		m_largeAabbsCPU[j].m_signedMaxIndices[3] = aabbIndex;
	}

	b3AlignedObjectArray<b3Int2> hostPairs;

	//small versus small, each unordered combination once
	for (int i=0;i<numSmallAabbs;i++)
	{
		for (int j=i+1;j<numSmallAabbs;j++)
		{
			if (TestAabbAgainstAabb2((b3Vector3&)m_smallAabbsCPU[i].m_min, (b3Vector3&)m_smallAabbsCPU[i].m_max,
				(b3Vector3&)m_smallAabbsCPU[j].m_min,(b3Vector3&)m_smallAabbsCPU[j].m_max))
			{
				b3Int2 pair;
				pair.x = m_smallAabbsCPU[i].m_minIndices[3];//store the original index in the unsorted aabb array
				pair.y = m_smallAabbsCPU[j].m_minIndices[3];
				hostPairs.push_back(pair);
			}
		}
	}

	//small versus large
	for (int i=0;i<numSmallAabbs;i++)
	{
		for (int j=0;j<numLargeAabbs;j++)
		{
			if (TestAabbAgainstAabb2((b3Vector3&)m_smallAabbsCPU[i].m_min, (b3Vector3&)m_smallAabbsCPU[i].m_max,
				(b3Vector3&)m_largeAabbsCPU[j].m_min,(b3Vector3&)m_largeAabbsCPU[j].m_max))
			{
				b3Int2 pair;
				pair.x = m_largeAabbsCPU[j].m_minIndices[3];
				pair.y = m_smallAabbsCPU[i].m_minIndices[3];//store the original index in the unsorted aabb array
				hostPairs.push_back(pair);
			}
		}
	}

	if (hostPairs.size())
	{
		m_overlappingPairs.copyFromHost(hostPairs);
	} else
	{
		m_overlappingPairs.resize(0);
	}

	//init3dSap();
}
/// GPU broadphase. Steps:
///  1. refresh the small and large AABB lists from m_allAabbsGPU (copyAabbsKernel, or a
///     slow host round trip when syncOnHost is enabled),
///  2. build sort keys from the small AABBs' minimum on `axis` (flipFloatKernel), radix
///     sort them and reorder the small AABBs accordingly (scatterKernel),
///  3. find large/small pairs (m_sap2Kernel) and small/small pairs (m_sapKernel); both
///     kernels append to m_overlappingPairs through the same pairCount counter,
///  4. shrink m_overlappingPairs to the number of pairs found, clamped to
///     maxPairs = 64 * number of small AABBs.
void b3GpuSapBroadphase::calculateOverlappingPairs()
{
	int axis = 0;//todo on GPU for now hardcode

	{
		bool syncOnHost = false;

		//--- refresh small aabbs ---
		if (syncOnHost)
		{
			B3_PROFILE("Synchronize m_smallAabbsGPU (CPU/slow)");
			m_allAabbsGPU.copyToHost(m_allAabbsCPU);
			m_smallAabbsGPU.copyToHost(m_smallAabbsCPU);
			{
				int numSmallAabbs = m_smallAabbsCPU.size();
				for (int j=0;j<numSmallAabbs;j++)
				{
					//sync aabb
					int aabbIndex = m_smallAabbsCPU[j].m_signedMaxIndices[3];
					m_smallAabbsCPU[j] = m_allAabbsCPU[aabbIndex];
					m_smallAabbsCPU[j].m_signedMaxIndices[3] = aabbIndex;
				}
			}
			m_smallAabbsGPU.copyFromHost(m_smallAabbsCPU);
		} else
		{
			{
				int numSmallAabbs = m_smallAabbsGPU.size();
				if (numSmallAabbs)
				{
					B3_PROFILE("copyAabbsKernelSmall");
					b3BufferInfoCL bInfo[] = {
						b3BufferInfoCL( m_allAabbsGPU.getBufferCL(), true ),
						b3BufferInfoCL( m_smallAabbsGPU.getBufferCL()),
					};

					b3LauncherCL launcher(m_queue, m_copyAabbsKernel );
					launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
					launcher.setConst( numSmallAabbs );
					int num = numSmallAabbs;
					launcher.launch1D( num);
					clFinish(m_queue);
				}
			}
		}

		//--- refresh large aabbs ---
		if (syncOnHost)
		{
			B3_PROFILE("Synchronize m_largeAabbsGPU (CPU/slow)");
			m_allAabbsGPU.copyToHost(m_allAabbsCPU);
			m_largeAabbsGPU.copyToHost(m_largeAabbsCPU);
			{
				int numLargeAabbs = m_largeAabbsCPU.size();
				for (int j=0;j<numLargeAabbs;j++)
				{
					//sync aabb
					int aabbIndex = m_largeAabbsCPU[j].m_signedMaxIndices[3];
					m_largeAabbsCPU[j] = m_allAabbsCPU[aabbIndex];
					m_largeAabbsCPU[j].m_signedMaxIndices[3] = aabbIndex;
				}
			}
			m_largeAabbsGPU.copyFromHost(m_largeAabbsCPU);
		} else
		{
			int numLargeAabbs = m_largeAabbsGPU.size();
			if (numLargeAabbs)
			{
				B3_PROFILE("copyAabbsKernelLarge");
				b3BufferInfoCL bInfo[] = {
					b3BufferInfoCL( m_allAabbsGPU.getBufferCL(), true ),
					b3BufferInfoCL( m_largeAabbsGPU.getBufferCL()),
				};

				b3LauncherCL launcher(m_queue, m_copyAabbsKernel );
				launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
				launcher.setConst( numLargeAabbs );
				int num = numLargeAabbs;
				launcher.launch1D( num);
				clFinish(m_queue);
			}
		}

		B3_PROFILE("GPU SAP");

		int numSmallAabbs = m_smallAabbsGPU.size();
		m_gpuSmallSortData.resize(numSmallAabbs);
		//the number of large aabbs is queried from m_largeAabbsGPU right before the
		//large/small kernel below

#if 1
		//--- sort keys: flipped minimum of every small aabb along 'axis' ---
		if (m_smallAabbsGPU.size())
		{
			B3_PROFILE("flipFloatKernel");
			b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_smallAabbsGPU.getBufferCL(), true ), b3BufferInfoCL( m_gpuSmallSortData.getBufferCL())};
			b3LauncherCL launcher(m_queue, m_flipFloatKernel );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
			launcher.setConst( numSmallAabbs );
			launcher.setConst( axis );

			int num = numSmallAabbs;
			launcher.launch1D( num);
			clFinish(m_queue);
		}

		{
			B3_PROFILE("gpu radix sort\n");
			m_sorter->execute(m_gpuSmallSortData);
			clFinish(m_queue);
		}

		//--- reorder the small aabbs following the sorted keys ---
		m_gpuSmallSortedAabbs.resize(numSmallAabbs);
		if (numSmallAabbs)
		{
			B3_PROFILE("scatterKernel");
			b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_smallAabbsGPU.getBufferCL(), true ), b3BufferInfoCL( m_gpuSmallSortData.getBufferCL(),true),b3BufferInfoCL(m_gpuSmallSortedAabbs.getBufferCL())};
			b3LauncherCL launcher(m_queue, m_scatterKernel );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
			launcher.setConst( numSmallAabbs);
			int num = numSmallAabbs;
			launcher.launch1D( num);
			clFinish(m_queue);
		}

		//--- pair output buffer and shared counter ---
		int maxPairsPerBody = 64;
		int maxPairs = maxPairsPerBody * numSmallAabbs;//todo
		m_overlappingPairs.resize(maxPairs);

		b3OpenCLArray<int> pairCount(m_context, m_queue);
		pairCount.push_back(0);
		int numPairs=0;

		//--- large versus (sorted) small ---
		{
			int numLargeAabbs = m_largeAabbsGPU.size();
			if (numLargeAabbs && numSmallAabbs)
			{
				B3_PROFILE("sap2Kernel");
				b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_largeAabbsGPU.getBufferCL() ),b3BufferInfoCL( m_gpuSmallSortedAabbs.getBufferCL() ), b3BufferInfoCL( m_overlappingPairs.getBufferCL() ), b3BufferInfoCL(pairCount.getBufferCL())};
				b3LauncherCL launcher(m_queue, m_sap2Kernel);
				launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
				launcher.setConst( numLargeAabbs );
				launcher.setConst( numSmallAabbs);
				launcher.setConst( axis );
				launcher.setConst( maxPairs );
				//@todo: use actual maximum work item sizes of the device instead of hardcoded values
				launcher.launch2D( numLargeAabbs, numSmallAabbs,4,64);

				//the kernel keeps counting past maxPairs but only writes below it
				numPairs = pairCount.at(0);
				if (numPairs >maxPairs)
					numPairs =maxPairs;
			}
		}

		//--- small versus small, continuing from the pair count reached above ---
		if (m_gpuSmallSortedAabbs.size())
		{
			B3_PROFILE("sapKernel");
			b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_gpuSmallSortedAabbs.getBufferCL() ), b3BufferInfoCL( m_overlappingPairs.getBufferCL() ), b3BufferInfoCL(pairCount.getBufferCL())};
			b3LauncherCL launcher(m_queue, m_sapKernel);
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
			launcher.setConst( numSmallAabbs );
			launcher.setConst( axis );
			launcher.setConst( maxPairs );

			int num = numSmallAabbs;
#if 0
			//debug aid: dump the serialized kernel arguments to m_sapKernelArgs.bin
			int buffSize = launcher.getSerializationBufferSize();
			unsigned char* buf = new unsigned char[buffSize+sizeof(int)];
			for (int i=0;i<buffSize+1;i++)
			{
				unsigned char* ptr = (unsigned char*)&buf[i];
				*ptr = 0xff;
			}
			int actualWrite = launcher.serializeArguments(buf,buffSize);

			unsigned char* cptr = (unsigned char*)&buf[buffSize];
			// printf("buf[buffSize] = %d\n",*cptr);

			assert(buf[buffSize]==0xff);//check for buffer overrun
			int* ptr = (int*)&buf[buffSize];

			*ptr = num;

			FILE* f = fopen("m_sapKernelArgs.bin","wb");
			fwrite(buf,buffSize+sizeof(int),1,f);
			fclose(f);
#endif//

			launcher.launch1D( num);
			clFinish(m_queue);

			numPairs = pairCount.at(0);
			if (numPairs>maxPairs)
				numPairs = maxPairs;
		}

#else
		//debug path: replay kernel arguments previously dumped to m_sapKernelArgs.bin
		int numPairs = 0;

		b3LauncherCL launcher(m_queue, m_sapKernel);

		const char* fileName = "m_sapKernelArgs.bin";
		FILE* f = fopen(fileName,"rb");
		if (f)
		{
			int sizeInBytes=0;
			if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET))
			{
				printf("error, cannot get file size\n");
				exit(0);
			}

			unsigned char* buf = (unsigned char*) malloc(sizeInBytes);
			fread(buf,sizeInBytes,1,f);
			int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes,m_context);
			int num = *(int*)&buf[serializedBytes];
			launcher.launch1D( num);

			b3OpenCLArray<int> pairCount(m_context, m_queue);
			int numElements = launcher.m_arrays[2]->size()/sizeof(int);
			pairCount.setFromOpenCLBuffer(launcher.m_arrays[2]->getBufferCL(),numElements);
			numPairs = pairCount.at(0);
			//printf("overlapping pairs = %d\n",numPairs);
			b3AlignedObjectArray<b3Int2> hostOoverlappingPairs;
			b3OpenCLArray<b3Int2> tmpGpuPairs(m_context,m_queue);
			tmpGpuPairs.setFromOpenCLBuffer(launcher.m_arrays[1]->getBufferCL(),numPairs );

			tmpGpuPairs.copyToHost(hostOoverlappingPairs);
			m_overlappingPairs.copyFromHost(hostOoverlappingPairs);
			//printf("hello %d\n", m_overlappingPairs.size());
			free(buf);
			fclose(f);

		} else {
			printf("error: cannot find file %s\n",fileName);
		}

		clFinish(m_queue);

#endif

		//shrink to the number of pairs actually found
		m_overlappingPairs.resize(numPairs);

	}//B3_PROFILE("GPU_RADIX SORT");
}
/// Uploads the three CPU-side AABB lists (all, small, large) to their GPU arrays.
/// Call after all createProxy/createLargeProxy changes are done.
void b3GpuSapBroadphase::writeAabbsToGpu()
{
m_allAabbsGPU.copyFromHost(m_allAabbsCPU);//might not be necessary, the 'setupGpuAabbsFull' already takes care of this
m_smallAabbsGPU.copyFromHost(m_smallAabbsCPU);
m_largeAabbsGPU.copyFromHost(m_largeAabbsCPU);
}
void b3GpuSapBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr ,short int collisionFilterGroup,short int collisionFilterMask)
{
int index = userPtr;
b3SapAabb aabb;
for (int i=0;i<4;i++)
{
aabb.m_min[i] = aabbMin[i];
aabb.m_max[i] = aabbMax[i];
}
aabb.m_minIndices[3] = index;
aabb.m_signedMaxIndices[3] = m_allAabbsCPU.size();
m_largeAabbsCPU.push_back(aabb);
m_allAabbsCPU.push_back(aabb);
}
void b3GpuSapBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr ,short int collisionFilterGroup,short int collisionFilterMask)
{
int index = userPtr;
b3SapAabb aabb;
for (int i=0;i<4;i++)
{
aabb.m_min[i] = aabbMin[i];
aabb.m_max[i] = aabbMax[i];
}
aabb.m_minIndices[3] = index;
aabb.m_signedMaxIndices[3] = m_allAabbsCPU.size();
m_smallAabbsCPU.push_back(aabb);
m_allAabbsCPU.push_back(aabb);
}
/// Returns the OpenCL buffer behind m_allAabbsGPU.
cl_mem b3GpuSapBroadphase::getAabbBufferWS()
{
	cl_mem allAabbsBuffer = m_allAabbsGPU.getBufferCL();
	return allAabbsBuffer;
}
int b3GpuSapBroadphase::getNumOverlap()
{
return m_overlappingPairs.size();
}
/// Returns the OpenCL buffer behind m_overlappingPairs.
cl_mem b3GpuSapBroadphase::getOverlappingPairBuffer()
{
	cl_mem pairBuffer = m_overlappingPairs.getBufferCL();
	return pairBuffer;
}

View File

@@ -0,0 +1,69 @@
#ifndef B3_GPU_SAP_BROADPHASE_H
#define B3_GPU_SAP_BROADPHASE_H
#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h" //b3Int2
class b3Vector3;
#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
#include "b3SapAabb.h"
/// Sweep-and-prune (SAP) broadphase running on OpenCL.
/// AABBs are registered on the CPU with createProxy (small list) or createLargeProxy
/// (large list), uploaded with writeAabbsToGpu, and overlapping pairs are computed with
/// calculateOverlappingPairs (GPU) or calculateOverlappingPairsHost (CPU brute force).
/// NOTE(review): the class owns m_sorter and five kernels but declares no copy
/// constructor/assignment - copying an instance looks unsafe; confirm it is never copied.
class b3GpuSapBroadphase
{
//OpenCL handles passed to the constructor
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_queue;
//kernels created in the constructor and released in the destructor
cl_kernel m_flipFloatKernel;
cl_kernel m_scatterKernel ;
cl_kernel m_copyAabbsKernel;
cl_kernel m_sapKernel;
cl_kernel m_sap2Kernel;
//radix sorter, allocated in the constructor and deleted in the destructor
class b3RadixSort32CL* m_sorter;
///test for 3d SAP
//indexed as [axis][buffer]; m_currentBuffer selects the active buffer and is
//negative until init3dSap() has run
b3AlignedObjectArray<b3SortData> m_sortedAxisCPU[3][2];
int m_currentBuffer;
public:
//every registered aabb, GPU and CPU copies
b3OpenCLArray<b3SapAabb> m_allAabbsGPU;
b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU;
//aabbs registered through createProxy
b3OpenCLArray<b3SapAabb> m_smallAabbsGPU;
b3AlignedObjectArray<b3SapAabb> m_smallAabbsCPU;
//aabbs registered through createLargeProxy
b3OpenCLArray<b3SapAabb> m_largeAabbsGPU;
b3AlignedObjectArray<b3SapAabb> m_largeAabbsCPU;
//result of the last calculateOverlappingPairs / calculateOverlappingPairsHost call
b3OpenCLArray<b3Int2> m_overlappingPairs;
//temporary gpu work memory
b3OpenCLArray<b3SortData> m_gpuSmallSortData;
b3OpenCLArray<b3SapAabb> m_gpuSmallSortedAabbs;
b3GpuSapBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q );
virtual ~b3GpuSapBroadphase();
//compute the overlapping pairs on the GPU
void calculateOverlappingPairs();
//compute the overlapping pairs on the CPU (brute force) and upload them
void calculateOverlappingPairsHost();
//experimental 3-axis SAP on the CPU
void init3dSap();
void calculateOverlappingPairsHostIncremental3Sap();
//userPtr is stored in m_minIndices[3] of the new aabb; the filter group/mask arguments are not used by the implementation
void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr ,short int collisionFilterGroup,short int collisionFilterMask);
void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr ,short int collisionFilterGroup,short int collisionFilterMask);
//call writeAabbsToGpu after done making all changes (createProxy etc)
void writeAabbsToGpu();
//OpenCL buffer of m_allAabbsGPU
cl_mem getAabbBufferWS();
//number of entries in m_overlappingPairs
int getNumOverlap();
//OpenCL buffer of m_overlappingPairs
cl_mem getOverlappingPairBuffer();
};
#endif //B3_GPU_SAP_BROADPHASE_H

View File

@@ -0,0 +1,18 @@
#ifndef B3_SAP_AABB_H
#define B3_SAP_AABB_H
/// Axis-aligned bounding box used by the SAP broadphase: two 16-byte corners whose
/// components can be viewed either as floats or as ints.
/// The broadphase stores indices in the 4th slot of each corner: m_minIndices[3] holds
/// the caller's index (userPtr) and m_signedMaxIndices[3] the position of the box in
/// the list of all AABBs.
struct b3SapAabb
{
//minimum corner (x,y,z) plus an index in slot 3
union
{
float m_min[4];
int m_minIndices[4];
};
//maximum corner (x,y,z) plus an index in slot 3
union
{
float m_max[4];
int m_signedMaxIndices[4];
};
};
#endif //B3_SAP_AABB_H

View File

@@ -0,0 +1,320 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Erwin Coumans
//Device-side AABB: two 16-byte corners, each viewable as a float4, as four floats
//or as four ints. The kernels in this file read slot 3 of the integer views as indices.
typedef struct
{
//minimum corner
union
{
float4 m_min;
float m_minElems[4];
int m_minIndices[4];
};
//maximum corner
union
{
float4 m_max;
float m_maxElems[4];
int m_maxIndices[4];
};
} btAabbCL;
/// conservative overlap test between a private aabb and an aabb in local memory:
/// false only when strictly separated along x, z or y (touching boxes overlap)
bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);
bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)
{
	if (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x)
		return false;
	if (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z)
		return false;
	if (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y)
		return false;
	return true;
}
/// conservative overlap test between two aabbs in global memory:
/// false only when strictly separated along x, z or y (touching boxes overlap)
bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2);
bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2)
{
	if (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x)
		return false;
	if (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z)
		return false;
	if (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y)
		return false;
	return true;
}
/// conservative overlap test between a private aabb and an aabb in global memory:
/// false only when strictly separated along x, z or y (touching boxes overlap)
bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2);
bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2)
{
	if (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x)
		return false;
	if (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z)
		return false;
	if (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y)
		return false;
	return true;
}
//2D kernel: work-item (i,j) tests unsortedAabbs[i] against sortedAabbs[j] and, on overlap,
//appends the pair of their m_minIndices[3] values to pairsOut. pairCount is always
//incremented; the pair itself is only written while the reserved slot is below maxPairs.
//'axis' is not used by this kernel.
__kernel void computePairsKernelTwoArrays( __global const btAabbCL* unsortedAabbs, __global const btAabbCL* sortedAabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numUnsortedAabbs, int numSortedAabbs, int axis, int maxPairs)
{
	const int unsortedIndex = get_global_id(0);
	if (unsortedIndex>=numUnsortedAabbs)
		return;

	const int sortedIndex = get_global_id(1);
	if (sortedIndex>=numSortedAabbs)
		return;

	if (!TestAabbAgainstAabb2GlobalGlobal(&unsortedAabbs[unsortedIndex],&sortedAabbs[sortedIndex]))
		return;

	int2 overlappingPair;
	overlappingPair.x = unsortedAabbs[unsortedIndex].m_minIndices[3];
	overlappingPair.y = sortedAabbs[sortedIndex].m_minIndices[3];

	const int slot = atomic_inc (pairCount);
	if (slot<maxPairs)
	{
		pairsOut[slot] = overlappingPair; //flush to main memory
	}
}
//1D kernel: work-item i walks the aabbs that follow it and stops at the first one whose
//minimum on 'axis' lies beyond its own maximum, so the early exit is only valid when
//aabbs is ordered by ascending minimum along 'axis'. Overlapping pairs are appended to
//pairsOut; pairCount is always incremented, the write only happens below maxPairs.
__kernel void computePairsKernelOriginal( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)
{
	const int i = get_global_id(0);
	if (i>=numObjects)
		return;

	int j = i+1;
	while (j<numObjects)
	{
		//everything from j onwards starts after aabb i ends on the sweep axis
		if ((aabbs[j].m_minElems[axis]) > aabbs[i].m_maxElems[axis])
			break;

		if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))
		{
			int2 overlappingPair;
			overlappingPair.x = aabbs[i].m_minIndices[3];
			overlappingPair.y = aabbs[j].m_minIndices[3];

			const int slot = atomic_inc (pairCount);
			if (slot<maxPairs)
			{
				pairsOut[slot] = overlappingPair; //flush to main memory
			}
		}
		j++;
	}
}
//Variant of the single-array sweep in which all work-items of a work-group iterate in
//lock step: a work-item that is done keeps looping (without testing) until every
//work-item of the group has requested a break, so all of them reach the same barriers.
//Overlapping pairs are appended to pairsOut; pairCount is always incremented, the write
//only happens while the reserved slot is below maxPairs.
__kernel void computePairsKernelBarrier( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)
{
int i = get_global_id(0);
int localId = get_local_id(0);
//per work-group counters: number of work-items in the group and number that are done
__local int numActiveWgItems[1];
__local int breakRequest[1];
if (localId==0)
{
numActiveWgItems[0] = 0;
breakRequest[0] = 0;
}
barrier(CLK_LOCAL_MEM_FENCE);
//every work-item registers itself, including those with i>=numObjects
atomic_inc(numActiveWgItems);
barrier(CLK_LOCAL_MEM_FENCE);
//set once this work-item has finished its sweep
int localBreak = 0;
int j=i+1;
do
{
barrier(CLK_LOCAL_MEM_FENCE);
if (j<numObjects)
{
//aabb j starts after aabb i ends on the sweep axis: this work-item is done
if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis]))
{
if (!localBreak)
{
atomic_inc(breakRequest);
localBreak = 1;
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
//ran past the end of the array: this work-item is done
if (j>=numObjects && !localBreak)
{
atomic_inc(breakRequest);
localBreak = 1;
}
barrier(CLK_LOCAL_MEM_FENCE);
if (!localBreak)
{
if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))
{
int2 myPair;
myPair.x = aabbs[i].m_minIndices[3];
myPair.y = aabbs[j].m_minIndices[3];
int curPair = atomic_inc (pairCount);
if (curPair<maxPairs)
{
pairsOut[curPair] = myPair; //flush to main memory
}
}
}
j++;
//keep looping until every registered work-item has requested a break
} while (breakRequest[0]<numActiveWgItems[0]);
}
//Lock-step sweep like computePairsKernelBarrier, but the aabbs being compared against
//are staged in a 128-entry local-memory tile that is reloaded every 64 iterations.
//Overlapping pairs are appended to pairsOut; pairCount is always incremented, the write
//only happens while the reserved slot is below maxPairs.
//NOTE(review): the 128-entry tile and the +64 offsets imply a work-group size of 64 - confirm against the launch.
__kernel void computePairsKernelLocalSharedMemory( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)
{
int i = get_global_id(0);
int localId = get_local_id(0);
//per work-group counters: number of work-items in the group and number that are done
__local int numActiveWgItems[1];
__local int breakRequest[1];
__local btAabbCL localAabbs[128];// = aabbs[i];
//private copy of this work-item's aabb; out-of-range work-items fall back to aabbs[0]
btAabbCL myAabb;
myAabb = (i<numObjects)? aabbs[i]:aabbs[0];
//sweep stops once a candidate's minimum on 'axis' exceeds this maximum
float testValue = myAabb.m_maxElems[axis];
if (localId==0)
{
numActiveWgItems[0] = 0;
breakRequest[0] = 0;
}
//localCount: iterations since the last tile load; block: offset of the current tile
int localCount=0;
int block=0;
//each work-item loads two tile entries; out-of-range entries are filled with aabbs[0]
localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];
localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];
barrier(CLK_LOCAL_MEM_FENCE);
//every work-item registers itself, including those with i>=numObjects
atomic_inc(numActiveWgItems);
barrier(CLK_LOCAL_MEM_FENCE);
//set once this work-item has finished its sweep
int localBreak = 0;
int j=i+1;
do
{
barrier(CLK_LOCAL_MEM_FENCE);
if (j<numObjects)
{
//tile entry localCount+localId+1 is the candidate for this iteration
if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis]))
{
if (!localBreak)
{
atomic_inc(breakRequest);
localBreak = 1;
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
//ran past the end of the array: this work-item is done
if (j>=numObjects && !localBreak)
{
atomic_inc(breakRequest);
localBreak = 1;
}
barrier(CLK_LOCAL_MEM_FENCE);
if (!localBreak)
{
if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))
{
int2 myPair;
myPair.x = myAabb.m_minIndices[3];
myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];
int curPair = atomic_inc (pairCount);
if (curPair<maxPairs)
{
pairsOut[curPair] = myPair; //flush to main memory
}
}
}
//all reads of the tile are finished before it may be reloaded below
barrier(CLK_LOCAL_MEM_FENCE);
localCount++;
if (localCount==64)
{
//advance the tile by 64 aabbs
localCount = 0;
block+=64;
localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];
localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];
}
j++;
//keep looping until every registered work-item has requested a break
} while (breakRequest[0]<numActiveWgItems[0]);
}
//http://stereopsis.com/radix.html
//Converts a float into an unsigned radix-sort key: negative values have all bits flipped,
//non-negative values only have the sign bit flipped.
unsigned int FloatFlip(float fl);
unsigned int FloatFlip(float fl)
{
	//as_uint reinterprets the bits without casting a pointer to an unrelated type
	unsigned int f = as_uint(fl);
	//sign bit set -> 0xffffffff, otherwise 0x80000000
	unsigned int mask = -(int)(f >> 31) | 0x80000000;
	return f ^ mask;
}
//Inverse of FloatFlip: turns a sort key back into the float it was made from.
float IFloatFlip(unsigned int f);
float IFloatFlip(unsigned int f)
{
	//top bit set (key of a non-negative float) -> 0x80000000, otherwise 0xffffffff
	unsigned int mask = ((f >> 31) - 1) | 0x80000000;
	unsigned int fl = f ^ mask;
	//as_float reinterprets the bits without casting a pointer to an unrelated type
	return as_float(fl);
}
//Refreshes destAabbs from allAabbs: entry i is replaced by allAabbs[src], where src is
//the index stored in destAabbs[i].m_maxIndices[3]; that index is written back afterwards
//because the copy overwrites it.
__kernel void copyAabbsKernel( __global const btAabbCL* allAabbs, __global btAabbCL* destAabbs, int numObjects)
{
	const int idx = get_global_id(0);
	if (idx<numObjects)
	{
		const int sourceIndex = destAabbs[idx].m_maxIndices[3];
		destAabbs[idx] = allAabbs[sourceIndex];
		destAabbs[idx].m_maxIndices[3] = sourceIndex;
	}
}
//Builds one sort entry per aabb: x = FloatFlip of the aabb's minimum on 'axis',
//y = index of the aabb.
__kernel void flipFloatKernel( __global const btAabbCL* aabbs, volatile __global int2* sortData, int numObjects, int axis)
{
	const int idx = get_global_id(0);
	if (idx<numObjects)
	{
		sortData[idx].x = FloatFlip(aabbs[idx].m_minElems[axis]);
		sortData[idx].y = idx;
	}
}
//Reorders the aabbs: output entry i receives the aabb whose index is stored in sortData[i].y.
__kernel void scatterKernel( __global const btAabbCL* aabbs, volatile __global const int2* sortData, __global btAabbCL* sortedAabbs, int numObjects)
{
	const int idx = get_global_id(0);
	if (idx<numObjects)
	{
		sortedAabbs[idx] = aabbs[sortData[idx].y];
	}
}

View File

@@ -0,0 +1,161 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Erwin Coumans
//Device-side AABB: two 16-byte corners, each viewable as a float4, as four floats
//or as four ints. The code in this file reads slot 3 of the integer views.
typedef struct
{
//minimum corner
union
{
float4 m_min;
float m_minElems[4];
int m_minIndices[4];
};
//maximum corner
union
{
float4 m_max;
float m_maxElems[4];
int m_maxIndices[4];
};
} btAabbCL;
/// conservative overlap test between a private aabb and an aabb in local memory:
/// false when both aabbs have m_maxIndices[3]==0, or when they are strictly separated
/// along x, z or y (touching boxes overlap)
bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);
bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)
{
	//skip pairs between static (mass=0) objects
	const bool bothZero = (aabb1->m_maxIndices[3]==0) && (aabb2->m_maxIndices[3] == 0);
	if (bothZero)
		return false;

	if (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x)
		return false;
	if (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z)
		return false;
	if (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y)
		return false;
	return true;
}
//computePairsKernelBatchWrite
//Lock-step sweep over a local-memory tile (see the barriers below) that collects up to
//64 pairs per work-item in private memory and reserves their output slots with a single
//atomic_add. pairCount always grows by the full batch size; each pair is written only if
//its own slot is below maxPairs, so every reserved slot below maxPairs holds a valid pair
//even when a batch straddles the limit.
//NOTE(review): the 128-entry tile and the +64 offsets imply a work-group size of 64 - confirm against the launch.
//NOTE(review): the stringified copy of this file is auto-generated; regenerate it after editing this kernel.
__kernel void computePairsKernel( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)
{
	int i = get_global_id(0);
	int localId = get_local_id(0);

	//per work-group counters: number of work-items in the group and number that are done
	__local int numActiveWgItems[1];
	__local int breakRequest[1];
	__local btAabbCL localAabbs[128];// = aabbs[i];

	//private batch of pairs waiting to be flushed
	int2 myPairs[64];

	//private copy of this work-item's aabb; out-of-range work-items fall back to aabbs[0]
	btAabbCL myAabb;
	myAabb = (i<numObjects)? aabbs[i]:aabbs[0];
	//sweep stops once a candidate's minimum on 'axis' exceeds this maximum
	float testValue = myAabb.m_maxElems[axis];

	if (localId==0)
	{
		numActiveWgItems[0] = 0;
		breakRequest[0] = 0;
	}
	//localCount: iterations since the last tile load; block: offset of the current tile
	int localCount=0;
	int block=0;
	//each work-item loads two tile entries; out-of-range entries are filled with aabbs[0]
	localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];
	localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];

	barrier(CLK_LOCAL_MEM_FENCE);
	//every work-item registers itself, including those with i>=numObjects
	atomic_inc(numActiveWgItems);
	barrier(CLK_LOCAL_MEM_FENCE);
	int localBreak = 0;
	int curNumPairs = 0;

	int j=i+1;
	do
	{
		barrier(CLK_LOCAL_MEM_FENCE);

		if (j<numObjects)
		{
			//tile entry localCount+localId+1 is the candidate for this iteration
			if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis]))
			{
				if (!localBreak)
				{
					atomic_inc(breakRequest);
					localBreak = 1;
				}
			}
		}

		barrier(CLK_LOCAL_MEM_FENCE);

		//ran past the end of the array: this work-item is done
		if (j>=numObjects && !localBreak)
		{
			atomic_inc(breakRequest);
			localBreak = 1;
		}
		barrier(CLK_LOCAL_MEM_FENCE);

		if (!localBreak)
		{
			if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))
			{
				int2 myPair;
				myPair.x = myAabb.m_minIndices[3];
				myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];
				myPairs[curNumPairs] = myPair;
				curNumPairs++;
				if (curNumPairs==64)
				{
					//private batch is full: reserve slots and flush
					int curPair = atomic_add(pairCount,curNumPairs);
					for (int p=0;p<curNumPairs;p++)
					{
						//avoid a buffer overrun, but still fill every reserved slot below maxPairs
						if ((curPair+p)<maxPairs)
						{
							pairsOut[curPair+p] = myPairs[p]; //flush to main memory
						}
					}
					curNumPairs = 0;
				}
			}
		}
		//all reads of the tile are finished before it may be reloaded below
		barrier(CLK_LOCAL_MEM_FENCE);

		localCount++;
		if (localCount==64)
		{
			//advance the tile by 64 aabbs
			localCount = 0;
			block+=64;
			localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];
			localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];
		}
		j++;

	//keep looping until every registered work-item has requested a break
	} while (breakRequest[0]<numActiveWgItems[0]);

	//flush whatever is left in the private batch
	if (curNumPairs>0)
	{
		int curPair = atomic_add(pairCount,curNumPairs);
		for (int p=0;p<curNumPairs;p++)
		{
			//avoid a buffer overrun, but still fill every reserved slot below maxPairs
			if ((curPair+p)<maxPairs)
			{
				pairsOut[curPair+p] = myPairs[p]; //flush to main memory
			}
		}
		curNumPairs = 0;
	}
}

View File

@@ -0,0 +1,164 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
//sapFastCL holds an OpenCL C program as one string literal. The embedded program defines the b3AabbCL
//struct, the TestAabbAgainstAabb2 overlap helper and the computePairsKernel kernel.
//NOTE(review): because this file is generated, edits presumably belong in the originating .cl file - confirm.
static const char* sapFastCL= \
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"\n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Erwin Coumans\n"
"\n"
"\n"
"typedef struct \n"
"{\n"
" union\n"
" {\n"
" float4 m_min;\n"
" float m_minElems[4];\n"
" int m_minIndices[4];\n"
" };\n"
" union\n"
" {\n"
" float4 m_max;\n"
" float m_maxElems[4];\n"
" int m_maxIndices[4];\n"
" };\n"
"} b3AabbCL;\n"
"\n"
"\n"
"/// conservative test for overlap between two aabbs\n"
"bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, __local const b3AabbCL* aabb2);\n"
"bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, __local const b3AabbCL* aabb2)\n"
"{\n"
"//skip pairs between static (mass=0) objects\n"
" if ((aabb1->m_maxIndices[3]==0) && (aabb2->m_maxIndices[3] == 0))\n"
" return false;\n"
" \n"
" bool overlap = true;\n"
" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
" return overlap;\n"
"}\n"
"\n"
"\n"
"//computePairsKernelBatchWrite\n"
"__kernel void computePairsKernel( __global const b3AabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
"{\n"
" int i = get_global_id(0);\n"
" int localId = get_local_id(0);\n"
"\n"
" __local int numActiveWgItems[1];\n"
" __local int breakRequest[1];\n"
" __local b3AabbCL localAabbs[128];// = aabbs[i];\n"
" \n"
" int2 myPairs[64];\n"
" \n"
" b3AabbCL myAabb;\n"
" \n"
" myAabb = (i<numObjects)? aabbs[i]:aabbs[0];\n"
" float testValue = myAabb.m_maxElems[axis];\n"
" \n"
" if (localId==0)\n"
" {\n"
" numActiveWgItems[0] = 0;\n"
" breakRequest[0] = 0;\n"
" }\n"
" int localCount=0;\n"
" int block=0;\n"
" localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];\n"
" localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];\n"
" \n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" atomic_inc(numActiveWgItems);\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" int localBreak = 0;\n"
" int curNumPairs = 0;\n"
" \n"
" int j=i+1;\n"
" do\n"
" {\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if (j<numObjects)\n"
" {\n"
" if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis])) \n"
" {\n"
" if (!localBreak)\n"
" {\n"
" atomic_inc(breakRequest);\n"
" localBreak = 1;\n"
" }\n"
" }\n"
" }\n"
" \n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if (j>=numObjects && !localBreak)\n"
" {\n"
" atomic_inc(breakRequest);\n"
" localBreak = 1;\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if (!localBreak)\n"
" {\n"
" if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))\n"
" {\n"
" int2 myPair;\n"
" myPair.x = myAabb.m_minIndices[3];\n"
" myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];\n"
" myPairs[curNumPairs] = myPair;\n"
" curNumPairs++;\n"
" if (curNumPairs==64)\n"
" {\n"
" int curPair = atomic_add(pairCount,curNumPairs);\n"
" //avoid a buffer overrun\n"
" if ((curPair+curNumPairs)<maxPairs)\n"
" {\n"
" for (int p=0;p<curNumPairs;p++)\n"
" {\n"
" pairsOut[curPair+p] = myPairs[p]; //flush to main memory\n"
" }\n"
" }\n"
" curNumPairs = 0;\n"
" }\n"
" }\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" localCount++;\n"
" if (localCount==64)\n"
" {\n"
" localCount = 0;\n"
" block+=64; \n"
" localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];\n"
" localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];\n"
" }\n"
" j++;\n"
" \n"
" } while (breakRequest[0]<numActiveWgItems[0]);\n"
" \n"
" \n"
" if (curNumPairs>0)\n"
" {\n"
" //avoid a buffer overrun\n"
" int curPair = atomic_add(pairCount,curNumPairs);\n"
" if ((curPair+curNumPairs)<maxPairs)\n"
" {\n"
" for (int p=0;p<curNumPairs;p++)\n"
" {\n"
" pairsOut[curPair+p] = myPairs[p]; //flush to main memory\n"
" }\n"
" }\n"
" curNumPairs = 0;\n"
" }\n"
"}\n"
;

View File

@@ -0,0 +1,324 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
//sapCL holds an OpenCL C program as one string literal. The embedded program defines the b3AabbCL struct,
//three AABB overlap helpers (private/local, global/global, private/global), the pair-finding kernels
//computePairsKernelTwoArrays, computePairsKernelOriginal, computePairsKernelBarrier and
//computePairsKernelLocalSharedMemory, the FloatFlip/IFloatFlip helpers, and the copyAabbsKernel,
//flipFloatKernel and scatterKernel kernels.
//NOTE(review): because this file is generated, edits presumably belong in the originating .cl file - confirm.
static const char* sapCL= \
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"\n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Erwin Coumans\n"
"\n"
"\n"
"typedef struct \n"
"{\n"
" union\n"
" {\n"
" float4 m_min;\n"
" float m_minElems[4];\n"
" int m_minIndices[4];\n"
" };\n"
" union\n"
" {\n"
" float4 m_max;\n"
" float m_maxElems[4];\n"
" int m_maxIndices[4];\n"
" };\n"
"} b3AabbCL;\n"
"\n"
"\n"
"/// conservative test for overlap between two aabbs\n"
"bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, __local const b3AabbCL* aabb2);\n"
"bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, __local const b3AabbCL* aabb2)\n"
"{\n"
" bool overlap = true;\n"
" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
" return overlap;\n"
"}\n"
"bool TestAabbAgainstAabb2GlobalGlobal(__global const b3AabbCL* aabb1, __global const b3AabbCL* aabb2);\n"
"bool TestAabbAgainstAabb2GlobalGlobal(__global const b3AabbCL* aabb1, __global const b3AabbCL* aabb2)\n"
"{\n"
" bool overlap = true;\n"
" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
" return overlap;\n"
"}\n"
"\n"
"bool TestAabbAgainstAabb2Global(const b3AabbCL* aabb1, __global const b3AabbCL* aabb2);\n"
"bool TestAabbAgainstAabb2Global(const b3AabbCL* aabb1, __global const b3AabbCL* aabb2)\n"
"{\n"
" bool overlap = true;\n"
" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
" return overlap;\n"
"}\n"
"\n"
"\n"
"__kernel void computePairsKernelTwoArrays( __global const b3AabbCL* unsortedAabbs, __global const b3AabbCL* sortedAabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numUnsortedAabbs, int numSortedAabbs, int axis, int maxPairs)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>=numUnsortedAabbs)\n"
" return;\n"
"\n"
" int j = get_global_id(1);\n"
" if (j>=numSortedAabbs)\n"
" return;\n"
"\n"
" if (TestAabbAgainstAabb2GlobalGlobal(&unsortedAabbs[i],&sortedAabbs[j]))\n"
" {\n"
" int2 myPair;\n"
" \n"
" myPair.x = unsortedAabbs[i].m_minIndices[3];\n"
" myPair.y = sortedAabbs[j].m_minIndices[3];\n"
"\n"
" int curPair = atomic_inc (pairCount);\n"
" if (curPair<maxPairs)\n"
" {\n"
" pairsOut[curPair] = myPair; //flush to main memory\n"
" }\n"
" }\n"
"}\n"
"\n"
"__kernel void computePairsKernelOriginal( __global const b3AabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>=numObjects)\n"
" return;\n"
" for (int j=i+1;j<numObjects;j++)\n"
" {\n"
" if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n"
" {\n"
" break;\n"
" }\n"
" if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
" {\n"
" int2 myPair;\n"
" myPair.x = aabbs[i].m_minIndices[3];\n"
" myPair.y = aabbs[j].m_minIndices[3];\n"
" int curPair = atomic_inc (pairCount);\n"
" if (curPair<maxPairs)\n"
" {\n"
" pairsOut[curPair] = myPair; //flush to main memory\n"
" }\n"
" }\n"
" }\n"
"}\n"
"\n"
"\n"
"\n"
"\n"
"__kernel void computePairsKernelBarrier( __global const b3AabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
"{\n"
" int i = get_global_id(0);\n"
" int localId = get_local_id(0);\n"
"\n"
" __local int numActiveWgItems[1];\n"
" __local int breakRequest[1];\n"
"\n"
" if (localId==0)\n"
" {\n"
" numActiveWgItems[0] = 0;\n"
" breakRequest[0] = 0;\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" atomic_inc(numActiveWgItems);\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" int localBreak = 0;\n"
"\n"
" int j=i+1;\n"
" do\n"
" {\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if (j<numObjects)\n"
" {\n"
" if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n"
" {\n"
" if (!localBreak)\n"
" {\n"
" atomic_inc(breakRequest);\n"
" localBreak = 1;\n"
" }\n"
" }\n"
" }\n"
" \n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if (j>=numObjects && !localBreak)\n"
" {\n"
" atomic_inc(breakRequest);\n"
" localBreak = 1;\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if (!localBreak)\n"
" {\n"
" if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
" {\n"
" int2 myPair;\n"
" myPair.x = aabbs[i].m_minIndices[3];\n"
" myPair.y = aabbs[j].m_minIndices[3];\n"
" int curPair = atomic_inc (pairCount);\n"
" if (curPair<maxPairs)\n"
" {\n"
" pairsOut[curPair] = myPair; //flush to main memory\n"
" }\n"
" }\n"
" }\n"
" j++;\n"
"\n"
" } while (breakRequest[0]<numActiveWgItems[0]);\n"
"}\n"
"\n"
"\n"
"__kernel void computePairsKernelLocalSharedMemory( __global const b3AabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
"{\n"
" int i = get_global_id(0);\n"
" int localId = get_local_id(0);\n"
"\n"
" __local int numActiveWgItems[1];\n"
" __local int breakRequest[1];\n"
" __local b3AabbCL localAabbs[128];// = aabbs[i];\n"
" \n"
" b3AabbCL myAabb;\n"
" \n"
" myAabb = (i<numObjects)? aabbs[i]:aabbs[0];\n"
" float testValue = myAabb.m_maxElems[axis];\n"
" \n"
" if (localId==0)\n"
" {\n"
" numActiveWgItems[0] = 0;\n"
" breakRequest[0] = 0;\n"
" }\n"
" int localCount=0;\n"
" int block=0;\n"
" localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];\n"
" localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];\n"
" \n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" atomic_inc(numActiveWgItems);\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" int localBreak = 0;\n"
" \n"
" int j=i+1;\n"
" do\n"
" {\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if (j<numObjects)\n"
" {\n"
" if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis])) \n"
" {\n"
" if (!localBreak)\n"
" {\n"
" atomic_inc(breakRequest);\n"
" localBreak = 1;\n"
" }\n"
" }\n"
" }\n"
" \n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if (j>=numObjects && !localBreak)\n"
" {\n"
" atomic_inc(breakRequest);\n"
" localBreak = 1;\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if (!localBreak)\n"
" {\n"
" if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))\n"
" {\n"
" int2 myPair;\n"
" myPair.x = myAabb.m_minIndices[3];\n"
" myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];\n"
" int curPair = atomic_inc (pairCount);\n"
" if (curPair<maxPairs)\n"
" {\n"
" pairsOut[curPair] = myPair; //flush to main memory\n"
" }\n"
" }\n"
" }\n"
" \n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
" localCount++;\n"
" if (localCount==64)\n"
" {\n"
" localCount = 0;\n"
" block+=64; \n"
" localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];\n"
" localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];\n"
" }\n"
" j++;\n"
" \n"
" } while (breakRequest[0]<numActiveWgItems[0]);\n"
" \n"
"}\n"
"\n"
"\n"
"\n"
"\n"
"//http://stereopsis.com/radix.html\n"
"unsigned int FloatFlip(float fl);\n"
"unsigned int FloatFlip(float fl)\n"
"{\n"
" unsigned int f = *(unsigned int*)&fl;\n"
" unsigned int mask = -(int)(f >> 31) | 0x80000000;\n"
" return f ^ mask;\n"
"}\n"
"float IFloatFlip(unsigned int f);\n"
"float IFloatFlip(unsigned int f)\n"
"{\n"
" unsigned int mask = ((f >> 31) - 1) | 0x80000000;\n"
" unsigned int fl = f ^ mask;\n"
" return *(float*)&fl;\n"
"}\n"
"\n"
"\n"
"\n"
"\n"
"__kernel void copyAabbsKernel( __global const b3AabbCL* allAabbs, __global b3AabbCL* destAabbs, int numObjects)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>=numObjects)\n"
" return;\n"
" int src = destAabbs[i].m_maxIndices[3];\n"
" destAabbs[i] = allAabbs[src];\n"
" destAabbs[i].m_maxIndices[3] = src;\n"
"}\n"
"\n"
"\n"
"__kernel void flipFloatKernel( __global const b3AabbCL* aabbs, volatile __global int2* sortData, int numObjects, int axis)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>=numObjects)\n"
" return;\n"
" \n"
" sortData[i].x = FloatFlip(aabbs[i].m_minElems[axis]);\n"
" sortData[i].y = i;\n"
" \n"
"}\n"
"\n"
"\n"
"__kernel void scatterKernel( __global const b3AabbCL* aabbs, volatile __global const int2* sortData, __global b3AabbCL* sortedAabbs, int numObjects)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>=numObjects)\n"
" return;\n"
"\n"
" sortedAabbs[i] = aabbs[sortData[i].y];\n"
"}\n"
"\n"
;

View File

@@ -0,0 +1,44 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2011 Advanced Micro Devices, Inc. http://bulletphysics.org
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#ifndef B3_OPENCL_INCLUDE_H
#define B3_OPENCL_INCLUDE_H
#ifdef __APPLE__
#ifdef USE_MINICL
#include <MiniCL/cl.h>
#else
#include <OpenCL/cl.h>
#include <OpenCL/cl_ext.h> //clLogMessagesToStderrAPPLE
#endif
#else
#ifdef USE_MINICL
#include <MiniCL/cl.h>
#else
#include <CL/cl.h>
#ifdef _WIN32
#include "CL/cl_gl.h"
#endif //_WIN32
#endif
#endif //__APPLE__
#include <assert.h>
#include <stdio.h>
//Prints the value of (a) and asserts when (a) differs from (b).
//The body is wrapped in do { } while(0) so the macro expands to exactly one statement and can be
//used safely as the body of an if/else. Note that (a) is evaluated more than once on mismatch.
#define oclCHECKERROR(a, b) do { if((a)!=(b)) { printf("OCL Error : %d\n", (a)); assert((a) == (b)); } } while(0)
#endif //B3_OPENCL_INCLUDE_H

View File

@@ -0,0 +1,911 @@
/*
Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
Copyright (C) 2006 - 2011 Sony Computer Entertainment Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//original author: Roman Ponomarev
//cleanup by Erwin Coumans
//Debug switches for the kernel binary cache. When either flag is true,
//b3OpenCLUtils_compileCLProgramFromString skips its cached-binary lookup (visible in the _WIN32 path).
//NOTE(review): any further difference in meaning between the two flags is not visible here - confirm.
bool gDebugForceLoadingFromSource = false;
bool gDebugSkipLoadingBinary = false;
#include <string.h>
#ifdef _WIN32
#pragma warning (disable:4996)
#endif
#include "b3OpenCLUtils.h"
//#include "b3OpenCLInclude.h"
#include <stdio.h>
#include <stdlib.h>
#define B3_MAX_CL_DEVICES 16 //who needs 16 devices?
#ifdef _WIN32
#include <Windows.h>
#endif
#include <assert.h>
#define b3Assert assert
//Preferred platform vendor string, chosen at compile time from whichever CL_PLATFORM_* macro is defined.
//Falls back to "Unknown Vendor" when none of them is set.
#if defined(CL_PLATFORM_MINI_CL)
static const char* spPlatformVendor = "MiniCL, SCEA";
#elif defined(CL_PLATFORM_AMD)
static const char* spPlatformVendor = "Advanced Micro Devices, Inc.";
#elif defined(CL_PLATFORM_NVIDIA)
static const char* spPlatformVendor = "NVIDIA Corporation";
#elif defined(CL_PLATFORM_INTEL)
static const char* spPlatformVendor = "Intel(R) Corporation";
#else
static const char* spPlatformVendor = "Unknown Vendor";
#endif
#ifndef CL_PLATFORM_MINI_CL
#ifdef _WIN32
#include "CL/cl_gl.h"
#endif //_WIN32
#endif
//Context notification callback (passed to clCreateContext in the __APPLE__ build).
//Echoes the message, then classifies it: a message containing "Warning" is only reported,
//anything else is treated as an error and trips b3Assert.
//private_info, cb and user_data are not used.
void MyFatalBreakAPPLE(const char* errstr, const void* private_info, size_t cb, void* user_data)
{
	printf("Error: %s\n", errstr);

	const bool isWarning = (strstr(errstr, "Warning") != 0);
	if (!isWarning)
	{
		printf("error\n");
		b3Assert(0);
		return;
	}
	printf("warning\n");
}
//Returns the number of OpenCL platforms reported by clGetPlatformIDs.
//pErrNum is optional; when non-NULL it receives the status of the query. It is written on success
//as well as on failure, so callers never read an uninitialised error code.
int b3OpenCLUtils_getNumPlatforms(cl_int* pErrNum)
{
	cl_platform_id pPlatforms[10] = { 0 };
	cl_uint numPlatforms = 0;

	cl_int ciErrNum = clGetPlatformIDs(10, pPlatforms, &numPlatforms);
	//cl_int ciErrNum = clGetPlatformIDs(0, NULL, &numPlatforms);

	if(pErrNum != NULL)
		*pErrNum = ciErrNum;

	return (int)numPlatforms;
}
//Returns the compile-time selected vendor string stored in spPlatformVendor.
const char* b3OpenCLUtils_getSdkVendorName()
{
return spPlatformVendor;
}
//Returns the platform id at position platformIndex0 in the list reported by clGetPlatformIDs.
//Returns 0 when the index is out of range, when a query fails or when the temporary list cannot be allocated.
//pErrNum is optional; it is written only when an OpenCL call fails.
cl_platform_id b3OpenCLUtils_getPlatform(int platformIndex0, cl_int* pErrNum)
{
	cl_platform_id platform = 0;
	cl_uint numPlatforms = 0;

	//first call only asks for the count
	cl_int ciErrNum = clGetPlatformIDs(0, NULL, &numPlatforms);
	if(ciErrNum != CL_SUCCESS)
	{
		if(pErrNum != NULL)
			*pErrNum = ciErrNum;
		return platform;
	}

	if (platformIndex0 < 0 || (cl_uint)platformIndex0 >= numPlatforms)
		return platform;

	cl_platform_id* platforms = (cl_platform_id*) malloc (sizeof(cl_platform_id)*numPlatforms);
	if (platforms == NULL)
		return platform;

	ciErrNum = clGetPlatformIDs(numPlatforms, platforms, NULL);
	if(ciErrNum == CL_SUCCESS)
	{
		platform = platforms[platformIndex0];
	}
	else if(pErrNum != NULL)
	{
		*pErrNum = ciErrNum;
	}

	//released on every path; the original leaked the list when the second query failed
	free (platforms);
	return platform;
}
//Queries vendor, name and version of the given platform, in that order, and stores each string in
//the matching field of *platformInfo. Every query is checked with oclCHECKERROR.
void b3OpenCLUtils::getPlatformInfo(cl_platform_id platform, b3OpenCLPlatformInfo* platformInfo)
{
	struct PlatformQuery
	{
		cl_platform_info m_param;
		void* m_dest;
	};

	const PlatformQuery queries[] =
	{
		{ CL_PLATFORM_VENDOR,  platformInfo->m_platformVendor },
		{ CL_PLATFORM_NAME,    platformInfo->m_platformName },
		{ CL_PLATFORM_VERSION, platformInfo->m_platformVersion }
	};

	const unsigned int numQueries = (unsigned int)(sizeof(queries)/sizeof(queries[0]));
	for (unsigned int q = 0; q < numQueries; ++q)
	{
		cl_int status = clGetPlatformInfo(platform, queries[q].m_param, B3_MAX_STRING_LENGTH, queries[q].m_dest, NULL);
		oclCHECKERROR(status,CL_SUCCESS);
	}
}
//Fetches the platform strings through b3OpenCLUtils::getPlatformInfo and prints
//vendor, name and version to stdout.
void b3OpenCLUtils_printPlatformInfo(cl_platform_id platform)
{
	b3OpenCLPlatformInfo details;
	b3OpenCLUtils::getPlatformInfo(platform, &details);

	printf("Platform info:\n");
	printf(" CL_PLATFORM_VENDOR: \t\t\t%s\n", details.m_platformVendor);
	printf(" CL_PLATFORM_NAME: \t\t\t%s\n", details.m_platformName);
	printf(" CL_PLATFORM_VERSION: \t\t\t%s\n", details.m_platformVersion);
}
//Creates an OpenCL context for devices of the given type on one platform.
//  platform               platform to enumerate; when NULL no context properties are passed
//  deviceType             device type filter handed to clGetDeviceIDs
//  pErrNum                optional; receives the status of the device query or of context creation
//  pGLContext, pGLDC      optional OpenGL context / device context; the sharing properties are only
//                         filled in for _WIN32 builds. When pGLContext is set, devices are tried one
//                         by one until a context can be created.
//  preferredDeviceIndex   index into the device list; an out-of-range value selects all devices
//  preferredPlatformIndex not used by this function
//Returns the new context, or 0 on failure.
cl_context b3OpenCLUtils_createContextFromPlatform(cl_platform_id platform, cl_device_type deviceType, cl_int* pErrNum, void* pGLContext, void* pGLDC, int preferredDeviceIndex, int preferredPlatformIndex)
{
	cl_context retContext = 0;
	cl_int ciErrNum=0;
	cl_device_id devices[B3_MAX_CL_DEVICES];
	cl_uint num_devices = 0;
	cl_context_properties* cprops;

	(void)preferredPlatformIndex;

	/*
	* If we could find our platform, use it. Otherwise pass a NULL and get whatever the
	* implementation thinks we should be using.
	*/
	cl_context_properties cps[7] = {0,0,0,0,0,0,0};
	cps[0] = CL_CONTEXT_PLATFORM;
	cps[1] = (cl_context_properties)platform;
#ifdef _WIN32
	if (pGLContext && pGLDC)
	{
		cps[2] = CL_GL_CONTEXT_KHR;
		cps[3] = (cl_context_properties)pGLContext;
		cps[4] = CL_WGL_HDC_KHR;
		cps[5] = (cl_context_properties)pGLDC;
	}
#endif //_WIN32

	ciErrNum = clGetDeviceIDs(
		platform,
		deviceType,
		B3_MAX_CL_DEVICES,
		devices,
		&num_devices);
	if (ciErrNum<0)
	{
		printf("clGetDeviceIDs returned %d\n",ciErrNum);
		//hand the failure code to the caller instead of dropping it
		if(pErrNum != NULL)
			*pErrNum = ciErrNum;
		return 0;
	}

	//clGetDeviceIDs reports how many devices are available, which may exceed the
	//number of entries actually written into 'devices'
	if (num_devices > B3_MAX_CL_DEVICES)
		num_devices = B3_MAX_CL_DEVICES;

	cprops = (NULL == platform) ? NULL : cps;
	if (!num_devices)
		return 0;

	if (pGLContext)
	{
		//search for the GPU that relates to the OpenCL context
		unsigned int i;
		for (i=0;i<num_devices;i++)
		{
			retContext = clCreateContext(cprops,1,&devices[i],NULL,NULL,&ciErrNum);
			if (ciErrNum==CL_SUCCESS)
				break;
		}
	}
	else
	{
		if (preferredDeviceIndex>=0 && (unsigned int)preferredDeviceIndex<num_devices)
		{
			//create a context of the preferred device index
			retContext = clCreateContext(cprops,1,&devices[preferredDeviceIndex],NULL,NULL,&ciErrNum);
		} else
		{
			//create a context of all devices
#if defined (__APPLE__)
			retContext = clCreateContext(cprops,num_devices,devices,MyFatalBreakAPPLE,NULL,&ciErrNum);
#else
			printf("numDevices=%u\n",num_devices);
			retContext = clCreateContext(cprops,num_devices,devices,NULL,NULL,&ciErrNum);
#endif
		}
	}

	if(pErrNum != NULL)
	{
		*pErrNum = ciErrNum;
	}
	return retContext;
}
//Creates an OpenCL context for the given device type, trying the available platforms in order.
//The platform at preferredPlatformIndex (when >= 0 and in range) is moved to the front of the list;
//otherwise platforms whose vendor string equals spPlatformVendor are moved to the front.
//The first platform for which b3OpenCLUtils_createContextFromPlatform succeeds wins, and its id is
//stored in *retPlatformId when that pointer is non-NULL.
//pErrNum is optional and receives the status of a failing platform query or of the context creation.
//Returns the context, or 0/NULL when none could be created.
cl_context b3OpenCLUtils_createContextFromType(cl_device_type deviceType, cl_int* pErrNum, void* pGLContext, void* pGLDC , int preferredDeviceIndex, int preferredPlatformIndex, cl_platform_id* retPlatformId)
{
	cl_uint numPlatforms = 0;
	cl_context retContext = 0;
	unsigned int i;

	cl_int ciErrNum = clGetPlatformIDs(0, NULL, &numPlatforms);
	if(ciErrNum != CL_SUCCESS)
	{
		if(pErrNum != NULL) *pErrNum = ciErrNum;
		return NULL;
	}

	if(numPlatforms > 0)
	{
		cl_platform_id* platforms = (cl_platform_id*) malloc (sizeof(cl_platform_id)*numPlatforms);
		if (platforms == NULL)
			return NULL;

		ciErrNum = clGetPlatformIDs(numPlatforms, platforms, NULL);
		if(ciErrNum != CL_SUCCESS)
		{
			if(pErrNum != NULL)
				*pErrNum = ciErrNum;
			free(platforms);
			return NULL;
		}

		//reorder the list so the preferred platform is tried first
		for ( i = 0; i < numPlatforms; ++i)
		{
			char pbuf[128];
			ciErrNum = clGetPlatformInfo( platforms[i],
				CL_PLATFORM_VENDOR,
				sizeof(pbuf),
				pbuf,
				NULL);
			if(ciErrNum != CL_SUCCESS)
			{
				if(pErrNum != NULL) *pErrNum = ciErrNum;
				//the original returned here without releasing the list
				free(platforms);
				return NULL;
			}

			if (preferredPlatformIndex>=0 && i==(unsigned int)preferredPlatformIndex)
			{
				cl_platform_id tmpPlatform = platforms[0];
				platforms[0] = platforms[i];
				platforms[i] = tmpPlatform;
				break;
			} else
			{
				if(!strcmp(pbuf, spPlatformVendor))
				{
					cl_platform_id tmpPlatform = platforms[0];
					platforms[0] = platforms[i];
					platforms[i] = tmpPlatform;
				}
			}
		}

		for (i = 0; i < numPlatforms; ++i)
		{
			cl_platform_id platform = platforms[i];
			assert(platform);

			retContext = b3OpenCLUtils_createContextFromPlatform(platform,deviceType,pErrNum,pGLContext,pGLDC,preferredDeviceIndex,preferredPlatformIndex);
			if (retContext)
			{
				// printf("OpenCL platform details:\n");
				b3OpenCLPlatformInfo platformInfo;
				b3OpenCLUtils::getPlatformInfo(platform, &platformInfo);
				if (retPlatformId)
					*retPlatformId = platform;
				break;
			}
		}

		free (platforms);
	}
	return retContext;
}
//////////////////////////////////////////////////////////////////////////////
//! Gets the id of the nth device from the context
//!
//! @return the id, or (cl_device_id)-1 when deviceIndex is out of range,
//!         a context query fails or the temporary device list cannot be allocated
//! @param cxMainContext OpenCL context
//! @param deviceIndex zero-based index of the device of interest
//////////////////////////////////////////////////////////////////////////////
cl_device_id b3OpenCLUtils_getDevice(cl_context cxMainContext, int deviceIndex)
{
	assert(cxMainContext);

	size_t szParmDataBytes = 0;
	cl_device_id* cdDevices;
	cl_device_id device = (cl_device_id)-1;

	// get the size of the device list associated with the context
	if (clGetContextInfo(cxMainContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes) != CL_SUCCESS)
		return device;

	// valid indices are 0 .. numDevices-1 (the original also accepted numDevices itself)
	const size_t numDevices = szParmDataBytes / sizeof(cl_device_id);
	if (deviceIndex < 0 || (size_t)deviceIndex >= numDevices)
		return device;

	cdDevices = (cl_device_id*) malloc(szParmDataBytes);
	if (cdDevices == NULL)
		return device;

	if (clGetContextInfo(cxMainContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL) == CL_SUCCESS)
		device = cdDevices[deviceIndex];

	free(cdDevices);
	return device;
}
//Returns the number of devices attached to the context, derived from the byte size of its
//CL_CONTEXT_DEVICES list. Returns 0 when the query fails.
int b3OpenCLUtils_getNumDevices(cl_context cxMainContext)
{
	// initialised so a failed query yields 0 instead of an indeterminate value
	size_t szParamDataBytes = 0;
	if (clGetContextInfo(cxMainContext, CL_CONTEXT_DEVICES, 0, NULL, &szParamDataBytes) != CL_SUCCESS)
		return 0;

	// divide first, then narrow the resulting count to int
	return (int)(szParamDataBytes / sizeof(cl_device_id));
}
// Fills *info with properties of the given device by issuing one clGetDeviceInfo query per field.
// The return codes of the queries are ignored.
// NOTE(review): the caller presumably has to treat fields as undefined if a query fails - confirm.
void b3OpenCLUtils::getDeviceInfo(cl_device_id device, b3OpenCLDeviceInfo* info)
{
// CL_DEVICE_NAME
clGetDeviceInfo(device, CL_DEVICE_NAME, B3_MAX_STRING_LENGTH, &info->m_deviceName, NULL);
// CL_DEVICE_VENDOR
clGetDeviceInfo(device, CL_DEVICE_VENDOR, B3_MAX_STRING_LENGTH, &info->m_deviceVendor, NULL);
// CL_DRIVER_VERSION
clGetDeviceInfo(device, CL_DRIVER_VERSION, B3_MAX_STRING_LENGTH, &info->m_driverVersion, NULL);
// CL_DEVICE_TYPE
clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(cl_device_type), &info->m_deviceType, NULL);
// CL_DEVICE_MAX_COMPUTE_UNITS
clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(info->m_computeUnits), &info->m_computeUnits, NULL);
// CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS
clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(info->m_workitemDims), &info->m_workitemDims, NULL);
// CL_DEVICE_MAX_WORK_ITEM_SIZES
clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(info->m_workItemSize), &info->m_workItemSize, NULL);
// CL_DEVICE_MAX_WORK_GROUP_SIZE
clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(info->m_workgroupSize), &info->m_workgroupSize, NULL);
// CL_DEVICE_MAX_CLOCK_FREQUENCY
clGetDeviceInfo(device, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(info->m_clockFrequency), &info->m_clockFrequency, NULL);
// CL_DEVICE_ADDRESS_BITS
clGetDeviceInfo(device, CL_DEVICE_ADDRESS_BITS, sizeof(info->m_addressBits), &info->m_addressBits, NULL);
// CL_DEVICE_MAX_MEM_ALLOC_SIZE
clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(info->m_maxMemAllocSize), &info->m_maxMemAllocSize, NULL);
// CL_DEVICE_GLOBAL_MEM_SIZE
clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(info->m_globalMemSize), &info->m_globalMemSize, NULL);
// CL_DEVICE_ERROR_CORRECTION_SUPPORT
clGetDeviceInfo(device, CL_DEVICE_ERROR_CORRECTION_SUPPORT, sizeof(info->m_errorCorrectionSupport), &info->m_errorCorrectionSupport, NULL);
// CL_DEVICE_LOCAL_MEM_TYPE
clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_TYPE, sizeof(info->m_localMemType), &info->m_localMemType, NULL);
// CL_DEVICE_LOCAL_MEM_SIZE
clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(info->m_localMemSize), &info->m_localMemSize, NULL);
// CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE
clGetDeviceInfo(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(info->m_constantBufferSize), &info->m_constantBufferSize, NULL);
// CL_DEVICE_QUEUE_PROPERTIES
clGetDeviceInfo(device, CL_DEVICE_QUEUE_PROPERTIES, sizeof(info->m_queueProperties), &info->m_queueProperties, NULL);
// CL_DEVICE_IMAGE_SUPPORT
clGetDeviceInfo(device, CL_DEVICE_IMAGE_SUPPORT, sizeof(info->m_imageSupport), &info->m_imageSupport, NULL);
// CL_DEVICE_MAX_READ_IMAGE_ARGS
clGetDeviceInfo(device, CL_DEVICE_MAX_READ_IMAGE_ARGS, sizeof(info->m_maxReadImageArgs), &info->m_maxReadImageArgs, NULL);
// CL_DEVICE_MAX_WRITE_IMAGE_ARGS
clGetDeviceInfo(device, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, sizeof(info->m_maxWriteImageArgs), &info->m_maxWriteImageArgs, NULL);
// CL_DEVICE_IMAGE2D_MAX_WIDTH, CL_DEVICE_IMAGE2D_MAX_HEIGHT, CL_DEVICE_IMAGE3D_MAX_WIDTH, CL_DEVICE_IMAGE3D_MAX_HEIGHT, CL_DEVICE_IMAGE3D_MAX_DEPTH
// (each of these is requested with sizeof(size_t) bytes)
clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(size_t), &info->m_image2dMaxWidth, NULL);
clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(size_t), &info->m_image2dMaxHeight, NULL);
clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_WIDTH, sizeof(size_t), &info->m_image3dMaxWidth, NULL);
clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_HEIGHT, sizeof(size_t), &info->m_image3dMaxHeight, NULL);
clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_DEPTH, sizeof(size_t), &info->m_image3dMaxDepth, NULL);
// CL_DEVICE_EXTENSIONS: the extension string is stored as returned by the query
clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, B3_MAX_STRING_LENGTH, &info->m_deviceExtensions, NULL);
// CL_DEVICE_PREFERRED_VECTOR_WIDTH_<type>
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, sizeof(cl_uint), &info->m_vecWidthChar, NULL);
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, sizeof(cl_uint), &info->m_vecWidthShort, NULL);
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof(cl_uint), &info->m_vecWidthInt, NULL);
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, sizeof(cl_uint), &info->m_vecWidthLong, NULL);
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, sizeof(cl_uint), &info->m_vecWidthFloat, NULL);
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, sizeof(cl_uint), &info->m_vecWidthDouble, NULL);
}
//Queries the device through b3OpenCLUtils::getDeviceInfo and prints a report to stdout.
//Every value printed with %u is converted to unsigned int first: several of the fields are
//queried as size_t, and passing a wider type to %u is undefined behaviour.
void b3OpenCLUtils_printDeviceInfo(cl_device_id device)
{
	b3OpenCLDeviceInfo info;
	b3OpenCLUtils::getDeviceInfo(device,&info);

	printf("Device Info:\n");
	printf(" CL_DEVICE_NAME: \t\t\t%s\n", info.m_deviceName);
	printf(" CL_DEVICE_VENDOR: \t\t\t%s\n", info.m_deviceVendor);
	printf(" CL_DRIVER_VERSION: \t\t\t%s\n", info.m_driverVersion);

	//the device type is a bit field, so more than one line may be printed
	if( info.m_deviceType & CL_DEVICE_TYPE_CPU )
		printf(" CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_CPU");
	if( info.m_deviceType & CL_DEVICE_TYPE_GPU )
		printf(" CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_GPU");
	if( info.m_deviceType & CL_DEVICE_TYPE_ACCELERATOR )
		printf(" CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_ACCELERATOR");
	if( info.m_deviceType & CL_DEVICE_TYPE_DEFAULT )
		printf(" CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_DEFAULT");

	printf(" CL_DEVICE_MAX_COMPUTE_UNITS:\t\t%u\n", (unsigned int)info.m_computeUnits);
	printf(" CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:\t%u\n", (unsigned int)info.m_workitemDims);
	printf(" CL_DEVICE_MAX_WORK_ITEM_SIZES:\t%u / %u / %u \n", (unsigned int)info.m_workItemSize[0], (unsigned int)info.m_workItemSize[1], (unsigned int)info.m_workItemSize[2]);
	printf(" CL_DEVICE_MAX_WORK_GROUP_SIZE:\t%u\n", (unsigned int)info.m_workgroupSize);
	printf(" CL_DEVICE_MAX_CLOCK_FREQUENCY:\t%u MHz\n", (unsigned int)info.m_clockFrequency);
	printf(" CL_DEVICE_ADDRESS_BITS:\t\t%u\n", (unsigned int)info.m_addressBits);
	printf(" CL_DEVICE_MAX_MEM_ALLOC_SIZE:\t\t%u MByte\n", (unsigned int)(info.m_maxMemAllocSize/ (1024 * 1024)));
	printf(" CL_DEVICE_GLOBAL_MEM_SIZE:\t\t%u MByte\n", (unsigned int)(info.m_globalMemSize/ (1024 * 1024)));
	printf(" CL_DEVICE_ERROR_CORRECTION_SUPPORT:\t%s\n", info.m_errorCorrectionSupport== CL_TRUE ? "yes" : "no");
	printf(" CL_DEVICE_LOCAL_MEM_TYPE:\t\t%s\n", info.m_localMemType == 1 ? "local" : "global");
	printf(" CL_DEVICE_LOCAL_MEM_SIZE:\t\t%u KByte\n", (unsigned int)(info.m_localMemSize / 1024));
	printf(" CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE:\t%u KByte\n", (unsigned int)(info.m_constantBufferSize / 1024));

	if( info.m_queueProperties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE )
		printf(" CL_DEVICE_QUEUE_PROPERTIES:\t\t%s\n", "CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE");
	if( info.m_queueProperties & CL_QUEUE_PROFILING_ENABLE )
		printf(" CL_DEVICE_QUEUE_PROPERTIES:\t\t%s\n", "CL_QUEUE_PROFILING_ENABLE");

	printf(" CL_DEVICE_IMAGE_SUPPORT:\t\t%u\n", (unsigned int)info.m_imageSupport);
	printf(" CL_DEVICE_MAX_READ_IMAGE_ARGS:\t%u\n", (unsigned int)info.m_maxReadImageArgs);
	printf(" CL_DEVICE_MAX_WRITE_IMAGE_ARGS:\t%u\n", (unsigned int)info.m_maxWriteImageArgs);

	printf("\n CL_DEVICE_IMAGE <dim>");
	printf("\t\t\t2D_MAX_WIDTH\t %u\n", (unsigned int)info.m_image2dMaxWidth);
	printf("\t\t\t\t\t2D_MAX_HEIGHT\t %u\n", (unsigned int)info.m_image2dMaxHeight);
	printf("\t\t\t\t\t3D_MAX_WIDTH\t %u\n", (unsigned int)info.m_image3dMaxWidth);
	printf("\t\t\t\t\t3D_MAX_HEIGHT\t %u\n", (unsigned int)info.m_image3dMaxHeight);
	printf("\t\t\t\t\t3D_MAX_DEPTH\t %u\n", (unsigned int)info.m_image3dMaxDepth);

	//test the first character: comparing the buffer itself against 0 is always true,
	//which made the "None" branch unreachable
	if (info.m_deviceExtensions[0] != 0)
		printf("\n CL_DEVICE_EXTENSIONS:%s\n",info.m_deviceExtensions);
	else
		printf(" CL_DEVICE_EXTENSIONS: None\n");

	printf(" CL_DEVICE_PREFERRED_VECTOR_WIDTH_<t>\t");
	printf("CHAR %u, SHORT %u, INT %u,LONG %u, FLOAT %u, DOUBLE %u\n\n\n",
		(unsigned int)info.m_vecWidthChar, (unsigned int)info.m_vecWidthShort, (unsigned int)info.m_vecWidthInt, (unsigned int)info.m_vecWidthLong, (unsigned int)info.m_vecWidthFloat, (unsigned int)info.m_vecWidthDouble);
}
/// Returns a pointer to the part of `name` that follows the last occurrence of
/// `pattern`, or `name` itself when the pattern does not occur.
/// Used below to drop directory components ("\\" or "/") from a path.
/// The result points into `name`; nothing is allocated or modified.
static const char* strip2(const char* name, const char* pattern)
{
	size_t const patlen = strlen(pattern);
	const char* tail = name;
	const char* patloc;

	// An empty pattern matches everywhere without advancing the cursor, which
	// made the scan below spin forever; there is nothing to strip in that case.
	if (patlen == 0)
	{
		return tail;
	}

	// hop over every occurrence; 'tail' ends up just past the last one
	patloc = strstr(tail, pattern);
	while (patloc != 0)
	{
		tail = patloc + patlen;
		patloc = strstr(tail, pattern);
	}
	return tail;
}
/// Creates and builds an OpenCL program for a single device.
///
/// On Windows a previously cached binary ("cache/<file>.<device>.<driver>.bin") is
/// loaded when it exists and is not older than the .cl source file; otherwise (and
/// on every other platform) the program is built from source and, on Windows, the
/// resulting binary is written back to the cache.
///
/// @param clContext             context the program is created in
/// @param device                the device the program is built for
/// @param kernelSourceOrg       program source text; when 0 the source is read from clFileNameForCaching
/// @param pErrNum               optional; receives the OpenCL error code on failure, CL_SUCCESS on success
/// @param additionalMacrosArg   optional extra build options appended to the default flags (0 means none)
/// @param clFileNameForCaching  optional path of the .cl file; names/validates the cached binary and
///                              serves as the source when no source text is used
/// @param disableBinaryCaching  when true the cached binary is not loaded and kernelSourceOrg is ignored,
///                              so the source is always read from clFileNameForCaching
/// @return the built program, or 0 when creating or building it failed
cl_program b3OpenCLUtils_compileCLProgramFromString(cl_context clContext, cl_device_id device, const char* kernelSourceOrg, cl_int* pErrNum, const char* additionalMacrosArg , const char* clFileNameForCaching, bool disableBinaryCaching)
{
	const char* additionalMacros = additionalMacrosArg?additionalMacrosArg:"";

	if (disableBinaryCaching)
	{
		//drop the in-memory source: the program is then rebuilt from the file on disk
		kernelSourceOrg = 0;
	}

	cl_program m_cpProgram=0;
	cl_int status;

#ifdef _WIN32
	char binaryFileName[B3_MAX_STRING_LENGTH];
	//stays empty unless the cache lookup below fills it in; the write-back at the
	//end of this function is skipped for an empty name instead of using garbage
	binaryFileName[0] = 0;

	if (clFileNameForCaching && !(disableBinaryCaching || gDebugSkipLoadingBinary||gDebugForceLoadingFromSource) )
	{
		char deviceName[256];
		char driverVersion[256];
		const char* strippedName;
		int fileUpToDate = 0;
		int binaryFileValid=0;
		FILETIME modtimeBinary;

		//keep the strings well-defined even if a query fails
		deviceName[0] = 0;
		driverVersion[0] = 0;
		clGetDeviceInfo(device, CL_DEVICE_NAME, 256, &deviceName, NULL);
		clGetDeviceInfo(device, CL_DRIVER_VERSION, 256, &driverVersion, NULL);

		//only the file name (without directories) goes into the cache name
		strippedName = strip2(clFileNameForCaching,"\\");
		strippedName = strip2(strippedName,"/");

		sprintf_s(binaryFileName,B3_MAX_STRING_LENGTH,"cache/%s.%s.%s.bin",strippedName, deviceName,driverVersion );
		//printf("searching for %s\n", binaryFileName);

		CreateDirectory("cache",0);
		{
			//step 1: does a cached binary exist, and when was it written?
			HANDLE binaryFileHandle = CreateFile(binaryFileName,GENERIC_READ,0,0,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,0);
			if (binaryFileHandle ==INVALID_HANDLE_VALUE)
			{
				DWORD errorCode;
				errorCode = GetLastError();
				switch (errorCode)
				{
				case ERROR_FILE_NOT_FOUND:
					{
						printf("\nCached file not found %s\n", binaryFileName);
						break;
					}
				case ERROR_PATH_NOT_FOUND:
					{
						printf("\nCached file path not found %s\n", binaryFileName);
						break;
					}
				default:
					{
						printf("\nFailed reading cached file with errorCode = %d\n", errorCode);
					}
				}
			} else
			{
				if (GetFileTime(binaryFileHandle, NULL, NULL, &modtimeBinary)==0)
				{
					DWORD errorCode;
					errorCode = GetLastError();
					printf("\nGetFileTime errorCode = %d\n", errorCode);
				} else
				{
					binaryFileValid = 1;
				}
				CloseHandle(binaryFileHandle);
			}

			//step 2: the binary is only usable when the source file is not newer
			if (binaryFileValid)
			{
				HANDLE srcFileHandle = CreateFile(clFileNameForCaching,GENERIC_READ,0,0,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,0);
				if (srcFileHandle==INVALID_HANDLE_VALUE)
				{
					//only the first three prefixes are tried, as before
					const char* prefix[]={"../","../../","../../../","../../../../"};
					for (int i=0;(srcFileHandle==INVALID_HANDLE_VALUE) && i<3;i++)
					{
						char relativeFileName[1024];
						//a path that does not fit the buffer cannot be opened; skip it rather than overflow
						if (strlen(prefix[i])+strlen(clFileNameForCaching) >= sizeof(relativeFileName))
							continue;
						sprintf(relativeFileName,"%s%s",prefix[i],clFileNameForCaching);
						srcFileHandle = CreateFile(relativeFileName,GENERIC_READ,0,0,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,0);
					}
				}

				if (srcFileHandle!=INVALID_HANDLE_VALUE)
				{
					FILETIME modtimeSrc;
					if (GetFileTime(srcFileHandle, NULL, NULL, &modtimeSrc)==0)
					{
						//no timestamp available: modtimeSrc is not valid, so treat the
						//binary as stale and rebuild from source
						DWORD errorCode;
						errorCode = GetLastError();
						printf("\nGetFileTime errorCode = %d\n", errorCode);
					}
					else if ( ( modtimeSrc.dwHighDateTime < modtimeBinary.dwHighDateTime)
						||(( modtimeSrc.dwHighDateTime == modtimeBinary.dwHighDateTime)&&(modtimeSrc.dwLowDateTime <= modtimeBinary.dwLowDateTime)))
					{
						fileUpToDate=1;
					} else
					{
						printf("\nCached binary file out-of-date (%s)\n",binaryFileName);
					}
					CloseHandle(srcFileHandle);
				}
				else
				{
#ifdef _DEBUG
					DWORD errorCode;
					errorCode = GetLastError();
					switch (errorCode)
					{
					case ERROR_FILE_NOT_FOUND:
						{
							printf("\nSrc file not found %s\n", clFileNameForCaching);
							break;
						}
					case ERROR_PATH_NOT_FOUND:
						{
							printf("\nSrc path not found %s\n", clFileNameForCaching);
							break;
						}
					default:
						{
							printf("\nnSrc file reading errorCode = %d\n", errorCode);
						}
					}
					//we should make sure the src file exists so we can verify the timestamp with binary
					assert(0);
					fileUpToDate = false;
#else
					//if we cannot find the source, assume it is OK in release builds
					fileUpToDate = true;
#endif
				}
			}
		}

		//step 3: load and build the cached binary
		if( fileUpToDate)
		{
			FILE* file;
			if (fopen_s(&file,binaryFileName, "rb")!=0)
				file=0;

			if (file)
			{
				size_t binarySize=0;
				char* binary =0;
				long fileSize;

				fseek( file, 0L, SEEK_END );
				fileSize = ftell( file );
				rewind( file );

				if (fileSize>0)
				{
					binary = (char*)malloc(sizeof(char)*fileSize);
				}
				if (binary)
				{
					binarySize = fread( binary, sizeof(char), fileSize, file );
				}
				fclose( file );

				//an unreadable or truncated cache file is ignored; the source build below takes over
				if (binary && binarySize==(size_t)fileSize)
				{
					m_cpProgram = clCreateProgramWithBinary( clContext, 1,&device, &binarySize, (const unsigned char**)&binary, 0, &status );
					b3Assert( status == CL_SUCCESS );
					if (status == CL_SUCCESS)
					{
						status = clBuildProgram( m_cpProgram, 1, &device, additionalMacros, 0, 0 );
					}
					b3Assert( status == CL_SUCCESS );

					if( status != CL_SUCCESS )
					{
						if (m_cpProgram)
						{
							char *build_log;
							size_t ret_val_size = 0;
							clGetProgramBuildInfo(m_cpProgram, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
							build_log = (char*)malloc(sizeof(char)*(ret_val_size+1));
							if (build_log)
							{
								clGetProgramBuildInfo(m_cpProgram, device, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
								build_log[ret_val_size] = '\0';
								printf("%s\n", build_log);
								free (build_log);
							}
							//do not leak the program that failed to build
							clReleaseProgram(m_cpProgram);
						}
						b3Assert(0);
						m_cpProgram = 0;
					}
				}
				free (binary);
			}
		}
	}
#endif //_WIN32

	if (!m_cpProgram)
	{
		cl_int localErrNum;
		char* compileFlags;
		int flagsize;
		//source text read from file; owned by this function and freed on every exit path
		char* kernelSrcFromFile = 0;

		const char* kernelSource = kernelSourceOrg;

		if (!kernelSourceOrg || gDebugForceLoadingFromSource)
		{
			if (clFileNameForCaching)
			{
				FILE* file = fopen(clFileNameForCaching, "rb");
				//in many cases the relative path is a few levels up the directory hierarchy, so try it
				if (!file)
				{
					//only the first three prefixes are tried, as before
					const char* prefix[]={"../","../../","../../../","../../../../"};
					for (int i=0;!file && i<3;i++)
					{
						char relativeFileName[1024];
						//a path that does not fit the buffer cannot be opened; skip it rather than overflow
						if (strlen(prefix[i])+strlen(clFileNameForCaching) >= sizeof(relativeFileName))
							continue;
						sprintf(relativeFileName,"%s%s",prefix[i],clFileNameForCaching);
						file = fopen(relativeFileName, "rb");
					}
				}

				if (file)
				{
					long kernelSize;
					fseek( file, 0L, SEEK_END );
					kernelSize = ftell( file );
					rewind( file );

					if (kernelSize>=0)
					{
						kernelSrcFromFile = (char*)malloc(kernelSize+1);
					}
					if (kernelSrcFromFile)
					{
						//terminate after the bytes that were actually read
						size_t readBytes = fread((void*)kernelSrcFromFile,1,kernelSize, file);
						kernelSrcFromFile[readBytes] = 0;
						kernelSource = kernelSrcFromFile;
					}
					fclose(file);
				}
			}
		}

		size_t program_length = kernelSource ? strlen(kernelSource) : 0;
#ifdef MAC //or __APPLE__?
		const char* flags = "-cl-mad-enable -DMAC -DGUID_ARG";
#else
		//const char* flags = "-DGUID_ARG= -fno-alias";
		const char* flags = "-DGUID_ARG= ";
#endif

		m_cpProgram = clCreateProgramWithSource(clContext, 1, (const char**)&kernelSource, &program_length, &localErrNum);
		if (localErrNum!= CL_SUCCESS)
		{
			free(kernelSrcFromFile);
			if (pErrNum)
				*pErrNum = localErrNum;
			return 0;
		}

		// Build the program with 'mad' Optimization option
		flagsize = sizeof(char)*(strlen(additionalMacros) + strlen(flags) + 5);
		compileFlags = (char*) malloc(flagsize);
#ifdef _WIN32
		sprintf_s(compileFlags,flagsize, "%s %s", flags, additionalMacros);
#else
		sprintf(compileFlags, "%s %s", flags, additionalMacros);
#endif
		localErrNum = clBuildProgram(m_cpProgram, 1, &device, compileFlags, NULL, NULL);
		if (localErrNum!= CL_SUCCESS)
		{
			char *build_log;
			size_t ret_val_size = 0;
			clGetProgramBuildInfo(m_cpProgram, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
			build_log = (char*) malloc(sizeof(char)*(ret_val_size+1));
			if (build_log)
			{
				clGetProgramBuildInfo(m_cpProgram, device, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);

				// terminate explicitly: the log is not guaranteed to come back 0-terminated
				build_log[ret_val_size] = '\0';

				printf("Error in clBuildProgram, Line %u in file %s, Log: \n%s\n !!!\n\n", __LINE__, __FILE__, build_log);
				free (build_log);
			}

			//release everything acquired in this branch before bailing out
			clReleaseProgram(m_cpProgram);
			free(compileFlags);
			free(kernelSrcFromFile);
			if (pErrNum)
				*pErrNum = localErrNum;
			return 0;
		}

#ifdef _WIN32
		//write the freshly built binary to the cache, but only when a cache file name was derived above
		if( clFileNameForCaching && binaryFileName[0] )
		{
			cl_uint numAssociatedDevices = 0;
			status = clGetProgramInfo( m_cpProgram, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &numAssociatedDevices, 0 );
			b3Assert( status == CL_SUCCESS );
			if (numAssociatedDevices==1)
			{
				size_t binarySize = 0;
				char* binary = 0;

				status = clGetProgramInfo( m_cpProgram, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binarySize, 0 );
				b3Assert( status == CL_SUCCESS );

				if (status == CL_SUCCESS && binarySize>0)
				{
					binary = (char*)malloc(sizeof(char)*binarySize);
				}

				if (binary)
				{
					status = clGetProgramInfo( m_cpProgram, CL_PROGRAM_BINARIES, sizeof(char*), &binary, 0 );
					b3Assert( status == CL_SUCCESS );

					if (status == CL_SUCCESS)
					{
						FILE* file=0;
						if (fopen_s(&file,binaryFileName, "wb")!=0)
							file=0;

						if (file)
						{
							fwrite( binary, sizeof(char), binarySize, file );
							fclose( file );
						} else
						{
							printf("cannot write file %s\n", binaryFileName);
						}
					}
					free (binary);
				}
			}
		}
#endif //_WIN32

		free(compileFlags);
		free(kernelSrcFromFile);
	}

	//every failure path above has already returned
	if (pErrNum)
		*pErrNum = CL_SUCCESS;
	return m_cpProgram;
}
/// Creates the kernel called `kernelName`.
///
/// When `prog` is 0 the kernel source is first compiled into a temporary program
/// (without binary caching); that program is released again before returning.
/// A program passed in by the caller is never released here.
///
/// @param pErrNum  optional; receives CL_SUCCESS, or the OpenCL error code when the kernel could not be created
/// @return the kernel, or 0 on failure
cl_kernel b3OpenCLUtils_compileCLKernelFromString(cl_context clContext, cl_device_id device, const char* kernelSource, const char* kernelName, cl_int* pErrNum, cl_program prog, const char* additionalMacros )
{
	cl_kernel kernel;
	cl_int localErrNum;
	cl_program m_cpProgram = prog;

	printf("compiling kernel %s ",kernelName);

	if (!m_cpProgram)
	{
		m_cpProgram = b3OpenCLUtils_compileCLProgramFromString(clContext,device,kernelSource,pErrNum, additionalMacros,0, false);
	}

	// Create the kernel
	kernel = clCreateKernel(m_cpProgram, kernelName, &localErrNum);
	if (localErrNum != CL_SUCCESS)
	{
		printf("Error in clCreateKernel, Line %u in file %s, cannot find kernel function %s !!!\n\n", __LINE__, __FILE__, kernelName);
		assert(0);
		//the temporary program compiled above would otherwise leak on this path
		if (!prog && m_cpProgram)
		{
			clReleaseProgram(m_cpProgram);
		}
		if (pErrNum)
			*pErrNum = localErrNum;
		return 0;
	}

	//the kernel holds what it needs; drop our reference to the temporary program
	if (!prog && m_cpProgram)
	{
		clReleaseProgram(m_cpProgram);
	}
	printf("ready. \n");

	if (pErrNum)
		*pErrNum = CL_SUCCESS;
	return kernel;
}

View File

@@ -0,0 +1,179 @@
/*
Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
Copyright (C) 2006 - 2011 Sony Computer Entertainment Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//original author: Roman Ponomarev
//cleanup by Erwin Coumans
#ifndef B3_OPENCL_UTILS_H
#define B3_OPENCL_UTILS_H
#include "b3OpenCLInclude.h"
#ifdef __cplusplus
extern "C" {
#endif
///C API for OpenCL utilities: convenience functions, see below for C++ API
/// CL Context optionally takes a GL context. This is a generic type because we don't really want this code
/// to have to understand GL types. It is a HGLRC in _WIN32 or a GLXContext otherwise.
// NOTE(review): the implementations of the context/device/platform functions are not part of this
// header; the meaning of preferredDeviceIndex/preferredPlatformIndex and of the platformId
// out-parameter should be confirmed against b3OpenCLUtils.cpp.
cl_context b3OpenCLUtils_createContextFromType(cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx , void* pGLDC , int preferredDeviceIndex , int preferredPlatformIndex, cl_platform_id* platformId);
int b3OpenCLUtils_getNumDevices(cl_context cxMainContext);
cl_device_id b3OpenCLUtils_getDevice(cl_context cxMainContext, int nr);
void b3OpenCLUtils_printDeviceInfo(cl_device_id device);
// Creates the kernel named kernelName. When prog is 0, kernelSource is compiled into a temporary
// program first; otherwise the kernel is looked up in prog. Returns 0 on failure and, when pErrNum
// is non-null, stores the OpenCL error code (CL_SUCCESS on success) there.
cl_kernel b3OpenCLUtils_compileCLKernelFromString( cl_context clContext,cl_device_id device, const char* kernelSource, const char* kernelName, cl_int* pErrNum, cl_program prog,const char* additionalMacros);
//optional
// Builds a program for device from kernelSource, or from the file srcFileNameForCaching when
// kernelSource is 0. additionalMacros is appended to the build options. On _WIN32 the file name is
// also used to load/store a cached program binary unless disableBinaryCaching is set.
// Returns 0 on failure.
cl_program b3OpenCLUtils_compileCLProgramFromString( cl_context clContext,cl_device_id device, const char* kernelSource, cl_int* pErrNum,const char* additionalMacros , const char* srcFileNameForCaching, bool disableBinaryCaching);
//the following optional APIs provide access using specific platform information
int b3OpenCLUtils_getNumPlatforms(cl_int* pErrNum);
///get the nr'th platform, where nr is in the range [0..getNumPlatforms)
cl_platform_id b3OpenCLUtils_getPlatform(int nr, cl_int* pErrNum);
void b3OpenCLUtils_printPlatformInfo(cl_platform_id platform);
const char* b3OpenCLUtils_getSdkVendorName();
cl_context b3OpenCLUtils_createContextFromPlatform(cl_platform_id platform, cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx , void* pGLDC ,int preferredDeviceIndex , int preferredPlatformIndex);
#ifdef __cplusplus
}
#define B3_MAX_STRING_LENGTH 1024
// Snapshot of the properties of one OpenCL device, filled in by b3OpenCLUtils::getDeviceInfo.
// The string members are fixed-size buffers of B3_MAX_STRING_LENGTH characters.
// NOTE(review): the per-field notes below are taken from how the device-info printing code
// labels each value; fields without a note are presumably the CL_DEVICE_* query of the same name.
typedef struct
{
char m_deviceName[B3_MAX_STRING_LENGTH];
char m_deviceVendor[B3_MAX_STRING_LENGTH];
char m_driverVersion[B3_MAX_STRING_LENGTH];
// printed as CL_DEVICE_EXTENSIONS
char m_deviceExtensions[B3_MAX_STRING_LENGTH];
cl_device_type m_deviceType;
cl_uint m_computeUnits;
size_t m_workitemDims;
size_t m_workItemSize[3];
// printed as CL_DEVICE_IMAGE 2D_MAX_WIDTH / 2D_MAX_HEIGHT
size_t m_image2dMaxWidth;
size_t m_image2dMaxHeight;
// printed as CL_DEVICE_IMAGE 3D_MAX_WIDTH / 3D_MAX_HEIGHT / 3D_MAX_DEPTH
size_t m_image3dMaxWidth;
size_t m_image3dMaxHeight;
size_t m_image3dMaxDepth;
size_t m_workgroupSize;
// printed as CL_DEVICE_MAX_CLOCK_FREQUENCY, in MHz
cl_uint m_clockFrequency;
// printed as CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE after dividing by 1024 (KByte)
cl_ulong m_constantBufferSize;
// printed as CL_DEVICE_LOCAL_MEM_SIZE after dividing by 1024 (KByte)
cl_ulong m_localMemSize;
// printed as CL_DEVICE_GLOBAL_MEM_SIZE after dividing by 1024*1024 (MByte)
cl_ulong m_globalMemSize;
// printed as CL_DEVICE_ERROR_CORRECTION_SUPPORT ("yes" when equal to CL_TRUE)
cl_bool m_errorCorrectionSupport;
// printed as CL_DEVICE_LOCAL_MEM_TYPE ("local" when equal to 1, "global" otherwise)
cl_device_local_mem_type m_localMemType;
// printed as CL_DEVICE_MAX_READ_IMAGE_ARGS / CL_DEVICE_MAX_WRITE_IMAGE_ARGS
cl_uint m_maxReadImageArgs;
cl_uint m_maxWriteImageArgs;
// printed as CL_DEVICE_ADDRESS_BITS
cl_uint m_addressBits;
// printed as CL_DEVICE_MAX_MEM_ALLOC_SIZE after dividing by 1024*1024 (MByte)
cl_ulong m_maxMemAllocSize;
// bit field, tested against CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE and CL_QUEUE_PROFILING_ENABLE
cl_command_queue_properties m_queueProperties;
// printed as CL_DEVICE_IMAGE_SUPPORT
cl_bool m_imageSupport;
// printed together as CL_DEVICE_PREFERRED_VECTOR_WIDTH_<t> for CHAR, SHORT, INT, LONG, FLOAT, DOUBLE
cl_uint m_vecWidthChar;
cl_uint m_vecWidthShort;
cl_uint m_vecWidthInt;
cl_uint m_vecWidthLong;
cl_uint m_vecWidthFloat;
cl_uint m_vecWidthDouble;
} b3OpenCLDeviceInfo;
// Vendor, name and version strings of one OpenCL platform, filled in by
// b3OpenCLUtils::getPlatformInfo. Each string is a fixed-size buffer of
// B3_MAX_STRING_LENGTH characters.
typedef struct
{
char m_platformVendor[B3_MAX_STRING_LENGTH];
char m_platformName[B3_MAX_STRING_LENGTH];
char m_platformVersion[B3_MAX_STRING_LENGTH];
} b3OpenCLPlatformInfo;
///C++ API for OpenCL utilities: convenience functions
// Every inline member below forwards its arguments unchanged to the C function of the same name
// with the b3OpenCLUtils_ prefix; the only thing this struct adds is default argument values.
// getDeviceInfo and getPlatformInfo have no C counterpart in this header and are defined out of line.
struct b3OpenCLUtils
{
/// CL Context optionally takes a GL context. This is a generic type because we don't really want this code
/// to have to understand GL types. It is a HGLRC in _WIN32 or a GLXContext otherwise.
// Defaults: no GL context/DC, device and platform index -1, no platform id returned.
static inline cl_context createContextFromType(cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx = 0, void* pGLDC = 0, int preferredDeviceIndex = -1, int preferredPlatformIndex= - 1, cl_platform_id* platformId=0)
{
return b3OpenCLUtils_createContextFromType(deviceType, pErrNum, pGLCtx , pGLDC , preferredDeviceIndex, preferredPlatformIndex, platformId);
}
static inline int getNumDevices(cl_context cxMainContext)
{
return b3OpenCLUtils_getNumDevices(cxMainContext);
}
static inline cl_device_id getDevice(cl_context cxMainContext, int nr)
{
return b3OpenCLUtils_getDevice(cxMainContext,nr);
}
// Fills *info for the given device (defined in the .cpp file).
static void getDeviceInfo(cl_device_id device, b3OpenCLDeviceInfo* info);
static inline void printDeviceInfo(cl_device_id device)
{
b3OpenCLUtils_printDeviceInfo(device);
}
// Defaults: error code not reported, no existing program (prog=0), no extra build options.
static inline cl_kernel compileCLKernelFromString( cl_context clContext,cl_device_id device, const char* kernelSource, const char* kernelName, cl_int* pErrNum=0, cl_program prog=0,const char* additionalMacros = "" )
{
return b3OpenCLUtils_compileCLKernelFromString(clContext,device, kernelSource, kernelName, pErrNum, prog,additionalMacros);
}
//optional
// Defaults: error code not reported, no extra build options, no file name for caching,
// binary caching not disabled.
static inline cl_program compileCLProgramFromString( cl_context clContext,cl_device_id device, const char* kernelSource, cl_int* pErrNum=0,const char* additionalMacros = "" , const char* srcFileNameForCaching=0, bool disableBinaryCaching=false)
{
return b3OpenCLUtils_compileCLProgramFromString(clContext,device, kernelSource, pErrNum,additionalMacros, srcFileNameForCaching, disableBinaryCaching);
}
//the following optional APIs provide access using specific platform information
static inline int getNumPlatforms(cl_int* pErrNum=0)
{
return b3OpenCLUtils_getNumPlatforms(pErrNum);
}
///get the nr'th platform, where nr is in the range [0..getNumPlatforms)
static inline cl_platform_id getPlatform(int nr, cl_int* pErrNum=0)
{
return b3OpenCLUtils_getPlatform(nr,pErrNum);
}
// Fills *platformInfo for the given platform (defined in the .cpp file).
static void getPlatformInfo(cl_platform_id platform, b3OpenCLPlatformInfo* platformInfo);
static inline void printPlatformInfo(cl_platform_id platform)
{
b3OpenCLUtils_printPlatformInfo(platform);
}
static inline const char* getSdkVendorName()
{
return b3OpenCLUtils_getSdkVendorName();
}
// Defaults: no GL context/DC, device and platform index -1.
static inline cl_context createContextFromPlatform(cl_platform_id platform, cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx = 0, void* pGLDC = 0,int preferredDeviceIndex = -1, int preferredPlatformIndex= -1)
{
return b3OpenCLUtils_createContextFromPlatform(platform, deviceType, pErrNum, pGLCtx,pGLDC,preferredDeviceIndex, preferredPlatformIndex);
}
};
#endif //__cplusplus
#endif // B3_OPENCL_UTILS_H

View File

@@ -0,0 +1,18 @@
#ifndef B3_BVH_INFO_H
#define B3_BVH_INFO_H
#include "Bullet3Common/b3Vector3.h"
// Per-BVH record: a bounding box, a quantization vector and counts/offsets for nodes and subtrees.
// NOTE(review): presumably the offsets index into shared node/subtree arrays that hold the data of
// all BVHs back to back, and the layout must match the OpenCL kernels -- confirm before reordering.
struct b3BvhInfo
{
b3Vector3 m_aabbMin;
b3Vector3 m_aabbMax;
b3Vector3 m_quantization;
int m_numNodes;
int m_numSubTrees;
int m_nodeOffset;
int m_subTreeOffset;
};
#endif //B3_BVH_INFO_H

View File

@@ -0,0 +1,53 @@
#ifndef B3_COLLIDABLE_H
#define B3_COLLIDABLE_H
// Collision shape type identifiers. The numeric values are explicit; note that 2 is not used.
enum b3ShapeTypes
{
SHAPE_HEIGHT_FIELD=1,
SHAPE_CONVEX_HULL=3,
SHAPE_PLANE=4,
SHAPE_CONCAVE_TRIMESH=5,
SHAPE_COMPOUND_OF_CONVEX_HULLS=6,
SHAPE_SPHERE=7,
// one past the largest shape value (evaluates to 8); keep it last
MAX_NUM_SHAPE_TYPES,
};
// Describes one collision shape instance: its type, an index to the shape data and a radius.
// NOTE(review): m_shapeType presumably holds a b3ShapeTypes value, and the layout presumably has
// to match the OpenCL kernels -- confirm before changing member order or types.
struct b3Collidable
{
// both members share the same storage; which one is meaningful presumably depends on m_shapeType
union {
int m_numChildShapes;
int m_bvhIndex;
};
float m_radius;
int m_shapeType;
int m_shapeIndex;
};
// Compact variant of a collidable record: 16-bit shape type and shape count plus a shape index.
// NOTE(review): not referenced anywhere in this header; its intended use should be confirmed.
struct b3CollidableNew
{
short int m_shapeType;
short int m_numShapes;
int m_shapeIndex;
};
// One child of a compound shape: position and orientation (four floats each) and a shape index.
// NOTE(review): presumably the orientation is a quaternion and the transform is relative to the
// parent compound; the m_unused members look like padding to a 48-byte record -- confirm.
struct b3GpuChildShape
{
float m_childPosition[4];
float m_childOrientation[4];
int m_shapeIndex;
int m_unused0;
int m_unused1;
int m_unused2;
};
// A candidate pair made of two body indices and, for each body, a child shape index.
// NOTE(review): how a non-compound body encodes its child shape index is not visible here -- confirm.
struct b3CompoundOverlappingPair
{
int m_bodyIndexA;
int m_bodyIndexB;
// int m_pairType;
int m_childShapeIndexA;
int m_childShapeIndexB;
};
#endif //B3_COLLIDABLE_H

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,99 @@
#ifndef _CONVEX_HULL_CONTACT_H
#define _CONVEX_HULL_CONTACT_H
#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
#include "Bullet3Collision/NarrowPhaseCollision/b3RigidBodyCL.h"
#include "Bullet3Common/b3AlignedObjectArray.h"
#include "b3ConvexUtility.h"
#include "b3ConvexPolyhedronCL.h"
#include "b3Collidable.h"
#include "Bullet3Collision/NarrowPhaseCollision/b3Contact4.h"
#include "Bullet3Common/b3Int2.h"
#include "Bullet3Common/b3Int4.h"
#include "b3OptimizedBvh.h"
#include "b3BvhInfo.h"
//#include "../../dynamics/basic_demo/Stubs/ChNarrowPhase.h"
// Axis-aligned bounding box stored as two groups of four values.
struct b3YetAnotherAabb
{
// the minimum can be read either as floats or as ints; both arrays occupy the same storage
union
{
float m_min[4];
int m_minIndices[4];
};
// the maximum currently only has the float view; the integer views are commented out
union
{
float m_max[4];
//int m_signedMaxIndices[4];
//unsigned int m_unsignedMaxIndices[4];
};
};
// Holds the OpenCL context/device/queue handles and one cl_kernel handle per narrow-phase kernel,
// and exposes a single entry point that computes contacts for a list of overlapping pairs.
// NOTE(review): the constructor and destructor are defined elsewhere; presumably they create and
// release the kernels listed here -- confirm in the .cpp file.
struct GpuSatCollision
{
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_queue;
cl_kernel m_findSeparatingAxisKernel;
cl_kernel m_findConcaveSeparatingAxisKernel;
cl_kernel m_findCompoundPairsKernel;
cl_kernel m_processCompoundPairsKernel;
cl_kernel m_clipHullHullKernel;
cl_kernel m_clipCompoundsHullHullKernel;
cl_kernel m_clipFacesAndContactReductionKernel;
cl_kernel m_findClippingFacesKernel;
cl_kernel m_clipHullHullConcaveConvexKernel;
cl_kernel m_extractManifoldAndAddContactKernel;
cl_kernel m_newContactReductionKernel;
cl_kernel m_bvhTraversalKernel;
cl_kernel m_primitiveContactsKernel;
cl_kernel m_findConcaveSphereContactsKernel;
cl_kernel m_processCompoundPairsPrimitivesKernel;
b3OpenCLArray<int> m_totalContactsOut;
GpuSatCollision(cl_context ctx,cl_device_id device, cl_command_queue q );
virtual ~GpuSatCollision();
// Inputs: the overlapping pairs (nPairs of them), the rigid bodies and the shape data arrays.
// Outputs: contactOut with its count in nContacts, and triangleConvexPairs with its count in
// numTriConvexPairsOut (both counts are passed by non-const reference).
// NOTE(review): presumably maxContactCapacity and maxTriConvexPairCapacity bound the two output
// arrays and the worldVerts*/clippingFaces*/worldNormals* arrays are scratch buffers -- confirm
// against the implementation.
void computeConvexConvexContactsGPUSAT( const b3OpenCLArray<b3Int2>* pairs, int nPairs,
const b3OpenCLArray<b3RigidBodyCL>* bodyBuf,
b3OpenCLArray<b3Contact4>* contactOut, int& nContacts,
int maxContactCapacity,
const b3OpenCLArray<b3ConvexPolyhedronCL>& hostConvexData,
const b3OpenCLArray<b3Vector3>& vertices,
const b3OpenCLArray<b3Vector3>& uniqueEdges,
const b3OpenCLArray<b3GpuFace>& faces,
const b3OpenCLArray<int>& indices,
const b3OpenCLArray<b3Collidable>& gpuCollidables,
const b3OpenCLArray<b3GpuChildShape>& gpuChildShapes,
const b3OpenCLArray<b3YetAnotherAabb>& clAabbs,
b3OpenCLArray<b3Vector3>& worldVertsB1GPU,
b3OpenCLArray<b3Int4>& clippingFacesOutGPU,
b3OpenCLArray<b3Vector3>& worldNormalsAGPU,
b3OpenCLArray<b3Vector3>& worldVertsA1GPU,
b3OpenCLArray<b3Vector3>& worldVertsB2GPU,
b3AlignedObjectArray<class b3OptimizedBvh*>& bvhData,
b3OpenCLArray<b3QuantizedBvhNode>* treeNodesGPU,
b3OpenCLArray<b3BvhSubtreeInfo>* subTreesGPU,
b3OpenCLArray<b3BvhInfo>* bvhInfo,
int numObjects,
int maxTriConvexPairCapacity,
b3OpenCLArray<b3Int4>& triangleConvexPairs,
int& numTriConvexPairsOut
);
};
#endif //_CONVEX_HULL_CONTACT_H

View File

@@ -0,0 +1,64 @@
#ifndef CONVEX_POLYHEDRON_CL
#define CONVEX_POLYHEDRON_CL
#include "Bullet3Common/b3Transform.h"
// One polygonal face: a plane plus a run of entries in an index array.
// NOTE(review): presumably m_plane is (normal.xyz, offset) and m_indexOffset/m_numIndices select
// the face's vertex indices in a shared index array -- confirm against the code that fills it.
struct b3GpuFace
{
b3Vector4 m_plane;
int m_indexOffset;
int m_numIndices;
};
// Description of one convex polyhedron: a few bounding values plus offsets/counts that select the
// polyhedron's faces, vertices and unique edges.
// NOTE(review): presumably the offsets index into shared arrays holding the data of all
// polyhedra, and the member layout has to match the OpenCL kernels -- confirm before reordering.
B3_ATTRIBUTE_ALIGNED16(struct) b3ConvexPolyhedronCL
{
	b3Vector3 m_localCenter;
	b3Vector3 m_extents;
	b3Vector3 mC;
	b3Vector3 mE;
	b3Scalar m_radius;
	int m_faceOffset;
	int m_numFaces;
	int m_numVertices;
	int m_vertexOffset;
	int m_uniqueEdgesOffset;
	int m_numUniqueEdges;
	int m_unused;

	/// Projects this polyhedron, placed by `trans`, onto the direction `dir`.
	/// @param trans     transform applied to the polyhedron
	/// @param dir       direction to project onto
	/// @param vertices  vertex array; this polyhedron uses the m_numVertices entries starting at m_vertexOffset
	/// @param min       receives the smallest projection value
	/// @param max       receives the largest projection value
	inline void project(const b3Transform& trans, const b3Vector3& dir, const b3AlignedObjectArray<b3Vector3>& vertices, b3Scalar& min, b3Scalar& max) const
	{
		min = FLT_MAX;
		max = -FLT_MAX;
		int numVerts = m_numVertices;

		// Rotate the direction into local space once instead of transforming every vertex.
		// (An unused second copy of this direction, computed through the quaternion, was removed.)
		const b3Vector3 localDir = trans.getBasis().transpose()*dir;
		// the translation contributes the same amount to every vertex, so it is added at the end
		b3Scalar offset = trans.getOrigin().dot(dir);

		for(int i=0;i<numVerts;i++)
		{
			//b3Vector3 pt = trans * vertices[m_vertexOffset+i];
			//b3Scalar dp = pt.dot(dir);
			b3Scalar dp = vertices[m_vertexOffset+i].dot(localDir);
			//b3Assert(dp==dpL);
			if(dp < min) min = dp;
			if(dp > max) max = dp;
		}
		// only taken when no vertex updated the initial values
		if(min>max)
		{
			b3Scalar tmp = min;
			min = max;
			max = tmp;
		}
		min += offset;
		max += offset;
	}
};
#endif //CONVEX_POLYHEDRON_CL

View File

@@ -0,0 +1,520 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Erwin Coumans
#include "b3ConvexUtility.h"
#include "Bullet3Geometry/b3ConvexHullComputer.h"
#include "Bullet3Geometry/b3GrahamScan2dConvexHull.h"
#include "Bullet3Common/b3Quaternion.h"
#include "Bullet3Common/b3HashMap.h"
#include "b3ConvexPolyhedronCL.h"
// Nothing to release explicitly: the body is empty, members clean up through their own destructors.
b3ConvexUtility::~b3ConvexUtility()
{
}
// Builds the polyhedral description (m_vertices, m_faces and, through initialize(), the edge and
// center data) of the convex hull of the given points.
//
// Steps:
//  1. compute the convex hull with b3ConvexHullComputer and copy its vertices to m_vertices
//  2. for every hull face collect the vertex indices and derive a plane equation
//  3. group faces whose normals are nearly parallel (dot product > 0.999) and try to merge each
//     group into one polygon using a 2D Graham scan; a merge is rejected when it would drop a
//     vertex that a face outside the group still uses
//  4. append the merged (or the original) faces to m_faces and call initialize()
//
// orgVertices  input points
// numPoints    number of input points
// Always returns true.
// NOTE(review): mergeCoplanarTriangles is never read in this body -- coplanar faces are always
// merged. NOTE(review): orgVertices[0] is dereferenced unconditionally, so numPoints==0 / a null
// pointer is presumably not supported -- confirm with callers.
bool b3ConvexUtility::initializePolyhedralFeatures(const b3Vector3* orgVertices, int numPoints, bool mergeCoplanarTriangles)
{
b3ConvexHullComputer conv;
conv.compute(&orgVertices[0].getX(), sizeof(b3Vector3),numPoints,0.f,0.f);
b3AlignedObjectArray<b3Vector3> faceNormals;
int numFaces = conv.faces.size();
faceNormals.resize(numFaces);
b3ConvexHullComputer* convexUtil = &conv;
b3AlignedObjectArray<b3MyFace> tmpFaces;
tmpFaces.resize(numFaces);
int numVertices = convexUtil->vertices.size();
m_vertices.resize(numVertices);
for (int p=0;p<numVertices;p++)
{
m_vertices[p] = convexUtil->vertices[p];
}
// one temporary face per hull face: vertex indices plus plane equation
for (int i=0;i<numFaces;i++)
{
int face = convexUtil->faces[i];
//printf("face=%d\n",face);
const b3ConvexHullComputer::Edge* firstEdge = &convexUtil->edges[face];
const b3ConvexHullComputer::Edge* edge = firstEdge;
b3Vector3 edges[3];
int numEdges = 0;
//compute face normals
// walk the edge loop of the face once; every source vertex becomes a face index and the
// first two edge directions are kept for the normal
do
{
int src = edge->getSourceVertex();
tmpFaces[i].m_indices.push_back(src);
int targ = edge->getTargetVertex();
b3Vector3 wa = convexUtil->vertices[src];
b3Vector3 wb = convexUtil->vertices[targ];
b3Vector3 newEdge = wb-wa;
newEdge.normalize();
if (numEdges<2)
edges[numEdges++] = newEdge;
edge = edge->getNextEdgeOfFace();
} while (edge!=firstEdge);
// start with a huge value; the loop further down reduces it to the minimum over the face vertices
b3Scalar planeEq = 1e30f;
if (numEdges==2)
{
faceNormals[i] = edges[0].cross(edges[1]);
faceNormals[i].normalize();
tmpFaces[i].m_plane[0] = faceNormals[i].getX();
tmpFaces[i].m_plane[1] = faceNormals[i].getY();
tmpFaces[i].m_plane[2] = faceNormals[i].getZ();
tmpFaces[i].m_plane[3] = planeEq;
}
else
{
// fewer than two edges: the normal is zeroed, m_plane[0..2] are not written on this path
b3Assert(0);//degenerate?
faceNormals[i].setZero();
}
// plane offset: the smallest vertex projection onto the normal, stored negated
for (int v=0;v<tmpFaces[i].m_indices.size();v++)
{
b3Scalar eq = m_vertices[tmpFaces[i].m_indices[v]].dot(faceNormals[i]);
if (planeEq>eq)
{
planeEq=eq;
}
}
tmpFaces[i].m_plane[3] = -planeEq;
}
//merge coplanar faces and copy them to m_polyhedron
b3Scalar faceWeldThreshold= 0.999f;
// work list of face indices that have not been assigned to a group yet
b3AlignedObjectArray<int> todoFaces;
for (int i=0;i<tmpFaces.size();i++)
todoFaces.push_back(i);
while (todoFaces.size())
{
// take the last pending face as reference and collect every pending face with a near-parallel normal
b3AlignedObjectArray<int> coplanarFaceGroup;
int refFace = todoFaces[todoFaces.size()-1];
coplanarFaceGroup.push_back(refFace);
b3MyFace& faceA = tmpFaces[refFace];
todoFaces.pop_back();
b3Vector3 faceNormalA(faceA.m_plane[0],faceA.m_plane[1],faceA.m_plane[2]);
// iterate backwards because entries are removed from todoFaces while scanning
for (int j=todoFaces.size()-1;j>=0;j--)
{
int i = todoFaces[j];
b3MyFace& faceB = tmpFaces[i];
b3Vector3 faceNormalB(faceB.m_plane[0],faceB.m_plane[1],faceB.m_plane[2]);
if (faceNormalA.dot(faceNormalB)>faceWeldThreshold)
{
coplanarFaceGroup.push_back(i);
// NOTE(review): remove() takes the value i here, not the position j -- presumably it searches for the value; confirm in b3AlignedObjectArray
todoFaces.remove(i);
}
}
bool did_merge = false;
if (coplanarFaceGroup.size()>1)
{
//do the merge: use Graham Scan 2d convex hull
b3AlignedObjectArray<b3GrahamVector3> orgpoints;
b3Vector3 averageFaceNormal(0,0,0);
// gather the distinct vertices of all faces in the group and sum up the face normals
for (int i=0;i<coplanarFaceGroup.size();i++)
{
// m_polyhedron->m_faces.push_back(tmpFaces[coplanarFaceGroup[i]]);
b3MyFace& face = tmpFaces[coplanarFaceGroup[i]];
b3Vector3 faceNormal(face.m_plane[0],face.m_plane[1],face.m_plane[2]);
averageFaceNormal+=faceNormal;
for (int f=0;f<face.m_indices.size();f++)
{
int orgIndex = face.m_indices[f];
b3Vector3 pt = m_vertices[orgIndex];
bool found = false;
// note: this inner 'i' shadows the group loop index above
for (int i=0;i<orgpoints.size();i++)
{
//if ((orgpoints[i].m_orgIndex == orgIndex) || ((rotatedPt-orgpoints[i]).length2()<0.0001))
if (orgpoints[i].m_orgIndex == orgIndex)
{
found=true;
break;
}
}
if (!found)
orgpoints.push_back(b3GrahamVector3(pt,orgIndex));
}
}
// the merged face takes the plane of the first face in the group (the reference face)
b3MyFace combinedFace;
for (int i=0;i<4;i++)
combinedFace.m_plane[i] = tmpFaces[coplanarFaceGroup[0]].m_plane[i];
b3AlignedObjectArray<b3GrahamVector3> hull;
averageFaceNormal.normalize();
b3GrahamScanConvexHull2D(orgpoints,hull,averageFaceNormal);
// the hull order defines the index order of the merged face; every point that made it into
// the hull is marked with -1 so that only rejected points remain afterwards
for (int i=0;i<hull.size();i++)
{
combinedFace.m_indices.push_back(hull[i].m_orgIndex);
for(int k = 0; k < orgpoints.size(); k++)
{
if(orgpoints[k].m_orgIndex == hull[i].m_orgIndex)
{
orgpoints[k].m_orgIndex = -1; // invalidate...
break;
}
}
}
// are there rejected vertices?
bool reject_merge = false;
for(int i = 0; i < orgpoints.size(); i++) {
if(orgpoints[i].m_orgIndex == -1)
continue; // this is in the hull...
// this vertex is rejected -- is anybody else using this vertex?
for(int j = 0; j < tmpFaces.size(); j++) {
b3MyFace& face = tmpFaces[j];
// is this a face of the current coplanar group?
bool is_in_current_group = false;
for(int k = 0; k < coplanarFaceGroup.size(); k++) {
if(coplanarFaceGroup[k] == j) {
is_in_current_group = true;
break;
}
}
if(is_in_current_group) // ignore this face...
continue;
// does this face use this rejected vertex?
for(int v = 0; v < face.m_indices.size(); v++) {
if(face.m_indices[v] == orgpoints[i].m_orgIndex) {
// this rejected vertex is used in another face -- reject merge
reject_merge = true;
break;
}
}
if(reject_merge)
break;
}
if(reject_merge)
break;
}
if (!reject_merge)
{
// do this merge!
did_merge = true;
m_faces.push_back(combinedFace);
}
}
// no merge happened (single face, or merge rejected): keep the faces of the group as they are
if(!did_merge)
{
for (int i=0;i<coplanarFaceGroup.size();i++)
{
b3MyFace face = tmpFaces[coplanarFaceGroup[i]];
m_faces.push_back(face);
}
}
}
// derive unique edges and the local center from the final face list
initialize();
return true;
}
/// True when no component of v has an absolute value above 1e-6.
inline bool IsAlmostZero(const b3Vector3& v)
{
	const bool xTooLarge = fabsf(v.getX())>1e-6;
	const bool yTooLarge = fabsf(v.getY())>1e-6;
	const bool zTooLarge = fabsf(v.getZ())>1e-6;
	return !(xTooLarge || yTooLarge || zTooLarge);
}
/// Hash-map key for an undirected edge between two vertex indices.
/// The pair is stored ordered (m_v0 holds the larger index), so (a,b) and (b,a)
/// produce the same key.
struct b3InternalVertexPair
{
	short int m_v0;
	short int m_v1;

	/// Stores the larger of the two indices in m_v0 and the smaller in m_v1.
	b3InternalVertexPair(short int v0,short int v1)
		:m_v0((v1>v0) ? v1 : v0),
		m_v1((v1>v0) ? v0 : v1)
	{
	}

	/// Combines both indices into one int: m_v0 in the low part, m_v1 shifted up by 16 bits.
	int getHash() const
	{
		const int upperPart = m_v1<<16;
		return m_v0+upperPart;
	}

	/// Two keys are equal when both stored indices match.
	bool equals(const b3InternalVertexPair& other) const
	{
		if (m_v0!=other.m_v0)
			return false;
		return m_v1==other.m_v1;
	}
};
/// The (up to) two faces that share one edge; -1 marks a slot that has not been set.
struct b3InternalEdge
{
	short int m_face0;
	short int m_face1;

	/// Starts with both face slots unset.
	b3InternalEdge()
	{
		m_face0 = -1;
		m_face1 = -1;
	}
};
//
#ifdef TEST_INTERNAL_OBJECTS
bool b3ConvexUtility::testContainment() const
{
for(int p=0;p<8;p++)
{
b3Vector3 LocalPt;
if(p==0) LocalPt = m_localCenter + b3Vector3(m_extents[0], m_extents[1], m_extents[2]);
else if(p==1) LocalPt = m_localCenter + b3Vector3(m_extents[0], m_extents[1], -m_extents[2]);
else if(p==2) LocalPt = m_localCenter + b3Vector3(m_extents[0], -m_extents[1], m_extents[2]);
else if(p==3) LocalPt = m_localCenter + b3Vector3(m_extents[0], -m_extents[1], -m_extents[2]);
else if(p==4) LocalPt = m_localCenter + b3Vector3(-m_extents[0], m_extents[1], m_extents[2]);
else if(p==5) LocalPt = m_localCenter + b3Vector3(-m_extents[0], m_extents[1], -m_extents[2]);
else if(p==6) LocalPt = m_localCenter + b3Vector3(-m_extents[0], -m_extents[1], m_extents[2]);
else if(p==7) LocalPt = m_localCenter + b3Vector3(-m_extents[0], -m_extents[1], -m_extents[2]);
for(int i=0;i<m_faces.size();i++)
{
const b3Vector3 Normal(m_faces[i].m_plane[0], m_faces[i].m_plane[1], m_faces[i].m_plane[2]);
const b3Scalar d = LocalPt.dot(Normal) + m_faces[i].m_plane[3];
if(d>0.0f)
return false;
}
}
return true;
}
#endif
/// Derives secondary data from m_vertices and m_faces:
///  - m_uniqueEdges: the normalized edge directions, without duplicates (a direction and its
///    opposite count as the same edge)
///  - m_localCenter: the area-weighted average of the triangle centers of all faces
///  - with USE_CONNECTED_FACES: the neighbouring face across every face edge
///  - with TEST_INTERNAL_OBJECTS: m_radius, mC/mE and the inner box m_extents
void b3ConvexUtility::initialize()
{
	b3HashMap<b3InternalVertexPair,b3InternalEdge> edges;

	b3Scalar TotalArea = 0.0f;

	m_localCenter.setValue(0, 0, 0);

	// pass 1: register every face edge, collecting unique edge directions and the faces per edge
	for(int i=0;i<m_faces.size();i++)
	{
		int numVertices = m_faces[i].m_indices.size();
		int NbTris = numVertices;
		for(int j=0;j<NbTris;j++)
		{
			int k = (j+1)%numVertices;
			b3InternalVertexPair vp(m_faces[i].m_indices[j],m_faces[i].m_indices[k]);
			b3InternalEdge* edptr = edges.find(vp);
			b3Vector3 edge = m_vertices[vp.m_v1]-m_vertices[vp.m_v0];
			edge.normalize();

			bool found = false;
			b3Vector3 diff,diff2;

			for (int p=0;p<m_uniqueEdges.size();p++)
			{
				diff = m_uniqueEdges[p]-edge;
				diff2 = m_uniqueEdges[p]+edge;

				// if ((diff.length2()==0.f) ||
				// (diff2.length2()==0.f))
				// same direction or exactly opposite direction: already known
				if (IsAlmostZero(diff) ||
					IsAlmostZero(diff2))
				{
					found = true;
					break;
				}
			}

			if (!found)
			{
				m_uniqueEdges.push_back(edge);
			}

			if (edptr)
			{
				//TBD: figure out why I added this assert
				// b3Assert(edptr->m_face0>=0);
				// b3Assert(edptr->m_face1<0);
				// second face that uses this edge
				edptr->m_face1 = i;
			} else
			{
				// first face that uses this edge
				b3InternalEdge ed;
				ed.m_face0 = i;
				edges.insert(vp,ed);
			}
		}
	}

#ifdef USE_CONNECTED_FACES
	for(int i=0;i<m_faces.size();i++)
	{
		int numVertices = m_faces[i].m_indices.size();
		m_faces[i].m_connectedFaces.resize(numVertices);

		for(int j=0;j<numVertices;j++)
		{
			int k = (j+1)%numVertices;
			b3InternalVertexPair vp(m_faces[i].m_indices[j],m_faces[i].m_indices[k]);
			b3InternalEdge* edptr = edges.find(vp);
			b3Assert(edptr);
			b3Assert(edptr->m_face0>=0);
			b3Assert(edptr->m_face1>=0);

			// the neighbour is whichever of the two faces is not this one
			int connectedFace = (edptr->m_face0==i)?edptr->m_face1:edptr->m_face0;
			m_faces[i].m_connectedFaces[j] = connectedFace;
		}
	}
#endif//USE_CONNECTED_FACES

	// pass 2: fan-triangulate every face around its first vertex and accumulate area-weighted centers
	for(int i=0;i<m_faces.size();i++)
	{
		int numVertices = m_faces[i].m_indices.size();
		// a face with fewer than three vertices has no triangles; skipping it also avoids
		// touching m_indices[0] of an empty face
		if (numVertices<3)
			continue;

		int NbTris = numVertices-2;

		const b3Vector3& p0 = m_vertices[m_faces[i].m_indices[0]];
		for(int j=1;j<=NbTris;j++)
		{
			int k = (j+1)%numVertices;
			const b3Vector3& p1 = m_vertices[m_faces[i].m_indices[j]];
			const b3Vector3& p2 = m_vertices[m_faces[i].m_indices[k]];
			b3Scalar Area = ((p0 - p1).cross(p0 - p2)).length() * 0.5f;
			b3Vector3 Center = (p0+p1+p2)/3.0f;
			m_localCenter += Area * Center;
			TotalArea += Area;
		}
	}
	// without any area (no faces, or only degenerate ones) the division would produce NaN;
	// the center then stays at the origin it was reset to above
	if (TotalArea > 0.0f)
	{
		m_localCenter /= TotalArea;
	}

#ifdef TEST_INTERNAL_OBJECTS
	if(1)
	{
		// m_radius: smallest distance from the local center to any face plane
		m_radius = FLT_MAX;
		for(int i=0;i<m_faces.size();i++)
		{
			const b3Vector3 Normal(m_faces[i].m_plane[0], m_faces[i].m_plane[1], m_faces[i].m_plane[2]);
			const b3Scalar dist = b3Fabs(m_localCenter.dot(Normal) + m_faces[i].m_plane[3]);
			if(dist<m_radius)
				m_radius = dist;
		}

		b3Scalar MinX = FLT_MAX;
		b3Scalar MinY = FLT_MAX;
		b3Scalar MinZ = FLT_MAX;
		b3Scalar MaxX = -FLT_MAX;
		b3Scalar MaxY = -FLT_MAX;
		b3Scalar MaxZ = -FLT_MAX;
		for(int i=0; i<m_vertices.size(); i++)
		{
			const b3Vector3& pt = m_vertices[i];
			if(pt.getX()<MinX) MinX = pt.getX();
			if(pt.getX()>MaxX) MaxX = pt.getX();
			if(pt.getY()<MinY) MinY = pt.getY();
			if(pt.getY()>MaxY) MaxY = pt.getY();
			if(pt.getZ()<MinZ) MinZ = pt.getZ();
			if(pt.getZ()>MaxZ) MaxZ = pt.getZ();
		}
		// mC is the sum and mE the difference of the bounding box max and min (not halved)
		mC.setValue(MaxX+MinX, MaxY+MinY, MaxZ+MinZ);
		mE.setValue(MaxX-MinX, MaxY-MinY, MaxZ-MinZ);

		// const b3Scalar r = m_radius / sqrtf(2.0f);
		const b3Scalar r = m_radius / sqrtf(3.0f);
		const int LargestExtent = mE.maxAxis();
		const b3Scalar Step = (mE[LargestExtent]*0.5f - r)/1024.0f;
		// start with a box that is r in every axis but spans half the bounding box along the
		// largest axis, then shrink that axis step by step until the box is contained
		m_extents[0] = m_extents[1] = m_extents[2] = r;
		m_extents[LargestExtent] = mE[LargestExtent]*0.5f;
		bool FoundBox = false;
		for(int j=0;j<1024;j++)
		{
			if(testContainment())
			{
				FoundBox = true;
				break;
			}

			m_extents[LargestExtent] -= Step;
		}
		if(!FoundBox)
		{
			m_extents[0] = m_extents[1] = m_extents[2] = r;
		}
		else
		{
			// Refine the box
			const b3Scalar Step = (m_radius - r)/1024.0f;
			// e0 and e1 are the two axes other than LargestExtent (0->1,2  1->2,0  2->0,1)
			const int e0 = (1<<LargestExtent) & 3;
			const int e1 = (1<<e0) & 3;

			// grow both remaining axes together until the box no longer fits, then undo the last step
			for(int j=0;j<1024;j++)
			{
				const b3Scalar Saved0 = m_extents[e0];
				const b3Scalar Saved1 = m_extents[e1];
				m_extents[e0] += Step;
				m_extents[e1] += Step;

				if(!testContainment())
				{
					m_extents[e0] = Saved0;
					m_extents[e1] = Saved1;
					break;
				}
			}
		}
	}
#endif
}

View File

@@ -0,0 +1,62 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Erwin Coumans
#ifndef _BT_CONVEX_UTILITY_H
#define _BT_CONVEX_UTILITY_H
#include "Bullet3Common/b3AlignedObjectArray.h"
#include "Bullet3Common/b3Transform.h"
#include "b3ConvexPolyhedronCL.h"
/// One polygonal face of a convex hull, as stored in b3ConvexUtility::m_faces.
struct b3MyFace
{
/// Vertex indices of the face corners, in loop order.
/// NOTE(review): presumably indices into b3ConvexUtility::m_vertices - confirm against the .cpp.
b3AlignedObjectArray<int> m_indices;
/// Plane equation coefficients of the face.
/// NOTE(review): looks like [0..2] is the normal and [3] the offset, i.e.
/// distance(p) = dot(p, normal) + m_plane[3] - confirm against the .cpp.
b3Scalar m_plane[4];
};
/// Helper that holds the polyhedral description of a convex hull (vertices,
/// faces, unique edge directions) plus a few derived bounding quantities.
/// The class is declared 16-byte aligned and uses the aligned allocator macro.
B3_ATTRIBUTE_ALIGNED16(class) b3ConvexUtility
{
public:
B3_DECLARE_ALIGNED_ALLOCATOR();
// Derived quantities. They are not set by the constructor below.
// NOTE(review): presumably filled in by initialize() - confirm in the .cpp.
/// NOTE(review): looks like an area-weighted centroid of the faces - confirm.
b3Vector3 m_localCenter;
/// NOTE(review): looks like the half extents of a box tested with testContainment() - confirm.
b3Vector3 m_extents;
/// NOTE(review): appears to hold (max + min) of the vertex bounds, i.e. twice the AABB center - confirm.
b3Vector3 mC;
/// NOTE(review): appears to hold (max - min) of the vertex bounds, i.e. the full AABB size - confirm.
b3Vector3 mE;
/// NOTE(review): appears to be the smallest distance from m_localCenter to any face plane - confirm.
b3Scalar m_radius;
/// Hull vertices.
b3AlignedObjectArray<b3Vector3> m_vertices;
/// Hull faces; see b3MyFace.
b3AlignedObjectArray<b3MyFace> m_faces;
/// Edge directions of the hull with duplicates removed.
/// NOTE(review): de-duplication seems to treat a direction and its negation as equal - confirm.
b3AlignedObjectArray<b3Vector3> m_uniqueEdges;
/// Default constructor: intentionally empty, leaves all members default-constructed.
b3ConvexUtility()
{
}
virtual ~b3ConvexUtility();
/// Builds the polyhedral features from a point cloud.
/// @param orgVertices            input points (numVertices entries)
/// @param numVertices            number of input points
/// @param mergeCoplanarTriangles defaults to true; presumably merges coplanar hull triangles into polygons - TODO confirm
/// @return success flag (exact failure conditions are defined in the .cpp)
bool initializePolyhedralFeatures(const b3Vector3* orgVertices, int numVertices, bool mergeCoplanarTriangles=true);
/// Post-processing step; implementation lives in the .cpp.
void initialize();
/// Containment query that does not modify the object; implementation lives in the .cpp.
bool testContainment() const;
};
#endif

View File

@@ -0,0 +1,390 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2003-2009 Erwin Coumans http://bulletphysics.org
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#include "b3OptimizedBvh.h"
#include "b3StridingMeshInterface.h"
#include "Bullet3Geometry/b3AabbUtil.h"
/// Default constructor: no work beyond what the b3QuantizedBvh base constructor does.
b3OptimizedBvh::b3OptimizedBvh()
{
}
/// Destructor: nothing to release at this level; the base class destructor runs afterwards.
b3OptimizedBvh::~b3OptimizedBvh()
{
}
/// Builds the BVH from scratch for all triangles of a mesh.
///
/// One leaf node is created per triangle (through a triangle callback handed to
/// the mesh interface), the contiguous node array is sized to 2*numLeafNodes,
/// and buildTree() is run over the whole leaf range.
///
/// @param triangles                   mesh whose triangles become the leaves
/// @param useQuantizedAabbCompression selects the 16-byte quantized node format (true)
///                                    or the full-precision node format (false)
/// @param bvhAabbMin                  lower bound used to set up quantization (quantized mode only)
/// @param bvhAabbMax                  upper bound used to set up quantization (quantized mode only)
void b3OptimizedBvh::build(b3StridingMeshInterface* triangles, bool useQuantizedAabbCompression, const b3Vector3& bvhAabbMin, const b3Vector3& bvhAabbMax)
{
m_useQuantization = useQuantizedAabbCompression;
// NodeArray triangleNodes;
// Callback for the non-quantized path: appends one full-precision leaf per triangle.
struct NodeTriangleCallback : public b3InternalTriangleIndexCallback
{
// Destination array; held by reference, so nodes go straight into the caller's array.
NodeArray& m_triangleNodes;
// Assignment copies the contents of the other array (the reference itself cannot be reseated).
NodeTriangleCallback& operator=(NodeTriangleCallback& other)
{
m_triangleNodes.copyFromArray(other.m_triangleNodes);
return *this;
}
NodeTriangleCallback(NodeArray& triangleNodes)
:m_triangleNodes(triangleNodes)
{
}
virtual void internalProcessTriangleIndex(b3Vector3* triangle,int partId,int triangleIndex)
{
b3OptimizedBvhNode node;
b3Vector3 aabbMin,aabbMax;
// Start from an inverted (empty) box and grow it over the three corners.
aabbMin.setValue(b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT));
aabbMax.setValue(b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT));
aabbMin.setMin(triangle[0]);
aabbMax.setMax(triangle[0]);
aabbMin.setMin(triangle[1]);
aabbMax.setMax(triangle[1]);
aabbMin.setMin(triangle[2]);
aabbMax.setMax(triangle[2]);
//with quantization?
node.m_aabbMinOrg = aabbMin;
node.m_aabbMaxOrg = aabbMax;
// Leaves carry -1 as escape index.
node.m_escapeIndex = -1;
//for child nodes
node.m_subPart = partId;
node.m_triangleIndex = triangleIndex;
m_triangleNodes.push_back(node);
}
};
// Callback for the quantized path: appends one 16-byte quantized leaf per triangle.
struct QuantizedNodeTriangleCallback : public b3InternalTriangleIndexCallback
{
// Destination array; held by reference.
QuantizedNodeArray& m_triangleNodes;
const b3QuantizedBvh* m_optimizedTree; // for quantization
QuantizedNodeTriangleCallback& operator=(QuantizedNodeTriangleCallback& other)
{
m_triangleNodes.copyFromArray(other.m_triangleNodes);
m_optimizedTree = other.m_optimizedTree;
return *this;
}
QuantizedNodeTriangleCallback(QuantizedNodeArray& triangleNodes,const b3QuantizedBvh* tree)
:m_triangleNodes(triangleNodes),m_optimizedTree(tree)
{
}
virtual void internalProcessTriangleIndex(b3Vector3* triangle,int partId,int triangleIndex)
{
// The partId and triangle index must fit in the same (positive) integer
b3Assert(partId < (1<<MAX_NUM_PARTS_IN_BITS));
b3Assert(triangleIndex < (1<<(31-MAX_NUM_PARTS_IN_BITS)));
//negative indices are reserved for escapeIndex
b3Assert(triangleIndex>=0);
b3QuantizedBvhNode node;
b3Vector3 aabbMin,aabbMax;
// Start from an inverted (empty) box and grow it over the three corners.
aabbMin.setValue(b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT));
aabbMax.setValue(b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT));
aabbMin.setMin(triangle[0]);
aabbMax.setMax(triangle[0]);
aabbMin.setMin(triangle[1]);
aabbMax.setMax(triangle[1]);
aabbMin.setMin(triangle[2]);
aabbMax.setMax(triangle[2]);
//PCK: add these checks for zero dimensions of aabb
// Any axis thinner than MIN_AABB_DIMENSION is widened by MIN_AABB_HALF_DIMENSION
// on both sides, so axis-aligned (flat) triangles still get a box with volume.
const b3Scalar MIN_AABB_DIMENSION = b3Scalar(0.002);
const b3Scalar MIN_AABB_HALF_DIMENSION = b3Scalar(0.001);
if (aabbMax.getX() - aabbMin.getX() < MIN_AABB_DIMENSION)
{
aabbMax.setX(aabbMax.getX() + MIN_AABB_HALF_DIMENSION);
aabbMin.setX(aabbMin.getX() - MIN_AABB_HALF_DIMENSION);
}
if (aabbMax.getY() - aabbMin.getY() < MIN_AABB_DIMENSION)
{
aabbMax.setY(aabbMax.getY() + MIN_AABB_HALF_DIMENSION);
aabbMin.setY(aabbMin.getY() - MIN_AABB_HALF_DIMENSION);
}
if (aabbMax.getZ() - aabbMin.getZ() < MIN_AABB_DIMENSION)
{
aabbMax.setZ(aabbMax.getZ() + MIN_AABB_HALF_DIMENSION);
aabbMin.setZ(aabbMin.getZ() - MIN_AABB_HALF_DIMENSION);
}
// NOTE(review): quantize() asserts that the point lies inside the tree bounds; the
// widening above could push a box past them - confirm the bounds include enough margin.
m_optimizedTree->quantize(&node.m_quantizedAabbMin[0],aabbMin,0);
m_optimizedTree->quantize(&node.m_quantizedAabbMax[0],aabbMax,1);
// Pack the part id into the high bits and the triangle index into the low bits.
node.m_escapeIndexOrTriangleIndex = (partId<<(31-MAX_NUM_PARTS_IN_BITS)) | triangleIndex;
m_triangleNodes.push_back(node);
}
};
int numLeafNodes = 0;
if (m_useQuantization)
{
//initialize quantization values
setQuantizationValues(bvhAabbMin,bvhAabbMax);
QuantizedNodeTriangleCallback callback(m_quantizedLeafNodes,this);
// NOTE(review): uses the member bounds rather than the parameters; presumably
// setQuantizationValues() has just updated them (possibly with a margin) - confirm.
triangles->InternalProcessAllTriangles(&callback,m_bvhAabbMin,m_bvhAabbMax);
//now we have an array of leaf nodes in m_quantizedLeafNodes
numLeafNodes = m_quantizedLeafNodes.size();
m_quantizedContiguousNodes.resize(2*numLeafNodes);
} else
{
NodeTriangleCallback callback(m_leafNodes);
// Effectively unbounded query box, so every triangle is reported.
b3Vector3 aabbMin(b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT));
b3Vector3 aabbMax(b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT));
triangles->InternalProcessAllTriangles(&callback,aabbMin,aabbMax);
//now we have an array of leafnodes in m_leafNodes
numLeafNodes = m_leafNodes.size();
m_contiguousNodes.resize(2*numLeafNodes);
}
m_curNodeIndex = 0;
buildTree(0,numLeafNodes);
///if the entire tree is smaller than the subtree size, we need to create a header info for the tree
// NOTE(review): this reads m_quantizedContiguousNodes[0], so it assumes the mesh
// produced at least one leaf - confirm how an empty mesh is handled upstream.
if(m_useQuantization && !m_SubtreeHeaders.size())
{
b3BvhSubtreeInfo& subtree = m_SubtreeHeaders.expand();
subtree.setAabbFromQuantizeNode(m_quantizedContiguousNodes[0]);
subtree.m_rootNodeIndex = 0;
subtree.m_subtreeSize = m_quantizedContiguousNodes[0].isLeafNode() ? 1 : m_quantizedContiguousNodes[0].getEscapeIndex();
}
//PCK: update the copy of the size
m_subtreeHeaderCount = m_SubtreeHeaders.size();
//PCK: clear m_quantizedLeafNodes and m_leafNodes, they are temporary
m_quantizedLeafNodes.clear();
m_leafNodes.clear();
}
/// Recomputes every node box of a quantized tree from the current mesh data.
///
/// The quantization range is reset to [aabbMin, aabbMax], all nodes are updated,
/// and then each subtree header is refreshed from its root node.
/// For a non-quantized tree this function does nothing.
void b3OptimizedBvh::refit(b3StridingMeshInterface* meshInterface,const b3Vector3& aabbMin,const b3Vector3& aabbMax)
{
	// Only the quantized representation supports refitting.
	if (!m_useQuantization)
	{
		return;
	}

	setQuantizationValues(aabbMin, aabbMax);
	updateBvhNodes(meshInterface, 0, m_curNodeIndex, 0);

	// Bring every subtree header back in sync with the box of its root node.
	const int headerCount = m_SubtreeHeaders.size();
	for (int headerIdx = 0; headerIdx < headerCount; ++headerIdx)
	{
		b3BvhSubtreeInfo& header = m_SubtreeHeaders[headerIdx];
		header.setAabbFromQuantizeNode(m_quantizedContiguousNodes[header.m_rootNodeIndex]);
	}
}
/// Refits only the subtrees whose header box overlaps the given region.
///
/// Requires a quantized tree, and the region must lie strictly inside the
/// existing tree bounds on every axis (both checked with b3Assert). The
/// quantization range itself is left untouched.
void b3OptimizedBvh::refitPartial(b3StridingMeshInterface* meshInterface,const b3Vector3& aabbMin,const b3Vector3& aabbMax)
{
	b3Assert(m_useQuantization);

	// The update region has to be strictly contained in the tree bounds.
	b3Assert(aabbMin.getX() > m_bvhAabbMin.getX());
	b3Assert(aabbMin.getY() > m_bvhAabbMin.getY());
	b3Assert(aabbMin.getZ() > m_bvhAabbMin.getZ());
	b3Assert(aabbMax.getX() < m_bvhAabbMax.getX());
	b3Assert(aabbMax.getY() < m_bvhAabbMax.getY());
	b3Assert(aabbMax.getZ() < m_bvhAabbMax.getZ());

	// Quantize the region once so it can be compared against the stored header boxes.
	unsigned short regionMin[3];
	unsigned short regionMax[3];
	quantize(regionMin, aabbMin, 0);
	quantize(regionMax, aabbMax, 1);

	for (int headerIdx = 0; headerIdx < m_SubtreeHeaders.size(); ++headerIdx)
	{
		b3BvhSubtreeInfo& header = m_SubtreeHeaders[headerIdx];

		// Subtrees that do not touch the region keep their current boxes.
		if (b3TestQuantizedAabbAgainstQuantizedAabb(regionMin, regionMax, header.m_quantizedAabbMin, header.m_quantizedAabbMax) == 0)
		{
			continue;
		}

		updateBvhNodes(meshInterface, header.m_rootNodeIndex, header.m_rootNodeIndex + header.m_subtreeSize, headerIdx);
		header.setAabbFromQuantizeNode(m_quantizedContiguousNodes[header.m_rootNodeIndex]);
	}
}
/// Recomputes the quantized boxes of the nodes in [firstNode, endNode).
///
/// Nodes are visited from the highest index down to firstNode. A leaf gets its
/// box from the (scaled) vertices of its triangle; an internal node gets the
/// union of its two children, which sit at higher indices and are therefore
/// already up to date when the parent is reached.
///
/// @param meshInterface source of vertex/index data; sub-parts are locked on demand
/// @param firstNode     first node index to update (inclusive)
/// @param endNode       one past the last node index to update
/// @param index         unused
void b3OptimizedBvh::updateBvhNodes(b3StridingMeshInterface* meshInterface,int firstNode,int endNode,int index)
{
	(void)index;
	b3Assert(m_useQuantization);

	// Sub-part whose buffers are currently locked; -1 means nothing is locked yet.
	int lockedPart = -1;

	// Access info for the locked sub-part, filled in by the mesh interface.
	const unsigned char* vertexBase = 0;
	int vertexCount = 0;
	PHY_ScalarType vertexType = PHY_INTEGER;
	int vertexStride = 0;
	const unsigned char* indexBase = 0;
	int indexStride = 0;
	int faceCount = 0;
	PHY_ScalarType indexType = PHY_INTEGER;

	b3Vector3 corners[3];
	b3Vector3 boxMin, boxMax;
	const b3Vector3& scale = meshInterface->getScaling();

	for (int nodeIdx = endNode-1; nodeIdx >= firstNode; --nodeIdx)
	{
		b3QuantizedBvhNode& node = m_quantizedContiguousNodes[nodeIdx];

		if (!node.isLeafNode())
		{
			// Internal node: its box is the union of both child boxes. The left child
			// directly follows the parent; the right child follows the left subtree.
			const b3QuantizedBvhNode* left = &m_quantizedContiguousNodes[nodeIdx+1];
			const b3QuantizedBvhNode* right = left->isLeafNode()
				? &m_quantizedContiguousNodes[nodeIdx+2]
				: &m_quantizedContiguousNodes[nodeIdx+1+left->getEscapeIndex()];

			for (int axis = 0; axis < 3; ++axis)
			{
				node.m_quantizedAabbMin[axis] = left->m_quantizedAabbMin[axis];
				if (right->m_quantizedAabbMin[axis] < node.m_quantizedAabbMin[axis])
				{
					node.m_quantizedAabbMin[axis] = right->m_quantizedAabbMin[axis];
				}

				node.m_quantizedAabbMax[axis] = left->m_quantizedAabbMax[axis];
				if (right->m_quantizedAabbMax[axis] > node.m_quantizedAabbMax[axis])
				{
					node.m_quantizedAabbMax[axis] = right->m_quantizedAabbMax[axis];
				}
			}
			continue;
		}

		// Leaf node: recompute the box from the triangle's vertices.
		const int part = node.getPartId();
		const int triangle = node.getTriangleIndex();

		if (part != lockedPart)
		{
			// Switch sub-parts: release the previous one (if any) before locking the next.
			if (lockedPart >= 0)
			{
				meshInterface->unLockReadOnlyVertexBase(lockedPart);
			}
			meshInterface->getLockedReadOnlyVertexIndexBase(&vertexBase,vertexCount,vertexType,vertexStride,&indexBase,indexStride,faceCount,indexType,part);
			lockedPart = part;
			b3Assert(indexType==PHY_INTEGER||indexType==PHY_SHORT);
		}

		const unsigned int* triIndices = (const unsigned int*)(indexBase+triangle*indexStride);

		for (int corner = 2; corner >= 0; --corner)
		{
			// Indices are stored either as 16-bit or as 32-bit values.
			int vertexIdx;
			if (indexType == PHY_SHORT)
			{
				vertexIdx = ((const unsigned short*)triIndices)[corner];
			}
			else
			{
				vertexIdx = triIndices[corner];
			}

			if (vertexType == PHY_FLOAT)
			{
				const float* xyz = (const float*)(vertexBase+vertexIdx*vertexStride);
				corners[corner] = b3Vector3(
					xyz[0]*scale.getX(),
					xyz[1]*scale.getY(),
					xyz[2]*scale.getZ());
			}
			else
			{
				// Anything that is not PHY_FLOAT is read as double.
				const double* xyz = (const double*)(vertexBase+vertexIdx*vertexStride);
				corners[corner] = b3Vector3(
					b3Scalar(xyz[0]*scale.getX()),
					b3Scalar(xyz[1]*scale.getY()),
					b3Scalar(xyz[2]*scale.getZ()));
			}
		}

		// Grow an initially inverted box over the three corners.
		boxMin.setValue(b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT));
		boxMax.setValue(b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT));
		for (int corner = 0; corner < 3; ++corner)
		{
			boxMin.setMin(corners[corner]);
			boxMax.setMax(corners[corner]);
		}

		quantize(&node.m_quantizedAabbMin[0],boxMin,0);
		quantize(&node.m_quantizedAabbMax[0],boxMax,1);
	}

	// Release whatever sub-part is still locked.
	if (lockedPart >= 0)
	{
		meshInterface->unLockReadOnlyVertexBase(lockedPart);
	}
}
/// Loads and initializes a BVH 'in place' from a buffer in memory.
/// The work is delegated to b3QuantizedBvh::deSerializeInPlace with the same
/// arguments, and the result is returned as a b3OptimizedBvh pointer.
b3OptimizedBvh* b3OptimizedBvh::deSerializeInPlace(void *i_alignedDataBuffer, unsigned int i_dataBufferSize, bool i_swapEndian)
{
	// b3OptimizedBvh adds no data of its own, so a static cast of the base result is sufficient.
	return static_cast<b3OptimizedBvh*>(
		b3QuantizedBvh::deSerializeInPlace(i_alignedDataBuffer, i_dataBufferSize, i_swapEndian));
}

View File

@@ -0,0 +1,65 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2003-2009 Erwin Coumans http://bulletphysics.org
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
///Contains contributions from Disney Studio's
#ifndef B3_OPTIMIZED_BVH_H
#define B3_OPTIMIZED_BVH_H
#include "b3QuantizedBvh.h"
class b3StridingMeshInterface;
///The b3OptimizedBvh extends the b3QuantizedBvh to create an AABB tree for triangle meshes, through the b3StridingMeshInterface.
///It declares no data members of its own.
B3_ATTRIBUTE_ALIGNED16(class) b3OptimizedBvh : public b3QuantizedBvh
{
public:
B3_DECLARE_ALIGNED_ALLOCATOR();
protected:
public:
b3OptimizedBvh();
virtual ~b3OptimizedBvh();
/// Builds the tree from all triangles of the mesh.
/// @param triangles                   source mesh
/// @param useQuantizedAabbCompression true selects the quantized node format
/// @param bvhAabbMin                  lower bound passed to the quantization setup (quantized mode)
/// @param bvhAabbMax                  upper bound passed to the quantization setup (quantized mode)
void build(b3StridingMeshInterface* triangles,bool useQuantizedAabbCompression, const b3Vector3& bvhAabbMin, const b3Vector3& bvhAabbMax);
/// Re-quantizes against [aabbMin, aabbMax] and recomputes all node boxes.
/// Does nothing for a non-quantized tree.
void refit(b3StridingMeshInterface* triangles,const b3Vector3& aabbMin,const b3Vector3& aabbMax);
/// Recomputes only the subtrees overlapping [aabbMin, aabbMax]. Asserts that the tree
/// is quantized and that the region lies strictly inside the current tree bounds.
void refitPartial(b3StridingMeshInterface* triangles,const b3Vector3& aabbMin, const b3Vector3& aabbMax);
/// Recomputes the quantized boxes of nodes in [firstNode, endNode); 'index' is unused.
void updateBvhNodes(b3StridingMeshInterface* meshInterface,int firstNode,int endNode,int index);
/// Data buffer MUST be 16 byte aligned
/// Forwards to b3QuantizedBvh::serialize with the same arguments and returns its result.
virtual bool serializeInPlace(void *o_alignedDataBuffer, unsigned i_dataBufferSize, bool i_swapEndian) const
{
return b3QuantizedBvh::serialize(o_alignedDataBuffer,i_dataBufferSize,i_swapEndian);
}
///deSerializeInPlace loads and initializes a BVH from a buffer in memory 'in place'
static b3OptimizedBvh *deSerializeInPlace(void *i_alignedDataBuffer, unsigned int i_dataBufferSize, bool i_swapEndian);
};
#endif //B3_OPTIMIZED_BVH_H

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,582 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2003-2006 Erwin Coumans http://continuousphysics.com/Bullet/
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#ifndef B3_QUANTIZED_BVH_H
#define B3_QUANTIZED_BVH_H
class b3Serializer;
//#define DEBUG_CHECK_DEQUANTIZATION 1
#ifdef DEBUG_CHECK_DEQUANTIZATION
#ifdef __SPU__
#define printf spu_printf
#endif //__SPU__
#include <stdio.h>
#include <stdlib.h>
#endif //DEBUG_CHECK_DEQUANTIZATION
#include "Bullet3Common/b3Vector3.h"
#include "Bullet3Common/b3AlignedAllocator.h"
#ifdef B3_USE_DOUBLE_PRECISION
#define b3QuantizedBvhData b3QuantizedBvhDoubleData
#define b3OptimizedBvhNodeData b3OptimizedBvhNodeDoubleData
#define b3QuantizedBvhDataName "b3QuantizedBvhDoubleData"
#else
#define b3QuantizedBvhData b3QuantizedBvhFloatData
#define b3OptimizedBvhNodeData b3OptimizedBvhNodeFloatData
#define b3QuantizedBvhDataName "b3QuantizedBvhFloatData"
#endif
//http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclang/html/vclrf__m128.asp
//Note: currently we have 16 bytes per quantized node
#define MAX_SUBTREE_SIZE_IN_BYTES 2048
// 10 gives the potential for 1024 parts, with at most 2^21 (2097152) (minus one
// actually) triangles each (since the sign bit is reserved).
#define MAX_NUM_PARTS_IN_BITS 10
///b3QuantizedBvhNode is a compressed AABB node: two quantized corners plus one packed integer.
///The same layout serves leaf and internal nodes; the sign of the packed integer tells them apart:
/// - value >= 0: leaf; part id in the high bits, triangle index in the low (31-MAX_NUM_PARTS_IN_BITS) bits
/// - value <  0: internal node; the negated value is the escape index
B3_ATTRIBUTE_ALIGNED16 (struct) b3QuantizedBvhNode
{
	B3_DECLARE_ALIGNED_ALLOCATOR();

	//12 bytes
	unsigned short int m_quantizedAabbMin[3];
	unsigned short int m_quantizedAabbMax[3];
	//4 bytes
	int m_escapeIndexOrTriangleIndex;

	/// True when this node is a leaf, i.e. the packed value is not negative.
	bool isLeafNode() const
	{
		return !(m_escapeIndexOrTriangleIndex < 0);
	}

	/// Escape index of an internal node (stored negated). Asserts on leaves.
	int getEscapeIndex() const
	{
		b3Assert(!isLeafNode());
		return -m_escapeIndexOrTriangleIndex;
	}

	/// Triangle index of a leaf: the low bits of the packed value. Asserts on internal nodes.
	int getTriangleIndex() const
	{
		b3Assert(isLeafNode());
		// All bits at and above the part-id position; its complement keeps the triangle bits.
		const unsigned int partBits = (~0u)<<(31-MAX_NUM_PARTS_IN_BITS);
		return (m_escapeIndexOrTriangleIndex&~(partBits));
	}

	/// Part id of a leaf: the high bits of the packed value. Asserts on internal nodes.
	int getPartId() const
	{
		b3Assert(isLeafNode());
		return (m_escapeIndexOrTriangleIndex>>(31-MAX_NUM_PARTS_IN_BITS));
	}
}
;
/// b3OptimizedBvhNode contains both internal and leaf node information (non-quantized format).
/// NOTE(review): the per-field sizes noted below (32 + 4 + 8 + 20 bytes of padding) add up to
/// 64 bytes, so the "44 bytes / node" figure that used to be quoted here looks stale - confirm.
/// You can use the compressed version (b3QuantizedBvhNode) of 16 bytes.
B3_ATTRIBUTE_ALIGNED16 (struct) b3OptimizedBvhNode
{
B3_DECLARE_ALIGNED_ALLOCATOR();
//32 bytes
// Full-precision bounding box of the node.
b3Vector3 m_aabbMinOrg;
b3Vector3 m_aabbMaxOrg;
//4
// Escape index; leaves are created with -1 (see b3OptimizedBvh::build).
int m_escapeIndex;
//8
//for child nodes
// Mesh part and triangle within that part referenced by a leaf.
int m_subPart;
int m_triangleIndex;
//pad the size to 64 bytes
char m_padding[20];
};
///b3BvhSubtreeInfo describes one subtree of limited size: the quantized box of
///its root, the index of that root in the node array, and the subtree size.
B3_ATTRIBUTE_ALIGNED16(class) b3BvhSubtreeInfo
{
public:
	B3_DECLARE_ALIGNED_ALLOCATOR();

	//12 bytes
	unsigned short int m_quantizedAabbMin[3];
	unsigned short int m_quantizedAabbMax[3];
	//4 bytes, points to the root of the subtree
	int m_rootNodeIndex;
	//4 bytes
	int m_subtreeSize;
	int m_padding[3];

	/// Leaves every member, including the padding, uninitialized.
	b3BvhSubtreeInfo()
	{
		//memset(&m_padding[0], 0, sizeof(m_padding));
	}

	/// Copies the quantized min/max corners of the given node into this header.
	void setAabbFromQuantizeNode(const b3QuantizedBvhNode& quantizedNode)
	{
		for (int axis = 0; axis < 3; ++axis)
		{
			m_quantizedAabbMin[axis] = quantizedNode.m_quantizedAabbMin[axis];
		}
		for (int axis = 0; axis < 3; ++axis)
		{
			m_quantizedAabbMax[axis] = quantizedNode.m_quantizedAabbMax[axis];
		}
	}
}
;
/// Callback interface handed to the tree traversal / overlap query functions.
/// Implementations receive one processNode() call per reported node.
class b3NodeOverlapCallback
{
public:
virtual ~b3NodeOverlapCallback() {};
/// @param subPart       mesh part id of the reported leaf
/// @param triangleIndex triangle index within that part
virtual void processNode(int subPart, int triangleIndex) = 0;
};
#include "Bullet3Common/b3AlignedAllocator.h"
#include "Bullet3Common/b3AlignedObjectArray.h"
///for code readability: short names for the three node/header array types used by b3QuantizedBvh
/// Array of full-precision nodes.
typedef b3AlignedObjectArray<b3OptimizedBvhNode> NodeArray;
/// Array of 16-byte quantized nodes.
typedef b3AlignedObjectArray<b3QuantizedBvhNode> QuantizedNodeArray;
/// Array of subtree headers.
typedef b3AlignedObjectArray<b3BvhSubtreeInfo> BvhSubtreeInfoArray;
///The b3QuantizedBvh class stores an AABB tree that can be quickly traversed on CPU and Cell SPU.
///It is used by the b3BvhTriangleMeshShape as midphase, and by the b3MultiSapBroadphase.
///It is recommended to use quantization for better performance and lower memory requirements.
B3_ATTRIBUTE_ALIGNED16(class) b3QuantizedBvh
{
public:
enum b3TraversalMode
{
TRAVERSAL_STACKLESS = 0,
TRAVERSAL_STACKLESS_CACHE_FRIENDLY,
TRAVERSAL_RECURSIVE
};
b3Vector3 m_bvhAabbMin;
b3Vector3 m_bvhAabbMax;
b3Vector3 m_bvhQuantization;
protected:
int m_bulletVersion; //for serialization versioning. It could also be used to detect endianess.
int m_curNodeIndex;
//quantization data
bool m_useQuantization;
NodeArray m_leafNodes;
NodeArray m_contiguousNodes;
QuantizedNodeArray m_quantizedLeafNodes;
QuantizedNodeArray m_quantizedContiguousNodes;
b3TraversalMode m_traversalMode;
BvhSubtreeInfoArray m_SubtreeHeaders;
//This is only used for serialization so we don't have to add serialization directly to b3AlignedObjectArray
mutable int m_subtreeHeaderCount;
///two versions, one for quantized and normal nodes. This allows code-reuse while maintaining readability (no template/macro!)
///this might be refactored into a virtual, it is usually not calculated at run-time
void setInternalNodeAabbMin(int nodeIndex, const b3Vector3& aabbMin)
{
if (m_useQuantization)
{
quantize(&m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[0] ,aabbMin,0);
} else
{
m_contiguousNodes[nodeIndex].m_aabbMinOrg = aabbMin;
}
}
void setInternalNodeAabbMax(int nodeIndex,const b3Vector3& aabbMax)
{
if (m_useQuantization)
{
quantize(&m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[0],aabbMax,1);
} else
{
m_contiguousNodes[nodeIndex].m_aabbMaxOrg = aabbMax;
}
}
b3Vector3 getAabbMin(int nodeIndex) const
{
if (m_useQuantization)
{
return unQuantize(&m_quantizedLeafNodes[nodeIndex].m_quantizedAabbMin[0]);
}
//non-quantized
return m_leafNodes[nodeIndex].m_aabbMinOrg;
}
b3Vector3 getAabbMax(int nodeIndex) const
{
if (m_useQuantization)
{
return unQuantize(&m_quantizedLeafNodes[nodeIndex].m_quantizedAabbMax[0]);
}
//non-quantized
return m_leafNodes[nodeIndex].m_aabbMaxOrg;
}
void setInternalNodeEscapeIndex(int nodeIndex, int escapeIndex)
{
if (m_useQuantization)
{
m_quantizedContiguousNodes[nodeIndex].m_escapeIndexOrTriangleIndex = -escapeIndex;
}
else
{
m_contiguousNodes[nodeIndex].m_escapeIndex = escapeIndex;
}
}
void mergeInternalNodeAabb(int nodeIndex,const b3Vector3& newAabbMin,const b3Vector3& newAabbMax)
{
if (m_useQuantization)
{
unsigned short int quantizedAabbMin[3];
unsigned short int quantizedAabbMax[3];
quantize(quantizedAabbMin,newAabbMin,0);
quantize(quantizedAabbMax,newAabbMax,1);
for (int i=0;i<3;i++)
{
if (m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[i] > quantizedAabbMin[i])
m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[i] = quantizedAabbMin[i];
if (m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[i] < quantizedAabbMax[i])
m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[i] = quantizedAabbMax[i];
}
} else
{
//non-quantized
m_contiguousNodes[nodeIndex].m_aabbMinOrg.setMin(newAabbMin);
m_contiguousNodes[nodeIndex].m_aabbMaxOrg.setMax(newAabbMax);
}
}
void swapLeafNodes(int firstIndex,int secondIndex);
void assignInternalNodeFromLeafNode(int internalNode,int leafNodeIndex);
protected:
void buildTree (int startIndex,int endIndex);
int calcSplittingAxis(int startIndex,int endIndex);
int sortAndCalcSplittingIndex(int startIndex,int endIndex,int splitAxis);
void walkStacklessTree(b3NodeOverlapCallback* nodeCallback,const b3Vector3& aabbMin,const b3Vector3& aabbMax) const;
void walkStacklessQuantizedTreeAgainstRay(b3NodeOverlapCallback* nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget, const b3Vector3& aabbMin, const b3Vector3& aabbMax, int startNodeIndex,int endNodeIndex) const;
void walkStacklessQuantizedTree(b3NodeOverlapCallback* nodeCallback,unsigned short int* quantizedQueryAabbMin,unsigned short int* quantizedQueryAabbMax,int startNodeIndex,int endNodeIndex) const;
void walkStacklessTreeAgainstRay(b3NodeOverlapCallback* nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget, const b3Vector3& aabbMin, const b3Vector3& aabbMax, int startNodeIndex,int endNodeIndex) const;
///tree traversal designed for small-memory processors like PS3 SPU
void walkStacklessQuantizedTreeCacheFriendly(b3NodeOverlapCallback* nodeCallback,unsigned short int* quantizedQueryAabbMin,unsigned short int* quantizedQueryAabbMax) const;
///use the 16-byte stackless 'skipindex' node tree to do a recursive traversal
void walkRecursiveQuantizedTreeAgainstQueryAabb(const b3QuantizedBvhNode* currentNode,b3NodeOverlapCallback* nodeCallback,unsigned short int* quantizedQueryAabbMin,unsigned short int* quantizedQueryAabbMax) const;
///use the 16-byte stackless 'skipindex' node tree to do a recursive traversal
void walkRecursiveQuantizedTreeAgainstQuantizedTree(const b3QuantizedBvhNode* treeNodeA,const b3QuantizedBvhNode* treeNodeB,b3NodeOverlapCallback* nodeCallback) const;
void updateSubtreeHeaders(int leftChildNodexIndex,int rightChildNodexIndex);
public:
B3_DECLARE_ALIGNED_ALLOCATOR();
b3QuantizedBvh();
virtual ~b3QuantizedBvh();
///***************************************** expert/internal use only *************************
void setQuantizationValues(const b3Vector3& bvhAabbMin,const b3Vector3& bvhAabbMax,b3Scalar quantizationMargin=b3Scalar(1.0));
QuantizedNodeArray& getLeafNodeArray() { return m_quantizedLeafNodes; }
///buildInternal is expert use only: assumes that setQuantizationValues and LeafNodeArray are initialized
void buildInternal();
///***************************************** expert/internal use only *************************
void reportAabbOverlappingNodex(b3NodeOverlapCallback* nodeCallback,const b3Vector3& aabbMin,const b3Vector3& aabbMax) const;
void reportRayOverlappingNodex (b3NodeOverlapCallback* nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget) const;
void reportBoxCastOverlappingNodex(b3NodeOverlapCallback* nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget, const b3Vector3& aabbMin,const b3Vector3& aabbMax) const;
B3_FORCE_INLINE void quantize(unsigned short* out, const b3Vector3& point,int isMax) const
{
b3Assert(m_useQuantization);
b3Assert(point.getX() <= m_bvhAabbMax.getX());
b3Assert(point.getY() <= m_bvhAabbMax.getY());
b3Assert(point.getZ() <= m_bvhAabbMax.getZ());
b3Assert(point.getX() >= m_bvhAabbMin.getX());
b3Assert(point.getY() >= m_bvhAabbMin.getY());
b3Assert(point.getZ() >= m_bvhAabbMin.getZ());
b3Vector3 v = (point - m_bvhAabbMin) * m_bvhQuantization;
///Make sure rounding is done in a way that unQuantize(quantizeWithClamp(...)) is conservative
///end-points always set the first bit, so that they are sorted properly (so that neighbouring AABBs overlap properly)
///@todo: double-check this
if (isMax)
{
out[0] = (unsigned short) (((unsigned short)(v.getX()+b3Scalar(1.)) | 1));
out[1] = (unsigned short) (((unsigned short)(v.getY()+b3Scalar(1.)) | 1));
out[2] = (unsigned short) (((unsigned short)(v.getZ()+b3Scalar(1.)) | 1));
} else
{
out[0] = (unsigned short) (((unsigned short)(v.getX()) & 0xfffe));
out[1] = (unsigned short) (((unsigned short)(v.getY()) & 0xfffe));
out[2] = (unsigned short) (((unsigned short)(v.getZ()) & 0xfffe));
}
#ifdef DEBUG_CHECK_DEQUANTIZATION
b3Vector3 newPoint = unQuantize(out);
if (isMax)
{
if (newPoint.getX() < point.getX())
{
printf("unconservative X, diffX = %f, oldX=%f,newX=%f\n",newPoint.getX()-point.getX(), newPoint.getX(),point.getX());
}
if (newPoint.getY() < point.getY())
{
printf("unconservative Y, diffY = %f, oldY=%f,newY=%f\n",newPoint.getY()-point.getY(), newPoint.getY(),point.getY());
}
if (newPoint.getZ() < point.getZ())
{
printf("unconservative Z, diffZ = %f, oldZ=%f,newZ=%f\n",newPoint.getZ()-point.getZ(), newPoint.getZ(),point.getZ());
}
} else
{
if (newPoint.getX() > point.getX())
{
printf("unconservative X, diffX = %f, oldX=%f,newX=%f\n",newPoint.getX()-point.getX(), newPoint.getX(),point.getX());
}
if (newPoint.getY() > point.getY())
{
printf("unconservative Y, diffY = %f, oldY=%f,newY=%f\n",newPoint.getY()-point.getY(), newPoint.getY(),point.getY());
}
if (newPoint.getZ() > point.getZ())
{
printf("unconservative Z, diffZ = %f, oldZ=%f,newZ=%f\n",newPoint.getZ()-point.getZ(), newPoint.getZ(),point.getZ());
}
}
#endif //DEBUG_CHECK_DEQUANTIZATION
}
B3_FORCE_INLINE void quantizeWithClamp(unsigned short* out, const b3Vector3& point2,int isMax) const
{
b3Assert(m_useQuantization);
b3Vector3 clampedPoint(point2);
clampedPoint.setMax(m_bvhAabbMin);
clampedPoint.setMin(m_bvhAabbMax);
quantize(out,clampedPoint,isMax);
}
B3_FORCE_INLINE b3Vector3 unQuantize(const unsigned short* vecIn) const
{
b3Vector3 vecOut;
vecOut.setValue(
(b3Scalar)(vecIn[0]) / (m_bvhQuantization.getX()),
(b3Scalar)(vecIn[1]) / (m_bvhQuantization.getY()),
(b3Scalar)(vecIn[2]) / (m_bvhQuantization.getZ()));
vecOut += m_bvhAabbMin;
return vecOut;
}
///setTraversalMode let's you choose between stackless, recursive or stackless cache friendly tree traversal. Note this is only implemented for quantized trees.
void setTraversalMode(b3TraversalMode traversalMode)
{
m_traversalMode = traversalMode;
}
B3_FORCE_INLINE QuantizedNodeArray& getQuantizedNodeArray()
{
return m_quantizedContiguousNodes;
}
B3_FORCE_INLINE BvhSubtreeInfoArray& getSubtreeInfoArray()
{
return m_SubtreeHeaders;
}
////////////////////////////////////////////////////////////////////
///Calculate the space needed to store the BVH for serialization (used with the in-place serialize/deSerializeInPlace pair below).
unsigned calculateSerializeBufferSize() const;
/// Data buffer MUST be 16 byte aligned.
/// i_dataBufferSize is the size of o_alignedDataBuffer; i_swapEndian requests endian swapping (presumably of the written data -- confirm in the implementation).
virtual bool serialize(void *o_alignedDataBuffer, unsigned i_dataBufferSize, bool i_swapEndian) const;
///deSerializeInPlace loads and initializes a BVH from a buffer in memory 'in place'
static b3QuantizedBvh *deSerializeInPlace(void *i_alignedDataBuffer, unsigned int i_dataBufferSize, bool i_swapEndian);
static unsigned int getAlignmentSerializationPadding();
//////////////////////////////////////////////////////////////////////
///b3Serializer-based serialization API; the inline definition of calculateSerializeBufferSizeNew follows the class.
virtual int calculateSerializeBufferSizeNew() const;
///fills the dataBuffer and returns the struct name (and 0 on failure)
virtual const char* serialize(void* dataBuffer, b3Serializer* serializer) const;
///restore the BVH from its single-precision / double-precision serialized struct
virtual void deSerializeFloat(struct b3QuantizedBvhFloatData& quantizedBvhFloatData);
virtual void deSerializeDouble(struct b3QuantizedBvhDoubleData& quantizedBvhDoubleData);
////////////////////////////////////////////////////////////////////
///Returns the m_useQuantization flag.
///Declared const because it only reads state, so it can also be called on a const b3QuantizedBvh.
B3_FORCE_INLINE bool isQuantized() const
{
	return m_useQuantization;
}
private:
// Special "copy" constructor that allows for in-place deserialization
// Prevents b3Vector3's default constructor from being called, but doesn't initialize much else
// ownsMemory should most likely be false if deserializing, and if you are not, don't call this (it also changes the function signature, which we need)
b3QuantizedBvh(b3QuantizedBvh &other, bool ownsMemory);
}
;
///Plain-data record describing one subtree: root node index, subtree size and a quantized AABB.
///Presumably the serialized form of a BvhSubtreeInfoArray element -- confirm against the serializer. Do not reorder the fields.
struct b3BvhSubtreeInfoData
{
int m_rootNodeIndex;
int m_subtreeSize;
unsigned short m_quantizedAabbMin[3];
unsigned short m_quantizedAabbMax[3];
};
///Plain-data record for a non-quantized BVH node with single-precision AABB corners.
///Field order mirrors b3OptimizedBvhNodeDoubleData; m_pad is explicit padding.
struct b3OptimizedBvhNodeFloatData
{
b3Vector3FloatData m_aabbMinOrg;
b3Vector3FloatData m_aabbMaxOrg;
int m_escapeIndex;
int m_subPart;
int m_triangleIndex;
char m_pad[4];
};
///Plain-data record for a non-quantized BVH node with double-precision AABB corners.
///Field order mirrors b3OptimizedBvhNodeFloatData; m_pad is explicit padding.
struct b3OptimizedBvhNodeDoubleData
{
b3Vector3DoubleData m_aabbMinOrg;
b3Vector3DoubleData m_aabbMaxOrg;
int m_escapeIndex;
int m_subPart;
int m_triangleIndex;
char m_pad[4];
};
///Plain-data record for a quantized BVH node: two quantized AABB corners (3 x unsigned short each)
///plus one int that holds either an escape index or a triangle index, as the field name says.
struct b3QuantizedBvhNodeData
{
unsigned short m_quantizedAabbMin[3];
unsigned short m_quantizedAabbMax[3];
int m_escapeIndexOrTriangleIndex;
};
///Single-precision plain-data image of a quantized BVH (argument type of deSerializeFloat).
///NOTE: the member order differs from b3QuantizedBvhDoubleData -- here m_subTreeInfoPtr comes
///before m_traversalMode / m_numSubtreeHeaders. Do not reorder; the layout is part of the format.
struct b3QuantizedBvhFloatData
{
b3Vector3FloatData m_bvhAabbMin;
b3Vector3FloatData m_bvhAabbMax;
b3Vector3FloatData m_bvhQuantization;
int m_curNodeIndex;
int m_useQuantization;
int m_numContiguousLeafNodes;
int m_numQuantizedContiguousNodes;
b3OptimizedBvhNodeFloatData *m_contiguousNodesPtr;
b3QuantizedBvhNodeData *m_quantizedContiguousNodesPtr;
b3BvhSubtreeInfoData *m_subTreeInfoPtr;
int m_traversalMode;
int m_numSubtreeHeaders;
};
///Double-precision plain-data image of a quantized BVH (argument type of deSerializeDouble).
///NOTE: the member order differs from b3QuantizedBvhFloatData -- here m_subTreeInfoPtr is the
///last member, after m_traversalMode / m_numSubtreeHeaders. Do not reorder; the layout is part of the format.
struct b3QuantizedBvhDoubleData
{
b3Vector3DoubleData m_bvhAabbMin;
b3Vector3DoubleData m_bvhAabbMax;
b3Vector3DoubleData m_bvhQuantization;
int m_curNodeIndex;
int m_useQuantization;
int m_numContiguousLeafNodes;
int m_numQuantizedContiguousNodes;
b3OptimizedBvhNodeDoubleData *m_contiguousNodesPtr;
b3QuantizedBvhNodeData *m_quantizedContiguousNodesPtr;
int m_traversalMode;
int m_numSubtreeHeaders;
b3BvhSubtreeInfoData *m_subTreeInfoPtr;
};
///Returns sizeof(b3QuantizedBvhData).
///NOTE(review): b3QuantizedBvhData is not declared in this part of the file; presumably it is an alias
///selecting b3QuantizedBvhFloatData or b3QuantizedBvhDoubleData by precision -- confirm.
B3_FORCE_INLINE int b3QuantizedBvh::calculateSerializeBufferSizeNew() const
{
return sizeof(b3QuantizedBvhData);
}
#endif //B3_QUANTIZED_BVH_H

View File

@@ -0,0 +1,214 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2003-2009 Erwin Coumans http://bulletphysics.org
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#include "b3StridingMeshInterface.h"
///Out-of-line definition of the virtual destructor; it has nothing to release.
b3StridingMeshInterface::~b3StridingMeshInterface()
{
}
void b3StridingMeshInterface::InternalProcessAllTriangles(b3InternalTriangleIndexCallback* callback,const b3Vector3& aabbMin,const b3Vector3& aabbMax) const
{
(void)aabbMin;
(void)aabbMax;
int numtotalphysicsverts = 0;
int part,graphicssubparts = getNumSubParts();
const unsigned char * vertexbase;
const unsigned char * indexbase;
int indexstride;
PHY_ScalarType type;
PHY_ScalarType gfxindextype;
int stride,numverts,numtriangles;
int gfxindex;
b3Vector3 triangle[3];
b3Vector3 meshScaling = getScaling();
///if the number of parts is big, the performance might drop due to the innerloop switch on indextype
for (part=0;part<graphicssubparts ;part++)
{
getLockedReadOnlyVertexIndexBase(&vertexbase,numverts,type,stride,&indexbase,indexstride,numtriangles,gfxindextype,part);
numtotalphysicsverts+=numtriangles*3; //upper bound
///unlike that developers want to pass in double-precision meshes in single-precision Bullet build
///so disable this feature by default
///see patch http://code.google.com/p/bullet/issues/detail?id=213
switch (type)
{
case PHY_FLOAT:
{
float* graphicsbase;
switch (gfxindextype)
{
case PHY_INTEGER:
{
for (gfxindex=0;gfxindex<numtriangles;gfxindex++)
{
unsigned int* tri_indices= (unsigned int*)(indexbase+gfxindex*indexstride);
graphicsbase = (float*)(vertexbase+tri_indices[0]*stride);
triangle[0].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(),graphicsbase[2]*meshScaling.getZ());
graphicsbase = (float*)(vertexbase+tri_indices[1]*stride);
triangle[1].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(), graphicsbase[2]*meshScaling.getZ());
graphicsbase = (float*)(vertexbase+tri_indices[2]*stride);
triangle[2].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(), graphicsbase[2]*meshScaling.getZ());
callback->internalProcessTriangleIndex(triangle,part,gfxindex);
}
break;
}
case PHY_SHORT:
{
for (gfxindex=0;gfxindex<numtriangles;gfxindex++)
{
unsigned short int* tri_indices= (unsigned short int*)(indexbase+gfxindex*indexstride);
graphicsbase = (float*)(vertexbase+tri_indices[0]*stride);
triangle[0].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(),graphicsbase[2]*meshScaling.getZ());
graphicsbase = (float*)(vertexbase+tri_indices[1]*stride);
triangle[1].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(), graphicsbase[2]*meshScaling.getZ());
graphicsbase = (float*)(vertexbase+tri_indices[2]*stride);
triangle[2].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(), graphicsbase[2]*meshScaling.getZ());
callback->internalProcessTriangleIndex(triangle,part,gfxindex);
}
break;
}
case PHY_UCHAR:
{
for (gfxindex=0;gfxindex<numtriangles;gfxindex++)
{
unsigned char* tri_indices= (unsigned char*)(indexbase+gfxindex*indexstride);
graphicsbase = (float*)(vertexbase+tri_indices[0]*stride);
triangle[0].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(),graphicsbase[2]*meshScaling.getZ());
graphicsbase = (float*)(vertexbase+tri_indices[1]*stride);
triangle[1].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(), graphicsbase[2]*meshScaling.getZ());
graphicsbase = (float*)(vertexbase+tri_indices[2]*stride);
triangle[2].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(), graphicsbase[2]*meshScaling.getZ());
callback->internalProcessTriangleIndex(triangle,part,gfxindex);
}
break;
}
default:
b3Assert((gfxindextype == PHY_INTEGER) || (gfxindextype == PHY_SHORT));
}
break;
}
case PHY_DOUBLE:
{
double* graphicsbase;
switch (gfxindextype)
{
case PHY_INTEGER:
{
for (gfxindex=0;gfxindex<numtriangles;gfxindex++)
{
unsigned int* tri_indices= (unsigned int*)(indexbase+gfxindex*indexstride);
graphicsbase = (double*)(vertexbase+tri_indices[0]*stride);
triangle[0].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(),(b3Scalar)graphicsbase[2]*meshScaling.getZ());
graphicsbase = (double*)(vertexbase+tri_indices[1]*stride);
triangle[1].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(), (b3Scalar)graphicsbase[2]*meshScaling.getZ());
graphicsbase = (double*)(vertexbase+tri_indices[2]*stride);
triangle[2].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(), (b3Scalar)graphicsbase[2]*meshScaling.getZ());
callback->internalProcessTriangleIndex(triangle,part,gfxindex);
}
break;
}
case PHY_SHORT:
{
for (gfxindex=0;gfxindex<numtriangles;gfxindex++)
{
unsigned short int* tri_indices= (unsigned short int*)(indexbase+gfxindex*indexstride);
graphicsbase = (double*)(vertexbase+tri_indices[0]*stride);
triangle[0].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(),(b3Scalar)graphicsbase[2]*meshScaling.getZ());
graphicsbase = (double*)(vertexbase+tri_indices[1]*stride);
triangle[1].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(), (b3Scalar)graphicsbase[2]*meshScaling.getZ());
graphicsbase = (double*)(vertexbase+tri_indices[2]*stride);
triangle[2].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(), (b3Scalar)graphicsbase[2]*meshScaling.getZ());
callback->internalProcessTriangleIndex(triangle,part,gfxindex);
}
break;
}
case PHY_UCHAR:
{
for (gfxindex=0;gfxindex<numtriangles;gfxindex++)
{
unsigned char* tri_indices= (unsigned char*)(indexbase+gfxindex*indexstride);
graphicsbase = (double*)(vertexbase+tri_indices[0]*stride);
triangle[0].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(),(b3Scalar)graphicsbase[2]*meshScaling.getZ());
graphicsbase = (double*)(vertexbase+tri_indices[1]*stride);
triangle[1].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(), (b3Scalar)graphicsbase[2]*meshScaling.getZ());
graphicsbase = (double*)(vertexbase+tri_indices[2]*stride);
triangle[2].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(), (b3Scalar)graphicsbase[2]*meshScaling.getZ());
callback->internalProcessTriangleIndex(triangle,part,gfxindex);
}
break;
}
default:
b3Assert((gfxindextype == PHY_INTEGER) || (gfxindextype == PHY_SHORT));
}
break;
}
default:
b3Assert((type == PHY_FLOAT) || (type == PHY_DOUBLE));
}
unLockReadOnlyVertexBase(part);
}
}
///Brute-force AABB: walks all triangles via InternalProcessAllTriangles and returns the smallest
///box containing every (scaled) vertex in aabbMin/aabbMax. If no triangle is reported, the outputs keep
///the accumulator's start values (min = +B3_LARGE_FLOAT, max = -B3_LARGE_FLOAT).
void b3StridingMeshInterface::calculateAabbBruteForce(b3Vector3& aabbMin,b3Vector3& aabbMax)
{
	//callback that grows a running min/max box with each triangle it receives
	struct BoundsAccumulator : public b3InternalTriangleIndexCallback
	{
		b3Vector3 m_aabbMin;
		b3Vector3 m_aabbMax;

		BoundsAccumulator()
		{
			//start inverted so that the first vertex replaces both bounds
			const b3Scalar largeValue = b3Scalar(B3_LARGE_FLOAT);
			m_aabbMin.setValue(largeValue,largeValue,largeValue);
			m_aabbMax.setValue(-largeValue,-largeValue,-largeValue);
		}

		virtual void internalProcessTriangleIndex(b3Vector3* triangle,int partId,int triangleIndex)
		{
			(void)partId;
			(void)triangleIndex;
			for (int corner=0;corner<3;corner++)
			{
				m_aabbMin.setMin(triangle[corner]);
				m_aabbMax.setMax(triangle[corner]);
			}
		}
	};

	BoundsAccumulator accumulator;

	//pass the largest possible query box to the traversal
	const b3Scalar largeValue = b3Scalar(B3_LARGE_FLOAT);
	aabbMin.setValue(-largeValue,-largeValue,-largeValue);
	aabbMax.setValue(largeValue,largeValue,largeValue);
	InternalProcessAllTriangles(&accumulator,aabbMin,aabbMax);

	aabbMin = accumulator.m_aabbMin;
	aabbMax = accumulator.m_aabbMax;
}

View File

@@ -0,0 +1,167 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2003-2009 Erwin Coumans http://bulletphysics.org
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#ifndef B3_STRIDING_MESHINTERFACE_H
#define B3_STRIDING_MESHINTERFACE_H
#include "Bullet3Common/b3Vector3.h"
#include "b3TriangleCallback.h"
//#include "b3ConcaveShape.h"
///Tags the element type of raw vertex and index buffers.
///b3StridingMeshInterface::InternalProcessAllTriangles accepts PHY_FLOAT / PHY_DOUBLE as vertex types
///and PHY_INTEGER / PHY_SHORT / PHY_UCHAR as index types.
///The enumerator values are implicit (0..5), so do not reorder them.
enum PHY_ScalarType {
PHY_FLOAT, PHY_DOUBLE, PHY_INTEGER, PHY_SHORT,
PHY_FIXEDPOINT88, PHY_UCHAR
};
/// The b3StridingMeshInterface is the interface class for high performance generic access to triangle meshes, used in combination with b3BvhTriangleMeshShape and some other collision shapes.
/// Using index striding of 3*sizeof(integer) it can use triangle arrays, using index striding of 1*sizeof(integer) it can handle triangle strips.
/// It allows for sharing graphics and collision meshes. Also it provides locking/unlocking of graphics meshes that are in gpu memory.
B3_ATTRIBUTE_ALIGNED16(class ) b3StridingMeshInterface
{
protected:
///per-axis scale applied to every vertex handed out by InternalProcessAllTriangles; defaults to (1,1,1)
b3Vector3 m_scaling;
public:
B3_DECLARE_ALIGNED_ALLOCATOR();
b3StridingMeshInterface() :m_scaling(b3Scalar(1.),b3Scalar(1.),b3Scalar(1.))
{
}
virtual ~b3StridingMeshInterface();
///reports every triangle of every subpart to the callback, scaled by m_scaling;
///the base implementation ignores aabbMin/aabbMax (no culling)
virtual void InternalProcessAllTriangles(b3InternalTriangleIndexCallback* callback,const b3Vector3& aabbMin,const b3Vector3& aabbMax) const;
///brute force method to calculate aabb
void calculateAabbBruteForce(b3Vector3& aabbMin,b3Vector3& aabbMax);
/// get read and write access to a subpart of a triangle mesh
/// this subpart has a continuous array of vertices and indices
/// in this way the mesh can be handled as chunks of memory with striding
/// very similar to OpenGL vertexarray support
/// make a call to unLockVertexBase when the read and write access is finished
virtual void getLockedVertexIndexBase(unsigned char **vertexbase, int& numverts,PHY_ScalarType& type, int& stride,unsigned char **indexbase,int & indexstride,int& numfaces,PHY_ScalarType& indicestype,int subpart=0)=0;
/// read-only counterpart of getLockedVertexIndexBase; pair it with unLockReadOnlyVertexBase
virtual void getLockedReadOnlyVertexIndexBase(const unsigned char **vertexbase, int& numverts,PHY_ScalarType& type, int& stride,const unsigned char **indexbase,int & indexstride,int& numfaces,PHY_ScalarType& indicestype,int subpart=0) const=0;
/// unLockVertexBase finishes the access to a subpart of the triangle mesh
/// make a call to unLockVertexBase when the read and write access (using getLockedVertexIndexBase) is finished
virtual void unLockVertexBase(int subpart)=0;
virtual void unLockReadOnlyVertexBase(int subpart) const=0;
/// getNumSubParts returns the number of separate subparts
/// each subpart has a continuous array of vertices and indices
virtual int getNumSubParts() const=0;
virtual void preallocateVertices(int numverts)=0;
virtual void preallocateIndices(int numindices)=0;
/// premade-AABB hooks: the base class stores nothing, so hasPremadeAabb() is false and
/// setPremadeAabb / getPremadeAabb are no-ops (getPremadeAabb leaves its outputs untouched)
virtual bool hasPremadeAabb() const { return false; }
virtual void setPremadeAabb(const b3Vector3& aabbMin, const b3Vector3& aabbMax ) const
{
(void) aabbMin;
(void) aabbMax;
}
virtual void getPremadeAabb(b3Vector3* aabbMin, b3Vector3* aabbMax ) const
{
(void) aabbMin;
(void) aabbMax;
}
const b3Vector3& getScaling() const {
return m_scaling;
}
void setScaling(const b3Vector3& scaling)
{
m_scaling = scaling;
}
///returns sizeof(b3StridingMeshInterfaceData); defined inline after the serialization structs below
virtual int calculateSerializeBufferSize() const;
///fills the dataBuffer and returns the struct name (and 0 on failure)
//virtual const char* serialize(void* dataBuffer, b3Serializer* serializer) const;
};
///serialization record holding one int index value
struct b3IntIndexData
{
int m_value;
};
///serialization record holding one short index value, followed by two bytes of explicit padding
struct b3ShortIntIndexData
{
short m_value;
char m_pad[2];
};
///serialization record holding three short index values, followed by two bytes of explicit padding
struct b3ShortIntIndexTripletData
{
short m_values[3];
char m_pad[2];
};
///serialization record holding three unsigned char index values, followed by one byte of explicit padding
struct b3CharIndexTripletData
{
unsigned char m_values[3];
char m_pad;
};
///do not change those serialization structures, it requires an updated sBulletDNAstr/sBulletDNAstr64
///One mesh part: pointers to vertex data (float or double) and to index data in one of several widths,
///plus the triangle and vertex counts. Presumably only the pointers matching the stored format are set -- confirm in the serializer.
struct b3MeshPartData
{
b3Vector3FloatData *m_vertices3f;
b3Vector3DoubleData *m_vertices3d;
b3IntIndexData *m_indices32;
b3ShortIntIndexTripletData *m_3indices16;
b3CharIndexTripletData *m_3indices8;
b3ShortIntIndexData *m_indices16;//backwards compatibility
int m_numTriangles;//length of m_indices = m_numTriangles
int m_numVertices;
};
///do not change those serialization structures, it requires an updated sBulletDNAstr/sBulletDNAstr64
///Serialized header of a striding mesh: pointer to the mesh parts, the scaling, the part count and explicit padding.
///Its size is what b3StridingMeshInterface::calculateSerializeBufferSize returns.
struct b3StridingMeshInterfaceData
{
b3MeshPartData *m_meshPartsPtr;
b3Vector3FloatData m_scaling;
int m_numMeshParts;
char m_padding[4];
};
///Returns the size in bytes of the b3StridingMeshInterfaceData header struct only;
///the mesh parts it points to are not included in this number.
B3_FORCE_INLINE int b3StridingMeshInterface::calculateSerializeBufferSize() const
{
return sizeof(b3StridingMeshInterfaceData);
}
#endif //B3_STRIDING_MESHINTERFACE_H

View File

@@ -0,0 +1,28 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2003-2009 Erwin Coumans http://bulletphysics.org
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#include "b3TriangleCallback.h"
///Out-of-line definition of the virtual destructor; there is nothing to release.
b3TriangleCallback::~b3TriangleCallback()
{
}
///Out-of-line definition of the virtual destructor; there is nothing to release.
b3InternalTriangleIndexCallback::~b3InternalTriangleIndexCallback()
{
}

View File

@@ -0,0 +1,42 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2003-2009 Erwin Coumans http://bulletphysics.org
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#ifndef B3_TRIANGLE_CALLBACK_H
#define B3_TRIANGLE_CALLBACK_H
#include "Bullet3Common/b3Vector3.h"
///The b3TriangleCallback provides a callback for each overlapping triangle when calling processAllTriangles.
///This callback is called by processAllTriangles for all b3ConcaveShape derived class, such as b3BvhTriangleMeshShape, b3StaticPlaneShape and b3HeightfieldTerrainShape.
class b3TriangleCallback
{
public:
virtual ~b3TriangleCallback();
///called once per triangle; triangle points to 3 vertices, partId/triangleIndex identify the triangle
virtual void processTriangle(b3Vector3* triangle, int partId, int triangleIndex) = 0;
};
///Callback interface used by b3StridingMeshInterface::InternalProcessAllTriangles.
class b3InternalTriangleIndexCallback
{
public:
virtual ~b3InternalTriangleIndexCallback();
///called once per triangle; triangle points to 3 vertices, partId is the subpart and triangleIndex the index within it
virtual void internalProcessTriangleIndex(b3Vector3* triangle,int partId,int triangleIndex) = 0;
};
#endif //B3_TRIANGLE_CALLBACK_H

View File

@@ -0,0 +1,95 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2003-2009 Erwin Coumans http://bulletphysics.org
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#include "b3TriangleIndexVertexArray.h"
///Backwards-compatible constructor: wraps one caller-owned index array and one caller-owned vertex
///array in a single b3IndexedMesh and registers it through addIndexedMesh (default index type).
///The arrays are referenced, not copied. Strides are in bytes.
b3TriangleIndexVertexArray::b3TriangleIndexVertexArray(int numTriangles,int* triangleIndexBase,int triangleIndexStride,int numVertices,b3Scalar* vertexBase,int vertexStride)
: m_hasAabb(0)
{
	b3IndexedMesh singleMesh;

	//vertex side
	singleMesh.m_numVertices = numVertices;
	singleMesh.m_vertexBase = (const unsigned char *)vertexBase;
	singleMesh.m_vertexStride = vertexStride;

	//index side
	singleMesh.m_numTriangles = numTriangles;
	singleMesh.m_triangleIndexBase = (const unsigned char *)triangleIndexBase;
	singleMesh.m_triangleIndexStride = triangleIndexStride;

	addIndexedMesh(singleMesh);
}
///Nothing to release here: the destructor body is empty, so the referenced vertex/index arrays are not freed by it.
b3TriangleIndexVertexArray::~b3TriangleIndexVertexArray()
{
}
void b3TriangleIndexVertexArray::getLockedVertexIndexBase(unsigned char **vertexbase, int& numverts,PHY_ScalarType& type, int& vertexStride,unsigned char **indexbase,int & indexstride,int& numfaces,PHY_ScalarType& indicestype,int subpart)
{
b3Assert(subpart< getNumSubParts() );
b3IndexedMesh& mesh = m_indexedMeshes[subpart];
numverts = mesh.m_numVertices;
(*vertexbase) = (unsigned char *) mesh.m_vertexBase;
type = mesh.m_vertexType;
vertexStride = mesh.m_vertexStride;
numfaces = mesh.m_numTriangles;
(*indexbase) = (unsigned char *)mesh.m_triangleIndexBase;
indexstride = mesh.m_triangleIndexStride;
indicestype = mesh.m_indexType;
}
void b3TriangleIndexVertexArray::getLockedReadOnlyVertexIndexBase(const unsigned char **vertexbase, int& numverts,PHY_ScalarType& type, int& vertexStride,const unsigned char **indexbase,int & indexstride,int& numfaces,PHY_ScalarType& indicestype,int subpart) const
{
const b3IndexedMesh& mesh = m_indexedMeshes[subpart];
numverts = mesh.m_numVertices;
(*vertexbase) = (const unsigned char *)mesh.m_vertexBase;
type = mesh.m_vertexType;
vertexStride = mesh.m_vertexStride;
numfaces = mesh.m_numTriangles;
(*indexbase) = (const unsigned char *)mesh.m_triangleIndexBase;
indexstride = mesh.m_triangleIndexStride;
indicestype = mesh.m_indexType;
}
///True once setPremadeAabb has been called (m_hasAabb is an int flag, compared against 1).
bool b3TriangleIndexVertexArray::hasPremadeAabb() const
{
return (m_hasAabb == 1);
}
///Stores a caller-supplied AABB and marks it as available.
///The method is const; it writes only members that the header declares mutable.
void b3TriangleIndexVertexArray::setPremadeAabb(const b3Vector3& aabbMin, const b3Vector3& aabbMax ) const
{
m_aabbMin = aabbMin;
m_aabbMax = aabbMax;
m_hasAabb = 1; // this is intentionally an int see notes in header
}
///Copies the stored AABB into *aabbMin / *aabbMax. Both pointers are dereferenced unconditionally,
///and m_hasAabb is not checked here, so call hasPremadeAabb() first.
void b3TriangleIndexVertexArray::getPremadeAabb(b3Vector3* aabbMin, b3Vector3* aabbMax ) const
{
*aabbMin = m_aabbMin;
*aabbMax = m_aabbMax;
}

View File

@@ -0,0 +1,133 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2003-2009 Erwin Coumans http://bulletphysics.org
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#ifndef B3_TRIANGLE_INDEX_VERTEX_ARRAY_H
#define B3_TRIANGLE_INDEX_VERTEX_ARRAY_H
#include "b3StridingMeshInterface.h"
#include "Bullet3Common/b3AlignedObjectArray.h"
#include "Bullet3Common/b3Scalar.h"
///The b3IndexedMesh indexes a single vertex and index array. Multiple b3IndexedMesh objects can be passed into a b3TriangleIndexVertexArray using addIndexedMesh.
///Instead of the number of indices, we pass the number of triangles.
B3_ATTRIBUTE_ALIGNED16( struct) b3IndexedMesh
{
	B3_DECLARE_ALIGNED_ALLOCATOR();

	int m_numTriangles;
	const unsigned char * m_triangleIndexBase;
	// Size in byte of the indices for one triangle (3*sizeof(index_type) if the indices are tightly packed)
	int m_triangleIndexStride;
	int m_numVertices;
	const unsigned char * m_vertexBase;
	// Size of a vertex, in bytes
	int m_vertexStride;
	// The index type is set when adding an indexed mesh to the
	// b3TriangleIndexVertexArray, do not set it manually
	PHY_ScalarType m_indexType;
	// The vertex type has a default type similar to Bullet's precision mode (float or double)
	// but can be set manually if you for example run Bullet with double precision but have
	// mesh data in single precision..
	PHY_ScalarType m_vertexType;

	// Starts out as an empty mesh: zero counts and strides, null base pointers.
	// Previously these members were left uninitialised, so a descriptor that was not
	// completely filled in carried indeterminate pointers and counts.
	// The initialiser order follows the declaration order above.
	b3IndexedMesh()
		:m_numTriangles(0),
		m_triangleIndexBase(0),
		m_triangleIndexStride(0),
		m_numVertices(0),
		m_vertexBase(0),
		m_vertexStride(0),
		m_indexType(PHY_INTEGER),
#ifdef B3_USE_DOUBLE_PRECISION
		m_vertexType(PHY_DOUBLE)
#else // B3_USE_DOUBLE_PRECISION
		m_vertexType(PHY_FLOAT)
#endif // B3_USE_DOUBLE_PRECISION
	{
	}
}
;
typedef b3AlignedObjectArray<b3IndexedMesh> IndexedMeshArray;
///The b3TriangleIndexVertexArray allows to access multiple triangle meshes, by indexing into existing triangle/index arrays.
///Additional meshes can be added using addIndexedMesh
///No duplicate is made of the vertex/index data, it only indexes into external vertex/index arrays.
///So keep those arrays around during the lifetime of this b3TriangleIndexVertexArray.
B3_ATTRIBUTE_ALIGNED16( class) b3TriangleIndexVertexArray : public b3StridingMeshInterface
{
protected:
	//one descriptor per subpart; the descriptors reference external vertex/index arrays
	IndexedMeshArray m_indexedMeshes;
	int m_pad[2];
	//premade AABB state; mutable because setPremadeAabb is a const method
	mutable int m_hasAabb; // using int instead of bool to maintain alignment
	mutable b3Vector3 m_aabbMin;
	mutable b3Vector3 m_aabbMax;

public:
	B3_DECLARE_ALIGNED_ALLOCATOR();

	///creates an empty array with no premade AABB
	b3TriangleIndexVertexArray() : m_hasAabb(0)
	{
	}

	virtual ~b3TriangleIndexVertexArray();

	//just to be backwards compatible
	b3TriangleIndexVertexArray(int numTriangles,int* triangleIndexBase,int triangleIndexStride,int numVertices,b3Scalar* vertexBase,int vertexStride);

	///appends a copy of the descriptor and stamps indexType onto the stored copy,
	///overriding whatever m_indexType the caller's mesh carried
	void addIndexedMesh(const b3IndexedMesh& mesh, PHY_ScalarType indexType = PHY_INTEGER)
	{
		m_indexedMeshes.push_back(mesh);
		b3IndexedMesh& stored = m_indexedMeshes[m_indexedMeshes.size()-1];
		stored.m_indexType = indexType;
	}

	virtual void getLockedVertexIndexBase(unsigned char **vertexbase, int& numverts,PHY_ScalarType& type, int& vertexStride,unsigned char **indexbase,int & indexstride,int& numfaces,PHY_ScalarType& indicestype,int subpart=0);

	virtual void getLockedReadOnlyVertexIndexBase(const unsigned char **vertexbase, int& numverts,PHY_ScalarType& type, int& vertexStride,const unsigned char **indexbase,int & indexstride,int& numfaces,PHY_ScalarType& indicestype,int subpart=0) const;

	/// unLockVertexBase finishes the access to a subpart of the triangle mesh
	/// make a call to unLockVertexBase when the read and write access (using getLockedVertexIndexBase) is finished
	/// (both unlock calls are no-ops in this class)
	virtual void unLockVertexBase(int subpart) {(void)subpart;}

	virtual void unLockReadOnlyVertexBase(int subpart) const {(void)subpart;}

	/// getNumSubParts returns the number of separate subparts
	/// each subpart has a continuous array of vertices and indices
	virtual int getNumSubParts() const
	{
		return (int)m_indexedMeshes.size();
	}

	IndexedMeshArray& getIndexedMeshArray()
	{
		return m_indexedMeshes;
	}

	const IndexedMeshArray& getIndexedMeshArray() const
	{
		return m_indexedMeshes;
	}

	///preallocation requests are ignored by this class
	virtual void preallocateVertices(int numverts){(void) numverts;}
	virtual void preallocateIndices(int numindices){(void) numindices;}

	virtual bool hasPremadeAabb() const;
	virtual void setPremadeAabb(const b3Vector3& aabbMin, const b3Vector3& aabbMax ) const;
	virtual void getPremadeAabb(b3Vector3* aabbMin, b3Vector3* aabbMax ) const;
}
;
#endif //B3_TRIANGLE_INDEX_VERTEX_ARRAY_H

View File

@@ -0,0 +1,310 @@
//keep this enum in sync with the CPU version (in btCollidable.h)
//written by Erwin Coumans
//shape type ids; bvhTraversalKernel compares them against btCollidableGpu::m_shapeType
#define SHAPE_CONVEX_HULL 3
#define SHAPE_CONCAVE_TRIMESH 5
#define TRIANGLE_NUM_CONVEX_FACES 5
#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6
#define SHAPE_SPHERE 7
typedef unsigned int u32;
//getTriangleIndex masks off the bits from position (31-MAX_NUM_PARTS_IN_BITS) upwards using this constant
#define MAX_NUM_PARTS_IN_BITS 10
///btQuantizedBvhNode is a compressed aabb node, 16 bytes.
///Node can be used for leafnode or internal node. Leafnodes can point to 32-bit triangle index (non-negative range).
typedef struct
{
//12 bytes
unsigned short int m_quantizedAabbMin[3];
unsigned short int m_quantizedAabbMax[3];
//4 bytes
//>= 0: leaf node, read through getTriangleIndex; negative: internal node, -value is the escape index (see isLeaf/getEscapeIndex)
int m_escapeIndexOrTriangleIndex;
} btQuantizedBvhNode;
//per-BVH header read by bvhTraversalKernel
typedef struct
{
//bounds and quantization scale passed to quantizeWithClamp
float4 m_aabbMin;
float4 m_aabbMax;
float4 m_quantization;
int m_numNodes;
//number of subtree headers the kernel loops over
int m_numSubTrees;
//offsets of this BVH's first node / first subtree header within the shared root arrays
int m_nodeOffset;
int m_subTreeOffset;
} b3BvhInfo;
/*
bool isLeafNode() const
{
//skipindex is negative (internal node), triangleindex >=0 (leafnode)
return (m_escapeIndexOrTriangleIndex >= 0);
}
int getEscapeIndex() const
{
btAssert(!isLeafNode());
return -m_escapeIndexOrTriangleIndex;
}
int getTriangleIndex() const
{
btAssert(isLeafNode());
unsigned int x=0;
unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS);
// Get only the lower bits where the triangle index is stored
return (m_escapeIndexOrTriangleIndex&~(y));
}
int getPartId() const
{
btAssert(isLeafNode());
// Get only the highest bits where the part index is stored
return (m_escapeIndexOrTriangleIndex>>(31-MAX_NUM_PARTS_IN_BITS));
}
*/
//returns the low (31-MAX_NUM_PARTS_IN_BITS) bits of the node's m_escapeIndexOrTriangleIndex,
//i.e. the triangle index without the part bits stored above it
int getTriangleIndex(const btQuantizedBvhNode* rootNode)
{
	unsigned int allBits = 0;
	//all ones, shifted so that only the upper bits stay set
	unsigned int upperBitsMask = (~(allBits&0))<<(31-MAX_NUM_PARTS_IN_BITS);
	// Get only the lower bits where the triangle index is stored
	unsigned int lowerBitsMask = ~(upperBitsMask);
	return (rootNode->m_escapeIndexOrTriangleIndex&lowerBitsMask);
}
//returns 1 for a leaf node and 0 for an internal node
int isLeaf(const btQuantizedBvhNode* rootNode)
{
	//skipindex is negative (internal node), triangleindex >=0 (leafnode)
	if (rootNode->m_escapeIndexOrTriangleIndex >= 0)
	{
		return 1;
	}
	return 0;
}
//returns the negated m_escapeIndexOrTriangleIndex; only meaningful for internal nodes, where the stored value is negative
int getEscapeIndex(const btQuantizedBvhNode* rootNode)
{
	int storedValue = rootNode->m_escapeIndexOrTriangleIndex;
	return -storedValue;
}
//subtree header; bvhTraversalKernel tests its quantized AABB against the query AABB
typedef struct
{
//12 bytes
unsigned short int m_quantizedAabbMin[3];
unsigned short int m_quantizedAabbMax[3];
//4 bytes, points to the root of the subtree
int m_rootNodeIndex;
//4 bytes
int m_subtreeSize;
//12 bytes of explicit padding (presumably to match the CPU-side layout -- confirm)
int m_padding[3];
} btBvhSubtreeInfo;
///keep this in sync with btCollidable.h
typedef struct
{
//NOTE: for SHAPE_CONCAVE_TRIMESH collidables bvhTraversalKernel uses this field as the index into bvhInfos
int m_numChildShapes;
int blaat2;
//one of the SHAPE_* ids defined at the top of this file
int m_shapeType;
int m_shapeIndex;
} btCollidableGpu;
//child shape record: position, orientation and a shape index, followed by three unused ints
typedef struct
{
float4 m_childPosition;
float4 m_childOrientation;
int m_shapeIndex;
int m_unused0;
int m_unused1;
int m_unused2;
} btGpuChildShape;
//rigid body record as read by the kernels in this file
typedef struct
{
float4 m_pos;
float4 m_quat;
float4 m_linVel;
float4 m_angVel;
//index into the collidables array
u32 m_collidableIdx;
//bvhTraversalKernel treats m_invMass==0 as a static body and skips static-static pairs
float m_invMass;
float m_restituitionCoeff;
float m_frictionCoeff;
} BodyData;
//axis aligned bounding box; each corner can be viewed as a float4, as 4 floats or as 4 ints through the unions
typedef struct
{
union
{
float4 m_min;
float m_minElems[4];
int m_minIndices[4];
};
union
{
float4 m_max;
float m_maxElems[4];
int m_maxIndices[4];
};
} btAabbCL;
//returns 1 when the two quantized boxes overlap on all three axes (touching counts as overlap), otherwise 0
int testQuantizedAabbAgainstQuantizedAabb(
	const unsigned short int* aabbMin1,
	const unsigned short int* aabbMax1,
	const unsigned short int* aabbMin2,
	const unsigned short int* aabbMax2)
{
	//the boxes are disjoint as soon as one axis shows a gap between them
	for (int axis=0;axis<3;axis++)
	{
		if (aabbMin1[axis] > aabbMax2[axis] || aabbMax1[axis] < aabbMin2[axis])
		{
			return 0;
		}
	}
	return 1;
}
//clamps point2 into [bvhAabbMin,bvhAabbMax], converts it to quantization units relative to bvhAabbMin
//and writes three unsigned shorts to out. For isMax the value is bumped by one and its lowest bit set;
//otherwise the lowest bit is cleared.
void quantizeWithClamp(unsigned short* out, float4 point2,int isMax, float4 bvhAabbMin, float4 bvhAabbMax, float4 bvhQuantization)
{
	float4 bounded = min(max(point2,bvhAabbMin), bvhAabbMax);
	float4 scaled = (bounded - bvhAabbMin) * bvhQuantization;

	if (!isMax)
	{
		out[0] = (unsigned short) (((unsigned short)(scaled.x) & 0xfffe));
		out[1] = (unsigned short) (((unsigned short)(scaled.y) & 0xfffe));
		out[2] = (unsigned short) (((unsigned short)(scaled.z) & 0xfffe));
		return;
	}

	out[0] = (unsigned short) (((unsigned short)(scaled.x+1.f) | 1));
	out[1] = (unsigned short) (((unsigned short)(scaled.y+1.f) | 1));
	out[2] = (unsigned short) (((unsigned short)(scaled.z+1.f) | 1));
}
// work-in-progress
// One work-item per entry of pairs. For a pair (A,B) in which collidable A is a
// SHAPE_CONCAVE_TRIMESH and collidable B is a convex hull, sphere or compound of convex hulls,
// the AABB of body B is quantized and tested against the quantized BVH selected for A.
// For every overlapping leaf node an int4 is appended to concavePairsOut:
//   (bodyIndexA, bodyIndexB, triangleIndex, childShapeIndexB)  for a compound B (one entry per child)
//   (bodyIndexA, bodyIndexB, triangleIndex, 0)                 otherwise
// numConcavePairsOut is advanced atomically for every candidate, also when the write itself is
// skipped because the slot is >= maxNumConcavePairsCapacity, so it can end up above the capacity.
__kernel void bvhTraversalKernel( __global const int2* pairs,
__global const BodyData* rigidBodies,
__global const btCollidableGpu* collidables,
__global btAabbCL* aabbs,
__global int4* concavePairsOut,
__global volatile int* numConcavePairsOut,
__global const btBvhSubtreeInfo* subtreeHeadersRoot,
__global const btQuantizedBvhNode* quantizedNodesRoot,
__global const b3BvhInfo* bvhInfos,
int numPairs,
int maxNumConcavePairsCapacity)
{
int id = get_global_id(0);
// work-items beyond the number of pairs have nothing to do
if (id>=numPairs)
return;
int bodyIndexA = pairs[id].x;
int bodyIndexB = pairs[id].y;
int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
//once the broadphase avoids static-static pairs, we can remove this test
if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))
{
return;
}
// only pairs with the concave mesh in slot A are handled; no swapped (B concave) case is processed here
if (collidables[collidableIndexA].m_shapeType!=SHAPE_CONCAVE_TRIMESH)
return;
int shapeTypeB = collidables[collidableIndexB].m_shapeType;
if (shapeTypeB!=SHAPE_CONVEX_HULL &&
shapeTypeB!=SHAPE_SPHERE &&
shapeTypeB!=SHAPE_COMPOUND_OF_CONVEX_HULLS
)
return;
// the m_numChildShapes field of collidable A is used as the index into bvhInfos
// NOTE(review): presumably this field doubles as a bvh index for concave meshes - confirm against the host-side collidable struct
b3BvhInfo bvhInfo = bvhInfos[collidables[collidableIndexA].m_numChildShapes];
float4 bvhAabbMin = bvhInfo.m_aabbMin;
float4 bvhAabbMax = bvhInfo.m_aabbMax;
float4 bvhQuantization = bvhInfo.m_quantization;
int numSubtreeHeaders = bvhInfo.m_numSubTrees;
// subtree headers and nodes of all BVHs live in shared arrays; this BVH starts at the offsets stored in bvhInfo
__global const btBvhSubtreeInfo* subtreeHeaders = &subtreeHeadersRoot[bvhInfo.m_subTreeOffset];
__global const btQuantizedBvhNode* quantizedNodes = &quantizedNodesRoot[bvhInfo.m_nodeOffset];
// quantize the AABB of body B with the quantization parameters of this BVH
unsigned short int quantizedQueryAabbMin[3];
unsigned short int quantizedQueryAabbMax[3];
quantizeWithClamp(quantizedQueryAabbMin,aabbs[bodyIndexB].m_min,false,bvhAabbMin, bvhAabbMax,bvhQuantization);
quantizeWithClamp(quantizedQueryAabbMax,aabbs[bodyIndexB].m_max,true ,bvhAabbMin, bvhAabbMax,bvhQuantization);
for (int i=0;i<numSubtreeHeaders;i++)
{
btBvhSubtreeInfo subtree = subtreeHeaders[i];
int overlap = testQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,subtree.m_quantizedAabbMin,subtree.m_quantizedAabbMax);
if (overlap != 0)
{
// walk the node range [m_rootNodeIndex, m_rootNodeIndex+m_subtreeSize) of this subtree
int startNodeIndex = subtree.m_rootNodeIndex;
int endNodeIndex = subtree.m_rootNodeIndex+subtree.m_subtreeSize;
int curIndex = startNodeIndex;
int escapeIndex;
int isLeafNode;
int aabbOverlap;
while (curIndex < endNodeIndex)
{
btQuantizedBvhNode rootNode = quantizedNodes[curIndex];
aabbOverlap = testQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,rootNode.m_quantizedAabbMin,rootNode.m_quantizedAabbMax);
isLeafNode = isLeaf(&rootNode);
if (aabbOverlap)
{
if (isLeafNode)
{
int triangleIndex = getTriangleIndex(&rootNode);
if (shapeTypeB==SHAPE_COMPOUND_OF_CONVEX_HULLS)
{
// reserve numChildrenB consecutive output slots with a single atomic
int numChildrenB = collidables[collidableIndexB].m_numChildShapes;
int pairIdx = atomic_add(numConcavePairsOut,numChildrenB);
for (int b=0;b<numChildrenB;b++)
{
// slots beyond the capacity are counted but not written
if ((pairIdx+b)<maxNumConcavePairsCapacity)
{
int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex+b;
int4 newPair = (int4)(bodyIndexA,bodyIndexB,triangleIndex,childShapeIndexB);
concavePairsOut[pairIdx+b] = newPair;
}
}
} else
{
int pairIdx = atomic_inc(numConcavePairsOut);
if (pairIdx<maxNumConcavePairsCapacity)
{
int4 newPair = (int4)(bodyIndexA,bodyIndexB,triangleIndex,0);
concavePairsOut[pairIdx] = newPair;
}
}
}
// overlapping node (leaf or internal): continue with the next node in the array
curIndex++;
} else
{
if (isLeafNode)
{
curIndex++;
} else
{
// non-overlapping internal node: jump ahead by its escape index
escapeIndex = getEscapeIndex(&rootNode);
curIndex += escapeIndex;
}
}
}
}
}
}

View File

@@ -0,0 +1,313 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
static const char* bvhTraversalKernelCL= \
"//keep this enum in sync with the CPU version (in b3Collidable.h)\n"
"//written by Erwin Coumans\n"
"\n"
"#define SHAPE_CONVEX_HULL 3\n"
"#define SHAPE_CONCAVE_TRIMESH 5\n"
"#define TRIANGLE_NUM_CONVEX_FACES 5\n"
"#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6\n"
"#define SHAPE_SPHERE 7\n"
"\n"
"typedef unsigned int u32;\n"
"\n"
"#define MAX_NUM_PARTS_IN_BITS 10\n"
"\n"
"///b3QuantizedBvhNode is a compressed aabb node, 16 bytes.\n"
"///Node can be used for leafnode or internal node. Leafnodes can point to 32-bit triangle index (non-negative range).\n"
"typedef struct\n"
"{\n"
" //12 bytes\n"
" unsigned short int m_quantizedAabbMin[3];\n"
" unsigned short int m_quantizedAabbMax[3];\n"
" //4 bytes\n"
" int m_escapeIndexOrTriangleIndex;\n"
"} b3QuantizedBvhNode;\n"
"\n"
"typedef struct\n"
"{\n"
" float4 m_aabbMin;\n"
" float4 m_aabbMax;\n"
" float4 m_quantization;\n"
" int m_numNodes;\n"
" int m_numSubTrees;\n"
" int m_nodeOffset;\n"
" int m_subTreeOffset;\n"
"\n"
"} b3BvhInfo;\n"
"\n"
"/*\n"
" bool isLeafNode() const\n"
" {\n"
" //skipindex is negative (internal node), triangleindex >=0 (leafnode)\n"
" return (m_escapeIndexOrTriangleIndex >= 0);\n"
" }\n"
" int getEscapeIndex() const\n"
" {\n"
" b3Assert(!isLeafNode());\n"
" return -m_escapeIndexOrTriangleIndex;\n"
" }\n"
" int getTriangleIndex() const\n"
" {\n"
" b3Assert(isLeafNode());\n"
" unsigned int x=0;\n"
" unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS);\n"
" // Get only the lower bits where the triangle index is stored\n"
" return (m_escapeIndexOrTriangleIndex&~(y));\n"
" }\n"
" int getPartId() const\n"
" {\n"
" b3Assert(isLeafNode());\n"
" // Get only the highest bits where the part index is stored\n"
" return (m_escapeIndexOrTriangleIndex>>(31-MAX_NUM_PARTS_IN_BITS));\n"
" }\n"
"*/\n"
"\n"
"int getTriangleIndex(const b3QuantizedBvhNode* rootNode)\n"
"{\n"
" unsigned int x=0;\n"
" unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS);\n"
" // Get only the lower bits where the triangle index is stored\n"
" return (rootNode->m_escapeIndexOrTriangleIndex&~(y));\n"
"}\n"
"\n"
"int isLeaf(const b3QuantizedBvhNode* rootNode)\n"
"{\n"
" //skipindex is negative (internal node), triangleindex >=0 (leafnode)\n"
" return (rootNode->m_escapeIndexOrTriangleIndex >= 0)? 1 : 0;\n"
"}\n"
" \n"
"int getEscapeIndex(const b3QuantizedBvhNode* rootNode)\n"
"{\n"
" return -rootNode->m_escapeIndexOrTriangleIndex;\n"
"}\n"
"\n"
"typedef struct\n"
"{\n"
" //12 bytes\n"
" unsigned short int m_quantizedAabbMin[3];\n"
" unsigned short int m_quantizedAabbMax[3];\n"
" //4 bytes, points to the root of the subtree\n"
" int m_rootNodeIndex;\n"
" //4 bytes\n"
" int m_subtreeSize;\n"
" int m_padding[3];\n"
"} b3BvhSubtreeInfo;\n"
"\n"
"///keep this in sync with b3Collidable.h\n"
"typedef struct\n"
"{\n"
" int m_numChildShapes;\n"
" int blaat2;\n"
" int m_shapeType;\n"
" int m_shapeIndex;\n"
" \n"
"} b3CollidableGpu;\n"
"\n"
"typedef struct\n"
"{\n"
" float4 m_childPosition;\n"
" float4 m_childOrientation;\n"
" int m_shapeIndex;\n"
" int m_unused0;\n"
" int m_unused1;\n"
" int m_unused2;\n"
"} b3GpuChildShape;\n"
"\n"
"\n"
"typedef struct\n"
"{\n"
" float4 m_pos;\n"
" float4 m_quat;\n"
" float4 m_linVel;\n"
" float4 m_angVel;\n"
"\n"
" u32 m_collidableIdx;\n"
" float m_invMass;\n"
" float m_restituitionCoeff;\n"
" float m_frictionCoeff;\n"
"} BodyData;\n"
"\n"
"typedef struct \n"
"{\n"
" union\n"
" {\n"
" float4 m_min;\n"
" float m_minElems[4];\n"
" int m_minIndices[4];\n"
" };\n"
" union\n"
" {\n"
" float4 m_max;\n"
" float m_maxElems[4];\n"
" int m_maxIndices[4];\n"
" };\n"
"} b3AabbCL;\n"
"\n"
"\n"
"int testQuantizedAabbAgainstQuantizedAabb(\n"
" const unsigned short int* aabbMin1,\n"
" const unsigned short int* aabbMax1,\n"
" const unsigned short int* aabbMin2,\n"
" const unsigned short int* aabbMax2)\n"
"{\n"
" //int overlap = 1;\n"
" if (aabbMin1[0] > aabbMax2[0])\n"
" return 0;\n"
" if (aabbMax1[0] < aabbMin2[0])\n"
" return 0;\n"
" if (aabbMin1[1] > aabbMax2[1])\n"
" return 0;\n"
" if (aabbMax1[1] < aabbMin2[1])\n"
" return 0;\n"
" if (aabbMin1[2] > aabbMax2[2])\n"
" return 0;\n"
" if (aabbMax1[2] < aabbMin2[2])\n"
" return 0;\n"
" return 1;\n"
" //overlap = ((aabbMin1[0] > aabbMax2[0]) || (aabbMax1[0] < aabbMin2[0])) ? 0 : overlap;\n"
" //overlap = ((aabbMin1[2] > aabbMax2[2]) || (aabbMax1[2] < aabbMin2[2])) ? 0 : overlap;\n"
" //overlap = ((aabbMin1[1] > aabbMax2[1]) || (aabbMax1[1] < aabbMin2[1])) ? 0 : overlap;\n"
" //return overlap;\n"
"}\n"
"\n"
"\n"
"void quantizeWithClamp(unsigned short* out, float4 point2,int isMax, float4 bvhAabbMin, float4 bvhAabbMax, float4 bvhQuantization)\n"
"{\n"
" float4 clampedPoint = max(point2,bvhAabbMin);\n"
" clampedPoint = min (clampedPoint, bvhAabbMax);\n"
"\n"
" float4 v = (clampedPoint - bvhAabbMin) * bvhQuantization;\n"
" if (isMax)\n"
" {\n"
" out[0] = (unsigned short) (((unsigned short)(v.x+1.f) | 1));\n"
" out[1] = (unsigned short) (((unsigned short)(v.y+1.f) | 1));\n"
" out[2] = (unsigned short) (((unsigned short)(v.z+1.f) | 1));\n"
" } else\n"
" {\n"
" out[0] = (unsigned short) (((unsigned short)(v.x) & 0xfffe));\n"
" out[1] = (unsigned short) (((unsigned short)(v.y) & 0xfffe));\n"
" out[2] = (unsigned short) (((unsigned short)(v.z) & 0xfffe));\n"
" }\n"
"\n"
"}\n"
"\n"
"\n"
"// work-in-progress\n"
"__kernel void bvhTraversalKernel( __global const int2* pairs, \n"
" __global const BodyData* rigidBodies, \n"
" __global const b3CollidableGpu* collidables,\n"
" __global b3AabbCL* aabbs,\n"
" __global int4* concavePairsOut,\n"
" __global volatile int* numConcavePairsOut,\n"
" __global const b3BvhSubtreeInfo* subtreeHeadersRoot,\n"
" __global const b3QuantizedBvhNode* quantizedNodesRoot,\n"
" __global const b3BvhInfo* bvhInfos,\n"
" int numPairs,\n"
" int maxNumConcavePairsCapacity)\n"
"{\n"
" int id = get_global_id(0);\n"
" if (id>=numPairs)\n"
" return;\n"
" \n"
" int bodyIndexA = pairs[id].x;\n"
" int bodyIndexB = pairs[id].y;\n"
" int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
" int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n"
" \n"
" //once the broadphase avoids static-static pairs, we can remove this test\n"
" if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))\n"
" {\n"
" return;\n"
" }\n"
" \n"
" if (collidables[collidableIndexA].m_shapeType!=SHAPE_CONCAVE_TRIMESH)\n"
" return;\n"
"\n"
" int shapeTypeB = collidables[collidableIndexB].m_shapeType;\n"
" \n"
" if (shapeTypeB!=SHAPE_CONVEX_HULL &&\n"
" shapeTypeB!=SHAPE_SPHERE &&\n"
" shapeTypeB!=SHAPE_COMPOUND_OF_CONVEX_HULLS\n"
" )\n"
" return;\n"
"\n"
" b3BvhInfo bvhInfo = bvhInfos[collidables[collidableIndexA].m_numChildShapes];\n"
"\n"
" float4 bvhAabbMin = bvhInfo.m_aabbMin;\n"
" float4 bvhAabbMax = bvhInfo.m_aabbMax;\n"
" float4 bvhQuantization = bvhInfo.m_quantization;\n"
" int numSubtreeHeaders = bvhInfo.m_numSubTrees;\n"
" __global const b3BvhSubtreeInfo* subtreeHeaders = &subtreeHeadersRoot[bvhInfo.m_subTreeOffset];\n"
" __global const b3QuantizedBvhNode* quantizedNodes = &quantizedNodesRoot[bvhInfo.m_nodeOffset];\n"
" \n"
"\n"
" unsigned short int quantizedQueryAabbMin[3];\n"
" unsigned short int quantizedQueryAabbMax[3];\n"
" quantizeWithClamp(quantizedQueryAabbMin,aabbs[bodyIndexB].m_min,false,bvhAabbMin, bvhAabbMax,bvhQuantization);\n"
" quantizeWithClamp(quantizedQueryAabbMax,aabbs[bodyIndexB].m_max,true ,bvhAabbMin, bvhAabbMax,bvhQuantization);\n"
" \n"
" for (int i=0;i<numSubtreeHeaders;i++)\n"
" {\n"
" b3BvhSubtreeInfo subtree = subtreeHeaders[i];\n"
" \n"
" int overlap = testQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,subtree.m_quantizedAabbMin,subtree.m_quantizedAabbMax);\n"
" if (overlap != 0)\n"
" {\n"
" int startNodeIndex = subtree.m_rootNodeIndex;\n"
" int endNodeIndex = subtree.m_rootNodeIndex+subtree.m_subtreeSize;\n"
" int curIndex = startNodeIndex;\n"
" int escapeIndex;\n"
" int isLeafNode;\n"
" int aabbOverlap;\n"
" while (curIndex < endNodeIndex)\n"
" {\n"
" b3QuantizedBvhNode rootNode = quantizedNodes[curIndex];\n"
" aabbOverlap = testQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,rootNode.m_quantizedAabbMin,rootNode.m_quantizedAabbMax);\n"
" isLeafNode = isLeaf(&rootNode);\n"
" if (aabbOverlap)\n"
" {\n"
" if (isLeafNode)\n"
" {\n"
" int triangleIndex = getTriangleIndex(&rootNode);\n"
" if (shapeTypeB==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n"
" {\n"
" int numChildrenB = collidables[collidableIndexB].m_numChildShapes;\n"
" int pairIdx = atomic_add(numConcavePairsOut,numChildrenB);\n"
" for (int b=0;b<numChildrenB;b++)\n"
" {\n"
" if ((pairIdx+b)<maxNumConcavePairsCapacity)\n"
" {\n"
" int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex+b;\n"
" int4 newPair = (int4)(bodyIndexA,bodyIndexB,triangleIndex,childShapeIndexB);\n"
" concavePairsOut[pairIdx+b] = newPair;\n"
" }\n"
" }\n"
" } else\n"
" {\n"
" int pairIdx = atomic_inc(numConcavePairsOut);\n"
" if (pairIdx<maxNumConcavePairsCapacity)\n"
" {\n"
" int4 newPair = (int4)(bodyIndexA,bodyIndexB,triangleIndex,0);\n"
" concavePairsOut[pairIdx] = newPair;\n"
" }\n"
" }\n"
" } \n"
" curIndex++;\n"
" } else\n"
" {\n"
" if (isLeafNode)\n"
" {\n"
" curIndex++;\n"
" } else\n"
" {\n"
" escapeIndex = getEscapeIndex(&rootNode);\n"
" curIndex += escapeIndex;\n"
" }\n"
" }\n"
" }\n"
" }\n"
" }\n"
"\n"
"}\n"
;

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,213 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
//Host-code rewritten by Erwin Coumans
#define BOUNDSEARCH_PATH "src/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernels.cl"
#define KERNEL0 "SearchSortDataLowerKernel"
#define KERNEL1 "SearchSortDataUpperKernel"
#define KERNEL2 "SubtractKernel"
#include "b3BoundSearchCL.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "b3LauncherCL.h"
#include "kernels/BoundSearchKernelsCL.h"
// Compiles the bound search OpenCL program and creates the kernels and helper objects.
// ctx, device, queue: OpenCL objects used for compilation, buffer creation and kernel launches.
// maxSize: number of elements of the internal lower/upper scratch buffers that the COUNT option
//          of execute() needs. When 0, neither those buffers nor the subtract kernel are created
//          (m_lower, m_upper and m_subtractKernel stay 0) and COUNT must not be used.
b3BoundSearchCL::b3BoundSearchCL(cl_context ctx, cl_device_id device, cl_command_queue queue, int maxSize)
	:m_context(ctx),
	m_device(device),
	m_queue(queue)
{
	const char* additionalMacros = "";

	//start from a defined value, the compile helpers receive a pointer to it
	cl_int pErrNum = 0;
	const char* kernelSource = boundSearchKernelsCL;

	//this member is never allocated in this file; give it a defined value
	m_constbtOpenCLArray = 0;

	//NOTE(review): boundSearchProg is not released in this file - check whether b3OpenCLUtils keeps ownership of it
	cl_program boundSearchProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, kernelSource, &pErrNum,additionalMacros, BOUNDSEARCH_PATH);
	b3Assert(boundSearchProg);

	m_lowerSortDataKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SearchSortDataLowerKernel", &pErrNum, boundSearchProg,additionalMacros );
	b3Assert(m_lowerSortDataKernel );

	m_upperSortDataKernel= b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SearchSortDataUpperKernel", &pErrNum, boundSearchProg,additionalMacros );
	b3Assert(m_upperSortDataKernel);

	//the subtract kernel is only needed for the COUNT option, which also needs the scratch buffers
	m_subtractKernel = 0;
	if( maxSize )
	{
		m_subtractKernel= b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SubtractKernel", &pErrNum, boundSearchProg,additionalMacros );
		b3Assert(m_subtractKernel);
	}

	//m_constBuffer = new b3OpenCLArray<b3Int4>( device, 1, BufferBase::BUFFER_CONST );

	m_lower = (maxSize == 0)? 0: new b3OpenCLArray<unsigned int>(ctx,queue,maxSize );
	m_upper = (maxSize == 0)? 0: new b3OpenCLArray<unsigned int>(ctx,queue, maxSize );

	m_filler = new b3FillCL(ctx,device,queue);
}
// Frees the scratch buffers and the fill helper and releases the kernels created by the constructor.
b3BoundSearchCL::~b3BoundSearchCL()
{
	//m_lower and m_upper are 0 when constructed with maxSize == 0; deleting 0 is a no-op
	delete m_lower;
	delete m_upper;
	delete m_filler;

	clReleaseKernel(m_lowerSortDataKernel);
	clReleaseKernel(m_upperSortDataKernel);

	//the subtract kernel only exists when the object was constructed with a non-zero maxSize,
	//don't hand a null handle to OpenCL
	if (m_subtractKernel)
	{
		clReleaseKernel(m_subtractKernel);
	}
}
// Runs the bound search on the GPU.
// src:    sort data, must be ordered so that src[i].m_key <= src[i+1].m_key
// nSrc:   number of elements of src to process
// dst:    output buffer, passed to the kernels together with nDst
// nDst:   element count passed to the kernels as second constant
// option: BOUND_LOWER / BOUND_UPPER launch the matching search kernel over nSrc work-items.
//         COUNT clears the internal lower and upper buffers, runs both searches into them and
//         then launches the subtract kernel over nDst work-items with dst as output.
//         COUNT requires an object constructed with maxSize >= nDst.
void b3BoundSearchCL::execute(b3OpenCLArray<b3SortData>& src, int nSrc, b3OpenCLArray<unsigned int>& dst, int nDst, Option option )
{
	if( option == BOUND_LOWER || option == BOUND_UPPER )
	{
		//both searches take the same arguments, only the kernel differs
		cl_kernel searchKernel = (option == BOUND_LOWER) ? m_lowerSortDataKernel : m_upperSortDataKernel;

		b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src.getBufferCL(), true ), b3BufferInfoCL( dst.getBufferCL()) };

		b3LauncherCL launcher( m_queue, searchKernel );
		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
		launcher.setConst( nSrc );
		launcher.setConst( nDst );

		launcher.launch1D( nSrc, 64 );
	}
	else if( option == COUNT )
	{
		//these only exist when the object was constructed with a non-zero maxSize
		b3Assert( m_lower );
		b3Assert( m_upper );
		b3Assert( m_subtractKernel );
		//nDst elements are written into each scratch buffer, so they have to be at least that large
		b3Assert( m_lower->capacity() >= (int)nDst );
		b3Assert( m_upper->capacity() >= (int)nDst );

		int zero = 0;
		m_filler->execute( *m_lower, zero, nDst );
		m_filler->execute( *m_upper, zero, nDst );

		execute( src, nSrc, *m_lower, nDst, BOUND_LOWER );
		execute( src, nSrc, *m_upper, nDst, BOUND_UPPER );

		{
			b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_upper->getBufferCL(), true ), b3BufferInfoCL( m_lower->getBufferCL(), true ), b3BufferInfoCL( dst.getBufferCL() ) };

			b3LauncherCL launcher( m_queue, m_subtractKernel );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
			launcher.setConst( nSrc );
			launcher.setConst( nDst );

			launcher.launch1D( nDst, 64 );
		}
	}
	else
	{
		b3Assert( 0 );
	}
}
// CPU version of the bound search.
// src:    sort data, must be ordered so that src[i].m_key <= src[i+1].m_key (asserted)
// nSrc:   number of elements of src to process
// dst:    output array, indexed by key; every key in src has to be smaller than nDst
// nDst:   number of elements of dst
// option: BOUND_LOWER writes, for every key, the index of its first element in src.
//         BOUND_UPPER writes, for every key, one past the index of its last element in src.
//         COUNT writes, for every i in [0,nDst), upper[i]-lower[i]; keys that do not occur give 0.
//         For BOUND_LOWER and BOUND_UPPER the entries of keys that do not occur are left untouched.
void b3BoundSearchCL::executeHost( b3AlignedObjectArray<b3SortData>& src, int nSrc,
	b3AlignedObjectArray<unsigned int>& dst, int nDst, Option option )
{
	for(int i=0; i<nSrc-1; i++)
		b3Assert( src[i].m_key <= src[i+1].m_key );

	//sentinels: minData stands in front of src[0], maxData stands behind src[nSrc-1]
	b3SortData minData,maxData;
	minData.m_key = -1;
	minData.m_value = -1;
	maxData.m_key = nDst;
	maxData.m_value = nDst;

	if( option == BOUND_LOWER )
	{
		for(int i=0; i<nSrc; i++)
		{
			b3SortData& iData = (i==0)? minData: src[i-1];
			b3SortData& jData = src[i];

			//a key change between i-1 and i means element i is the first one with its key
			if( iData.m_key != jData.m_key )
			{
				int k = jData.m_key;
				b3Assert( k>=0 && k<nDst );
				dst[k] = i;
			}
		}
	}
	else if( option == BOUND_UPPER )
	{
		for(int i=1; i<nSrc+1; i++)
		{
			b3SortData& iData = src[i-1];
			b3SortData& jData = (i==nSrc)? maxData: src[i];

			//a key change between i-1 and i means element i-1 is the last one with its key
			if( iData.m_key != jData.m_key )
			{
				int k = iData.m_key;
				b3Assert( k>=0 && k<nDst );
				dst[k] = i;
			}
		}
	}
	else if( option == COUNT )
	{
		b3AlignedObjectArray<unsigned int> lower;
		lower.resize(nDst );
		b3AlignedObjectArray<unsigned int> upper;
		upper.resize(nDst );

		//keys that do not occur keep lower == upper == 0, so their count is 0
		for(int i=0; i<nDst; i++)
		{
			lower[i] = upper[i] = 0;
		}

		executeHost( src, nSrc, lower, nDst, BOUND_LOWER );
		executeHost( src, nSrc, upper, nDst, BOUND_UPPER );

		for( int i=0; i<nDst; i++)
		{
			dst[i] = upper[i] - lower[i];
		}
	}
	else
	{
		b3Assert( 0 );
	}
}

View File

@@ -0,0 +1,67 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#ifndef B3_BOUNDSEARCH_H
#define B3_BOUNDSEARCH_H
#pragma once
/*#include <Adl/Adl.h>
#include <AdlPrimitives/Math/Math.h>
#include <AdlPrimitives/Sort/SortData.h>
#include <AdlPrimitives/Fill/Fill.h>
*/
#include "b3OpenCLArray.h"
#include "b3FillCL.h"
#include "b3RadixSort32CL.h" //for b3SortData (perhaps move it?)
// Bound search over key-sorted b3SortData, with an OpenCL path (execute) and a CPU path (executeHost).
// The object owns m_lower, m_upper and m_filler (deleted in the destructor) and the three kernels.
// NOTE(review): no copy constructor/assignment is declared, so copying an instance would lead to double deletes.
class b3BoundSearchCL
{
public:
// selects what execute/executeHost compute
enum Option
{
BOUND_LOWER,
BOUND_UPPER,
COUNT,
};
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_queue;
cl_kernel m_lowerSortDataKernel;
cl_kernel m_upperSortDataKernel;
// stays 0 when the object is constructed with size 0
cl_kernel m_subtractKernel;
// NOTE(review): not allocated or used by the implementation shown in b3BoundSearchCL.cpp
b3OpenCLArray<b3Int4>* m_constbtOpenCLArray;
// scratch buffers for the COUNT option; both are 0 when the object is constructed with size 0
b3OpenCLArray<unsigned int>* m_lower;
b3OpenCLArray<unsigned int>* m_upper;
// used to clear m_lower and m_upper before a COUNT
b3FillCL* m_filler;
// size: number of elements of the scratch buffers for COUNT, 0 to create none
b3BoundSearchCL(cl_context context, cl_device_id device, cl_command_queue queue, int size);
virtual ~b3BoundSearchCL();
// src has to be src[i].m_key <= src[i+1].m_key
void execute( b3OpenCLArray<b3SortData>& src, int nSrc, b3OpenCLArray<unsigned int>& dst, int nDst, Option option = BOUND_LOWER );
// CPU implementation with the same arguments, operating on host arrays
void executeHost( b3AlignedObjectArray<b3SortData>& src, int nSrc, b3AlignedObjectArray<unsigned int>& dst, int nDst, Option option = BOUND_LOWER);
};
#endif //B3_BOUNDSEARCH_H

View File

@@ -0,0 +1,19 @@
#ifndef B3_BUFFER_INFO_CL_H
#define B3_BUFFER_INFO_CL_H
#include "b3OpenCLArray.h"
// Pairs an OpenCL buffer handle with a read-only flag, used to pass buffers to b3LauncherCL::setBuffers.
struct b3BufferInfoCL
{
//b3BufferInfoCL(){}
// template<typename T>
// buff: the OpenCL buffer, isReadOnly: defaults to false
b3BufferInfoCL(cl_mem buff, bool isReadOnly = false): m_clBuffer(buff), m_isReadOnly(isReadOnly){}
cl_mem m_clBuffer;
bool m_isReadOnly;
};
#endif //B3_BUFFER_INFO_CL_H

View File

@@ -0,0 +1,126 @@
#include "b3FillCL.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "b3BufferInfoCL.h"
#include "b3LauncherCL.h"
#define FILL_CL_PROGRAM_PATH "src/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernels.cl"
#include "kernels/FillKernelsCL.h"
// Compiles the fill program and creates one kernel per supported element type
// (int, unsigned int, float, int2). queue is stored and used for all launches.
b3FillCL::b3FillCL(cl_context ctx, cl_device_id device, cl_command_queue queue)
:m_commandQueue(queue)
{
	const char* kernelSource = fillKernelsCL;
	//start from a defined value, the compile helpers receive a pointer to it
	cl_int pErrNum = 0;
	const char* additionalMacros = "";

	cl_program fillProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, kernelSource, &pErrNum,additionalMacros, FILL_CL_PROGRAM_PATH);
	b3Assert(fillProg);

	m_fillIntKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillIntKernel", &pErrNum, fillProg,additionalMacros );
	b3Assert(m_fillIntKernel);

	m_fillUnsignedIntKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillUnsignedIntKernel", &pErrNum, fillProg,additionalMacros );
	//check the kernel that was just created (this used to re-check m_fillIntKernel)
	b3Assert(m_fillUnsignedIntKernel);

	m_fillFloatKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillFloatKernel", &pErrNum, fillProg,additionalMacros );
	b3Assert(m_fillFloatKernel);

	m_fillKernelInt2 = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillInt2Kernel", &pErrNum, fillProg,additionalMacros );
	b3Assert(m_fillKernelInt2);
}
// Releases the four fill kernels created by the constructor.
b3FillCL::~b3FillCL()
{
	cl_kernel ownedKernels[] = { m_fillKernelInt2, m_fillIntKernel, m_fillUnsignedIntKernel, m_fillFloatKernel };
	const int numOwnedKernels = sizeof(ownedKernels)/sizeof(ownedKernels[0]);
	for (int k=0;k<numOwnedKernels;k++)
	{
		clReleaseKernel(ownedKernels[k]);
	}
}
// Launches the float fill kernel over n work-items.
// Kernel arguments, in order: the buffer of src, n, value, offset. n has to be positive.
void b3FillCL::execute(b3OpenCLArray<float>& src, const float value, int n, int offset)
{
	b3Assert( n>0 );

	b3LauncherCL fillLauncher( m_commandQueue, m_fillFloatKernel );
	b3BufferInfoCL target[] = { b3BufferInfoCL( src.getBufferCL() ) };
	fillLauncher.setBuffers( target, sizeof(target)/sizeof(b3BufferInfoCL) );
	fillLauncher.setConst( n );
	fillLauncher.setConst( value );
	fillLauncher.setConst( offset);
	fillLauncher.launch1D( n );
}
// Launches the int fill kernel over n work-items.
// Kernel arguments, in order: the buffer of src, n, value, offset. n has to be positive.
void b3FillCL::execute(b3OpenCLArray<int>& src, const int value, int n, int offset)
{
	b3Assert( n>0 );

	b3LauncherCL fillLauncher( m_commandQueue, m_fillIntKernel );
	b3BufferInfoCL target[] = { b3BufferInfoCL( src.getBufferCL() ) };
	fillLauncher.setBuffers( target, sizeof(target)/sizeof(b3BufferInfoCL) );
	fillLauncher.setConst( n);
	fillLauncher.setConst( value);
	fillLauncher.setConst( offset);
	fillLauncher.launch1D( n );
}
// Launches the unsigned int fill kernel over n work-items.
// Kernel arguments, in order: the buffer of src, n, value, offset. n has to be positive.
void b3FillCL::execute(b3OpenCLArray<unsigned int>& src, const unsigned int value, int n, int offset)
{
	b3Assert( n>0 );

	b3LauncherCL fillLauncher( m_commandQueue, m_fillUnsignedIntKernel );
	fillLauncher.setBuffer( src.getBufferCL() );
	fillLauncher.setConst( n );
	fillLauncher.setConst(value);
	fillLauncher.setConst(offset);
	fillLauncher.launch1D( n );
}
// CPU fill: assigns value to the n elements src[offset] .. src[offset+n-1].
void b3FillCL::executeHost(b3AlignedObjectArray<b3Int2> &src, const b3Int2 &value, int n, int offset)
{
	int written = 0;
	while (written < n)
	{
		src[offset+written] = value;
		written++;
	}
}
// CPU fill: assigns value to the n elements src[offset] .. src[offset+n-1].
void b3FillCL::executeHost(b3AlignedObjectArray<int> &src, const int value, int n, int offset)
{
	int written = 0;
	while (written < n)
	{
		src[offset+written] = value;
		written++;
	}
}
// Launches the int2 fill kernel over n work-items.
// Kernel arguments, in order: the buffer of src, n, value, offset. n has to be positive.
void b3FillCL::execute(b3OpenCLArray<b3Int2> &src, const b3Int2 &value, int n, int offset)
{
	b3Assert( n>0 );

	b3LauncherCL fillLauncher(m_commandQueue, m_fillKernelInt2);
	fillLauncher.setBuffer( src.getBufferCL() );
	fillLauncher.setConst(n);
	fillLauncher.setConst(value);
	fillLauncher.setConst(offset);
	fillLauncher.launch1D( n );
}

View File

@@ -0,0 +1,63 @@
#ifndef B3_FILL_CL_H
#define B3_FILL_CL_H
#include "b3OpenCLArray.h"
#include "Bullet3Common/b3Scalar.h"
#include "Bullet3Common/b3Int2.h"
#include "Bullet3Common/b3Int4.h"
// Fills (a range of) an array with a constant value, on the GPU (execute) or on the CPU (executeHost).
// Owns one OpenCL kernel per supported element type; they are released in the destructor.
class b3FillCL
{
// queue used for all kernel launches
cl_command_queue m_commandQueue;
cl_kernel m_fillKernelInt2;
cl_kernel m_fillIntKernel;
cl_kernel m_fillUnsignedIntKernel;
cl_kernel m_fillFloatKernel;
public:
// NOTE(review): not referenced by the implementation in b3FillCL.cpp, the kernels receive n, value and offset as separate arguments
struct b3ConstData
{
union
{
b3Int4 m_data;
b3UnsignedInt4 m_UnsignedData;
};
int m_offset;
int m_n;
int m_padding[2];
};
protected:
public:
b3FillCL(cl_context ctx, cl_device_id device, cl_command_queue queue);
virtual ~b3FillCL();
// GPU fills: launch the kernel for the element type over n work-items, passing n, value and offset; n has to be positive
void execute(b3OpenCLArray<unsigned int>& src, const unsigned int value, int n, int offset = 0);
void execute(b3OpenCLArray<int>& src, const int value, int n, int offset = 0);
void execute(b3OpenCLArray<float>& src, const float value, int n, int offset = 0);
void execute(b3OpenCLArray<b3Int2>& src, const b3Int2& value, int n, int offset = 0);
// CPU fills: assign value to src[offset] .. src[offset+n-1]
void executeHost(b3AlignedObjectArray<b3Int2> &src, const b3Int2 &value, int n, int offset);
void executeHost(b3AlignedObjectArray<int> &src, const int value, int n, int offset);
// void execute(b3OpenCLArray<b3Int4>& src, const b3Int4& value, int n, int offset = 0);
};
#endif //B3_FILL_CL_H

View File

@@ -0,0 +1,363 @@
#ifndef B3_LAUNCHER_CL_H
#define B3_LAUNCHER_CL_H
#include "b3BufferInfoCL.h"
#include "Bullet3Common/b3MinMax.h"
#include "b3OpenCLArray.h"
#include <stdio.h>
#ifdef _WIN32
#pragma warning(disable :4996)
#endif
#define B3_CL_MAX_ARG_SIZE 16
// Record of one kernel argument as tracked by b3LauncherCL.
// The struct is copied byte for byte into the serialization buffer (see b3LauncherCL::serializeArguments),
// so changing its layout changes the serialized format.
struct b3KernelArgData
{
// 1 for an OpenCL buffer argument, 0 for a constant passed by value
int m_isBuffer;
// position of the argument in the kernel argument list
int m_argIndex;
// buffers: size of the OpenCL buffer (CL_MEM_SIZE), constants: sizeof the constant
int m_argSizeInBytes;
union
{
// valid when m_isBuffer is set
cl_mem m_clBuffer;
// valid otherwise: the raw bytes of the constant, at most B3_CL_MAX_ARG_SIZE
unsigned char m_argData[B3_CL_MAX_ARG_SIZE];
};
};
class b3LauncherCL
{
cl_command_queue m_commandQueue;
cl_kernel m_kernel;
int m_idx;
b3AlignedObjectArray<b3KernelArgData> m_kernelArguments;
int m_serializationSizeInBytes;
public:
b3AlignedObjectArray<b3OpenCLArray<unsigned char>* > m_arrays;
// Binds the launcher to a queue and a kernel; arguments are added afterwards, starting at index 0.
// The serialization size starts at sizeof(int): serializeArguments() writes the argument count first.
b3LauncherCL(cl_command_queue queue, cl_kernel kernel)
	:m_commandQueue(queue),
	m_kernel(kernel),
	m_idx(0),
	m_serializationSizeInBytes(sizeof(int))
{
}
// Destroys the arrays that deserializeArgs() allocated with new and stored in m_arrays.
// Deleting the b3OpenCLArray frees the host-side object as well; releasing only its cl_mem
// (as was done before) leaked the array object itself.
// NOTE(review): relies on the b3OpenCLArray destructor releasing the OpenCL buffer it allocated - confirm in b3OpenCLArray.h
virtual ~b3LauncherCL()
{
	for (int i=0;i<m_arrays.size();i++)
	{
		delete m_arrays[i];
	}
	m_arrays.clear();
}
// Appends clBuffer as the next kernel argument.
// The size of the buffer is queried (CL_MEM_SIZE) and recorded, because serializeArguments()
// stores the buffer content behind the argument record.
inline void setBuffer( cl_mem clBuffer)
{
	//ask OpenCL how large the buffer is
	size_t bufferBytes;
	size_t returnedBytes;
	cl_int err = clGetMemObjectInfo ( clBuffer,
		CL_MEM_SIZE,
		sizeof(size_t),
		&bufferBytes,
		&returnedBytes);
	b3Assert( err == CL_SUCCESS );

	b3KernelArgData bufferArg;
	bufferArg.m_argIndex = m_idx;
	bufferArg.m_isBuffer = 1;
	bufferArg.m_clBuffer = clBuffer;
	bufferArg.m_argSizeInBytes = bufferBytes;
	m_kernelArguments.push_back(bufferArg);

	//one argument record plus the buffer content
	m_serializationSizeInBytes+= sizeof(b3KernelArgData);
	m_serializationSizeInBytes+=bufferBytes;

	cl_int status = clSetKernelArg( m_kernel, m_idx++, sizeof(cl_mem), &clBuffer);
	b3Assert( status == CL_SUCCESS );
}
// Appends the n buffers of buffInfo, in order, as the next kernel arguments.
// Each one is handled exactly like a call to setBuffer(); the m_isReadOnly flag is not used.
inline void setBuffers( b3BufferInfoCL* buffInfo, int n )
{
	for(int i=0; i<n; i++)
	{
		//single implementation of the bookkeeping lives in setBuffer
		setBuffer( buffInfo[i].m_clBuffer );
	}
}
// Returns the running total of bytes needed to serialize the arguments added so far:
// sizeof(int) for the count, one b3KernelArgData per argument, plus the content size of every buffer argument.
int getSerializationBufferSize() const
{
return m_serializationSizeInBytes;
}
// Rebuilds the kernel arguments from a buffer in the format written by serializeArguments():
// an int argument count, then per argument a b3KernelArgData record, followed by the buffer
// content for buffer arguments. Buffer arguments get a new OpenCL array (kept in m_arrays)
// filled with the stored content. Returns the number of bytes consumed.
// NOTE(review): bufSize is not used, nothing stops the reads from running past the end of buf.
// NOTE(review): buf is modified in place (m_clBuffer of the stored records is overwritten).
inline int deserializeArgs(unsigned char* buf, int bufSize, cl_context ctx)
{
int index=0;
int numArguments = *(int*) &buf[index];
index+=sizeof(int);
for (int i=0;i<numArguments;i++)
{
b3KernelArgData* arg = (b3KernelArgData*)&buf[index];
index+=sizeof(b3KernelArgData);
if (arg->m_isBuffer)
{
// recreate the buffer on the device and upload the stored content
b3OpenCLArray<unsigned char>* clData = new b3OpenCLArray<unsigned char>(ctx,m_commandQueue, arg->m_argSizeInBytes);
clData->resize(arg->m_argSizeInBytes);
clData->copyFromHostPointer(&buf[index], arg->m_argSizeInBytes);
// the stored cl_mem handle is replaced by the handle of the new buffer
arg->m_clBuffer = clData->getBufferCL();
m_arrays.push_back(clData);
cl_int status = clSetKernelArg( m_kernel, m_idx++, sizeof(cl_mem), &arg->m_clBuffer);
b3Assert( status == CL_SUCCESS );
index+=arg->m_argSizeInBytes;
} else
{
// constants are stored inside the record itself
cl_int status = clSetKernelArg( m_kernel, m_idx++, arg->m_argSizeInBytes, &arg->m_argData);
b3Assert( status == CL_SUCCESS );
}
m_kernelArguments.push_back(*arg);
}
m_serializationSizeInBytes = index;
return index;
}
// Compares the current kernel arguments against a reference ("gold") buffer in the format
// written by serializeArguments(). Buffer arguments are read back from the device and
// compared byte by byte, constants are compared through their stored bytes.
// goldBufferCapacity and ctx are not used.
// Returns the number of bytes of goldBuffer that were consumed when everything matches, otherwise
//   -1 different number of arguments
//   -2 different argument size
//   -3 buffer/constant kind differs
//   -4 buffer content differs
//   -5 constant content differs
inline int validateResults(unsigned char* goldBuffer, int goldBufferCapacity, cl_context ctx)
{
	int index=0;

	int numArguments = *(int*) &goldBuffer[index];
	index+=sizeof(int);

	if (numArguments != m_kernelArguments.size())
	{
		printf("failed validation: expected %d arguments, found %d\n",numArguments, m_kernelArguments.size());
		return -1;
	}

	for (int ii=0;ii<numArguments;ii++)
	{
		b3KernelArgData* argGold = (b3KernelArgData*)&goldBuffer[index];

		if (m_kernelArguments[ii].m_argSizeInBytes != argGold->m_argSizeInBytes)
		{
			printf("failed validation: argument %d sizeInBytes expected: %d, found %d\n",ii, argGold->m_argSizeInBytes, m_kernelArguments[ii].m_argSizeInBytes);
			return -2;
		}

		{
			int expected = argGold->m_isBuffer;
			int found = m_kernelArguments[ii].m_isBuffer;

			if (expected != found)
			{
				printf("failed validation: argument %d isBuffer expected: %d, found %d\n",ii,expected, found);
				return -3;
			}
		}
		index+=sizeof(b3KernelArgData);

		const int argSizeInBytes = m_kernelArguments[ii].m_argSizeInBytes;

		if (argGold->m_isBuffer)
		{
			unsigned char* memBuf= (unsigned char*) malloc(argSizeInBytes);
			unsigned char* goldBuf = &goldBuffer[index];

			//known pattern, so bytes that the read does not touch are detected
			//(the bound used to be taken from argument j instead of argument ii)
			for (int j=0;j<argSizeInBytes;j++)
			{
				memBuf[j] = 0xaa;
			}

			cl_int status = 0;
			status = clEnqueueReadBuffer( m_commandQueue, m_kernelArguments[ii].m_clBuffer, CL_TRUE, 0, argSizeInBytes,
				memBuf, 0,0,0 );
			b3Assert( status==CL_SUCCESS );
			clFinish(m_commandQueue);

			for (int b=0;b<argSizeInBytes;b++)
			{
				int expected = goldBuf[b];
				int found = memBuf[b];
				if (expected != found)
				{
					printf("failed validation: argument %d OpenCL data at byte position %d expected: %d, found %d\n",
						ii, b, expected, found);
					free(memBuf);
					return -4;
				}
			}

			//the read-back copy is only needed for the comparison
			free(memBuf);

			index+=argGold->m_argSizeInBytes;
		} else
		{
			//compare content
			for (int b=0;b<argSizeInBytes;b++)
			{
				int expected = argGold->m_argData[b];
				int found =m_kernelArguments[ii].m_argData[b];
				if (expected != found)
				{
					printf("failed validation: argument %d const data at byte position %d expected: %d, found %d\n",
						ii, b, expected, found);
					return -5;
				}
			}
		}
	}
	return index;
}
inline int serializeArguments(unsigned char* destBuffer, int destBufferCapacity)
{
//initialize to known values
for (int i=0;i<destBufferCapacity;i++)
destBuffer[i] = 0xec;
assert(destBufferCapacity>=m_serializationSizeInBytes);
//todo: use the b3Serializer for this to allow for 32/64bit, endianness etc
int numArguments = m_kernelArguments.size();
int curBufferSize = 0;
int* dest = (int*)&destBuffer[curBufferSize];
*dest = numArguments;
curBufferSize += sizeof(int);
for (int i=0;i<this->m_kernelArguments.size();i++)
{
b3KernelArgData* arg = (b3KernelArgData*) &destBuffer[curBufferSize];
*arg = m_kernelArguments[i];
curBufferSize+=sizeof(b3KernelArgData);
if (arg->m_isBuffer==1)
{
//copy the OpenCL buffer content
cl_int status = 0;
status = clEnqueueReadBuffer( m_commandQueue, arg->m_clBuffer, 0, 0, arg->m_argSizeInBytes,
&destBuffer[curBufferSize], 0,0,0 );
b3Assert( status==CL_SUCCESS );
clFinish(m_commandQueue);
curBufferSize+=arg->m_argSizeInBytes;
}
}
return curBufferSize;
}
/// Serializes all recorded kernel arguments plus the work-item count into a binary file.
/// File layout: the bytes produced by serializeArguments() (getSerializationBufferSize() bytes),
/// followed by numWorkItems stored as a native int.
/// @param fileName      path of the file to create/overwrite
/// @param numWorkItems  value appended after the serialized arguments
/// On failure to open or write the file an error is printed and nothing else happens.
void serializeToFile(const char* fileName, int numWorkItems)
{
    const int buffSize = getSerializationBufferSize();
    const size_t totalSize = buffSize + sizeof(int);
    unsigned char* buf = new unsigned char[totalSize];

    // Fill with 0xff; the byte just past the argument area acts as an overrun sentinel.
    for (size_t i = 0; i < totalSize; i++)
    {
        buf[i] = 0xff;
    }

    serializeArguments(buf, buffSize);
    assert(buf[buffSize]==0xff);//check for buffer overrun

    // Append the work-item count byte by byte: buf+buffSize is not guaranteed to be int-aligned.
    const unsigned char* numBytes = (const unsigned char*)&numWorkItems;
    for (size_t b = 0; b < sizeof(int); b++)
    {
        buf[buffSize + b] = numBytes[b];
    }

    FILE* f = fileName ? fopen(fileName, "wb") : 0;
    if (f)
    {
        if (fwrite(buf, totalSize, 1, f) != 1)
        {
            printf("Error: failed to write serialized kernel arguments to %s\n", fileName);
        }
        fclose(f);
    }
    else
    {
        printf("Error: cannot open %s for writing\n", fileName ? fileName : "(null)");
    }
    delete[] buf;
}
template<typename T>
inline void setConst( const T& consts )
{
int sz=sizeof(T);
b3Assert(sz<=B3_CL_MAX_ARG_SIZE);
b3KernelArgData kernelArg;
kernelArg.m_argIndex = m_idx;
kernelArg.m_isBuffer = 0;
T* destArg = (T*)kernelArg.m_argData;
*destArg = consts;
kernelArg.m_argSizeInBytes = sizeof(T);
m_kernelArguments.push_back(kernelArg);
m_serializationSizeInBytes+=sizeof(b3KernelArgData);
cl_int status = clSetKernelArg( m_kernel, m_idx++, sz, &consts );
b3Assert( status == CL_SUCCESS );
}
/// Launches the kernel over a one-dimensional range.
/// @param numThreads  requested number of work items (rounded up by launch2D to a multiple of localSize)
/// @param localSize   work-group size, 64 by default
inline void launch1D( int numThreads, int localSize = 64)
{
    // A 1D launch is a 2D launch whose second dimension has exactly one work item.
    const int threadsY = 1;
    const int localY = 1;
    launch2D(numThreads, threadsY, localSize, localY);
}
/// Enqueues the kernel over a two-dimensional range on m_commandQueue.
/// Each global dimension is rounded up to a whole number of work-groups, with at least one group.
/// @param numThreadsX,numThreadsY  requested work items per dimension
/// @param localSizeX,localSizeY    work-group size per dimension
inline void launch2D( int numThreadsX, int numThreadsY, int localSizeX, int localSizeY )
{
    size_t localRange[3] = {1,1,1};
    size_t globalRange[3] = {1,1,1};
    localRange[0] = localSizeX;
    localRange[1] = localSizeY;

    for (int dim = 0; dim < 2; ++dim)
    {
        const size_t requested = (dim == 0) ? numThreadsX : numThreadsY;
        // ceil(requested / local), never less than one group
        size_t groups = requested / localRange[dim];
        if (requested % localRange[dim])
        {
            ++groups;
        }
        globalRange[dim] = b3Max((size_t)1, groups) * localRange[dim];
    }

    cl_int status = clEnqueueNDRangeKernel(m_commandQueue, m_kernel, 2, NULL,
                                           globalRange, localRange, 0, 0, 0);
    if (status != CL_SUCCESS)
    {
        printf("Error: OpenCL status = %d\n",status);
    }
    b3Assert( status == CL_SUCCESS );
}
};
#endif //B3_LAUNCHER_CL_H

View File

@@ -0,0 +1,274 @@
#ifndef B3_OPENCL_ARRAY_H
#define B3_OPENCL_ARRAY_H
#include "Bullet3Common/b3AlignedObjectArray.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h"
template <typename T>
class b3OpenCLArray
{
int m_size;
int m_capacity;
cl_mem m_clBuffer;
cl_context m_clContext;
cl_command_queue m_commandQueue;
bool m_ownsMemory;
bool m_allowGrowingCapacity;
void deallocate()
{
if (m_clBuffer && m_ownsMemory)
{
clReleaseMemObject(m_clBuffer);
}
m_clBuffer = 0;
m_capacity=0;
}
b3OpenCLArray<T>& operator=(const b3OpenCLArray<T>& src);
B3_FORCE_INLINE int allocSize(int size)
{
return (size ? size*2 : 1);
}
public:
b3OpenCLArray(cl_context ctx, cl_command_queue queue, int initialCapacity=0, bool allowGrowingCapacity=true)
:m_size(0), m_capacity(0),m_clBuffer(0),
m_clContext(ctx),m_commandQueue(queue),
m_ownsMemory(true),m_allowGrowingCapacity(true)
{
if (initialCapacity)
{
reserve(initialCapacity);
}
m_allowGrowingCapacity = allowGrowingCapacity;
}
///this is an error-prone method with no error checking, be careful!
void setFromOpenCLBuffer(cl_mem buffer, int sizeInElements)
{
deallocate();
m_ownsMemory = false;
m_allowGrowingCapacity = false;
m_clBuffer = buffer;
m_size = sizeInElements;
m_capacity = sizeInElements;
}
// we could enable this assignment, but need to make sure to avoid accidental deep copies
// b3OpenCLArray<T>& operator=(const b3AlignedObjectArray<T>& src)
// {
// copyFromArray(src);
// return *this;
// }
cl_mem getBufferCL() const
{
return m_clBuffer;
}
virtual ~b3OpenCLArray()
{
deallocate();
m_size=0;
m_capacity=0;
}
B3_FORCE_INLINE void push_back(const T& _Val,bool waitForCompletion=true)
{
int sz = size();
if( sz == capacity() )
{
reserve( allocSize(size()) );
}
copyFromHostPointer(&_Val, 1, sz, waitForCompletion);
m_size++;
}
B3_FORCE_INLINE T forcedAt(int n) const
{
b3Assert(n>=0);
b3Assert(n<capacity());
T elem;
copyToHostPointer(&elem,1,n,true);
return elem;
}
B3_FORCE_INLINE T at(int n) const
{
b3Assert(n>=0);
b3Assert(n<size());
T elem;
copyToHostPointer(&elem,1,n,true);
return elem;
}
B3_FORCE_INLINE void resize(int newsize, bool copyOldContents=true)
{
int curSize = size();
if (newsize < curSize)
{
//leave the OpenCL memory for now
} else
{
if (newsize > size())
{
reserve(newsize,copyOldContents);
}
//leave new data uninitialized (init in debug mode?)
//for (int i=curSize;i<newsize;i++) ...
}
m_size = newsize;
}
B3_FORCE_INLINE int size() const
{
return m_size;
}
B3_FORCE_INLINE int capacity() const
{
return m_capacity;
}
B3_FORCE_INLINE void reserve(int _Count, bool copyOldContents=true)
{ // determine new minimum length of allocated storage
if (capacity() < _Count)
{ // not enough room, reallocate
if (m_allowGrowingCapacity)
{
cl_int ciErrNum;
//create a new OpenCL buffer
int memSizeInBytes = sizeof(T)*_Count;
cl_mem buf = clCreateBuffer(m_clContext, CL_MEM_READ_WRITE, memSizeInBytes, NULL, &ciErrNum);
b3Assert(ciErrNum==CL_SUCCESS);
//#define B3_ALWAYS_INITIALIZE_OPENCL_BUFFERS
#ifdef B3_ALWAYS_INITIALIZE_OPENCL_BUFFERS
unsigned char* src = (unsigned char*)malloc(memSizeInBytes);
for (int i=0;i<memSizeInBytes;i++)
src[i] = 0xbb;
ciErrNum = clEnqueueWriteBuffer( m_commandQueue, buf, CL_TRUE, 0, memSizeInBytes, src, 0,0,0 );
b3Assert(ciErrNum==CL_SUCCESS);
clFinish(m_commandQueue);
free(src);
#endif //B3_ALWAYS_INITIALIZE_OPENCL_BUFFERS
if (copyOldContents)
copyToCL(buf, size());
//deallocate the old buffer
deallocate();
m_clBuffer = buf;
m_capacity = _Count;
} else
{
//fail: assert and
b3Assert(0);
deallocate();
}
}
}
void copyToCL(cl_mem destination, int numElements, int firstElem=0, int dstOffsetInElems=0) const
{
if (numElements<=0)
return;
b3Assert(m_clBuffer);
b3Assert(destination);
//likely some error, destination is same as source
b3Assert(m_clBuffer != destination);
b3Assert((firstElem+numElements)<=m_size);
cl_int status = 0;
b3Assert(numElements>0);
b3Assert(numElements<=m_size);
int srcOffsetBytes = sizeof(T)*firstElem;
int dstOffsetInBytes = sizeof(T)*dstOffsetInElems;
status = clEnqueueCopyBuffer( m_commandQueue, m_clBuffer, destination,
srcOffsetBytes, dstOffsetInBytes, sizeof(T)*numElements, 0, 0, 0 );
b3Assert( status == CL_SUCCESS );
}
void copyFromHost(const b3AlignedObjectArray<T>& srcArray, bool waitForCompletion=true)
{
int newSize = srcArray.size();
bool copyOldContents = false;
resize (newSize,copyOldContents);
if (newSize)
copyFromHostPointer(&srcArray[0],newSize,0,waitForCompletion);
}
void copyFromHostPointer(const T* src, int numElems, int destFirstElem= 0, bool waitForCompletion=true)
{
b3Assert(numElems+destFirstElem <= capacity());
cl_int status = 0;
int sizeInBytes=sizeof(T)*numElems;
status = clEnqueueWriteBuffer( m_commandQueue, m_clBuffer, 0, sizeof(T)*destFirstElem, sizeInBytes,
src, 0,0,0 );
b3Assert(status == CL_SUCCESS );
if (waitForCompletion)
clFinish(m_commandQueue);
}
void copyToHost(b3AlignedObjectArray<T>& destArray, bool waitForCompletion=true) const
{
destArray.resize(this->size());
if (size())
copyToHostPointer(&destArray[0], size(),0,waitForCompletion);
}
void copyToHostPointer(T* destPtr, int numElem, int srcFirstElem=0, bool waitForCompletion=true) const
{
b3Assert(numElem+srcFirstElem <= capacity());
cl_int status = 0;
status = clEnqueueReadBuffer( m_commandQueue, m_clBuffer, 0, sizeof(T)*srcFirstElem, sizeof(T)*numElem,
destPtr, 0,0,0 );
b3Assert( status==CL_SUCCESS );
if (waitForCompletion)
clFinish(m_commandQueue);
}
void copyFromOpenCLArray(const b3OpenCLArray& src)
{
int newSize = src.size();
resize(newSize);
if (size())
{
src.copyToCL(m_clBuffer,size());
}
}
};
#endif //B3_OPENCL_ARRAY_H

View File

@@ -0,0 +1,126 @@
#include "b3PrefixScanCL.h"
#include "b3FillCL.h"
#define B3_PREFIXSCAN_PROG_PATH "src/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernels.cl"
#include "b3LauncherCL.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "kernels/PrefixScanKernelsCL.h"
/// Creates the scratch buffer (sized for `size` elements) and builds the three scan kernels
/// from the embedded PrefixScanKernels source.
b3PrefixScanCL::b3PrefixScanCL(cl_context ctx, cl_device_id device, cl_command_queue queue, int size)
:m_commandQueue(queue)
{
    cl_int errorCode;
    char* extraMacros = 0;
    const char* kernelSrc = prefixScanKernelsCL;

    m_workBuffer = new b3OpenCLArray<unsigned int>(ctx,queue,size);

    // One program, three entry points.
    cl_program program = b3OpenCLUtils::compileCLProgramFromString( ctx, device, kernelSrc, &errorCode, extraMacros, B3_PREFIXSCAN_PROG_PATH);
    b3Assert(program);

    m_localScanKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSrc, "LocalScanKernel", &errorCode, program, extraMacros );
    b3Assert(m_localScanKernel );

    m_blockSumKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSrc, "TopLevelScanKernel", &errorCode, program, extraMacros );
    b3Assert(m_blockSumKernel );

    m_propagationKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSrc, "AddOffsetKernel", &errorCode, program, extraMacros );
    b3Assert(m_propagationKernel );
}
/// Frees the scratch buffer and releases the three kernels created by the constructor.
b3PrefixScanCL::~b3PrefixScanCL()
{
    delete m_workBuffer;

    cl_kernel ownedKernels[] = { m_localScanKernel, m_blockSumKernel, m_propagationKernel };
    const int kernelCount = sizeof(ownedKernels)/sizeof(ownedKernels[0]);
    for (int k = 0; k < kernelCount; ++k)
    {
        clReleaseKernel(ownedKernels[k]);
    }
}
/// Returns the smallest power of two that is >= n.
/// A value that is already a power of two is returned unchanged; for unsigned types
/// n == 0 yields 0 (the subtraction wraps to all ones and the final +1 wraps back).
template<class T>
T b3NextPowerOf2(T n)
{
    n -= 1;
    // Smear the highest set bit into every lower position. Doubling the shift each step
    // (1,2,4,...) fills the same bits as shifting by every amount, in log2(bits) steps.
    const unsigned int numBits = (unsigned int)(sizeof(T)*8);
    for (unsigned int shift = 1; shift < numBits; shift <<= 1)
    {
        n = n | (n >> shift);
    }
    return n+1;
}
/// Runs the exclusive prefix scan of the first n elements of src into dst on the device.
/// dst and the internal work buffer are resized to src.size().
/// @param sum  optional; receives dst[n-1] after the queue has finished
void b3PrefixScanCL::execute(b3OpenCLArray<unsigned int>& src, b3OpenCLArray<unsigned int>& dst, int n, unsigned int* sum)
{
//    b3Assert( data->m_option == EXCLUSIVE );
    // Each work-group of BLOCK_SIZE items covers BLOCK_SIZE*2 elements.
    const int elemsPerBlock = BLOCK_SIZE*2;
    const unsigned int numBlocks = (const unsigned int)( (n+elemsPerBlock-1)/elemsPerBlock );

    dst.resize(src.size());
    m_workBuffer->resize(src.size());

    b3Int4 constBuffer;
    constBuffer.x = n;
    constBuffer.y = numBlocks;
    constBuffer.z = (int)b3NextPowerOf2( numBlocks );

    // Buffers are fetched after the resizes above, which may have reallocated them.
    cl_mem srcMem = src.getBufferCL();
    cl_mem dstMem = dst.getBufferCL();
    cl_mem workMem = m_workBuffer->getBufferCL();

    {
        // Stage 1: scan inside each block.
        b3BufferInfoCL bInfo[] = { b3BufferInfoCL( dstMem ), b3BufferInfoCL( srcMem ), b3BufferInfoCL( workMem ) };
        b3LauncherCL launcher( m_commandQueue, m_localScanKernel );
        launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
        launcher.setConst( constBuffer );
        launcher.launch1D( numBlocks*BLOCK_SIZE, BLOCK_SIZE );
    }

    {
        // Stage 2: a single work-group processes the work buffer.
        b3BufferInfoCL bInfo[] = { b3BufferInfoCL( workMem ) };
        b3LauncherCL launcher( m_commandQueue, m_blockSumKernel );
        launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
        launcher.setConst( constBuffer );
        launcher.launch1D( BLOCK_SIZE, BLOCK_SIZE );
    }

    if( numBlocks > 1 )
    {
        // Stage 3: only needed when there is more than one block.
        b3BufferInfoCL bInfo[] = { b3BufferInfoCL( dstMem ), b3BufferInfoCL( workMem ) };
        b3LauncherCL launcher( m_commandQueue, m_propagationKernel );
        launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
        launcher.setConst( constBuffer );
        launcher.launch1D( (numBlocks-1)*BLOCK_SIZE, BLOCK_SIZE );
    }

    if( sum )
    {
        clFinish(m_commandQueue);
        dst.copyToHostPointer(sum,1,n-1,true);
    }
}
/// CPU reference implementation of the exclusive prefix scan:
/// dst[i] = src[0] + ... + src[i-1] for i in [0, n).
/// @param src  input values (at least n elements)
/// @param dst  output array (at least n elements); may be the same array as src
/// @param n    number of elements to scan; for n <= 0 nothing is written to dst
/// @param sum  optional; receives dst[n-1] (the last exclusive prefix), or 0 when n <= 0
void b3PrefixScanCL::executeHost(b3AlignedObjectArray<unsigned int>& src, b3AlignedObjectArray<unsigned int>& dst, int n, unsigned int* sum)
{
    if (n <= 0)
    {
        // nothing to scan; avoid reading dst[n-1] out of bounds below
        if( sum )
        {
            *sum = 0;
        }
        return;
    }

    // Only the exclusive variant is implemented (the inclusive one was never enabled).
    unsigned int running = 0;
    for(int i=0; i<n; i++)
    {
        // read the input first so that scanning in place (src and dst identical) stays correct
        const unsigned int value = src[i];
        dst[i] = running;
        running += value;
    }

    if( sum )
    {
        *sum = dst[n-1];
    }
}

View File

@@ -0,0 +1,37 @@
#ifndef B3_PREFIX_SCAN_CL_H
#define B3_PREFIX_SCAN_CL_H
#include "b3OpenCLArray.h"
#include "b3BufferInfoCL.h"
#include "Bullet3Common/b3AlignedObjectArray.h"
/// Prefix scan (running sum) of unsigned int arrays, with an OpenCL implementation (execute)
/// and a host reference implementation (executeHost).
class b3PrefixScanCL
{
enum
{
// work-group size passed to the kernel launches in execute()
BLOCK_SIZE = 128
};
// Option m_option;
// queue on which the scan kernels are launched
cl_command_queue m_commandQueue;
// kernels built by the constructor and released by the destructor
cl_kernel m_localScanKernel;
cl_kernel m_blockSumKernel;
cl_kernel m_propagationKernel;
// scratch buffer shared by the kernels; owned (deleted in the destructor)
b3OpenCLArray<unsigned int>* m_workBuffer;
public:
// size: initial capacity (in elements) of the scratch buffer
b3PrefixScanCL(cl_context ctx, cl_device_id device, cl_command_queue queue,int size=0);
virtual ~b3PrefixScanCL();
// Scans the first n elements of src into dst; if sum is non-null it receives dst[n-1].
void execute(b3OpenCLArray<unsigned int>& src, b3OpenCLArray<unsigned int>& dst, int n, unsigned int* sum = 0);
// Host-side version of execute() operating on CPU arrays.
void executeHost(b3AlignedObjectArray<unsigned int>& src, b3AlignedObjectArray<unsigned int>& dst, int n, unsigned int* sum);
};
#endif //B3_PREFIX_SCAN_CL_H

View File

@@ -0,0 +1,712 @@
#include "b3RadixSort32CL.h"
#include "b3LauncherCL.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "b3PrefixScanCL.h"
#include "b3FillCL.h"
#define RADIXSORT32_PATH "src/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32Kernels.cl"
#include "kernels/RadixSort32KernelsCL.h"
/// Allocates the work buffers and helper objects (prefix scan, fill) and builds the sort kernels.
/// On CPU devices the "Serial" variants of the sort-and-scatter kernels are used.
/// @param initialCapacity  when > 0, pre-sizes the work buffers (except m_workBuffer2) to that many elements
b3RadixSort32CL::b3RadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity)
:m_commandQueue(queue)
{
    b3OpenCLDeviceInfo deviceInfo;
    b3OpenCLUtils::getDeviceInfo(device,&deviceInfo);
    m_deviceCPU = (deviceInfo.m_deviceType & CL_DEVICE_TYPE_CPU)!=0;

    m_workBuffer1 = new b3OpenCLArray<unsigned int>(ctx,queue);
    m_workBuffer2 = new b3OpenCLArray<unsigned int>(ctx,queue);
    m_workBuffer3 = new b3OpenCLArray<b3SortData>(ctx,queue);
    m_workBuffer3a = new b3OpenCLArray<unsigned int>(ctx,queue);
    m_workBuffer4 = new b3OpenCLArray<b3SortData>(ctx,queue);
    m_workBuffer4a = new b3OpenCLArray<unsigned int>(ctx,queue);

    if (initialCapacity>0)
    {
        m_workBuffer1->resize(initialCapacity);
        m_workBuffer3->resize(initialCapacity);
        m_workBuffer3a->resize(initialCapacity);
        m_workBuffer4->resize(initialCapacity);
        m_workBuffer4a->resize(initialCapacity);
    }

    m_scan = new b3PrefixScanCL(ctx,device,queue);
    m_fill = new b3FillCL(ctx,device,queue);

    cl_int errorCode;
    const char* macros = "";
    const char* source = radixSort32KernelsCL;

    cl_program program = b3OpenCLUtils::compileCLProgramFromString( ctx, device, source, &errorCode, macros, RADIXSORT32_PATH);
    b3Assert(program);

    m_streamCountSortDataKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, source, "StreamCountSortDataKernel", &errorCode, program, macros );
    b3Assert(m_streamCountSortDataKernel );

    m_streamCountKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, source, "StreamCountKernel", &errorCode, program, macros );
    b3Assert(m_streamCountKernel);

    // Pick the kernel entry points once, depending on the device type.
    const char* scatterSortDataName = m_deviceCPU ? "SortAndScatterSortDataKernelSerial" : "SortAndScatterSortDataKernel";
    const char* scatterName = m_deviceCPU ? "SortAndScatterKernelSerial" : "SortAndScatterKernel";

    m_sortAndScatterSortDataKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, source, scatterSortDataName, &errorCode, program, macros );
    b3Assert(m_sortAndScatterSortDataKernel);

    m_sortAndScatterKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, source, scatterName, &errorCode, program, macros );
    b3Assert(m_sortAndScatterKernel);

    m_prefixScanKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, source, "PrefixScanKernel", &errorCode, program, macros );
    b3Assert(m_prefixScanKernel);
}
/// Destroys the helper objects and work buffers, then releases the five kernels.
b3RadixSort32CL::~b3RadixSort32CL()
{
    // helpers
    delete m_scan;
    delete m_fill;

    // work buffers
    delete m_workBuffer1;
    delete m_workBuffer2;
    delete m_workBuffer3;
    delete m_workBuffer3a;
    delete m_workBuffer4;
    delete m_workBuffer4a;

    // kernels
    cl_kernel ownedKernels[] = {
        m_streamCountSortDataKernel,
        m_streamCountKernel,
        m_sortAndScatterSortDataKernel,
        m_sortAndScatterKernel,
        m_prefixScanKernel
    };
    const int kernelCount = sizeof(ownedKernels)/sizeof(ownedKernels[0]);
    for (int k = 0; k < kernelCount; ++k)
    {
        clReleaseKernel(ownedKernels[k]);
    }
}
/// CPU radix sort of key/value pairs by m_key, least-significant digit first, 8 bits per pass.
/// Each pass is a stable counting sort, so elements with equal keys keep their relative order.
/// @param inout     array sorted in place; an empty array is left untouched
/// @param sortBits  number of low key bits to sort on; one pass is run per started group of 8 bits
void b3RadixSort32CL::executeHost(b3AlignedObjectArray<b3SortData>& inout, int sortBits /* = 32 */)
{
    const int n = inout.size();
    if (n <= 0)
    {
        // nothing to sort; also avoids taking the address of inout[0] on an empty array
        return;
    }

    const int bitsPerPass = 8;
    const int numTables = (1<<bitsPerPass);

    int tables[numTables];   // histogram, then start offset of each digit
    int counter[numTables];  // elements already placed per digit in the current pass

    b3AlignedObjectArray<b3SortData> workbuffer;
    workbuffer.resize(n);

    // The two arrays are used alternately as source and destination.
    b3SortData* src = &inout[0];
    b3SortData* dst = &workbuffer[0];

    int passCount = 0;
    for(int startBit=0; startBit<sortBits; startBit+=bitsPerPass)
    {
        // histogram of the current digit
        for(int i=0; i<numTables; i++)
        {
            tables[i] = 0;
        }
        for(int i=0; i<n; i++)
        {
            int tableIdx = (src[i].m_key >> startBit) & (numTables-1);
            tables[tableIdx]++;
        }

        // exclusive prefix scan turns counts into start offsets
        int sum = 0;
        for(int i=0; i<numTables; i++)
        {
            int iData = tables[i];
            tables[i] = sum;
            sum += iData;
            counter[i] = 0;
        }

        // distribute
        for(int i=0; i<n; i++)
        {
            int tableIdx = (src[i].m_key >> startBit) & (numTables-1);
            dst[tables[tableIdx] + counter[tableIdx]] = src[i];
            counter[tableIdx] ++;
        }

        b3Swap( src, dst );
        passCount++;
    }

    if (passCount&1)
    {
        // After an odd number of passes the sorted data sits in the scratch buffer
        // (now pointed to by src); copy it back into the caller's array (now dst).
        for(int i=0; i<n; i++)
        {
            dst[i] = src[i];
        }
    }
}
/// Sorts a device array on the CPU: download, sort on the host, upload the result.
void b3RadixSort32CL::executeHost(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits /* = 32 */)
{
    b3AlignedObjectArray<b3SortData> hostCopy;

    // device -> host
    keyValuesInOut.copyToHost(hostCopy);

    executeHost(hostCopy, sortBits);

    // host -> device
    keyValuesInOut.copyFromHost(hostCopy);
}
// Overload taking separate key and value arrays for input and output.
// NOTE(review): the body is empty, so calling this leaves all four arrays untouched;
// it looks like an unimplemented placeholder — confirm before relying on it.
void b3RadixSort32CL::execute(b3OpenCLArray<unsigned int>& keysIn, b3OpenCLArray<unsigned int>& keysOut, b3OpenCLArray<unsigned int>& valuesIn,
b3OpenCLArray<unsigned int>& valuesOut, int n, int sortBits)
{
}
//#define DEBUG_RADIXSORT
//#define DEBUG_RADIXSORT2
// Sorts key/value pairs by key on the OpenCL device, 4 key bits per pass.
// keyValuesInOut: array sorted in place.
// sortBits: number of low key bits to sort on; asserted to be a multiple of 4.
// The input is padded to a multiple of DATA_ALIGNMENT elements (in m_workBuffer4) when needed.
// NOTE(review): the final result is only copied back correctly for an even number of passes;
// an odd pass count hits b3Assert(0) near the end of the function.
void b3RadixSort32CL::execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits /* = 32 */)
{
int originalSize = keyValuesInOut.size();
int workingSize = originalSize;
int dataAlignment = DATA_ALIGNMENT;
#ifdef DEBUG_RADIXSORT2
b3AlignedObjectArray<b3SortData> test2;
keyValuesInOut.copyToHost(test2);
printf("numElem = %d\n",test2.size());
for (int i=0;i<test2.size();i++)
{
printf("test2[%d].m_key=%d\n",i,test2[i].m_key);
printf("test2[%d].m_value=%d\n",i,test2[i].m_value);
}
#endif //DEBUG_RADIXSORT2
b3OpenCLArray<b3SortData>* src = 0;
// Unaligned size: work on a padded copy whose extra elements are filled with 0xffffffff keys/values.
if (workingSize%dataAlignment)
{
workingSize += dataAlignment-(workingSize%dataAlignment);
m_workBuffer4->copyFromOpenCLArray(keyValuesInOut);
m_workBuffer4->resize(workingSize);
b3SortData fillValue;
fillValue.m_key = 0xffffffff;
fillValue.m_value = 0xffffffff;
#define USE_BTFILL
#ifdef USE_BTFILL
m_fill->execute((b3OpenCLArray<b3Int2>&)*m_workBuffer4,(b3Int2&)fillValue,workingSize-originalSize,originalSize);
#else
//fill the remaining bits (very slow way, todo: fill on GPU/OpenCL side)
for (int i=originalSize; i<workingSize;i++)
{
m_workBuffer4->copyFromHostPointer(&fillValue,1,i);
}
#endif//USE_BTFILL
src = m_workBuffer4;
} else
{
// Already aligned: sort directly in the caller's array; an empty m_workBuffer4 marks this case below.
src = &keyValuesInOut;
m_workBuffer4->resize(0);
}
b3Assert( workingSize%DATA_ALIGNMENT == 0 );
// one histogram entry per (bucket, work-group) pair
int minCap = NUM_BUCKET*NUM_WGS;
int n = workingSize;
m_workBuffer1->resize(minCap);
m_workBuffer3->resize(workingSize);
// ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
b3Assert( BITS_PER_PASS == 4 );
b3Assert( WG_SIZE == 64 );
b3Assert( (sortBits&0x3) == 0 );
// src/dst and srcHisto/destHisto are swapped after every pass
b3OpenCLArray<b3SortData>* dst = m_workBuffer3;
b3OpenCLArray<unsigned int>* srcHisto = m_workBuffer1;
b3OpenCLArray<unsigned int>* destHisto = m_workBuffer2;
int nWGs = NUM_WGS;
b3ConstData cdata;
{
int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;//set at 256
int nBlocks = (n+blockSize-1)/(blockSize);
cdata.m_n = n;
cdata.m_nWGs = NUM_WGS;
cdata.m_startBit = 0;
cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1)/cdata.m_nWGs;
// fewer blocks than work-groups: one block per group and launch only nBlocks groups for the scatter
if( nBlocks < NUM_WGS )
{
cdata.m_nBlocksPerWG = 1;
nWGs = nBlocks;
}
}
int count=0;
// one pass per 4-bit digit, starting at the least significant bits
for(int ib=0; ib<sortBits; ib+=4)
{
#ifdef DEBUG_RADIXSORT2
keyValuesInOut.copyToHost(test2);
printf("numElem = %d\n",test2.size());
for (int i=0;i<test2.size();i++)
{
if (test2[i].m_key != test2[i].m_value)
{
printf("test2[%d].m_key=%d\n",i,test2[i].m_key);
printf("test2[%d].m_value=%d\n",i,test2[i].m_value);
}
}
#endif //DEBUG_RADIXSORT2
cdata.m_startBit = ib;
// step 1: count kernel writing into srcHisto
if (src->size())
{
b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src->getBufferCL(), true ), b3BufferInfoCL( srcHisto->getBufferCL() ) };
b3LauncherCL launcher(m_commandQueue, m_streamCountSortDataKernel);
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst( cdata );
int num = NUM_WGS*WG_SIZE;
launcher.launch1D( num, WG_SIZE );
}
#ifdef DEBUG_RADIXSORT
b3AlignedObjectArray<unsigned int> testHist;
srcHisto->copyToHost(testHist);
printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size());
for (int i=0;i<testHist.size();i++)
{
if (testHist[i]!=0)
printf("testHist[%d]=%d\n",i,testHist[i]);
}
#endif //DEBUG_RADIXSORT
//fast prefix scan is not working properly on Mac OSX yet
#ifdef _WIN32
bool fastScan=!m_deviceCPU;//only use fast scan on GPU
#else
bool fastScan=false;
#endif
// step 2: prefix scan of the histogram
if (fastScan)
{// prefix scan group histogram
b3BufferInfoCL bInfo[] = { b3BufferInfoCL( srcHisto->getBufferCL() ) };
b3LauncherCL launcher( m_commandQueue, m_prefixScanKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst( cdata );
launcher.launch1D( 128, 128 );
// only srcHisto is bound to this kernel, so the scatter step reads the same buffer
destHisto = srcHisto;
}else
{
//unsigned int sum; //for debugging
// the literal 1920 equals NUM_BUCKET*NUM_WGS (16*120), i.e. minCap
m_scan->execute(*srcHisto,*destHisto,1920,0);//,&sum);
}
#ifdef DEBUG_RADIXSORT
destHisto->copyToHost(testHist);
printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size());
for (int i=0;i<testHist.size();i++)
{
if (testHist[i]!=0)
printf("testHist[%d]=%d\n",i,testHist[i]);
}
for (int i=0;i<testHist.size();i+=NUM_WGS)
{
printf("testHist[%d]=%d\n",i/NUM_WGS,testHist[i]);
}
#endif //DEBUG_RADIXSORT
// step 3: scatter elements from src into dst; USE_GPU is defined right here, so the
// host-side #else branch below is compiled out
#define USE_GPU
#ifdef USE_GPU
if (src->size())
{// local sort and distribute
b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src->getBufferCL(), true ), b3BufferInfoCL( destHisto->getBufferCL(), true ), b3BufferInfoCL( dst->getBufferCL() )};
b3LauncherCL launcher( m_commandQueue, m_sortAndScatterSortDataKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst( cdata );
launcher.launch1D( nWGs*WG_SIZE, WG_SIZE );
}
#else
// NOTE(review): this host-side path uses testHist, which is only declared under DEBUG_RADIXSORT
{
#define NUM_TABLES 16
//#define SEQUENTIAL
#ifdef SEQUENTIAL
int counter2[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
int tables[NUM_TABLES];
int startBit = ib;
destHisto->copyToHost(testHist);
b3AlignedObjectArray<b3SortData> srcHost;
b3AlignedObjectArray<b3SortData> dstHost;
dstHost.resize(src->size());
src->copyToHost(srcHost);
for (int i=0;i<NUM_TABLES;i++)
{
tables[i] = testHist[i*NUM_WGS];
}
// distribute
for(int i=0; i<n; i++)
{
int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1);
dstHost[tables[tableIdx] + counter2[tableIdx]] = srcHost[i];
counter2[tableIdx] ++;
}
#else
int counter2[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
int tables[NUM_TABLES];
b3AlignedObjectArray<b3SortData> dstHostOK;
dstHostOK.resize(src->size());
destHisto->copyToHost(testHist);
b3AlignedObjectArray<b3SortData> srcHost;
src->copyToHost(srcHost);
int blockSize = 256;
int nBlocksPerWG = cdata.m_nBlocksPerWG;
int startBit = ib;
{
for (int i=0;i<NUM_TABLES;i++)
{
tables[i] = testHist[i*NUM_WGS];
}
// distribute
for(int i=0; i<n; i++)
{
int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1);
dstHostOK[tables[tableIdx] + counter2[tableIdx]] = srcHost[i];
counter2[tableIdx] ++;
}
}
b3AlignedObjectArray<b3SortData> dstHost;
dstHost.resize(src->size());
int counter[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
for (int wgIdx=0;wgIdx<NUM_WGS;wgIdx++)
{
int counter[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;
for(int iblock=0; iblock<b3Min(cdata.m_nBlocksPerWG, nBlocks); iblock++)
{
for (int lIdx = 0;lIdx < 64;lIdx++)
{
int addr = iblock*blockSize + blockSize*cdata.m_nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;
// MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD
// Using registers didn't perform well. It seems like use localKeys to address requires a lot of alu ops
// AMD: AtomInc performs better while NV prefers ++
for(int j=0; j<ELEMENTS_PER_WORK_ITEM; j++)
{
if( addr+j < n )
{
// printf ("addr+j=%d\n", addr+j);
int i = addr+j;
int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1);
int destIndex = testHist[tableIdx*NUM_WGS+wgIdx] + counter[tableIdx];
b3SortData ok = dstHostOK[destIndex];
if (ok.m_key != srcHost[i].m_key)
{
printf("ok.m_key = %d, srcHost[i].m_key = %d\n", ok.m_key,srcHost[i].m_key );
printf("(ok.m_value = %d, srcHost[i].m_value = %d)\n", ok.m_value,srcHost[i].m_value );
}
if (ok.m_value != srcHost[i].m_value)
{
printf("ok.m_value = %d, srcHost[i].m_value = %d\n", ok.m_value,srcHost[i].m_value );
printf("(ok.m_key = %d, srcHost[i].m_key = %d)\n", ok.m_key,srcHost[i].m_key );
}
dstHost[destIndex] = srcHost[i];
counter[tableIdx] ++;
}
}
}
}
}
#endif //SEQUENTIAL
dst->copyFromHost(dstHost);
}
#endif//USE_GPU
#ifdef DEBUG_RADIXSORT
destHisto->copyToHost(testHist);
printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size());
for (int i=0;i<testHist.size();i++)
{
if (testHist[i]!=0)
printf("testHist[%d]=%d\n",i,testHist[i]);
}
#endif //DEBUG_RADIXSORT
// the output of this pass becomes the input of the next one
b3Swap(src, dst );
b3Swap(srcHisto,destHisto);
#ifdef DEBUG_RADIXSORT2
keyValuesInOut.copyToHost(test2);
printf("numElem = %d\n",test2.size());
for (int i=0;i<test2.size();i++)
{
if (test2[i].m_key != test2[i].m_value)
{
printf("test2[%d].m_key=%d\n",i,test2[i].m_key);
printf("test2[%d].m_value=%d\n",i,test2[i].m_value);
}
}
#endif //DEBUG_RADIXSORT2
count++;
}
// after an odd number of passes the latest output is in m_workBuffer3, which is not handled here
if (count&1)
{
b3Assert(0);//need to copy from workbuffer to keyValuesInOut
}
// padded case: drop the padding and copy the result back into the caller's array
if (m_workBuffer4->size())
{
m_workBuffer4->resize(originalSize);
keyValuesInOut.copyFromOpenCLArray(*m_workBuffer4);
}
// NOTE(review): test2 is declared under DEBUG_RADIXSORT2 only, yet used here under DEBUG_RADIXSORT
#ifdef DEBUG_RADIXSORT
keyValuesInOut.copyToHost(test2);
printf("numElem = %d\n",test2.size());
for (int i=0;i<test2.size();i++)
{
printf("test2[%d].m_key=%d\n",i,test2[i].m_key);
printf("test2[%d].m_value=%d\n",i,test2[i].m_value);
}
#endif
}
/// Keys-only radix sort on the OpenCL device, 4 key bits per pass, least-significant bits first.
/// @param keysInOut  keys sorted in place
/// @param sortBits   number of low key bits to sort on; asserted to be a multiple of 4
/// Inputs whose size is not a multiple of DATA_ALIGNMENT are sorted in a padded copy
/// (padding keys are 0xffffffff) and copied back afterwards.
void b3RadixSort32CL::execute(b3OpenCLArray<unsigned int>& keysInOut, int sortBits /* = 32 */)
{
    int originalSize = keysInOut.size();
    int workingSize = originalSize;

    int dataAlignment = DATA_ALIGNMENT;

    b3OpenCLArray<unsigned int>* src = 0;

    if (workingSize%dataAlignment)
    {
        // pad up to the next multiple of DATA_ALIGNMENT inside m_workBuffer4a
        workingSize += dataAlignment-(workingSize%dataAlignment);
        m_workBuffer4a->copyFromOpenCLArray(keysInOut);
        m_workBuffer4a->resize(workingSize);
        unsigned int fillValue = 0xffffffff;

        m_fill->execute(*m_workBuffer4a,fillValue,workingSize-originalSize,originalSize);

        src = m_workBuffer4a;
    } else
    {
        // already aligned: sort directly in the caller's array; an empty m_workBuffer4a marks this case
        src = &keysInOut;
        m_workBuffer4a->resize(0);
    }

    b3Assert( workingSize%DATA_ALIGNMENT == 0 );

    // one histogram entry per (bucket, work-group) pair
    const int histogramSize = NUM_BUCKET*NUM_WGS;

    int n = workingSize;

    m_workBuffer1->resize(histogramSize);
    // only the keys-only scratch buffer is needed here (m_workBuffer3 holds b3SortData and is unused)
    m_workBuffer3a->resize(workingSize);

    //    ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
    b3Assert( BITS_PER_PASS == 4 );
    b3Assert( WG_SIZE == 64 );
    b3Assert( (sortBits&0x3) == 0 );

    // src/dst and srcHisto/destHisto are swapped after every pass
    b3OpenCLArray<unsigned int>* dst = m_workBuffer3a;
    b3OpenCLArray<unsigned int>* srcHisto = m_workBuffer1;
    b3OpenCLArray<unsigned int>* destHisto = m_workBuffer2;

    int nWGs = NUM_WGS;
    b3ConstData cdata;

    {
        int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;//set at 256
        int nBlocks = (n+blockSize-1)/(blockSize);
        cdata.m_n = n;
        cdata.m_nWGs = NUM_WGS;
        cdata.m_startBit = 0;
        cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1)/cdata.m_nWGs;
        if( nBlocks < NUM_WGS )
        {
            cdata.m_nBlocksPerWG = 1;
            nWGs = nBlocks;
        }
    }

    int count=0;
    for(int ib=0; ib<sortBits; ib+=4)
    {
        cdata.m_startBit = ib;

        // step 1: count kernel writing into srcHisto
        if (src->size())
        {
            b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src->getBufferCL(), true ), b3BufferInfoCL( srcHisto->getBufferCL() ) };
            b3LauncherCL launcher(m_commandQueue, m_streamCountKernel);

            launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
            launcher.setConst( cdata );

            int num = NUM_WGS*WG_SIZE;
            launcher.launch1D( num, WG_SIZE );
        }

        //fast prefix scan is not working properly on Mac OSX yet
#ifdef _WIN32
        bool fastScan=!m_deviceCPU;
#else
        bool fastScan=false;
#endif

        // step 2: prefix scan of the histogram
        if (fastScan)
        {// prefix scan group histogram
            b3BufferInfoCL bInfo[] = { b3BufferInfoCL( srcHisto->getBufferCL() ) };
            b3LauncherCL launcher( m_commandQueue, m_prefixScanKernel );
            launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
            launcher.setConst( cdata );
            launcher.launch1D( 128, 128 );
            // only srcHisto is bound to this kernel, so the scatter step reads the same buffer
            destHisto = srcHisto;
        }else
        {
            // scan the whole histogram (previously the hard-coded literal 1920 == NUM_BUCKET*NUM_WGS)
            m_scan->execute(*srcHisto,*destHisto,histogramSize,0);
        }

        // step 3: scatter the keys from src into dst
        if (src->size())
        {// local sort and distribute
            b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src->getBufferCL(), true ), b3BufferInfoCL( destHisto->getBufferCL(), true ), b3BufferInfoCL( dst->getBufferCL() )};
            b3LauncherCL launcher( m_commandQueue, m_sortAndScatterKernel );
            launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
            launcher.setConst( cdata );
            launcher.launch1D( nWGs*WG_SIZE, WG_SIZE );
        }

        // the output of this pass becomes the input of the next one
        b3Swap(src, dst );
        b3Swap(srcHisto,destHisto);

        count++;
    }

    if (count&1)
    {
        // After an odd number of passes the sorted keys sit in the scratch buffer (now src);
        // copy them back into the buffer the sort started from (now dst).
        dst->copyFromOpenCLArray(*src);
    }

    if (m_workBuffer4a->size())
    {
        // padded case: drop the padding and hand the result back to the caller
        m_workBuffer4a->resize(originalSize);
        keysInOut.copyFromOpenCLArray(*m_workBuffer4a);
    }
}

View File

@@ -0,0 +1,85 @@
#ifndef B3_RADIXSORT32_H
#define B3_RADIXSORT32_H
#include "b3OpenCLArray.h"
// Key/value pair handled by b3RadixSort32CL: elements are ordered by m_key and m_value travels with its key.
struct b3SortData
{
// sort key
int m_key;
// payload associated with the key
int m_value;
};
#include "b3BufferInfoCL.h"
// 32-bit radix sort for OpenCL buffers, with key/value (b3SortData) and keys-only variants,
// plus host-side reference implementations (executeHost).
class b3RadixSort32CL
{
// histogram buffers; the sort passes alternate between them as source and destination
b3OpenCLArray<unsigned int>* m_workBuffer1;
b3OpenCLArray<unsigned int>* m_workBuffer2;
// key/value sort: m_workBuffer3 is the scratch destination, m_workBuffer4 the padded input copy
b3OpenCLArray<b3SortData>* m_workBuffer3;
b3OpenCLArray<b3SortData>* m_workBuffer4;
// keys-only counterparts of m_workBuffer3 / m_workBuffer4
b3OpenCLArray<unsigned int>* m_workBuffer3a;
b3OpenCLArray<unsigned int>* m_workBuffer4a;
cl_command_queue m_commandQueue;
// kernels built in the constructor; the *SortData kernels serve the key/value variant
cl_kernel m_streamCountSortDataKernel;
cl_kernel m_streamCountKernel;
cl_kernel m_prefixScanKernel;
cl_kernel m_sortAndScatterSortDataKernel;
cl_kernel m_sortAndScatterKernel;
// true when the OpenCL device type includes CL_DEVICE_TYPE_CPU
bool m_deviceCPU;
// helper objects, created in the constructor and deleted in the destructor
class b3PrefixScanCL* m_scan;
class b3FillCL* m_fill;
public:
// constant block passed to every sort kernel via setConst
struct b3ConstData
{
// number of elements to sort (after padding)
int m_n;
// number of work-groups (set to NUM_WGS)
int m_nWGs;
// first key bit handled by the current pass
int m_startBit;
// number of blocks each work-group processes
int m_nBlocksPerWG;
};
enum
{
// element counts are padded to a multiple of this before sorting
DATA_ALIGNMENT = 256,
WG_SIZE = 64,
BLOCK_SIZE = 256,
ELEMENTS_PER_WORK_ITEM = (BLOCK_SIZE/WG_SIZE),
BITS_PER_PASS = 4,
NUM_BUCKET=(1<<BITS_PER_PASS),
// if you change this, change nPerWI in kernel as well
NUM_WGS = 20*6, // cypress
// NUM_WGS = 24*6, // cayman
// NUM_WGS = 32*4, // nv
};
private:
public:
// initialCapacity: when > 0, work buffers are pre-sized to this many elements
b3RadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity =0);
virtual ~b3RadixSort32CL();
// NOTE(review): this overload is defined with an empty body in b3RadixSort32CL.cpp
void execute(b3OpenCLArray<unsigned int>& keysIn, b3OpenCLArray<unsigned int>& keysOut, b3OpenCLArray<unsigned int>& valuesIn,
b3OpenCLArray<unsigned int>& valuesOut, int n, int sortBits = 32);
///keys only
void execute(b3OpenCLArray<unsigned int>& keysInOut, int sortBits = 32 );
// key/value pairs, sorted in place on the device
void execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits = 32 );
// host-side sorts: the first copies the device array to the host, sorts there and copies it back
void executeHost(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits = 32);
void executeHost(b3AlignedObjectArray<b3SortData>& keyValuesInOut, int sortBits = 32);
};
#endif //B3_RADIXSORT32_H

View File

@@ -0,0 +1,106 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
// 32-bit unsigned shorthand used throughout the kernels
typedef unsigned int u32;
// Short aliases for the OpenCL work-item queries (dimension 0 only) and the local-memory barrier
#define GET_GROUP_IDX get_group_id(0)
#define GET_LOCAL_IDX get_local_id(0)
#define GET_GLOBAL_IDX get_global_id(0)
#define GET_GROUP_SIZE get_local_size(0)
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
// Key/value pair; the search kernels only compare and index by m_key
typedef struct
{
u32 m_key;
u32 m_value;
}SortData;
// Source/destination sizes padded to four u32 values.
// NOTE(review): the kernels in this file take nSrc/nDst as separate arguments; this struct looks unused here.
typedef struct
{
u32 m_nSrc;
u32 m_nDst;
u32 m_padding[2];
} ConstBuffer;
// Lower-bound search: one work-item per element of src.
// Whenever src[gIdx].m_key differs from the key of the previous element,
// gIdx is stored into dst[src[gIdx].m_key]. Element 0 is compared against the
// sentinel key (u32)(-1), so it is recorded unless its own key equals that sentinel.
//   src  - key/value records (presumably sorted by m_key; this kernel does not check)
//   dst  - table indexed by key value; entries for keys that never start a run are left untouched
//   nSrc - number of records in src
//   nDst - unused by this kernel, kept for signature compatibility with the host launcher
// NOTE(review): dst is indexed by the raw key with no range check; the caller must
// guarantee every key is a valid index into dst.
__attribute__((reqd_work_group_size(64,1,1)))
__kernel
void SearchSortDataLowerKernel(__global SortData* src, __global u32 *dst,
		unsigned int nSrc, unsigned int nDst)
{
	const u32 gIdx = GET_GLOBAL_IDX;
	if( gIdx >= nSrc )
	{
		return;
	}

	// Only the keys matter; avoid copying whole SortData records.
	const u32 prevKey = (gIdx==0)? (u32)(-1) : src[gIdx-1].m_key;
	const u32 curKey = src[gIdx].m_key;

	if( prevKey != curKey )
	{
		dst[curKey] = gIdx;
	}
}
// Upper-bound search: one work-item per element of src, using the 1-based index
// gIdx = global id + 1.
// Whenever src[gIdx-1].m_key differs from the key of the following element,
// gIdx is stored into dst[src[gIdx-1].m_key]. The last element is compared
// against the sentinel key nDst.
//   src  - key/value records (presumably sorted by m_key; this kernel does not check)
//   dst  - table indexed by key value; entries for keys that never end a run are left untouched
//   nSrc - number of records in src
//   nDst - sentinel key used as the "key after the last element"
// NOTE(review): dst is indexed by the raw key with no range check; the caller must
// guarantee every key is a valid index into dst.
__attribute__((reqd_work_group_size(64,1,1)))
__kernel
void SearchSortDataUpperKernel(__global SortData* src, __global u32 *dst,
		unsigned int nSrc, unsigned int nDst)
{
	const u32 gIdx = GET_GLOBAL_IDX+1;
	if( gIdx < nSrc+1 )
	{
		// Only the keys matter; avoid copying whole SortData records.
		const u32 curKey = src[gIdx-1].m_key;
		const u32 nextKey = (gIdx==nSrc)? nDst : src[gIdx].m_key;

		if( curKey != nextKey )
		{
			dst[curKey] = gIdx;
		}
	}
}
// Element-wise difference: C[i] = A[i] - B[i] for every i below nDst.
// nSrc is accepted but not used.
__attribute__((reqd_work_group_size(64,1,1)))
__kernel
void SubtractKernel(__global u32* A, __global u32 *B, __global u32 *C,
		unsigned int nSrc, unsigned int nDst)
{
	const int i = GET_GLOBAL_IDX;

	// Work-items past the end of the range have nothing to do.
	if( !(i < nDst) )
	{
		return;
	}

	const u32 lhs = A[i];
	const u32 rhs = B[i];
	C[i] = lhs - rhs;
}

View File

@@ -0,0 +1,110 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
// boundSearchKernelsCL holds OpenCL C source text as a single string literal. The text defines
// three kernels: SearchSortDataLowerKernel, SearchSortDataUpperKernel and SubtractKernel.
// Because the file is generated, change the kernel source it was produced from and
// re-run the stringify step rather than editing the literal by hand.
// NOTE(review): presumably generated from the BoundSearchKernels .cl file - confirm the exact input path.
static const char* boundSearchKernelsCL= \
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"\n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"\n"
"\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"\n"
"typedef struct\n"
"{\n"
" u32 m_key; \n"
" u32 m_value;\n"
"}SortData;\n"
"\n"
"\n"
"\n"
"typedef struct\n"
"{\n"
" u32 m_nSrc;\n"
" u32 m_nDst;\n"
" u32 m_padding[2];\n"
"} ConstBuffer;\n"
"\n"
"\n"
"\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"__kernel\n"
"void SearchSortDataLowerKernel(__global SortData* src, __global u32 *dst, \n"
" unsigned int nSrc, unsigned int nDst)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < nSrc )\n"
" {\n"
" SortData first; first.m_key = (u32)(-1); first.m_value = (u32)(-1);\n"
" SortData end; end.m_key = nDst; end.m_value = nDst;\n"
"\n"
" SortData iData = (gIdx==0)? first: src[gIdx-1];\n"
" SortData jData = (gIdx==nSrc)? end: src[gIdx];\n"
"\n"
" if( iData.m_key != jData.m_key )\n"
" {\n"
"// for(u32 k=iData.m_key+1; k<=min(jData.m_key, nDst-1); k++)\n"
" u32 k = jData.m_key;\n"
" {\n"
" dst[k] = gIdx;\n"
" }\n"
" }\n"
" }\n"
"}\n"
"\n"
"\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"__kernel\n"
"void SearchSortDataUpperKernel(__global SortData* src, __global u32 *dst, \n"
" unsigned int nSrc, unsigned int nDst)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX+1;\n"
"\n"
" if( gIdx < nSrc+1 )\n"
" {\n"
" SortData first; first.m_key = 0; first.m_value = 0;\n"
" SortData end; end.m_key = nDst; end.m_value = nDst;\n"
"\n"
" SortData iData = src[gIdx-1];\n"
" SortData jData = (gIdx==nSrc)? end: src[gIdx];\n"
"\n"
" if( iData.m_key != jData.m_key )\n"
" {\n"
" u32 k = iData.m_key;\n"
" {\n"
" dst[k] = gIdx;\n"
" }\n"
" }\n"
" }\n"
"}\n"
"\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"__kernel\n"
"void SubtractKernel(__global u32* A, __global u32 *B, __global u32 *C, \n"
" unsigned int nSrc, unsigned int nDst)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" \n"
"\n"
" if( gIdx < nDst )\n"
" {\n"
" C[gIdx] = A[gIdx] - B[gIdx];\n"
" }\n"
"}\n"
"\n"
"\n"
;

View File

@@ -0,0 +1,128 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
// Extension requests: a vendor (AMD) printf extension and 32-bit local-memory atomics.
// The copy kernels in this file call neither printf nor any atomic.
#pragma OPENCL EXTENSION cl_amd_printf : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
// Shorthand for the 32-bit unsigned type.
typedef unsigned int u32;
// Aliases for the OpenCL work-item built-ins; all of them query dimension 0 only.
#define GET_GROUP_IDX get_group_id(0)
#define GET_LOCAL_IDX get_local_id(0)
#define GET_GLOBAL_IDX get_global_id(0)
#define GET_GROUP_SIZE get_local_size(0)
// Local-memory barrier / fence and atomic-increment helpers (not used by the kernels in this file).
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)
#define AtomInc(x) atom_inc(&(x))
#define AtomInc1(x, out) out = atom_inc(&(x))
// Constructor-style spellings for OpenCL vector literals (not used by the kernels in this file).
#define make_uint4 (uint4)
#define make_uint2 (uint2)
#define make_int2 (int2)
// Argument block passed by value to every copy kernel below.
// m_n is the bound each kernel compares its element index against; m_padding is never read here.
typedef struct
{
int m_n;
int m_padding[3];
} ConstBuffer;
// Copies float4 elements from src to dst, one element per work-item.
// Work-items whose index is not below cb.m_n do nothing.
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
void Copy1F4Kernel(__global float4* dst, __global float4* src,
		ConstBuffer cb)
{
	const int i = GET_GLOBAL_IDX;
	if( i >= cb.m_n )
	{
		return;
	}

	dst[i] = src[i];
}
// Copies float4 elements from src to dst, two consecutive elements per work-item
// (work-item g handles elements 2*g and 2*g+1).
//   dst, src - destination / source buffers holding at least cb.m_n elements
//   cb.m_n   - number of elements to copy (exclusive upper bound on the element
//              index, as in Copy1F4Kernel)
// The previous guard `2*gIdx <= cb.m_n` let the work-item with 2*gIdx == m_n read
// and write elements m_n and m_n+1, which lie past the end of the range. The
// guard is now exclusive, and an odd m_n is handled by copying the single
// remaining element.
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
void Copy2F4Kernel(__global float4* dst, __global float4* src,
		ConstBuffer cb)
{
	const int gIdx = GET_GLOBAL_IDX;
	const int base = 2*gIdx;

	if( base+1 < cb.m_n )
	{
		// Full pair: load both values before storing, as the original did.
		float4 a0 = src[base+0];
		float4 a1 = src[base+1];

		dst[base+0] = a0;
		dst[base+1] = a1;
	}
	else if( base < cb.m_n )
	{
		// Odd element count: only the first element of the pair is in range.
		dst[base] = src[base];
	}
}
// Copies float4 elements from src to dst, four consecutive elements per work-item
// (work-item g handles elements 4*g .. 4*g+3).
//   dst, src - destination / source buffers holding at least cb.m_n elements
//   cb.m_n   - number of elements to copy (exclusive upper bound on the element
//              index, as in Copy1F4Kernel)
// The previous guard `4*gIdx <= cb.m_n` let the work-item with 4*gIdx == m_n read
// and write elements m_n .. m_n+3, which lie past the end of the range. The
// guard is now exclusive, and a count that is not a multiple of four is handled
// by copying the remaining elements one at a time.
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
void Copy4F4Kernel(__global float4* dst, __global float4* src,
		ConstBuffer cb)
{
	const int gIdx = GET_GLOBAL_IDX;
	const int base = 4*gIdx;

	if( base+3 < cb.m_n )
	{
		// Full group of four: load everything before storing, as the original did.
		float4 a0 = src[base+0];
		float4 a1 = src[base+1];
		float4 a2 = src[base+2];
		float4 a3 = src[base+3];

		dst[base+0] = a0;
		dst[base+1] = a1;
		dst[base+2] = a2;
		dst[base+3] = a3;
	}
	else
	{
		// Partial tail (or nothing at all when base >= m_n).
		for(int i=base; i<cb.m_n; i++)
		{
			dst[i] = src[i];
		}
	}
}
// Copies scalar floats from srcF1 to dstF1, one element per work-item.
// Work-items whose index is not below cb.m_n do nothing.
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
void CopyF1Kernel(__global float* dstF1, __global float* srcF1,
		ConstBuffer cb)
{
	const int i = GET_GLOBAL_IDX;
	if( i >= cb.m_n )
	{
		return;
	}

	dstF1[i] = srcF1[i];
}
// Copies float2 elements from srcF2 to dstF2, one element per work-item.
// Work-items whose index is not below cb.m_n do nothing.
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
void CopyF2Kernel(__global float2* dstF2, __global float2* srcF2,
		ConstBuffer cb)
{
	const int i = GET_GLOBAL_IDX;
	if( i >= cb.m_n )
	{
		return;
	}

	dstF2[i] = srcF2[i];
}

View File

@@ -0,0 +1,132 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
// copyKernelsCL holds OpenCL C source text as a single string literal. The text defines five
// kernels: Copy1F4Kernel, Copy2F4Kernel, Copy4F4Kernel, CopyF1Kernel and CopyF2Kernel.
// Because the file is generated, change the kernel source it was produced from and
// re-run the stringify step rather than editing the literal by hand.
// NOTE(review): Copy2F4Kernel and Copy4F4Kernel guard with `2*gIdx <= cb.m_n` / `4*gIdx <= cb.m_n`,
// while Copy1F4Kernel uses `gIdx < cb.m_n`. If m_n is an element count, the `<=` form lets one
// work-item touch elements at and beyond index m_n - confirm against the host launcher and fix
// in the kernel source, then regenerate.
static const char* copyKernelsCL= \
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"\n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"\n"
"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
"#define AtomInc(x) atom_inc(&(x))\n"
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
"\n"
"#define make_uint4 (uint4)\n"
"#define make_uint2 (uint2)\n"
"#define make_int2 (int2)\n"
"\n"
"typedef struct\n"
"{\n"
" int m_n;\n"
" int m_padding[3];\n"
"} ConstBuffer;\n"
"\n"
"\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void Copy1F4Kernel(__global float4* dst, __global float4* src, \n"
" ConstBuffer cb)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < cb.m_n )\n"
" {\n"
" float4 a0 = src[gIdx];\n"
"\n"
" dst[ gIdx ] = a0;\n"
" }\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void Copy2F4Kernel(__global float4* dst, __global float4* src, \n"
" ConstBuffer cb)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( 2*gIdx <= cb.m_n )\n"
" {\n"
" float4 a0 = src[gIdx*2+0];\n"
" float4 a1 = src[gIdx*2+1];\n"
"\n"
" dst[ gIdx*2+0 ] = a0;\n"
" dst[ gIdx*2+1 ] = a1;\n"
" }\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void Copy4F4Kernel(__global float4* dst, __global float4* src, \n"
" ConstBuffer cb)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( 4*gIdx <= cb.m_n )\n"
" {\n"
" int idx0 = gIdx*4+0;\n"
" int idx1 = gIdx*4+1;\n"
" int idx2 = gIdx*4+2;\n"
" int idx3 = gIdx*4+3;\n"
"\n"
" float4 a0 = src[idx0];\n"
" float4 a1 = src[idx1];\n"
" float4 a2 = src[idx2];\n"
" float4 a3 = src[idx3];\n"
"\n"
" dst[ idx0 ] = a0;\n"
" dst[ idx1 ] = a1;\n"
" dst[ idx2 ] = a2;\n"
" dst[ idx3 ] = a3;\n"
" }\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void CopyF1Kernel(__global float* dstF1, __global float* srcF1, \n"
" ConstBuffer cb)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < cb.m_n )\n"
" {\n"
" float a0 = srcF1[gIdx];\n"
"\n"
" dstF1[ gIdx ] = a0;\n"
" }\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void CopyF2Kernel(__global float2* dstF2, __global float2* srcF2, \n"
" ConstBuffer cb)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < cb.m_n )\n"
" {\n"
" float2 a0 = srcF2[gIdx];\n"
"\n"
" dstF2[ gIdx ] = a0;\n"
" }\n"
"}\n"
"\n"
"\n"
;

View File

@@ -0,0 +1,107 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
// Extension requests: a vendor (AMD) printf extension and 32-bit local-memory atomics.
// The fill kernels in this file call neither printf nor any atomic.
#pragma OPENCL EXTENSION cl_amd_printf : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
// Shorthand for the 32-bit unsigned type.
typedef unsigned int u32;
// Aliases for the OpenCL work-item built-ins; all of them query dimension 0 only.
#define GET_GROUP_IDX get_group_id(0)
#define GET_LOCAL_IDX get_local_id(0)
#define GET_GLOBAL_IDX get_global_id(0)
#define GET_GROUP_SIZE get_local_size(0)
// Local-memory barrier / fence and atomic-increment helpers (not used by the kernels in this file).
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)
#define AtomInc(x) atom_inc(&(x))
#define AtomInc1(x, out) out = atom_inc(&(x))
// Constructor-style spellings for OpenCL vector literals; only make_int2 is used below (FillInt2Kernel).
#define make_uint4 (uint4)
#define make_uint2 (uint2)
#define make_int2 (int2)
// Not referenced by the kernels in this file, which take their value/offset/count as plain arguments.
// NOTE(review): presumably kept to mirror a host-side constant buffer layout - confirm before removing.
typedef struct
{
union
{
int4 m_data;
uint4 m_unsignedData;
float m_floatData;
};
int m_offset;
int m_n;
int m_padding[2];
} ConstBuffer;
// Writes `value` into dstInt[offset .. offset+num_elements-1], one element per work-item.
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
void FillIntKernel(__global int* dstInt, int num_elements, int value, const int offset)
{
	const int i = GET_GLOBAL_IDX;
	if( i >= num_elements )
	{
		return;
	}

	dstInt[offset+i] = value;
}
// Writes `value` into dstFloat[offset .. offset+num_elements-1], one element per work-item.
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
void FillFloatKernel(__global float* dstFloat, int num_elements, float value, const int offset)
{
	const int i = GET_GLOBAL_IDX;
	if( i >= num_elements )
	{
		return;
	}

	dstFloat[offset+i] = value;
}
// Writes `value` into dstInt[offset .. offset+num-1], one element per work-item.
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
void FillUnsignedIntKernel(__global unsigned int* dstInt, const int num, const unsigned int value, const int offset)
{
	const int i = GET_GLOBAL_IDX;
	if( i >= num )
	{
		return;
	}

	dstInt[offset+i] = value;
}
// Writes the two-component `value` into dstInt2[offset .. offset+num-1], one element per work-item.
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
void FillInt2Kernel(__global int2* dstInt2, const int num, const int2 value, const int offset)
{
	const int i = GET_GLOBAL_IDX;
	if( i >= num )
	{
		return;
	}

	// Rebuild the vector component by component, exactly as the original did.
	const int2 filler = make_int2( value.x, value.y );
	dstInt2[i + offset] = filler;
}
// Writes the four-component `value` into dstInt4[offset .. offset+num-1], one element per work-item.
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
void FillInt4Kernel(__global int4* dstInt4, const int num, const int4 value, const int offset)
{
	const int i = GET_GLOBAL_IDX;
	if( i >= num )
	{
		return;
	}

	dstInt4[offset+i] = value;
}

View File

@@ -0,0 +1,111 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
// fillKernelsCL holds OpenCL C source text as a single string literal. The text defines five
// kernels: FillIntKernel, FillFloatKernel, FillUnsignedIntKernel, FillInt2Kernel and FillInt4Kernel.
// Each kernel writes one caller-supplied value into a destination range starting at `offset`.
// Because the file is generated, change the kernel source it was produced from and
// re-run the stringify step rather than editing the literal by hand.
static const char* fillKernelsCL= \
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"\n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"\n"
"\n"
"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
"#define AtomInc(x) atom_inc(&(x))\n"
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
"\n"
"#define make_uint4 (uint4)\n"
"#define make_uint2 (uint2)\n"
"#define make_int2 (int2)\n"
"\n"
"typedef struct\n"
"{\n"
" union\n"
" {\n"
" int4 m_data;\n"
" uint4 m_unsignedData;\n"
" float m_floatData;\n"
" };\n"
" int m_offset;\n"
" int m_n;\n"
" int m_padding[2];\n"
"} ConstBuffer;\n"
"\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void FillIntKernel(__global int* dstInt, int num_elements, int value, const int offset)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < num_elements )\n"
" {\n"
" dstInt[ offset+gIdx ] = value;\n"
" }\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void FillFloatKernel(__global float* dstFloat, int num_elements, float value, const int offset)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < num_elements )\n"
" {\n"
" dstFloat[ offset+gIdx ] = value;\n"
" }\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void FillUnsignedIntKernel(__global unsigned int* dstInt, const int num, const unsigned int value, const int offset)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < num )\n"
" {\n"
" dstInt[ offset+gIdx ] = value;\n"
" }\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void FillInt2Kernel(__global int2* dstInt2, const int num, const int2 value, const int offset)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < num )\n"
" {\n"
" dstInt2[ gIdx + offset] = make_int2( value.x, value.y );\n"
" }\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void FillInt4Kernel(__global int4* dstInt4, const int num, const int4 value, const int offset)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < num )\n"
" {\n"
" dstInt4[ offset+gIdx ] = value;\n"
" }\n"
"}\n"
"\n"
"\n"
;

View File

@@ -0,0 +1,154 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
// Shorthand for the 32-bit unsigned type used by every kernel in this file.
typedef unsigned int u32;
// Aliases for the OpenCL work-item built-ins; all of them query dimension 0 only.
#define GET_GROUP_IDX get_group_id(0)
#define GET_LOCAL_IDX get_local_id(0)
#define GET_GLOBAL_IDX get_global_id(0)
#define GET_GROUP_SIZE get_local_size(0)
// Work-group barrier with a local-memory fence.
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
// takahiro end
// Work-group size required by every kernel below (via reqd_work_group_size);
// LocalScanKernel and AddOffsetKernel process WG_SIZE*2 elements per group.
#define WG_SIZE 128
// The kernels receive their parameters as a uint4 named cb; these macros give the
// components readable names, so cb.m_numElems is cb.x, cb.m_numBlocks is cb.y and
// cb.m_numScanBlocks is cb.z. They replace the struct shown in the comment below.
// Caution: being plain macros, they rewrite these identifiers everywhere in this file.
#define m_numElems x
#define m_numBlocks y
#define m_numScanBlocks z
/*typedef struct
{
uint m_numElems;
uint m_numBlocks;
uint m_numScanBlocks;
uint m_padding[1];
} ConstBuffer;
*/
// Work-group cooperative, in-place exclusive prefix sum over data[0 .. n-1] in local memory.
//   data  - local-memory array of n values, overwritten with the exclusive scan
//   n     - number of entries to scan (assumes n is a power of two - TODO confirm;
//           the halving/doubling index scheme below only covers every entry in that case)
//   lIdx  - calling work-item's local index
//   lSize - stride each work-item uses to walk the active index range (callers pass the group size)
// Returns the total of all n input values, but only for the work-item with lIdx == 0;
// every other work-item returns an uninitialized value, so callers must only use the
// result on that work-item.
// Contains barriers, so every work-item of the group must call it.
u32 ScanExclusive(__local u32* data, u32 n, int lIdx, int lSize)
{
u32 blocksum;
int offset = 1;
// Up-sweep (reduction): each pass adds pairs that are `offset` apart, halving the
// number of active pairs until data[n-1] holds the sum of all entries.
for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)
{
// Make the previous pass's writes visible before reading them.
GROUP_LDS_BARRIER;
for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)
{
int ai = offset*(2*iIdx+1)-1;
int bi = offset*(2*iIdx+2)-1;
data[bi] += data[ai];
}
}
GROUP_LDS_BARRIER;
// One work-item captures the grand total and clears the last entry, which seeds
// the exclusive scan for the down-sweep.
if( lIdx == 0 )
{
blocksum = data[ n-1 ];
data[ n-1 ] = 0;
}
GROUP_LDS_BARRIER;
// After the up-sweep `offset` has been doubled once too often; step back to the widest stride.
offset >>= 1;
// Down-sweep: each pass swaps the left value into place and pushes the accumulated
// prefix to the right, doubling the number of active pairs as the stride shrinks.
for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )
{
GROUP_LDS_BARRIER;
for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )
{
int ai = offset*(2*iIdx+1)-1;
int bi = offset*(2*iIdx+2)-1;
u32 temp = data[ai];
data[ai] = data[bi];
data[bi] += temp;
}
}
// Ensure the final values are visible to the whole group before returning.
GROUP_LDS_BARRIER;
return blocksum;
}
// Per-group exclusive scan: each work-group scans WG_SIZE*2 consecutive elements
// of src (two per work-item), writes the scanned values to dst and stores the
// group's total in sumBuffer[group index]. Elements at or beyond cb.m_numElems
// are treated as 0 on input and are not written on output.
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
__kernel
void LocalScanKernel(__global u32* dst, __global u32 *src, __global u32 *sumBuffer,
		uint4 cb)
{
	__local u32 ldsData[WG_SIZE*2];

	const int gIdx = GET_GLOBAL_IDX;
	const int lIdx = GET_LOCAL_IDX;

	// Global and local positions of the two elements owned by this work-item.
	const int globalEven = 2*gIdx;
	const int globalOdd = 2*gIdx + 1;
	const int localEven = 2*lIdx;
	const int localOdd = 2*lIdx + 1;

	if( globalEven < cb.m_numElems )
	{
		ldsData[localEven] = src[globalEven];
	}
	else
	{
		ldsData[localEven] = 0;
	}
	if( globalOdd < cb.m_numElems )
	{
		ldsData[localOdd] = src[globalOdd];
	}
	else
	{
		ldsData[localOdd] = 0;
	}

	// Every work-item must take part: ScanExclusive synchronises the group internally.
	u32 groupTotal = ScanExclusive(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);

	// Only the first work-item of the group holds a valid total.
	if( lIdx == 0 )
	{
		sumBuffer[GET_GROUP_IDX] = groupTotal;
	}

	if( globalEven < cb.m_numElems )
	{
		dst[globalEven] = ldsData[localEven];
	}
	if( globalOdd < cb.m_numElems )
	{
		dst[globalOdd] = ldsData[localOdd];
	}
}
// Adds blockSum[b] to every element of block b in dst, where a block is WG_SIZE*2
// consecutive elements and work-group g handles block g+1 (block 0 is never touched).
// The last block is clipped at cb.m_numElems.
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
__kernel
void AddOffsetKernel(__global u32 *dst, __global u32 *blockSum, uint4 cb)
{
	const u32 elemsPerBlock = WG_SIZE*2;

	const int block = GET_GROUP_IDX+1;
	const int lane = GET_LOCAL_IDX;

	const u32 carry = blockSum[block];

	// One past the last element of this block, clipped to the array length.
	const int blockEnd = min((block+1)*(elemsPerBlock), cb.m_numElems);

	int i = block*elemsPerBlock+lane;
	while( i<blockEnd )
	{
		dst[i] += carry;
		i += GET_GROUP_SIZE;
	}
}
// In-place exclusive scan of the first cb.m_numBlocks entries of dst, done by a single
// cooperating work-group through a 2048-entry local buffer. The scan runs over
// cb.m_numScanBlocks entries (entries at or beyond m_numBlocks are zero-filled), and
// the grand total is written to dst[cb.m_numBlocks] by the work-item with global id 0.
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
__kernel
void TopLevelScanKernel(__global u32* dst, uint4 cb)
{
	__local u32 ldsData[2048];

	const int gIdx = GET_GLOBAL_IDX;
	const int lIdx = GET_LOCAL_IDX;
	const int stride = GET_GROUP_SIZE;

	// Stage the input in local memory, padding with zeros up to the scan length.
	for(int i=lIdx; i<cb.m_numScanBlocks; i+=stride )
	{
		if( i<cb.m_numBlocks )
		{
			ldsData[i] = dst[i];
		}
		else
		{
			ldsData[i] = 0;
		}
	}

	GROUP_LDS_BARRIER;

	u32 total = ScanExclusive(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);

	// Copy the scanned values back; the zero padding is not written out.
	for(int i=lIdx; i<cb.m_numBlocks; i+=stride )
	{
		dst[i] = ldsData[i];
	}

	if( gIdx == 0 )
	{
		dst[cb.m_numBlocks] = total;
	}
}

View File

@@ -0,0 +1,158 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
// prefixScanKernelsCL holds OpenCL C source text as a single string literal. The text defines the
// helper ScanExclusive and three kernels: LocalScanKernel, AddOffsetKernel and TopLevelScanKernel.
// All three kernels require a work-group size of WG_SIZE (128), and the kernel parameters arrive
// as a uint4 whose x/y/z components are named through the m_numElems / m_numBlocks /
// m_numScanBlocks macros inside the string.
// Because the file is generated, change the kernel source it was produced from and
// re-run the stringify step rather than editing the literal by hand.
static const char* prefixScanKernelsCL= \
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"\n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"\n"
"\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"\n"
"// takahiro end\n"
"#define WG_SIZE 128 \n"
"#define m_numElems x\n"
"#define m_numBlocks y\n"
"#define m_numScanBlocks z\n"
"\n"
"/*typedef struct\n"
"{\n"
" uint m_numElems;\n"
" uint m_numBlocks;\n"
" uint m_numScanBlocks;\n"
" uint m_padding[1];\n"
"} ConstBuffer;\n"
"*/\n"
"\n"
"u32 ScanExclusive(__local u32* data, u32 n, int lIdx, int lSize)\n"
"{\n"
" u32 blocksum;\n"
" int offset = 1;\n"
" for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)\n"
" {\n"
" GROUP_LDS_BARRIER;\n"
" for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)\n"
" {\n"
" int ai = offset*(2*iIdx+1)-1;\n"
" int bi = offset*(2*iIdx+2)-1;\n"
" data[bi] += data[ai];\n"
" }\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" if( lIdx == 0 )\n"
" {\n"
" blocksum = data[ n-1 ];\n"
" data[ n-1 ] = 0;\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" offset >>= 1;\n"
" for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )\n"
" {\n"
" GROUP_LDS_BARRIER;\n"
" for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )\n"
" {\n"
" int ai = offset*(2*iIdx+1)-1;\n"
" int bi = offset*(2*iIdx+2)-1;\n"
" u32 temp = data[ai];\n"
" data[ai] = data[bi];\n"
" data[bi] += temp;\n"
" }\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" return blocksum;\n"
"}\n"
"\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void LocalScanKernel(__global u32* dst, __global u32 *src, __global u32 *sumBuffer,\n"
" uint4 cb)\n"
"{\n"
" __local u32 ldsData[WG_SIZE*2];\n"
"\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" int lIdx = GET_LOCAL_IDX;\n"
"\n"
" ldsData[2*lIdx] = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0;\n"
" ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0;\n"
"\n"
" u32 sum = ScanExclusive(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
"\n"
" if( lIdx == 0 ) sumBuffer[GET_GROUP_IDX] = sum;\n"
"\n"
" if( (2*gIdx) < cb.m_numElems )\n"
" {\n"
" dst[2*gIdx] = ldsData[2*lIdx];\n"
" }\n"
" if( (2*gIdx + 1) < cb.m_numElems )\n"
" {\n"
" dst[2*gIdx + 1] = ldsData[2*lIdx + 1];\n"
" }\n"
"}\n"
"\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void AddOffsetKernel(__global u32 *dst, __global u32 *blockSum, uint4 cb)\n"
"{\n"
" const u32 blockSize = WG_SIZE*2;\n"
"\n"
" int myIdx = GET_GROUP_IDX+1;\n"
" int lIdx = GET_LOCAL_IDX;\n"
"\n"
" u32 iBlockSum = blockSum[myIdx];\n"
"\n"
" int endValue = min((myIdx+1)*(blockSize), cb.m_numElems);\n"
" for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE)\n"
" {\n"
" dst[i] += iBlockSum;\n"
" }\n"
"}\n"
"\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void TopLevelScanKernel(__global u32* dst, uint4 cb)\n"
"{\n"
" __local u32 ldsData[2048];\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" int lIdx = GET_LOCAL_IDX;\n"
" int lSize = GET_GROUP_SIZE;\n"
"\n"
" for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize )\n"
" {\n"
" ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0;\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" u32 sum = ScanExclusive(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
"\n"
" for(int i=lIdx; i<cb.m_numBlocks; i+=lSize )\n"
" {\n"
" dst[i] = ldsData[i];\n"
" }\n"
"\n"
" if( gIdx == 0 )\n"
" {\n"
" dst[cb.m_numBlocks] = sum;\n"
" }\n"
"}\n"
"\n"
;

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,40 @@
#ifndef B3_CONFIG_H
#define B3_CONFIG_H
/// Capacity limits handed to the GPU pipeline objects when they are constructed.
/// The default constructor fills in every limit; callers may overwrite individual
/// members afterwards.
struct b3Config
{
	int	m_maxConvexBodies;
	int	m_maxConvexShapes;
	int	m_maxBroadphasePairs;		///< defaults to 16 * m_maxConvexBodies
	int	m_maxContactCapacity;		///< defaults to m_maxBroadphasePairs

	int	m_maxVerticesPerFace;
	int	m_maxFacesPerShape;
	int	m_maxConvexVertices;
	int	m_maxConvexIndices;
	int	m_maxConvexUniqueEdges;

	int	m_maxCompoundChildShapes;

	int	m_maxTriConvexPairCapacity;	///< 256K by default (512K was used previously)

	/// Sets the default capacities. Members are initialised in declaration
	/// order, so the two derived limits can read the values set before them.
	b3Config()
		: m_maxConvexBodies(32*1024),
		  m_maxConvexShapes(8192),
		  m_maxBroadphasePairs(16*m_maxConvexBodies),
		  m_maxContactCapacity(m_maxBroadphasePairs),
		  m_maxVerticesPerFace(64),
		  m_maxFacesPerShape(64),
		  m_maxConvexVertices(8192),
		  m_maxConvexIndices(8192),
		  m_maxConvexUniqueEdges(8192),
		  m_maxCompoundChildShapes(8192),
		  m_maxTriConvexPairCapacity(256*1024)
	{
	}
};
#endif//B3_CONFIG_H

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,40 @@
#ifndef B3_GPU_BATCHING_PGS_SOLVER_H
#define B3_GPU_BATCHING_PGS_SOLVER_H
#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
#include "Bullet3Collision/NarrowPhaseCollision/b3RigidBodyCL.h"
#include "Bullet3Collision/NarrowPhaseCollision/b3Contact4.h"
#include "b3GpuConstraint4.h"
/// OpenCL contact solver front-end. Only the declaration is visible in this header;
/// all state lives behind the opaque m_data pointer.
/// NOTE(review): the class holds a raw pointer and declares a destructor but no copy
/// constructor or assignment operator - confirm that instances are never copied.
class b3GpuBatchingPgsSolver
{
protected:
// Opaque implementation data; the struct is only forward-declared here.
struct b3GpuBatchingPgsSolverInternalData* m_data;
// Batching step over the first nContacts entries of `contacts`.
// NOTE(review): the exact roles of n, offsets and staticIdx are not visible in this header - confirm in the .cpp.
void batchContacts( b3OpenCLArray<b3Contact4>* contacts, int nContacts, b3OpenCLArray<unsigned int>* n, b3OpenCLArray<unsigned int>* offsets, int staticIdx );
// Three variants of a host-side batch sort over an array of n contacts.
// They are declared inline but not defined in this header, so they are presumably
// only usable from the translation unit that defines them - confirm.
inline int sortConstraintByBatch( b3Contact4* cs, int n, int simdWidth , int staticIdx, int numBodies);
inline int sortConstraintByBatch2( b3Contact4* cs, int n, int simdWidth , int staticIdx, int numBodies);
inline int sortConstraintByBatch3( b3Contact4* cs, int n, int simdWidth , int staticIdx, int numBodies);
// Runs the solver over n constraints using the given body and inertia buffers;
// maxNumBatches and numIterations presumably bound the work done - confirm in the .cpp.
void solveContactConstraint( const b3OpenCLArray<b3RigidBodyCL>* bodyBuf, const b3OpenCLArray<b3InertiaCL>* shapeBuf,
b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n ,int maxNumBatches, int numIterations);
public:
// Binds the solver to an OpenCL context, device and command queue.
// pairCapacity presumably sizes internal buffers - confirm in the .cpp.
b3GpuBatchingPgsSolver(cl_context ctx,cl_device_id device, cl_command_queue q,int pairCapacity);
virtual ~b3GpuBatchingPgsSolver();
// Public entry point: solves numContacts contacts for numBodies bodies.
// The body, inertia and contact buffers are passed as raw cl_mem handles.
void solveContacts(int numBodies, cl_mem bodyBuf, cl_mem inertiaBuf, int numContacts, cl_mem contactBuf, const struct b3Config& config);
};
#endif //B3_GPU_BATCHING_PGS_SOLVER_H

View File

@@ -0,0 +1,29 @@
#ifndef B3_CONSTRAINT4_h
#define B3_CONSTRAINT4_h
#include "Bullet3Common/b3Vector3.h"
// Solver constraint record holding up to four entries per constraint (the per-point
// arrays below all have four elements). Declared 16-byte aligned through
// B3_ATTRIBUTE_ALIGNED16.
// NOTE(review): presumably mirrored by an identically laid-out struct in the OpenCL
// solver kernels - do not reorder or resize members without checking the device side.
B3_ATTRIBUTE_ALIGNED16(struct) b3GpuConstraint4
{
B3_DECLARE_ALIGNED_ALLOCATOR();
// Presumably the constraint (contact) normal - TODO confirm.
// The fourth component ([3]) is reused to store the friction coefficient, see the accessors below.
b3Vector3 m_linear;//normal?
b3Vector3 m_worldPos[4];
b3Vector3 m_center; // friction
float m_jacCoeffInv[4];
float m_b[4];
// "Rambda" presumably means lambda, i.e. the applied impulse - TODO confirm.
float m_appliedRambdaDt[4];
float m_fJacCoeffInv[2]; // friction
float m_fAppliedRambdaDt[2]; // friction
// Indices of the two bodies this constraint acts on.
unsigned int m_bodyA;
unsigned int m_bodyB;
int m_batchIdx;
unsigned int m_paddings;
// The friction coefficient is stored in the fourth component of m_linear.
inline void setFrictionCoeff(float value) { m_linear[3] = value; }
inline float getFrictionCoeff() const { return m_linear[3]; }
};
#endif //B3_CONSTRAINT4_h

View File

@@ -0,0 +1,901 @@
#include "b3GpuNarrowPhase.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
#include "Bullet3OpenCL/NarrowphaseCollision/b3ConvexPolyhedronCL.h"
#include "Bullet3OpenCL/NarrowphaseCollision/b3ConvexHullContact.h"
#include "Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h"
#include <string.h>
#include "b3Config.h"
#include "Bullet3OpenCL/NarrowphaseCollision/b3OptimizedBvh.h"
#include "Bullet3OpenCL/NarrowphaseCollision/b3TriangleIndexVertexArray.h"
#include "Bullet3Geometry/b3AabbUtil.h"
#include "Bullet3OpenCL/NarrowphaseCollision/b3BvhInfo.h"
// Private state of b3GpuNarrowPhase. Most data appears in pairs: a host-side
// b3AlignedObjectArray (CPU suffix or prefix) and a device-side b3OpenCLArray (GPU
// suffix), presumably mirrors of each other - confirm in the upload/download code.
// NOTE(review): the b3GpuNarrowPhase constructor zero-fills this whole struct with
// memset() immediately after `new`. That also overwrites the by-value
// b3AlignedObjectArray and b3Config members after their constructors have run -
// confirm this is safe for those types.
struct b3GpuNarrowPhaseInternalData
{
// Convex shape data (host side).
b3AlignedObjectArray<b3ConvexUtility*>* m_convexData;
b3AlignedObjectArray<b3ConvexPolyhedronCL> m_convexPolyhedra;
b3AlignedObjectArray<b3Vector3> m_uniqueEdges;
b3AlignedObjectArray<b3Vector3> m_convexVertices;
b3AlignedObjectArray<int> m_convexIndices;
// Convex shape data (device side).
b3OpenCLArray<b3ConvexPolyhedronCL>* m_convexPolyhedraGPU;
b3OpenCLArray<b3Vector3>* m_uniqueEdgesGPU;
b3OpenCLArray<b3Vector3>* m_convexVerticesGPU;
b3OpenCLArray<int>* m_convexIndicesGPU;
// Device buffers that are presumably scratch space for the clipping stage - TODO confirm.
b3OpenCLArray<b3Vector3>* m_worldVertsB1GPU;
b3OpenCLArray<b3Int4>* m_clippingFacesOutGPU;
b3OpenCLArray<b3Vector3>* m_worldNormalsAGPU;
b3OpenCLArray<b3Vector3>* m_worldVertsA1GPU;
b3OpenCLArray<b3Vector3>* m_worldVertsB2GPU;
// Compound child shapes.
b3AlignedObjectArray<b3GpuChildShape> m_cpuChildShapes;
b3OpenCLArray<b3GpuChildShape>* m_gpuChildShapes;
// Convex faces.
b3AlignedObjectArray<b3GpuFace> m_convexFaces;
b3OpenCLArray<b3GpuFace>* m_convexFacesGPU;
// Collision object, created in the b3GpuNarrowPhase constructor.
GpuSatCollision* m_gpuSatCollision;
// Pair buffers; the constructor sizes them to config.m_maxBroadphasePairs.
b3AlignedObjectArray<b3Int2>* m_pBufPairsCPU;
b3OpenCLArray<b3Int2>* m_convexPairsOutGPU;
b3OpenCLArray<b3Int2>* m_planePairs;
// Contact output buffers.
b3OpenCLArray<b3Contact4>* m_pBufContactOutGPU;
b3AlignedObjectArray<b3Contact4>* m_pBufContactOutCPU;
// Rigid bodies and inertias; the constructor sizes the host arrays to config.m_maxConvexBodies.
b3AlignedObjectArray<b3RigidBodyCL>* m_bodyBufferCPU;
b3OpenCLArray<b3RigidBodyCL>* m_bodyBufferGPU;
b3AlignedObjectArray<b3InertiaCL>* m_inertiaBufferCPU;
b3OpenCLArray<b3InertiaCL>* m_inertiaBufferGPU;
// Counters; presumably the number of shapes / bodies registered so far - TODO confirm.
int m_numAcceleratedShapes;
int m_numAcceleratedRigidBodies;
// Collidables and their local-space AABBs.
b3AlignedObjectArray<b3Collidable> m_collidablesCPU;
b3OpenCLArray<b3Collidable>* m_collidablesGPU;
b3OpenCLArray<b3SapAabb>* m_localShapeAABBGPU;
b3AlignedObjectArray<b3SapAabb>* m_localShapeAABBCPU;
// BVH data (host side).
b3AlignedObjectArray<class b3OptimizedBvh*> m_bvhData;
b3AlignedObjectArray<b3QuantizedBvhNode> m_treeNodesCPU;
b3AlignedObjectArray<b3BvhSubtreeInfo> m_subTreesCPU;
b3AlignedObjectArray<b3BvhInfo> m_bvhInfoCPU;
// BVH data (device side).
b3OpenCLArray<b3BvhInfo>* m_bvhInfoGPU;
b3OpenCLArray<b3QuantizedBvhNode>* m_treeNodesGPU;
b3OpenCLArray<b3BvhSubtreeInfo>* m_subTreesGPU;
// Copy of the configuration passed to the b3GpuNarrowPhase constructor.
b3Config m_config;
};
/// Creates the narrow phase: allocates the private data block and every host/device buffer,
/// sized from the capacities in @p config.
/// @param ctx     OpenCL context used for all device buffers.
/// @param device  OpenCL device handed to the SAT collision backend.
/// @param queue   Command queue used for all device buffers.
/// @param config  Capacity limits; a copy is stored in the private data.
b3GpuNarrowPhase::b3GpuNarrowPhase(cl_context ctx, cl_device_id device, cl_command_queue queue, const b3Config& config)
:m_data(0),
m_acceleratedCompanionShapeIndex(-1),
m_planeBodyIndex(-1),
m_static0Index(-1),
m_context(ctx),
m_device(device),
m_queue(queue)
{
	// No memset over the data block: it contains non-trivial members (b3AlignedObjectArray,
	// b3Config) whose constructed state must not be overwritten byte-wise. Every pointer and
	// counter member is assigned explicitly below instead.
	m_data = new b3GpuNarrowPhaseInternalData();
	m_data->m_config = config;

	m_data->m_gpuSatCollision = new GpuSatCollision(ctx,device,queue);

	// broadphase pair buffers
	m_data->m_pBufPairsCPU = new b3AlignedObjectArray<b3Int2>;
	m_data->m_pBufPairsCPU->resize(config.m_maxBroadphasePairs);
	m_data->m_convexPairsOutGPU = new b3OpenCLArray<b3Int2>(ctx,queue,config.m_maxBroadphasePairs,false);
	m_data->m_planePairs = new b3OpenCLArray<b3Int2>(ctx,queue,config.m_maxBroadphasePairs,false);

	// host-side contact, body and inertia buffers
	m_data->m_pBufContactOutCPU = new b3AlignedObjectArray<b3Contact4>();
	m_data->m_pBufContactOutCPU->resize(config.m_maxBroadphasePairs);
	m_data->m_bodyBufferCPU = new b3AlignedObjectArray<b3RigidBodyCL>();
	m_data->m_bodyBufferCPU->resize(config.m_maxConvexBodies);
	m_data->m_inertiaBufferCPU = new b3AlignedObjectArray<b3InertiaCL>();
	m_data->m_inertiaBufferCPU->resize(config.m_maxConvexBodies);

	// device-side contact, inertia, collidable and local AABB buffers
	m_data->m_pBufContactOutGPU = new b3OpenCLArray<b3Contact4>(ctx,queue, config.m_maxContactCapacity,true);
	m_data->m_inertiaBufferGPU = new b3OpenCLArray<b3InertiaCL>(ctx,queue,config.m_maxConvexBodies,false);
	m_data->m_collidablesGPU = new b3OpenCLArray<b3Collidable>(ctx,queue,config.m_maxConvexShapes);
	m_data->m_localShapeAABBCPU = new b3AlignedObjectArray<b3SapAabb>;
	m_data->m_localShapeAABBGPU = new b3OpenCLArray<b3SapAabb>(ctx,queue,config.m_maxConvexShapes);

	// device-side body and shape tables
	m_data->m_bodyBufferGPU = new b3OpenCLArray<b3RigidBodyCL>(ctx,queue, config.m_maxConvexBodies,false);
	m_data->m_convexFacesGPU = new b3OpenCLArray<b3GpuFace>(ctx,queue,config.m_maxConvexShapes*config.m_maxFacesPerShape,false);
	m_data->m_gpuChildShapes = new b3OpenCLArray<b3GpuChildShape>(ctx,queue,config.m_maxCompoundChildShapes,false);
	m_data->m_convexPolyhedraGPU = new b3OpenCLArray<b3ConvexPolyhedronCL>(ctx,queue,config.m_maxConvexShapes,false);
	m_data->m_uniqueEdgesGPU = new b3OpenCLArray<b3Vector3>(ctx,queue,config.m_maxConvexUniqueEdges,true);
	m_data->m_convexVerticesGPU = new b3OpenCLArray<b3Vector3>(ctx,queue,config.m_maxConvexVertices,true);
	m_data->m_convexIndicesGPU = new b3OpenCLArray<int>(ctx,queue,config.m_maxConvexIndices,true);

	// scratch buffers passed to the SAT collision backend by computeContacts()
	m_data->m_worldVertsB1GPU = new b3OpenCLArray<b3Vector3>(ctx,queue,config.m_maxConvexBodies*config.m_maxVerticesPerFace);
	m_data->m_clippingFacesOutGPU = new b3OpenCLArray<b3Int4>(ctx,queue,config.m_maxConvexBodies);
	m_data->m_worldNormalsAGPU = new b3OpenCLArray<b3Vector3>(ctx,queue,config.m_maxConvexBodies);
	m_data->m_worldVertsA1GPU = new b3OpenCLArray<b3Vector3>(ctx,queue,config.m_maxConvexBodies*config.m_maxVerticesPerFace);
	m_data->m_worldVertsB2GPU = new b3OpenCLArray<b3Vector3>(ctx,queue,config.m_maxConvexBodies*config.m_maxVerticesPerFace);

	// host-side shape tables
	m_data->m_convexData = new b3AlignedObjectArray<b3ConvexUtility* >();
	m_data->m_convexData->resize(config.m_maxConvexShapes);
	m_data->m_convexPolyhedra.resize(config.m_maxConvexShapes);

	m_data->m_numAcceleratedShapes = 0;
	m_data->m_numAcceleratedRigidBodies = 0;

	// BVH buffers start empty and grow as concave meshes are registered
	m_data->m_subTreesGPU = new b3OpenCLArray<b3BvhSubtreeInfo>(this->m_context,this->m_queue);
	m_data->m_treeNodesGPU = new b3OpenCLArray<b3QuantizedBvhNode>(this->m_context,this->m_queue);
	m_data->m_bvhInfoGPU = new b3OpenCLArray<b3BvhInfo>(this->m_context,this->m_queue);
}
/// Releases every buffer allocated by the constructor, plus the BVHs created by
/// registerConcaveMesh().
b3GpuNarrowPhase::~b3GpuNarrowPhase()
{
	delete m_data->m_gpuSatCollision;
	delete m_data->m_pBufPairsCPU;
	delete m_data->m_convexPairsOutGPU;
	delete m_data->m_planePairs;
	delete m_data->m_pBufContactOutCPU;
	delete m_data->m_bodyBufferCPU;
	delete m_data->m_inertiaBufferCPU;
	delete m_data->m_pBufContactOutGPU;
	delete m_data->m_inertiaBufferGPU;
	delete m_data->m_collidablesGPU;
	delete m_data->m_localShapeAABBCPU;
	delete m_data->m_localShapeAABBGPU;
	delete m_data->m_bodyBufferGPU;
	delete m_data->m_convexFacesGPU;
	delete m_data->m_gpuChildShapes;
	delete m_data->m_convexPolyhedraGPU;
	delete m_data->m_uniqueEdgesGPU;
	delete m_data->m_convexVerticesGPU;
	delete m_data->m_convexIndicesGPU;
	delete m_data->m_worldVertsB1GPU;
	delete m_data->m_clippingFacesOutGPU;
	delete m_data->m_worldNormalsAGPU;
	delete m_data->m_worldVertsA1GPU;
	delete m_data->m_worldVertsB2GPU;
	delete m_data->m_bvhInfoGPU;
	delete m_data->m_treeNodesGPU;
	delete m_data->m_subTreesGPU;

	// The BVH objects are only ever created with new inside registerConcaveMesh(),
	// so they are owned here and must be freed to avoid leaking one per concave mesh.
	for (int i=0;i<m_data->m_bvhData.size();i++)
	{
		delete m_data->m_bvhData[i];
	}
	m_data->m_bvhData.clear();

	// Only the pointer array is deleted: its entries may be caller-supplied b3ConvexUtility
	// objects (public registerConvexHullShape overload), so their ownership is not assumed.
	delete m_data->m_convexData;
	delete m_data;
}
int b3GpuNarrowPhase::allocateCollidable()
{
int curSize = m_data->m_collidablesCPU.size();
m_data->m_collidablesCPU.expand();
return curSize;
}
int b3GpuNarrowPhase::registerSphereShape(float radius)
{
int collidableIndex = allocateCollidable();
b3Collidable& col = getCollidableCpu(collidableIndex);
col.m_shapeType = SHAPE_SPHERE;
col.m_shapeIndex = 0;
col.m_radius = radius;
if (col.m_shapeIndex>=0)
{
b3SapAabb aabb;
b3Vector3 myAabbMin(-radius,-radius,-radius);
b3Vector3 myAabbMax(radius,radius,radius);
aabb.m_min[0] = myAabbMin[0];//s_convexHeightField->m_aabb.m_min.x;
aabb.m_min[1] = myAabbMin[1];//s_convexHeightField->m_aabb.m_min.y;
aabb.m_min[2] = myAabbMin[2];//s_convexHeightField->m_aabb.m_min.z;
aabb.m_minIndices[3] = 0;
aabb.m_max[0] = myAabbMax[0];//s_convexHeightField->m_aabb.m_max.x;
aabb.m_max[1] = myAabbMax[1];//s_convexHeightField->m_aabb.m_max.y;
aabb.m_max[2] = myAabbMax[2];//s_convexHeightField->m_aabb.m_max.z;
aabb.m_signedMaxIndices[3] = 0;
m_data->m_localShapeAABBCPU->push_back(aabb);
m_data->m_localShapeAABBGPU->push_back(aabb);
clFinish(m_queue);
}
return collidableIndex;
}
int b3GpuNarrowPhase::registerFace(const b3Vector3& faceNormal, float faceConstant)
{
int faceOffset = m_data->m_convexFaces.size();
b3GpuFace& face = m_data->m_convexFaces.expand();
face.m_plane[0] = faceNormal.getX();
face.m_plane[1] = faceNormal.getY();
face.m_plane[2] = faceNormal.getZ();
face.m_plane[3] = faceConstant;
m_data->m_convexFacesGPU->copyFromHost(m_data->m_convexFaces);
return faceOffset;
}
int b3GpuNarrowPhase::registerPlaneShape(const b3Vector3& planeNormal, float planeConstant)
{
int collidableIndex = allocateCollidable();
b3Collidable& col = getCollidableCpu(collidableIndex);
col.m_shapeType = SHAPE_PLANE;
col.m_shapeIndex = registerFace(planeNormal,planeConstant);
col.m_radius = planeConstant;
if (col.m_shapeIndex>=0)
{
b3SapAabb aabb;
aabb.m_min[0] = -1e30f;
aabb.m_min[1] = -1e30f;
aabb.m_min[2] = -1e30f;
aabb.m_minIndices[3] = 0;
aabb.m_max[0] = 1e30f;
aabb.m_max[1] = 1e30f;
aabb.m_max[2] = 1e30f;
aabb.m_signedMaxIndices[3] = 0;
m_data->m_localShapeAABBCPU->push_back(aabb);
m_data->m_localShapeAABBGPU->push_back(aabb);
clFinish(m_queue);
}
return collidableIndex;
}
/// Appends the hull described by @p convexPtr to the shared edge/face/index/vertex tables,
/// records the offsets in a new b3ConvexPolyhedronCL entry, re-uploads all tables to the
/// device and returns the accelerated shape index. @p col is not used by this function.
int b3GpuNarrowPhase::registerConvexHullShape(b3ConvexUtility* convexPtr,b3Collidable& col)
{
	const int shapeSlot = m_data->m_numAcceleratedShapes;
	m_data->m_convexData->resize(shapeSlot+1);
	m_data->m_convexPolyhedra.resize(shapeSlot+1);

	b3ConvexPolyhedronCL& poly = m_data->m_convexPolyhedra.at(m_data->m_convexPolyhedra.size()-1);
	poly.mC = convexPtr->mC;
	poly.mE = convexPtr->mE;
	poly.m_extents= convexPtr->m_extents;
	poly.m_localCenter = convexPtr->m_localCenter;
	poly.m_radius = convexPtr->m_radius;

	// unique edges
	poly.m_numUniqueEdges = convexPtr->m_uniqueEdges.size();
	const int firstEdge = m_data->m_uniqueEdges.size();
	poly.m_uniqueEdgesOffset = firstEdge;
	m_data->m_uniqueEdges.resize(firstEdge+poly.m_numUniqueEdges);
	for (int e=0;e<convexPtr->m_uniqueEdges.size();e++)
	{
		m_data->m_uniqueEdges[firstEdge+e] = convexPtr->m_uniqueEdges[e];
	}

	// faces, each with its own run of vertex indices
	const int firstFace = m_data->m_convexFaces.size();
	poly.m_faceOffset = firstFace;
	poly.m_numFaces = convexPtr->m_faces.size();
	m_data->m_convexFaces.resize(firstFace+poly.m_numFaces);
	for (int f=0;f<convexPtr->m_faces.size();f++)
	{
		b3GpuFace& dstFace = m_data->m_convexFaces[poly.m_faceOffset+f];
		for (int k=0;k<4;k++)
		{
			dstFace.m_plane[k] = convexPtr->m_faces[f].m_plane[k];
		}

		const int firstIndex = m_data->m_convexIndices.size();
		const int indexCount = convexPtr->m_faces[f].m_indices.size();
		dstFace.m_numIndices = indexCount;
		dstFace.m_indexOffset = firstIndex;
		m_data->m_convexIndices.resize(firstIndex+indexCount);
		for (int p=0;p<indexCount;p++)
		{
			m_data->m_convexIndices[firstIndex+p] = convexPtr->m_faces[f].m_indices[p];
		}
	}

	// vertices
	poly.m_numVertices = convexPtr->m_vertices.size();
	const int firstVertex = m_data->m_convexVertices.size();
	poly.m_vertexOffset = firstVertex;
	m_data->m_convexVertices.resize(firstVertex+poly.m_numVertices);
	for (int v=0;v<convexPtr->m_vertices.size();v++)
	{
		m_data->m_convexVertices[firstVertex+v] = convexPtr->m_vertices[v];
	}

	(*m_data->m_convexData)[shapeSlot] = convexPtr;

	// push the complete, updated tables to the device
	m_data->m_convexFacesGPU->copyFromHost(m_data->m_convexFaces);
	m_data->m_convexPolyhedraGPU->copyFromHost(m_data->m_convexPolyhedra);
	m_data->m_uniqueEdgesGPU->copyFromHost(m_data->m_uniqueEdges);
	m_data->m_convexVerticesGPU->copyFromHost(m_data->m_convexVertices);
	m_data->m_convexIndicesGPU->copyFromHost(m_data->m_convexIndices);

	return m_data->m_numAcceleratedShapes++;
}
/// Builds a convex hull from a strided float vertex buffer and registers it.
/// @param vertices       First vertex; each vertex is three consecutive floats.
/// @param strideInBytes  Byte distance between consecutive vertices.
/// @param numVertices    Number of vertices to read.
/// @param scaling        Three per-axis scale factors applied to every vertex.
/// @return               The collidable index of the new shape.
int b3GpuNarrowPhase::registerConvexHullShape(const float* vertices, int strideInBytes, int numVertices, const float* scaling)
{
	b3AlignedObjectArray<b3Vector3> verts;
	verts.reserve(numVertices);	// single allocation instead of repeated growth

	// Walk the buffer byte-wise so arbitrary strides work; keep the input const.
	const unsigned char* vts = reinterpret_cast<const unsigned char*>(vertices);
	for (int i=0;i<numVertices;i++)
	{
		const float* vertex = reinterpret_cast<const float*>(&vts[i*strideInBytes]);
		verts.push_back(b3Vector3(vertex[0]*scaling[0],vertex[1]*scaling[1],vertex[2]*scaling[2]));
	}

	b3ConvexUtility* utilPtr = new b3ConvexUtility();
	const bool merge = true;
	// Test the collected vertices rather than numVertices so a negative count
	// cannot reach &verts[0] on an empty array.
	if (verts.size())
	{
		utilPtr->initializePolyhedralFeatures(&verts[0],verts.size(),merge);
	}

	return registerConvexHullShape(utilPtr);
}
/// Registers a convex hull collidable from an already-built b3ConvexUtility.
/// Sets utilPtr->m_localCenter to the average of the hull vertices, stores the hull in the
/// shared shape tables and appends the hull's local AABB.
/// @return The collidable index of the new shape.
int b3GpuNarrowPhase::registerConvexHullShape(b3ConvexUtility* utilPtr)
{
	int collidableIndex = allocateCollidable();
	b3Collidable& col = getCollidableCpu(collidableIndex);
	col.m_shapeType = SHAPE_CONVEX_HULL;
	col.m_shapeIndex = -1;

	{
		const int numVerts = utilPtr->m_vertices.size();
		b3Vector3 localCenter(0,0,0);
		for (int i=0;i<numVerts;i++)
			localCenter+=utilPtr->m_vertices[i];
		// Guard the average: an empty hull would otherwise divide by zero and
		// produce a NaN centre. With no vertices the centre stays at the origin.
		if (numVerts>0)
			localCenter*= (1.f/numVerts);
		utilPtr->m_localCenter = localCenter;

		col.m_shapeIndex = registerConvexHullShape(utilPtr,col);
	}

	if (col.m_shapeIndex>=0)
	{
		b3SapAabb aabb;

		// Start from an inverted box and grow it over all vertices
		// (it stays inverted if the hull has no vertices).
		b3Vector3 myAabbMin(1e30f,1e30f,1e30f);
		b3Vector3 myAabbMax(-1e30f,-1e30f,-1e30f);
		for (int i=0;i<utilPtr->m_vertices.size();i++)
		{
			myAabbMin.setMin(utilPtr->m_vertices[i]);
			myAabbMax.setMax(utilPtr->m_vertices[i]);
		}
		aabb.m_min[0] = myAabbMin[0];
		aabb.m_min[1] = myAabbMin[1];
		aabb.m_min[2] = myAabbMin[2];
		aabb.m_minIndices[3] = 0;

		aabb.m_max[0] = myAabbMax[0];
		aabb.m_max[1] = myAabbMax[1];
		aabb.m_max[2] = myAabbMax[2];
		aabb.m_signedMaxIndices[3] = 0;

		m_data->m_localShapeAABBCPU->push_back(aabb);
		m_data->m_localShapeAABBGPU->push_back(aabb);
	}

	return collidableIndex;
}
/// Registers a compound collidable made of the given child shapes.
/// The children are appended to the shared child-shape table (and uploaded to the device);
/// the compound's local AABB is the union of the children's local AABBs after applying each
/// child's position and orientation. Returns the collidable index of the compound.
int b3GpuNarrowPhase::registerCompoundShape(b3AlignedObjectArray<b3GpuChildShape>* childShapes)
{
	const int collidableIndex = allocateCollidable();
	b3Collidable& col = getCollidableCpu(collidableIndex);
	col.m_shapeType = SHAPE_COMPOUND_OF_CONVEX_HULLS;
	col.m_shapeIndex = m_data->m_cpuChildShapes.size();

	b3Assert(col.m_shapeIndex+childShapes->size()<m_data->m_config.m_maxCompoundChildShapes);
	for (int c=0;c<childShapes->size();c++)
	{
		m_data->m_cpuChildShapes.push_back(childShapes->at(c));
	}
	// the whole table is re-uploaded on every registration
	m_data->m_gpuChildShapes->copyFromHost(m_data->m_cpuChildShapes);

	col.m_numChildShapes = childShapes->size();

	// accumulate the union of all transformed child AABBs, starting from an inverted box
	b3Vector3 unionMin(1e30f,1e30f,1e30f);
	b3Vector3 unionMax(-1e30f,-1e30f,-1e30f);

	for (int c=0;c<childShapes->size();c++)
	{
		const b3GpuChildShape& child = childShapes->at(c);
		const int childColIndex = child.m_shapeIndex;
		getCollidableCpu(childColIndex);	// lookup kept from the original; result unused

		const b3SapAabb childAabb = m_data->m_localShapeAABBCPU->at(childColIndex);
		b3Vector3 childLocalAabbMin(childAabb.m_min[0],childAabb.m_min[1],childAabb.m_min[2]);
		b3Vector3 childLocalAabbMax(childAabb.m_max[0],childAabb.m_max[1],childAabb.m_max[2]);

		b3Transform childTr;
		childTr.setIdentity();
		childTr.setOrigin(b3Vector3(child.m_childPosition[0],
									child.m_childPosition[1],
									child.m_childPosition[2]));
		childTr.setRotation(b3Quaternion(child.m_childOrientation[0],
										 child.m_childOrientation[1],
										 child.m_childOrientation[2],
										 child.m_childOrientation[3]));

		b3Vector3 transformedMin,transformedMax;
		b3Scalar margin(0.f);
		b3TransformAabb(childLocalAabbMin,childLocalAabbMax,margin,childTr,transformedMin,transformedMax);
		unionMin.setMin(transformedMin);
		unionMax.setMax(transformedMax);
	}

	b3SapAabb compoundAabb;
	for (int axis=0;axis<3;axis++)
	{
		compoundAabb.m_min[axis] = unionMin[axis];
		compoundAabb.m_max[axis] = unionMax[axis];
	}
	compoundAabb.m_minIndices[3] = 0;
	compoundAabb.m_signedMaxIndices[3] = 0;

	m_data->m_localShapeAABBCPU->push_back(compoundAabb);
	m_data->m_localShapeAABBGPU->push_back(compoundAabb);
	clFinish(m_queue);

	return collidableIndex;
}
/// Registers a concave triangle mesh collidable.
/// Stores the triangles through registerConcaveMeshShape(), appends the scaled mesh's local
/// AABB, builds a quantized BVH over the triangles and appends its nodes/subtrees to the
/// shared BVH tables (host and device).
/// @param vertices  Mesh vertices (unscaled).
/// @param indices   Triangle list, three vertex indices per triangle.
/// @param scaling1  Three per-axis scale factors.
/// @return          The collidable index of the new shape.
int b3GpuNarrowPhase::registerConcaveMesh(b3AlignedObjectArray<b3Vector3>* vertices, b3AlignedObjectArray<int>* indices,const float* scaling1)
{
	b3Vector3 scaling(scaling1[0],scaling1[1],scaling1[2]);

	int collidableIndex = allocateCollidable();
	b3Collidable& col = getCollidableCpu(collidableIndex);

	col.m_shapeType = SHAPE_CONCAVE_TRIMESH;
	col.m_shapeIndex = registerConcaveMeshShape(vertices,indices,col,scaling);
	// index the b3BvhInfo pushed further down will receive
	col.m_bvhIndex = m_data->m_bvhInfoCPU.size();

	// local AABB of the scaled vertices
	b3SapAabb aabb;
	b3Vector3 myAabbMin(1e30f,1e30f,1e30f);
	b3Vector3 myAabbMax(-1e30f,-1e30f,-1e30f);
	for (int i=0;i<vertices->size();i++)
	{
		b3Vector3 vtx(vertices->at(i)*scaling);
		myAabbMin.setMin(vtx);
		myAabbMax.setMax(vtx);
	}
	aabb.m_min[0] = myAabbMin[0];
	aabb.m_min[1] = myAabbMin[1];
	aabb.m_min[2] = myAabbMin[2];
	aabb.m_minIndices[3] = 0;

	aabb.m_max[0] = myAabbMax[0];
	aabb.m_max[1]= myAabbMax[1];
	aabb.m_max[2]= myAabbMax[2];
	aabb.m_signedMaxIndices[3]= 0;

	m_data->m_localShapeAABBCPU->push_back(aabb);
	m_data->m_localShapeAABBGPU->push_back(aabb);

	// build the BVH over the caller's (unscaled) vertex/index arrays
	b3OptimizedBvh* bvh = new b3OptimizedBvh();
	bool useQuantizedAabbCompression = true;
	b3TriangleIndexVertexArray* meshInterface=new b3TriangleIndexVertexArray();
	b3IndexedMesh mesh;
	mesh.m_numTriangles = indices->size()/3;
	mesh.m_numVertices = vertices->size();
	mesh.m_vertexBase = (const unsigned char *)&vertices->at(0).getX();
	mesh.m_vertexStride = sizeof(b3Vector3);
	mesh.m_triangleIndexStride = 3 * sizeof(int);
	mesh.m_triangleIndexBase = (const unsigned char *)&indices->at(0);
	meshInterface->addIndexedMesh(mesh);
	bvh->build(meshInterface, useQuantizedAabbCompression, (b3Vector3&)aabb.m_min, (b3Vector3&)aabb.m_max);
	// The mesh interface is only an input to build(); nothing in this function keeps it,
	// so free it here instead of leaking one per registered mesh.
	delete meshInterface;
	m_data->m_bvhData.push_back(bvh);

	const int numNodes = bvh->getQuantizedNodeArray().size();
	const int numSubTrees = bvh->getSubtreeInfoArray().size();

	// describe where this BVH's nodes/subtrees start in the shared arrays (before appending them)
	b3BvhInfo bvhInfo;
	bvhInfo.m_aabbMin = bvh->m_bvhAabbMin;
	bvhInfo.m_aabbMax = bvh->m_bvhAabbMax;
	bvhInfo.m_quantization = bvh->m_bvhQuantization;
	bvhInfo.m_numNodes = numNodes;
	bvhInfo.m_numSubTrees = numSubTrees;
	bvhInfo.m_nodeOffset = m_data->m_treeNodesCPU.size();
	bvhInfo.m_subTreeOffset = m_data->m_subTreesCPU.size();

	m_data->m_bvhInfoCPU.push_back(bvhInfo);
	m_data->m_bvhInfoGPU->copyFromHost(m_data->m_bvhInfoCPU);

	m_data->m_subTreesCPU.reserve(m_data->m_subTreesCPU.size()+numSubTrees);
	for (int i=0;i<numSubTrees;i++)
	{
		m_data->m_subTreesCPU.push_back(bvh->getSubtreeInfoArray()[i]);
	}

	m_data->m_treeNodesCPU.reserve(m_data->m_treeNodesCPU.size()+numNodes);
	for (int i=0;i<numNodes;i++)
	{
		m_data->m_treeNodesCPU.push_back(bvh->getQuantizedNodeArray()[i]);
	}

	m_data->m_treeNodesGPU->copyFromHost(m_data->m_treeNodesCPU);
	m_data->m_subTreesGPU->copyFromHost(m_data->m_subTreesCPU);

	return collidableIndex;
}
/// Stores a triangle mesh in the shared shape tables: one face (plane + 3 indices) per
/// triangle and all vertices, scaled by @p scaling1. The entry has no unique edges and a
/// null b3ConvexUtility pointer. Re-uploads all tables to the device.
/// @param vertices  Mesh vertices (unscaled).
/// @param indices   Triangle list, three vertex indices per triangle.
/// @param col       Not used by this function.
/// @param scaling1  Three per-axis scale factors.
/// @return          The accelerated shape index of the new entry.
int b3GpuNarrowPhase::registerConcaveMeshShape(b3AlignedObjectArray<b3Vector3>* vertices, b3AlignedObjectArray<int>* indices,b3Collidable& col, const float* scaling1)
{
	b3Vector3 scaling(scaling1[0],scaling1[1],scaling1[2]);

	m_data->m_convexData->resize(m_data->m_numAcceleratedShapes+1);
	m_data->m_convexPolyhedra.resize(m_data->m_numAcceleratedShapes+1);

	b3ConvexPolyhedronCL& convex = m_data->m_convexPolyhedra.at(m_data->m_convexPolyhedra.size()-1);
	convex.mC = b3Vector3(0,0,0);
	convex.mE = b3Vector3(0,0,0);
	convex.m_extents= b3Vector3(0,0,0);
	convex.m_localCenter = b3Vector3(0,0,0);
	convex.m_radius = 0.f;

	convex.m_numUniqueEdges = 0;
	int edgeOffset = m_data->m_uniqueEdges.size();
	convex.m_uniqueEdgesOffset = edgeOffset;

	int faceOffset = m_data->m_convexFaces.size();
	convex.m_faceOffset = faceOffset;
	convex.m_numFaces = indices->size()/3;
	m_data->m_convexFaces.resize(faceOffset+convex.m_numFaces);
	// reserve() takes a total capacity, so include the indices already stored; otherwise the
	// reservation has no effect for the second and later meshes.
	m_data->m_convexIndices.reserve(m_data->m_convexIndices.size()+convex.m_numFaces*3);

	for (int i=0;i<convex.m_numFaces;i++)
	{
		// plane through the (scaled) triangle: normal from the edge cross product
		b3Vector3 vert0(vertices->at(indices->at(i*3))*scaling);
		b3Vector3 vert1(vertices->at(indices->at(i*3+1))*scaling);
		b3Vector3 vert2(vertices->at(indices->at(i*3+2))*scaling);

		b3Vector3 normal = ((vert1-vert0).cross(vert2-vert0)).normalize();
		b3Scalar c = -(normal.dot(vert0));

		m_data->m_convexFaces[convex.m_faceOffset+i].m_plane[0] = normal.getX();
		m_data->m_convexFaces[convex.m_faceOffset+i].m_plane[1] = normal.getY();
		m_data->m_convexFaces[convex.m_faceOffset+i].m_plane[2] = normal.getZ();
		m_data->m_convexFaces[convex.m_faceOffset+i].m_plane[3] = c;

		int indexOffset = m_data->m_convexIndices.size();
		int numIndices = 3;
		m_data->m_convexFaces[convex.m_faceOffset+i].m_numIndices = numIndices;
		m_data->m_convexFaces[convex.m_faceOffset+i].m_indexOffset = indexOffset;
		m_data->m_convexIndices.resize(indexOffset+numIndices);
		for (int p=0;p<numIndices;p++)
		{
			int vi = indices->at(i*3+p);
			m_data->m_convexIndices[indexOffset+p] = vi;
		}
	}

	convex.m_numVertices = vertices->size();
	int vertexOffset = m_data->m_convexVertices.size();
	convex.m_vertexOffset =vertexOffset;
	m_data->m_convexVertices.resize(vertexOffset+convex.m_numVertices);
	for (int i=0;i<vertices->size();i++)
	{
		m_data->m_convexVertices[vertexOffset+i] = vertices->at(i)*scaling;
	}

	// concave meshes have no b3ConvexUtility
	(*m_data->m_convexData)[m_data->m_numAcceleratedShapes] = 0;

	m_data->m_convexFacesGPU->copyFromHost(m_data->m_convexFaces);
	m_data->m_convexPolyhedraGPU->copyFromHost(m_data->m_convexPolyhedra);
	m_data->m_uniqueEdgesGPU->copyFromHost(m_data->m_uniqueEdges);
	m_data->m_convexVerticesGPU->copyFromHost(m_data->m_convexVertices);
	m_data->m_convexIndicesGPU->copyFromHost(m_data->m_convexIndices);

	return m_data->m_numAcceleratedShapes++;
}
/// Returns the OpenCL buffer backing the device-side rigid body array.
cl_mem b3GpuNarrowPhase::getBodiesGpu()
{
	b3OpenCLArray<b3RigidBodyCL>* bodies = m_data->m_bodyBufferGPU;
	return (cl_mem)bodies->getBufferCL();
}
int b3GpuNarrowPhase::getNumBodiesGpu() const
{
return m_data->m_bodyBufferGPU->size();
}
/// Returns the OpenCL buffer backing the device-side inertia array.
cl_mem b3GpuNarrowPhase::getBodyInertiasGpu()
{
	b3OpenCLArray<b3InertiaCL>* inertias = m_data->m_inertiaBufferGPU;
	return (cl_mem)inertias->getBufferCL();
}
int b3GpuNarrowPhase::getNumBodyInertiasGpu() const
{
return m_data->m_inertiaBufferGPU->size();
}
/// Returns a mutable reference to the host-side collidable at @p collidableIndex.
b3Collidable& b3GpuNarrowPhase::getCollidableCpu(int collidableIndex)
{
	b3AlignedObjectArray<b3Collidable>& collidables = m_data->m_collidablesCPU;
	return collidables[collidableIndex];
}
/// Returns a read-only reference to the host-side collidable at @p collidableIndex.
const b3Collidable& b3GpuNarrowPhase::getCollidableCpu(int collidableIndex) const
{
	b3AlignedObjectArray<b3Collidable>& collidables = m_data->m_collidablesCPU;
	return collidables[collidableIndex];
}
/// Returns the OpenCL buffer backing the device-side collidable array.
cl_mem b3GpuNarrowPhase::getCollidablesGpu()
{
	b3OpenCLArray<b3Collidable>* collidables = m_data->m_collidablesGPU;
	return collidables->getBufferCL();
}
/// Returns the OpenCL buffer backing the device-side local-space shape AABB array.
cl_mem b3GpuNarrowPhase::getAabbBufferGpu()
{
	b3OpenCLArray<b3SapAabb>* localAabbs = m_data->m_localShapeAABBGPU;
	return localAabbs->getBufferCL();
}
int b3GpuNarrowPhase::getNumCollidablesGpu() const
{
return m_data->m_collidablesGPU->size();
}
int b3GpuNarrowPhase::getNumContactsGpu() const
{
return m_data->m_pBufContactOutGPU->size();
}
/// Returns the OpenCL buffer backing the device-side contact output array.
cl_mem b3GpuNarrowPhase::getContactsGpu()
{
	b3OpenCLArray<b3Contact4>* contacts = m_data->m_pBufContactOutGPU;
	return contacts->getBufferCL();
}
/// Copies the device-side contacts into the host array and returns a pointer to the first
/// one. The number of valid entries is getNumContactsGpu().
/// @return Pointer to the first host-side contact, or null when the host array is empty
///         after the copy.
const b3Contact4* b3GpuNarrowPhase::getContactsCPU() const
{
	m_data->m_pBufContactOutGPU->copyToHost(*m_data->m_pBufContactOutCPU);
	// Do not index element 0 of an empty array (out-of-range access).
	if (m_data->m_pBufContactOutCPU->size()==0)
		return 0;
	return &m_data->m_pBufContactOutCPU->at(0);
}
/// Runs the GPU SAT collision backend on the given broadphase pairs.
/// Wraps the raw pair and world-space AABB buffers in b3OpenCLArray views, allocates a
/// per-call triangle/convex pair buffer and forwards everything, together with the shape and
/// BVH tables, to GpuSatCollision::computeConvexConvexContactsGPUSAT.
/// @param broadphasePairs     Device buffer of overlapping pairs.
/// @param numBroadphasePairs  Number of pairs in that buffer.
/// @param aabbsWS             Device buffer of world-space AABBs.
/// @param numObjects          Number of AABBs in that buffer.
void b3GpuNarrowPhase::computeContacts(cl_mem broadphasePairs, int numBroadphasePairs, cl_mem aabbsWS, int numObjects)
{
	int contactCount = 0;

	// scratch buffer for triangle/convex pairs, sized from the configuration
	int triConvexCapacity = m_data->m_config.m_maxTriConvexPairCapacity;
	b3OpenCLArray<b3Int4> triConvexPairs(m_context,m_queue, triConvexCapacity);
	int triConvexCount=0;

	// wrap the caller's raw buffers
	b3OpenCLArray<b3Int2> pairView(m_context,m_queue);
	pairView.setFromOpenCLBuffer(broadphasePairs,numBroadphasePairs);

	b3OpenCLArray<b3YetAnotherAabb> aabbView(this->m_context,this->m_queue);
	aabbView.setFromOpenCLBuffer(aabbsWS,numObjects);

	m_data->m_gpuSatCollision->computeConvexConvexContactsGPUSAT(
		&pairView, numBroadphasePairs,
		m_data->m_bodyBufferGPU,
		m_data->m_pBufContactOutGPU,
		contactCount,
		m_data->m_config.m_maxContactCapacity,
		*m_data->m_convexPolyhedraGPU,
		*m_data->m_convexVerticesGPU,
		*m_data->m_uniqueEdgesGPU,
		*m_data->m_convexFacesGPU,
		*m_data->m_convexIndicesGPU,
		*m_data->m_collidablesGPU,
		*m_data->m_gpuChildShapes,
		aabbView,
		*m_data->m_worldVertsB1GPU,
		*m_data->m_clippingFacesOutGPU,
		*m_data->m_worldNormalsAGPU,
		*m_data->m_worldVertsA1GPU,
		*m_data->m_worldVertsB2GPU,
		m_data->m_bvhData,
		m_data->m_treeNodesGPU,
		m_data->m_subTreesGPU,
		m_data->m_bvhInfoGPU,
		numObjects,
		triConvexCapacity,
		triConvexPairs,
		triConvexCount
		);
}
/// Returns the host-side local AABB stored at position @p collidableIndex.
const b3SapAabb& b3GpuNarrowPhase::getLocalSpaceAabb(int collidableIndex) const
{
	const b3AlignedObjectArray<b3SapAabb>* localAabbs = m_data->m_localShapeAABBCPU;
	return localAabbs->at(collidableIndex);
}
int b3GpuNarrowPhase::registerRigidBody(int collidableIndex, float mass, const float* position, const float* orientation , const float* aabbMinPtr, const float* aabbMaxPtr,bool writeToGpu)
{
b3Vector3 aabbMin(aabbMinPtr[0],aabbMinPtr[1],aabbMinPtr[2]);
b3Vector3 aabbMax (aabbMaxPtr[0],aabbMaxPtr[1],aabbMaxPtr[2]);
b3Assert(m_data->m_numAcceleratedRigidBodies< (m_data->m_config.m_maxConvexBodies-1));
m_data->m_bodyBufferGPU->resize(m_data->m_numAcceleratedRigidBodies+1);
b3RigidBodyCL& body = m_data->m_bodyBufferCPU->at(m_data->m_numAcceleratedRigidBodies);
float friction = 1.f;
float restitution = 0.f;
body.m_frictionCoeff = friction;
body.m_restituitionCoeff = restitution;
body.m_angVel.setZero();
body.m_linVel.setValue(0,0,0);//.setZero();
body.m_pos.setValue(position[0],position[1],position[2]);
body.m_quat.setValue(orientation[0],orientation[1],orientation[2],orientation[3]);
body.m_collidableIdx = collidableIndex;
if (collidableIndex>=0)
{
// body.m_shapeType = m_data->m_collidablesCPU.at(collidableIndex).m_shapeType;
} else
{
// body.m_shapeType = CollisionShape::SHAPE_PLANE;
m_planeBodyIndex = m_data->m_numAcceleratedRigidBodies;
}
//body.m_shapeType = shapeType;
body.m_invMass = mass? 1.f/mass : 0.f;
if (writeToGpu)
{
m_data->m_bodyBufferGPU->copyFromHostPointer(&body,1,m_data->m_numAcceleratedRigidBodies);
}
b3InertiaCL& shapeInfo = m_data->m_inertiaBufferCPU->at(m_data->m_numAcceleratedRigidBodies);
if (mass==0.f)
{
if (m_data->m_numAcceleratedRigidBodies==0)
m_static0Index = 0;
shapeInfo.m_initInvInertia.setValue(0,0,0,0,0,0,0,0,0);
shapeInfo.m_invInertiaWorld.setValue(0,0,0,0,0,0,0,0,0);
} else
{
assert(body.m_collidableIdx>=0);
//approximate using the aabb of the shape
//Aabb aabb = (*m_data->m_shapePointers)[shapeIndex]->m_aabb;
b3Vector3 halfExtents = (aabbMax-aabbMin);//*0.5f;//fake larger inertia makes demos more stable ;-)
b3Vector3 localInertia;
float lx=2.f*halfExtents[0];
float ly=2.f*halfExtents[1];
float lz=2.f*halfExtents[2];
localInertia.setValue( (mass/12.0f) * (ly*ly + lz*lz),
(mass/12.0f) * (lx*lx + lz*lz),
(mass/12.0f) * (lx*lx + ly*ly));
b3Vector3 invLocalInertia;
invLocalInertia[0] = 1.f/localInertia[0];
invLocalInertia[1] = 1.f/localInertia[1];
invLocalInertia[2] = 1.f/localInertia[2];
invLocalInertia[3] = 0.f;
shapeInfo.m_initInvInertia.setValue(
invLocalInertia[0], 0, 0,
0, invLocalInertia[1], 0,
0, 0, invLocalInertia[2]);
b3Matrix3x3 m (body.m_quat);
shapeInfo.m_invInertiaWorld = m.scaled(invLocalInertia) * m.transpose();
}
if (writeToGpu)
m_data->m_inertiaBufferGPU->copyFromHostPointer(&shapeInfo,1,m_data->m_numAcceleratedRigidBodies);
return m_data->m_numAcceleratedRigidBodies++;
}
/// Returns the number of rigid bodies registered so far.
int b3GpuNarrowPhase::getNumRigidBodies() const
{
	const int registered = m_data->m_numAcceleratedRigidBodies;
	return registered;
}
void b3GpuNarrowPhase::writeAllBodiesToGpu()
{
m_data->m_bodyBufferGPU->resize(m_data->m_numAcceleratedRigidBodies);
m_data->m_inertiaBufferGPU->resize(m_data->m_numAcceleratedRigidBodies);
m_data->m_bodyBufferGPU->copyFromHostPointer(&m_data->m_bodyBufferCPU->at(0),m_data->m_numAcceleratedRigidBodies);
m_data->m_inertiaBufferGPU->copyFromHostPointer(&m_data->m_inertiaBufferCPU->at(0),m_data->m_numAcceleratedRigidBodies);
m_data->m_collidablesGPU->copyFromHost(m_data->m_collidablesCPU);
}

View File

@@ -0,0 +1,86 @@
#ifndef B3_GPU_NARROWPHASE_H
#define B3_GPU_NARROWPHASE_H
#include "Bullet3OpenCL/NarrowphaseCollision/b3Collidable.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h"
#include "Bullet3Common/b3AlignedObjectArray.h"
#include "Bullet3Common/b3Vector3.h"
// OpenCL narrow phase: keeps the registered collision shapes, collidables and rigid bodies
// in host arrays with device-side copies, and computes contacts for broadphase pairs.
class b3GpuNarrowPhase
{
protected:
// Opaque implementation state; the struct is defined in the implementation file.
struct b3GpuNarrowPhaseInternalData* m_data;
// NOTE(review): purpose not evident from the declarations here — confirm whether it is still used.
int m_acceleratedCompanionShapeIndex;
// Body index recorded by registerRigidBody() for a body registered with a negative
// collidable index; initialised to -1 by the constructor.
int m_planeBodyIndex;
// Set to 0 by registerRigidBody() when the very first body has zero mass;
// initialised to -1 by the constructor.
int m_static0Index;
// OpenCL handles supplied to the constructor.
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_queue;
// Internal helpers that append shape data to the shared shape tables and return the
// accelerated shape index.
int registerConvexHullShape(class b3ConvexUtility* convexPtr, b3Collidable& col);
int registerConcaveMeshShape(b3AlignedObjectArray<b3Vector3>* vertices, b3AlignedObjectArray<int>* indices, b3Collidable& col, const float* scaling);
public:
b3GpuNarrowPhase(cl_context vtx, cl_device_id dev, cl_command_queue q, const struct b3Config& config);
virtual ~b3GpuNarrowPhase(void);
// Shape registration. Each of these returns a collidable index, except registerFace(),
// which returns the index of the new face in the face table.
int registerSphereShape(float radius);
int registerPlaneShape(const b3Vector3& planeNormal, float planeConstant);
int registerCompoundShape(b3AlignedObjectArray<b3GpuChildShape>* childShapes);
int registerFace(const b3Vector3& faceNormal, float faceConstant);
int registerConcaveMesh(b3AlignedObjectArray<b3Vector3>* vertices, b3AlignedObjectArray<int>* indices,const float* scaling);
//do they need to be merged?
int registerConvexHullShape(b3ConvexUtility* utilPtr);
int registerConvexHullShape(const float* vertices, int strideInBytes, int numVertices, const float* scaling);
// Registers a rigid body using the given collidable and returns its body index;
// position/aabbMin/aabbMax are read as 3 floats, orientation as 4 floats.
int registerRigidBody(int collidableIndex, float mass, const float* position, const float* orientation, const float* aabbMin, const float* aabbMax,bool writeToGpu);
// NOTE(review): definition not seen alongside the other methods — verify it exists.
void setObjectTransform(const float* position, const float* orientation , int bodyIndex);
// Uploads all host-side bodies, inertia entries and collidables to the device.
void writeAllBodiesToGpu();
// NOTE(review): definitions of the next two not seen alongside the other methods — verify they exist.
void readbackAllBodiesToCpu();
void getObjectTransformFromCpu(float* position, float* orientation , int bodyIndex) const;
// Computes contacts for the given device-side broadphase pairs and world-space AABBs.
virtual void computeContacts(cl_mem broadphasePairs, int numBroadphasePairs, cl_mem aabbs, int numObjects);
// Accessors for the device buffers and their element counts.
cl_mem getBodiesGpu();
int getNumBodiesGpu() const;
cl_mem getBodyInertiasGpu();
int getNumBodyInertiasGpu() const;
cl_mem getCollidablesGpu();
int getNumCollidablesGpu() const;
// Copies the device contacts to the host and returns a pointer to the first one.
const struct b3Contact4* getContactsCPU() const;
cl_mem getContactsGpu();
int getNumContactsGpu() const;
// Device buffer of the local-space shape AABBs.
cl_mem getAabbBufferGpu();
// Number of rigid bodies registered so far.
int getNumRigidBodies() const;
// Appends one collidable to the host array and returns its index.
int allocateCollidable();
b3Collidable& getCollidableCpu(int collidableIndex);
const b3Collidable& getCollidableCpu(int collidableIndex) const;
// Host-side local AABB stored at the given index.
const struct b3SapAabb& getLocalSpaceAabb(int collidableIndex) const;
};
#endif //B3_GPU_NARROWPHASE_H

View File

@@ -0,0 +1,427 @@
#include "b3GpuRigidBodyPipeline.h"
#include "b3GpuRigidBodyPipelineInternalData.h"
#include "kernels/integrateKernel.h"
#include "kernels/updateAabbsKernel.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "b3GpuNarrowPhase.h"
#include "Bullet3Geometry/b3AabbUtil.h"
#include "Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h"
#include "Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
#include "Bullet3Dynamics/ConstraintSolver/b3PgsJacobiSolver.h"
#include "Bullet3Collision/BroadPhaseCollision/b3DynamicBvhBroadphase.h"
//#define TEST_OTHER_GPU_SOLVER
#define B3_RIGIDBODY_INTEGRATE_PATH "src/Bullet3OpenCL/RigidBody/kernels/integrateKernel.cl"
#define B3_RIGIDBODY_UPDATEAABB_PATH "src/Bullet3OpenCL/RigidBody/kernels/updateAabbsKernel.cl"
// File-scope switches (non-static globals with external linkage).
// When true, stepSimulation() computes overlapping pairs on the CPU with the
// b3DynamicBvhBroadphase instead of the GPU SAP broadphase.
bool useDbvt = false;
// NOTE(review): presumably selects the CPU b3PgsJacobiSolver over the GPU solver — confirm at its use site.
bool useBullet2CpuSolver = false;//false;
// Checked by stepSimulation() together with a non-zero contact count.
bool dumpContactStats = false;
#ifdef TEST_OTHER_GPU_SOLVER
#include "b3GpuJacobiSolver.h"
#endif //TEST_OTHER_GPU_SOLVER
#include "Bullet3Collision/NarrowPhaseCollision/b3RigidBodyCL.h"
#include "Bullet3Collision/NarrowPhaseCollision/b3Contact4.h"
#include "b3GpuBatchingPgsSolver.h"
#include "b3Solver.h"
#include "Bullet3Common/b3Quickprof.h"
#include "b3Config.h"
/// Creates the pipeline: allocates the internal data, the solvers and the AABB/pair device
/// buffers (sized from a default-constructed b3Config), stores the supplied narrow phase and
/// broadphase pointers, and compiles the integrate and update-AABB kernels.
b3GpuRigidBodyPipeline::b3GpuRigidBodyPipeline(cl_context ctx,cl_device_id device, cl_command_queue q,class b3GpuNarrowPhase* narrowphase, class b3GpuSapBroadphase* broadphaseSap , class b3DynamicBvhBroadphase* broadphaseDbvt)
{
	m_data = new b3GpuRigidBodyPipelineInternalData;
	m_data->m_context = ctx;
	m_data->m_device = device;
	m_data->m_queue = q;

	m_data->m_solver = new b3PgsJacobiSolver(true);

	// capacities come from the default configuration, not from the narrow phase
	b3Config defaultConfig;
	m_data->m_allAabbsGPU = new b3OpenCLArray<b3SapAabb>(ctx,q,defaultConfig.m_maxConvexBodies);
	m_data->m_overlappingPairsGPU = new b3OpenCLArray<b3BroadphasePair>(ctx,q,defaultConfig.m_maxBroadphasePairs);

#ifdef TEST_OTHER_GPU_SOLVER
	m_data->m_solver3 = new b3GpuJacobiSolver(ctx,device,q,defaultConfig.m_maxBroadphasePairs);
#endif // TEST_OTHER_GPU_SOLVER

	m_data->m_solver2 = new b3GpuBatchingPgsSolver(ctx,device,q,defaultConfig.m_maxBroadphasePairs);

	m_data->m_broadphaseDbvt = broadphaseDbvt;
	m_data->m_broadphaseSap = broadphaseSap;
	m_data->m_narrowphase = narrowphase;

	cl_int errNum=0;

	// integrate kernel; the program handle is released once the kernel has been created
	cl_program integrateProg = b3OpenCLUtils::compileCLProgramFromString(m_data->m_context,m_data->m_device,integrateKernelCL,&errNum,"",B3_RIGIDBODY_INTEGRATE_PATH);
	b3Assert(errNum==CL_SUCCESS);
	m_data->m_integrateTransformsKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device,integrateKernelCL, "integrateTransformsKernel",&errNum,integrateProg);
	b3Assert(errNum==CL_SUCCESS);
	clReleaseProgram(integrateProg);

	// update-AABB kernel, same pattern
	cl_program aabbProg = b3OpenCLUtils::compileCLProgramFromString(m_data->m_context,m_data->m_device,updateAabbsKernelCL,&errNum,"",B3_RIGIDBODY_UPDATEAABB_PATH);
	b3Assert(errNum==CL_SUCCESS);
	m_data->m_updateAabbsKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device,updateAabbsKernelCL, "initializeGpuAabbsFull",&errNum,aabbProg);
	b3Assert(errNum==CL_SUCCESS);
	clReleaseProgram(aabbProg);
}
/// Releases both OpenCL kernels and every object allocated by the constructor.
/// The narrow phase and broadphase pointers passed to the constructor are not deleted here.
b3GpuRigidBodyPipeline::~b3GpuRigidBodyPipeline()
{
	clReleaseKernel(m_data->m_integrateTransformsKernel);
	// The update-AABB kernel is created in the constructor as well and must be
	// released too, otherwise it leaks for the lifetime of the context.
	clReleaseKernel(m_data->m_updateAabbsKernel);

	delete m_data->m_solver;
	delete m_data->m_allAabbsGPU;
	delete m_data->m_overlappingPairsGPU;

#ifdef TEST_OTHER_GPU_SOLVER
	delete m_data->m_solver3;
#endif //TEST_OTHER_GPU_SOLVER

	delete m_data->m_solver2;

	delete m_data;
}
// Appends 'constraint' to the joint list. Only the pointer is stored.
void b3GpuRigidBodyPipeline::addConstraint(b3TypedConstraint* constraint)
{
	b3AlignedObjectArray<b3TypedConstraint*>& joints = m_data->m_joints;
	joints.push_back(constraint);
}
void b3GpuRigidBodyPipeline::stepSimulation(float deltaTime)
{
//update worldspace AABBs from local AABB/worldtransform
{
setupGpuAabbsFull();
}
int numPairs =0;
//compute overlapping pairs
{
if (useDbvt)
{
{
B3_PROFILE("setAabb");
m_data->m_allAabbsGPU->copyToHost(m_data->m_allAabbsCPU);
for (int i=0;i<m_data->m_allAabbsCPU.size();i++)
{
b3BroadphaseProxy* proxy = &m_data->m_broadphaseDbvt->m_proxies[i];
b3Vector3 aabbMin(m_data->m_allAabbsCPU[i].m_min[0],m_data->m_allAabbsCPU[i].m_min[1],m_data->m_allAabbsCPU[i].m_min[2]);
b3Vector3 aabbMax(m_data->m_allAabbsCPU[i].m_max[0],m_data->m_allAabbsCPU[i].m_max[1],m_data->m_allAabbsCPU[i].m_max[2]);
m_data->m_broadphaseDbvt->setAabb(proxy,aabbMin,aabbMax,0);
}
}
{
B3_PROFILE("calculateOverlappingPairs");
m_data->m_broadphaseDbvt->calculateOverlappingPairs();
}
numPairs = m_data->m_broadphaseDbvt->getOverlappingPairCache()->getNumOverlappingPairs();
} else
{
m_data->m_broadphaseSap->calculateOverlappingPairs();
numPairs = m_data->m_broadphaseSap->getNumOverlap();
}
}
//compute contact points
int numContacts = 0;
int numBodies = m_data->m_narrowphase->getNumBodiesGpu();
if (numPairs)
{
cl_mem pairs =0;
cl_mem aabbsWS =0;
if (useDbvt)
{
B3_PROFILE("m_overlappingPairsGPU->copyFromHost");
m_data->m_overlappingPairsGPU->copyFromHost(m_data->m_broadphaseDbvt->getOverlappingPairCache()->getOverlappingPairArray());
pairs = m_data->m_overlappingPairsGPU->getBufferCL();
aabbsWS = m_data->m_allAabbsGPU->getBufferCL();
} else
{
pairs = m_data->m_broadphaseSap->getOverlappingPairBuffer();
aabbsWS = m_data->m_broadphaseSap->getAabbBufferWS();
}
m_data->m_narrowphase->computeContacts(pairs,numPairs,aabbsWS,numBodies);
numContacts = m_data->m_narrowphase->getNumContactsGpu();
if (dumpContactStats && numContacts)
{
m_data->m_narrowphase->getContactsGpu();
printf("numContacts = %d\n", numContacts);
int totalPoints = 0;
const b3Contact4* contacts = m_data->m_narrowphase->getContactsCPU();
for (int i=0;i<numContacts;i++)
{
totalPoints += contacts->getNPoints();
}
printf("totalPoints=%d\n",totalPoints);
}
}
//convert contact points to contact constraints
//solve constraints
b3OpenCLArray<b3RigidBodyCL> gpuBodies(m_data->m_context,m_data->m_queue,0,true);
gpuBodies.setFromOpenCLBuffer(m_data->m_narrowphase->getBodiesGpu(),m_data->m_narrowphase->getNumBodiesGpu());
b3OpenCLArray<b3InertiaCL> gpuInertias(m_data->m_context,m_data->m_queue,0,true);
gpuInertias.setFromOpenCLBuffer(m_data->m_narrowphase->getBodyInertiasGpu(),m_data->m_narrowphase->getNumBodiesGpu());
b3OpenCLArray<b3Contact4> gpuContacts(m_data->m_context,m_data->m_queue,0,true);
gpuContacts.setFromOpenCLBuffer(m_data->m_narrowphase->getContactsGpu(),m_data->m_narrowphase->getNumContactsGpu());
if (useBullet2CpuSolver)
{
b3AlignedObjectArray<b3RigidBodyCL> hostBodies;
gpuBodies.copyToHost(hostBodies);
b3AlignedObjectArray<b3InertiaCL> hostInertias;
gpuInertias.copyToHost(hostInertias);
b3AlignedObjectArray<b3Contact4> hostContacts;
gpuContacts.copyToHost(hostContacts);
{
int numJoints = m_data->m_joints.size();
b3TypedConstraint** joints = numJoints? &m_data->m_joints[0] : 0;
b3Contact4* contacts = numContacts? &hostContacts[0]: 0;
// m_data->m_solver->solveContacts(m_data->m_narrowphase->getNumBodiesGpu(),&hostBodies[0],&hostInertias[0],numContacts,contacts,numJoints, joints);
m_data->m_solver->solveContacts(m_data->m_narrowphase->getNumBodiesGpu(),&hostBodies[0],&hostInertias[0],0,0,numJoints, joints);
}
gpuBodies.copyFromHost(hostBodies);
}
if (numContacts)
{
#ifdef TEST_OTHER_GPU_SOLVER
if (useJacobi)
{
bool useGpu = true;
if (useGpu)
{
bool forceHost = false;
if (forceHost)
{
b3AlignedObjectArray<b3RigidBodyCL> hostBodies;
b3AlignedObjectArray<b3InertiaCL> hostInertias;
b3AlignedObjectArray<b3Contact4> hostContacts;
{
B3_PROFILE("copyToHost");
gpuBodies.copyToHost(hostBodies);
gpuInertias.copyToHost(hostInertias);
gpuContacts.copyToHost(hostContacts);
}
{
b3JacobiSolverInfo solverInfo;
m_data->m_solver3->solveGroupHost(&hostBodies[0], &hostInertias[0], hostBodies.size(),&hostContacts[0],hostContacts.size(),0,0,solverInfo);
}
{
B3_PROFILE("copyFromHost");
gpuBodies.copyFromHost(hostBodies);
}
} else
{
b3JacobiSolverInfo solverInfo;
m_data->m_solver3->solveGroup(&gpuBodies, &gpuInertias, &gpuContacts,solverInfo);
}
} else
{
b3AlignedObjectArray<b3RigidBodyCL> hostBodies;
gpuBodies.copyToHost(hostBodies);
b3AlignedObjectArray<b3InertiaCL> hostInertias;
gpuInertias.copyToHost(hostInertias);
b3AlignedObjectArray<b3Contact4> hostContacts;
gpuContacts.copyToHost(hostContacts);
{
m_data->m_solver->solveContacts(m_data->m_narrowphase->getNumBodiesGpu(),&hostBodies[0],&hostInertias[0],numContacts,&hostContacts[0]);
}
gpuBodies.copyFromHost(hostBodies);
}
} else
#endif //TEST_OTHER_GPU_SOLVER
{
b3Config config;
m_data->m_solver2->solveContacts(numBodies, gpuBodies.getBufferCL(),gpuInertias.getBufferCL(),numContacts, gpuContacts.getBufferCL(),config);
//m_data->m_solver4->solveContacts(m_data->m_narrowphase->getNumBodiesGpu(), gpuBodies.getBufferCL(), gpuInertias.getBufferCL(), numContacts, gpuContacts.getBufferCL());
/*m_data->m_solver3->solveContactConstraintHost(
(b3OpenCLArray<RigidBodyBase::Body>*)&gpuBodies,
(b3OpenCLArray<RigidBodyBase::Inertia>*)&gpuInertias,
(b3OpenCLArray<Constraint4>*) &gpuContacts,
0,numContacts,256);
*/
}
}
integrate(deltaTime);
}
void b3GpuRigidBodyPipeline::integrate(float timeStep)
{
//integrate
b3LauncherCL launcher(m_data->m_queue,m_data->m_integrateTransformsKernel);
launcher.setBuffer(m_data->m_narrowphase->getBodiesGpu());
int numBodies = m_data->m_narrowphase->getNumBodiesGpu();
launcher.setConst(numBodies);
launcher.setConst(timeStep);
float angularDamp = 0.99f;
launcher.setConst(angularDamp);
b3Vector3 gravity(0.f,-9.8f,0.f);
launcher.setConst(gravity);
launcher.launch1D(numBodies);
}
void b3GpuRigidBodyPipeline::setupGpuAabbsFull()
{
cl_int ciErrNum=0;
int numBodies = m_data->m_narrowphase->getNumBodiesGpu();
if (!numBodies)
return;
//__kernel void initializeGpuAabbsFull( const int numNodes, __global Body* gBodies,__global Collidable* collidables, __global b3AABBCL* plocalShapeAABB, __global b3AABBCL* pAABB)
b3LauncherCL launcher(m_data->m_queue,m_data->m_updateAabbsKernel);
launcher.setConst(numBodies);
cl_mem bodies = m_data->m_narrowphase->getBodiesGpu();
launcher.setBuffer(bodies);
cl_mem collidables = m_data->m_narrowphase->getCollidablesGpu();
launcher.setBuffer(collidables);
cl_mem localAabbs = m_data->m_narrowphase->getAabbBufferGpu();
launcher.setBuffer(localAabbs);
cl_mem worldAabbs =0;
if (useDbvt)
{
worldAabbs = m_data->m_allAabbsGPU->getBufferCL();
} else
{
worldAabbs = m_data->m_broadphaseSap->getAabbBufferWS();
}
launcher.setBuffer(worldAabbs);
launcher.launch1D(numBodies);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
}
// Returns the narrowphase's GPU body buffer.
cl_mem b3GpuRigidBodyPipeline::getBodyBuffer()
{
	cl_mem bodyBuffer = m_data->m_narrowphase->getBodiesGpu();
	return bodyBuffer;
}
// Returns the number of bodies reported by the narrowphase for the GPU.
int b3GpuRigidBodyPipeline::getNumBodies() const
{
	const int bodyCount = m_data->m_narrowphase->getNumBodiesGpu();
	return bodyCount;
}
void b3GpuRigidBodyPipeline::writeAllInstancesToGpu()
{
m_data->m_allAabbsGPU->copyFromHost(m_data->m_allAabbsCPU);
}
// Registers one rigid body with the broadphase and the narrowphase.
//
// mass               passed to the narrowphase; on the SAP path a zero mass
//                    selects createLargeProxy instead of createProxy
// position           3 floats, read as x,y,z
// orientation        4 floats, passed to b3Quaternion in the order given
// collidableIndex    shape index; when negative no broadphase proxy is
//                    created and the AABB passed on is (0,0,0)-(0,0,0)
// userIndex          passed to the SAP broadphase proxy (SAP path only)
// writeInstanceToGpu dbvt path only: upload the host AABB array right away
//
// Returns the body index reported by the narrowphase.
int b3GpuRigidBodyPipeline::registerPhysicsInstance(float mass, const float* position, const float* orientation, int collidableIndex, int userIndex, bool writeInstanceToGpu)
{
	b3Vector3 aabbMin(0,0,0),aabbMax(0,0,0);

	// index the new body is expected to get; used for the dbvt proxy below
	int bodyIndex = m_data->m_narrowphase->getNumRigidBodies();

	if (collidableIndex>=0)
	{
		// transform the shape's local AABB into world space
		b3SapAabb localAabb = m_data->m_narrowphase->getLocalSpaceAabb(collidableIndex);
		b3Vector3 localAabbMin(localAabb.m_min[0],localAabb.m_min[1],localAabb.m_min[2]);
		b3Vector3 localAabbMax(localAabb.m_max[0],localAabb.m_max[1],localAabb.m_max[2]);

		b3Scalar margin = 0.01f;
		b3Transform t;
		t.setIdentity();
		t.setOrigin(b3Vector3(position[0],position[1],position[2]));
		t.setRotation(b3Quaternion(orientation[0],orientation[1],orientation[2],orientation[3]));
		b3TransformAabb(localAabbMin,localAabbMax, margin,t,aabbMin,aabbMax);

		if (useDbvt)
		{
			m_data->m_broadphaseDbvt->createProxy(aabbMin,aabbMax,bodyIndex,0,1,1);

			b3SapAabb aabb;
			for (int i=0;i<3;i++)
			{
				aabb.m_min[i] = aabbMin[i];
				aabb.m_max[i] = aabbMax[i];
			}
			// the body index goes into slot 3; it does not depend on the loop
			// variable, so it is written once here
			aabb.m_minIndices[3] = bodyIndex;

			m_data->m_allAabbsCPU.push_back(aabb);
			if (writeInstanceToGpu)
			{
				m_data->m_allAabbsGPU->copyFromHost(m_data->m_allAabbsCPU);
			}
		} else
		{
			if (mass)
			{
				m_data->m_broadphaseSap->createProxy(aabbMin,aabbMax,userIndex,1,1);//m_dispatcher);
			} else
			{
				m_data->m_broadphaseSap->createLargeProxy(aabbMin,aabbMax,userIndex,1,1);//m_dispatcher);
			}
		}
	}

	// NOTE(review): the narrowphase is always called with writeToGpu == false,
	// regardless of 'writeInstanceToGpu' -- confirm this is intended
	bool writeToGpu = false;
	bodyIndex = m_data->m_narrowphase->registerRigidBody(collidableIndex,mass,position,orientation,&aabbMin.getX(),&aabbMax.getX(),writeToGpu);

	/*
	if (mass>0.f)
		m_numDynamicPhysicsInstances++;

	m_numPhysicsInstances++;
	*/

	return bodyIndex;
}

View File

@@ -0,0 +1,45 @@
#ifndef B3_GPU_RIGIDBODY_PIPELINE_H
#define B3_GPU_RIGIDBODY_PIPELINE_H
#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h"
// Drives one simulation step on the GPU: AABB update, broadphase,
// narrowphase contact generation, constraint solving and integration.
// All state lives behind the m_data pointer.
class b3GpuRigidBodyPipeline
{
protected:
// Opaque implementation data, allocated in the constructor and freed in the destructor.
struct b3GpuRigidBodyPipelineInternalData* m_data;
// NOTE(review): no definition is visible alongside the other members -- confirm it is still needed.
int allocateCollidable();
public:
// The narrowphase and both broadphase pointers are stored, not owned:
// the destructor does not delete them.
b3GpuRigidBodyPipeline(cl_context ctx,cl_device_id device, cl_command_queue q , class b3GpuNarrowPhase* narrowphase, class b3GpuSapBroadphase* broadphaseSap, class b3DynamicBvhBroadphase* broadphaseDbvt);
virtual ~b3GpuRigidBodyPipeline();
// Runs the whole pipeline once and finishes by calling integrate(deltaTime).
void stepSimulation(float deltaTime);
// Launches the transform-integration kernel for all bodies.
void integrate(float timeStep);
// Launches the kernel that recomputes the world-space AABBs of all bodies.
void setupGpuAabbsFull();
// NOTE(review): no definition is visible alongside the other members -- confirm it is still needed.
int registerConvexPolyhedron(class b3ConvexUtility* convex);
//int registerConvexPolyhedron(const float* vertices, int strideInBytes, int numVertices, const float* scaling);
//int registerSphereShape(float radius);
//int registerPlaneShape(const b3Vector3& planeNormal, float planeConstant);
//int registerConcaveMesh(b3AlignedObjectArray<b3Vector3>* vertices, b3AlignedObjectArray<int>* indices, const float* scaling);
//int registerCompoundShape(b3AlignedObjectArray<b3GpuChildShape>* childShapes);
// Registers a body with the broadphase and narrowphase and returns its body index.
// position points to 3 floats and orientation to 4 floats.
int registerPhysicsInstance(float mass, const float* position, const float* orientation, int collisionShapeIndex, int userData, bool writeInstanceToGpu);
//if you passed "writeInstanceToGpu" false in the registerPhysicsInstance method (for performance) you need to call writeAllInstancesToGpu after all instances are registered
void writeAllInstancesToGpu();
// Appends a joint to the list kept by the pipeline; only the pointer is stored.
void addConstraint(class b3TypedConstraint* constraint);
// GPU buffer holding the rigid bodies (taken from the narrowphase).
cl_mem getBodyBuffer();
// Number of bodies reported by the narrowphase.
int getNumBodies() const;
};
#endif //B3_GPU_RIGIDBODY_PIPELINE_H

View File

@@ -0,0 +1,44 @@
#ifndef B3_GPU_RIGIDBODY_PIPELINE_INTERNAL_DATA_H
#define B3_GPU_RIGIDBODY_PIPELINE_INTERNAL_DATA_H
#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h"
#include "Bullet3Common/b3AlignedObjectArray.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
#include "Bullet3OpenCL/NarrowphaseCollision/b3Collidable.h"
#include "Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h"
#include "Bullet3Dynamics/ConstraintSolver/b3TypedConstraint.h"
#include "Bullet3Collision/BroadPhaseCollision/b3OverlappingPair.h"
// Private state of b3GpuRigidBodyPipeline. The struct has no constructor,
// so every member holds whatever the pipeline constructor assigns to it.
struct b3GpuRigidBodyPipelineInternalData
{
// OpenCL handles passed to the pipeline constructor.
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_queue;
// Kernels compiled by the pipeline constructor.
cl_kernel m_integrateTransformsKernel;
cl_kernel m_updateAabbsKernel;
// Host-side solver.
class b3PgsJacobiSolver* m_solver;
// GPU contact solver used by default in stepSimulation.
class b3GpuBatchingPgsSolver* m_solver2;
// Only assigned and deleted when TEST_OTHER_GPU_SOLVER is defined; otherwise left unset.
class b3GpuJacobiSolver* m_solver3;
// Broadphase objects supplied by the caller; the pipeline destructor does not delete them.
class b3GpuSapBroadphase* m_broadphaseSap;
class b3DynamicBvhBroadphase* m_broadphaseDbvt;
// World-space AABBs for the dbvt path: GPU buffer and its host-side copy.
b3OpenCLArray<b3SapAabb>* m_allAabbsGPU;
b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU;
// GPU copy of the overlapping pairs computed by the dbvt broadphase.
b3OpenCLArray<b3BroadphasePair>* m_overlappingPairsGPU;
// Joints added through addConstraint; pointers only.
b3AlignedObjectArray<b3TypedConstraint*> m_joints;
// Narrowphase supplied by the caller; the pipeline destructor does not delete it.
class b3GpuNarrowPhase* m_narrowphase;
};
#endif //B3_GPU_RIGIDBODY_PIPELINE_INTERNAL_DATA_H

View File

@@ -0,0 +1,935 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#include "b3Solver.h"
///useNewBatchingKernel is a rewritten kernel using just a single thread of the warp, for experiments
bool useNewBatchingKernel = true;
#define B3_SOLVER_SETUP_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solverSetup.cl"
#define B3_SOLVER_SETUP2_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.cl"
#define B3_SOLVER_CONTACT_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solveContact.cl"
#define B3_SOLVER_FRICTION_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solveFriction.cl"
#define B3_BATCHING_PATH "src/Bullet3OpenCL/RigidBody/kernels/batchingKernels.cl"
#define B3_BATCHING_NEW_PATH "src/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.cl"
#include "kernels/solverSetup.h"
#include "kernels/solverSetup2.h"
#include "kernels/solveContact.h"
#include "kernels/solveFriction.h"
#include "kernels/batchingKernels.h"
#include "kernels/batchingKernelsNew.h"
#include "Bullet3Common/b3Quickprof.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
#include "Bullet3Common/b3Vector3.h"
// Per-work-item debug record: 16 ints followed by 4 floats.
// It is only referenced from code guarded by DEBUG_ME, where an array of
// these is zeroed, handed to the solve kernel and read back afterwards.
// NOTE(review): the layout presumably has to match a struct on the kernel side -- confirm before changing it.
struct SolverDebugInfo
{
int m_valInt0;
int m_valInt1;
int m_valInt2;
int m_valInt3;
int m_valInt4;
int m_valInt5;
int m_valInt6;
int m_valInt7;
int m_valInt8;
int m_valInt9;
int m_valInt10;
int m_valInt11;
int m_valInt12;
int m_valInt13;
int m_valInt14;
int m_valInt15;
float m_val0;
float m_val1;
float m_val2;
float m_val3;
};
// Holder for the nested ParallelSolveData type.
// NOTE(review): no use of this class is visible nearby -- confirm it is still needed.
class SolverDeviceInl
{
public:
// Pair of GPU arrays: a constraint-count array and an offset array.
struct ParallelSolveData
{
b3OpenCLArray<unsigned int>* m_numConstraints;
b3OpenCLArray<unsigned int>* m_offsets;
};
};
// Creates the helper primitives (radix sort, prefix scan, bound search),
// allocates the GPU work buffers and compiles every kernel the solver uses.
//
// pairCapacity  rounded up to a multiple of 512 to size the sort buffer
//
// Each cl_program is released as soon as its kernels have been created; the
// kernels keep their program alive until they are released in the destructor.
b3Solver::b3Solver(cl_context ctx, cl_device_id device, cl_command_queue queue, int pairCapacity)
	:m_nIterations(4),
	m_context(ctx),
	m_device(device),
	m_queue(queue)
{
	m_sort32 = new b3RadixSort32CL(ctx,device,queue);
	m_scan = new b3PrefixScanCL(ctx,device,queue,N_SPLIT*N_SPLIT);
	m_search = new b3BoundSearchCL(ctx,device,queue,N_SPLIT*N_SPLIT);

	const int sortSize = B3NEXTMULTIPLEOF( pairCapacity, 512 );

	m_sortDataBuffer = new b3OpenCLArray<b3SortData>(ctx,queue,sortSize);
	m_contactBuffer2 = new b3OpenCLArray<b3Contact4>(ctx,queue);

	// one counter and one offset per cell of the N_SPLIT x N_SPLIT grid
	m_numConstraints = new b3OpenCLArray<unsigned int>(ctx,queue,N_SPLIT*N_SPLIT );
	m_numConstraints->resize(N_SPLIT*N_SPLIT);

	m_offsets = new b3OpenCLArray<unsigned int>( ctx,queue, N_SPLIT*N_SPLIT );
	m_offsets->resize(N_SPLIT*N_SPLIT);

	const char* additionalMacros = "";

	// initialised so that it never holds an indeterminate value
	cl_int pErrNum = 0;

	const char* batchKernelSource = batchingKernelsCL;
	const char* batchKernelNewSource = batchingKernelsNewCL;
	const char* solverSetupSource = solverSetupCL;
	const char* solverSetup2Source = solverSetup2CL;
	const char* solveContactSource = solveContactCL;
	const char* solveFrictionSource = solveFrictionCL;

	{
		cl_program solveContactProg= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solveContactSource, &pErrNum,additionalMacros, B3_SOLVER_CONTACT_KERNEL_PATH);
		b3Assert(solveContactProg);

		cl_program solveFrictionProg= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solveFrictionSource, &pErrNum,additionalMacros, B3_SOLVER_FRICTION_KERNEL_PATH);
		b3Assert(solveFrictionProg);

		cl_program solverSetup2Prog= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solverSetup2Source, &pErrNum,additionalMacros, B3_SOLVER_SETUP2_KERNEL_PATH);
		b3Assert(solverSetup2Prog);

		cl_program solverSetupProg= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solverSetupSource, &pErrNum,additionalMacros, B3_SOLVER_SETUP_KERNEL_PATH);
		b3Assert(solverSetupProg);

		m_solveFrictionKernel= b3OpenCLUtils::compileCLKernelFromString( ctx, device, solveFrictionSource, "BatchSolveKernelFriction", &pErrNum, solveFrictionProg,additionalMacros );
		b3Assert(m_solveFrictionKernel);

		m_solveContactKernel= b3OpenCLUtils::compileCLKernelFromString( ctx, device, solveContactSource, "BatchSolveKernelContact", &pErrNum, solveContactProg,additionalMacros );
		b3Assert(m_solveContactKernel);

		m_contactToConstraintKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetupSource, "ContactToConstraintKernel", &pErrNum, solverSetupProg,additionalMacros );
		b3Assert(m_contactToConstraintKernel);

		m_setSortDataKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "SetSortDataKernel", &pErrNum, solverSetup2Prog,additionalMacros );
		b3Assert(m_setSortDataKernel);

		m_reorderContactKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "ReorderContactKernel", &pErrNum, solverSetup2Prog,additionalMacros );
		b3Assert(m_reorderContactKernel);

		m_copyConstraintKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "CopyConstraintKernel", &pErrNum, solverSetup2Prog,additionalMacros );
		b3Assert(m_copyConstraintKernel);

		// all kernels of these programs exist now; drop the local references
		clReleaseProgram(solveContactProg);
		clReleaseProgram(solveFrictionProg);
		clReleaseProgram(solverSetup2Prog);
		clReleaseProgram(solverSetupProg);
	}

	{
		cl_program batchingProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, batchKernelSource, &pErrNum,additionalMacros, B3_BATCHING_PATH);
		b3Assert(batchingProg);

		m_batchingKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, batchKernelSource, "CreateBatches", &pErrNum, batchingProg,additionalMacros );
		b3Assert(m_batchingKernel);

		clReleaseProgram(batchingProg);
	}

	{
		cl_program batchingNewProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, batchKernelNewSource, &pErrNum,additionalMacros, B3_BATCHING_NEW_PATH);
		b3Assert(batchingNewProg);

		m_batchingKernelNew = b3OpenCLUtils::compileCLKernelFromString( ctx, device, batchKernelNewSource, "CreateBatchesNew", &pErrNum, batchingNewProg,additionalMacros );
		//m_batchingKernelNew = b3OpenCLUtils::compileCLKernelFromString( ctx, device, batchKernelNewSource, "CreateBatchesBruteForce", &pErrNum, batchingNewProg,additionalMacros );
		b3Assert(m_batchingKernelNew);

		clReleaseProgram(batchingNewProg);
	}
}
// Frees every buffer and helper object allocated by the constructor and
// releases all kernels.
b3Solver::~b3Solver()
{
	// these two arrays are allocated with new in the constructor as well
	delete m_offsets;
	delete m_numConstraints;

	delete m_sortDataBuffer;
	delete m_contactBuffer2;

	delete m_sort32;
	delete m_scan;
	delete m_search;

	clReleaseKernel(m_batchingKernel);
	clReleaseKernel(m_batchingKernelNew);

	clReleaseKernel( m_solveContactKernel);
	clReleaseKernel( m_solveFrictionKernel);

	clReleaseKernel( m_contactToConstraintKernel);
	clReleaseKernel( m_setSortDataKernel);
	clReleaseKernel( m_reorderContactKernel);
	clReleaseKernel( m_copyConstraintKernel);
}
/*void b3Solver::reorderConvertToConstraints( const b3OpenCLArray<b3RigidBodyCL>* bodyBuf,
const b3OpenCLArray<b3InertiaCL>* shapeBuf,
b3OpenCLArray<b3Contact4>* contactsIn, b3OpenCLArray<b3GpuConstraint4>* contactCOut, void* additionalData,
int nContacts, const b3Solver::ConstraintCfg& cfg )
{
if( m_contactBuffer )
{
m_contactBuffer->resize(nContacts);
}
if( m_contactBuffer == 0 )
{
B3_PROFILE("new m_contactBuffer;");
m_contactBuffer = new b3OpenCLArray<b3Contact4>(m_context,m_queue,nContacts );
m_contactBuffer->resize(nContacts);
}
//DeviceUtils::Config dhCfg;
//Device* deviceHost = DeviceUtils::allocate( TYPE_HOST, dhCfg );
if( cfg.m_enableParallelSolve )
{
clFinish(m_queue);
// contactsIn -> m_contactBuffer
{
B3_PROFILE("sortContacts");
sortContacts( bodyBuf, contactsIn, additionalData, nContacts, cfg );
clFinish(m_queue);
}
{
B3_PROFILE("m_copyConstraintKernel");
b3Int4 cdata; cdata.x = nContacts;
b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_contactBuffer->getBufferCL() ), b3BufferInfoCL( contactsIn->getBufferCL() ) };
// b3LauncherCL launcher( m_queue, data->m_device->getKernel( PATH, "CopyConstraintKernel", "-I ..\\..\\ -Wf,--c++", 0 ) );
b3LauncherCL launcher( m_queue, m_copyConstraintKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst( cdata );
launcher.launch1D( nContacts, 64 );
clFinish(m_queue);
}
{
B3_PROFILE("batchContacts");
b3Solver::batchContacts( contactsIn, nContacts, m_numConstraints, m_offsets, cfg.m_staticIdx );
}
}
{
B3_PROFILE("waitForCompletion (batchContacts)");
clFinish(m_queue);
}
//================
{
B3_PROFILE("convertToConstraints");
b3Solver::convertToConstraints( bodyBuf, shapeBuf, contactsIn, contactCOut, additionalData, nContacts, cfg );
}
{
B3_PROFILE("convertToConstraints waitForCompletion");
clFinish(m_queue);
}
}
*/
// Sums the four dot products (l0 . linVel0), (a0 . angVel0), (l1 . linVel1)
// and (a1 . angVel1), in that order.
static
inline
float calcRelVel(const b3Vector3& l0, const b3Vector3& l1, const b3Vector3& a0, const b3Vector3& a1,
	const b3Vector3& linVel0, const b3Vector3& angVel0, const b3Vector3& linVel1, const b3Vector3& angVel1)
{
	const float linTerm0 = b3Dot(l0, linVel0);
	const float angTerm0 = b3Dot(a0, angVel0);
	const float linTerm1 = b3Dot(l1, linVel1);
	const float angTerm1 = b3Dot(a1, angVel1);
	return linTerm0 + angTerm0 + linTerm1 + angTerm1;
}
static
inline
void setLinearAndAngular(const b3Vector3& n, const b3Vector3& r0, const b3Vector3& r1,
b3Vector3& linear, b3Vector3& angular0, b3Vector3& angular1)
{
linear = -n;
angular0 = -b3Cross(r0, n);
angular1 = b3Cross(r1, n);
}
// Runs one pass over the (up to four) contact points of 'cs' and updates the
// velocities of bodies A and B in place.
//
// For every point with a non-zero m_jacCoeffInv the impulse is computed,
// added to the accumulated value in m_appliedRambdaDt, clamped to
// [minRambdaDt, maxRambdaDt], and the resulting delta is applied.
//
// JACOBI == true : velocity changes are summed and applied once after the loop.
// JACOBI == false: velocity changes are applied immediately, so later points
//                  see the result of earlier ones.
template<bool JACOBI>
static
__inline
void solveContact(b3GpuConstraint4& cs,
	const b3Vector3& posA, b3Vector3& linVelA, b3Vector3& angVelA, float invMassA, const b3Matrix3x3& invInertiaA,
	const b3Vector3& posB, b3Vector3& linVelB, b3Vector3& angVelB, float invMassB, const b3Matrix3x3& invInertiaB,
	float maxRambdaDt[4], float minRambdaDt[4])
{
	// accumulators, only used when JACOBI is true
	b3Vector3 sumLinA; sumLinA.setZero();
	b3Vector3 sumAngA; sumAngA.setZero();
	b3Vector3 sumLinB; sumLinB.setZero();
	b3Vector3 sumAngB; sumAngB.setZero();

	for(int pt=0; pt<4; pt++)
	{
		// a zero coefficient would scale the impulse to zero, so skip the point
		if( cs.m_jacCoeffInv[pt] != 0.f )
		{
			b3Vector3 angular0, angular1, linear;
			b3Vector3 r0 = cs.m_worldPos[pt] - (b3Vector3&)posA;
			b3Vector3 r1 = cs.m_worldPos[pt] - (b3Vector3&)posB;
			setLinearAndAngular( (const b3Vector3 &)-cs.m_linear, (const b3Vector3 &)r0, (const b3Vector3 &)r1, linear, angular0, angular1 );

			float rambdaDt = calcRelVel((const b3Vector3 &)cs.m_linear,(const b3Vector3 &) -cs.m_linear, angular0, angular1,
				linVelA, angVelA, linVelB, angVelB ) + cs.m_b[pt];
			rambdaDt *= cs.m_jacCoeffInv[pt];

			{
				// clamp the accumulated impulse and keep only the delta
				const float before = cs.m_appliedRambdaDt[pt];
				float after = before;
				after += rambdaDt;
				after = b3Max( after, minRambdaDt[pt] );
				after = b3Min( after, maxRambdaDt[pt] );
				rambdaDt = after - before;
				cs.m_appliedRambdaDt[pt] = after;
			}

			b3Vector3 linImp0 = invMassA*linear*rambdaDt;
			b3Vector3 linImp1 = invMassB*(-linear)*rambdaDt;
			b3Vector3 angImp0 = (invInertiaA* angular0)*rambdaDt;
			b3Vector3 angImp1 = (invInertiaB* angular1)*rambdaDt;
#ifdef _WIN32
			b3Assert(_finite(linImp0.getX()));
			b3Assert(_finite(linImp1.getX()));
#endif
			if( JACOBI )
			{
				sumLinA += linImp0;
				sumAngA += angImp0;
				sumLinB += linImp1;
				sumAngB += angImp1;
			}
			else
			{
				linVelA += linImp0;
				angVelA += angImp0;
				linVelB += linImp1;
				angVelB += angImp1;
			}
		}
	}

	if( JACOBI )
	{
		linVelA += sumLinA;
		angVelA += sumAngA;
		linVelB += sumLinB;
		angVelB += sumAngB;
	}
}
// Applies friction for constraint 'cs' along two tangent directions built
// from the negated constraint normal, then damps the spin about the normal.
// Velocities of bodies A and B are updated in place.
//
// maxRambdaDt / minRambdaDt: clamp limits for the accumulated friction
// impulse; only entries 0 and 1 are read.
static
__inline
void solveFriction(b3GpuConstraint4& cs,
	const b3Vector3& posA, b3Vector3& linVelA, b3Vector3& angVelA, float invMassA, const b3Matrix3x3& invInertiaA,
	const b3Vector3& posB, b3Vector3& linVelB, b3Vector3& angVelB, float invMassB, const b3Matrix3x3& invInertiaB,
	float maxRambdaDt[4], float minRambdaDt[4])
{
	// nothing to do only when BOTH friction coefficients are zero
	// (the second test previously re-checked index 0)
	if( cs.m_fJacCoeffInv[0] == 0 && cs.m_fJacCoeffInv[1] == 0 ) return;
	const b3Vector3& center = (const b3Vector3&)cs.m_center;

	b3Vector3 n = -(const b3Vector3&)cs.m_linear;

	b3Vector3 tangent[2];
#if 1
	b3PlaneSpace1 (n, tangent[0],tangent[1]);
#else
	b3Vector3 r = cs.m_worldPos[0]-center;
	tangent[0] = cross3( n, r );
	tangent[1] = cross3( tangent[0], n );
	tangent[0] = normalize3( tangent[0] );
	tangent[1] = normalize3( tangent[1] );
#endif

	b3Vector3 angular0, angular1, linear;
	b3Vector3 r0 = center - posA;
	b3Vector3 r1 = center - posB;
	for(int i=0; i<2; i++)
	{
		setLinearAndAngular( tangent[i], r0, r1, linear, angular0, angular1 );
		float rambdaDt = calcRelVel(linear, -linear, angular0, angular1,
			linVelA, angVelA, linVelB, angVelB );
		rambdaDt *= cs.m_fJacCoeffInv[i];

		{
			// clamp the accumulated impulse and keep only the delta
			float prevSum = cs.m_fAppliedRambdaDt[i];
			float updated = prevSum;
			updated += rambdaDt;
			updated = b3Max( updated, minRambdaDt[i] );
			updated = b3Min( updated, maxRambdaDt[i] );
			rambdaDt = updated - prevSum;
			cs.m_fAppliedRambdaDt[i] = updated;
		}

		b3Vector3 linImp0 = invMassA*linear*rambdaDt;
		b3Vector3 linImp1 = invMassB*(-linear)*rambdaDt;
		b3Vector3 angImp0 = (invInertiaA* angular0)*rambdaDt;
		b3Vector3 angImp1 = (invInertiaB* angular1)*rambdaDt;
#ifdef _WIN32
		b3Assert(_finite(linImp0.getX()));
		b3Assert(_finite(linImp1.getX()));
#endif
		linVelA += linImp0;
		angVelA += angImp0;
		linVelB += linImp1;
		angVelB += angImp1;
	}

	{ // angular damping for point constraint
		b3Vector3 ab = ( posB - posA ).normalized();
		b3Vector3 ac = ( center - posA ).normalized();
		// damp when the contact centre lies close to the A->B direction, or
		// when either body has zero inverse mass
		if( b3Dot( ab, ac ) > 0.95f || (invMassA == 0.f || invMassB == 0.f))
		{
			// remove 10% of each body's angular velocity about the normal
			float angNA = b3Dot( n, angVelA );
			float angNB = b3Dot( n, angVelB );

			angVelA -= (angNA*0.1f)*n;
			angVelB -= (angNB*0.1f)*n;
		}
	}
}
// Host-side solve job over the constraints [start, start + nConstraints).
// run() performs either a contact pass or a friction pass over that range,
// selected by m_solveFriction (true after construction).
struct SolveTask// : public ThreadPool::Task
{
	SolveTask(b3AlignedObjectArray<b3RigidBodyCL>& bodies, b3AlignedObjectArray<b3InertiaCL>& shapes, b3AlignedObjectArray<b3GpuConstraint4>& constraints,
		int start, int nConstraints)
		: m_bodies( bodies ), m_shapes( shapes ), m_constraints( constraints ), m_start( start ), m_nConstraints( nConstraints ),
		m_solveFriction( true ){}

	unsigned short int getType(){ return 0; }

	// tIdx is not used.
	void run(int tIdx)
	{
		for(int offset=0; offset<m_nConstraints; offset++)
		{
			int i = m_start + offset;

			float frictionCoeff = m_constraints[i].getFrictionCoeff();
			int aIdx = (int)m_constraints[i].m_bodyA;
			int bIdx = (int)m_constraints[i].m_bodyB;
			b3RigidBodyCL& bodyA = m_bodies[aIdx];
			b3RigidBodyCL& bodyB = m_bodies[bIdx];

			if( m_solveFriction )
			{
				// total normal impulse applied so far over the four contact points
				float sum = 0;
				for(int j=0; j<4; j++)
				{
					sum +=m_constraints[i].m_appliedRambdaDt[j];
				}

				// the per-constraint coefficient is replaced by a fixed value
				frictionCoeff = 0.7f;

				// friction impulse is limited to +/- frictionCoeff * sum
				const float limit = frictionCoeff*sum;
				float maxRambdaDt[4] = {limit,limit,limit,limit};
				float minRambdaDt[4] = {-limit,-limit,-limit,-limit};

				solveFriction( m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass,(const b3Matrix3x3 &) m_shapes[aIdx].m_invInertiaWorld,
					(b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass,(const b3Matrix3x3 &) m_shapes[bIdx].m_invInertiaWorld,
					maxRambdaDt, minRambdaDt );
			}
			else
			{
				// contact impulses are clamped to [0, FLT_MAX]
				float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
				float minRambdaDt[4] = {0.f,0.f,0.f,0.f};

				solveContact<false>( m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass, (const b3Matrix3x3 &)m_shapes[aIdx].m_invInertiaWorld,
					(b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass, (const b3Matrix3x3 &)m_shapes[bIdx].m_invInertiaWorld,
					maxRambdaDt, minRambdaDt );
			}
		}
	}

	b3AlignedObjectArray<b3RigidBodyCL>& m_bodies;
	b3AlignedObjectArray<b3InertiaCL>& m_shapes;
	b3AlignedObjectArray<b3GpuConstraint4>& m_constraints;
	int m_start;
	int m_nConstraints;
	bool m_solveFriction;
};
// Host version of the contact solver: copies bodies, inertias and
// constraints to the host, runs m_nIterations contact passes followed by
// m_nIterations friction passes over the first n constraints, and copies all
// three arrays back. additionalData and maxNumBatches are not used.
void b3Solver::solveContactConstraintHost(  b3OpenCLArray<b3RigidBodyCL>* bodyBuf, b3OpenCLArray<b3InertiaCL>* shapeBuf,
	b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n ,int maxNumBatches)
{
	b3AlignedObjectArray<b3RigidBodyCL> hostBodies;
	bodyBuf->copyToHost(hostBodies);

	b3AlignedObjectArray<b3InertiaCL> hostShapes;
	shapeBuf->copyToHost(hostShapes);

	b3AlignedObjectArray<b3GpuConstraint4> hostConstraints;
	constraint->copyToHost(hostConstraints);

	// phase 0: contact passes, phase 1: friction passes
	for(int phase=0; phase<2; phase++)
	{
		const bool frictionPhase = (phase == 1);
		for(int iter=0; iter<m_nIterations; iter++)
		{
			SolveTask task( hostBodies, hostShapes, hostConstraints, 0, n );
			task.m_solveFriction = frictionPhase;
			task.run(0);
		}
	}

	bodyBuf->copyFromHost(hostBodies);
	shapeBuf->copyFromHost(hostShapes);
	constraint->copyFromHost(hostConstraints);
}
// GPU contact solver. For each of m_nIterations iterations it launches the
// contact kernel once per batch (N_BATCHES launches), waits for the queue,
// then does the same with the friction kernel.
//
// bodyBuf, shapeBuf  body and inertia buffers bound to the kernels
// constraint         constraint buffer bound to the kernels
// additionalData     not used
// n                  only stored in cdata.x, which is not passed to the kernels
// maxNumBatches      passed to both kernels as the first constant
void b3Solver::solveContactConstraint(  const b3OpenCLArray<b3RigidBodyCL>* bodyBuf, const b3OpenCLArray<b3InertiaCL>* shapeBuf,
	b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n ,int maxNumBatches)
{
	b3Int4 cdata = b3MakeInt4( n, 0, 0, 0 );
	{
		const int nn = N_SPLIT*N_SPLIT;

		cdata.x = 0;
		cdata.y = maxNumBatches;//250;

		// work-item count used for both the contact and the friction launches
		int numWorkItems = 64*nn/N_BATCHES;
#ifdef DEBUG_ME
		SolverDebugInfo* debugInfo = new  SolverDebugInfo[numWorkItems];
		adl::b3OpenCLArray<SolverDebugInfo> gpuDebugInfo(data->m_device,numWorkItems);
#endif

		{
			B3_PROFILE("m_batchSolveKernel iterations");
			for(int iter=0; iter<m_nIterations; iter++)
			{
				for(int ib=0; ib<N_BATCHES; ib++)
				{
#ifdef DEBUG_ME
					memset(debugInfo,0,sizeof(SolverDebugInfo)*numWorkItems);
					gpuDebugInfo.write(debugInfo,numWorkItems);
#endif

					cdata.z = ib;
					cdata.w = N_SPLIT;

					b3LauncherCL launcher( m_queue, m_solveContactKernel );
#if 1
					b3BufferInfoCL bInfo[] = {
						b3BufferInfoCL( bodyBuf->getBufferCL() ),
						b3BufferInfoCL( shapeBuf->getBufferCL() ),
						b3BufferInfoCL( constraint->getBufferCL() ),
						b3BufferInfoCL( m_numConstraints->getBufferCL() ),
						b3BufferInfoCL( m_offsets->getBufferCL() )
#ifdef DEBUG_ME
						, b3BufferInfoCL(&gpuDebugInfo)
#endif
					};

					launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
					//launcher.setConst(  cdata.x );
					launcher.setConst(  cdata.y );
					launcher.setConst(  cdata.z );
					launcher.setConst(  cdata.w );
					launcher.launch1D( numWorkItems, 64 );
#else
					// disabled: replay of kernel arguments serialized to a file
					const char* fileName = "m_batchSolveKernel.bin";
					FILE* f = fopen(fileName,"rb");
					if (f)
					{
						int sizeInBytes=0;
						if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET))
						{
							printf("error, cannot get file size\n");
							exit(0);
						}

						unsigned char* buf = (unsigned char*) malloc(sizeInBytes);
						fread(buf,sizeInBytes,1,f);
						int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes,m_context);
						int num = *(int*)&buf[serializedBytes];

						launcher.launch1D( num);

						//this clFinish is for testing on errors
						clFinish(m_queue);

						// release the temporary buffer and the file handle
						free(buf);
						fclose(f);
					}
#endif

#ifdef DEBUG_ME
					clFinish(m_queue);
					gpuDebugInfo.read(debugInfo,numWorkItems);
					clFinish(m_queue);
					for (int i=0;i<numWorkItems;i++)
					{
						// print both the work-item index and the value
						if (debugInfo[i].m_valInt2>0)
						{
							printf("debugInfo[%d].m_valInt2 = %d\n",i,debugInfo[i].m_valInt2);
						}

						if (debugInfo[i].m_valInt3>0)
						{
							printf("debugInfo[%d].m_valInt3 = %d\n",i,debugInfo[i].m_valInt3);
						}
					}
#endif //DEBUG_ME
				}
			}

			clFinish(m_queue);
		}

		cdata.x = 1;
		bool applyFriction=true;
		if (applyFriction)
		{
			B3_PROFILE("m_batchSolveKernel iterations2");
			for(int iter=0; iter<m_nIterations; iter++)
			{
				for(int ib=0; ib<N_BATCHES; ib++)
				{
					cdata.z = ib;
					cdata.w = N_SPLIT;

					b3BufferInfoCL bInfo[] = {
						b3BufferInfoCL( bodyBuf->getBufferCL() ),
						b3BufferInfoCL( shapeBuf->getBufferCL() ),
						b3BufferInfoCL( constraint->getBufferCL() ),
						b3BufferInfoCL( m_numConstraints->getBufferCL() ),
						b3BufferInfoCL( m_offsets->getBufferCL() )
#ifdef DEBUG_ME
						,b3BufferInfoCL(&gpuDebugInfo)
#endif //DEBUG_ME
					};
					b3LauncherCL launcher( m_queue, m_solveFrictionKernel );
					launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
					//launcher.setConst(  cdata.x );
					launcher.setConst(  cdata.y );
					launcher.setConst(  cdata.z );
					launcher.setConst(  cdata.w );

					// same work-item count as the contact launches
					launcher.launch1D( numWorkItems, 64 );
				}
			}

			clFinish(m_queue);
		}
#ifdef DEBUG_ME
		delete[] debugInfo;
#endif //DEBUG_ME
	}
}
// Launches ContactToConstraintKernel with one work item per contact
// (work-group size 64) to fill 'contactCOut' from 'contactsIn', then waits
// for the queue and sets the size of 'contactCOut' to nContacts.
//
// bodyBuf, shapeBuf  body and inertia buffers read by the kernel
// additionalData     not used
// cfg                supplies m_dt, m_positionDrift and m_positionConstraintCoeff
void b3Solver::convertToConstraints( const b3OpenCLArray<b3RigidBodyCL>* bodyBuf,
	const b3OpenCLArray<b3InertiaCL>* shapeBuf,
	b3OpenCLArray<b3Contact4>* contactsIn, b3OpenCLArray<b3GpuConstraint4>* contactCOut, void* additionalData,
	int nContacts, const ConstraintCfg& cfg )
{
	// kernel constants, gathered here and passed one by one below
	struct CB
	{
		int m_nContacts;
		float m_dt;
		float m_positionDrift;
		float m_positionConstraintCoeff;
	};

	{
		B3_PROFILE("m_contactToConstraintKernel");
		CB cdata;
		cdata.m_nContacts = nContacts;
		cdata.m_dt = cfg.m_dt;
		cdata.m_positionDrift = cfg.m_positionDrift;
		cdata.m_positionConstraintCoeff = cfg.m_positionConstraintCoeff;

		b3BufferInfoCL bInfo[] = { b3BufferInfoCL( contactsIn->getBufferCL() ), b3BufferInfoCL( bodyBuf->getBufferCL() ), b3BufferInfoCL( shapeBuf->getBufferCL()),
			b3BufferInfoCL( contactCOut->getBufferCL() )};
		b3LauncherCL launcher( m_queue, m_contactToConstraintKernel );
		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
		//launcher.setConst(  cdata );

		launcher.setConst(cdata.m_nContacts);
		launcher.setConst(cdata.m_dt);
		launcher.setConst(cdata.m_positionDrift);
		launcher.setConst(cdata.m_positionConstraintCoeff);

		launcher.launch1D( nContacts, 64 );
		clFinish(m_queue);
	}

	// NOTE(review): the resize happens after the kernel has written into the
	// buffer, so the caller presumably must provide at least nContacts of
	// capacity beforehand -- confirm at the call sites
	contactCOut->resize(nContacts);
}
/*
void b3Solver::sortContacts( const b3OpenCLArray<b3RigidBodyCL>* bodyBuf,
b3OpenCLArray<b3Contact4>* contactsIn, void* additionalData,
int nContacts, const b3Solver::ConstraintCfg& cfg )
{
const int sortAlignment = 512; // todo. get this out of sort
if( cfg.m_enableParallelSolve )
{
int sortSize = NEXTMULTIPLEOF( nContacts, sortAlignment );
b3OpenCLArray<unsigned int>* countsNative = m_numConstraints;//BufferUtils::map<TYPE_CL, false>( data->m_device, &countsHost );
b3OpenCLArray<unsigned int>* offsetsNative = m_offsets;//BufferUtils::map<TYPE_CL, false>( data->m_device, &offsetsHost );
{ // 2. set cell idx
struct CB
{
int m_nContacts;
int m_staticIdx;
float m_scale;
int m_nSplit;
};
b3Assert( sortSize%64 == 0 );
CB cdata;
cdata.m_nContacts = nContacts;
cdata.m_staticIdx = cfg.m_staticIdx;
cdata.m_scale = 1.f/(N_OBJ_PER_SPLIT*cfg.m_averageExtent);
cdata.m_nSplit = N_SPLIT;
b3BufferInfoCL bInfo[] = { b3BufferInfoCL( contactsIn->getBufferCL() ), b3BufferInfoCL( bodyBuf->getBufferCL() ), b3BufferInfoCL( m_sortDataBuffer->getBufferCL() ) };
b3LauncherCL launcher( m_queue, m_setSortDataKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst( cdata );
launcher.launch1D( sortSize, 64 );
}
{ // 3. sort by cell idx
int n = N_SPLIT*N_SPLIT;
int sortBit = 32;
//if( n <= 0xffff ) sortBit = 16;
//if( n <= 0xff ) sortBit = 8;
m_sort32->execute(*m_sortDataBuffer,sortSize);
}
{ // 4. find entries
m_search->execute( *m_sortDataBuffer, nContacts, *countsNative, N_SPLIT*N_SPLIT, b3BoundSearchCL::COUNT);
m_scan->execute( *countsNative, *offsetsNative, N_SPLIT*N_SPLIT );
}
{ // 5. sort constraints by cellIdx
// todo. preallocate this
// b3Assert( contactsIn->getType() == TYPE_HOST );
// b3OpenCLArray<b3Contact4>* out = BufferUtils::map<TYPE_CL, false>( data->m_device, contactsIn ); // copying contacts to this buffer
{
b3Int4 cdata; cdata.x = nContacts;
b3BufferInfoCL bInfo[] = { b3BufferInfoCL( contactsIn->getBufferCL() ), b3BufferInfoCL( m_contactBuffer->getBufferCL() ), b3BufferInfoCL( m_sortDataBuffer->getBufferCL() ) };
b3LauncherCL launcher( m_queue, m_reorderContactKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst( cdata );
launcher.launch1D( nContacts, 64 );
}
// BufferUtils::unmap<true>( out, contactsIn, nContacts );
}
}
}
*/
/// Launches the batching kernel that assigns a batch index to every contact.
///
/// The kernel is chosen by the file-level flag useNewBatchingKernel:
///  - m_batchingKernel    receives (contacts, m_contactBuffer2, nNative, offsetsNative, staticIdx)
///  - m_batchingKernelNew receives (m_contactBuffer2, nNative, offsetsNative, staticIdx)
///
/// @param contacts       input contacts; only bound when the old kernel is used
/// @param nContacts      not used by the launch; kept for interface compatibility
/// @param nNative        per-work-group contact counts
/// @param offsetsNative  per-work-group start offsets
/// @param staticIdx      passed to the kernel as its only constant
void b3Solver::batchContacts(  b3OpenCLArray<b3Contact4>* contacts, int nContacts, b3OpenCLArray<unsigned int>* nNative, b3OpenCLArray<unsigned int>* offsetsNative, int staticIdx )
{
	// N_SPLIT*N_SPLIT groups of 64 work-items each.
	const int numWorkItems = 64*N_SPLIT*N_SPLIT;

	{
		B3_PROFILE("batch generation");

		{
			B3_PROFILE("batchingKernel");

			cl_kernel k = useNewBatchingKernel ? m_batchingKernelNew : m_batchingKernel;
			b3LauncherCL launcher( m_queue, k);

			// Only the old kernel takes a separate read-only input buffer.
			if (!useNewBatchingKernel )
			{
				launcher.setBuffer( contacts->getBufferCL() );
			}
			launcher.setBuffer( m_contactBuffer2->getBufferCL() );
			launcher.setBuffer( nNative->getBufferCL());
			launcher.setBuffer( offsetsNative->getBufferCL());

			launcher.setConst(staticIdx);

			launcher.launch1D( numWorkItems, 64 );
			clFinish(m_queue);
		}
	}

	// copy buffer to buffer
	//b3Assert(m_contactBuffer->size()==nContacts);
	//contacts->copyFromOpenCLArray( *m_contactBuffer);
	//clFinish(m_queue);//needed?
}

View File

@@ -0,0 +1,124 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#ifndef __ADL_SOLVER_H
#define __ADL_SOLVER_H
#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
#include "b3GpuConstraint4.h"
#include "Bullet3Collision/NarrowPhaseCollision/b3RigidBodyCL.h"
#include "Bullet3Collision/NarrowPhaseCollision/b3Contact4.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#define B3NEXTMULTIPLEOF(num, alignment) (((num)/(alignment) + (((num)%(alignment)==0)?0:1))*(alignment))
/// Shared configuration types and compile-time constants for the GPU contact solver.
class b3SolverBase
{
	public:

		/// Parameters handed to the contact-to-constraint conversion.
		struct ConstraintCfg
		{
			/// @param dt time step stored in m_dt.
			/// Every member is initialized so a default-constructed config never holds
			/// indeterminate values.
			ConstraintCfg( float dt = 0.f )
				: m_positionDrift( 0.005f ),
				m_positionConstraintCoeff( 0.2f ),
				m_dt(dt),
				m_enableParallelSolve(true),	// NOTE(review): was uninitialized; confirm preferred default
				m_averageExtent(1.f),			// NOTE(review): was uninitialized; non-zero so it is safe as a divisor
				m_staticIdx(-1) {}

			float m_positionDrift;
			float m_positionConstraintCoeff;
			float m_dt;
			bool m_enableParallelSolve;
			float m_averageExtent;
			int m_staticIdx;
		};

		enum
		{
			N_SPLIT = 16,
			N_BATCHES = 4,
			N_OBJ_PER_SPLIT = 10,
			N_TASKS_PER_BATCH = N_SPLIT*N_SPLIT,
		};
};
// OpenCL contact solver: owns the kernel handles and helper buffers used to convert
// contacts into constraints, batch them, and solve them.
// NOTE(review): the class holds raw pointers and has a virtual destructor but declares no
// copy constructor / assignment; ownership is not visible here - confirm copying is never done.
class b3Solver : public b3SolverBase
{
public:
// OpenCL handles supplied to the constructor; m_queue is the queue kernels are launched on.
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_queue;
// Bound as buffer arguments when the friction solve kernel is launched.
b3OpenCLArray<unsigned int>* m_numConstraints;
b3OpenCLArray<unsigned int>* m_offsets;
// Number of passes over all batches in the solve loops.
int m_nIterations;
// batchContacts() picks one of these two depending on useNewBatchingKernel.
cl_kernel m_batchingKernel;
cl_kernel m_batchingKernelNew;
cl_kernel m_solveContactKernel;
cl_kernel m_solveFrictionKernel;
// Launched by convertToConstraints().
cl_kernel m_contactToConstraintKernel;
// The members below are referenced only by the commented-out sortContacts() in the .cpp.
cl_kernel m_setSortDataKernel;
cl_kernel m_reorderContactKernel;
cl_kernel m_copyConstraintKernel;
class b3RadixSort32CL* m_sort32;
class b3BoundSearchCL* m_search;
class b3PrefixScanCL* m_scan;
b3OpenCLArray<b3SortData>* m_sortDataBuffer;
// Bound as a buffer argument of the batching kernel in batchContacts().
b3OpenCLArray<b3Contact4>* m_contactBuffer2;
enum
{
DYNAMIC_CONTACT_ALLOCATION_THRESHOLD = 2000000,
};
b3Solver(cl_context ctx, cl_device_id device, cl_command_queue queue, int pairCapacity);
virtual ~b3Solver();
// Runs the contact and friction solve kernels over the given constraints.
void solveContactConstraint( const b3OpenCLArray<b3RigidBodyCL>* bodyBuf, const b3OpenCLArray<b3InertiaCL>* inertiaBuf,
b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n ,int maxNumBatches);
// Presumably a host-side counterpart of solveContactConstraint - TODO confirm against the .cpp.
void solveContactConstraintHost( b3OpenCLArray<b3RigidBodyCL>* bodyBuf, b3OpenCLArray<b3InertiaCL>* shapeBuf,
b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n ,int maxNumBatches);
// Launches m_contactToConstraintKernel over nContacts contacts and sizes contactCOut to nContacts.
void convertToConstraints( const b3OpenCLArray<b3RigidBodyCL>* bodyBuf,
const b3OpenCLArray<b3InertiaCL>* shapeBuf,
b3OpenCLArray<b3Contact4>* contactsIn, b3OpenCLArray<b3GpuConstraint4>* contactCOut, void* additionalData,
int nContacts, const ConstraintCfg& cfg );
// Launches the selected batching kernel with 64*N_SPLIT*N_SPLIT work-items; n and offsets
// are the per-work-group counts and start offsets handed to the kernel.
void batchContacts( b3OpenCLArray<b3Contact4>* contacts, int nContacts, b3OpenCLArray<unsigned int>* n, b3OpenCLArray<unsigned int>* offsets, int staticIdx );
};
#endif //__ADL_SOLVER_H

View File

@@ -0,0 +1,344 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#pragma OPENCL EXTENSION cl_amd_printf : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable
#ifdef cl_ext_atomic_counters_32
#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable
#else
#define counter32_t volatile __global int*
#endif
typedef unsigned int u32;
typedef unsigned short u16;
typedef unsigned char u8;
#define GET_GROUP_IDX get_group_id(0)
#define GET_LOCAL_IDX get_local_id(0)
#define GET_GLOBAL_IDX get_global_id(0)
#define GET_GROUP_SIZE get_local_size(0)
#define GET_NUM_GROUPS get_num_groups(0)
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)
#define AtomInc(x) atom_inc(&(x))
#define AtomInc1(x, out) out = atom_inc(&(x))
#define AppendInc(x, out) out = atomic_inc(x)
#define AtomAdd(x, value) atom_add(&(x), value)
#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )
#define AtomXhg(x, value) atom_xchg ( &(x), value )
#define SELECT_UINT4( b, a, condition ) select( b,a,condition )
#define make_float4 (float4)
#define make_float2 (float2)
#define make_uint4 (uint4)
#define make_int4 (int4)
#define make_uint2 (uint2)
#define make_int2 (int2)
#define max2 max
#define min2 min
#define WG_SIZE 64
// Contact record stored in the constraint buffers handled by CreateBatches.
// The kernel reads m_bodyA / m_bodyB, copies whole records to its output buffer and
// overwrites m_batchIdx there. The remaining fields are only copied through.
typedef struct
{
float4 m_worldPos[4];
float4 m_worldNormal;
u32 m_coeffs;
int m_batchIdx;
int m_bodyA;//sign bit set for fixed objects
int m_bodyB;
}Contact4;
// Constant-parameter layout (count, start offset, static body index, padding).
// NOTE(review): not referenced by CreateBatches, which takes gN/gStart/m_staticIdx as
// separate kernel arguments - presumably left over from an earlier interface.
typedef struct
{
int m_n;
int m_start;
int m_staticIdx;
int m_paddings[1];
} ConstBuffer;
// Entry of the local ring buffer used by CreateBatches: the two body indices of one
// constraint (m_a holds the smaller value, m_b the larger) plus m_idx, the constraint's
// index relative to the work-group's start offset.
typedef struct
{
int m_a;
int m_b;
u32 m_idx;
}Elem;
#define STACK_SIZE (WG_SIZE*10)
//#define STACK_SIZE (WG_SIZE)
#define RING_SIZE 1024
#define RING_SIZE_MASK (RING_SIZE-1)
#define CHECK_SIZE (WG_SIZE)
#define GET_RING_CAPACITY (RING_SIZE - ldsRingEnd)
#define RING_END ldsTmp
// Tests one bit of a CHECK_SIZE-word bit table.
// idx is wrapped into [0, 32*CHECK_SIZE), so different indices may map to the same bit.
// Returns the masked word: zero when the bit is clear, non-zero when it is set.
u32 readBuf(__local u32* buff, int idx)
{
	const int wrapped = idx % (32*CHECK_SIZE);
	const u32 word = buff[wrapped/32];
	return word & (1<<(wrapped%32));
}
// Sets one bit of a CHECK_SIZE-word bit table, using the same index wrapping as readBuf.
// The update is an atomic OR rather than a plain read-modify-write.
void writeBuf(__local u32* buff, int idx)
{
	const int wrapped = idx % (32*CHECK_SIZE);
	atom_or( &buff[wrapped/32], (1<<(wrapped%32)) );
}
// Atomically sets one bit of the bit table and reports whether this call was the one
// that set it: returns 1 if the bit was clear beforehand, 0 if it was already set.
u32 tryWrite(__local u32* buff, int idx)
{
	const int wrapped = idx % (32*CHECK_SIZE);
	const int bit = wrapped%32;
	const u32 before = (u32)atom_or( &buff[wrapped/32], (1<<bit) );
	return ((before >> bit)&1) == 0;
}
// Batching on the GPU.
//
// Each work-group processes the m_n constraints that start at offset m_start of gConstraints
// (both read from gN / gStart at the group index). It copies them to gConstraintsOut starting
// at the same offset and stamps m_batchIdx so that, within one batch, no two accepted
// constraints touch the same non-static body (as far as the hashed bit tables can tell).
//
// Local state:
//   ldsRingElem / ldsRingEnd  - constraints fetched from global memory, waiting to be batched
//   ldsTmp (RING_END)         - write cursor used while rejected entries are packed back
//   ldsStackIdx / ldsStackEnd - source indices accepted for the current batch
//   ldsFixedBuffer            - bit table of bodies already used by the current batch
//   ldsCheckBuffer            - bit table used to arbitrate between work-items in one pass
//   ldsGEnd                   - next source index to fetch; may overshoot m_n
//   ldsDstEnd                 - next free slot in gConstraintsOut
//
// NOTE(review): several shared variables are written by work-item 0 and read by the others
// without a barrier in between; the kernel appears to assume the WG_SIZE work-items run in
// lockstep - confirm before using it with a different work-group size.
__kernel void CreateBatches( __global const Contact4* gConstraints, __global Contact4* gConstraintsOut,
		__global const u32* gN, __global const u32* gStart, 
		int m_staticIdx )
{
	__local u32 ldsStackIdx[STACK_SIZE];
	__local u32 ldsStackEnd;
	__local Elem ldsRingElem[RING_SIZE];
	__local u32 ldsRingEnd;
	__local u32 ldsTmp;
	__local u32 ldsCheckBuffer[CHECK_SIZE];
	__local u32 ldsFixedBuffer[CHECK_SIZE];
	__local u32 ldsGEnd;
	__local u32 ldsDstEnd;

	int wgIdx = GET_GROUP_IDX;
	int lIdx = GET_LOCAL_IDX;
	
	const int m_n = gN[wgIdx];
	const int m_start = gStart[wgIdx];
	
	if( lIdx == 0 )
	{
		ldsRingEnd = 0;
		ldsGEnd = 0;
		ldsStackEnd = 0;
		ldsDstEnd = m_start;
	}
	
//	while(1)
//was 250
	// ie is the batch index written to accepted constraints; at most 50 batches are built.
	for(int ie=0; ie<50; ie++)
	{
		// New batch: no body is in use yet. Strided so the whole table is cleared
		// even if CHECK_SIZE and WG_SIZE ever differ.
		for(int i=lIdx; i<CHECK_SIZE; i+=WG_SIZE) ldsFixedBuffer[i] = 0;

		for(int giter=0; giter<4; giter++)
		{
			int ringCap = GET_RING_CAPACITY;
			
			//	1. fill ring
			if( ldsGEnd < m_n )
			{
				while( ringCap > WG_SIZE )
				{
					if( ldsGEnd >= m_n ) break;
					if( lIdx < ringCap - WG_SIZE )
					{
						int srcIdx;
						AtomInc1( ldsGEnd, srcIdx );
						if( srcIdx < m_n )
						{
							int dstIdx;
							AtomInc1( ldsRingEnd, dstIdx );
							
							// store the body pair ordered (smaller, larger)
							int a = gConstraints[m_start+srcIdx].m_bodyA;
							int b = gConstraints[m_start+srcIdx].m_bodyB;
							ldsRingElem[dstIdx].m_a = (a>b)? b:a;
							ldsRingElem[dstIdx].m_b = (a>b)? a:b;
							ldsRingElem[dstIdx].m_idx = srcIdx;
						}
					}
					ringCap = GET_RING_CAPACITY;
				}
			}

			GROUP_LDS_BARRIER;
	
			//	2. fill stack
			__local Elem* dst = ldsRingElem;
			if( lIdx == 0 ) RING_END = 0;

			int srcIdx=lIdx;
			int end = ldsRingEnd;

			{
				for(int ii=0; ii<end; ii+=WG_SIZE, srcIdx+=WG_SIZE)
				{
					Elem e;
					if(srcIdx<end) e = ldsRingElem[srcIdx];
					// work-items past the end of the ring have nothing to do in this pass
					bool done = (srcIdx<end)?false:true;

					// Reset the per-pass arbitration table (index with the loop variable,
					// not lIdx, so the clear stays correct for any CHECK_SIZE).
					for(int i=lIdx; i<CHECK_SIZE; i+=WG_SIZE) ldsCheckBuffer[i] = 0;
					
					if( !done )
					{
						int aUsed = readBuf( ldsFixedBuffer, abs(e.m_a));
						int bUsed = readBuf( ldsFixedBuffer, abs(e.m_b));

						if( aUsed==0 && bUsed==0 )
						{
							int aAvailable;
							int bAvailable;
							int ea = abs(e.m_a);
							int eb = abs(e.m_b);

							aAvailable = tryWrite( ldsCheckBuffer, ea );
							bAvailable = tryWrite( ldsCheckBuffer, eb );

							// negative index or the designated static index: never blocks a batch
							bool aStatic = (e.m_a<0) ||(ea==m_staticIdx);
							bool bStatic = (e.m_b<0) ||(eb==m_staticIdx);
							
							aAvailable = aStatic? 1: aAvailable;
							bAvailable = bStatic? 1: bAvailable;

							bool success = (aAvailable && bAvailable);
							if(success)
							{
								if (!aStatic)
									writeBuf( ldsFixedBuffer, ea );
								if (!bStatic)
									writeBuf( ldsFixedBuffer, eb );
							}
							done = success;
						}
					}

					//	put it aside
					if(srcIdx<end)
					{
						if( done )
						{
							int dstIdx; AtomInc1( ldsStackEnd, dstIdx );
							if( dstIdx < STACK_SIZE )
								ldsStackIdx[dstIdx] = e.m_idx;
							else{
								// stack is full: undo the increment and keep the element in the ring
								done = false;
								AtomAdd( ldsStackEnd, -1 );
							}
						}
						if( !done )
						{
							// pack rejected elements back at the front of the ring
							int dstIdx; AtomInc1( RING_END, dstIdx );
							dst[dstIdx] = e;
						}
					}

					//	if filled, flush
					if( ldsStackEnd == STACK_SIZE )
					{
						for(int i=lIdx; i<STACK_SIZE; i+=WG_SIZE)
						{
							int idx = m_start + ldsStackIdx[i];
							int dstIdx; AtomInc1( ldsDstEnd, dstIdx );
							gConstraintsOut[ dstIdx ] = gConstraints[ idx ];
							gConstraintsOut[ dstIdx ].m_batchIdx = ie;
						}
						if( lIdx == 0 ) ldsStackEnd = 0;

						for(int i=lIdx; i<CHECK_SIZE; i+=WG_SIZE) ldsFixedBuffer[i] = 0;
					}
				}
			}

			if( lIdx == 0 ) ldsRingEnd = RING_END;
		}

		GROUP_LDS_BARRIER;

		// write out everything accepted for batch ie
		for(int i=lIdx; i<ldsStackEnd; i+=WG_SIZE)
		{
			int idx = m_start + ldsStackIdx[i];
			int dstIdx; AtomInc1( ldsDstEnd, dstIdx );
			gConstraintsOut[ dstIdx ] = gConstraints[ idx ];
			gConstraintsOut[ dstIdx ].m_batchIdx = ie;
		}

		//	in case it couldn't consume any pair. Flush them
		//	todo. Serial batch worth while?
		if( ldsStackEnd == 0 )
		{
			for(int i=lIdx; i<ldsRingEnd; i+=WG_SIZE)
			{
				int idx = m_start + ldsRingElem[i].m_idx;
				int dstIdx; AtomInc1( ldsDstEnd, dstIdx );
				gConstraintsOut[ dstIdx ] = gConstraints[ idx ];
				gConstraintsOut[ dstIdx ].m_batchIdx = 100+i;
			}
			GROUP_LDS_BARRIER;
			if( lIdx == 0 ) ldsRingEnd = 0;
		}

		if( lIdx == 0 ) ldsStackEnd = 0;

		GROUP_LDS_BARRIER;

		//	termination
		// ldsGEnd is incremented by every fetching work-item, including those that draw an
		// index >= m_n, so it usually ends up larger than m_n; test with >= so the loop
		// actually stops once every constraint has been fetched and the ring is empty.
		if( ldsGEnd >= m_n && ldsRingEnd == 0 )
			break;
	}
}

View File

@@ -0,0 +1,348 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
// batchingKernelsCL holds OpenCL C source text (the CreateBatches kernel and its helpers),
// one quoted line per source line, for compilation at run time.
// Because the file is generated, change the kernel source it was produced from and
// regenerate instead of editing the string by hand.
static const char* batchingKernelsCL= \
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"\n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"\n"
"\n"
"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
"\n"
"#ifdef cl_ext_atomic_counters_32\n"
"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
"#else\n"
"#define counter32_t volatile __global int*\n"
"#endif\n"
"\n"
"\n"
"typedef unsigned int u32;\n"
"typedef unsigned short u16;\n"
"typedef unsigned char u8;\n"
"\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GET_NUM_GROUPS get_num_groups(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
"#define AtomInc(x) atom_inc(&(x))\n"
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
"#define AppendInc(x, out) out = atomic_inc(x)\n"
"#define AtomAdd(x, value) atom_add(&(x), value)\n"
"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
"\n"
"\n"
"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
"\n"
"#define make_float4 (float4)\n"
"#define make_float2 (float2)\n"
"#define make_uint4 (uint4)\n"
"#define make_int4 (int4)\n"
"#define make_uint2 (uint2)\n"
"#define make_int2 (int2)\n"
"\n"
"\n"
"#define max2 max\n"
"#define min2 min\n"
"\n"
"\n"
"#define WG_SIZE 64\n"
"\n"
"\n"
"\n"
"typedef struct \n"
"{\n"
" float4 m_worldPos[4];\n"
" float4 m_worldNormal;\n"
" u32 m_coeffs;\n"
" int m_batchIdx;\n"
"\n"
" int m_bodyA;//sign bit set for fixed objects\n"
" int m_bodyB;\n"
"}Contact4;\n"
"\n"
"typedef struct \n"
"{\n"
" int m_n;\n"
" int m_start;\n"
" int m_staticIdx;\n"
" int m_paddings[1];\n"
"} ConstBuffer;\n"
"\n"
"typedef struct \n"
"{\n"
" int m_a;\n"
" int m_b;\n"
" u32 m_idx;\n"
"}Elem;\n"
"\n"
"#define STACK_SIZE (WG_SIZE*10)\n"
"//#define STACK_SIZE (WG_SIZE)\n"
"#define RING_SIZE 1024\n"
"#define RING_SIZE_MASK (RING_SIZE-1)\n"
"#define CHECK_SIZE (WG_SIZE)\n"
"\n"
"\n"
"#define GET_RING_CAPACITY (RING_SIZE - ldsRingEnd)\n"
"#define RING_END ldsTmp\n"
"\n"
"u32 readBuf(__local u32* buff, int idx)\n"
"{\n"
" idx = idx % (32*CHECK_SIZE);\n"
" int bitIdx = idx%32;\n"
" int bufIdx = idx/32;\n"
" return buff[bufIdx] & (1<<bitIdx);\n"
"}\n"
"\n"
"void writeBuf(__local u32* buff, int idx)\n"
"{\n"
" idx = idx % (32*CHECK_SIZE);\n"
" int bitIdx = idx%32;\n"
" int bufIdx = idx/32;\n"
"// buff[bufIdx] |= (1<<bitIdx);\n"
" atom_or( &buff[bufIdx], (1<<bitIdx) );\n"
"}\n"
"\n"
"u32 tryWrite(__local u32* buff, int idx)\n"
"{\n"
" idx = idx % (32*CHECK_SIZE);\n"
" int bitIdx = idx%32;\n"
" int bufIdx = idx/32;\n"
" u32 ans = (u32)atom_or( &buff[bufIdx], (1<<bitIdx) );\n"
" return ((ans >> bitIdx)&1) == 0;\n"
"}\n"
"\n"
"// batching on the GPU\n"
"__kernel void CreateBatches( __global const Contact4* gConstraints, __global Contact4* gConstraintsOut,\n"
" __global const u32* gN, __global const u32* gStart, \n"
" int m_staticIdx )\n"
"{\n"
" __local u32 ldsStackIdx[STACK_SIZE];\n"
" __local u32 ldsStackEnd;\n"
" __local Elem ldsRingElem[RING_SIZE];\n"
" __local u32 ldsRingEnd;\n"
" __local u32 ldsTmp;\n"
" __local u32 ldsCheckBuffer[CHECK_SIZE];\n"
" __local u32 ldsFixedBuffer[CHECK_SIZE];\n"
" __local u32 ldsGEnd;\n"
" __local u32 ldsDstEnd;\n"
"\n"
" int wgIdx = GET_GROUP_IDX;\n"
" int lIdx = GET_LOCAL_IDX;\n"
" \n"
" const int m_n = gN[wgIdx];\n"
" const int m_start = gStart[wgIdx];\n"
" \n"
" if( lIdx == 0 )\n"
" {\n"
" ldsRingEnd = 0;\n"
" ldsGEnd = 0;\n"
" ldsStackEnd = 0;\n"
" ldsDstEnd = m_start;\n"
" }\n"
" \n"
"// while(1)\n"
"//was 250\n"
" for(int ie=0; ie<50; ie++)\n"
" {\n"
" ldsFixedBuffer[lIdx] = 0;\n"
"\n"
" for(int giter=0; giter<4; giter++)\n"
" {\n"
" int ringCap = GET_RING_CAPACITY;\n"
" \n"
" // 1. fill ring\n"
" if( ldsGEnd < m_n )\n"
" {\n"
" while( ringCap > WG_SIZE )\n"
" {\n"
" if( ldsGEnd >= m_n ) break;\n"
" if( lIdx < ringCap - WG_SIZE )\n"
" {\n"
" int srcIdx;\n"
" AtomInc1( ldsGEnd, srcIdx );\n"
" if( srcIdx < m_n )\n"
" {\n"
" int dstIdx;\n"
" AtomInc1( ldsRingEnd, dstIdx );\n"
" \n"
" int a = gConstraints[m_start+srcIdx].m_bodyA;\n"
" int b = gConstraints[m_start+srcIdx].m_bodyB;\n"
" ldsRingElem[dstIdx].m_a = (a>b)? b:a;\n"
" ldsRingElem[dstIdx].m_b = (a>b)? a:b;\n"
" ldsRingElem[dstIdx].m_idx = srcIdx;\n"
" }\n"
" }\n"
" ringCap = GET_RING_CAPACITY;\n"
" }\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
" \n"
" // 2. fill stack\n"
" __local Elem* dst = ldsRingElem;\n"
" if( lIdx == 0 ) RING_END = 0;\n"
"\n"
" int srcIdx=lIdx;\n"
" int end = ldsRingEnd;\n"
"\n"
" {\n"
" for(int ii=0; ii<end; ii+=WG_SIZE, srcIdx+=WG_SIZE)\n"
" {\n"
" Elem e;\n"
" if(srcIdx<end) e = ldsRingElem[srcIdx];\n"
" bool done = (srcIdx<end)?false:true;\n"
"\n"
" for(int i=lIdx; i<CHECK_SIZE; i+=WG_SIZE) ldsCheckBuffer[lIdx] = 0;\n"
" \n"
" if( !done )\n"
" {\n"
" int aUsed = readBuf( ldsFixedBuffer, abs(e.m_a));\n"
" int bUsed = readBuf( ldsFixedBuffer, abs(e.m_b));\n"
"\n"
" if( aUsed==0 && bUsed==0 )\n"
" {\n"
" int aAvailable;\n"
" int bAvailable;\n"
" int ea = abs(e.m_a);\n"
" int eb = abs(e.m_b);\n"
"\n"
" aAvailable = tryWrite( ldsCheckBuffer, ea );\n"
" bAvailable = tryWrite( ldsCheckBuffer, eb );\n"
"\n"
" bool aStatic = (e.m_a<0) ||(ea==m_staticIdx);\n"
" bool bStatic = (e.m_b<0) ||(eb==m_staticIdx);\n"
" \n"
" aAvailable = aStatic? 1: aAvailable;\n"
" bAvailable = bStatic? 1: bAvailable;\n"
"\n"
" bool success = (aAvailable && bAvailable);\n"
" if(success)\n"
" {\n"
" \n"
" if (!aStatic)\n"
" writeBuf( ldsFixedBuffer, ea );\n"
" if (!bStatic)\n"
" writeBuf( ldsFixedBuffer, eb );\n"
" }\n"
" done = success;\n"
" }\n"
" }\n"
"\n"
" // put it aside\n"
" if(srcIdx<end)\n"
" {\n"
" if( done )\n"
" {\n"
" int dstIdx; AtomInc1( ldsStackEnd, dstIdx );\n"
" if( dstIdx < STACK_SIZE )\n"
" ldsStackIdx[dstIdx] = e.m_idx;\n"
" else{\n"
" done = false;\n"
" AtomAdd( ldsStackEnd, -1 );\n"
" }\n"
" }\n"
" if( !done )\n"
" {\n"
" int dstIdx; AtomInc1( RING_END, dstIdx );\n"
" dst[dstIdx] = e;\n"
" }\n"
" }\n"
"\n"
" // if filled, flush\n"
" if( ldsStackEnd == STACK_SIZE )\n"
" {\n"
" for(int i=lIdx; i<STACK_SIZE; i+=WG_SIZE)\n"
" {\n"
" int idx = m_start + ldsStackIdx[i];\n"
" int dstIdx; AtomInc1( ldsDstEnd, dstIdx );\n"
" gConstraintsOut[ dstIdx ] = gConstraints[ idx ];\n"
" gConstraintsOut[ dstIdx ].m_batchIdx = ie;\n"
" }\n"
" if( lIdx == 0 ) ldsStackEnd = 0;\n"
"\n"
" //for(int i=lIdx; i<CHECK_SIZE; i+=WG_SIZE) \n"
" ldsFixedBuffer[lIdx] = 0;\n"
" }\n"
" }\n"
" }\n"
"\n"
" if( lIdx == 0 ) ldsRingEnd = RING_END;\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" for(int i=lIdx; i<ldsStackEnd; i+=WG_SIZE)\n"
" {\n"
" int idx = m_start + ldsStackIdx[i];\n"
" int dstIdx; AtomInc1( ldsDstEnd, dstIdx );\n"
" gConstraintsOut[ dstIdx ] = gConstraints[ idx ];\n"
" gConstraintsOut[ dstIdx ].m_batchIdx = ie;\n"
" }\n"
"\n"
" // in case it couldn't consume any pair. Flush them\n"
" // todo. Serial batch worth while?\n"
" if( ldsStackEnd == 0 )\n"
" {\n"
" for(int i=lIdx; i<ldsRingEnd; i+=WG_SIZE)\n"
" {\n"
" int idx = m_start + ldsRingElem[i].m_idx;\n"
" int dstIdx; AtomInc1( ldsDstEnd, dstIdx );\n"
" gConstraintsOut[ dstIdx ] = gConstraints[ idx ];\n"
" gConstraintsOut[ dstIdx ].m_batchIdx = 100+i;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" if( lIdx == 0 ) ldsRingEnd = 0;\n"
" }\n"
"\n"
" if( lIdx == 0 ) ldsStackEnd = 0;\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" // termination\n"
" if( ldsGEnd == m_n && ldsRingEnd == 0 )\n"
" break;\n"
" }\n"
"\n"
"\n"
"}\n"
"\n"
"\n"
"\n"
"\n"
"\n"
"\n"
"\n"
"\n"
"\n"
"\n"
"\n"
"\n"
"\n"
"\n"
"\n"
"\n"
"\n"
"\n"
"\n"
"\n"
"\n"
"\n"
"\n"
;

View File

@@ -0,0 +1,236 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Erwin Coumans
#pragma OPENCL EXTENSION cl_amd_printf : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable
#ifdef cl_ext_atomic_counters_32
#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable
#else
#define counter32_t volatile __global int*
#endif
#define SIMD_WIDTH 64
typedef unsigned int u32;
typedef unsigned short u16;
typedef unsigned char u8;
#define GET_GROUP_IDX get_group_id(0)
#define GET_LOCAL_IDX get_local_id(0)
#define GET_GLOBAL_IDX get_global_id(0)
#define GET_GROUP_SIZE get_local_size(0)
#define GET_NUM_GROUPS get_num_groups(0)
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)
#define AtomInc(x) atom_inc(&(x))
#define AtomInc1(x, out) out = atom_inc(&(x))
#define AppendInc(x, out) out = atomic_inc(x)
#define AtomAdd(x, value) atom_add(&(x), value)
#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )
#define AtomXhg(x, value) atom_xchg ( &(x), value )
#define SELECT_UINT4( b, a, condition ) select( b,a,condition )
#define make_float4 (float4)
#define make_float2 (float2)
#define make_uint4 (uint4)
#define make_int4 (int4)
#define make_uint2 (uint2)
#define make_int2 (int2)
#define max2 max
#define min2 min
#define WG_SIZE 64
// Contact record stored in the gConstraints buffer of the kernels below.
// CreateBatchesBruteForce and CreateBatchesNew write m_batchIdx; CreateBatchesNew also reads
// the two body fields and swaps whole records in place.
typedef struct
{
float4 m_worldPos[4];
float4 m_worldNormal;
u32 m_coeffs;
int m_batchIdx;
int m_bodyAPtrAndSignBit;//sign bit set for fixed objects
int m_bodyBPtrAndSignBit;
}Contact4;
// Constant-parameter layout (count, start offset, static body index, padding).
// NOTE(review): not referenced by the kernels below, which take these values as separate
// kernel arguments.
typedef struct
{
int m_n;
int m_start;
int m_staticIdx;
int m_paddings[1];
} ConstBuffer;
// Body-pair plus index triple.
// NOTE(review): not referenced by the kernels below.
typedef struct
{
int m_a;
int m_b;
u32 m_idx;
}Elem;
// Trivial batching: within each work-group's range of constraints (count from gN, start
// offset from gStart, both indexed by group id) constraint k gets batch index k, so every
// constraint ends up in a batch of its own. Only work-item 0 of the group does any work.
// m_staticIdx is accepted but not used.
__kernel void CreateBatchesBruteForce( __global Contact4* gConstraints, __global const u32* gN, __global const u32* gStart, int m_staticIdx )
{
	const int groupId = GET_GROUP_IDX;
	const int localId = GET_LOCAL_IDX;

	const int count = gN[groupId];
	const int first = gStart[groupId];

	if( localId != 0 )
		return;

	for (int k=0; k<count; k++)
	{
		gConstraints[ first+k ].m_batchIdx = k;
	}
}
#define CHECK_SIZE (WG_SIZE)
// Tests one bit of a CHECK_SIZE-word bit table.
// idx is wrapped into [0, 32*CHECK_SIZE), so different indices may map to the same bit.
// Returns the masked word: zero when the bit is clear, non-zero when it is set.
u32 readBuf(__local u32* buff, int idx)
{
	const int wrapped = idx % (32*CHECK_SIZE);
	const u32 word = buff[wrapped/32];
	return word & (1<<(wrapped%32));
}
// Sets one bit of a CHECK_SIZE-word bit table, using the same index wrapping as readBuf.
// This is a plain (non-atomic) read-modify-write; in this file it is only called from
// code executed by work-item 0.
void writeBuf(__local u32* buff, int idx)
{
	const int wrapped = idx % (32*CHECK_SIZE);
	buff[wrapped/32] |= (1<<(wrapped%32));
}
// Atomically sets one bit of the bit table and reports whether this call was the one
// that set it: returns 1 if the bit was clear beforehand, 0 if it was already set.
u32 tryWrite(__local u32* buff, int idx)
{
	const int wrapped = idx % (32*CHECK_SIZE);
	const int bit = wrapped%32;
	const u32 before = (u32)atom_or( &buff[wrapped/32], (1<<bit) );
	return ((before >> bit)&1) == 0;
}
// Serial in-place batching, executed by work-item 0 of each work-group.
//
// The group's constraints (count from gN, start offset from gStart, indexed by group id)
// are repeatedly scanned. A constraint is accepted into the current batch when neither of
// its non-static bodies is flagged in the bit table; accepted constraints get m_batchIdx
// and are swapped to the front of the not-yet-placed range. A body counts as static when
// its stored index is negative or equals staticIdx. After every SIMD_WIDTH accepted
// constraints the bit table is cleared without advancing the batch index.
__kernel void CreateBatchesNew( __global Contact4* gConstraints, __global const u32* gN, __global const u32* gStart, int staticIdx )
{
	const int groupId = GET_GROUP_IDX;
	const int localId = GET_LOCAL_IDX;

	const int numConstraints = gN[groupId];
	const int m_start = gStart[groupId];

	__local u32 ldsFixedBuffer[CHECK_SIZE];

	if( localId != 0 )
		return;

	__global Contact4* group = gConstraints + m_start;

	int numPlaced = 0;	// constraints [0, numPlaced) already carry their final batch index
	int batchIdx = 0;

	while( numPlaced < numConstraints )
	{
		int chunkCount = 0;

		//	clear flag
		for(int w=0; w<CHECK_SIZE; w++)
			ldsFixedBuffer[w] = 0;

		for(int c=numPlaced; c<numConstraints; c++)
		{
			const int signedA = group[c].m_bodyAPtrAndSignBit;
			const int signedB = group[c].m_bodyBPtrAndSignBit;
			const int indexA = abs(signedA);
			const int indexB = abs(signedB);

			const bool fixedA = (signedA<0) || signedA==staticIdx;
			const bool fixedB = (signedB<0) || signedB==staticIdx;

			// static bodies never block a constraint
			const int busyA = fixedA ? 0 : readBuf( ldsFixedBuffer, indexA);
			const int busyB = fixedB ? 0 : readBuf( ldsFixedBuffer, indexB);

			if( busyA!=0 || busyB!=0 )
				continue;

			if (!fixedA)
				writeBuf( ldsFixedBuffer, indexA );
			if (!fixedB)
				writeBuf( ldsFixedBuffer, indexB );

			group[c].m_batchIdx = batchIdx;

			// move the accepted constraint to the front of the unplaced range
			if (c!=numPlaced)
			{
				Contact4 held = group[c];
				group[c] = group[numPlaced];
				group[numPlaced] = held;
			}

			numPlaced++;
			chunkCount++;

			if( chunkCount == SIMD_WIDTH)
			{
				chunkCount = 0;
				for(int w=0; w<CHECK_SIZE; w++)
					ldsFixedBuffer[w] = 0;
			}
		}
		batchIdx ++;
	}
	//return batchIdx;
}

View File

@@ -0,0 +1,240 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
static const char* batchingKernelsNewCL= \
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"\n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Erwin Coumans\n"
"\n"
"\n"
"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
"\n"
"#ifdef cl_ext_atomic_counters_32\n"
"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
"#else\n"
"#define counter32_t volatile __global int*\n"
"#endif\n"
"\n"
"#define SIMD_WIDTH 64\n"
"\n"
"typedef unsigned int u32;\n"
"typedef unsigned short u16;\n"
"typedef unsigned char u8;\n"
"\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GET_NUM_GROUPS get_num_groups(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
"#define AtomInc(x) atom_inc(&(x))\n"
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
"#define AppendInc(x, out) out = atomic_inc(x)\n"
"#define AtomAdd(x, value) atom_add(&(x), value)\n"
"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
"\n"
"\n"
"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
"\n"
"#define make_float4 (float4)\n"
"#define make_float2 (float2)\n"
"#define make_uint4 (uint4)\n"
"#define make_int4 (int4)\n"
"#define make_uint2 (uint2)\n"
"#define make_int2 (int2)\n"
"\n"
"\n"
"#define max2 max\n"
"#define min2 min\n"
"\n"
"\n"
"#define WG_SIZE 64\n"
"\n"
"\n"
"\n"
"typedef struct \n"
"{\n"
" float4 m_worldPos[4];\n"
" float4 m_worldNormal;\n"
" u32 m_coeffs;\n"
" int m_batchIdx;\n"
"\n"
" int m_bodyAPtrAndSignBit;//sign bit set for fixed objects\n"
" int m_bodyBPtrAndSignBit;\n"
"}Contact4;\n"
"\n"
"typedef struct \n"
"{\n"
" int m_n;\n"
" int m_start;\n"
" int m_staticIdx;\n"
" int m_paddings[1];\n"
"} ConstBuffer;\n"
"\n"
"typedef struct \n"
"{\n"
" int m_a;\n"
" int m_b;\n"
" u32 m_idx;\n"
"}Elem;\n"
"\n"
"\n"
"\n"
"\n"
"\n"
"// batching on the GPU\n"
"__kernel void CreateBatchesBruteForce( __global Contact4* gConstraints, __global const u32* gN, __global const u32* gStart, int m_staticIdx )\n"
"{\n"
" int wgIdx = GET_GROUP_IDX;\n"
" int lIdx = GET_LOCAL_IDX;\n"
" \n"
" const int m_n = gN[wgIdx];\n"
" const int m_start = gStart[wgIdx];\n"
" \n"
" if( lIdx == 0 )\n"
" {\n"
" for (int i=0;i<m_n;i++)\n"
" {\n"
" int srcIdx = i+m_start;\n"
" int batchIndex = i;\n"
" gConstraints[ srcIdx ].m_batchIdx = batchIndex; \n"
" }\n"
" }\n"
"}\n"
"\n"
"\n"
"#define CHECK_SIZE (WG_SIZE)\n"
"\n"
"\n"
"\n"
"\n"
"u32 readBuf(__local u32* buff, int idx)\n"
"{\n"
" idx = idx % (32*CHECK_SIZE);\n"
" int bitIdx = idx%32;\n"
" int bufIdx = idx/32;\n"
" return buff[bufIdx] & (1<<bitIdx);\n"
"}\n"
"\n"
"void writeBuf(__local u32* buff, int idx)\n"
"{\n"
" idx = idx % (32*CHECK_SIZE);\n"
" int bitIdx = idx%32;\n"
" int bufIdx = idx/32;\n"
" buff[bufIdx] |= (1<<bitIdx);\n"
" //atom_or( &buff[bufIdx], (1<<bitIdx) );\n"
"}\n"
"\n"
"u32 tryWrite(__local u32* buff, int idx)\n"
"{\n"
" idx = idx % (32*CHECK_SIZE);\n"
" int bitIdx = idx%32;\n"
" int bufIdx = idx/32;\n"
" u32 ans = (u32)atom_or( &buff[bufIdx], (1<<bitIdx) );\n"
" return ((ans >> bitIdx)&1) == 0;\n"
"}\n"
"\n"
"\n"
"// batching on the GPU\n"
"__kernel void CreateBatchesNew( __global Contact4* gConstraints, __global const u32* gN, __global const u32* gStart, int staticIdx )\n"
"{\n"
" int wgIdx = GET_GROUP_IDX;\n"
" int lIdx = GET_LOCAL_IDX;\n"
" const int numConstraints = gN[wgIdx];\n"
" const int m_start = gStart[wgIdx];\n"
" \n"
" \n"
" __local u32 ldsFixedBuffer[CHECK_SIZE];\n"
" \n"
" \n"
" \n"
" \n"
" \n"
" if( lIdx == 0 )\n"
" {\n"
" \n"
" \n"
" __global Contact4* cs = &gConstraints[m_start]; \n"
" \n"
" \n"
" int numValidConstraints = 0;\n"
" int batchIdx = 0;\n"
"\n"
" while( numValidConstraints < numConstraints)\n"
" {\n"
" int nCurrentBatch = 0;\n"
" // clear flag\n"
" \n"
" for(int i=0; i<CHECK_SIZE; i++) \n"
" ldsFixedBuffer[i] = 0; \n"
"\n"
" for(int i=numValidConstraints; i<numConstraints; i++)\n"
" {\n"
"\n"
" int bodyAS = cs[i].m_bodyAPtrAndSignBit;\n"
" int bodyBS = cs[i].m_bodyBPtrAndSignBit;\n"
" int bodyA = abs(bodyAS);\n"
" int bodyB = abs(bodyBS);\n"
" bool aIsStatic = (bodyAS<0) || bodyAS==staticIdx;\n"
" bool bIsStatic = (bodyBS<0) || bodyBS==staticIdx;\n"
" int aUnavailable = aIsStatic ? 0 : readBuf( ldsFixedBuffer, bodyA);\n"
" int bUnavailable = bIsStatic ? 0 : readBuf( ldsFixedBuffer, bodyB);\n"
" \n"
" if( aUnavailable==0 && bUnavailable==0 ) // ok\n"
" {\n"
" if (!aIsStatic)\n"
" {\n"
" writeBuf( ldsFixedBuffer, bodyA );\n"
" }\n"
" if (!bIsStatic)\n"
" {\n"
" writeBuf( ldsFixedBuffer, bodyB );\n"
" }\n"
"\n"
" cs[i].m_batchIdx = batchIdx;\n"
"\n"
" if (i!=numValidConstraints)\n"
" {\n"
" //b3Swap(cs[i],cs[numValidConstraints]);\n"
" \n"
" Contact4 tmp = cs[i];\n"
" cs[i] = cs[numValidConstraints];\n"
" cs[numValidConstraints] = tmp;\n"
" \n"
" }\n"
"\n"
" numValidConstraints++;\n"
" \n"
" nCurrentBatch++;\n"
" if( nCurrentBatch == SIMD_WIDTH)\n"
" {\n"
" nCurrentBatch = 0;\n"
" for(int i=0; i<CHECK_SIZE; i++) \n"
" ldsFixedBuffer[i] = 0;\n"
" \n"
" }\n"
" }\n"
" }//for\n"
" batchIdx ++;\n"
" }//while\n"
" }//if( lIdx == 0 )\n"
" \n"
" //return batchIdx;\n"
"}\n"
"\n"
;

View File

@@ -0,0 +1,92 @@
// Quaternion product q1 * q2 for quaternions stored as (x, y, z, w) with w the scalar part.
float4 quatMult(float4 q1, float4 q2)
{
    const float x = q1.w * q2.x + q1.x * q2.w + q1.y * q2.z - q1.z * q2.y;
    const float y = q1.w * q2.y + q1.y * q2.w + q1.z * q2.x - q1.x * q2.z;
    const float z = q1.w * q2.z + q1.z * q2.w + q1.x * q2.y - q1.y * q2.x;
    const float w = q1.w * q2.w - q1.x * q2.x - q1.y * q2.y - q1.z * q2.z;
    return (float4)(x, y, z, w);
}
// Scales q to unit length (length taken over all four components, using native_sqrt).
// When the length is not greater than zero the identity quaternion (0, 0, 0, 1) is returned instead.
float4 quatNorm(float4 q)
{
    const float len = native_sqrt(dot(q, q));

    // degenerate input: fall back to "no rotation"
    if( !(len > 0.f) )
    {
        return (float4)(0.f, 0.f, 0.f, 1.f);
    }

    q *= 1.f / len;
    return q;
}
// Per-body state record processed by integrateTransformsKernel.
// NOTE(review): the layout presumably has to match the host-side body struct - confirm before reordering fields.
typedef struct
{
// position; advanced by m_linVel * timeStep
float4 m_pos;
// orientation quaternion, (x, y, z, w) with w the scalar part as used by quatMult
float4 m_quat;
// linear velocity; gravityAcceleration * timeStep is added each step
float4 m_linVel;
// angular velocity; the kernel damps x, y and z only
float4 m_angVel;
// not read by integrateTransformsKernel
unsigned int m_collidableIdx;
// bodies with m_invMass == 0 are skipped by integrateTransformsKernel
float m_invMass;
// not read by integrateTransformsKernel
float m_restituitionCoeff;
// not read by integrateTransformsKernel
float m_frictionCoeff;
} Body;
// Advances one body per work-item by a single time step.
//   bodies              body array, updated in place
//   numNodes            number of entries in bodies; work-items past the end do nothing
//   timeStep            step length
//   angularDamping      factor applied to the x, y, z angular velocity before integrating
//   gravityAcceleration added to the linear velocity (times timeStep) after the position update
// Bodies whose m_invMass is 0 are left untouched.
__kernel void
integrateTransformsKernel( __global Body* bodies,const int numNodes, float timeStep, float angularDamping, float4 gravityAcceleration)
{
    const int nodeID = get_global_id(0);
    // largest rotation angle allowed in one step
    const float maxAngularStep = (0.25f * 3.14159254f);

    if( nodeID >= numNodes )
        return;

    __global Body* body = &bodies[nodeID];
    if( body->m_invMass == 0.f )
        return;

    // ---- angular velocity ----

    // add some hardcoded angular damping
    body->m_angVel.x *= angularDamping;
    body->m_angVel.y *= angularDamping;
    body->m_angVel.z *= angularDamping;

    const float4 angvel = body->m_angVel;
    float fAngle = native_sqrt(dot(angvel, angvel));

    // limit the angular motion
    if( fAngle*timeStep > maxAngularStep )
    {
        fAngle = maxAngularStep / timeStep;
    }

    // scale = sin(0.5 * fAngle * timeStep) / fAngle; for tiny angles its Taylor expansion
    // is used so that the division by fAngle is avoided
    const float scale = (fAngle < 0.001f)
        ? (0.5f*timeStep-(timeStep*timeStep*timeStep)*0.020833333333f * fAngle * fAngle)
        : ( native_sin(0.5f * fAngle * timeStep) / fAngle);

    // incremental rotation for this step
    float4 dorn = angvel * scale;
    dorn.w = native_cos(fAngle * timeStep * 0.5f);

    const float4 orn0 = body->m_quat;
    body->m_quat = quatNorm(quatMult(dorn, orn0));

    // ---- linear velocity ----
    body->m_pos += body->m_linVel * timeStep;

    // apply gravity
    body->m_linVel += gravityAcceleration * timeStep;
}

View File

@@ -0,0 +1,96 @@
// This file is autogenerated by stringify.bat (premake --stringify) in the build folder of this project. Do not edit it by hand; change the kernel source and regenerate.
static const char* integrateKernelCL= \
"\n"
"float4 quatMult(float4 q1, float4 q2)\n"
"{\n"
" float4 q;\n"
" q.x = q1.w * q2.x + q1.x * q2.w + q1.y * q2.z - q1.z * q2.y;\n"
" q.y = q1.w * q2.y + q1.y * q2.w + q1.z * q2.x - q1.x * q2.z;\n"
" q.z = q1.w * q2.z + q1.z * q2.w + q1.x * q2.y - q1.y * q2.x;\n"
" q.w = q1.w * q2.w - q1.x * q2.x - q1.y * q2.y - q1.z * q2.z; \n"
" return q;\n"
"}\n"
"\n"
"float4 quatNorm(float4 q)\n"
"{\n"
" float len = native_sqrt(dot(q, q));\n"
" if(len > 0.f)\n"
" {\n"
" q *= 1.f / len;\n"
" }\n"
" else\n"
" {\n"
" q.x = q.y = q.z = 0.f;\n"
" q.w = 1.f;\n"
" }\n"
" return q;\n"
"}\n"
"\n"
"\n"
"typedef struct\n"
"{\n"
" float4 m_pos;\n"
" float4 m_quat;\n"
" float4 m_linVel;\n"
" float4 m_angVel;\n"
"\n"
" unsigned int m_collidableIdx;\n"
" float m_invMass;\n"
" float m_restituitionCoeff;\n"
" float m_frictionCoeff;\n"
"} Body;\n"
"\n"
"\n"
"\n"
"\n"
"__kernel void \n"
" integrateTransformsKernel( __global Body* bodies,const int numNodes, float timeStep, float angularDamping, float4 gravityAcceleration)\n"
"{\n"
" int nodeID = get_global_id(0);\n"
" float B3_GPU_ANGULAR_MOTION_THRESHOLD = (0.25f * 3.14159254f);\n"
" if( nodeID < numNodes && (bodies[nodeID].m_invMass != 0.f))\n"
" {\n"
" //angular velocity\n"
" {\n"
" float4 axis;\n"
" //add some hardcoded angular damping\n"
" bodies[nodeID].m_angVel.x *= angularDamping;\n"
" bodies[nodeID].m_angVel.y *= angularDamping;\n"
" bodies[nodeID].m_angVel.z *= angularDamping;\n"
" \n"
" float4 angvel = bodies[nodeID].m_angVel;\n"
" float fAngle = native_sqrt(dot(angvel, angvel));\n"
" //limit the angular motion\n"
" if(fAngle*timeStep > B3_GPU_ANGULAR_MOTION_THRESHOLD)\n"
" {\n"
" fAngle = B3_GPU_ANGULAR_MOTION_THRESHOLD / timeStep;\n"
" }\n"
" if(fAngle < 0.001f)\n"
" {\n"
" // use Taylor's expansions of sync function\n"
" axis = angvel * (0.5f*timeStep-(timeStep*timeStep*timeStep)*0.020833333333f * fAngle * fAngle);\n"
" }\n"
" else\n"
" {\n"
" // sync(fAngle) = sin(c*fAngle)/t\n"
" axis = angvel * ( native_sin(0.5f * fAngle * timeStep) / fAngle);\n"
" }\n"
" float4 dorn = axis;\n"
" dorn.w = native_cos(fAngle * timeStep * 0.5f);\n"
" float4 orn0 = bodies[nodeID].m_quat;\n"
"\n"
" float4 predictedOrn = quatMult(dorn, orn0);\n"
" predictedOrn = quatNorm(predictedOrn);\n"
" bodies[nodeID].m_quat=predictedOrn;\n"
" }\n"
"\n"
" //linear velocity \n"
" bodies[nodeID].m_pos += bodies[nodeID].m_linVel * timeStep;\n"
" \n"
" //apply gravity\n"
" bodies[nodeID].m_linVel += gravityAcceleration * timeStep;\n"
" \n"
" }\n"
"}\n"
"\n"
;

View File

@@ -0,0 +1,476 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
//#pragma OPENCL EXTENSION cl_amd_printf : enable
// Enable the 32-bit integer atomic extensions (base and extended, local and global memory).
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable
// Use the atomic-counter extension when the compiler advertises it; otherwise
// counter32_t is emulated with a plain volatile global int pointer.
#ifdef cl_ext_atomic_counters_32
#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable
#else
#define counter32_t volatile global int*
#endif
// Short fixed-width unsigned aliases used throughout the kernels.
typedef unsigned int u32;
typedef unsigned short u16;
typedef unsigned char u8;
// Work-item / work-group queries along dimension 0.
#define GET_GROUP_IDX get_group_id(0)
#define GET_LOCAL_IDX get_local_id(0)
#define GET_GLOBAL_IDX get_global_id(0)
#define GET_GROUP_SIZE get_local_size(0)
#define GET_NUM_GROUPS get_num_groups(0)
// Work-group barrier / fence; both name only CLK_LOCAL_MEM_FENCE.
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)
// Atomic wrappers. The Atom* forms take an lvalue and pass its address;
// AppendInc takes a pointer-like argument and passes it through unchanged.
#define AtomInc(x) atom_inc(&(x))
#define AtomInc1(x, out) out = atom_inc(&(x))
#define AppendInc(x, out) out = atomic_inc(x)
#define AtomAdd(x, value) atom_add(&(x), value)
#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )
#define AtomXhg(x, value) atom_xchg ( &(x), value )
// Note the argument order: the first macro argument is forwarded as select()'s first argument.
#define SELECT_UINT4( b, a, condition ) select( b,a,condition )
// mymake_float4(...) expands to an OpenCL vector literal (float4)(...).
#define mymake_float4 (float4)
//#define make_float2 (float2)
//#define make_uint4 (uint4)
//#define make_int4 (int4)
//#define make_uint2 (uint2)
//#define make_int2 (int2)
// Aliases for the built-in max / min.
#define max2 max
#define min2 min
///////////////////////////////////////
// Vector
///////////////////////////////////////
// Normalizes v over all four components using the built-in fast_normalize.
__inline
float4 fastNormalize4(float4 v)
{
    const float4 unit = fast_normalize(v);
    return unit;
}
// Cross product a x b, forwarded to the built-in cross for float4 operands.
__inline
float4 cross3(float4 a, float4 b)
{
    const float4 product = cross(a,b);
    return product;
}
// Dot product of the x, y, z parts of a and b; both w components are replaced by 0 first,
// so they never contribute to the result.
__inline
float dot3F4(float4 a, float4 b)
{
    return dot(mymake_float4(a.xyz,0.f), mymake_float4(b.xyz,0.f));
}
// Normalizes the x, y, z part of a; w is forced to 0 before normalizing so it does not
// contribute to the length.
__inline
float4 normalize3(const float4 a)
{
    const float4 xyzOnly = mymake_float4(a.xyz, 0.f);
    return fastNormalize4( xyzOnly );
}
///////////////////////////////////////
// Matrix3x3
///////////////////////////////////////
// 3x3 matrix stored as three float4 rows; mtMul1 and mtMul3 read only the x, y, z of each row.
typedef struct
{
float4 m_row[3];
}Matrix3x3;
// Forward declarations of the two matrix/vector products defined right below:
// mtMul1 computes matrix * column vector, mtMul3 computes row vector * matrix.
__inline
float4 mtMul1(Matrix3x3 a, float4 b);
__inline
float4 mtMul3(float4 a, Matrix3x3 b);
// Matrix times column vector: component i of the result is dot3(row i of a, b); w is 0.
__inline
float4 mtMul1(Matrix3x3 a, float4 b)
{
    const float x = dot3F4( a.m_row[0], b );
    const float y = dot3F4( a.m_row[1], b );
    const float z = dot3F4( a.m_row[2], b );
    return mymake_float4(x, y, z, 0.f);
}
// Row vector times matrix: component j of the result is dot3(a, column j of b).
// The w component of the result is 0, matching mtMul1 (it was previously left uninitialized).
__inline
float4 mtMul3(float4 a, Matrix3x3 b)
{
    // gather the columns of b out of its row storage
    float4 colx = mymake_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);
    float4 coly = mymake_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);
    float4 colz = mymake_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);

    float4 ans;
    ans.x = dot3F4( a, colx );
    ans.y = dot3F4( a, coly );
    ans.z = dot3F4( a, colz );
    // never hand an indeterminate value back to the caller
    ans.w = 0.f;
    return ans;
}
///////////////////////////////////////
// Quaternion
///////////////////////////////////////
// A quaternion is stored in a plain float4.
typedef float4 Quaternion;
// Work-group size; BatchSolveKernelContact requires exactly this size via reqd_work_group_size.
#define WG_SIZE 64
// Per-body state as read and written by solveContactConstraint.
// NOTE(review): the layout presumably has to match the host-side body struct - confirm before reordering fields.
typedef struct
{
// position; used to form the lever arms to the contact points
float4 m_pos;
// orientation; not read by the solver code in this file
Quaternion m_quat;
// linear velocity; updated by the contact solver
float4 m_linVel;
// angular velocity; updated by the contact solver
float4 m_angVel;
// not read by the solver code in this file
u32 m_shapeIdx;
// bodies with m_invMass == 0 get both velocities reset to zero by solveContactConstraint
float m_invMass;
// not read by the solver code in this file
float m_restituitionCoeff;
// not read by the solver code in this file
float m_frictionCoeff;
} Body;
// Inertia data; solveContactConstraint indexes the Shape array with the body index.
typedef struct
{
// inverse inertia matrix used when applying angular impulses
Matrix3x3 m_invInertia;
// not read by the solver code in this file
Matrix3x3 m_initInvInertia;
} Shape;
// Contact constraint between two bodies with up to four contact points that share one
// direction (m_linear); this is the record solveContact iterates over.
typedef struct
{
// shared constraint direction; a commented-out line in solveContactConstraint reads a friction coefficient from .w
float4 m_linear;
// contact point positions; solveContact subtracts the body positions from them
float4 m_worldPos[4];
// not read by the solver code in this file
float4 m_center;
// per-point factor applied to the velocity error; a value of 0 makes solveContact skip the point
float m_jacCoeffInv[4];
// per-point bias added to the relative velocity
float m_b[4];
// per-point accumulated impulse; solveContact keeps it >= 0
float m_appliedRambdaDt[4];
// presumably the friction counterparts of m_jacCoeffInv (not read in this file) - TODO confirm
float m_fJacCoeffInv[2];
// presumably the accumulated friction impulses (not read in this file) - TODO confirm
float m_fAppliedRambdaDt[2];
// index of the first body in the Body and Shape arrays
u32 m_bodyA;
// index of the second body in the Body and Shape arrays
u32 m_bodyB;
// batch number; BatchSolveKernelContact solves the constraint when this equals the current batch
int m_batchIdx;
u32 m_paddings[1];
} Constraint4;
// Contact record with up to four world-space points and one normal.
// Declared here but not referenced by the functions in this file.
typedef struct
{
float4 m_worldPos[4];
float4 m_worldNormal;
u32 m_coeffs;
int m_batchIdx;
// NOTE(review): the field names suggest a body index with the sign bit used as a flag - confirm against the producer of these records
int m_bodyAPtrAndSignBit;
int m_bodyBPtrAndSignBit;
} Contact4;
// Parameter block; declared here but not referenced by the functions in this file.
typedef struct
{
int m_nConstraints;
int m_start;
int m_batchIdx;
int m_nSplit;
// int m_paddings[1];
} ConstBuffer;
// Parameter block; declared here but not referenced by the functions in this file
// (BatchSolveKernelContact takes maxBatch, bIdx and nSplit as plain kernel arguments).
typedef struct
{
int m_solveFriction;
int m_maxBatch; // long batch really kills the performance
int m_batchIdx;
int m_nSplit;
// int m_paddings[1];
} ConstBufferBatchSolve;
// Builds the Jacobian parts for a constraint along n acting at lever arms r0 (first body)
// and r1 (second body):
//   *linear   = (-n.xyz, 0)
//   *angular0 = -(r0 x n)
//   *angular1 =   r1 x n
void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1);
void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1)
{
    const float4 negatedDir = mymake_float4(-n.xyz,0.f);
    const float4 r0CrossN = cross3(r0, n);
    const float4 r1CrossN = cross3(r1, n);

    *linear = negatedDir;
    *angular0 = -r0CrossN;
    *angular1 = r1CrossN;
}
// Relative velocity along the constraint: the sum of the four 3-component dot products
// l0.linVel0 + a0.angVel0 + l1.linVel1 + a1.angVel1, accumulated in that order.
float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 );
float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 )
{
    float relVel = dot3F4(l0, linVel0);
    relVel += dot3F4(a0, angVel0);
    relVel += dot3F4(l1, linVel1);
    relVel += dot3F4(a1, angVel1);
    return relVel;
}
// Returns -1 / (invMass0 + angular0.(angular0*invInertia0) + invMass1 + angular1.(angular1*invInertia1)).
// linear0 and linear1 are not read: they are taken to be normalized, so their
// contribution reduces to the inverse masses.
float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,
float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1);
float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,
float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1)
{
    const float linTerm0 = invMass0;
    const float angTerm0 = dot3F4(mtMul3(angular0,*invInertia0), angular0);
    const float linTerm1 = invMass1;
    const float angTerm1 = dot3F4(mtMul3(angular1,*invInertia1), angular1);

    const float denominator = linTerm0+angTerm0+linTerm1+angTerm1;
    return -1.f/denominator;
}
// Applies one solver pass for the (up to four) contact points of constraint cs.
//   cs                      constraint in global memory; its accumulated impulses are updated in place
//   posA, posB              body positions, used to form the lever arms to each contact point
//   linVelA/B, angVelA/B    body velocities, updated in place after every contact point
//   invMassA/B              inverse masses
//   invInertiaA/B           inverse inertia matrices
// Points are processed in order and each one sees the velocities already changed by the previous ones.
void solveContact(__global Constraint4* cs,
float4 posA, float4* linVelA, float4* angVelA, float invMassA, Matrix3x3 invInertiaA,
float4 posB, float4* linVelB, float4* angVelB, float invMassB, Matrix3x3 invInertiaB);
void solveContact(__global Constraint4* cs,
float4 posA, float4* linVelA, float4* angVelA, float invMassA, Matrix3x3 invInertiaA,
float4 posB, float4* linVelB, float4* angVelB, float invMassB, Matrix3x3 invInertiaB)
{
// the accumulated impulse of a contact point is kept inside [0, FLT_MAX]
float minRambdaDt = 0;
float maxRambdaDt = FLT_MAX;
for(int ic=0; ic<4; ic++)
{
// a zero coefficient marks a contact slot that is skipped
if( cs->m_jacCoeffInv[ic] == 0.f ) continue;
float4 angular0, angular1, linear;
// lever arms from each body position to the contact point
float4 r0 = cs->m_worldPos[ic] - posA;
float4 r1 = cs->m_worldPos[ic] - posB;
// called with the negated direction, so linear comes back as (m_linear.xyz, 0)
setLinearAndAngular( -cs->m_linear, r0, r1, &linear, &angular0, &angular1 );
// velocity error along the constraint plus the bias term, scaled by the per-point coefficient
float rambdaDt = calcRelVel( cs->m_linear, -cs->m_linear, angular0, angular1,
*linVelA, *angVelA, *linVelB, *angVelB ) + cs->m_b[ic];
rambdaDt *= cs->m_jacCoeffInv[ic];
{
// clamp the accumulated impulse and apply only the difference to the previous total
float prevSum = cs->m_appliedRambdaDt[ic];
float updated = prevSum;
updated += rambdaDt;
updated = max2( updated, minRambdaDt );
updated = min2( updated, maxRambdaDt );
rambdaDt = updated - prevSum;
cs->m_appliedRambdaDt[ic] = updated;
}
// equal and opposite linear impulses; angular impulses through the inverse inertia
float4 linImp0 = invMassA*linear*rambdaDt;
float4 linImp1 = invMassB*(-linear)*rambdaDt;
float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt;
float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt;
*linVelA += linImp0;
*angVelA += angImp0;
*linVelB += linImp1;
*angVelB += angImp1;
}
}
// Given a direction *n, writes two vectors *p and *q that are perpendicular to it, with q = n x p.
// Only the x, y and z components of *p and *q are written; w is left as it was.
// The branch is chosen on |n.z| against 1/sqrt(2) so that the length being inverted is not
// tiny for a unit-length n.
void btPlaneSpace1 (const float4* n, float4* p, float4* q);
void btPlaneSpace1 (const float4* n, float4* p, float4* q)
{
    if (!(fabs(n->z) > 0.70710678f))
    {
        // choose p in x-y plane
        const float lenSq = n->x*n->x + n->y*n->y;
        const float invLen = 1.f/sqrt(lenSq);
        p->x = -n->y*invLen;
        p->y = n->x*invLen;
        p->z = 0;
        // set q = n x p
        q->x = -n->z*p->y;
        q->y = n->z*p->x;
        q->z = lenSq*invLen;
    }
    else
    {
        // choose p in y-z plane
        const float lenSq = n->y*n->y + n->z*n->z;
        const float invLen = 1.f/sqrt(lenSq);
        p->x = 0;
        p->y = -n->z*invLen;
        p->z = n->y*invLen;
        // set q = n x p
        q->x = lenSq*invLen;
        q->y = -n->x*p->z;
        q->z = n->x*p->y;
    }
}
// Solves one contact constraint: loads the state of its two bodies, runs solveContact on
// private copies of the velocities and writes the velocities back.
// A body whose inverse mass is 0 gets both velocities written back as zero instead.
// Inverse inertia is fetched from gShapes using the body index.
void solveContactConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs);
void solveContactConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs)
{
    const int aIdx = ldsCs->m_bodyA;
    const int bIdx = ldsCs->m_bodyB;
    __global Body* bodyA = &gBodies[aIdx];
    __global Body* bodyB = &gBodies[bIdx];

    // private working copies of body A
    const float4 posA = bodyA->m_pos;
    float4 linVelA = bodyA->m_linVel;
    float4 angVelA = bodyA->m_angVel;
    const float invMassA = bodyA->m_invMass;
    const Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;

    // private working copies of body B
    const float4 posB = bodyB->m_pos;
    float4 linVelB = bodyB->m_linVel;
    float4 angVelB = bodyB->m_angVel;
    const float invMassB = bodyB->m_invMass;
    const Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;

    solveContact( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA,
        posB, &linVelB, &angVelB, invMassB, invInertiaB );

    const float4 zero = mymake_float4(0,0,0,0);

    // write back body A
    if (invMassA != 0.f)
    {
        bodyA->m_linVel = linVelA;
        bodyA->m_angVel = angVelA;
    }
    else
    {
        bodyA->m_linVel = zero;
        bodyA->m_angVel = zero;
    }

    // write back body B
    if (invMassB != 0.f)
    {
        bodyB->m_linVel = linVelB;
        bodyB->m_angVel = angVelB;
    }
    else
    {
        bodyB->m_linVel = zero;
        bodyB->m_angVel = zero;
    }
}
// Generic debug record (four ints, four floats). In this file it is only mentioned in
// commented-out debug lines of BatchSolveKernelContact.
typedef struct
{
int m_valInt0;
int m_valInt1;
int m_valInt2;
int m_valInt3;
float m_val0;
float m_val1;
float m_val2;
float m_val3;
} SolverDebugInfo;
// Solves the contact constraints of one grid cell per work-group, batch by batch.
//   gBodies, gShapes   body state and inertia data, updated through solveContactConstraint
//   gConstraints       constraints; each cell's range is expected to be ordered by m_batchIdx
//   gN, gOffsets       per-cell constraint count and start offset into gConstraints
//   maxBatch           number of batch indices to step through
//   bIdx               selects which cell of every 2x2 block this launch handles (bit 0: x, bit 1: y)
//   nSplit             grid resolution per axis; nSplit/2 is used as a divisor, so it must be at least 2
// Every work-item walks the cell's range with a stride of WG_SIZE starting at its local index
// and solves constraints for as long as they belong to the current batch; the group then
// synchronizes and moves on to the next batch.
// NOTE(review): the barriers only name CLK_LOCAL_MEM_FENCE although the velocities written by
// solveContactConstraint live in global memory - confirm this is sufficient on the target devices.
__kernel
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
void BatchSolveKernelContact(__global Body* gBodies,
    __global Shape* gShapes,
    __global Constraint4* gConstraints,
    __global int* gN,
    __global int* gOffsets,
    int maxBatch,
    int bIdx,
    int nSplit
    )
{
    __local int ldsCurBatch;
    __local int ldsStart;

    int lIdx = GET_LOCAL_IDX;
    int wgIdx = GET_GROUP_IDX;

    // map the work-group to its cell inside the nSplit x nSplit grid
    int xIdx = (wgIdx/(nSplit/2))*2 + (bIdx&1);
    int yIdx = (wgIdx%(nSplit/2))*2 + (bIdx>>1);
    int cellIdx = xIdx+yIdx*nSplit;

    // the count is the same for the whole group, so either every work-item leaves here or none does
    if( gN[cellIdx] == 0 )
        return;

    const int start = gOffsets[cellIdx];
    const int end = start + gN[cellIdx];

    if( lIdx == 0 )
    {
        ldsCurBatch = 0;
        ldsStart = start;
    }
    GROUP_LDS_BARRIER;

    int idx=ldsStart+lIdx;
    while (ldsCurBatch < maxBatch)
    {
        // Consume this work-item's constraints of the current batch. The stride has to equal
        // the work-group size, which reqd_work_group_size pins to WG_SIZE.
        while (idx<end && gConstraints[idx].m_batchIdx == ldsCurBatch)
        {
            solveContactConstraint( gBodies, gShapes, &gConstraints[idx] );
            idx+=WG_SIZE;
        }

        // everyone has finished the batch before the counter moves on ...
        GROUP_LDS_BARRIER;
        if( lIdx == 0 )
        {
            ldsCurBatch++;
        }
        // ... and everyone sees the new batch index before testing it again
        GROUP_LDS_BARRIER;
    }
}

View File

@@ -0,0 +1,480 @@
// This file is autogenerated by stringify.bat (premake --stringify) in the build folder of this project. Do not edit it by hand; change the kernel source and regenerate.
static const char* solveContactCL= \
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"\n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"\n"
"\n"
"//#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
"\n"
"\n"
"#ifdef cl_ext_atomic_counters_32\n"
"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
"#else\n"
"#define counter32_t volatile global int*\n"
"#endif\n"
"\n"
"typedef unsigned int u32;\n"
"typedef unsigned short u16;\n"
"typedef unsigned char u8;\n"
"\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GET_NUM_GROUPS get_num_groups(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
"#define AtomInc(x) atom_inc(&(x))\n"
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
"#define AppendInc(x, out) out = atomic_inc(x)\n"
"#define AtomAdd(x, value) atom_add(&(x), value)\n"
"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
"\n"
"\n"
"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
"\n"
"#define mymake_float4 (float4)\n"
"//#define make_float2 (float2)\n"
"//#define make_uint4 (uint4)\n"
"//#define make_int4 (int4)\n"
"//#define make_uint2 (uint2)\n"
"//#define make_int2 (int2)\n"
"\n"
"\n"
"#define max2 max\n"
"#define min2 min\n"
"\n"
"\n"
"///////////////////////////////////////\n"
"// Vector\n"
"///////////////////////////////////////\n"
"\n"
"\n"
"\n"
"\n"
"__inline\n"
"float4 fastNormalize4(float4 v)\n"
"{\n"
" return fast_normalize(v);\n"
"}\n"
"\n"
"\n"
"\n"
"__inline\n"
"float4 cross3(float4 a, float4 b)\n"
"{\n"
" return cross(a,b);\n"
"}\n"
"\n"
"__inline\n"
"float dot3F4(float4 a, float4 b)\n"
"{\n"
" float4 a1 = mymake_float4(a.xyz,0.f);\n"
" float4 b1 = mymake_float4(b.xyz,0.f);\n"
" return dot(a1, b1);\n"
"}\n"
"\n"
"\n"
"\n"
"\n"
"__inline\n"
"float4 normalize3(const float4 a)\n"
"{\n"
" float4 n = mymake_float4(a.x, a.y, a.z, 0.f);\n"
" return fastNormalize4( n );\n"
"// float length = sqrtf(dot3F4(a, a));\n"
"// return 1.f/length * a;\n"
"}\n"
"\n"
"\n"
"\n"
"\n"
"///////////////////////////////////////\n"
"// Matrix3x3\n"
"///////////////////////////////////////\n"
"\n"
"typedef struct\n"
"{\n"
" float4 m_row[3];\n"
"}Matrix3x3;\n"
"\n"
"\n"
"\n"
"\n"
"\n"
"\n"
"__inline\n"
"float4 mtMul1(Matrix3x3 a, float4 b);\n"
"\n"
"__inline\n"
"float4 mtMul3(float4 a, Matrix3x3 b);\n"
"\n"
"\n"
"\n"
"\n"
"__inline\n"
"float4 mtMul1(Matrix3x3 a, float4 b)\n"
"{\n"
" float4 ans;\n"
" ans.x = dot3F4( a.m_row[0], b );\n"
" ans.y = dot3F4( a.m_row[1], b );\n"
" ans.z = dot3F4( a.m_row[2], b );\n"
" ans.w = 0.f;\n"
" return ans;\n"
"}\n"
"\n"
"__inline\n"
"float4 mtMul3(float4 a, Matrix3x3 b)\n"
"{\n"
" float4 colx = mymake_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n"
" float4 coly = mymake_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
" float4 colz = mymake_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
"\n"
" float4 ans;\n"
" ans.x = dot3F4( a, colx );\n"
" ans.y = dot3F4( a, coly );\n"
" ans.z = dot3F4( a, colz );\n"
" return ans;\n"
"}\n"
"\n"
"///////////////////////////////////////\n"
"// Quaternion\n"
"///////////////////////////////////////\n"
"\n"
"typedef float4 Quaternion;\n"
"\n"
"\n"
"\n"
"\n"
"\n"
"\n"
"\n"
"#define WG_SIZE 64\n"
"\n"
"typedef struct\n"
"{\n"
" float4 m_pos;\n"
" Quaternion m_quat;\n"
" float4 m_linVel;\n"
" float4 m_angVel;\n"
"\n"
" u32 m_shapeIdx;\n"
" float m_invMass;\n"
" float m_restituitionCoeff;\n"
" float m_frictionCoeff;\n"
"} Body;\n"
"\n"
"typedef struct\n"
"{\n"
" Matrix3x3 m_invInertia;\n"
" Matrix3x3 m_initInvInertia;\n"
"} Shape;\n"
"\n"
"typedef struct\n"
"{\n"
" float4 m_linear;\n"
" float4 m_worldPos[4];\n"
" float4 m_center; \n"
" float m_jacCoeffInv[4];\n"
" float m_b[4];\n"
" float m_appliedRambdaDt[4];\n"
"\n"
" float m_fJacCoeffInv[2]; \n"
" float m_fAppliedRambdaDt[2]; \n"
"\n"
" u32 m_bodyA;\n"
" u32 m_bodyB;\n"
"\n"
" int m_batchIdx;\n"
" u32 m_paddings[1];\n"
"} Constraint4;\n"
"\n"
"typedef struct\n"
"{\n"
" float4 m_worldPos[4];\n"
" float4 m_worldNormal;\n"
" u32 m_coeffs;\n"
" int m_batchIdx;\n"
"\n"
" int m_bodyAPtrAndSignBit;\n"
" int m_bodyBPtrAndSignBit;\n"
"} Contact4;\n"
"\n"
"typedef struct\n"
"{\n"
" int m_nConstraints;\n"
" int m_start;\n"
" int m_batchIdx;\n"
" int m_nSplit;\n"
"// int m_paddings[1];\n"
"} ConstBuffer;\n"
"\n"
"typedef struct\n"
"{\n"
" int m_solveFriction;\n"
" int m_maxBatch; // long batch really kills the performance\n"
" int m_batchIdx;\n"
" int m_nSplit;\n"
"// int m_paddings[1];\n"
"} ConstBufferBatchSolve;\n"
"\n"
"void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1);\n"
"\n"
"void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1)\n"
"{\n"
" *linear = mymake_float4(-n.xyz,0.f);\n"
" *angular0 = -cross3(r0, n);\n"
" *angular1 = cross3(r1, n);\n"
"}\n"
"\n"
"float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 );\n"
"\n"
"float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 )\n"
"{\n"
" return dot3F4(l0, linVel0) + dot3F4(a0, angVel0) + dot3F4(l1, linVel1) + dot3F4(a1, angVel1);\n"
"}\n"
"\n"
"\n"
"float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n"
" float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1);\n"
"\n"
"float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n"
" float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1)\n"
"{\n"
" // linear0,1 are normlized\n"
" float jmj0 = invMass0;//dot3F4(linear0, linear0)*invMass0;\n"
" float jmj1 = dot3F4(mtMul3(angular0,*invInertia0), angular0);\n"
" float jmj2 = invMass1;//dot3F4(linear1, linear1)*invMass1;\n"
" float jmj3 = dot3F4(mtMul3(angular1,*invInertia1), angular1);\n"
" return -1.f/(jmj0+jmj1+jmj2+jmj3);\n"
"}\n"
"\n"
"\n"
"void solveContact(__global Constraint4* cs,\n"
" float4 posA, float4* linVelA, float4* angVelA, float invMassA, Matrix3x3 invInertiaA,\n"
" float4 posB, float4* linVelB, float4* angVelB, float invMassB, Matrix3x3 invInertiaB);\n"
"\n"
"void solveContact(__global Constraint4* cs,\n"
" float4 posA, float4* linVelA, float4* angVelA, float invMassA, Matrix3x3 invInertiaA,\n"
" float4 posB, float4* linVelB, float4* angVelB, float invMassB, Matrix3x3 invInertiaB)\n"
"{\n"
" float minRambdaDt = 0;\n"
" float maxRambdaDt = FLT_MAX;\n"
"\n"
" for(int ic=0; ic<4; ic++)\n"
" {\n"
" if( cs->m_jacCoeffInv[ic] == 0.f ) continue;\n"
"\n"
" float4 angular0, angular1, linear;\n"
" float4 r0 = cs->m_worldPos[ic] - posA;\n"
" float4 r1 = cs->m_worldPos[ic] - posB;\n"
" setLinearAndAngular( -cs->m_linear, r0, r1, &linear, &angular0, &angular1 );\n"
"\n"
" float rambdaDt = calcRelVel( cs->m_linear, -cs->m_linear, angular0, angular1, \n"
" *linVelA, *angVelA, *linVelB, *angVelB ) + cs->m_b[ic];\n"
" rambdaDt *= cs->m_jacCoeffInv[ic];\n"
"\n"
" {\n"
" float prevSum = cs->m_appliedRambdaDt[ic];\n"
" float updated = prevSum;\n"
" updated += rambdaDt;\n"
" updated = max2( updated, minRambdaDt );\n"
" updated = min2( updated, maxRambdaDt );\n"
" rambdaDt = updated - prevSum;\n"
" cs->m_appliedRambdaDt[ic] = updated;\n"
" }\n"
"\n"
" float4 linImp0 = invMassA*linear*rambdaDt;\n"
" float4 linImp1 = invMassB*(-linear)*rambdaDt;\n"
" float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt;\n"
" float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt;\n"
"\n"
" *linVelA += linImp0;\n"
" *angVelA += angImp0;\n"
" *linVelB += linImp1;\n"
" *angVelB += angImp1;\n"
" }\n"
"}\n"
"\n"
"void b3PlaneSpace1 (const float4* n, float4* p, float4* q);\n"
" void b3PlaneSpace1 (const float4* n, float4* p, float4* q)\n"
"{\n"
" if (fabs(n[0].z) > 0.70710678f) {\n"
" // choose p in y-z plane\n"
" float a = n[0].y*n[0].y + n[0].z*n[0].z;\n"
" float k = 1.f/sqrt(a);\n"
" p[0].x = 0;\n"
" p[0].y = -n[0].z*k;\n"
" p[0].z = n[0].y*k;\n"
" // set q = n x p\n"
" q[0].x = a*k;\n"
" q[0].y = -n[0].x*p[0].z;\n"
" q[0].z = n[0].x*p[0].y;\n"
" }\n"
" else {\n"
" // choose p in x-y plane\n"
" float a = n[0].x*n[0].x + n[0].y*n[0].y;\n"
" float k = 1.f/sqrt(a);\n"
" p[0].x = -n[0].y*k;\n"
" p[0].y = n[0].x*k;\n"
" p[0].z = 0;\n"
" // set q = n x p\n"
" q[0].x = -n[0].z*p[0].y;\n"
" q[0].y = n[0].z*p[0].x;\n"
" q[0].z = a*k;\n"
" }\n"
"}\n"
"\n"
"void solveContactConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs);\n"
"void solveContactConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs)\n"
"{\n"
" //float frictionCoeff = ldsCs[0].m_linear.w;\n"
" int aIdx = ldsCs[0].m_bodyA;\n"
" int bIdx = ldsCs[0].m_bodyB;\n"
"\n"
" float4 posA = gBodies[aIdx].m_pos;\n"
" float4 linVelA = gBodies[aIdx].m_linVel;\n"
" float4 angVelA = gBodies[aIdx].m_angVel;\n"
" float invMassA = gBodies[aIdx].m_invMass;\n"
" Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n"
"\n"
" float4 posB = gBodies[bIdx].m_pos;\n"
" float4 linVelB = gBodies[bIdx].m_linVel;\n"
" float4 angVelB = gBodies[bIdx].m_angVel;\n"
" float invMassB = gBodies[bIdx].m_invMass;\n"
" Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n"
"\n"
" solveContact( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA,\n"
" posB, &linVelB, &angVelB, invMassB, invInertiaB );\n"
"\n"
" if (gBodies[aIdx].m_invMass)\n"
" {\n"
" gBodies[aIdx].m_linVel = linVelA;\n"
" gBodies[aIdx].m_angVel = angVelA;\n"
" } else\n"
" {\n"
" gBodies[aIdx].m_linVel = mymake_float4(0,0,0,0);\n"
" gBodies[aIdx].m_angVel = mymake_float4(0,0,0,0);\n"
" \n"
" }\n"
" if (gBodies[bIdx].m_invMass)\n"
" {\n"
" gBodies[bIdx].m_linVel = linVelB;\n"
" gBodies[bIdx].m_angVel = angVelB;\n"
" } else\n"
" {\n"
" gBodies[bIdx].m_linVel = mymake_float4(0,0,0,0);\n"
" gBodies[bIdx].m_angVel = mymake_float4(0,0,0,0);\n"
" \n"
" }\n"
"\n"
"}\n"
"\n"
"\n"
"\n"
"typedef struct \n"
"{\n"
" int m_valInt0;\n"
" int m_valInt1;\n"
" int m_valInt2;\n"
" int m_valInt3;\n"
"\n"
" float m_val0;\n"
" float m_val1;\n"
" float m_val2;\n"
" float m_val3;\n"
"} SolverDebugInfo;\n"
"\n"
"\n"
"\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"void BatchSolveKernelContact(__global Body* gBodies,\n"
" __global Shape* gShapes,\n"
" __global Constraint4* gConstraints,\n"
" __global int* gN,\n"
" __global int* gOffsets,\n"
" int maxBatch,\n"
" int bIdx,\n"
" int nSplit\n"
" )\n"
"{\n"
" //__local int ldsBatchIdx[WG_SIZE+1];\n"
" __local int ldsCurBatch;\n"
" __local int ldsNextBatch;\n"
" __local int ldsStart;\n"
"\n"
" int lIdx = GET_LOCAL_IDX;\n"
" int wgIdx = GET_GROUP_IDX;\n"
"\n"
"// int gIdx = GET_GLOBAL_IDX;\n"
"// debugInfo[gIdx].m_valInt0 = gIdx;\n"
" //debugInfo[gIdx].m_valInt1 = GET_GROUP_SIZE;\n"
"\n"
"\n"
" int xIdx = (wgIdx/(nSplit/2))*2 + (bIdx&1);\n"
" int yIdx = (wgIdx%(nSplit/2))*2 + (bIdx>>1);\n"
" int cellIdx = xIdx+yIdx*nSplit;\n"
" \n"
" if( gN[cellIdx] == 0 ) \n"
" return;\n"
"\n"
" const int start = gOffsets[cellIdx];\n"
" const int end = start + gN[cellIdx];\n"
"\n"
" \n"
" if( lIdx == 0 )\n"
" {\n"
" ldsCurBatch = 0;\n"
" ldsNextBatch = 0;\n"
" ldsStart = start;\n"
" }\n"
"\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" int idx=ldsStart+lIdx;\n"
" while (ldsCurBatch < maxBatch)\n"
" {\n"
" for(; idx<end; )\n"
" {\n"
" if (gConstraints[idx].m_batchIdx == ldsCurBatch)\n"
" {\n"
" solveContactConstraint( gBodies, gShapes, &gConstraints[idx] );\n"
"\n"
" idx+=64;\n"
" } else\n"
" {\n"
" break;\n"
" }\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" if( lIdx == 0 )\n"
" {\n"
" ldsCurBatch++;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" }\n"
" \n"
" \n"
"}\n"
"\n"
;

View File

@@ -0,0 +1,506 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
// ---- Extension setup -------------------------------------------------------
//#pragma OPENCL EXTENSION cl_amd_printf : enable
// 32-bit atomics on local and global memory (used by the Atom* macros below).
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable
// When the atomic-counter extension macro is not defined, counter32_t falls back to
// a plain volatile global int pointer.
#ifdef cl_ext_atomic_counters_32
#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable
#else
#define counter32_t volatile global int*
#endif
// ---- Short integer aliases -------------------------------------------------
typedef unsigned int u32;
typedef unsigned short u16;
typedef unsigned char u8;
// ---- Work-item / work-group queries, all along dimension 0 -----------------
#define GET_GROUP_IDX get_group_id(0)
#define GET_LOCAL_IDX get_local_id(0)
#define GET_GLOBAL_IDX get_global_id(0)
#define GET_GROUP_SIZE get_local_size(0)
#define GET_NUM_GROUPS get_num_groups(0)
// Work-group barrier / fence that only cover local memory (CLK_LOCAL_MEM_FENCE).
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)
// ---- Atomic wrappers ---------------------------------------------------------
// The Atom* macros take an lvalue and pass its address; AppendInc takes a pointer
// and passes it through unchanged.
#define AtomInc(x) atom_inc(&(x))
#define AtomInc1(x, out) out = atom_inc(&(x))
#define AppendInc(x, out) out = atomic_inc(x)
#define AtomAdd(x, value) atom_add(&(x), value)
#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )
#define AtomXhg(x, value) atom_xchg ( &(x), value )
#define SELECT_UINT4( b, a, condition ) select( b,a,condition )
// mymake_float4(...) expands to the vector literal (float4)(...).
#define mymake_float4 (float4)
//#define make_float2 (float2)
//#define make_uint4 (uint4)
//#define make_int4 (int4)
//#define make_uint2 (uint2)
//#define make_int2 (int2)
// Aliases for the built-in max / min.
#define max2 max
#define min2 min
///////////////////////////////////////
// Vector
///////////////////////////////////////
// Normalizes all four lanes of v with the fast_normalize() built-in.
__inline
float4 fastNormalize4(float4 v)
{
	const float4 unitV = fast_normalize(v);
	return unitV;
}
// Thin wrapper around the cross() built-in for float4 operands.
__inline
float4 cross3(float4 a, float4 b)
{
	const float4 aCrossB = cross(a, b);
	return aCrossB;
}
// Dot product of the xyz lanes of a and b; the w lanes are replaced by zero first.
__inline
float dot3F4(float4 a, float4 b)
{
	const float4 lhs = mymake_float4(a.xyz, 0.f);
	const float4 rhs = mymake_float4(b.xyz, 0.f);
	return dot(lhs, rhs);
}
// Normalizes the xyz part of a; w is forced to zero before normalizing so it
// cannot contribute to the length.
__inline
float4 normalize3(const float4 a)
{
	const float4 xyzOnly = mymake_float4(a.x, a.y, a.z, 0.f);
	return fastNormalize4( xyzOnly );
}
///////////////////////////////////////
// Matrix3x3
///////////////////////////////////////
// 3x3 matrix stored as three float4 rows. mtMul1 and mtMul3 only read the
// x, y and z lanes of each row.
typedef struct
{
float4 m_row[3];
}Matrix3x3;
__inline float4 mtMul1(Matrix3x3 a, float4 b);
__inline float4 mtMul3(float4 a, Matrix3x3 b);

// Matrix * column vector: lane i of the result is dot3(row i of a, b); w is 0.
__inline
float4 mtMul1(Matrix3x3 a, float4 b)
{
	const float rx = dot3F4( a.m_row[0], b );
	const float ry = dot3F4( a.m_row[1], b );
	const float rz = dot3F4( a.m_row[2], b );
	return mymake_float4(rx, ry, rz, 0.f);
}
// Row vector * matrix: lane j of the result is dot3(a, column j of b).
// Only the xyz lanes of a and of the rows of b are read.
// The w lane of the result is set to 0, matching mtMul1; the previous version
// returned it uninitialized.
__inline
float4 mtMul3(float4 a, Matrix3x3 b)
{
	float4 colx = mymake_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);
	float4 coly = mymake_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);
	float4 colz = mymake_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);
	float4 ans;
	ans.x = dot3F4( a, colx );
	ans.y = dot3F4( a, coly );
	ans.z = dot3F4( a, colz );
	ans.w = 0.f;
	return ans;
}
///////////////////////////////////////
// Quaternion
///////////////////////////////////////
// Quaternions are stored as plain float4 values.
typedef float4 Quaternion;
// Work-group size; BatchSolveKernelFriction declares it via reqd_work_group_size.
#define WG_SIZE 64
// Per-body record of the gBodies buffer.
// solveFrictionConstraint reads m_pos, m_linVel, m_angVel and m_invMass, and
// writes m_linVel / m_angVel back; a body whose m_invMass is zero gets zero
// velocities stored. The other fields are not referenced in this file.
typedef struct
{
float4 m_pos;
Quaternion m_quat;
float4 m_linVel;
float4 m_angVel;
u32 m_shapeIdx;
float m_invMass;
float m_restituitionCoeff;
float m_frictionCoeff;
} Body;
// Per-entry inertia record of the gShapes buffer.
// solveFrictionConstraint reads m_invInertia, indexing gShapes with the same
// index it uses for gBodies. m_initInvInertia is not referenced in this file.
typedef struct
{
Matrix3x3 m_invInertia;
Matrix3x3 m_initInvInertia;
} Shape;
// One contact constraint with room for four contact points.
// Fields used by this file:
//   m_linear           - negated to obtain the direction n from which the two friction tangents are built
//                        (its w lane is read as a friction coefficient, then overridden by a constant)
//   m_center           - point from which the lever arms to both bodies are measured
//   m_appliedRambdaDt  - summed over all four entries to bound the friction impulse
//   m_fJacCoeffInv     - per-tangent scale applied to the relative velocity
//   m_fAppliedRambdaDt - per-tangent accumulated friction impulse (read and updated)
//   m_bodyA / m_bodyB  - indices into gBodies and gShapes
//   m_batchIdx         - compared against the current batch in BatchSolveKernelFriction
// m_worldPos, m_jacCoeffInv, m_b and m_paddings are not referenced in this file.
typedef struct
{
float4 m_linear;
float4 m_worldPos[4];
float4 m_center;
float m_jacCoeffInv[4];
float m_b[4];
float m_appliedRambdaDt[4];
float m_fJacCoeffInv[2];
float m_fAppliedRambdaDt[2];
u32 m_bodyA;
u32 m_bodyB;
int m_batchIdx;
u32 m_paddings[1];
} Constraint4;
// Contact record with four world-space points and one world-space normal.
// Declared here but not referenced by any function in this file.
typedef struct
{
float4 m_worldPos[4];
float4 m_worldNormal;
u32 m_coeffs;
int m_batchIdx;
int m_bodyAPtrAndSignBit;
int m_bodyBPtrAndSignBit;
} Contact4;
// Parameter block of four ints; declared here but not referenced by any
// function in this file.
typedef struct
{
int m_nConstraints;
int m_start;
int m_batchIdx;
int m_nSplit;
// int m_paddings[1];
} ConstBuffer;
// Parameter block for a batched solve; declared here but not referenced by any
// function in this file (BatchSolveKernelFriction takes maxBatch, bIdx and
// nSplit as separate kernel arguments instead).
typedef struct
{
int m_solveFriction;
int m_maxBatch; // long batch really kills the performance
int m_batchIdx;
int m_nSplit;
// int m_paddings[1];
} ConstBufferBatchSolve;
void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1);

// Builds the Jacobian directions for direction n and lever arms r0 / r1:
//   *linear   = (-n.xyz, 0)
//   *angular0 = -(r0 x n)
//   *angular1 =   r1 x n
void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1)
{
	const float4 r0CrossN = cross3(r0, n);
	const float4 r1CrossN = cross3(r1, n);

	*linear = mymake_float4(-n.xyz, 0.f);
	*angular0 = -r0CrossN;
	*angular1 = r1CrossN;
}
float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 );

// Sum of the four xyz dot products that pair each Jacobian direction with the
// matching linear / angular velocity of body 0 and body 1.
float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 )
{
	float relVel = dot3F4(l0, linVel0);
	relVel += dot3F4(a0, angVel0);
	relVel += dot3F4(l1, linVel1);
	relVel += dot3F4(a1, angVel1);
	return relVel;
}
float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,
	float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1);

// Returns -1 / (invMass0 + a0.(a0*I0) + invMass1 + a1.(a1*I1)).
// linear0 / linear1 are not read: they are treated as unit length, so their
// contribution reduces to the inverse masses.
float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,
	float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1)
{
	const float angTerm0 = dot3F4(mtMul3(angular0,*invInertia0), angular0);
	const float angTerm1 = dot3F4(mtMul3(angular1,*invInertia1), angular1);
	const float denom = invMass0+angTerm0+invMass1+angTerm1;
	return -1.f/denom;
}
void btPlaneSpace1 (const float4* n, float4* p, float4* q);

// Writes two vectors p and q (xyz lanes only) built from *n: p is chosen in the
// y-z plane when |n.z| exceeds 0.70710678, otherwise in the x-y plane, and q is
// set to n x p. The w lanes of *p and *q are left untouched.
void btPlaneSpace1 (const float4* n, float4* p, float4* q)
{
	if (!(fabs(n->z) > 0.70710678f)) {
		// choose p in x-y plane
		float lenSq = n->x*n->x + n->y*n->y;
		float invLen = 1.f/sqrt(lenSq);
		p->x = -n->y*invLen;
		p->y = n->x*invLen;
		p->z = 0;
		// set q = n x p
		q->x = -n->z*p->y;
		q->y = n->z*p->x;
		q->z = lenSq*invLen;
	}
	else {
		// choose p in y-z plane
		float lenSq = n->y*n->y + n->z*n->z;
		float invLen = 1.f/sqrt(lenSq);
		p->x = 0;
		p->y = -n->z*invLen;
		p->z = n->y*invLen;
		// set q = n x p
		q->x = lenSq*invLen;
		q->y = -n->x*p->z;
		q->z = n->x*p->y;
	}
}
void solveFrictionConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs);

// Runs one friction iteration for a single constraint and writes the resulting
// velocities of both bodies back to gBodies.
//
// gBodies : body buffer, indexed by the constraint's m_bodyA / m_bodyB
// gShapes : inverse-inertia buffer, indexed with the same body indices
// ldsCs   : the constraint to solve; m_fAppliedRambdaDt is updated in place
//
// For each of the two tangent directions the impulse is clamped to
// +/- frictionCoeff * sum(m_appliedRambdaDt[0..3]).
// When both tangent coefficients are zero the function returns early and does
// not write any velocity back.
void solveFrictionConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs)
{
	// NOTE(review): the original read a per-constraint coefficient from
	// ldsCs->m_linear.w and then unconditionally replaced it with 0.7f.
	// That behaviour is kept; only the dead read was dropped.
	const float frictionCoeff = 0.7f;
	int aIdx = ldsCs[0].m_bodyA;
	int bIdx = ldsCs[0].m_bodyB;

	float4 posA = gBodies[aIdx].m_pos;
	float4 linVelA = gBodies[aIdx].m_linVel;
	float4 angVelA = gBodies[aIdx].m_angVel;
	float invMassA = gBodies[aIdx].m_invMass;
	Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;

	float4 posB = gBodies[bIdx].m_pos;
	float4 linVelB = gBodies[bIdx].m_linVel;
	float4 angVelB = gBodies[bIdx].m_angVel;
	float invMassB = gBodies[bIdx].m_invMass;
	Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;

	{
		// Impulse bound shared by both tangent directions.
		// (presumably m_appliedRambdaDt holds the accumulated normal impulses
		// of the four contact points -- confirm against the contact solver)
		float sum = 0;
		for(int j=0; j<4; j++)
		{
			sum += ldsCs[0].m_appliedRambdaDt[j];
		}
		const float maxRambdaDt = frictionCoeff*sum;
		const float minRambdaDt = -maxRambdaDt;

		{
			__global Constraint4* cs = ldsCs;

			// Skip the constraint only when BOTH tangent coefficients are zero.
			// The original tested index 0 twice, so a zero first coefficient
			// skipped the second tangent as well.
			if( cs->m_fJacCoeffInv[0] == 0 && cs->m_fJacCoeffInv[1] == 0 ) return;

			const float4 center = cs->m_center;
			float4 n = -cs->m_linear;

			float4 tangent[2];
			btPlaneSpace1(&n,&tangent[0],&tangent[1]);

			float4 angular0, angular1, linear;
			float4 r0 = center - posA;
			float4 r1 = center - posB;
			for(int i=0; i<2; i++)
			{
				setLinearAndAngular( tangent[i], r0, r1, &linear, &angular0, &angular1 );
				float rambdaDt = calcRelVel(linear, -linear, angular0, angular1,
					linVelA, angVelA, linVelB, angVelB );
				rambdaDt *= cs->m_fJacCoeffInv[i];

				{
					// Clamp the accumulated impulse, then apply only the delta.
					float prevSum = cs->m_fAppliedRambdaDt[i];
					float updated = prevSum;
					updated += rambdaDt;
					updated = max2( updated, minRambdaDt );
					updated = min2( updated, maxRambdaDt );
					rambdaDt = updated - prevSum;
					cs->m_fAppliedRambdaDt[i] = updated;
				}

				float4 linImp0 = invMassA*linear*rambdaDt;
				float4 linImp1 = invMassB*(-linear)*rambdaDt;
				float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt;
				float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt;

				linVelA += linImp0;
				angVelA += angImp0;
				linVelB += linImp1;
				angVelB += angImp1;
			}
			{	// angular damping for point constraint
				float4 ab = normalize3( posB - posA );
				float4 ac = normalize3( center - posA );
				if( dot3F4( ab, ac ) > 0.95f || (invMassA == 0.f || invMassB == 0.f))
				{
					// Remove 10% of each body's spin about n.
					float angNA = dot3F4( n, angVelA );
					float angNB = dot3F4( n, angVelB );
					angVelA -= (angNA*0.1f)*n;
					angVelB -= (angNB*0.1f)*n;
				}
			}
		}
	}

	// Write back; bodies with zero inverse mass always get zero velocities.
	if (gBodies[aIdx].m_invMass)
	{
		gBodies[aIdx].m_linVel = linVelA;
		gBodies[aIdx].m_angVel = angVelA;
	} else
	{
		gBodies[aIdx].m_linVel = mymake_float4(0,0,0,0);
		gBodies[aIdx].m_angVel = mymake_float4(0,0,0,0);
	}
	if (gBodies[bIdx].m_invMass)
	{
		gBodies[bIdx].m_linVel = linVelB;
		gBodies[bIdx].m_angVel = angVelB;
	} else
	{
		gBodies[bIdx].m_linVel = mymake_float4(0,0,0,0);
		gBodies[bIdx].m_angVel = mymake_float4(0,0,0,0);
	}
}
// Debug record with four int and four float slots. In this file it is only
// mentioned in commented-out lines of BatchSolveKernelFriction.
typedef struct
{
int m_valInt0;
int m_valInt1;
int m_valInt2;
int m_valInt3;
float m_val0;
float m_val1;
float m_val2;
float m_val3;
} SolverDebugInfo;
// Solves the friction constraints of a set of grid cells, one work-group per cell.
//
// gBodies, gShapes : forwarded to solveFrictionConstraint
// gConstraints     : constraint array; cell c owns [gOffsets[c], gOffsets[c] + gN[c])
// gN, gOffsets     : per-cell constraint count and first index
// maxBatch         : number of batch indices (0 .. maxBatch-1) to step through
// bIdx             : (bIdx&1) is added to the cell x index, (bIdx>>1) to the cell y index
// nSplit           : cells per row of the grid; used as a divisor via nSplit/2, so it must be >= 2
//
// Each work-item walks its cell's constraints with stride WG_SIZE and only
// advances while the constraint's m_batchIdx equals the current batch. This
// assumes the constraints of a cell are ordered by batch index -- confirm
// against the host-side batching code.
__kernel
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
void BatchSolveKernelFriction(__global Body* gBodies,
	__global Shape* gShapes,
	__global Constraint4* gConstraints,
	__global int* gN,
	__global int* gOffsets,
	int maxBatch,
	int bIdx,
	int nSplit
	)
{
	// Batch currently being solved; owned by work-item 0, read by everyone.
	__local int ldsCurBatch;

	int lIdx = GET_LOCAL_IDX;
	int wgIdx = GET_GROUP_IDX;

	int xIdx = (wgIdx/(nSplit/2))*2 + (bIdx&1);
	int yIdx = (wgIdx%(nSplit/2))*2 + (bIdx>>1);
	int cellIdx = xIdx+yIdx*nSplit;

	// cellIdx depends only on the group id, so the whole work-group leaves
	// together and no work-item is left waiting at a barrier.
	if( gN[cellIdx] == 0 )
		return;

	const int start = gOffsets[cellIdx];
	const int end = start + gN[cellIdx];

	if( lIdx == 0 )
	{
		ldsCurBatch = 0;
	}
	GROUP_LDS_BARRIER;

	int idx = start + lIdx;
	while (ldsCurBatch < maxBatch)
	{
		for(; idx<end; )
		{
			if (gConstraints[idx].m_batchIdx == ldsCurBatch)
			{
				solveFrictionConstraint( gBodies, gShapes, &gConstraints[idx] );
				// Stride equals the enforced work-group size.
				idx += WG_SIZE;
			} else
			{
				break;
			}
		}
		// End of batch. Body velocities are exchanged between work-items through
		// global memory, so this barrier must fence global as well as local memory.
		barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
		if( lIdx == 0 )
		{
			ldsCurBatch++;
		}
		GROUP_LDS_BARRIER;
	}
}

View File

@@ -0,0 +1,510 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
static const char* solveFrictionCL= \
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"\n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"\n"
"\n"
"//#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
"\n"
"\n"
"#ifdef cl_ext_atomic_counters_32\n"
"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
"#else\n"
"#define counter32_t volatile global int*\n"
"#endif\n"
"\n"
"typedef unsigned int u32;\n"
"typedef unsigned short u16;\n"
"typedef unsigned char u8;\n"
"\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GET_NUM_GROUPS get_num_groups(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
"#define AtomInc(x) atom_inc(&(x))\n"
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
"#define AppendInc(x, out) out = atomic_inc(x)\n"
"#define AtomAdd(x, value) atom_add(&(x), value)\n"
"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
"\n"
"\n"
"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
"\n"
"#define mymake_float4 (float4)\n"
"//#define make_float2 (float2)\n"
"//#define make_uint4 (uint4)\n"
"//#define make_int4 (int4)\n"
"//#define make_uint2 (uint2)\n"
"//#define make_int2 (int2)\n"
"\n"
"\n"
"#define max2 max\n"
"#define min2 min\n"
"\n"
"\n"
"///////////////////////////////////////\n"
"// Vector\n"
"///////////////////////////////////////\n"
"\n"
"\n"
"\n"
"\n"
"__inline\n"
"float4 fastNormalize4(float4 v)\n"
"{\n"
" return fast_normalize(v);\n"
"}\n"
"\n"
"\n"
"\n"
"__inline\n"
"float4 cross3(float4 a, float4 b)\n"
"{\n"
" return cross(a,b);\n"
"}\n"
"\n"
"__inline\n"
"float dot3F4(float4 a, float4 b)\n"
"{\n"
" float4 a1 = mymake_float4(a.xyz,0.f);\n"
" float4 b1 = mymake_float4(b.xyz,0.f);\n"
" return dot(a1, b1);\n"
"}\n"
"\n"
"\n"
"\n"
"\n"
"__inline\n"
"float4 normalize3(const float4 a)\n"
"{\n"
" float4 n = mymake_float4(a.x, a.y, a.z, 0.f);\n"
" return fastNormalize4( n );\n"
"// float length = sqrtf(dot3F4(a, a));\n"
"// return 1.f/length * a;\n"
"}\n"
"\n"
"\n"
"\n"
"\n"
"///////////////////////////////////////\n"
"// Matrix3x3\n"
"///////////////////////////////////////\n"
"\n"
"typedef struct\n"
"{\n"
" float4 m_row[3];\n"
"}Matrix3x3;\n"
"\n"
"\n"
"\n"
"\n"
"\n"
"\n"
"__inline\n"
"float4 mtMul1(Matrix3x3 a, float4 b);\n"
"\n"
"__inline\n"
"float4 mtMul3(float4 a, Matrix3x3 b);\n"
"\n"
"\n"
"\n"
"\n"
"__inline\n"
"float4 mtMul1(Matrix3x3 a, float4 b)\n"
"{\n"
" float4 ans;\n"
" ans.x = dot3F4( a.m_row[0], b );\n"
" ans.y = dot3F4( a.m_row[1], b );\n"
" ans.z = dot3F4( a.m_row[2], b );\n"
" ans.w = 0.f;\n"
" return ans;\n"
"}\n"
"\n"
// Embedded OpenCL source: row vector * matrix over the xyz lanes.
// Fix: the result's w lane is now set to 0 (as mtMul1 does); it was previously
// returned uninitialized.
// NOTE: this header is generated; mirror the change in the .cl source.
"__inline\n"
"float4 mtMul3(float4 a, Matrix3x3 b)\n"
"{\n"
" float4 colx = mymake_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n"
" float4 coly = mymake_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
" float4 colz = mymake_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
"\n"
" float4 ans;\n"
" ans.x = dot3F4( a, colx );\n"
" ans.y = dot3F4( a, coly );\n"
" ans.z = dot3F4( a, colz );\n"
" ans.w = 0.f;\n"
" return ans;\n"
"}\n"
"\n"
"///////////////////////////////////////\n"
"// Quaternion\n"
"///////////////////////////////////////\n"
"\n"
"typedef float4 Quaternion;\n"
"\n"
"\n"
"\n"
"\n"
"\n"
"\n"
"\n"
"#define WG_SIZE 64\n"
"\n"
"typedef struct\n"
"{\n"
" float4 m_pos;\n"
" Quaternion m_quat;\n"
" float4 m_linVel;\n"
" float4 m_angVel;\n"
"\n"
" u32 m_shapeIdx;\n"
" float m_invMass;\n"
" float m_restituitionCoeff;\n"
" float m_frictionCoeff;\n"
"} Body;\n"
"\n"
"typedef struct\n"
"{\n"
" Matrix3x3 m_invInertia;\n"
" Matrix3x3 m_initInvInertia;\n"
"} Shape;\n"
"\n"
"typedef struct\n"
"{\n"
" float4 m_linear;\n"
" float4 m_worldPos[4];\n"
" float4 m_center; \n"
" float m_jacCoeffInv[4];\n"
" float m_b[4];\n"
" float m_appliedRambdaDt[4];\n"
"\n"
" float m_fJacCoeffInv[2]; \n"
" float m_fAppliedRambdaDt[2]; \n"
"\n"
" u32 m_bodyA;\n"
" u32 m_bodyB;\n"
"\n"
" int m_batchIdx;\n"
" u32 m_paddings[1];\n"
"} Constraint4;\n"
"\n"
"typedef struct\n"
"{\n"
" float4 m_worldPos[4];\n"
" float4 m_worldNormal;\n"
" u32 m_coeffs;\n"
" int m_batchIdx;\n"
"\n"
" int m_bodyAPtrAndSignBit;\n"
" int m_bodyBPtrAndSignBit;\n"
"} Contact4;\n"
"\n"
"typedef struct\n"
"{\n"
" int m_nConstraints;\n"
" int m_start;\n"
" int m_batchIdx;\n"
" int m_nSplit;\n"
"// int m_paddings[1];\n"
"} ConstBuffer;\n"
"\n"
"typedef struct\n"
"{\n"
" int m_solveFriction;\n"
" int m_maxBatch; // long batch really kills the performance\n"
" int m_batchIdx;\n"
" int m_nSplit;\n"
"// int m_paddings[1];\n"
"} ConstBufferBatchSolve;\n"
"\n"
"void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1);\n"
"\n"
"void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1)\n"
"{\n"
" *linear = mymake_float4(-n.xyz,0.f);\n"
" *angular0 = -cross3(r0, n);\n"
" *angular1 = cross3(r1, n);\n"
"}\n"
"\n"
"float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 );\n"
"\n"
"float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 )\n"
"{\n"
" return dot3F4(l0, linVel0) + dot3F4(a0, angVel0) + dot3F4(l1, linVel1) + dot3F4(a1, angVel1);\n"
"}\n"
"\n"
"\n"
"float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n"
" float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1);\n"
"\n"
"float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n"
" float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1)\n"
"{\n"
" // linear0,1 are normlized\n"
" float jmj0 = invMass0;//dot3F4(linear0, linear0)*invMass0;\n"
" float jmj1 = dot3F4(mtMul3(angular0,*invInertia0), angular0);\n"
" float jmj2 = invMass1;//dot3F4(linear1, linear1)*invMass1;\n"
" float jmj3 = dot3F4(mtMul3(angular1,*invInertia1), angular1);\n"
" return -1.f/(jmj0+jmj1+jmj2+jmj3);\n"
"}\n"
"void b3PlaneSpace1 (const float4* n, float4* p, float4* q);\n"
" void b3PlaneSpace1 (const float4* n, float4* p, float4* q)\n"
"{\n"
" if (fabs(n[0].z) > 0.70710678f) {\n"
" // choose p in y-z plane\n"
" float a = n[0].y*n[0].y + n[0].z*n[0].z;\n"
" float k = 1.f/sqrt(a);\n"
" p[0].x = 0;\n"
" p[0].y = -n[0].z*k;\n"
" p[0].z = n[0].y*k;\n"
" // set q = n x p\n"
" q[0].x = a*k;\n"
" q[0].y = -n[0].x*p[0].z;\n"
" q[0].z = n[0].x*p[0].y;\n"
" }\n"
" else {\n"
" // choose p in x-y plane\n"
" float a = n[0].x*n[0].x + n[0].y*n[0].y;\n"
" float k = 1.f/sqrt(a);\n"
" p[0].x = -n[0].y*k;\n"
" p[0].y = n[0].x*k;\n"
" p[0].z = 0;\n"
" // set q = n x p\n"
" q[0].x = -n[0].z*p[0].y;\n"
" q[0].y = n[0].z*p[0].x;\n"
" q[0].z = a*k;\n"
" }\n"
"}\n"
"\n"
"\n"
// Embedded OpenCL source: one friction iteration for a single constraint.
// Reads both bodies from gBodies / gShapes, applies a clamped impulse along two
// tangent directions, applies angular damping, then writes the velocities back
// (zero velocities for bodies whose m_invMass is zero).
// Fix: the early-out tested m_fJacCoeffInv[0] twice; the second test is now index 1,
// so the constraint is skipped only when both tangent coefficients are zero.
// NOTE: this header is generated; mirror the change in the .cl source.
"void solveFrictionConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs);\n"
"void solveFrictionConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs)\n"
"{\n"
" float frictionCoeff = ldsCs[0].m_linear.w;\n"
" int aIdx = ldsCs[0].m_bodyA;\n"
" int bIdx = ldsCs[0].m_bodyB;\n"
"\n"
"\n"
" float4 posA = gBodies[aIdx].m_pos;\n"
" float4 linVelA = gBodies[aIdx].m_linVel;\n"
" float4 angVelA = gBodies[aIdx].m_angVel;\n"
" float invMassA = gBodies[aIdx].m_invMass;\n"
" Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n"
"\n"
" float4 posB = gBodies[bIdx].m_pos;\n"
" float4 linVelB = gBodies[bIdx].m_linVel;\n"
" float4 angVelB = gBodies[bIdx].m_angVel;\n"
" float invMassB = gBodies[bIdx].m_invMass;\n"
" Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n"
" \n"
"\n"
" {\n"
" float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};\n"
" float minRambdaDt[4] = {0.f,0.f,0.f,0.f};\n"
"\n"
" float sum = 0;\n"
" for(int j=0; j<4; j++)\n"
" {\n"
" sum +=ldsCs[0].m_appliedRambdaDt[j];\n"
" }\n"
// NOTE(review): the coefficient read from m_linear.w above is overridden here.
" frictionCoeff = 0.7f;\n"
" for(int j=0; j<4; j++)\n"
" {\n"
" maxRambdaDt[j] = frictionCoeff*sum;\n"
" minRambdaDt[j] = -maxRambdaDt[j];\n"
" }\n"
"\n"
" \n"
"// solveFriction( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA,\n"
"// posB, &linVelB, &angVelB, invMassB, invInertiaB, maxRambdaDt, minRambdaDt );\n"
" \n"
" \n"
" {\n"
" \n"
" __global Constraint4* cs = ldsCs;\n"
" \n"
// Fixed: second operand now checks index 1 instead of repeating index 0.
" if( cs->m_fJacCoeffInv[0] == 0 && cs->m_fJacCoeffInv[1] == 0 ) return;\n"
" const float4 center = cs->m_center;\n"
" \n"
" float4 n = -cs->m_linear;\n"
" \n"
" float4 tangent[2];\n"
" b3PlaneSpace1(&n,&tangent[0],&tangent[1]);\n"
" float4 angular0, angular1, linear;\n"
" float4 r0 = center - posA;\n"
" float4 r1 = center - posB;\n"
" for(int i=0; i<2; i++)\n"
" {\n"
" setLinearAndAngular( tangent[i], r0, r1, &linear, &angular0, &angular1 );\n"
" float rambdaDt = calcRelVel(linear, -linear, angular0, angular1,\n"
" linVelA, angVelA, linVelB, angVelB );\n"
" rambdaDt *= cs->m_fJacCoeffInv[i];\n"
" \n"
" {\n"
" float prevSum = cs->m_fAppliedRambdaDt[i];\n"
" float updated = prevSum;\n"
" updated += rambdaDt;\n"
" updated = max2( updated, minRambdaDt[i] );\n"
" updated = min2( updated, maxRambdaDt[i] );\n"
" rambdaDt = updated - prevSum;\n"
" cs->m_fAppliedRambdaDt[i] = updated;\n"
" }\n"
" \n"
" float4 linImp0 = invMassA*linear*rambdaDt;\n"
" float4 linImp1 = invMassB*(-linear)*rambdaDt;\n"
" float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt;\n"
" float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt;\n"
" \n"
" linVelA += linImp0;\n"
" angVelA += angImp0;\n"
" linVelB += linImp1;\n"
" angVelB += angImp1;\n"
" }\n"
" { // angular damping for point constraint\n"
" float4 ab = normalize3( posB - posA );\n"
" float4 ac = normalize3( center - posA );\n"
" if( dot3F4( ab, ac ) > 0.95f || (invMassA == 0.f || invMassB == 0.f))\n"
" {\n"
" float angNA = dot3F4( n, angVelA );\n"
" float angNB = dot3F4( n, angVelB );\n"
" \n"
" angVelA -= (angNA*0.1f)*n;\n"
" angVelB -= (angNB*0.1f)*n;\n"
" }\n"
" }\n"
" }\n"
"\n"
" \n"
" \n"
" }\n"
"\n"
" if (gBodies[aIdx].m_invMass)\n"
" {\n"
" gBodies[aIdx].m_linVel = linVelA;\n"
" gBodies[aIdx].m_angVel = angVelA;\n"
" } else\n"
" {\n"
" gBodies[aIdx].m_linVel = mymake_float4(0,0,0,0);\n"
" gBodies[aIdx].m_angVel = mymake_float4(0,0,0,0);\n"
" }\n"
" if (gBodies[bIdx].m_invMass)\n"
" {\n"
" gBodies[bIdx].m_linVel = linVelB;\n"
" gBodies[bIdx].m_angVel = angVelB;\n"
" } else\n"
" {\n"
" gBodies[bIdx].m_linVel = mymake_float4(0,0,0,0);\n"
" gBodies[bIdx].m_angVel = mymake_float4(0,0,0,0);\n"
" }\n"
" \n"
"\n"
"}\n"
"\n"
"typedef struct \n"
"{\n"
" int m_valInt0;\n"
" int m_valInt1;\n"
" int m_valInt2;\n"
" int m_valInt3;\n"
"\n"
" float m_val0;\n"
" float m_val1;\n"
" float m_val2;\n"
" float m_val3;\n"
"} SolverDebugInfo;\n"
"\n"
"\n"
"\n"
"\n"
// Embedded OpenCL source: batched friction solver kernel, one work-group per grid cell.
// Cell x gets (bIdx&1) added, cell y gets (bIdx>>1); the constraints of cell c are
// gConstraints[gOffsets[c] .. gOffsets[c]+gN[c]). nSplit is a divisor via nSplit/2 (must be >= 2).
// Fix: the barrier that ends a batch now also fences global memory, because
// solveFrictionConstraint() reads and writes body velocities in the __global gBodies buffer
// (assumes consecutive batches may share bodies -- confirm against the batching code).
// NOTE: this header is generated; mirror the change in the .cl source.
"__kernel\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"void BatchSolveKernelFriction(__global Body* gBodies,\n"
" __global Shape* gShapes,\n"
" __global Constraint4* gConstraints,\n"
" __global int* gN,\n"
" __global int* gOffsets,\n"
" int maxBatch,\n"
" int bIdx,\n"
" int nSplit\n"
" )\n"
"{\n"
" //__local int ldsBatchIdx[WG_SIZE+1];\n"
" __local int ldsCurBatch;\n"
" __local int ldsNextBatch;\n"
" __local int ldsStart;\n"
"\n"
" int lIdx = GET_LOCAL_IDX;\n"
" int wgIdx = GET_GROUP_IDX;\n"
"\n"
"// int gIdx = GET_GLOBAL_IDX;\n"
"// debugInfo[gIdx].m_valInt0 = gIdx;\n"
" //debugInfo[gIdx].m_valInt1 = GET_GROUP_SIZE;\n"
"\n"
"\n"
" int xIdx = (wgIdx/(nSplit/2))*2 + (bIdx&1);\n"
" int yIdx = (wgIdx%(nSplit/2))*2 + (bIdx>>1);\n"
" int cellIdx = xIdx+yIdx*nSplit;\n"
" \n"
" if( gN[cellIdx] == 0 ) \n"
" return;\n"
"\n"
" const int start = gOffsets[cellIdx];\n"
" const int end = start + gN[cellIdx];\n"
"\n"
" \n"
" if( lIdx == 0 )\n"
" {\n"
" ldsCurBatch = 0;\n"
" ldsNextBatch = 0;\n"
" ldsStart = start;\n"
" }\n"
"\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" int idx=ldsStart+lIdx;\n"
" while (ldsCurBatch < maxBatch)\n"
" {\n"
" for(; idx<end; )\n"
" {\n"
" if (gConstraints[idx].m_batchIdx == ldsCurBatch)\n"
" {\n"
"\n"
" solveFrictionConstraint( gBodies, gShapes, &gConstraints[idx] );\n"
"\n"
" idx+=64;\n"
" } else\n"
" {\n"
" break;\n"
" }\n"
" }\n"
// End of batch: fence local AND global memory before the next batch starts.
" barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);\n"
" if( lIdx == 0 )\n"
" {\n"
" ldsCurBatch++;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" }\n"
" \n"
" \n"
"}\n"
"\n"
;

View File

@@ -0,0 +1,660 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
// ---- Extension setup -------------------------------------------------------
// NOTE(review): cl_amd_printf is enabled unconditionally here, while the same
// line is commented out in solveFriction.cl -- confirm it is wanted on every target.
#pragma OPENCL EXTENSION cl_amd_printf : enable
// 32-bit atomics on local and global memory (used by the Atom* macros below).
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable
// When the atomic-counter extension macro is not defined, counter32_t falls back to
// a plain volatile global int pointer.
#ifdef cl_ext_atomic_counters_32
#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable
#else
#define counter32_t volatile global int*
#endif
// ---- Short integer aliases -------------------------------------------------
typedef unsigned int u32;
typedef unsigned short u16;
typedef unsigned char u8;
// ---- Work-item / work-group queries, all along dimension 0 -----------------
#define GET_GROUP_IDX get_group_id(0)
#define GET_LOCAL_IDX get_local_id(0)
#define GET_GLOBAL_IDX get_global_id(0)
#define GET_GROUP_SIZE get_local_size(0)
#define GET_NUM_GROUPS get_num_groups(0)
// Work-group barrier / fence that only cover local memory (CLK_LOCAL_MEM_FENCE).
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)
// ---- Atomic wrappers ---------------------------------------------------------
// The Atom* macros take an lvalue and pass its address; AppendInc takes a pointer
// and passes it through unchanged.
#define AtomInc(x) atom_inc(&(x))
#define AtomInc1(x, out) out = atom_inc(&(x))
#define AppendInc(x, out) out = atomic_inc(x)
#define AtomAdd(x, value) atom_add(&(x), value)
#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )
#define AtomXhg(x, value) atom_xchg ( &(x), value )
#define SELECT_UINT4( b, a, condition ) select( b,a,condition )
// make_<type>N(...) expands to the vector literal (<type>N)(...).
#define make_float4 (float4)
#define make_float2 (float2)
#define make_uint4 (uint4)
#define make_int4 (int4)
#define make_uint2 (uint2)
#define make_int2 (int2)
// Aliases for the built-in max / min.
#define max2 max
#define min2 min
///////////////////////////////////////
// Vector
///////////////////////////////////////
// Scalar division through the native_divide() built-in.
__inline
float fastDiv(float numerator, float denominator)
{
	const float quotient = native_divide(numerator, denominator);
	return quotient;
}
// Lane-wise float4 division through the native_divide() built-in.
__inline
float4 fastDiv4(float4 numerator, float4 denominator)
{
	const float4 quotient = native_divide(numerator, denominator);
	return quotient;
}
// Square root through the native_sqrt() built-in.
__inline
float fastSqrtf(float f2)
{
	const float root = native_sqrt(f2);
	return root;
}
// Reciprocal square root through the native_rsqrt() built-in.
__inline
float fastRSqrt(float f2)
{
	const float invRoot = native_rsqrt(f2);
	return invRoot;
}
// Length of all four lanes of v through the fast_length() built-in.
__inline
float fastLength4(float4 v)
{
	const float len = fast_length(v);
	return len;
}
// Normalizes all four lanes of v with the fast_normalize() built-in.
__inline
float4 fastNormalize4(float4 v)
{
	const float4 unitV = fast_normalize(v);
	return unitV;
}
// File-local sqrtf() that forwards to the native_sqrt() built-in.
__inline
float sqrtf(float a)
{
	const float root = native_sqrt(a);
	return root;
}
// Thin wrapper around the cross() built-in for float4 operands.
__inline
float4 cross3(float4 a, float4 b)
{
	const float4 aCrossB = cross(a, b);
	return aCrossB;
}
// Dot product of the xyz lanes of a and b; the w lanes are replaced by zero first.
__inline
float dot3F4(float4 a, float4 b)
{
	const float4 lhs = make_float4(a.xyz, 0.f);
	const float4 rhs = make_float4(b.xyz, 0.f);
	return dot(lhs, rhs);
}
// Length of the xyz part of a.
__inline
float length3(const float4 a)
{
	const float lenSq = dot3F4(a, a);
	return sqrtf(lenSq);
}
// Full four-lane dot product through the dot() built-in.
__inline
float dot4(const float4 a, const float4 b)
{
	const float sum = dot( a, b );
	return sum;
}
// for height
// Evaluates dot3(point, eqn) + eqn.w, i.e. the point is treated as having w = 1.
__inline
float dot3w1(const float4 point, const float4 eqn)
{
	const float xyzPart = dot3F4(point, eqn);
	return xyzPart + eqn.w;
}
// Normalizes the xyz part of a; w is forced to zero before normalizing so it
// cannot contribute to the length.
__inline
float4 normalize3(const float4 a)
{
	const float4 xyzOnly = make_float4(a.x, a.y, a.z, 0.f);
	return fastNormalize4( xyzOnly );
}
// Scales a by the reciprocal of its four-lane length (computed via sqrtf/dot4).
__inline
float4 normalize4(const float4 a)
{
	const float len = sqrtf(dot4(a, a));
	const float invLen = 1.f/len;
	return invLen * a;
}
// Builds (nx, ny, nz, d) from three points: the xyz part is the normalized
// cross product (b-a) x (c-a), and w is -dot3(normal, a).
__inline
float4 createEquation(const float4 a, const float4 b, const float4 c)
{
	const float4 edgeAB = b-a;
	const float4 edgeAC = c-a;
	float4 eqn = normalize3( cross3(edgeAB, edgeAC) );
	eqn.w = -dot3F4(eqn,a);
	return eqn;
}
///////////////////////////////////////
// Matrix3x3
///////////////////////////////////////
// 3x3 matrix stored as three float4 rows. The mt* helpers below write 0 into
// the w lane of every row they construct.
typedef struct
{
float4 m_row[3];
}Matrix3x3;
__inline Matrix3x3 mtZero();
__inline Matrix3x3 mtIdentity();
__inline Matrix3x3 mtTranspose(Matrix3x3 m);
__inline Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b);
__inline float4 mtMul1(Matrix3x3 a, float4 b);
__inline float4 mtMul3(float4 a, Matrix3x3 b);

// Returns a matrix whose three rows are all-zero float4 values.
__inline
Matrix3x3 mtZero()
{
	Matrix3x3 zero;
	for(int r=0; r<3; r++)
	{
		zero.m_row[r] = (float4)(0.f);
	}
	return zero;
}
__inline
Matrix3x3 mtIdentity()
{
Matrix3x3 m;
m.m_row[0] = (float4)(1,0,0,0);
m.m_row[1] = (float4)(0,1,0,0);
m.m_row[2] = (float4)(0,0,1,0);
return m;
}
// Returns the transpose of the 3x3 part of m; the w lane of every row is 0.
__inline
Matrix3x3 mtTranspose(Matrix3x3 m)
{
	const float4 r0 = m.m_row[0];
	const float4 r1 = m.m_row[1];
	const float4 r2 = m.m_row[2];

	Matrix3x3 out;
	out.m_row[0] = (float4)(r0.x, r1.x, r2.x, 0.f);
	out.m_row[1] = (float4)(r0.y, r1.y, r2.y, 0.f);
	out.m_row[2] = (float4)(r0.z, r1.z, r2.z, 0.f);
	return out;
}
// 3x3 matrix product a * b: entry (i, j) is dot3(row i of a, column j of b).
// The w lane of every result row is 0.
__inline
Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b)
{
	// Rows of bT are the columns of b.
	const Matrix3x3 bT = mtTranspose( b );

	// why this doesn't run when 0ing in the for{}
	a.m_row[0].w = 0.f;
	a.m_row[1].w = 0.f;
	a.m_row[2].w = 0.f;

	Matrix3x3 product;
	for(int i=0; i<3; i++)
	{
		const float4 rowA = a.m_row[i];
		product.m_row[i] = (float4)( dot3F4(rowA, bT.m_row[0]),
			dot3F4(rowA, bT.m_row[1]),
			dot3F4(rowA, bT.m_row[2]),
			0.f );
	}
	return product;
}
// Matrix * column vector: lane i of the result is dot3(row i of a, b); w is 0.
__inline
float4 mtMul1(Matrix3x3 a, float4 b)
{
	const float rx = dot3F4( a.m_row[0], b );
	const float ry = dot3F4( a.m_row[1], b );
	const float rz = dot3F4( a.m_row[2], b );
	return make_float4(rx, ry, rz, 0.f);
}
// Row vector * matrix: lane j of the result is dot3(a, column j of b).
// Only the xyz lanes of a and of the rows of b are read.
// The w lane of the result is set to 0, matching mtMul1; the previous version
// returned it uninitialized.
__inline
float4 mtMul3(float4 a, Matrix3x3 b)
{
	float4 colx = make_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);
	float4 coly = make_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);
	float4 colz = make_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);
	float4 ans;
	ans.x = dot3F4( a, colx );
	ans.y = dot3F4( a, coly );
	ans.z = dot3F4( a, colz );
	ans.w = 0.f;
	return ans;
}
///////////////////////////////////////
// Quaternion
///////////////////////////////////////
// Quaternions are stored as float4 with the vector part in xyz and the scalar in w.
typedef float4 Quaternion;

__inline Quaternion qtMul(Quaternion a, Quaternion b);
__inline Quaternion qtNormalize(Quaternion in);
__inline float4 qtRotate(Quaternion q, float4 vec);
__inline Quaternion qtInvert(Quaternion q);
__inline Matrix3x3 qtGetRotationMatrix(Quaternion q);

// Quaternion product a * b:
//   xyz = a.xyz x b.xyz + a.w*b.xyz + b.w*a.xyz
//   w   = a.w*b.w - dot3(a, b)
__inline
Quaternion qtMul(Quaternion a, Quaternion b)
{
	Quaternion product = cross3( a, b );
	product = product + (a.w*b+b.w*a);
	// The w lane written above is a by-product of the float4 math; replace it.
	product.w = a.w*b.w - dot3F4(a, b);
	return product;
}
// Normalizes all four lanes of the quaternion via fastNormalize4.
__inline
Quaternion qtNormalize(Quaternion in)
{
	const Quaternion unitQ = fastNormalize4(in);
	return unitQ;
}
// Rotates vec by q, computed as q * (vec.xyz, 0) * qtInvert(q).
__inline
float4 qtRotate(Quaternion q, float4 vec)
{
	const Quaternion conj = qtInvert( q );

	// Treat the vector as a quaternion with zero scalar part.
	float4 pureVec = vec;
	pureVec.w = 0.f;

	const Quaternion qTimesVec = qtMul(q, pureVec);
	return qtMul(qTimesVec, conj);
}
// Returns the conjugate of q: xyz negated, w unchanged.
__inline
Quaternion qtInvert(Quaternion q)
{
	Quaternion conj = q;
	conj.xyz = -q.xyz;
	return conj;
}
// Rotates vec by the conjugate of q.
__inline
float4 qtInvRotate(const Quaternion q, float4 vec)
{
	const Quaternion conj = qtInvert( q );
	return qtRotate( conj, vec );
}
// Expands a quaternion into a 3x3 rotation matrix (row form); the w lane of
// every row is 0.
__inline
Matrix3x3 qtGetRotationMatrix(Quaternion quat)
{
	// Squared vector components, shared by the diagonal entries.
	const float xx = quat.x*quat.x;
	const float yy = quat.y*quat.y;
	const float zz = quat.z*quat.z;

	Matrix3x3 out;
	out.m_row[0] = (float4)( 1-2*yy-2*zz,
		2*quat.x*quat.y-2*quat.w*quat.z,
		2*quat.x*quat.z+2*quat.w*quat.y,
		0.f );
	out.m_row[1] = (float4)( 2*quat.x*quat.y+2*quat.w*quat.z,
		1-2*xx-2*zz,
		2*quat.y*quat.z-2*quat.w*quat.x,
		0.f );
	out.m_row[2] = (float4)( 2*quat.x*quat.z-2*quat.w*quat.y,
		2*quat.y*quat.z+2*quat.w*quat.x,
		1-2*xx-2*yy,
		0.f );
	return out;
}
#define WG_SIZE 64
// Per-body record stored in the gBodies buffer.
typedef struct
{
// Position; ContactToConstraintKernel passes it to setConstraint4, which
// subtracts it from contact points to form lever arms.
float4 m_pos;
// Orientation quaternion; not read by the code in this part of the file.
Quaternion m_quat;
// Linear and angular velocity, used for the relative-velocity term.
float4 m_linVel;
float4 m_angVel;
// Presumably an index into a shape table.
// NOTE(review): ContactToConstraintKernel indexes gShapes by body index, not by this field - confirm.
u32 m_shapeIdx;
// Inverse mass, passed on to calcJacCoeff.
float m_invMass;
// Not read here: setConstraint4 hard-codes restitution 0 and friction 0.7f.
float m_restituitionCoeff;
float m_frictionCoeff;
} Body;
// Inertia record stored in the gShapes buffer.
typedef struct
{
// Inverse inertia matrix read by ContactToConstraintKernel and used in calcJacCoeff.
Matrix3x3 m_invInertia;
// Presumably the initial (unrotated) inverse inertia; not read here - TODO confirm.
Matrix3x3 m_initInvInertia;
} Shape;
// Solver constraint produced from one Contact4 by setConstraint4:
// up to four contact points plus two friction directions.
typedef struct
{
// xyz: negated contact normal. w: friction coefficient (setConstraint4 stores 0.7f).
float4 m_linear;
// Contact points copied from the source contact; unused slots are zeroed.
float4 m_worldPos[4];
// Average of the used contact points, computed in setConstraint4.
float4 m_center;
// Per-point calcJacCoeff result; setConstraint4 stores 0 for unused slots.
float m_jacCoeffInv[4];
// Per-point bias term computed in setConstraint4.
float m_b[4];
// Per-point accumulator reset to 0 by setConstraint4 ("Rambda" is presumably "lambda").
float m_appliedRambdaDt[4];
// calcJacCoeff result and accumulator for the two tangent (friction) directions.
float m_fJacCoeffInv[2];
float m_fAppliedRambdaDt[2];
// Body indices: abs() of the contact's m_bodyAPtrAndSignBit / m_bodyBPtrAndSignBit.
u32 m_bodyA;
u32 m_bodyB;
// Copied from the source contact by ContactToConstraintKernel.
int m_batchIdx;
// Padding.
u32 m_paddings[1];
} Constraint4;
// Contact record between two bodies holding up to four contact points.
typedef struct
{
// Contact points; setConstraint4 adds each point's w to the bias term
// (a commented-out line there names it "penetration").
float4 m_worldPos[4];
// xyz: contact normal. w: number of valid points (compared against loop indices, tagged "npoints").
float4 m_worldNormal;
// Not read by the code in this part of the file; presumably packed coefficients - TODO confirm.
u32 m_coeffs;
// Batch index, copied into the generated Constraint4.
int m_batchIdx;
// Body indices; readers take abs(), so the sign carries an extra flag.
int m_bodyAPtrAndSignBit;
int m_bodyBPtrAndSignBit;
} Contact4;
// Parameter block; not referenced by the code in this part of the file.
// NOTE(review): field meanings below are inferred from names only - confirm against the host code.
typedef struct
{
int m_nConstraints;
int m_start;
int m_batchIdx;
int m_nSplit;
// int m_paddings[1];
} ConstBuffer;
// Parameter block; not referenced by the code in this part of the file.
// NOTE(review): presumably consumed by a batch-solve kernel elsewhere - confirm.
typedef struct
{
int m_solveFriction;
int m_maxBatch; // long batch really kills the performance
int m_batchIdx;
int m_nSplit;
// int m_paddings[1];
} ConstBufferBatchSolve;
// Builds the Jacobian directions for a constraint along n.
//   linear   = (-n.xyz, 0)
//   angular0 = -(r0 x n)   for the first body
//   angular1 =   r1 x n    for the second body
// n.w is never used: cross3 and the .xyz swizzle both drop it.
void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1)
{
	linear[0] = make_float4(-n.xyz,0.f);
	angular0[0] = -cross3(r0, n);
	angular1[0] = cross3(r1, n);
}
// Projects both bodies' velocities onto the constraint directions and sums them:
// l0.linVel0 + a0.angVel0 + l1.linVel1 + a1.angVel1 (3-component dot products).
float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 )
{
	// Accumulated in the same left-to-right order as the one-line sum.
	float relVel = dot3F4(l0, linVel0);
	relVel += dot3F4(a0, angVel0);
	relVel += dot3F4(l1, linVel1);
	relVel += dot3F4(a1, angVel1);
	return relVel;
}
// Returns -1 / (invMass0 + a0.(a0*I0) + invMass1 + a1.(a1*I1)), where I0/I1 are
// the inverse inertia matrices and a0/a1 the angular directions.
// linear0 and linear1 are unused: they are taken to be normalised, so their
// contribution reduces to the inverse masses.
// There is no guard against a zero denominator.
float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,
	float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1)
{
	const float angTerm0 = dot3F4(mtMul3(angular0,*invInertia0), angular0);
	const float angTerm1 = dot3F4(mtMul3(angular1,*invInertia1), angular1);
	// Same summation order as before: mass0, angular0, mass1, angular1.
	const float denom = invMass0+angTerm0+invMass1+angTerm1;
	return -1.f/denom;
}
// Generic debug record (four ints, four floats); not referenced by the code in this part of the file.
typedef struct
{
int m_valInt0;
int m_valInt1;
int m_valInt2;
int m_valInt3;
float m_val0;
float m_val1;
float m_val2;
float m_val3;
} SolverDebugInfo;
// Parameter block; not referenced by the code in this part of the file.
// NOTE(review): presumably the arguments of a sort-data kernel - confirm.
typedef struct
{
int m_nContacts;
int m_staticIdx;
float m_scale;
int m_nSplit;
} ConstBufferSSD;
// Builds two tangent vectors *p and *q for the direction n.
// Only the x, y and z lanes of *p and *q are written; their w lanes are left
// as they were. n is expected to be normalised (0.70710678 is 1/sqrt(2)).
void btPlaneSpace1 (float4 n, float4* p, float4* q);
void btPlaneSpace1 (float4 n, float4* p, float4* q)
{
	if (fabs(n.z) > 0.70710678f)
	{
		// n is mostly along z: pick p in the y-z plane.
		const float lenSq = n.y*n.y + n.z*n.z;
		const float invLen = 1.f/sqrt(lenSq);
		p->x = 0;
		p->y = -n.z*invLen;
		p->z = n.y*invLen;
		// q = n x p
		q->x = lenSq*invLen;
		q->y = -n.x*p->z;
		q->z = n.x*p->y;
		return;
	}

	// Otherwise pick p in the x-y plane.
	const float lenSq = n.x*n.x + n.y*n.y;
	const float invLen = 1.f/sqrt(lenSq);
	p->x = -n.y*invLen;
	p->y = n.x*invLen;
	p->z = 0;
	// q = n x p
	q->x = -n.z*p->y;
	q->y = n.z*p->x;
	q->z = lenSq*invLen;
}
// Converts one contact (up to four points) into a solver constraint.
//
//   posA/linVelA/angVelA/invMassA/invInertiaA : state of the first body
//   posB/...                                  : state of the second body
//   src   : source contact; m_worldNormal.w holds the number of valid points and
//           each m_worldPos[i].w the penetration term of that point
//   dt    : time step; the position-correction bias is scaled by 1/dt
//   positionDrift, positionConstraintCoeff : offset and gain of that bias
//   dstC  : output constraint
//
// Every field of *dstC except m_batchIdx is given a defined value, so the caller
// can copy the whole struct to global memory. m_batchIdx is left for the caller.
void setConstraint4( const float4 posA, const float4 linVelA, const float4 angVelA, float invMassA, const Matrix3x3 invInertiaA,
	const float4 posB, const float4 linVelB, const float4 angVelB, float invMassB, const Matrix3x3 invInertiaB,
	__global Contact4* src, float dt, float positionDrift, float positionConstraintCoeff,
	Constraint4* dstC )
{
	dstC->m_bodyA = abs(src->m_bodyAPtrAndSignBit);
	dstC->m_bodyB = abs(src->m_bodyBPtrAndSignBit);

	float dtInv = 1.f/dt;

	// Start from a fully defined state. Previously m_b of unused points, and
	// m_center / m_fAppliedRambdaDt of contacts without points, were never
	// written and leaked uninitialised private memory into the output.
	for(int ic=0; ic<4; ic++)
	{
		dstC->m_jacCoeffInv[ic] = 0.f;
		dstC->m_b[ic] = 0.f;
		dstC->m_appliedRambdaDt[ic] = 0.f;
	}
	dstC->m_fJacCoeffInv[0] = dstC->m_fJacCoeffInv[1] = 0.f;
	dstC->m_fAppliedRambdaDt[0] = dstC->m_fAppliedRambdaDt[1] = 0.f;
	dstC->m_center = make_float4(0.f);
	dstC->m_paddings[0] = 0;

	dstC->m_linear = -src->m_worldNormal;
	dstC->m_linear.w = 0.7f ;//src->getFrictionCoeff() );

	// Normal constraint rows, one per valid contact point.
	for(int ic=0; ic<4; ic++)
	{
		if( ic >= src->m_worldNormal.w )//npoints
		{
			// Unused slot: m_jacCoeffInv[ic] stays 0.
			continue;
		}

		// Lever arms from each body origin to the contact point.
		float4 r0 = src->m_worldPos[ic] - posA;
		float4 r1 = src->m_worldPos[ic] - posB;

		float4 linear, angular0, angular1;
		setLinearAndAngular(src->m_worldNormal, r0, r1, &linear, &angular0, &angular1);

		dstC->m_jacCoeffInv[ic] = calcJacCoeff(linear, -linear, angular0, angular1,
			invMassA, &invInertiaA, invMassB, &invInertiaB );

		float relVelN = calcRelVel(linear, -linear, angular0, angular1,
			linVelA, angVelA, linVelB, angVelB);

		// Restitution is currently disabled, so this term contributes e*relVelN with e == 0.
		float e = 0.f;//src->getRestituitionCoeff();

		dstC->m_b[ic] = e*relVelN;
		// src->m_worldPos[ic].w is the penetration term of this point.
		dstC->m_b[ic] += (src->m_worldPos[ic].w + positionDrift)*positionConstraintCoeff*dtInv;
	}

	if( src->m_worldNormal.w > 0 )//npoints
	{	// prepare friction
		// Friction acts at the average of the valid contact points.
		float4 center = make_float4(0.f);
		for(int i=0; i<src->m_worldNormal.w; i++)
			center += src->m_worldPos[i];
		center /= (float)src->m_worldNormal.w;

		float4 tangent[2];
		btPlaneSpace1(src->m_worldNormal,&tangent[0],&tangent[1]);

		float4 r[2];
		r[0] = center - posA;
		r[1] = center - posB;

		for(int i=0; i<2; i++)
		{
			float4 linear, angular0, angular1;
			setLinearAndAngular(tangent[i], r[0], r[1], &linear, &angular0, &angular1);

			dstC->m_fJacCoeffInv[i] = calcJacCoeff(linear, -linear, angular0, angular1,
				invMassA, &invInertiaA, invMassB, &invInertiaB );
		}
		dstC->m_center = center;
	}

	// Copy the valid contact points and zero the remaining slots.
	for(int i=0; i<4; i++)
	{
		if( i<src->m_worldNormal.w )
		{
			dstC->m_worldPos[i] = src->m_worldPos[i];
		}
		else
		{
			dstC->m_worldPos[i] = make_float4(0.f);
		}
	}
}
// Fields mirror the scalar arguments of ContactToConstraintKernel
// (nContacts, dt, positionDrift, positionConstraintCoeff). The kernel takes
// them as separate arguments and does not reference this struct.
typedef struct
{
int m_nContacts;
float m_dt;
float m_positionDrift;
float m_positionConstraintCoeff;
} ConstBufferCTC;
// One work-item per contact: reads contact gIdx and its two bodies, builds the
// matching Constraint4 with setConstraint4 and writes it to gConstraintOut[gIdx].
//   nContacts                                  : number of valid entries in gContact
//   dt, positionDrift, positionConstraintCoeff : forwarded to setConstraint4
__kernel
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
void ContactToConstraintKernel(__global Contact4* gContact, __global Body* gBodies, __global Shape* gShapes, __global Constraint4* gConstraintOut,
int nContacts,
float dt,
float positionDrift,
float positionConstraintCoeff
)
{
	const int gIdx = GET_GLOBAL_IDX;

	// Work-items past the end of the contact list have nothing to do.
	if( gIdx >= nContacts )
		return;

	__global Contact4* contact = &gContact[gIdx];

	// The stored body references are signed; the index is their absolute value.
	const int aIdx = abs(contact->m_bodyAPtrAndSignBit);
	const int bIdx = abs(contact->m_bodyBPtrAndSignBit);

	__global Body* bodyA = &gBodies[aIdx];
	__global Body* bodyB = &gBodies[bIdx];

	// gShapes is indexed with the body index here, not with Body::m_shapeIdx.
	Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;
	Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;

	Constraint4 cs;
	setConstraint4( bodyA->m_pos, bodyA->m_linVel, bodyA->m_angVel, bodyA->m_invMass, invInertiaA,
		bodyB->m_pos, bodyB->m_linVel, bodyB->m_angVel, bodyB->m_invMass, invInertiaB,
		contact, dt, positionDrift, positionConstraintCoeff,
		&cs );

	// setConstraint4 does not set the batch index; carry it over from the contact.
	cs.m_batchIdx = contact->m_batchIdx;

	gConstraintOut[gIdx] = cs;
}

View File

@@ -0,0 +1,664 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
static const char* solverSetupCL= \
"\n"
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"\n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"\n"
"\n"
"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
"\n"
"\n"
"#ifdef cl_ext_atomic_counters_32\n"
"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
"#else\n"
"#define counter32_t volatile global int*\n"
"#endif\n"
"\n"
"typedef unsigned int u32;\n"
"typedef unsigned short u16;\n"
"typedef unsigned char u8;\n"
"\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GET_NUM_GROUPS get_num_groups(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
"#define AtomInc(x) atom_inc(&(x))\n"
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
"#define AppendInc(x, out) out = atomic_inc(x)\n"
"#define AtomAdd(x, value) atom_add(&(x), value)\n"
"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
"\n"
"\n"
"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
"\n"
"#define make_float4 (float4)\n"
"#define make_float2 (float2)\n"
"#define make_uint4 (uint4)\n"
"#define make_int4 (int4)\n"
"#define make_uint2 (uint2)\n"
"#define make_int2 (int2)\n"
"\n"
"\n"
"#define max2 max\n"
"#define min2 min\n"
"\n"
"\n"
"///////////////////////////////////////\n"
"// Vector\n"
"///////////////////////////////////////\n"
"__inline\n"
"float fastDiv(float numerator, float denominator)\n"
"{\n"
" return native_divide(numerator, denominator); \n"
"// return numerator/denominator; \n"
"}\n"
"\n"
"__inline\n"
"float4 fastDiv4(float4 numerator, float4 denominator)\n"
"{\n"
" return native_divide(numerator, denominator); \n"
"}\n"
"\n"
"__inline\n"
"float fastSqrtf(float f2)\n"
"{\n"
" return native_sqrt(f2);\n"
"// return sqrt(f2);\n"
"}\n"
"\n"
"__inline\n"
"float fastRSqrt(float f2)\n"
"{\n"
" return native_rsqrt(f2);\n"
"}\n"
"\n"
"__inline\n"
"float fastLength4(float4 v)\n"
"{\n"
" return fast_length(v);\n"
"}\n"
"\n"
"__inline\n"
"float4 fastNormalize4(float4 v)\n"
"{\n"
" return fast_normalize(v);\n"
"}\n"
"\n"
"\n"
"__inline\n"
"float sqrtf(float a)\n"
"{\n"
"// return sqrt(a);\n"
" return native_sqrt(a);\n"
"}\n"
"\n"
"__inline\n"
"float4 cross3(float4 a, float4 b)\n"
"{\n"
" return cross(a,b);\n"
"}\n"
"\n"
"__inline\n"
"float dot3F4(float4 a, float4 b)\n"
"{\n"
" float4 a1 = make_float4(a.xyz,0.f);\n"
" float4 b1 = make_float4(b.xyz,0.f);\n"
" return dot(a1, b1);\n"
"}\n"
"\n"
"__inline\n"
"float length3(const float4 a)\n"
"{\n"
" return sqrtf(dot3F4(a,a));\n"
"}\n"
"\n"
"__inline\n"
"float dot4(const float4 a, const float4 b)\n"
"{\n"
" return dot( a, b );\n"
"}\n"
"\n"
"// for height\n"
"__inline\n"
"float dot3w1(const float4 point, const float4 eqn)\n"
"{\n"
" return dot3F4(point,eqn) + eqn.w;\n"
"}\n"
"\n"
"__inline\n"
"float4 normalize3(const float4 a)\n"
"{\n"
" float4 n = make_float4(a.x, a.y, a.z, 0.f);\n"
" return fastNormalize4( n );\n"
"// float length = sqrtf(dot3F4(a, a));\n"
"// return 1.f/length * a;\n"
"}\n"
"\n"
"__inline\n"
"float4 normalize4(const float4 a)\n"
"{\n"
" float length = sqrtf(dot4(a, a));\n"
" return 1.f/length * a;\n"
"}\n"
"\n"
"__inline\n"
"float4 createEquation(const float4 a, const float4 b, const float4 c)\n"
"{\n"
" float4 eqn;\n"
" float4 ab = b-a;\n"
" float4 ac = c-a;\n"
" eqn = normalize3( cross3(ab, ac) );\n"
" eqn.w = -dot3F4(eqn,a);\n"
" return eqn;\n"
"}\n"
"\n"
"///////////////////////////////////////\n"
"// Matrix3x3\n"
"///////////////////////////////////////\n"
"\n"
"typedef struct\n"
"{\n"
" float4 m_row[3];\n"
"}Matrix3x3;\n"
"\n"
"__inline\n"
"Matrix3x3 mtZero();\n"
"\n"
"__inline\n"
"Matrix3x3 mtIdentity();\n"
"\n"
"__inline\n"
"Matrix3x3 mtTranspose(Matrix3x3 m);\n"
"\n"
"__inline\n"
"Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b);\n"
"\n"
"__inline\n"
"float4 mtMul1(Matrix3x3 a, float4 b);\n"
"\n"
"__inline\n"
"float4 mtMul3(float4 a, Matrix3x3 b);\n"
"\n"
"__inline\n"
"Matrix3x3 mtZero()\n"
"{\n"
" Matrix3x3 m;\n"
" m.m_row[0] = (float4)(0.f);\n"
" m.m_row[1] = (float4)(0.f);\n"
" m.m_row[2] = (float4)(0.f);\n"
" return m;\n"
"}\n"
"\n"
"__inline\n"
"Matrix3x3 mtIdentity()\n"
"{\n"
" Matrix3x3 m;\n"
" m.m_row[0] = (float4)(1,0,0,0);\n"
" m.m_row[1] = (float4)(0,1,0,0);\n"
" m.m_row[2] = (float4)(0,0,1,0);\n"
" return m;\n"
"}\n"
"\n"
"__inline\n"
"Matrix3x3 mtTranspose(Matrix3x3 m)\n"
"{\n"
" Matrix3x3 out;\n"
" out.m_row[0] = (float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n"
" out.m_row[1] = (float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n"
" out.m_row[2] = (float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n"
" return out;\n"
"}\n"
"\n"
"__inline\n"
"Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b)\n"
"{\n"
" Matrix3x3 transB;\n"
" transB = mtTranspose( b );\n"
" Matrix3x3 ans;\n"
" // why this doesn't run when 0ing in the for{}\n"
" a.m_row[0].w = 0.f;\n"
" a.m_row[1].w = 0.f;\n"
" a.m_row[2].w = 0.f;\n"
" for(int i=0; i<3; i++)\n"
" {\n"
"// a.m_row[i].w = 0.f;\n"
" ans.m_row[i].x = dot3F4(a.m_row[i],transB.m_row[0]);\n"
" ans.m_row[i].y = dot3F4(a.m_row[i],transB.m_row[1]);\n"
" ans.m_row[i].z = dot3F4(a.m_row[i],transB.m_row[2]);\n"
" ans.m_row[i].w = 0.f;\n"
" }\n"
" return ans;\n"
"}\n"
"\n"
"__inline\n"
"float4 mtMul1(Matrix3x3 a, float4 b)\n"
"{\n"
" float4 ans;\n"
" ans.x = dot3F4( a.m_row[0], b );\n"
" ans.y = dot3F4( a.m_row[1], b );\n"
" ans.z = dot3F4( a.m_row[2], b );\n"
" ans.w = 0.f;\n"
" return ans;\n"
"}\n"
"\n"
"__inline\n"
"float4 mtMul3(float4 a, Matrix3x3 b)\n"
"{\n"
" float4 colx = make_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n"
" float4 coly = make_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
" float4 colz = make_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
"\n"
" float4 ans;\n"
" ans.x = dot3F4( a, colx );\n"
" ans.y = dot3F4( a, coly );\n"
" ans.z = dot3F4( a, colz );\n"
" return ans;\n"
"}\n"
"\n"
"///////////////////////////////////////\n"
"// Quaternion\n"
"///////////////////////////////////////\n"
"\n"
"typedef float4 Quaternion;\n"
"\n"
"__inline\n"
"Quaternion qtMul(Quaternion a, Quaternion b);\n"
"\n"
"__inline\n"
"Quaternion qtNormalize(Quaternion in);\n"
"\n"
"__inline\n"
"float4 qtRotate(Quaternion q, float4 vec);\n"
"\n"
"__inline\n"
"Quaternion qtInvert(Quaternion q);\n"
"\n"
"__inline\n"
"Matrix3x3 qtGetRotationMatrix(Quaternion q);\n"
"\n"
"\n"
"\n"
"__inline\n"
"Quaternion qtMul(Quaternion a, Quaternion b)\n"
"{\n"
" Quaternion ans;\n"
" ans = cross3( a, b );\n"
" ans += a.w*b+b.w*a;\n"
"// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n"
" ans.w = a.w*b.w - dot3F4(a, b);\n"
" return ans;\n"
"}\n"
"\n"
"__inline\n"
"Quaternion qtNormalize(Quaternion in)\n"
"{\n"
" return fastNormalize4(in);\n"
"// in /= length( in );\n"
"// return in;\n"
"}\n"
"__inline\n"
"float4 qtRotate(Quaternion q, float4 vec)\n"
"{\n"
" Quaternion qInv = qtInvert( q );\n"
" float4 vcpy = vec;\n"
" vcpy.w = 0.f;\n"
" float4 out = qtMul(qtMul(q,vcpy),qInv);\n"
" return out;\n"
"}\n"
"\n"
"__inline\n"
"Quaternion qtInvert(Quaternion q)\n"
"{\n"
" return (Quaternion)(-q.xyz, q.w);\n"
"}\n"
"\n"
"__inline\n"
"float4 qtInvRotate(const Quaternion q, float4 vec)\n"
"{\n"
" return qtRotate( qtInvert( q ), vec );\n"
"}\n"
"\n"
"__inline\n"
"Matrix3x3 qtGetRotationMatrix(Quaternion quat)\n"
"{\n"
" float4 quat2 = (float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n"
" Matrix3x3 out;\n"
"\n"
" out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n"
" out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n"
" out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;\n"
" out.m_row[0].w = 0.f;\n"
"\n"
" out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;\n"
" out.m_row[1].y=1-2*quat2.x-2*quat2.z;\n"
" out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;\n"
" out.m_row[1].w = 0.f;\n"
"\n"
" out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;\n"
" out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;\n"
" out.m_row[2].z=1-2*quat2.x-2*quat2.y;\n"
" out.m_row[2].w = 0.f;\n"
"\n"
" return out;\n"
"}\n"
"\n"
"\n"
"\n"
"\n"
"#define WG_SIZE 64\n"
"\n"
"typedef struct\n"
"{\n"
" float4 m_pos;\n"
" Quaternion m_quat;\n"
" float4 m_linVel;\n"
" float4 m_angVel;\n"
"\n"
" u32 m_shapeIdx;\n"
" float m_invMass;\n"
" float m_restituitionCoeff;\n"
" float m_frictionCoeff;\n"
"} Body;\n"
"\n"
"typedef struct\n"
"{\n"
" Matrix3x3 m_invInertia;\n"
" Matrix3x3 m_initInvInertia;\n"
"} Shape;\n"
"\n"
"typedef struct\n"
"{\n"
" float4 m_linear;\n"
" float4 m_worldPos[4];\n"
" float4 m_center; \n"
" float m_jacCoeffInv[4];\n"
" float m_b[4];\n"
" float m_appliedRambdaDt[4];\n"
"\n"
" float m_fJacCoeffInv[2]; \n"
" float m_fAppliedRambdaDt[2]; \n"
"\n"
" u32 m_bodyA;\n"
" u32 m_bodyB;\n"
"\n"
" int m_batchIdx;\n"
" u32 m_paddings[1];\n"
"} Constraint4;\n"
"\n"
"typedef struct\n"
"{\n"
" float4 m_worldPos[4];\n"
" float4 m_worldNormal;\n"
" u32 m_coeffs;\n"
" int m_batchIdx;\n"
"\n"
" int m_bodyAPtrAndSignBit;\n"
" int m_bodyBPtrAndSignBit;\n"
"} Contact4;\n"
"\n"
"typedef struct\n"
"{\n"
" int m_nConstraints;\n"
" int m_start;\n"
" int m_batchIdx;\n"
" int m_nSplit;\n"
"// int m_paddings[1];\n"
"} ConstBuffer;\n"
"\n"
"typedef struct\n"
"{\n"
" int m_solveFriction;\n"
" int m_maxBatch; // long batch really kills the performance\n"
" int m_batchIdx;\n"
" int m_nSplit;\n"
"// int m_paddings[1];\n"
"} ConstBufferBatchSolve;\n"
"\n"
"\n"
"void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1)\n"
"{\n"
" *linear = make_float4(-n.xyz,0.f);\n"
" *angular0 = -cross3(r0, n);\n"
" *angular1 = cross3(r1, n);\n"
"}\n"
"\n"
"\n"
"float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 )\n"
"{\n"
" return dot3F4(l0, linVel0) + dot3F4(a0, angVel0) + dot3F4(l1, linVel1) + dot3F4(a1, angVel1);\n"
"}\n"
"\n"
"\n"
"float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n"
" float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1)\n"
"{\n"
" // linear0,1 are normlized\n"
" float jmj0 = invMass0;//dot3F4(linear0, linear0)*invMass0;\n"
" float jmj1 = dot3F4(mtMul3(angular0,*invInertia0), angular0);\n"
" float jmj2 = invMass1;//dot3F4(linear1, linear1)*invMass1;\n"
" float jmj3 = dot3F4(mtMul3(angular1,*invInertia1), angular1);\n"
" return -1.f/(jmj0+jmj1+jmj2+jmj3);\n"
"}\n"
"\n"
"\n"
"\n"
" \n"
"\n"
"\n"
"typedef struct \n"
"{\n"
" int m_valInt0;\n"
" int m_valInt1;\n"
" int m_valInt2;\n"
" int m_valInt3;\n"
"\n"
" float m_val0;\n"
" float m_val1;\n"
" float m_val2;\n"
" float m_val3;\n"
"} SolverDebugInfo;\n"
"\n"
"\n"
"\n"
"typedef struct\n"
"{\n"
" int m_nContacts;\n"
" int m_staticIdx;\n"
" float m_scale;\n"
" int m_nSplit;\n"
"} ConstBufferSSD;\n"
"\n"
"\n"
"void b3PlaneSpace1 (float4 n, float4* p, float4* q);\n"
" void b3PlaneSpace1 (float4 n, float4* p, float4* q)\n"
"{\n"
" if (fabs(n.z) > 0.70710678f) {\n"
" // choose p in y-z plane\n"
" float a = n.y*n.y + n.z*n.z;\n"
" float k = 1.f/sqrt(a);\n"
" p[0].x = 0;\n"
" p[0].y = -n.z*k;\n"
" p[0].z = n.y*k;\n"
" // set q = n x p\n"
" q[0].x = a*k;\n"
" q[0].y = -n.x*p[0].z;\n"
" q[0].z = n.x*p[0].y;\n"
" }\n"
" else {\n"
" // choose p in x-y plane\n"
" float a = n.x*n.x + n.y*n.y;\n"
" float k = 1.f/sqrt(a);\n"
" p[0].x = -n.y*k;\n"
" p[0].y = n.x*k;\n"
" p[0].z = 0;\n"
" // set q = n x p\n"
" q[0].x = -n.z*p[0].y;\n"
" q[0].y = n.z*p[0].x;\n"
" q[0].z = a*k;\n"
" }\n"
"}\n"
"\n"
"\n"
"void setConstraint4( const float4 posA, const float4 linVelA, const float4 angVelA, float invMassA, const Matrix3x3 invInertiaA,\n"
" const float4 posB, const float4 linVelB, const float4 angVelB, float invMassB, const Matrix3x3 invInertiaB, \n"
" __global Contact4* src, float dt, float positionDrift, float positionConstraintCoeff,\n"
" Constraint4* dstC )\n"
"{\n"
" dstC->m_bodyA = abs(src->m_bodyAPtrAndSignBit);\n"
" dstC->m_bodyB = abs(src->m_bodyBPtrAndSignBit);\n"
"\n"
" float dtInv = 1.f/dt;\n"
" for(int ic=0; ic<4; ic++)\n"
" {\n"
" dstC->m_appliedRambdaDt[ic] = 0.f;\n"
" }\n"
" dstC->m_fJacCoeffInv[0] = dstC->m_fJacCoeffInv[1] = 0.f;\n"
"\n"
"\n"
" dstC->m_linear = -src->m_worldNormal;\n"
" dstC->m_linear.w = 0.7f ;//src->getFrictionCoeff() );\n"
" for(int ic=0; ic<4; ic++)\n"
" {\n"
" float4 r0 = src->m_worldPos[ic] - posA;\n"
" float4 r1 = src->m_worldPos[ic] - posB;\n"
"\n"
" if( ic >= src->m_worldNormal.w )//npoints\n"
" {\n"
" dstC->m_jacCoeffInv[ic] = 0.f;\n"
" continue;\n"
" }\n"
"\n"
" float relVelN;\n"
" {\n"
" float4 linear, angular0, angular1;\n"
" setLinearAndAngular(src->m_worldNormal, r0, r1, &linear, &angular0, &angular1);\n"
"\n"
" dstC->m_jacCoeffInv[ic] = calcJacCoeff(linear, -linear, angular0, angular1,\n"
" invMassA, &invInertiaA, invMassB, &invInertiaB );\n"
"\n"
" relVelN = calcRelVel(linear, -linear, angular0, angular1,\n"
" linVelA, angVelA, linVelB, angVelB);\n"
"\n"
" float e = 0.f;//src->getRestituitionCoeff();\n"
" if( relVelN*relVelN < 0.004f ) e = 0.f;\n"
"\n"
" dstC->m_b[ic] = e*relVelN;\n"
" //float penetration = src->m_worldPos[ic].w;\n"
" dstC->m_b[ic] += (src->m_worldPos[ic].w + positionDrift)*positionConstraintCoeff*dtInv;\n"
" dstC->m_appliedRambdaDt[ic] = 0.f;\n"
" }\n"
" }\n"
"\n"
" if( src->m_worldNormal.w > 0 )//npoints\n"
" { // prepare friction\n"
" float4 center = make_float4(0.f);\n"
" for(int i=0; i<src->m_worldNormal.w; i++) \n"
" center += src->m_worldPos[i];\n"
" center /= (float)src->m_worldNormal.w;\n"
"\n"
" float4 tangent[2];\n"
" b3PlaneSpace1(src->m_worldNormal,&tangent[0],&tangent[1]);\n"
" \n"
" float4 r[2];\n"
" r[0] = center - posA;\n"
" r[1] = center - posB;\n"
"\n"
" for(int i=0; i<2; i++)\n"
" {\n"
" float4 linear, angular0, angular1;\n"
" setLinearAndAngular(tangent[i], r[0], r[1], &linear, &angular0, &angular1);\n"
"\n"
" dstC->m_fJacCoeffInv[i] = calcJacCoeff(linear, -linear, angular0, angular1,\n"
" invMassA, &invInertiaA, invMassB, &invInertiaB );\n"
" dstC->m_fAppliedRambdaDt[i] = 0.f;\n"
" }\n"
" dstC->m_center = center;\n"
" }\n"
"\n"
" for(int i=0; i<4; i++)\n"
" {\n"
" if( i<src->m_worldNormal.w )\n"
" {\n"
" dstC->m_worldPos[i] = src->m_worldPos[i];\n"
" }\n"
" else\n"
" {\n"
" dstC->m_worldPos[i] = make_float4(0.f);\n"
" }\n"
" }\n"
"}\n"
"\n"
"typedef struct\n"
"{\n"
" int m_nContacts;\n"
" float m_dt;\n"
" float m_positionDrift;\n"
" float m_positionConstraintCoeff;\n"
"} ConstBufferCTC;\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"void ContactToConstraintKernel(__global Contact4* gContact, __global Body* gBodies, __global Shape* gShapes, __global Constraint4* gConstraintOut, \n"
"int nContacts,\n"
"float dt,\n"
"float positionDrift,\n"
"float positionConstraintCoeff\n"
")\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" \n"
" if( gIdx < nContacts )\n"
" {\n"
" int aIdx = abs(gContact[gIdx].m_bodyAPtrAndSignBit);\n"
" int bIdx = abs(gContact[gIdx].m_bodyBPtrAndSignBit);\n"
"\n"
" float4 posA = gBodies[aIdx].m_pos;\n"
" float4 linVelA = gBodies[aIdx].m_linVel;\n"
" float4 angVelA = gBodies[aIdx].m_angVel;\n"
" float invMassA = gBodies[aIdx].m_invMass;\n"
" Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n"
"\n"
" float4 posB = gBodies[bIdx].m_pos;\n"
" float4 linVelB = gBodies[bIdx].m_linVel;\n"
" float4 angVelB = gBodies[bIdx].m_angVel;\n"
" float invMassB = gBodies[bIdx].m_invMass;\n"
" Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n"
"\n"
" Constraint4 cs;\n"
"\n"
" setConstraint4( posA, linVelA, angVelA, invMassA, invInertiaA, posB, linVelB, angVelB, invMassB, invInertiaB,\n"
" &gContact[gIdx], dt, positionDrift, positionConstraintCoeff,\n"
" &cs );\n"
" \n"
" cs.m_batchIdx = gContact[gIdx].m_batchIdx;\n"
"\n"
" gConstraintOut[gIdx] = cs;\n"
" }\n"
"}\n"
"\n"
"\n"
"\n"
"\n"
"\n"
"\n"
;

View File

@@ -0,0 +1,494 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#pragma OPENCL EXTENSION cl_amd_printf : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable
#ifdef cl_ext_atomic_counters_32
#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable
#else
#define counter32_t volatile global int*
#endif
// Short aliases for unsigned integer types, used by the struct definitions below.
typedef unsigned int u32;
typedef unsigned short u16;
typedef unsigned char u8;
// Work-item and work-group queries, all along dimension 0.
#define GET_GROUP_IDX get_group_id(0)
#define GET_LOCAL_IDX get_local_id(0)
#define GET_GLOBAL_IDX get_global_id(0)
#define GET_GROUP_SIZE get_local_size(0)
#define GET_NUM_GROUPS get_num_groups(0)
// Work-group barrier and memory fence, both on local memory.
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)
// Atomic wrappers. The Atom* macros take the variable itself and apply & internally;
// AppendInc takes a pointer and calls atomic_inc rather than atom_inc.
#define AtomInc(x) atom_inc(&(x))
#define AtomInc1(x, out) out = atom_inc(&(x))
#define AppendInc(x, out) out = atomic_inc(x)
#define AtomAdd(x, value) atom_add(&(x), value)
#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )
#define AtomXhg(x, value) atom_xchg ( &(x), value )
// Thin alias for the built-in select().
#define SELECT_UINT4( b, a, condition ) select( b,a,condition )
// Constructor-style macros: make_float4(a,b,c,d) expands to the vector literal (float4)(a,b,c,d).
#define make_float4 (float4)
#define make_float2 (float2)
#define make_uint4 (uint4)
#define make_int4 (int4)
#define make_uint2 (uint2)
#define make_int2 (int2)
// Aliases for the built-in max/min.
#define max2 max
#define min2 min
///////////////////////////////////////
// Vector
///////////////////////////////////////
// Scalar division through native_divide (speed over guaranteed precision).
__inline
float fastDiv(float numerator, float denominator)
{
	const float quotient = native_divide(numerator, denominator);
	return quotient;
}
// Component-wise float4 division through native_divide.
__inline
float4 fastDiv4(float4 numerator, float4 denominator)
{
	const float4 quotient = native_divide(numerator, denominator);
	return quotient;
}
// Square root through native_sqrt (speed over guaranteed precision).
__inline
float fastSqrtf(float f2)
{
	const float root = native_sqrt(f2);
	return root;
}
// Reciprocal square root through native_rsqrt.
__inline
float fastRSqrt(float f2)
{
	const float invRoot = native_rsqrt(f2);
	return invRoot;
}
// Length of all four components of v, through fast_length.
__inline
float fastLength4(float4 v)
{
	const float len = fast_length(v);
	return len;
}
// Normalises v over all four components, through fast_normalize.
__inline
float4 fastNormalize4(float4 v)
{
	const float4 unit = fast_normalize(v);
	return unit;
}
// Provides a C-style sqrtf name; note that it maps to native_sqrt, not sqrt.
__inline
float sqrtf(float a)
{
	const float root = native_sqrt(a);
	return root;
}
// Cross product of the xyz parts of a and b, via the built-in cross().
__inline
float4 cross3(float4 a, float4 b)
{
	const float4 perp = cross(a,b);
	return perp;
}
// Three-component dot product of two float4 values: the w lanes are forced
// to 0 before the 4-wide dot() so they cannot contribute.
__inline
float dot3F4(float4 a, float4 b)
{
	// a and b are by-value copies, so clearing w here does not affect the caller.
	a.w = 0.f;
	b.w = 0.f;
	return dot(a, b);
}
// Length of the xyz part of a (w ignored).
__inline
float length3(const float4 a)
{
	const float lenSq = dot3F4(a,a);
	return sqrtf(lenSq);
}
// Full four-component dot product.
__inline
float dot4(const float4 a, const float4 b)
{
	const float d = dot( a, b );
	return d;
}
// Evaluates the plane equation eqn at point: dot(point.xyz, eqn.xyz) + eqn.w,
// i.e. point is treated as having w == 1 (used for heights above a plane).
__inline
float dot3w1(const float4 point, const float4 eqn)
{
	const float projected = dot3F4(point,eqn);
	return projected + eqn.w;
}
// Normalises the xyz part of a; the result's w comes from normalising a vector whose w is 0.
__inline
float4 normalize3(const float4 a)
{
	float4 xyzOnly = a;
	xyzOnly.w = 0.f;
	return fastNormalize4( xyzOnly );
}
// Normalises a over all four components: a scaled by 1/sqrtf(dot4(a, a)).
__inline
float4 normalize4(const float4 a)
{
	const float invLen = 1.f/sqrtf(dot4(a, a));
	return invLen * a;
}
// Plane through the points a, b, c: xyz is the normalised (b-a) x (c-a),
// w is chosen so that dot3w1(a, plane) evaluates to 0.
__inline
float4 createEquation(const float4 a, const float4 b, const float4 c)
{
	float4 plane = normalize3( cross3(b-a, c-a) );
	plane.w = -dot3F4(plane,a);
	return plane;
}
///////////////////////////////////////
// Matrix3x3
///////////////////////////////////////
// 3x3 matrix stored as three float4 rows; the helpers below set or ignore the w lane of each row.
typedef struct
{
float4 m_row[3];
}Matrix3x3;
// Prototypes for the matrix helpers defined below.
__inline
Matrix3x3 mtZero();
__inline
Matrix3x3 mtIdentity();
__inline
Matrix3x3 mtTranspose(Matrix3x3 m);
__inline
Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b);
// Matrix times column vector.
__inline
float4 mtMul1(Matrix3x3 a, float4 b);
// Row vector times matrix.
__inline
float4 mtMul3(float4 a, Matrix3x3 b);
// Returns a matrix with every lane of every row set to 0.
__inline
Matrix3x3 mtZero()
{
	Matrix3x3 zero;
	for(int row=0; row<3; row++)
	{
		zero.m_row[row] = (float4)(0.f);
	}
	return zero;
}
// Returns the 3x3 identity; the w lane of each row is 0.
__inline
Matrix3x3 mtIdentity()
{
	Matrix3x3 ident;
	ident.m_row[0] = (float4)(1.f, 0.f, 0.f, 0.f);
	ident.m_row[1] = (float4)(0.f, 1.f, 0.f, 0.f);
	ident.m_row[2] = (float4)(0.f, 0.f, 1.f, 0.f);
	return ident;
}
// Returns the transpose of m: row i of the result is column i of m, with w set to 0.
__inline
Matrix3x3 mtTranspose(Matrix3x3 m)
{
	const float4 r0 = m.m_row[0];
	const float4 r1 = m.m_row[1];
	const float4 r2 = m.m_row[2];

	Matrix3x3 transposed;
	transposed.m_row[0] = (float4)(r0.x, r1.x, r2.x, 0.f);
	transposed.m_row[1] = (float4)(r0.y, r1.y, r2.y, 0.f);
	transposed.m_row[2] = (float4)(r0.z, r1.z, r2.z, 0.f);
	return transposed;
}
// Matrix product a*b. Each result element is the 3-component dot product of a
// row of a with a row of transpose(b), i.e. a column of b; result w lanes are 0.
__inline
Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b)
{
Matrix3x3 transB;
transB = mtTranspose( b );
Matrix3x3 ans;
// Original author's note: zeroing the w lanes inside the loop below reportedly
// did not work, so it is done up front here. dot3F4 already ignores w, so this
// looks like a compiler workaround - keep the statement placement as is.
a.m_row[0].w = 0.f;
a.m_row[1].w = 0.f;
a.m_row[2].w = 0.f;
for(int i=0; i<3; i++)
{
// a.m_row[i].w = 0.f;
ans.m_row[i].x = dot3F4(a.m_row[i],transB.m_row[0]);
ans.m_row[i].y = dot3F4(a.m_row[i],transB.m_row[1]);
ans.m_row[i].z = dot3F4(a.m_row[i],transB.m_row[2]);
ans.m_row[i].w = 0.f;
}
return ans;
}
// Matrix times column vector: component i is dot(row i of a, b.xyz); w is 0.
__inline
float4 mtMul1(Matrix3x3 a, float4 b)
{
	return (float4)(
		dot3F4( a.m_row[0], b ),
		dot3F4( a.m_row[1], b ),
		dot3F4( a.m_row[2], b ),
		0.f);
}
// Row vector times matrix: component i is dot(a.xyz, column i of b).
// The w lane of the result is 0, matching mtMul1.
__inline
float4 mtMul3(float4 a, Matrix3x3 b)
{
	float4 colx = make_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);
	float4 coly = make_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);
	float4 colz = make_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);

	float4 ans;
	ans.x = dot3F4( a, colx );
	ans.y = dot3F4( a, coly );
	ans.z = dot3F4( a, colz );
	// Previously never written, so callers received an indeterminate w lane.
	ans.w = 0.f;
	return ans;
}
///////////////////////////////////////
// Quaternion
///////////////////////////////////////
// A quaternion is stored in a float4: the helpers below treat .xyz as the
// vector part and .w as the scalar part (see qtMul and qtInvert).
typedef float4 Quaternion;
// Prototypes, so the helpers can call each other regardless of definition
// order (qtRotate calls qtInvert, which is defined after it).
__inline
Quaternion qtMul(Quaternion a, Quaternion b);
__inline
Quaternion qtNormalize(Quaternion in);
__inline
float4 qtRotate(Quaternion q, float4 vec);
__inline
Quaternion qtInvert(Quaternion q);
// Quaternion product a*b (xyz = vector part, w = scalar part).
__inline
Quaternion qtMul(Quaternion a, Quaternion b)
{
	// Vector part: a.xyz x b.xyz + a.w*b.xyz + b.w*a.xyz.
	// The w lane produced by this expression is a by-product and is replaced below.
	Quaternion product = cross3( a, b ) + (a.w*b + b.w*a);
	// Scalar part: a.w*b.w - dot(a.xyz, b.xyz).
	product.w = a.w*b.w - dot3F4(a, b);
	return product;
}
// Normalises a quaternion over all four components via fastNormalize4.
__inline
Quaternion qtNormalize(Quaternion in)
{
	const Quaternion unit = fastNormalize4(in);
	return unit;
}
// Rotates vec by q, computed as q * (vec.xyz, 0) * qtInvert(q).
// The incoming vec.w is ignored. Since qtInvert only conjugates, this is a
// pure rotation only when q has unit length.
__inline
float4 qtRotate(Quaternion q, float4 vec)
{
	float4 pureVec = vec;
	pureVec.w = 0.f;
	return qtMul(qtMul(q, pureVec), qtInvert( q ));
}
// Returns the conjugate of q (vector part negated, scalar part kept).
// This equals the inverse only for unit-length quaternions.
__inline
Quaternion qtInvert(Quaternion q)
{
	Quaternion conj = q;
	conj.xyz = -q.xyz;
	return conj;
}
// Applies the opposite rotation of q to vec: rotates by the conjugate of q.
__inline
float4 qtInvRotate(const Quaternion q, float4 vec)
{
	const Quaternion qConj = qtInvert( q );
	return qtRotate( qConj, vec );
}
#define WG_SIZE 64
// Per-body record stored in the gBodies buffer.
typedef struct
{
// Position; SetSortDataKernel uses its x and z to pick a grid cell.
float4 m_pos;
// The remaining fields are not read by the kernels visible in this file.
Quaternion m_quat;
float4 m_linVel;
float4 m_angVel;
u32 m_shapeIdx;
float m_invMass;
float m_restituitionCoeff;
float m_frictionCoeff;
} Body;
// Inertia record; not referenced by the kernels visible in this file.
// NOTE(review): presumably inverse inertia (current and initial) - confirm against the host code.
typedef struct
{
Matrix3x3 m_invInertia;
Matrix3x3 m_initInvInertia;
} Shape;
// Solver constraint record with four per-point slots and two friction slots.
// Not referenced by the kernels visible in this file (CopyConstraintKernel,
// despite its name, is declared on Contact4 buffers).
// NOTE(review): field meanings are inferred from names - confirm against the code that fills this struct.
typedef struct
{
float4 m_linear;
float4 m_worldPos[4];
float4 m_center;
float m_jacCoeffInv[4];
float m_b[4];
float m_appliedRambdaDt[4];
float m_fJacCoeffInv[2];
float m_fAppliedRambdaDt[2];
u32 m_bodyA;
u32 m_bodyB;
int m_batchIdx;
u32 m_paddings[1];
} Constraint4;
// Contact record between two bodies holding up to four contact points.
// The kernels in this file copy it as a whole and read only the two body fields.
typedef struct
{
float4 m_worldPos[4];
float4 m_worldNormal;
u32 m_coeffs;
int m_batchIdx;
// Body indices; SetSortDataKernel takes abs() for the index and tests the
// sign of the A field to choose which body's position to use.
int m_bodyAPtrAndSignBit;
int m_bodyBPtrAndSignBit;
} Contact4;
// Parameter block; not referenced by the kernels visible in this file.
// NOTE(review): field meanings are inferred from names only - confirm against the host code.
typedef struct
{
int m_nConstraints;
int m_start;
int m_batchIdx;
int m_nSplit;
// int m_paddings[1];
} ConstBuffer;
// Parameter block; not referenced by the kernels visible in this file.
// NOTE(review): presumably consumed by a batch-solve kernel elsewhere - confirm.
typedef struct
{
int m_solveFriction;
int m_maxBatch; // long batch really kills the performance
int m_batchIdx;
int m_nSplit;
// int m_paddings[1];
} ConstBufferBatchSolve;
// Generic debug record (four ints, four floats); not referenced by the kernels visible in this file.
typedef struct
{
int m_valInt0;
int m_valInt1;
int m_valInt2;
int m_valInt3;
float m_val0;
float m_val1;
float m_val2;
float m_val3;
} SolverDebugInfo;
// others
// Gathers contacts into sorted order: out[i] = in[sortData[i].y] for every i < cb.x.
//   sortData : per-entry pairs whose .y is the source index
//   cb       : only cb.x (the number of contacts) is used
__kernel
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
void ReorderContactKernel(__global Contact4* in, __global Contact4* out, __global int2* sortData, int4 cb )
{
	const int gIdx = GET_GLOBAL_IDX;

	// Work-items past the contact count do nothing.
	if( gIdx >= cb.x )
		return;

	out[gIdx] = in[sortData[gIdx].y];
}
// Fields m_nContacts, m_scale and m_nSplit match the scalar arguments of
// SetSortDataKernel (nContacts, scale, N_SPLIT); m_staticIdx has no counterpart there.
// The kernel takes separate arguments and does not reference this struct.
typedef struct
{
int m_nContacts;
int m_staticIdx;
float m_scale;
int m_nSplit;
} ConstBufferSSD;
// Builds one (key, value) sort entry per contact.
//   key   (.x): grid-cell id xIdx + zIdx*N_SPLIT, computed from the x/z position of body A,
//               or of body B when m_bodyAPtrAndSignBit is negative.
//   value (.y): the contact's own index.
// Work-items at or past nContacts write only the key 0xffffffff (the value is left untouched).
// NOTE(review): masking with (N_SPLIT-1) presumably assumes N_SPLIT is a power of two, and the
// out-of-range write presumably relies on gSortDataOut being padded to the launch size — confirm.
__kernel
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
void SetSortDataKernel(__global Contact4* gContact, __global Body* gBodies, __global int2* gSortDataOut,
int nContacts,
float scale,
int N_SPLIT
)
{
    const int contactIdx = GET_GLOBAL_IDX;
    if( contactIdx >= nContacts )
    {
        gSortDataOut[contactIdx].x = 0xffffffff;
        return;
    }

    const int signedA = gContact[contactIdx].m_bodyAPtrAndSignBit;
    const int indexA = abs(signedA);
    const int indexB = abs(gContact[contactIdx].m_bodyBPtrAndSignBit);
    const int keyBody = (signedA<0)? indexB: indexA;

    const float4 pos = gBodies[keyBody].m_pos;
    // Negative coordinates are shifted down by one before scaling and truncation.
    const float xShift = (pos.x<0.f)?1.f:0.f;
    const float zShift = (pos.z<0.f)?1.f:0.f;
    const int cellX = (int)((pos.x-xShift)*scale) & (N_SPLIT-1);
    const int cellZ = (int)((pos.z-zShift)*scale) & (N_SPLIT-1);

    gSortDataOut[contactIdx].x = (cellX+cellZ*N_SPLIT);
    gSortDataOut[contactIdx].y = contactIdx;
}
// Copies the first cb.x contacts from gIn to gOut, one element per work-item.
__kernel
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
void CopyConstraintKernel(__global Contact4* gIn, __global Contact4* gOut, int4 cb )
{
    const int elemIdx = GET_GLOBAL_IDX;
    if( elemIdx >= cb.x )
        return;

    gOut[elemIdx] = gIn[elemIdx];
}

View File

@@ -0,0 +1,498 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
static const char* solverSetup2CL= \
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"\n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"\n"
"\n"
"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
"\n"
"\n"
"#ifdef cl_ext_atomic_counters_32\n"
"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
"#else\n"
"#define counter32_t volatile global int*\n"
"#endif\n"
"\n"
"typedef unsigned int u32;\n"
"typedef unsigned short u16;\n"
"typedef unsigned char u8;\n"
"\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GET_NUM_GROUPS get_num_groups(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
"#define AtomInc(x) atom_inc(&(x))\n"
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
"#define AppendInc(x, out) out = atomic_inc(x)\n"
"#define AtomAdd(x, value) atom_add(&(x), value)\n"
"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
"\n"
"\n"
"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
"\n"
"#define make_float4 (float4)\n"
"#define make_float2 (float2)\n"
"#define make_uint4 (uint4)\n"
"#define make_int4 (int4)\n"
"#define make_uint2 (uint2)\n"
"#define make_int2 (int2)\n"
"\n"
"\n"
"#define max2 max\n"
"#define min2 min\n"
"\n"
"\n"
"///////////////////////////////////////\n"
"// Vector\n"
"///////////////////////////////////////\n"
"__inline\n"
"float fastDiv(float numerator, float denominator)\n"
"{\n"
" return native_divide(numerator, denominator); \n"
"// return numerator/denominator; \n"
"}\n"
"\n"
"__inline\n"
"float4 fastDiv4(float4 numerator, float4 denominator)\n"
"{\n"
" return native_divide(numerator, denominator); \n"
"}\n"
"\n"
"__inline\n"
"float fastSqrtf(float f2)\n"
"{\n"
" return native_sqrt(f2);\n"
"// return sqrt(f2);\n"
"}\n"
"\n"
"__inline\n"
"float fastRSqrt(float f2)\n"
"{\n"
" return native_rsqrt(f2);\n"
"}\n"
"\n"
"__inline\n"
"float fastLength4(float4 v)\n"
"{\n"
" return fast_length(v);\n"
"}\n"
"\n"
"__inline\n"
"float4 fastNormalize4(float4 v)\n"
"{\n"
" return fast_normalize(v);\n"
"}\n"
"\n"
"\n"
"__inline\n"
"float sqrtf(float a)\n"
"{\n"
"// return sqrt(a);\n"
" return native_sqrt(a);\n"
"}\n"
"\n"
"__inline\n"
"float4 cross3(float4 a, float4 b)\n"
"{\n"
" return cross(a,b);\n"
"}\n"
"\n"
"__inline\n"
"float dot3F4(float4 a, float4 b)\n"
"{\n"
" float4 a1 = make_float4(a.xyz,0.f);\n"
" float4 b1 = make_float4(b.xyz,0.f);\n"
" return dot(a1, b1);\n"
"}\n"
"\n"
"__inline\n"
"float length3(const float4 a)\n"
"{\n"
" return sqrtf(dot3F4(a,a));\n"
"}\n"
"\n"
"__inline\n"
"float dot4(const float4 a, const float4 b)\n"
"{\n"
" return dot( a, b );\n"
"}\n"
"\n"
"// for height\n"
"__inline\n"
"float dot3w1(const float4 point, const float4 eqn)\n"
"{\n"
" return dot3F4(point,eqn) + eqn.w;\n"
"}\n"
"\n"
"__inline\n"
"float4 normalize3(const float4 a)\n"
"{\n"
" float4 n = make_float4(a.x, a.y, a.z, 0.f);\n"
" return fastNormalize4( n );\n"
"// float length = sqrtf(dot3F4(a, a));\n"
"// return 1.f/length * a;\n"
"}\n"
"\n"
"__inline\n"
"float4 normalize4(const float4 a)\n"
"{\n"
" float length = sqrtf(dot4(a, a));\n"
" return 1.f/length * a;\n"
"}\n"
"\n"
"__inline\n"
"float4 createEquation(const float4 a, const float4 b, const float4 c)\n"
"{\n"
" float4 eqn;\n"
" float4 ab = b-a;\n"
" float4 ac = c-a;\n"
" eqn = normalize3( cross3(ab, ac) );\n"
" eqn.w = -dot3F4(eqn,a);\n"
" return eqn;\n"
"}\n"
"\n"
"///////////////////////////////////////\n"
"// Matrix3x3\n"
"///////////////////////////////////////\n"
"\n"
"typedef struct\n"
"{\n"
" float4 m_row[3];\n"
"}Matrix3x3;\n"
"\n"
"__inline\n"
"Matrix3x3 mtZero();\n"
"\n"
"__inline\n"
"Matrix3x3 mtIdentity();\n"
"\n"
"__inline\n"
"Matrix3x3 mtTranspose(Matrix3x3 m);\n"
"\n"
"__inline\n"
"Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b);\n"
"\n"
"__inline\n"
"float4 mtMul1(Matrix3x3 a, float4 b);\n"
"\n"
"__inline\n"
"float4 mtMul3(float4 a, Matrix3x3 b);\n"
"\n"
"__inline\n"
"Matrix3x3 mtZero()\n"
"{\n"
" Matrix3x3 m;\n"
" m.m_row[0] = (float4)(0.f);\n"
" m.m_row[1] = (float4)(0.f);\n"
" m.m_row[2] = (float4)(0.f);\n"
" return m;\n"
"}\n"
"\n"
"__inline\n"
"Matrix3x3 mtIdentity()\n"
"{\n"
" Matrix3x3 m;\n"
" m.m_row[0] = (float4)(1,0,0,0);\n"
" m.m_row[1] = (float4)(0,1,0,0);\n"
" m.m_row[2] = (float4)(0,0,1,0);\n"
" return m;\n"
"}\n"
"\n"
"__inline\n"
"Matrix3x3 mtTranspose(Matrix3x3 m)\n"
"{\n"
" Matrix3x3 out;\n"
" out.m_row[0] = (float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n"
" out.m_row[1] = (float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n"
" out.m_row[2] = (float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n"
" return out;\n"
"}\n"
"\n"
"__inline\n"
"Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b)\n"
"{\n"
" Matrix3x3 transB;\n"
" transB = mtTranspose( b );\n"
" Matrix3x3 ans;\n"
" // why this doesn't run when 0ing in the for{}\n"
" a.m_row[0].w = 0.f;\n"
" a.m_row[1].w = 0.f;\n"
" a.m_row[2].w = 0.f;\n"
" for(int i=0; i<3; i++)\n"
" {\n"
"// a.m_row[i].w = 0.f;\n"
" ans.m_row[i].x = dot3F4(a.m_row[i],transB.m_row[0]);\n"
" ans.m_row[i].y = dot3F4(a.m_row[i],transB.m_row[1]);\n"
" ans.m_row[i].z = dot3F4(a.m_row[i],transB.m_row[2]);\n"
" ans.m_row[i].w = 0.f;\n"
" }\n"
" return ans;\n"
"}\n"
"\n"
"__inline\n"
"float4 mtMul1(Matrix3x3 a, float4 b)\n"
"{\n"
" float4 ans;\n"
" ans.x = dot3F4( a.m_row[0], b );\n"
" ans.y = dot3F4( a.m_row[1], b );\n"
" ans.z = dot3F4( a.m_row[2], b );\n"
" ans.w = 0.f;\n"
" return ans;\n"
"}\n"
"\n"
"__inline\n"
"float4 mtMul3(float4 a, Matrix3x3 b)\n"
"{\n"
" float4 colx = make_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n"
" float4 coly = make_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
" float4 colz = make_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
"\n"
" float4 ans;\n"
" ans.x = dot3F4( a, colx );\n"
" ans.y = dot3F4( a, coly );\n"
" ans.z = dot3F4( a, colz );\n"
" return ans;\n"
"}\n"
"\n"
"///////////////////////////////////////\n"
"// Quaternion\n"
"///////////////////////////////////////\n"
"\n"
"typedef float4 Quaternion;\n"
"\n"
"__inline\n"
"Quaternion qtMul(Quaternion a, Quaternion b);\n"
"\n"
"__inline\n"
"Quaternion qtNormalize(Quaternion in);\n"
"\n"
"__inline\n"
"float4 qtRotate(Quaternion q, float4 vec);\n"
"\n"
"__inline\n"
"Quaternion qtInvert(Quaternion q);\n"
"\n"
"\n"
"\n"
"\n"
"\n"
"__inline\n"
"Quaternion qtMul(Quaternion a, Quaternion b)\n"
"{\n"
" Quaternion ans;\n"
" ans = cross3( a, b );\n"
" ans += a.w*b+b.w*a;\n"
"// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n"
" ans.w = a.w*b.w - dot3F4(a, b);\n"
" return ans;\n"
"}\n"
"\n"
"__inline\n"
"Quaternion qtNormalize(Quaternion in)\n"
"{\n"
" return fastNormalize4(in);\n"
"// in /= length( in );\n"
"// return in;\n"
"}\n"
"__inline\n"
"float4 qtRotate(Quaternion q, float4 vec)\n"
"{\n"
" Quaternion qInv = qtInvert( q );\n"
" float4 vcpy = vec;\n"
" vcpy.w = 0.f;\n"
" float4 out = qtMul(qtMul(q,vcpy),qInv);\n"
" return out;\n"
"}\n"
"\n"
"__inline\n"
"Quaternion qtInvert(Quaternion q)\n"
"{\n"
" return (Quaternion)(-q.xyz, q.w);\n"
"}\n"
"\n"
"__inline\n"
"float4 qtInvRotate(const Quaternion q, float4 vec)\n"
"{\n"
" return qtRotate( qtInvert( q ), vec );\n"
"}\n"
"\n"
"\n"
"\n"
"\n"
"#define WG_SIZE 64\n"
"\n"
"typedef struct\n"
"{\n"
" float4 m_pos;\n"
" Quaternion m_quat;\n"
" float4 m_linVel;\n"
" float4 m_angVel;\n"
"\n"
" u32 m_shapeIdx;\n"
" float m_invMass;\n"
" float m_restituitionCoeff;\n"
" float m_frictionCoeff;\n"
"} Body;\n"
"\n"
"typedef struct\n"
"{\n"
" Matrix3x3 m_invInertia;\n"
" Matrix3x3 m_initInvInertia;\n"
"} Shape;\n"
"\n"
"typedef struct\n"
"{\n"
" float4 m_linear;\n"
" float4 m_worldPos[4];\n"
" float4 m_center; \n"
" float m_jacCoeffInv[4];\n"
" float m_b[4];\n"
" float m_appliedRambdaDt[4];\n"
"\n"
" float m_fJacCoeffInv[2]; \n"
" float m_fAppliedRambdaDt[2]; \n"
"\n"
" u32 m_bodyA;\n"
" u32 m_bodyB;\n"
"\n"
" int m_batchIdx;\n"
" u32 m_paddings[1];\n"
"} Constraint4;\n"
"\n"
"typedef struct\n"
"{\n"
" float4 m_worldPos[4];\n"
" float4 m_worldNormal;\n"
" u32 m_coeffs;\n"
" int m_batchIdx;\n"
"\n"
" int m_bodyAPtrAndSignBit;\n"
" int m_bodyBPtrAndSignBit;\n"
"} Contact4;\n"
"\n"
"typedef struct\n"
"{\n"
" int m_nConstraints;\n"
" int m_start;\n"
" int m_batchIdx;\n"
" int m_nSplit;\n"
"// int m_paddings[1];\n"
"} ConstBuffer;\n"
"\n"
"typedef struct\n"
"{\n"
" int m_solveFriction;\n"
" int m_maxBatch; // long batch really kills the performance\n"
" int m_batchIdx;\n"
" int m_nSplit;\n"
"// int m_paddings[1];\n"
"} ConstBufferBatchSolve;\n"
"\n"
"\n"
" \n"
"\n"
"\n"
"typedef struct \n"
"{\n"
" int m_valInt0;\n"
" int m_valInt1;\n"
" int m_valInt2;\n"
" int m_valInt3;\n"
"\n"
" float m_val0;\n"
" float m_val1;\n"
" float m_val2;\n"
" float m_val3;\n"
"} SolverDebugInfo;\n"
"\n"
"\n"
"\n"
"\n"
"// others\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"void ReorderContactKernel(__global Contact4* in, __global Contact4* out, __global int2* sortData, int4 cb )\n"
"{\n"
" int nContacts = cb.x;\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < nContacts )\n"
" {\n"
" int srcIdx = sortData[gIdx].y;\n"
" out[gIdx] = in[srcIdx];\n"
" }\n"
"}\n"
"\n"
"typedef struct\n"
"{\n"
" int m_nContacts;\n"
" int m_staticIdx;\n"
" float m_scale;\n"
" int m_nSplit;\n"
"} ConstBufferSSD;\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"void SetSortDataKernel(__global Contact4* gContact, __global Body* gBodies, __global int2* gSortDataOut, \n"
"int nContacts,\n"
"float scale,\n"
"int N_SPLIT\n"
")\n"
"\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" \n"
" if( gIdx < nContacts )\n"
" {\n"
" int aIdx = abs(gContact[gIdx].m_bodyAPtrAndSignBit);\n"
" int bIdx = abs(gContact[gIdx].m_bodyBPtrAndSignBit);\n"
"\n"
" int idx = (gContact[gIdx].m_bodyAPtrAndSignBit<0)? bIdx: aIdx;\n"
" float4 p = gBodies[idx].m_pos;\n"
" int xIdx = (int)((p.x-((p.x<0.f)?1.f:0.f))*scale) & (N_SPLIT-1);\n"
" int zIdx = (int)((p.z-((p.z<0.f)?1.f:0.f))*scale) & (N_SPLIT-1);\n"
"\n"
" gSortDataOut[gIdx].x = (xIdx+zIdx*N_SPLIT);\n"
" gSortDataOut[gIdx].y = gIdx;\n"
" }\n"
" else\n"
" {\n"
" gSortDataOut[gIdx].x = 0xffffffff;\n"
" }\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"void CopyConstraintKernel(__global Contact4* gIn, __global Contact4* gOut, int4 cb )\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" if( gIdx < cb.x )\n"
" {\n"
" gOut[gIdx] = gIn[gIdx];\n"
" }\n"
"}\n"
"\n"
"\n"
"\n"
"\n"
;

View File

@@ -0,0 +1,971 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
// Extensions requested by this file: AMD printf plus the 32-bit local/global atomics.
#pragma OPENCL EXTENSION cl_amd_printf : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable
// When the atomic-counter extension is unavailable, counter32_t falls back to a plain
// volatile global int pointer.
#ifdef cl_ext_atomic_counters_32
#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable
#else
#define counter32_t volatile global int*
#endif
// Short fixed-width-style aliases used by the structs below.
typedef unsigned int u32;
typedef unsigned short u16;
typedef unsigned char u8;
// Work-item / work-group queries, all along dimension 0.
#define GET_GROUP_IDX get_group_id(0)
#define GET_LOCAL_IDX get_local_id(0)
#define GET_GLOBAL_IDX get_global_id(0)
#define GET_GROUP_SIZE get_local_size(0)
#define GET_NUM_GROUPS get_num_groups(0)
// Local-memory barrier and fence shorthands.
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)
// Atomic wrappers. Those taking `x` as an lvalue apply & themselves; AppendInc takes a pointer.
// AtomInc1 and AppendInc assign the value returned by the atomic call to `out`.
#define AtomInc(x) atom_inc(&(x))
#define AtomInc1(x, out) out = atom_inc(&(x))
#define AppendInc(x, out) out = atomic_inc(x)
#define AtomAdd(x, value) atom_add(&(x), value)
#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )
#define AtomXhg(x, value) atom_xchg ( &(x), value )
#define SELECT_UINT4( b, a, condition ) select( b,a,condition )
// make_xxx(args) expands to the OpenCL vector literal (xxx)(args).
#define make_float4 (float4)
#define make_float2 (float2)
#define make_uint4 (uint4)
#define make_int4 (int4)
#define make_uint2 (uint2)
#define make_int2 (int2)
// Aliases for the built-in max/min.
#define max2 max
#define min2 min
///////////////////////////////////////
// Vector
///////////////////////////////////////
// Scalar division through the native_divide built-in.
__inline
float fastDiv(float numerator, float denominator)
{
    const float quotient = native_divide(numerator, denominator);
    return quotient;
}
// Component-wise float4 division through the native_divide built-in.
__inline
float4 fastDiv4(float4 numerator, float4 denominator)
{
    const float4 quotient = native_divide(numerator, denominator);
    return quotient;
}
// Square root through the native_sqrt built-in.
__inline
float fastSqrtf(float f2)
{
    const float root = native_sqrt(f2);
    return root;
}
// Reciprocal square root through the native_rsqrt built-in.
__inline
float fastRSqrt(float f2)
{
    const float invRoot = native_rsqrt(f2);
    return invRoot;
}
// Length of all four lanes of v through the fast_length built-in.
__inline
float fastLength4(float4 v)
{
    const float len = fast_length(v);
    return len;
}
// Normalizes all four lanes of v through the fast_normalize built-in.
__inline
float4 fastNormalize4(float4 v)
{
    const float4 unitV = fast_normalize(v);
    return unitV;
}
// C-style sqrtf name mapped onto the native_sqrt built-in.
__inline
float sqrtf(float a)
{
    const float root = native_sqrt(a);
    return root;
}
// Cross product of the xyz parts of a1 and b1; the w lane of each input is zeroed first.
__inline
float4 cross3(float4 a1, float4 b1)
{
    float4 lhs = a1;
    float4 rhs = b1;
    lhs.w = 0.f;
    rhs.w = 0.f;
    return cross(lhs, rhs);
}
// Dot product of the xyz parts of a and b; the w lanes are zeroed so they contribute nothing.
__inline
float dot3F4(float4 a, float4 b)
{
    float4 lhs = a;
    float4 rhs = b;
    lhs.w = 0.f;
    rhs.w = 0.f;
    return dot(lhs, rhs);
}
// Length of the xyz part of a.
__inline
float length3(const float4 a)
{
    const float lenSq = dot3F4(a,a);
    return sqrtf(lenSq);
}
// Full four-lane dot product of a and b.
__inline
float dot4(const float4 a, const float4 b)
{
    const float result = dot( a, b );
    return result;
}
// for height
// Evaluates dot(point.xyz, eqn.xyz) + eqn.w, i.e. treats point as having w == 1.
__inline
float dot3w1(const float4 point, const float4 eqn)
{
    const float xyzTerm = dot3F4(point,eqn);
    return xyzTerm + eqn.w;
}
// Normalizes the xyz part of a; the returned w lane comes from normalizing a vector whose w was zeroed.
__inline
float4 normalize3(const float4 a)
{
    float4 xyzOnly = a;
    xyzOnly.w = 0.f;
    return fastNormalize4( xyzOnly );
}
// Scales a by the reciprocal of its four-lane length. No guard against a zero-length input.
__inline
float4 normalize4(const float4 a)
{
    const float invLen = 1.f/sqrtf(dot4(a, a));
    return invLen * a;
}
// Builds the plane through a, b, c: xyz is normalize3(cross3(b-a, c-a)) and
// w is -dot(normal, a), so dot3w1(p, eqn) is zero for p == a.
__inline
float4 createEquation(const float4 a, const float4 b, const float4 c)
{
    const float4 edgeAB = b-a;
    const float4 edgeAC = c-a;
    float4 plane = normalize3( cross3(edgeAB, edgeAC) );
    plane.w = -dot3F4(plane,a);
    return plane;
}
///////////////////////////////////////
// Matrix3x3
///////////////////////////////////////
// 3x3 matrix stored as three float4 rows; the helpers below use only the xyz lanes of each row
// and write 0 into w when they construct rows.
typedef struct
{
float4 m_row[3];
}Matrix3x3;
// Forward declarations of the Matrix3x3 helpers defined directly below.
// All-zero matrix.
__inline
Matrix3x3 mtZero();
// Identity matrix.
__inline
Matrix3x3 mtIdentity();
// Transpose of m.
__inline
Matrix3x3 mtTranspose(Matrix3x3 m);
// Matrix product a*b.
__inline
Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b);
// Matrix times column vector: a*b.
__inline
float4 mtMul1(Matrix3x3 a, float4 b);
// Row vector times matrix: a*b.
__inline
float4 mtMul3(float4 a, Matrix3x3 b);
// Returns a matrix whose three rows are all zero.
__inline
Matrix3x3 mtZero()
{
    Matrix3x3 zeroM;
    for(int row=0; row<3; row++)
    {
        zeroM.m_row[row] = (float4)(0.f);
    }
    return zeroM;
}
// Returns the identity matrix (w lane of every row is 0).
__inline
Matrix3x3 mtIdentity()
{
    Matrix3x3 ident;
    ident.m_row[0] = (float4)(1.f, 0.f, 0.f, 0.f);
    ident.m_row[1] = (float4)(0.f, 1.f, 0.f, 0.f);
    ident.m_row[2] = (float4)(0.f, 0.f, 1.f, 0.f);
    return ident;
}
// Returns the transpose of m: row k of the result is column k of m, with w set to 0.
__inline
Matrix3x3 mtTranspose(Matrix3x3 m)
{
    const float4 r0 = m.m_row[0];
    const float4 r1 = m.m_row[1];
    const float4 r2 = m.m_row[2];

    Matrix3x3 transposed;
    transposed.m_row[0] = (float4)(r0.x, r1.x, r2.x, 0.f);
    transposed.m_row[1] = (float4)(r0.y, r1.y, r2.y, 0.f);
    transposed.m_row[2] = (float4)(r0.z, r1.z, r2.z, 0.f);
    return transposed;
}
// Matrix product a*b. Each result element is the dot product of a row of a with a column of b
// (taken as a row of the transposed b); the w lane of every result row is 0.
__inline
Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b)
{
    const Matrix3x3 bColumns = mtTranspose( b );

    // The w lanes of a are cleared up front, outside the loop; the original author noted that
    // clearing them inside the loop did not run.
    a.m_row[0].w = 0.f;
    a.m_row[1].w = 0.f;
    a.m_row[2].w = 0.f;

    Matrix3x3 product;
    for(int row=0; row<3; row++)
    {
        const float4 aRow = a.m_row[row];
        product.m_row[row] = (float4)(
            dot3F4(aRow, bColumns.m_row[0]),
            dot3F4(aRow, bColumns.m_row[1]),
            dot3F4(aRow, bColumns.m_row[2]),
            0.f);
    }
    return product;
}
// Matrix times column vector: lane k of the result is dot(row k of a, b.xyz); w is 0.
__inline
float4 mtMul1(Matrix3x3 a, float4 b)
{
    return (float4)(
        dot3F4( a.m_row[0], b ),
        dot3F4( a.m_row[1], b ),
        dot3F4( a.m_row[2], b ),
        0.f);
}
// Row vector times matrix: lane k of the result is dot(a.xyz, column k of b).
// The w lane of the result is set to 0, matching mtMul1; previously it was never
// written, so the function returned an indeterminate value in that lane.
__inline
float4 mtMul3(float4 a, Matrix3x3 b)
{
    float4 colx = make_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);
    float4 coly = make_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);
    float4 colz = make_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);
    float4 ans;
    ans.x = dot3F4( a, colx );
    ans.y = dot3F4( a, coly );
    ans.z = dot3F4( a, colz );
    ans.w = 0.f;
    return ans;
}
///////////////////////////////////////
// Quaternion
///////////////////////////////////////
// Quaternion stored in a float4: xyz is the vector part and w the scalar part
// (see qtMul and qtInvert below, which treat the lanes that way).
typedef float4 Quaternion;
// Forward declarations of the quaternion helpers defined directly below.
// Quaternion product a*b.
__inline
Quaternion qtMul(Quaternion a, Quaternion b);
// Normalized copy of in.
__inline
Quaternion qtNormalize(Quaternion in);
// Rotates vec by q.
__inline
float4 qtRotate(Quaternion q, float4 vec);
// Conjugate of q.
__inline
Quaternion qtInvert(Quaternion q);
// Quaternion product a*b:
//   xyz = cross(a.xyz, b.xyz) + a.w*b.xyz + b.w*a.xyz
//   w   = a.w*b.w - dot(a.xyz, b.xyz)
__inline
Quaternion qtMul(Quaternion a, Quaternion b)
{
    Quaternion product = cross3( a, b );
    product += a.w*b+b.w*a;
    // The w lane produced by the line above is discarded and replaced by the scalar part.
    product.w = a.w*b.w - dot3F4(a, b);
    return product;
}
// Returns `in` normalized over all four lanes via fastNormalize4.
__inline
Quaternion qtNormalize(Quaternion in)
{
    Quaternion unitQ = fastNormalize4(in);
    return unitQ;
}
// Rotates vec by q by evaluating q * (vec.xyz, 0) * qtInvert(q).
// The w lane of vec is ignored (forced to zero before the products).
__inline
float4 qtRotate(Quaternion q, float4 vec)
{
    float4 pureVec = vec;
    pureVec.w = 0.f;
    Quaternion conjQ = qtInvert( q );
    return qtMul(qtMul(q, pureVec), conjQ);
}
// Returns the conjugate of q (x, y, z negated, w kept).
// This equals the inverse only when q has unit length.
__inline
Quaternion qtInvert(Quaternion q)
{
    Quaternion conj = q;
    conj.xyz = -q.xyz;
    return conj;
}
// Rotates vec by the conjugate of q, i.e. applies the opposite rotation of qtRotate(q, vec).
__inline
float4 qtInvRotate(const Quaternion q, float4 vec)
{
    const Quaternion conjQ = qtInvert( q );
    return qtRotate( conjQ, vec );
}
// Work-group size constant.
// NOTE(review): none of the kernels visible in this part of the file reference it — confirm it is still needed here.
#define WG_SIZE 64
// Rigid-body state read and updated by the Jacobi solver kernels in this file.
typedef struct
{
// Position; contact arms are computed as contactPoint - m_pos.
float4 m_pos;
Quaternion m_quat;
// Linear / angular velocity; UpdateBodyVelocitiesKernel adds the solved deltas to these.
float4 m_linVel;
float4 m_angVel;
// NOTE(review): the solver functions in this file index gShapes by body index, not by m_shapeIdx — confirm intent.
u32 m_shapeIdx;
// Inverse mass; the kernels skip bodies whose m_invMass is zero.
float m_invMass;
float m_restituitionCoeff;
float m_frictionCoeff;
} Body;
// Inverse inertia data. The solver functions in this file read only m_invInertia.
// NOTE(review): m_initInvInertia is presumably the initial (unrotated) tensor — confirm.
typedef struct
{
Matrix3x3 m_invInertia;
Matrix3x3 m_initInvInertia;
} Shape;
// Solver constraint built from one Contact4: up to four normal rows plus two friction rows.
typedef struct
{
// Constraint direction; setConstraint4 stores the negated contact normal in xyz and 0.7 in w.
float4 m_linear;
// World-space contact points, one per normal row.
float4 m_worldPos[4];
// Point used as the application point of the two friction rows.
float4 m_center;
// Per-row scale applied to the relative velocity; a value of 0 marks the row as unused (solveContact skips it).
float m_jacCoeffInv[4];
// Per-row bias added to the relative velocity before scaling.
float m_b[4];
// Accumulated normal impulse per row; solveContact clamps it to [0, FLT_MAX].
float m_appliedRambdaDt[4];
// Same scale / accumulator pair for the two friction directions.
float m_fJacCoeffInv[2];
float m_fAppliedRambdaDt[2];
// Indices into the body array.
u32 m_bodyA;
u32 m_bodyB;
int m_batchIdx;
u32 m_paddings;
} Constraint4;
// Contact manifold with up to four points.
typedef struct
{
float4 m_worldPos[4];
// xyz: contact normal; w: number of valid points (setConstraint4 compares the point index against it).
float4 m_worldNormal;
u32 m_coeffs;
int m_batchIdx;
// Body index with a sign flag: CountBodiesKernel treats a negative value as a fixed body
// and uses abs() to recover the index.
int m_bodyAPtrAndSignBit;
int m_bodyBPtrAndSignBit;
} Contact4;
// Counts, per body, how many contact manifolds reference it, and records for each manifold
// the slot it obtained on body A (.x) and body B (.y).
//   manifoldPtr              - contact manifolds, one per work-item.
//   bodyCount                - per-body counters, atomically incremented.
//   contactConstraintOffsets - receives the value returned by the atomic increment.
//   numContactManifolds      - number of valid manifolds.
//   fixedBodyIndex           - body index that is treated as fixed in addition to negative indices.
// Fixed bodies are not counted and their offset component is left untouched.
//
// The barrier between the A and B passes is executed by every work-item. It used to sit
// inside the `i < numContactManifolds` branch, which is undefined behaviour in OpenCL when
// the last work-group is only partially in range.
__kernel void CountBodiesKernel(__global Contact4* manifoldPtr, __global unsigned int* bodyCount, __global int2* contactConstraintOffsets, int numContactManifolds, int fixedBodyIndex)
{
    int i = GET_GLOBAL_IDX;
    const bool inRange = ( i < numContactManifolds );

    if( inRange )
    {
        int pa = manifoldPtr[i].m_bodyAPtrAndSignBit;
        bool isFixedA = (pa <0) || (pa == fixedBodyIndex);
        int bodyIndexA = abs(pa);
        if (!isFixedA)
        {
            AtomInc1(bodyCount[bodyIndexA],contactConstraintOffsets[i].x);
        }
    }

    // Must be reached by all work-items of the work-group, in range or not.
    barrier(CLK_GLOBAL_MEM_FENCE);

    if( inRange )
    {
        int pb = manifoldPtr[i].m_bodyBPtrAndSignBit;
        bool isFixedB = (pb <0) || (pb == fixedBodyIndex);
        int bodyIndexB = abs(pb);
        if (!isFixedB)
        {
            AtomInc1(bodyCount[bodyIndexB],contactConstraintOffsets[i].y);
        }
    }
}
// Zeroes the first numSplitBodies entries of both delta-velocity arrays, one entry per work-item.
__kernel void ClearVelocitiesKernel(__global float4* linearVelocities,__global float4* angularVelocities, int numSplitBodies)
{
    const int slot = GET_GLOBAL_IDX;
    if( slot >= numSplitBodies )
        return;

    const float4 zero = make_float4(0);
    linearVelocities[slot] = zero;
    angularVelocities[slot] = zero;
}
// For every body with non-zero inverse mass, replaces each of its bodyCount[i] split
// delta velocities (stored contiguously from offsetSplitBodies[i]) with their average.
// Each entry is scaled by 1/count before being accumulated.
__kernel void AverageVelocitiesKernel(__global Body* gBodies,__global int* offsetSplitBodies,__global const unsigned int* bodyCount,
__global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities, int numBodies)
{
    const int bodyIdx = GET_GLOBAL_IDX;
    if (bodyIdx >= numBodies)
        return;
    if (gBodies[bodyIdx].m_invMass == 0.f)
        return;

    const int firstSlot = offsetSplitBodies[bodyIdx];
    const int numSlots = bodyCount[bodyIdx];
    // With numSlots == 0 this is a division by zero, but the value is never used: both loops are empty.
    const float weight = 1.f/((float)numSlots);

    float4 meanLin = make_float4(0.f);
    float4 meanAng = make_float4(0.f);
    for (int s=0; s<numSlots; s++)
    {
        meanLin += deltaLinearVelocities[firstSlot+s]*weight;
        meanAng += deltaAngularVelocities[firstSlot+s]*weight;
    }

    for (int s=0; s<numSlots; s++)
    {
        deltaLinearVelocities[firstSlot+s] = meanLin;
        deltaAngularVelocities[firstSlot+s] = meanAng;
    }
}
// Builds the Jacobian rows for direction n and contact arms r0 / r1:
//   *linear   = (-n.xyz, 0)
//   *angular0 = -cross3(r0, n)
//   *angular1 =  cross3(r1, n)
void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1)
{
    float4 negatedDir = -n;
    negatedDir.w = 0.f;
    *linear = negatedDir;

    const float4 torqueArm0 = cross3(r0, n);
    *angular0 = -torqueArm0;

    const float4 torqueArm1 = cross3(r1, n);
    *angular1 = torqueArm1;
}
// Relative velocity along a constraint row: the sum of the four xyz dot products
// l0.linVel0 + a0.angVel0 + l1.linVel1 + a1.angVel1, accumulated in that order.
float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 )
{
    float relVel = dot3F4(l0, linVel0) + dot3F4(a0, angVel0);
    relVel = relVel + dot3F4(l1, linVel1);
    relVel = relVel + dot3F4(a1, angVel1);
    return relVel;
}
// Returns -1 / ((invMass0 + a0.I0.a0)*countA + (invMass1 + a1.I1.a1)*countB).
// linear0 / linear1 are not read: they are taken to be normalized, so their contribution
// reduces to the inverse masses. No guard against a zero denominator.
float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,
float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1, float countA, float countB)
{
    const float linTermA = invMass0;
    const float angTermA = dot3F4(mtMul3(angular0,*invInertia0), angular0);
    const float linTermB = invMass1;
    const float angTermB = dot3F4(mtMul3(angular1,*invInertia1), angular1);

    const float denominator = (linTermA+angTermA)*countA+(linTermB+angTermB)*countB;
    return -1.f/denominator;
}
// Builds two vectors *p and *q that, together with n, form a 3D basis: p is chosen
// perpendicular to n inside the y-z or x-y plane (whichever is better conditioned),
// and q = n x p.
//   n    - input direction. NOTE(review): presumably unit length — confirm with callers.
//   p, q - outputs; x, y, z are computed and w is now written as 0.
// Previously the w lanes were never written, so callers passing uninitialized float4s
// (as solveFrictionConstraint does) carried indeterminate values in w.
void btPlaneSpace1 (float4 n, float4* p, float4* q);
void btPlaneSpace1 (float4 n, float4* p, float4* q)
{
    // 0.70710678f is approximately 1/sqrt(2).
    if (fabs(n.z) > 0.70710678f) {
        // choose p in y-z plane
        float a = n.y*n.y + n.z*n.z;
        float k = 1.f/sqrt(a);
        p[0].x = 0;
        p[0].y = -n.z*k;
        p[0].z = n.y*k;
        // set q = n x p
        q[0].x = a*k;
        q[0].y = -n.x*p[0].z;
        q[0].z = n.x*p[0].y;
    }
    else {
        // choose p in x-y plane
        float a = n.x*n.x + n.y*n.y;
        float k = 1.f/sqrt(a);
        p[0].x = -n.y*k;
        p[0].y = n.x*k;
        p[0].z = 0;
        // set q = n x p
        q[0].x = -n.z*p[0].y;
        q[0].y = n.z*p[0].x;
        q[0].z = a*k;
    }
    // Give the unused fourth lane a defined value.
    p[0].w = 0.f;
    q[0].w = 0.f;
}
// Runs one pass over the (up to four) normal rows of constraint cs.
// For each row with a non-zero m_jacCoeffInv: computes the impulse from the current
// velocities (body velocity plus accumulated delta) and the row bias, clamps the
// accumulated impulse to [0, FLT_MAX], stores it back into cs, and adds the resulting
// velocity change to the delta outputs of each body whose inverse mass is non-zero.
// linVelA/angVelA/linVelB/angVelB are only read.
void solveContact(__global Constraint4* cs,
float4 posA, float4* linVelA, float4* angVelA, float invMassA, Matrix3x3 invInertiaA,
float4 posB, float4* linVelB, float4* angVelB, float invMassB, Matrix3x3 invInertiaB,
float4* dLinVelA, float4* dAngVelA, float4* dLinVelB, float4* dAngVelB)
{
    const float lowerLimit = 0;
    const float upperLimit = FLT_MAX;

    for(int row=0; row<4; row++)
    {
        if( cs->m_jacCoeffInv[row] != 0.f )
        {
            const float4 armA = cs->m_worldPos[row] - posA;
            const float4 armB = cs->m_worldPos[row] - posB;

            float4 jacLinear, jacAngularA, jacAngularB;
            setLinearAndAngular( -cs->m_linear, armA, armB, &jacLinear, &jacAngularA, &jacAngularB );

            float impulse = calcRelVel( cs->m_linear, -cs->m_linear, jacAngularA, jacAngularB,
                *linVelA+*dLinVelA, *angVelA+*dAngVelA, *linVelB+*dLinVelB, *angVelB+*dAngVelB ) + cs->m_b[row];
            impulse *= cs->m_jacCoeffInv[row];

            // Clamp the accumulated impulse, then apply only the part that was actually added.
            const float accumulatedBefore = cs->m_appliedRambdaDt[row];
            float accumulatedAfter = accumulatedBefore + impulse;
            accumulatedAfter = max2( accumulatedAfter, lowerLimit );
            accumulatedAfter = min2( accumulatedAfter, upperLimit );
            impulse = accumulatedAfter - accumulatedBefore;
            cs->m_appliedRambdaDt[row] = accumulatedAfter;

            if (invMassA)
            {
                *dLinVelA += invMassA*jacLinear*impulse;
                *dAngVelA += mtMul1(invInertiaA, jacAngularA)*impulse;
            }
            if (invMassB)
            {
                *dLinVelB += invMassB*(-jacLinear)*impulse;
                *dAngVelB += mtMul1(invInertiaB, jacAngularB)*impulse;
            }
        }
    }
}
// solveContactConstraint( gBodies, gShapes, &gConstraints[i] ,contactConstraintOffsets,offsetSplitBodies, deltaLinearVelocities, deltaAngularVelocities);
// Solves the normal rows of one constraint against the split (per-constraint) delta velocities.
// Loads both bodies, reads each body's current delta from its split slot
// (offsetSplitBodies[body] + the constraint's offset), calls solveContact, and writes the
// updated deltas back. Bodies with zero inverse mass start from a zero delta and are not written.
//   ldsCs                    - the constraint to solve (element 0 is used).
//   contactConstraintOffsets - this constraint's slot offsets: .x for body A, .y for body B.
// NOTE(review): gShapes is indexed by body index here, not by Body::m_shapeIdx — confirm.
void solveContactConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs,
__global int2* contactConstraintOffsets,__global unsigned int* offsetSplitBodies,
__global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities)
{
    const int bodyA = ldsCs[0].m_bodyA;
    const int bodyB = ldsCs[0].m_bodyB;

    // Body A state.
    float4 posA = gBodies[bodyA].m_pos;
    float4 linVelA = gBodies[bodyA].m_linVel;
    float4 angVelA = gBodies[bodyA].m_angVel;
    float invMassA = gBodies[bodyA].m_invMass;
    Matrix3x3 invInertiaA = gShapes[bodyA].m_invInertia;

    // Body B state.
    float4 posB = gBodies[bodyB].m_pos;
    float4 linVelB = gBodies[bodyB].m_linVel;
    float4 angVelB = gBodies[bodyB].m_angVel;
    float invMassB = gBodies[bodyB].m_invMass;
    Matrix3x3 invInertiaB = gShapes[bodyB].m_invInertia;

    // Split slots holding this constraint's private delta velocities.
    const int slotA = (int)offsetSplitBodies[bodyA] + contactConstraintOffsets[0].x;
    const int slotB = (int)offsetSplitBodies[bodyB] + contactConstraintOffsets[0].y;

    float4 deltaLinA = make_float4(0,0,0,0);
    float4 deltaAngA = make_float4(0,0,0,0);
    float4 deltaLinB = make_float4(0,0,0,0);
    float4 deltaAngB = make_float4(0,0,0,0);

    if (invMassA)
    {
        deltaLinA = deltaLinearVelocities[slotA];
        deltaAngA = deltaAngularVelocities[slotA];
    }
    if (invMassB)
    {
        deltaLinB = deltaLinearVelocities[slotB];
        deltaAngB = deltaAngularVelocities[slotB];
    }

    solveContact( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA,
        posB, &linVelB, &angVelB, invMassB, invInertiaB ,&deltaLinA, &deltaAngA, &deltaLinB, &deltaAngB);

    if (invMassA)
    {
        deltaLinearVelocities[slotA] = deltaLinA;
        deltaAngularVelocities[slotA] = deltaAngA;
    }
    if (invMassB)
    {
        deltaLinearVelocities[slotB] = deltaLinB;
        deltaAngularVelocities[slotB] = deltaAngB;
    }
}
// One work-item per manifold: solves the normal rows of constraint i via solveContactConstraint.
// deltaTime, positionDrift, positionConstraintCoeff and fixedBodyIndex are accepted but not read here.
__kernel void SolveContactJacobiKernel(__global Constraint4* gConstraints, __global Body* gBodies, __global Shape* gShapes ,
__global int2* contactConstraintOffsets,__global unsigned int* offsetSplitBodies,__global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities,
float deltaTime, float positionDrift, float positionConstraintCoeff, int fixedBodyIndex, int numManifolds
)
{
    const int manifoldIdx = GET_GLOBAL_IDX;
    if (manifoldIdx >= numManifolds)
        return;

    solveContactConstraint( gBodies, gShapes, &gConstraints[manifoldIdx] ,&contactConstraintOffsets[manifoldIdx],offsetSplitBodies, deltaLinearVelocities, deltaAngularVelocities);
}
// Solves the two friction rows of one constraint against the split (per-constraint) delta velocities.
//   ldsCs                    - the constraint to solve (element 0 is used).
//   contactConstraintOffsets - this constraint's slot offsets: .x for body A, .y for body B.
//   offsetSplitBodies        - first split slot of every body.
//   deltaLinearVelocities / deltaAngularVelocities - split delta velocities, read and updated.
// The friction impulse of each tangent direction is clamped to +/- frictionCoeff times the sum
// of the normal impulses accumulated in m_appliedRambdaDt. Bodies with zero inverse mass start
// from a zero delta and are never written back.
//
// Fix: the early-out used to test m_fJacCoeffInv[0] twice, so a constraint whose first friction
// row was disabled but whose second was active was skipped entirely. It now tests both rows.
// NOTE(review): gShapes is indexed by body index here, not by Body::m_shapeIdx — confirm.
void solveFrictionConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs,
__global int2* contactConstraintOffsets,__global unsigned int* offsetSplitBodies,
__global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities)
{
    // Hard-coded; the per-constraint value stored in ldsCs[0].m_linear.w is not read.
    const float frictionCoeff = 0.7f;//ldsCs[0].m_linear.w;
    int aIdx = ldsCs[0].m_bodyA;
    int bIdx = ldsCs[0].m_bodyB;
    float4 posA = gBodies[aIdx].m_pos;
    float4 linVelA = gBodies[aIdx].m_linVel;
    float4 angVelA = gBodies[aIdx].m_angVel;
    float invMassA = gBodies[aIdx].m_invMass;
    Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;
    float4 posB = gBodies[bIdx].m_pos;
    float4 linVelB = gBodies[bIdx].m_linVel;
    float4 angVelB = gBodies[bIdx].m_angVel;
    float invMassB = gBodies[bIdx].m_invMass;
    Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;
    float4 dLinVelA = make_float4(0,0,0,0);
    float4 dAngVelA = make_float4(0,0,0,0);
    float4 dLinVelB = make_float4(0,0,0,0);
    float4 dAngVelB = make_float4(0,0,0,0);

    // Split slots holding this constraint's private delta velocities.
    int bodyOffsetA = offsetSplitBodies[aIdx];
    int constraintOffsetA = contactConstraintOffsets[0].x;
    int splitIndexA = bodyOffsetA+constraintOffsetA;
    if (invMassA)
    {
        dLinVelA = deltaLinearVelocities[splitIndexA];
        dAngVelA = deltaAngularVelocities[splitIndexA];
    }
    int bodyOffsetB = offsetSplitBodies[bIdx];
    int constraintOffsetB = contactConstraintOffsets[0].y;
    int splitIndexB= bodyOffsetB+constraintOffsetB;
    if (invMassB)
    {
        dLinVelB = deltaLinearVelocities[splitIndexB];
        dAngVelB = deltaAngularVelocities[splitIndexB];
    }

    {
        // Friction limit: coefficient times the total accumulated normal impulse.
        float sum = 0;
        for(int j=0; j<4; j++)
        {
            sum +=ldsCs[0].m_appliedRambdaDt[j];
        }
        const float maxRambdaDt = frictionCoeff*sum;
        const float minRambdaDt = -maxRambdaDt;

        {
            __global Constraint4* cs = ldsCs;
            // Nothing to do (and nothing to write back) when both friction rows are disabled.
            if( cs->m_fJacCoeffInv[0] == 0 && cs->m_fJacCoeffInv[1] == 0 ) return;
            const float4 center = cs->m_center;
            float4 n = -cs->m_linear;
            float4 tangent[2];
            btPlaneSpace1(n,&tangent[0],&tangent[1]);
            float4 angular0, angular1, linear;
            float4 r0 = center - posA;
            float4 r1 = center - posB;
            for(int i=0; i<2; i++)
            {
                setLinearAndAngular( tangent[i], r0, r1, &linear, &angular0, &angular1 );
                float rambdaDt = calcRelVel(linear, -linear, angular0, angular1,
                    linVelA+dLinVelA, angVelA+dAngVelA, linVelB+dLinVelB, angVelB+dAngVelB );
                rambdaDt *= cs->m_fJacCoeffInv[i];
                {
                    // Clamp the accumulated impulse, then apply only the part actually added.
                    float prevSum = cs->m_fAppliedRambdaDt[i];
                    float updated = prevSum;
                    updated += rambdaDt;
                    updated = max2( updated, minRambdaDt );
                    updated = min2( updated, maxRambdaDt );
                    rambdaDt = updated - prevSum;
                    cs->m_fAppliedRambdaDt[i] = updated;
                }
                float4 linImp0 = invMassA*linear*rambdaDt;
                float4 linImp1 = invMassB*(-linear)*rambdaDt;
                float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt;
                float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt;
                dLinVelA += linImp0;
                dAngVelA += angImp0;
                dLinVelB += linImp1;
                dAngVelB += angImp1;
            }
            { // angular damping for point constraint
                float4 ab = normalize3( posB - posA );
                float4 ac = normalize3( center - posA );
                if( dot3F4( ab, ac ) > 0.95f || (invMassA == 0.f || invMassB == 0.f))
                {
                    // Remove 10% of each body's spin about the contact normal.
                    float angNA = dot3F4( n, angVelA );
                    float angNB = dot3F4( n, angVelB );
                    dAngVelA -= (angNA*0.1f)*n;
                    dAngVelB -= (angNB*0.1f)*n;
                }
            }
        }
    }

    if (invMassA)
    {
        deltaLinearVelocities[splitIndexA] = dLinVelA;
        deltaAngularVelocities[splitIndexA] = dAngVelA;
    }
    if (invMassB)
    {
        deltaLinearVelocities[splitIndexB] = dLinVelB;
        deltaAngularVelocities[splitIndexB] = dAngVelB;
    }
}
// One work-item per manifold: solves the friction rows of constraint i via solveFrictionConstraint.
// deltaTime, positionDrift, positionConstraintCoeff and fixedBodyIndex are accepted but not read here.
__kernel void SolveFrictionJacobiKernel(__global Constraint4* gConstraints, __global Body* gBodies, __global Shape* gShapes ,
__global int2* contactConstraintOffsets,__global unsigned int* offsetSplitBodies,
__global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities,
float deltaTime, float positionDrift, float positionConstraintCoeff, int fixedBodyIndex, int numManifolds
)
{
    const int manifoldIdx = GET_GLOBAL_IDX;
    if (manifoldIdx >= numManifolds)
        return;

    solveFrictionConstraint( gBodies, gShapes, &gConstraints[manifoldIdx] ,&contactConstraintOffsets[manifoldIdx],offsetSplitBodies, deltaLinearVelocities, deltaAngularVelocities);
}
// Applies the solved delta velocities back onto the rigid bodies, one work-item per body.
//
// A body is updated only if its inverse mass is non-zero and bodyCount[] reports
// at least one contact for it. Only the first split slot of the body
// (index offsetSplitBodies[i]) is read.
// NOTE(review): presumably every split slot of a body already holds the same
// averaged value at this point (see AverageVelocitiesKernel) - confirm against the host-side launch order.
__kernel void UpdateBodyVelocitiesKernel(__global Body* gBodies,__global int* offsetSplitBodies,__global const unsigned int* bodyCount,
	__global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities, int numBodies)
{
	const int bodyIdx = GET_GLOBAL_IDX;

	// Out-of-range work-item.
	if (bodyIdx >= numBodies)
		return;

	// Bodies with zero inverse mass never receive a velocity change.
	if (gBodies[bodyIdx].m_invMass == 0.f)
		return;

	const int firstSlot = offsetSplitBodies[bodyIdx];
	const int numContacts = bodyCount[bodyIdx];

	// No contacts touched this body, so there is no delta to apply.
	if (numContacts == 0)
		return;

	gBodies[bodyIdx].m_linVel += deltaLinearVelocities[firstSlot];
	gBodies[bodyIdx].m_angVel += deltaAngularVelocities[firstSlot];
}
// Converts one contact manifold (*src) into a solver constraint (*dstC):
// up to four contact rows along the contact normal plus two friction rows
// evaluated at the average of the contact points.
//
//  posA, linVelA, angVelA, invMassA, invInertiaA   state of body A
//  posB, linVelB, angVelB, invMassB, invInertiaB   state of body B
//  src                      source manifold (read-only). src->m_worldNormal.w is used as the
//                           number of contact points; src->m_worldPos[i].w enters the bias term.
//  dt                       time step; only 1/dt is used
//  positionDrift,
//  positionConstraintCoeff  bias = (worldPos.w + positionDrift) * positionConstraintCoeff / dt
//  countA, countB           per-body weights forwarded unchanged to calcJacCoeff()
//  dstC                     output constraint. m_batchIdx is NOT written here; the caller sets it.
//
// Every row field that the per-point and friction code below may leave untouched is
// cleared up front. The caller copies the whole struct to global memory, so this
// keeps unused contact slots and zero-point manifolds from publishing uninitialised
// private memory.
void setConstraint4( const float4 posA, const float4 linVelA, const float4 angVelA, float invMassA, const Matrix3x3 invInertiaA,
	const float4 posB, const float4 linVelB, const float4 angVelB, float invMassB, const Matrix3x3 invInertiaB,
	__global const Contact4* src, float dt, float positionDrift, float positionConstraintCoeff,float countA, float countB,
	Constraint4* dstC )
{
	// The manifold stores its contact point count in the w component of the normal.
	const float numPoints = src->m_worldNormal.w;
	const float dtInv = 1.f/dt;

	dstC->m_bodyA = abs(src->m_bodyAPtrAndSignBit);
	dstC->m_bodyB = abs(src->m_bodyBPtrAndSignBit);

	// Clear all contact rows, friction rows and the friction anchor.
	for(int ic=0; ic<4; ic++)
	{
		dstC->m_jacCoeffInv[ic] = 0.f;
		dstC->m_b[ic] = 0.f;
		dstC->m_appliedRambdaDt[ic] = 0.f;
	}
	for(int i=0; i<2; i++)
	{
		dstC->m_fJacCoeffInv[i] = 0.f;
		dstC->m_fAppliedRambdaDt[i] = 0.f;
	}
	dstC->m_center = make_float4(0.f);

	dstC->m_linear = -src->m_worldNormal;
	dstC->m_linear.w = 0.7f ;//src->getFrictionCoeff() );

	// Contact rows: one per valid contact point.
	for(int ic=0; ic<4; ic++)
	{
		// Unused slot: keep the zeroed row (m_jacCoeffInv == 0).
		if( ic >= numPoints )//npoints
			continue;

		const float4 r0 = src->m_worldPos[ic] - posA;
		const float4 r1 = src->m_worldPos[ic] - posB;

		float4 linear, angular0, angular1;
		setLinearAndAngular(src->m_worldNormal, r0, r1, &linear, &angular0, &angular1);

		dstC->m_jacCoeffInv[ic] = calcJacCoeff(linear, -linear, angular0, angular1,
			invMassA, &invInertiaA, invMassB, &invInertiaB , countA, countB);

		const float relVelN = calcRelVel(linear, -linear, angular0, angular1,
			linVelA, angVelA, linVelB, angVelB);

		// Restitution is currently hard-wired to zero, so this term contributes 0*relVelN.
		const float e = 0.f;//src->getRestituitionCoeff();
		dstC->m_b[ic] = e*relVelN;
		//float penetration = src->m_worldPos[ic].w;
		dstC->m_b[ic] += (src->m_worldPos[ic].w + positionDrift)*positionConstraintCoeff*dtInv;
	}

	if( numPoints > 0 )//npoints
	{	// prepare friction
		// Friction acts at the average of the contact points.
		float4 center = make_float4(0.f);
		for(int i=0; i<numPoints; i++)
			center += src->m_worldPos[i];
		center /= numPoints;

		float4 tangent[2];
		btPlaneSpace1(src->m_worldNormal,&tangent[0],&tangent[1]);

		float4 r[2];
		r[0] = center - posA;
		r[1] = center - posB;

		for(int i=0; i<2; i++)
		{
			float4 linear, angular0, angular1;
			setLinearAndAngular(tangent[i], r[0], r[1], &linear, &angular0, &angular1);

			dstC->m_fJacCoeffInv[i] = calcJacCoeff(linear, -linear, angular0, angular1,
				invMassA, &invInertiaA, invMassB, &invInertiaB ,countA, countB);
		}
		dstC->m_center = center;
	}

	// Copy the valid contact points and zero the remaining slots.
	for(int i=0; i<4; i++)
	{
		if( i<numPoints )
		{
			dstC->m_worldPos[i] = src->m_worldPos[i];
		}
		else
		{
			dstC->m_worldPos[i] = make_float4(0.f);
		}
	}
}
// Builds one solver constraint per contact manifold (one work-item each) for the
// split-impulse Jacobi solver.
//
//  gContact        input manifolds; body indices are taken as abs() of the stored signed values
//  gBodies         rigid body state, indexed by body
//  gShapes         inertia data, indexed here by the same body index
//  gConstraintOut  output, one Constraint4 per manifold
//  bodyCount       per-body count, used as the weight passed to setConstraint4()
//                  for bodies with non-zero inverse mass (weight 1 otherwise)
//  nContacts       number of valid manifolds
//  dt, positionDrift, positionConstraintCoeff   forwarded to setConstraint4()
__kernel
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
void ContactToConstraintSplitKernel(__global const Contact4* gContact, __global const Body* gBodies, __global const Shape* gShapes, __global Constraint4* gConstraintOut,
__global const unsigned int* bodyCount,
int nContacts,
float dt,
float positionDrift,
float positionConstraintCoeff
)
{
	const int contactIdx = GET_GLOBAL_IDX;

	// Out-of-range work-item.
	if( contactIdx >= nContacts )
		return;

	const int bodyA = abs(gContact[contactIdx].m_bodyAPtrAndSignBit);
	const int bodyB = abs(gContact[contactIdx].m_bodyBPtrAndSignBit);

	// Snapshot of body A.
	const float4 positionA = gBodies[bodyA].m_pos;
	const float4 linearVelA = gBodies[bodyA].m_linVel;
	const float4 angularVelA = gBodies[bodyA].m_angVel;
	const float inverseMassA = gBodies[bodyA].m_invMass;
	const Matrix3x3 inverseInertiaA = gShapes[bodyA].m_invInertia;

	// Snapshot of body B.
	const float4 positionB = gBodies[bodyB].m_pos;
	const float4 linearVelB = gBodies[bodyB].m_linVel;
	const float4 angularVelB = gBodies[bodyB].m_angVel;
	const float inverseMassB = gBodies[bodyB].m_invMass;
	const Matrix3x3 inverseInertiaB = gShapes[bodyB].m_invInertia;

	// Bodies with zero inverse mass get weight 1; the others are weighted by bodyCount[].
	float weightA = 1;
	if (inverseMassA != 0.f)
		weightA = (float)bodyCount[bodyA];

	float weightB = 1;
	if (inverseMassB != 0.f)
		weightB = (float)bodyCount[bodyB];

	Constraint4 built;
	setConstraint4( positionA, linearVelA, angularVelA, inverseMassA, inverseInertiaA, positionB, linearVelB, angularVelB, inverseMassB, inverseInertiaB,
		&gContact[contactIdx], dt, positionDrift, positionConstraintCoeff,weightA,weightB,
		&built );

	// setConstraint4() does not fill in the batch index; carry it over from the manifold.
	built.m_batchIdx = gContact[contactIdx].m_batchIdx;

	gConstraintOut[contactIdx] = built;
}

View File

@@ -0,0 +1,974 @@
// This file is autogenerated by stringify.bat (premake --stringify) in the build folder of this project. Do not edit it by hand: change the OpenCL source it was generated from and regenerate.
static const char* solverUtilsCL= \
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"\n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"\n"
"\n"
"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
"\n"
"\n"
"#ifdef cl_ext_atomic_counters_32\n"
"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
"#else\n"
"#define counter32_t volatile global int*\n"
"#endif\n"
"\n"
"typedef unsigned int u32;\n"
"typedef unsigned short u16;\n"
"typedef unsigned char u8;\n"
"\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GET_NUM_GROUPS get_num_groups(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
"#define AtomInc(x) atom_inc(&(x))\n"
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
"#define AppendInc(x, out) out = atomic_inc(x)\n"
"#define AtomAdd(x, value) atom_add(&(x), value)\n"
"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
"\n"
"\n"
"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
"\n"
"#define make_float4 (float4)\n"
"#define make_float2 (float2)\n"
"#define make_uint4 (uint4)\n"
"#define make_int4 (int4)\n"
"#define make_uint2 (uint2)\n"
"#define make_int2 (int2)\n"
"\n"
"\n"
"#define max2 max\n"
"#define min2 min\n"
"\n"
"\n"
"///////////////////////////////////////\n"
"// Vector\n"
"///////////////////////////////////////\n"
"__inline\n"
"float fastDiv(float numerator, float denominator)\n"
"{\n"
" return native_divide(numerator, denominator); \n"
"// return numerator/denominator; \n"
"}\n"
"\n"
"__inline\n"
"float4 fastDiv4(float4 numerator, float4 denominator)\n"
"{\n"
" return native_divide(numerator, denominator); \n"
"}\n"
"\n"
"__inline\n"
"float fastSqrtf(float f2)\n"
"{\n"
" return native_sqrt(f2);\n"
"// return sqrt(f2);\n"
"}\n"
"\n"
"__inline\n"
"float fastRSqrt(float f2)\n"
"{\n"
" return native_rsqrt(f2);\n"
"}\n"
"\n"
"__inline\n"
"float fastLength4(float4 v)\n"
"{\n"
" return fast_length(v);\n"
"}\n"
"\n"
"__inline\n"
"float4 fastNormalize4(float4 v)\n"
"{\n"
" return fast_normalize(v);\n"
"}\n"
"\n"
"\n"
"__inline\n"
"float sqrtf(float a)\n"
"{\n"
"// return sqrt(a);\n"
" return native_sqrt(a);\n"
"}\n"
"\n"
"__inline\n"
"float4 cross3(float4 a1, float4 b1)\n"
"{\n"
"\n"
" float4 a=make_float4(a1.xyz,0.f);\n"
" float4 b=make_float4(b1.xyz,0.f);\n"
" //float4 a=a1;\n"
" //float4 b=b1;\n"
" return cross(a,b);\n"
"}\n"
"\n"
"__inline\n"
"float dot3F4(float4 a, float4 b)\n"
"{\n"
" float4 a1 = make_float4(a.xyz,0.f);\n"
" float4 b1 = make_float4(b.xyz,0.f);\n"
" return dot(a1, b1);\n"
"}\n"
"\n"
"__inline\n"
"float length3(const float4 a)\n"
"{\n"
" return sqrtf(dot3F4(a,a));\n"
"}\n"
"\n"
"__inline\n"
"float dot4(const float4 a, const float4 b)\n"
"{\n"
" return dot( a, b );\n"
"}\n"
"\n"
"// for height\n"
"__inline\n"
"float dot3w1(const float4 point, const float4 eqn)\n"
"{\n"
" return dot3F4(point,eqn) + eqn.w;\n"
"}\n"
"\n"
"__inline\n"
"float4 normalize3(const float4 a)\n"
"{\n"
" float4 n = make_float4(a.x, a.y, a.z, 0.f);\n"
" return fastNormalize4( n );\n"
"// float length = sqrtf(dot3F4(a, a));\n"
"// return 1.f/length * a;\n"
"}\n"
"\n"
"__inline\n"
"float4 normalize4(const float4 a)\n"
"{\n"
" float length = sqrtf(dot4(a, a));\n"
" return 1.f/length * a;\n"
"}\n"
"\n"
"__inline\n"
"float4 createEquation(const float4 a, const float4 b, const float4 c)\n"
"{\n"
" float4 eqn;\n"
" float4 ab = b-a;\n"
" float4 ac = c-a;\n"
" eqn = normalize3( cross3(ab, ac) );\n"
" eqn.w = -dot3F4(eqn,a);\n"
" return eqn;\n"
"}\n"
"\n"
"///////////////////////////////////////\n"
"// Matrix3x3\n"
"///////////////////////////////////////\n"
"\n"
"typedef struct\n"
"{\n"
" float4 m_row[3];\n"
"}Matrix3x3;\n"
"\n"
"__inline\n"
"Matrix3x3 mtZero();\n"
"\n"
"__inline\n"
"Matrix3x3 mtIdentity();\n"
"\n"
"__inline\n"
"Matrix3x3 mtTranspose(Matrix3x3 m);\n"
"\n"
"__inline\n"
"Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b);\n"
"\n"
"__inline\n"
"float4 mtMul1(Matrix3x3 a, float4 b);\n"
"\n"
"__inline\n"
"float4 mtMul3(float4 a, Matrix3x3 b);\n"
"\n"
"__inline\n"
"Matrix3x3 mtZero()\n"
"{\n"
" Matrix3x3 m;\n"
" m.m_row[0] = (float4)(0.f);\n"
" m.m_row[1] = (float4)(0.f);\n"
" m.m_row[2] = (float4)(0.f);\n"
" return m;\n"
"}\n"
"\n"
"__inline\n"
"Matrix3x3 mtIdentity()\n"
"{\n"
" Matrix3x3 m;\n"
" m.m_row[0] = (float4)(1,0,0,0);\n"
" m.m_row[1] = (float4)(0,1,0,0);\n"
" m.m_row[2] = (float4)(0,0,1,0);\n"
" return m;\n"
"}\n"
"\n"
"__inline\n"
"Matrix3x3 mtTranspose(Matrix3x3 m)\n"
"{\n"
" Matrix3x3 out;\n"
" out.m_row[0] = (float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n"
" out.m_row[1] = (float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n"
" out.m_row[2] = (float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n"
" return out;\n"
"}\n"
"\n"
"__inline\n"
"Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b)\n"
"{\n"
" Matrix3x3 transB;\n"
" transB = mtTranspose( b );\n"
" Matrix3x3 ans;\n"
" // why this doesn't run when 0ing in the for{}\n"
" a.m_row[0].w = 0.f;\n"
" a.m_row[1].w = 0.f;\n"
" a.m_row[2].w = 0.f;\n"
" for(int i=0; i<3; i++)\n"
" {\n"
"// a.m_row[i].w = 0.f;\n"
" ans.m_row[i].x = dot3F4(a.m_row[i],transB.m_row[0]);\n"
" ans.m_row[i].y = dot3F4(a.m_row[i],transB.m_row[1]);\n"
" ans.m_row[i].z = dot3F4(a.m_row[i],transB.m_row[2]);\n"
" ans.m_row[i].w = 0.f;\n"
" }\n"
" return ans;\n"
"}\n"
"\n"
"__inline\n"
"float4 mtMul1(Matrix3x3 a, float4 b)\n"
"{\n"
" float4 ans;\n"
" ans.x = dot3F4( a.m_row[0], b );\n"
" ans.y = dot3F4( a.m_row[1], b );\n"
" ans.z = dot3F4( a.m_row[2], b );\n"
" ans.w = 0.f;\n"
" return ans;\n"
"}\n"
"\n"
"__inline\n"
"float4 mtMul3(float4 a, Matrix3x3 b)\n"
"{\n"
" float4 colx = make_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n"
" float4 coly = make_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
" float4 colz = make_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
"\n"
" float4 ans;\n"
" ans.x = dot3F4( a, colx );\n"
" ans.y = dot3F4( a, coly );\n"
" ans.z = dot3F4( a, colz );\n"
" return ans;\n"
"}\n"
"\n"
"///////////////////////////////////////\n"
"// Quaternion\n"
"///////////////////////////////////////\n"
"\n"
"typedef float4 Quaternion;\n"
"\n"
"__inline\n"
"Quaternion qtMul(Quaternion a, Quaternion b);\n"
"\n"
"__inline\n"
"Quaternion qtNormalize(Quaternion in);\n"
"\n"
"__inline\n"
"float4 qtRotate(Quaternion q, float4 vec);\n"
"\n"
"__inline\n"
"Quaternion qtInvert(Quaternion q);\n"
"\n"
"\n"
"\n"
"\n"
"\n"
"__inline\n"
"Quaternion qtMul(Quaternion a, Quaternion b)\n"
"{\n"
" Quaternion ans;\n"
" ans = cross3( a, b );\n"
" ans += a.w*b+b.w*a;\n"
"// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n"
" ans.w = a.w*b.w - dot3F4(a, b);\n"
" return ans;\n"
"}\n"
"\n"
"__inline\n"
"Quaternion qtNormalize(Quaternion in)\n"
"{\n"
" return fastNormalize4(in);\n"
"// in /= length( in );\n"
"// return in;\n"
"}\n"
"__inline\n"
"float4 qtRotate(Quaternion q, float4 vec)\n"
"{\n"
" Quaternion qInv = qtInvert( q );\n"
" float4 vcpy = vec;\n"
" vcpy.w = 0.f;\n"
" float4 out = qtMul(qtMul(q,vcpy),qInv);\n"
" return out;\n"
"}\n"
"\n"
"__inline\n"
"Quaternion qtInvert(Quaternion q)\n"
"{\n"
" return (Quaternion)(-q.xyz, q.w);\n"
"}\n"
"\n"
"__inline\n"
"float4 qtInvRotate(const Quaternion q, float4 vec)\n"
"{\n"
" return qtRotate( qtInvert( q ), vec );\n"
"}\n"
"\n"
"\n"
"\n"
"\n"
"#define WG_SIZE 64\n"
"\n"
"typedef struct\n"
"{\n"
" float4 m_pos;\n"
" Quaternion m_quat;\n"
" float4 m_linVel;\n"
" float4 m_angVel;\n"
"\n"
" u32 m_shapeIdx;\n"
" float m_invMass;\n"
" float m_restituitionCoeff;\n"
" float m_frictionCoeff;\n"
"} Body;\n"
"\n"
"typedef struct\n"
"{\n"
" Matrix3x3 m_invInertia;\n"
" Matrix3x3 m_initInvInertia;\n"
"} Shape;\n"
"\n"
"typedef struct\n"
"{\n"
" float4 m_linear;\n"
" float4 m_worldPos[4];\n"
" float4 m_center; \n"
" float m_jacCoeffInv[4];\n"
" float m_b[4];\n"
" float m_appliedRambdaDt[4];\n"
"\n"
" float m_fJacCoeffInv[2]; \n"
" float m_fAppliedRambdaDt[2]; \n"
"\n"
" u32 m_bodyA;\n"
" u32 m_bodyB;\n"
" int m_batchIdx;\n"
" u32 m_paddings;\n"
"} Constraint4;\n"
"\n"
"typedef struct\n"
"{\n"
" float4 m_worldPos[4];\n"
" float4 m_worldNormal;\n"
" u32 m_coeffs;\n"
" int m_batchIdx;\n"
"\n"
" int m_bodyAPtrAndSignBit;\n"
" int m_bodyBPtrAndSignBit;\n"
"} Contact4;\n"
"\n"
"\n"
"__kernel void CountBodiesKernel(__global Contact4* manifoldPtr, __global unsigned int* bodyCount, __global int2* contactConstraintOffsets, int numContactManifolds, int fixedBodyIndex)\n"
"{\n"
" int i = GET_GLOBAL_IDX;\n"
" \n"
" if( i < numContactManifolds)\n"
" {\n"
" int pa = manifoldPtr[i].m_bodyAPtrAndSignBit;\n"
" bool isFixedA = (pa <0) || (pa == fixedBodyIndex);\n"
" int bodyIndexA = abs(pa);\n"
" if (!isFixedA)\n"
" {\n"
" AtomInc1(bodyCount[bodyIndexA],contactConstraintOffsets[i].x);\n"
" }\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" int pb = manifoldPtr[i].m_bodyBPtrAndSignBit;\n"
" bool isFixedB = (pb <0) || (pb == fixedBodyIndex);\n"
" int bodyIndexB = abs(pb);\n"
" if (!isFixedB)\n"
" {\n"
" AtomInc1(bodyCount[bodyIndexB],contactConstraintOffsets[i].y);\n"
" } \n"
" }\n"
"}\n"
"\n"
"__kernel void ClearVelocitiesKernel(__global float4* linearVelocities,__global float4* angularVelocities, int numSplitBodies)\n"
"{\n"
" int i = GET_GLOBAL_IDX;\n"
" \n"
" if( i < numSplitBodies)\n"
" {\n"
" linearVelocities[i] = make_float4(0);\n"
" angularVelocities[i] = make_float4(0);\n"
" }\n"
"}\n"
"\n"
"\n"
"__kernel void AverageVelocitiesKernel(__global Body* gBodies,__global int* offsetSplitBodies,__global const unsigned int* bodyCount,\n"
"__global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities, int numBodies)\n"
"{\n"
" int i = GET_GLOBAL_IDX;\n"
" if (i<numBodies)\n"
" {\n"
" if (gBodies[i].m_invMass)\n"
" {\n"
" int bodyOffset = offsetSplitBodies[i];\n"
" int count = bodyCount[i];\n"
" float factor = 1.f/((float)count);\n"
" float4 averageLinVel = make_float4(0.f);\n"
" float4 averageAngVel = make_float4(0.f);\n"
" \n"
" for (int j=0;j<count;j++)\n"
" {\n"
" averageLinVel += deltaLinearVelocities[bodyOffset+j]*factor;\n"
" averageAngVel += deltaAngularVelocities[bodyOffset+j]*factor;\n"
" }\n"
" \n"
" for (int j=0;j<count;j++)\n"
" {\n"
" deltaLinearVelocities[bodyOffset+j] = averageLinVel;\n"
" deltaAngularVelocities[bodyOffset+j] = averageAngVel;\n"
" }\n"
" \n"
" }//bodies[i].m_invMass\n"
" }//i<numBodies\n"
"}\n"
"\n"
"\n"
"\n"
"void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1)\n"
"{\n"
" *linear = make_float4(-n.xyz,0.f);\n"
" *angular0 = -cross3(r0, n);\n"
" *angular1 = cross3(r1, n);\n"
"}\n"
"\n"
"\n"
"float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 )\n"
"{\n"
" return dot3F4(l0, linVel0) + dot3F4(a0, angVel0) + dot3F4(l1, linVel1) + dot3F4(a1, angVel1);\n"
"}\n"
"\n"
"\n"
"float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n"
" float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1, float countA, float countB)\n"
"{\n"
" // linear0,1 are normlized\n"
" float jmj0 = invMass0;//dot3F4(linear0, linear0)*invMass0;\n"
" float jmj1 = dot3F4(mtMul3(angular0,*invInertia0), angular0);\n"
" float jmj2 = invMass1;//dot3F4(linear1, linear1)*invMass1;\n"
" float jmj3 = dot3F4(mtMul3(angular1,*invInertia1), angular1);\n"
" return -1.f/((jmj0+jmj1)*countA+(jmj2+jmj3)*countB);\n"
"}\n"
"\n"
"\n"
"void b3PlaneSpace1 (float4 n, float4* p, float4* q);\n"
" void b3PlaneSpace1 (float4 n, float4* p, float4* q)\n"
"{\n"
" if (fabs(n.z) > 0.70710678f) {\n"
" // choose p in y-z plane\n"
" float a = n.y*n.y + n.z*n.z;\n"
" float k = 1.f/sqrt(a);\n"
" p[0].x = 0;\n"
" p[0].y = -n.z*k;\n"
" p[0].z = n.y*k;\n"
" // set q = n x p\n"
" q[0].x = a*k;\n"
" q[0].y = -n.x*p[0].z;\n"
" q[0].z = n.x*p[0].y;\n"
" }\n"
" else {\n"
" // choose p in x-y plane\n"
" float a = n.x*n.x + n.y*n.y;\n"
" float k = 1.f/sqrt(a);\n"
" p[0].x = -n.y*k;\n"
" p[0].y = n.x*k;\n"
" p[0].z = 0;\n"
" // set q = n x p\n"
" q[0].x = -n.z*p[0].y;\n"
" q[0].y = n.z*p[0].x;\n"
" q[0].z = a*k;\n"
" }\n"
"}\n"
"\n"
"\n"
"\n"
"\n"
"\n"
"void solveContact(__global Constraint4* cs,\n"
" float4 posA, float4* linVelA, float4* angVelA, float invMassA, Matrix3x3 invInertiaA,\n"
" float4 posB, float4* linVelB, float4* angVelB, float invMassB, Matrix3x3 invInertiaB,\n"
" float4* dLinVelA, float4* dAngVelA, float4* dLinVelB, float4* dAngVelB)\n"
"{\n"
" float minRambdaDt = 0;\n"
" float maxRambdaDt = FLT_MAX;\n"
"\n"
" for(int ic=0; ic<4; ic++)\n"
" {\n"
" if( cs->m_jacCoeffInv[ic] == 0.f ) continue;\n"
"\n"
" float4 angular0, angular1, linear;\n"
" float4 r0 = cs->m_worldPos[ic] - posA;\n"
" float4 r1 = cs->m_worldPos[ic] - posB;\n"
" setLinearAndAngular( -cs->m_linear, r0, r1, &linear, &angular0, &angular1 );\n"
" \n"
"\n"
"\n"
" float rambdaDt = calcRelVel( cs->m_linear, -cs->m_linear, angular0, angular1, \n"
" *linVelA+*dLinVelA, *angVelA+*dAngVelA, *linVelB+*dLinVelB, *angVelB+*dAngVelB ) + cs->m_b[ic];\n"
" rambdaDt *= cs->m_jacCoeffInv[ic];\n"
"\n"
" \n"
" {\n"
" float prevSum = cs->m_appliedRambdaDt[ic];\n"
" float updated = prevSum;\n"
" updated += rambdaDt;\n"
" updated = max2( updated, minRambdaDt );\n"
" updated = min2( updated, maxRambdaDt );\n"
" rambdaDt = updated - prevSum;\n"
" cs->m_appliedRambdaDt[ic] = updated;\n"
" }\n"
"\n"
" \n"
" float4 linImp0 = invMassA*linear*rambdaDt;\n"
" float4 linImp1 = invMassB*(-linear)*rambdaDt;\n"
" float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt;\n"
" float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt;\n"
"\n"
" \n"
" if (invMassA)\n"
" {\n"
" *dLinVelA += linImp0;\n"
" *dAngVelA += angImp0;\n"
" }\n"
" if (invMassB)\n"
" {\n"
" *dLinVelB += linImp1;\n"
" *dAngVelB += angImp1;\n"
" }\n"
" }\n"
"}\n"
"\n"
"\n"
"// solveContactConstraint( gBodies, gShapes, &gConstraints[i] ,contactConstraintOffsets,offsetSplitBodies, deltaLinearVelocities, deltaAngularVelocities);\n"
"\n"
"\n"
"void solveContactConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs, \n"
"__global int2* contactConstraintOffsets,__global unsigned int* offsetSplitBodies,\n"
"__global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities)\n"
"{\n"
"\n"
" //float frictionCoeff = ldsCs[0].m_linear.w;\n"
" int aIdx = ldsCs[0].m_bodyA;\n"
" int bIdx = ldsCs[0].m_bodyB;\n"
"\n"
" float4 posA = gBodies[aIdx].m_pos;\n"
" float4 linVelA = gBodies[aIdx].m_linVel;\n"
" float4 angVelA = gBodies[aIdx].m_angVel;\n"
" float invMassA = gBodies[aIdx].m_invMass;\n"
" Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n"
"\n"
" float4 posB = gBodies[bIdx].m_pos;\n"
" float4 linVelB = gBodies[bIdx].m_linVel;\n"
" float4 angVelB = gBodies[bIdx].m_angVel;\n"
" float invMassB = gBodies[bIdx].m_invMass;\n"
" Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n"
"\n"
" \n"
" float4 dLinVelA = make_float4(0,0,0,0);\n"
" float4 dAngVelA = make_float4(0,0,0,0);\n"
" float4 dLinVelB = make_float4(0,0,0,0);\n"
" float4 dAngVelB = make_float4(0,0,0,0);\n"
" \n"
" int bodyOffsetA = offsetSplitBodies[aIdx];\n"
" int constraintOffsetA = contactConstraintOffsets[0].x;\n"
" int splitIndexA = bodyOffsetA+constraintOffsetA;\n"
" \n"
" if (invMassA)\n"
" {\n"
" dLinVelA = deltaLinearVelocities[splitIndexA];\n"
" dAngVelA = deltaAngularVelocities[splitIndexA];\n"
" }\n"
"\n"
" int bodyOffsetB = offsetSplitBodies[bIdx];\n"
" int constraintOffsetB = contactConstraintOffsets[0].y;\n"
" int splitIndexB= bodyOffsetB+constraintOffsetB;\n"
"\n"
" if (invMassB)\n"
" {\n"
" dLinVelB = deltaLinearVelocities[splitIndexB];\n"
" dAngVelB = deltaAngularVelocities[splitIndexB];\n"
" }\n"
"\n"
" solveContact( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA,\n"
" posB, &linVelB, &angVelB, invMassB, invInertiaB ,&dLinVelA, &dAngVelA, &dLinVelB, &dAngVelB);\n"
"\n"
" if (invMassA)\n"
" {\n"
" deltaLinearVelocities[splitIndexA] = dLinVelA;\n"
" deltaAngularVelocities[splitIndexA] = dAngVelA;\n"
" } \n"
" if (invMassB)\n"
" {\n"
" deltaLinearVelocities[splitIndexB] = dLinVelB;\n"
" deltaAngularVelocities[splitIndexB] = dAngVelB;\n"
" }\n"
"\n"
"}\n"
"\n"
"\n"
"__kernel void SolveContactJacobiKernel(__global Constraint4* gConstraints, __global Body* gBodies, __global Shape* gShapes ,\n"
"__global int2* contactConstraintOffsets,__global unsigned int* offsetSplitBodies,__global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities,\n"
"float deltaTime, float positionDrift, float positionConstraintCoeff, int fixedBodyIndex, int numManifolds\n"
")\n"
"{\n"
" int i = GET_GLOBAL_IDX;\n"
" if (i<numManifolds)\n"
" {\n"
" solveContactConstraint( gBodies, gShapes, &gConstraints[i] ,&contactConstraintOffsets[i],offsetSplitBodies, deltaLinearVelocities, deltaAngularVelocities);\n"
" }\n"
"}\n"
"\n"
"\n"
"\n"
"\n"
"void solveFrictionConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs,\n"
" __global int2* contactConstraintOffsets,__global unsigned int* offsetSplitBodies,\n"
" __global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities)\n"
"{\n"
" float frictionCoeff = 0.7f;//ldsCs[0].m_linear.w;\n"
" int aIdx = ldsCs[0].m_bodyA;\n"
" int bIdx = ldsCs[0].m_bodyB;\n"
"\n"
"\n"
" float4 posA = gBodies[aIdx].m_pos;\n"
" float4 linVelA = gBodies[aIdx].m_linVel;\n"
" float4 angVelA = gBodies[aIdx].m_angVel;\n"
" float invMassA = gBodies[aIdx].m_invMass;\n"
" Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n"
"\n"
" float4 posB = gBodies[bIdx].m_pos;\n"
" float4 linVelB = gBodies[bIdx].m_linVel;\n"
" float4 angVelB = gBodies[bIdx].m_angVel;\n"
" float invMassB = gBodies[bIdx].m_invMass;\n"
" Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n"
" \n"
"\n"
" float4 dLinVelA = make_float4(0,0,0,0);\n"
" float4 dAngVelA = make_float4(0,0,0,0);\n"
" float4 dLinVelB = make_float4(0,0,0,0);\n"
" float4 dAngVelB = make_float4(0,0,0,0);\n"
" \n"
" int bodyOffsetA = offsetSplitBodies[aIdx];\n"
" int constraintOffsetA = contactConstraintOffsets[0].x;\n"
" int splitIndexA = bodyOffsetA+constraintOffsetA;\n"
" \n"
" if (invMassA)\n"
" {\n"
" dLinVelA = deltaLinearVelocities[splitIndexA];\n"
" dAngVelA = deltaAngularVelocities[splitIndexA];\n"
" }\n"
"\n"
" int bodyOffsetB = offsetSplitBodies[bIdx];\n"
" int constraintOffsetB = contactConstraintOffsets[0].y;\n"
" int splitIndexB= bodyOffsetB+constraintOffsetB;\n"
"\n"
" if (invMassB)\n"
" {\n"
" dLinVelB = deltaLinearVelocities[splitIndexB];\n"
" dAngVelB = deltaAngularVelocities[splitIndexB];\n"
" }\n"
"\n"
"\n"
"\n"
"\n"
" {\n"
" float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};\n"
" float minRambdaDt[4] = {0.f,0.f,0.f,0.f};\n"
"\n"
" float sum = 0;\n"
" for(int j=0; j<4; j++)\n"
" {\n"
" sum +=ldsCs[0].m_appliedRambdaDt[j];\n"
" }\n"
" frictionCoeff = 0.7f;\n"
" for(int j=0; j<4; j++)\n"
" {\n"
" maxRambdaDt[j] = frictionCoeff*sum;\n"
" minRambdaDt[j] = -maxRambdaDt[j];\n"
" }\n"
"\n"
" \n"
"// solveFriction( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA,\n"
"// posB, &linVelB, &angVelB, invMassB, invInertiaB, maxRambdaDt, minRambdaDt );\n"
" \n"
" \n"
" {\n"
" \n"
" __global Constraint4* cs = ldsCs;\n"
" \n"
" if( cs->m_fJacCoeffInv[0] == 0 && cs->m_fJacCoeffInv[0] == 0 ) return;\n"
" const float4 center = cs->m_center;\n"
" \n"
" float4 n = -cs->m_linear;\n"
" \n"
" float4 tangent[2];\n"
" b3PlaneSpace1(n,&tangent[0],&tangent[1]);\n"
" float4 angular0, angular1, linear;\n"
" float4 r0 = center - posA;\n"
" float4 r1 = center - posB;\n"
" for(int i=0; i<2; i++)\n"
" {\n"
" setLinearAndAngular( tangent[i], r0, r1, &linear, &angular0, &angular1 );\n"
" float rambdaDt = calcRelVel(linear, -linear, angular0, angular1,\n"
" linVelA+dLinVelA, angVelA+dAngVelA, linVelB+dLinVelB, angVelB+dAngVelB );\n"
" rambdaDt *= cs->m_fJacCoeffInv[i];\n"
" \n"
" {\n"
" float prevSum = cs->m_fAppliedRambdaDt[i];\n"
" float updated = prevSum;\n"
" updated += rambdaDt;\n"
" updated = max2( updated, minRambdaDt[i] );\n"
" updated = min2( updated, maxRambdaDt[i] );\n"
" rambdaDt = updated - prevSum;\n"
" cs->m_fAppliedRambdaDt[i] = updated;\n"
" }\n"
" \n"
" float4 linImp0 = invMassA*linear*rambdaDt;\n"
" float4 linImp1 = invMassB*(-linear)*rambdaDt;\n"
" float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt;\n"
" float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt;\n"
" \n"
" dLinVelA += linImp0;\n"
" dAngVelA += angImp0;\n"
" dLinVelB += linImp1;\n"
" dAngVelB += angImp1;\n"
" }\n"
" { // angular damping for point constraint\n"
" float4 ab = normalize3( posB - posA );\n"
" float4 ac = normalize3( center - posA );\n"
" if( dot3F4( ab, ac ) > 0.95f || (invMassA == 0.f || invMassB == 0.f))\n"
" {\n"
" float angNA = dot3F4( n, angVelA );\n"
" float angNB = dot3F4( n, angVelB );\n"
" \n"
" dAngVelA -= (angNA*0.1f)*n;\n"
" dAngVelB -= (angNB*0.1f)*n;\n"
" }\n"
" }\n"
" }\n"
"\n"
" \n"
" \n"
" }\n"
"\n"
" if (invMassA)\n"
" {\n"
" deltaLinearVelocities[splitIndexA] = dLinVelA;\n"
" deltaAngularVelocities[splitIndexA] = dAngVelA;\n"
" } \n"
" if (invMassB)\n"
" {\n"
" deltaLinearVelocities[splitIndexB] = dLinVelB;\n"
" deltaAngularVelocities[splitIndexB] = dAngVelB;\n"
" }\n"
" \n"
"\n"
"}\n"
"\n"
"\n"
"__kernel void SolveFrictionJacobiKernel(__global Constraint4* gConstraints, __global Body* gBodies, __global Shape* gShapes ,\n"
" __global int2* contactConstraintOffsets,__global unsigned int* offsetSplitBodies,\n"
" __global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities,\n"
" float deltaTime, float positionDrift, float positionConstraintCoeff, int fixedBodyIndex, int numManifolds\n"
")\n"
"{\n"
" int i = GET_GLOBAL_IDX;\n"
" if (i<numManifolds)\n"
" {\n"
" solveFrictionConstraint( gBodies, gShapes, &gConstraints[i] ,&contactConstraintOffsets[i],offsetSplitBodies, deltaLinearVelocities, deltaAngularVelocities);\n"
" }\n"
"}\n"
"\n"
"\n"
"__kernel void UpdateBodyVelocitiesKernel(__global Body* gBodies,__global int* offsetSplitBodies,__global const unsigned int* bodyCount,\n"
" __global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities, int numBodies)\n"
"{\n"
" int i = GET_GLOBAL_IDX;\n"
" if (i<numBodies)\n"
" {\n"
" if (gBodies[i].m_invMass)\n"
" {\n"
" int bodyOffset = offsetSplitBodies[i];\n"
" int count = bodyCount[i];\n"
" if (count)\n"
" {\n"
" gBodies[i].m_linVel += deltaLinearVelocities[bodyOffset];\n"
" gBodies[i].m_angVel += deltaAngularVelocities[bodyOffset];\n"
" }\n"
" }\n"
" }\n"
"}\n"
"\n"
"\n"
"\n"
"void setConstraint4( const float4 posA, const float4 linVelA, const float4 angVelA, float invMassA, const Matrix3x3 invInertiaA,\n"
" const float4 posB, const float4 linVelB, const float4 angVelB, float invMassB, const Matrix3x3 invInertiaB, \n"
" __global Contact4* src, float dt, float positionDrift, float positionConstraintCoeff,float countA, float countB,\n"
" Constraint4* dstC )\n"
"{\n"
" dstC->m_bodyA = abs(src->m_bodyAPtrAndSignBit);\n"
" dstC->m_bodyB = abs(src->m_bodyBPtrAndSignBit);\n"
"\n"
" float dtInv = 1.f/dt;\n"
" for(int ic=0; ic<4; ic++)\n"
" {\n"
" dstC->m_appliedRambdaDt[ic] = 0.f;\n"
" }\n"
" dstC->m_fJacCoeffInv[0] = dstC->m_fJacCoeffInv[1] = 0.f;\n"
"\n"
"\n"
" dstC->m_linear = -src->m_worldNormal;\n"
" dstC->m_linear.w = 0.7f ;//src->getFrictionCoeff() );\n"
" for(int ic=0; ic<4; ic++)\n"
" {\n"
" float4 r0 = src->m_worldPos[ic] - posA;\n"
" float4 r1 = src->m_worldPos[ic] - posB;\n"
"\n"
" if( ic >= src->m_worldNormal.w )//npoints\n"
" {\n"
" dstC->m_jacCoeffInv[ic] = 0.f;\n"
" continue;\n"
" }\n"
"\n"
" float relVelN;\n"
" {\n"
" float4 linear, angular0, angular1;\n"
" setLinearAndAngular(src->m_worldNormal, r0, r1, &linear, &angular0, &angular1);\n"
"\n"
" dstC->m_jacCoeffInv[ic] = calcJacCoeff(linear, -linear, angular0, angular1,\n"
" invMassA, &invInertiaA, invMassB, &invInertiaB , countA, countB);\n"
"\n"
" relVelN = calcRelVel(linear, -linear, angular0, angular1,\n"
" linVelA, angVelA, linVelB, angVelB);\n"
"\n"
" float e = 0.f;//src->getRestituitionCoeff();\n"
" if( relVelN*relVelN < 0.004f ) e = 0.f;\n"
"\n"
" dstC->m_b[ic] = e*relVelN;\n"
" //float penetration = src->m_worldPos[ic].w;\n"
" dstC->m_b[ic] += (src->m_worldPos[ic].w + positionDrift)*positionConstraintCoeff*dtInv;\n"
" dstC->m_appliedRambdaDt[ic] = 0.f;\n"
" }\n"
" }\n"
"\n"
" if( src->m_worldNormal.w > 0 )//npoints\n"
" { // prepare friction\n"
" float4 center = make_float4(0.f);\n"
" for(int i=0; i<src->m_worldNormal.w; i++) \n"
" center += src->m_worldPos[i];\n"
" center /= (float)src->m_worldNormal.w;\n"
"\n"
" float4 tangent[2];\n"
" b3PlaneSpace1(src->m_worldNormal,&tangent[0],&tangent[1]);\n"
" \n"
" float4 r[2];\n"
" r[0] = center - posA;\n"
" r[1] = center - posB;\n"
"\n"
" for(int i=0; i<2; i++)\n"
" {\n"
" float4 linear, angular0, angular1;\n"
" setLinearAndAngular(tangent[i], r[0], r[1], &linear, &angular0, &angular1);\n"
"\n"
" dstC->m_fJacCoeffInv[i] = calcJacCoeff(linear, -linear, angular0, angular1,\n"
" invMassA, &invInertiaA, invMassB, &invInertiaB ,countA, countB);\n"
" dstC->m_fAppliedRambdaDt[i] = 0.f;\n"
" }\n"
" dstC->m_center = center;\n"
" }\n"
"\n"
" for(int i=0; i<4; i++)\n"
" {\n"
" if( i<src->m_worldNormal.w )\n"
" {\n"
" dstC->m_worldPos[i] = src->m_worldPos[i];\n"
" }\n"
" else\n"
" {\n"
" dstC->m_worldPos[i] = make_float4(0.f);\n"
" }\n"
" }\n"
"}\n"
"\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"void ContactToConstraintSplitKernel(__global const Contact4* gContact, __global const Body* gBodies, __global const Shape* gShapes, __global Constraint4* gConstraintOut, \n"
"__global const unsigned int* bodyCount,\n"
"int nContacts,\n"
"float dt,\n"
"float positionDrift,\n"
"float positionConstraintCoeff\n"
")\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" \n"
" if( gIdx < nContacts )\n"
" {\n"
" int aIdx = abs(gContact[gIdx].m_bodyAPtrAndSignBit);\n"
" int bIdx = abs(gContact[gIdx].m_bodyBPtrAndSignBit);\n"
"\n"
" float4 posA = gBodies[aIdx].m_pos;\n"
" float4 linVelA = gBodies[aIdx].m_linVel;\n"
" float4 angVelA = gBodies[aIdx].m_angVel;\n"
" float invMassA = gBodies[aIdx].m_invMass;\n"
" Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n"
"\n"
" float4 posB = gBodies[bIdx].m_pos;\n"
" float4 linVelB = gBodies[bIdx].m_linVel;\n"
" float4 angVelB = gBodies[bIdx].m_angVel;\n"
" float invMassB = gBodies[bIdx].m_invMass;\n"
" Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n"
"\n"
" Constraint4 cs;\n"
"\n"
" float countA = invMassA ? (float)bodyCount[aIdx] : 1;\n"
" float countB = invMassB ? (float)bodyCount[bIdx] : 1;\n"
"\n"
" setConstraint4( posA, linVelA, angVelA, invMassA, invInertiaA, posB, linVelB, angVelB, invMassB, invInertiaB,\n"
" &gContact[gIdx], dt, positionDrift, positionConstraintCoeff,countA,countB,\n"
" &cs );\n"
" \n"
" cs.m_batchIdx = gContact[gIdx].m_batchIdx;\n"
"\n"
" gConstraintOut[gIdx] = cs;\n"
" }\n"
"}\n"
;

View File

@@ -0,0 +1,195 @@
// Shape-type tag for convex hulls.
// NOTE(review): not referenced by any code visible in this file; presumably compared against Collidable.m_shapeType elsewhere - confirm.
#define SHAPE_CONVEX_HULL 3
// Quaternions are stored in a float4: xyz is the vector part, w is the scalar part
// (this is how qtMul and qtInvert below treat the components).
typedef float4 Quaternion;
// Cross product of two float4 vectors; a thin wrapper that forwards to the built-in cross().
__inline
float4 cross3(float4 a, float4 b)
{
	float4 result = cross(a, b);
	return result;
}
// Dot product over the xyz components only: both operands get their w component
// replaced by zero before the built-in four-component dot() is applied.
__inline
float dot3F4(float4 a, float4 b)
{
	return dot((float4)(a.xyz, 0.f), (float4)(b.xyz, 0.f));
}
// Quaternion product a*b (xyz = vector part, w = scalar part).
// The vector part is cross(a,b) + a.w*b + b.w*a; the scalar part is then
// overwritten with a.w*b.w - dot3(a,b).
__inline
Quaternion qtMul(Quaternion a, Quaternion b)
{
	Quaternion product = cross3(a, b) + (a.w*b + b.w*a);
	// The w lane computed above is a by-product of the vector arithmetic; replace it.
	product.w = a.w*b.w - dot3F4(a, b);
	return product;
}
// Returns the conjugate of q: the vector part is negated and w is kept.
// The conjugate equals the inverse only when q has unit length.
__inline
Quaternion qtInvert(Quaternion q)
{
	Quaternion conjugate = q;
	conjugate.xyz = -q.xyz;
	return conjugate;
}
// Rotates vec by q by evaluating q * (vec.xyz, 0) * qtInvert(q).
// The incoming w component of vec is ignored.
__inline
float4 qtRotate(Quaternion q, float4 vec)
{
	float4 pureVec = (float4)(vec.xyz, 0.f);
	Quaternion halfRotated = qtMul(q, pureVec);
	return qtMul(halfRotated, qtInvert(q));
}
__inline
float4 transform(const float4* p, const float4* translation, const Quaternion* orientation)
{
return qtRotate( *orientation, *p ) + (*translation);
}
// 3x3 matrix stored as three float4 rows. The functions in this file that
// build a Matrix3x3 set the w component of every row to 0.
typedef struct
{
float4 m_row[3];
} Matrix3x3;
// Shorthand for unsigned int.
typedef unsigned int u32;
// Per-body state as read from the gBodies buffer.
// NOTE(review): the field order and sizes presumably have to match the host-side
// struct that fills this buffer - confirm before reordering or resizing fields.
typedef struct
{
float4 m_pos; // position; used as the translation of the local AABB centre in initializeGpuAabbsFull
float4 m_quat; // orientation quaternion (xyz = vector part, w = scalar part)
float4 m_linVel; // not read by initializeGpuAabbsFull
float4 m_angVel; // not read by initializeGpuAabbsFull
u32 m_collidableIdx; // index into the collidables buffer (and, times two, into the local AABB buffer)
float m_invMass; // initializeGpuAabbsFull treats exactly 0.f specially (max-corner uw becomes 0 instead of 1)
float m_restituitionCoeff; // not read by initializeGpuAabbsFull
float m_frictionCoeff; // not read by initializeGpuAabbsFull
} Body;
// Collision-shape reference for a body, looked up through Body.m_collidableIdx.
// NOTE(review): layout presumably mirrors a host-side struct - confirm before changing.
typedef struct Collidable
{
int m_unused1;
int m_unused2;
int m_shapeType; // not read by initializeGpuAabbsFull; presumably one of the SHAPE_* tags - confirm
int m_shapeIndex; // initializeGpuAabbsFull skips the body when this is negative
} Collidable;
// Pair of inverse-inertia matrices.
// Not referenced by any function visible in this file.
typedef struct
{
Matrix3x3 m_invInertia;
Matrix3x3 m_initInvInertia;
} Shape;
// Converts a quaternion (xyz = vector part, w = scalar part) into a 3x3 matrix.
// Despite the name, every element is passed through fabs(), so the result is the
// element-wise ABSOLUTE VALUE of the rotation matrix, not the rotation matrix itself.
// initializeGpuAabbsFull uses it to compute the world-space extent of a rotated box.
// The w component of every row is set to 0.
// NOTE(review): the formula matches the usual unit-quaternion conversion; the code
// does not normalise quat - presumably callers pass unit quaternions, confirm.
__inline
Matrix3x3 qtGetRotationMatrix(float4 quat)
{
	// Squared vector components, reused by the diagonal terms.
	float4 sq = (float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);
	Matrix3x3 absRot;
	absRot.m_row[0] = (float4)(
		fabs(1-2*sq.y-2*sq.z),
		fabs(2*quat.x*quat.y-2*quat.w*quat.z),
		fabs(2*quat.x*quat.z+2*quat.w*quat.y),
		0.f);
	absRot.m_row[1] = (float4)(
		fabs(2*quat.x*quat.y+2*quat.w*quat.z),
		fabs(1-2*sq.x-2*sq.z),
		fabs(2*quat.y*quat.z-2*quat.w*quat.x),
		0.f);
	absRot.m_row[2] = (float4)(
		fabs(2*quat.x*quat.z-2*quat.w*quat.y),
		fabs(2*quat.y*quat.z+2*quat.w*quat.x),
		fabs(1-2*sq.x-2*sq.y),
		0.f);
	return absRot;
}
// One corner of an axis-aligned bounding box plus an integer payload.
// An AABB occupies two consecutive entries: index 2*i is the min corner and
// index 2*i+1 is the max corner.
// The type is named b3AABBCL (not btAABBCL) so that this source agrees with its
// stringified copy, updateAabbsKernelCL, which already uses the b3 prefix.
typedef struct
{
	float fx;
	float fy;
	float fz;
	int uw; // payload; see initializeGpuAabbsFull for what is stored in each corner
} b3AABBCL;

// Returns the transpose of m; the w component of every output row is 0.
__inline
Matrix3x3 mtTranspose(Matrix3x3 m)
{
	Matrix3x3 out;
	out.m_row[0] = (float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);
	out.m_row[1] = (float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);
	out.m_row[2] = (float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);
	return out;
}

// Matrix product a*b. b is transposed first so that each output element is a
// dot3F4 of a row of a with a row of the transposed b (i.e. a column of b).
// The w component of every output row is 0.
__inline
Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b)
{
	Matrix3x3 transB;
	transB = mtTranspose( b );
	Matrix3x3 ans;
	// Original author's open question: why this doesn't run when 0ing in the for{}
	// NOTE(review): dot3F4 ignores w, so this zeroing looks redundant; kept as-is.
	a.m_row[0].w = 0.f;
	a.m_row[1].w = 0.f;
	a.m_row[2].w = 0.f;
	for(int i=0; i<3; i++)
	{
//		a.m_row[i].w = 0.f;
		ans.m_row[i].x = dot3F4(a.m_row[i],transB.m_row[0]);
		ans.m_row[i].y = dot3F4(a.m_row[i],transB.m_row[1]);
		ans.m_row[i].z = dot3F4(a.m_row[i],transB.m_row[2]);
		ans.m_row[i].w = 0.f;
	}
	return ans;
}

// Computes the world-space AABB of every body from its local-space AABB.
//   numNodes         number of bodies to process; work-items with id >= numNodes do nothing
//   gBodies          body states, indexed by work-item id
//   collidables      collidables, indexed by Body.m_collidableIdx
//   plocalShapeAABB  local AABBs, two entries (min, max) per collidable index
//   pAABB            output world AABBs, two entries (min, max) per body
// Bodies whose collidable has a negative m_shapeIndex are skipped and their
// pAABB entries are left unmodified.
__kernel void initializeGpuAabbsFull( const int numNodes, __global Body* gBodies,__global Collidable* collidables, __global b3AABBCL* plocalShapeAABB, __global b3AABBCL* pAABB)
{
	int nodeID = get_global_id(0);
	if( nodeID < numNodes )
	{
		float4 position = gBodies[nodeID].m_pos;
		float4 orientation = gBodies[nodeID].m_quat;
		int collidableIndex = gBodies[nodeID].m_collidableIdx;
		int shapeIndex = collidables[collidableIndex].m_shapeIndex;
		if (shapeIndex>=0)
		{
			b3AABBCL minAabb = plocalShapeAABB[collidableIndex*2];
			b3AABBCL maxAabb = plocalShapeAABB[collidableIndex*2+1];
			// Local box expressed as centre + half extents.
			float4 halfExtents = ((float4)(maxAabb.fx - minAabb.fx,maxAabb.fy - minAabb.fy,maxAabb.fz - minAabb.fz,0.f))*0.5f;
			float4 localCenter = ((float4)(maxAabb.fx + minAabb.fx,maxAabb.fy + minAabb.fy,maxAabb.fz + minAabb.fz,0.f))*0.5f;
			float4 worldCenter = transform(&localCenter,&position,&orientation);
			// qtGetRotationMatrix returns the element-wise absolute rotation matrix, so
			// each dot() below is the half-size of the rotated box along one world axis.
			Matrix3x3 abs_b = qtGetRotationMatrix(orientation);
			float4 extent = (float4) ( dot(abs_b.m_row[0],halfExtents),dot(abs_b.m_row[1],halfExtents),dot(abs_b.m_row[2],halfExtents),0.f);
			// Min corner; its payload is the body index.
			pAABB[nodeID*2].fx = worldCenter.x-extent.x;
			pAABB[nodeID*2].fy = worldCenter.y-extent.y;
			pAABB[nodeID*2].fz = worldCenter.z-extent.z;
			pAABB[nodeID*2].uw = nodeID;
			// Max corner; its payload is 0 when the inverse mass is exactly zero, otherwise 1.
			pAABB[nodeID*2+1].fx = worldCenter.x+extent.x;
			pAABB[nodeID*2+1].fy = worldCenter.y+extent.y;
			pAABB[nodeID*2+1].fz = worldCenter.z+extent.z;
			pAABB[nodeID*2+1].uw = gBodies[nodeID].m_invMass==0.f? 0 : 1;
		}
	}
}

View File

@@ -0,0 +1,199 @@
//This file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project. Do not edit it by hand; change the .cl kernel source and regenerate it instead.
static const char* updateAabbsKernelCL= \
"\n"
"#define SHAPE_CONVEX_HULL 3\n"
"\n"
"typedef float4 Quaternion;\n"
"\n"
"__inline\n"
"float4 cross3(float4 a, float4 b)\n"
"{\n"
" return cross(a,b);\n"
"}\n"
"\n"
"__inline\n"
"float dot3F4(float4 a, float4 b)\n"
"{\n"
" float4 a1 = (float4)(a.xyz,0.f);\n"
" float4 b1 = (float4)(b.xyz,0.f);\n"
" return dot(a1, b1);\n"
"}\n"
"\n"
"\n"
"__inline\n"
"Quaternion qtMul(Quaternion a, Quaternion b)\n"
"{\n"
" Quaternion ans;\n"
" ans = cross3( a, b );\n"
" ans += a.w*b+b.w*a;\n"
" ans.w = a.w*b.w - dot3F4(a, b);\n"
" return ans;\n"
"}\n"
"\n"
"__inline\n"
"Quaternion qtInvert(Quaternion q)\n"
"{\n"
" return (Quaternion)(-q.xyz, q.w);\n"
"}\n"
"\n"
"__inline\n"
"float4 qtRotate(Quaternion q, float4 vec)\n"
"{\n"
" Quaternion qInv = qtInvert( q );\n"
" float4 vcpy = vec;\n"
" vcpy.w = 0.f;\n"
" float4 out = qtMul(qtMul(q,vcpy),qInv);\n"
" return out;\n"
"}\n"
"\n"
"__inline\n"
"float4 transform(const float4* p, const float4* translation, const Quaternion* orientation)\n"
"{\n"
" return qtRotate( *orientation, *p ) + (*translation);\n"
"}\n"
"\n"
"typedef struct\n"
"{\n"
" float4 m_row[3];\n"
"} Matrix3x3;\n"
"\n"
"typedef unsigned int u32;\n"
"\n"
"\n"
"typedef struct\n"
"{\n"
" float4 m_pos;\n"
" float4 m_quat;\n"
" float4 m_linVel;\n"
" float4 m_angVel;\n"
"\n"
" u32 m_collidableIdx;\n"
" float m_invMass;\n"
" float m_restituitionCoeff;\n"
" float m_frictionCoeff;\n"
"} Body;\n"
"\n"
"typedef struct Collidable\n"
"{\n"
" int m_unused1;\n"
" int m_unused2;\n"
" int m_shapeType;\n"
" int m_shapeIndex;\n"
"} Collidable;\n"
"\n"
"\n"
"typedef struct\n"
"{\n"
" Matrix3x3 m_invInertia;\n"
" Matrix3x3 m_initInvInertia;\n"
"} Shape;\n"
"\n"
"\n"
"__inline\n"
"Matrix3x3 qtGetRotationMatrix(float4 quat)\n"
"{\n"
" float4 quat2 = (float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n"
" Matrix3x3 out;\n"
"\n"
" out.m_row[0].x=fabs(1-2*quat2.y-2*quat2.z);\n"
" out.m_row[0].y=fabs(2*quat.x*quat.y-2*quat.w*quat.z);\n"
" out.m_row[0].z=fabs(2*quat.x*quat.z+2*quat.w*quat.y);\n"
" out.m_row[0].w = 0.f;\n"
"\n"
" out.m_row[1].x=fabs(2*quat.x*quat.y+2*quat.w*quat.z);\n"
" out.m_row[1].y=fabs(1-2*quat2.x-2*quat2.z);\n"
" out.m_row[1].z=fabs(2*quat.y*quat.z-2*quat.w*quat.x);\n"
" out.m_row[1].w = 0.f;\n"
"\n"
" out.m_row[2].x=fabs(2*quat.x*quat.z-2*quat.w*quat.y);\n"
" out.m_row[2].y=fabs(2*quat.y*quat.z+2*quat.w*quat.x);\n"
" out.m_row[2].z=fabs(1-2*quat2.x-2*quat2.y);\n"
" out.m_row[2].w = 0.f;\n"
"\n"
" return out;\n"
"}\n"
"\n"
"\n"
"typedef struct \n"
"{\n"
" float fx;\n"
" float fy;\n"
" float fz;\n"
" int uw;\n"
"} b3AABBCL;\n"
"\n"
"__inline\n"
"Matrix3x3 mtTranspose(Matrix3x3 m)\n"
"{\n"
" Matrix3x3 out;\n"
" out.m_row[0] = (float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n"
" out.m_row[1] = (float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n"
" out.m_row[2] = (float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n"
" return out;\n"
"}\n"
"\n"
"\n"
"\n"
"__inline\n"
"Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b)\n"
"{\n"
" Matrix3x3 transB;\n"
" transB = mtTranspose( b );\n"
" Matrix3x3 ans;\n"
" // why this doesn't run when 0ing in the for{}\n"
" a.m_row[0].w = 0.f;\n"
" a.m_row[1].w = 0.f;\n"
" a.m_row[2].w = 0.f;\n"
" for(int i=0; i<3; i++)\n"
" {\n"
"// a.m_row[i].w = 0.f;\n"
" ans.m_row[i].x = dot3F4(a.m_row[i],transB.m_row[0]);\n"
" ans.m_row[i].y = dot3F4(a.m_row[i],transB.m_row[1]);\n"
" ans.m_row[i].z = dot3F4(a.m_row[i],transB.m_row[2]);\n"
" ans.m_row[i].w = 0.f;\n"
" }\n"
" return ans;\n"
"}\n"
"\n"
"\n"
"__kernel void initializeGpuAabbsFull( const int numNodes, __global Body* gBodies,__global Collidable* collidables, __global b3AABBCL* plocalShapeAABB, __global b3AABBCL* pAABB)\n"
"{\n"
" int nodeID = get_global_id(0);\n"
" \n"
" if( nodeID < numNodes )\n"
" {\n"
" float4 position = gBodies[nodeID].m_pos;\n"
" float4 orientation = gBodies[nodeID].m_quat;\n"
" \n"
" \n"
" int collidableIndex = gBodies[nodeID].m_collidableIdx;\n"
" int shapeIndex = collidables[collidableIndex].m_shapeIndex;\n"
" \n"
" if (shapeIndex>=0)\n"
" {\n"
" b3AABBCL minAabb = plocalShapeAABB[collidableIndex*2];\n"
" b3AABBCL maxAabb = plocalShapeAABB[collidableIndex*2+1];\n"
" \n"
" float4 halfExtents = ((float4)(maxAabb.fx - minAabb.fx,maxAabb.fy - minAabb.fy,maxAabb.fz - minAabb.fz,0.f))*0.5f;\n"
" float4 localCenter = ((float4)(maxAabb.fx + minAabb.fx,maxAabb.fy + minAabb.fy,maxAabb.fz + minAabb.fz,0.f))*0.5f;\n"
" \n"
" float4 worldCenter = transform(&localCenter,&position,&orientation);\n"
" \n"
" Matrix3x3 abs_b = qtGetRotationMatrix(orientation);\n"
" float4 extent = (float4) ( dot(abs_b.m_row[0],halfExtents),dot(abs_b.m_row[1],halfExtents),dot(abs_b.m_row[2],halfExtents),0.f);\n"
" \n"
" \n"
" pAABB[nodeID*2].fx = worldCenter.x-extent.x;\n"
" pAABB[nodeID*2].fy = worldCenter.y-extent.y;\n"
" pAABB[nodeID*2].fz = worldCenter.z-extent.z;\n"
" pAABB[nodeID*2].uw = nodeID;\n"
" \n"
" pAABB[nodeID*2+1].fx = worldCenter.x+extent.x;\n"
" pAABB[nodeID*2+1].fy = worldCenter.y+extent.y;\n"
" pAABB[nodeID*2+1].fz = worldCenter.z+extent.z;\n"
" pAABB[nodeID*2+1].uw = gBodies[nodeID].m_invMass==0.f? 0 : 1;\n"
" }\n"
" } \n"
"}\n"
"\n"
;

View File

@@ -0,0 +1,28 @@
-- Declares the static library project "Bullet3OpenCL_<vendor>" when findOpenCL
-- reports an OpenCL SDK for that vendor; otherwise declares nothing.
-- NOTE(review): hasCL is assigned as a global, exactly as before; confirm that no
-- other build script relies on it before turning it into a local.
function createProject(vendor)
	hasCL = findOpenCL(vendor)
	if not hasCL then
		return
	end

	project ("Bullet3OpenCL_" .. vendor)

	-- Called right after project() so it applies to the project just declared.
	initOpenCL(vendor)

	kind "StaticLib"
	targetdir "../../lib"
	includedirs { ".", ".." }
	files { "**.cpp", "**.h" }
end
-- Try to declare one library project per OpenCL vendor, in this order.
for _, vendor in ipairs({ "AMD", "Intel", "NVIDIA", "Apple" }) do
	createProject(vendor)
end