move OpenCL initialization for the unit tests in a shared header file, and support some basic command-line arguments
--cl_device=1 --cl_platform=1 --allow_opencl_cpu add chaindemo, test for mass ratios restore sleeping/activation mode in featherstone demo Use _VARIADIC_MAX=10 to avoid Google Test issues with Visual Studio 2012, thanks to Mobeen for the report Enable verbose printf for unit tests
This commit is contained in:
@@ -9,11 +9,11 @@ bool searchIncremental3dSapOnGpu = true;
|
||||
|
||||
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
|
||||
#include "kernels/sapKernels.h"
|
||||
#include "kernels/sapFastKernels.h"
|
||||
|
||||
#include "Bullet3Common/b3MinMax.h"
|
||||
|
||||
#define B3_BROADPHASE_SAP_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl"
|
||||
#define B3_BROADPHASE_SAPFAST_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFast.cl"
|
||||
|
||||
|
||||
b3GpuSapBroadphase::b3GpuSapBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q , b3GpuSapKernelType kernelType)
|
||||
:m_context(ctx),
|
||||
@@ -48,7 +48,7 @@ m_addedCountGPU(ctx,q),
|
||||
m_removedCountGPU(ctx,q)
|
||||
{
|
||||
const char* sapSrc = sapCL;
|
||||
const char* sapFastSrc = sapFastCL;
|
||||
|
||||
|
||||
cl_int errNum=0;
|
||||
|
||||
@@ -56,8 +56,8 @@ m_removedCountGPU(ctx,q)
|
||||
b3Assert(m_device);
|
||||
cl_program sapProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,sapSrc,&errNum,"",B3_BROADPHASE_SAP_PATH);
|
||||
b3Assert(errNum==CL_SUCCESS);
|
||||
cl_program sapFastProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,sapFastSrc,&errNum,"",B3_BROADPHASE_SAPFAST_PATH);
|
||||
//cl_program sapFastProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,0,&errNum,"",B3_BROADPHASE_SAPFAST_PATH,true);
|
||||
|
||||
|
||||
b3Assert(errNum==CL_SUCCESS);
|
||||
#ifndef __APPLE__
|
||||
m_prefixScanFloat4 = new b3PrefixScanFloat4CL(m_context,m_device,m_queue);
|
||||
@@ -95,11 +95,6 @@ m_removedCountGPU(ctx,q)
|
||||
break;
|
||||
}
|
||||
|
||||
case B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY_BATCH_WRITE:
|
||||
{
|
||||
m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapFastSrc, "computePairsKernelLocalSharedMemoryBatchWrite",&errNum,sapFastProg );
|
||||
break;
|
||||
}
|
||||
default:
|
||||
{
|
||||
m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelLocalSharedMemory",&errNum,sapProg );
|
||||
@@ -115,24 +110,7 @@ m_removedCountGPU(ctx,q)
|
||||
m_prepareSumVarianceKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "prepareSumVarianceKernel",&errNum,sapProg );
|
||||
b3Assert(errNum==CL_SUCCESS);
|
||||
|
||||
m_computePairsIncremental3dSapKernel= b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapFastSrc, "computePairsIncremental3dSapKernel",&errNum,sapFastProg );
|
||||
b3Assert(errNum==CL_SUCCESS);
|
||||
|
||||
/*
|
||||
#if 0
|
||||
|
||||
m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelOriginal",&errNum,sapProg );
|
||||
b3Assert(errNum==CL_SUCCESS);
|
||||
#else
|
||||
#ifndef __APPLE__
|
||||
m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapFastSrc, "computePairsKernelLocalSharedMemoryBatchWrite",&errNum,sapFastProg );
|
||||
b3Assert(errNum==CL_SUCCESS);
|
||||
#else
|
||||
m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelLocalSharedMemory",&errNum,sapProg );
|
||||
b3Assert(errNum==CL_SUCCESS);
|
||||
#endif
|
||||
#endif
|
||||
*/
|
||||
m_flipFloatKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "flipFloatKernel",&errNum,sapProg );
|
||||
|
||||
m_copyAabbsKernel= b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "copyAabbsKernel",&errNum,sapProg );
|
||||
@@ -153,7 +131,7 @@ b3GpuSapBroadphase::~b3GpuSapBroadphase()
|
||||
clReleaseKernel(m_sapKernel);
|
||||
clReleaseKernel(m_sap2Kernel);
|
||||
clReleaseKernel(m_prepareSumVarianceKernel);
|
||||
clReleaseKernel(m_computePairsIncremental3dSapKernel);
|
||||
|
||||
|
||||
}
|
||||
|
||||
@@ -469,7 +447,7 @@ void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap()
|
||||
int c = m_objectMinMaxIndexCPU[2][m_currentBuffer].size();
|
||||
b3Assert(a==b);
|
||||
b3Assert(b==c);
|
||||
|
||||
/*
|
||||
if (searchIncremental3dSapOnGpu)
|
||||
{
|
||||
B3_PROFILE("computePairsIncremental3dSapKernelGPU");
|
||||
@@ -547,6 +525,7 @@ void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap()
|
||||
|
||||
}
|
||||
else
|
||||
*/
|
||||
{
|
||||
int numObjects = m_objectMinMaxIndexCPU[0][m_currentBuffer].size();
|
||||
|
||||
|
||||
@@ -24,7 +24,7 @@ class b3GpuSapBroadphase : public b3GpuBroadphaseInterface
|
||||
cl_kernel m_sapKernel;
|
||||
cl_kernel m_sap2Kernel;
|
||||
cl_kernel m_prepareSumVarianceKernel;
|
||||
cl_kernel m_computePairsIncremental3dSapKernel;
|
||||
|
||||
|
||||
class b3RadixSort32CL* m_sorter;
|
||||
|
||||
@@ -95,8 +95,7 @@ public:
|
||||
B3_GPU_SAP_KERNEL_BRUTE_FORCE_GPU,
|
||||
B3_GPU_SAP_KERNEL_ORIGINAL,
|
||||
B3_GPU_SAP_KERNEL_BARRIER,
|
||||
B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY,
|
||||
B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY_BATCH_WRITE
|
||||
B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY
|
||||
};
|
||||
|
||||
b3GpuSapBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q , b3GpuSapKernelType kernelType=B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY);
|
||||
@@ -124,10 +123,7 @@ public:
|
||||
{
|
||||
return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY);
|
||||
}
|
||||
static b3GpuBroadphaseInterface* CreateFuncLocalMemoryBatchWrite(cl_context ctx,cl_device_id device, cl_command_queue q)
|
||||
{
|
||||
return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY_BATCH_WRITE);
|
||||
}
|
||||
|
||||
|
||||
virtual void calculateOverlappingPairs(int maxPairs);
|
||||
virtual void calculateOverlappingPairsHost(int maxPairs);
|
||||
|
||||
@@ -1,453 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2012 Advanced Micro Devices, Inc.
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
//Originally written by Erwin Coumans
|
||||
|
||||
#define NEW_PAIR_MARKER -1
|
||||
#define REMOVED_PAIR_MARKER -2
|
||||
|
||||
typedef struct
|
||||
{
|
||||
union
|
||||
{
|
||||
float4 m_min;
|
||||
float m_minElems[4];
|
||||
int m_minIndices[4];
|
||||
};
|
||||
union
|
||||
{
|
||||
float4 m_max;
|
||||
float m_maxElems[4];
|
||||
int m_maxIndices[4];
|
||||
};
|
||||
} btAabbCL;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
union
|
||||
{
|
||||
unsigned int m_key;
|
||||
unsigned int x;
|
||||
};
|
||||
|
||||
union
|
||||
{
|
||||
unsigned int m_value;
|
||||
unsigned int y;
|
||||
|
||||
};
|
||||
}b3SortData;
|
||||
|
||||
|
||||
/// conservative test for overlap between two aabbs
|
||||
bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);
|
||||
bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)
|
||||
{
|
||||
//skip pairs between static (mass=0) objects
|
||||
if ((aabb1->m_maxIndices[3]==0) && (aabb2->m_maxIndices[3] == 0))
|
||||
return false;
|
||||
|
||||
bool overlap = true;
|
||||
overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;
|
||||
overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;
|
||||
overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;
|
||||
return overlap;
|
||||
}
|
||||
|
||||
__kernel void computePairsIncremental3dSapKernel( __global const uint2* objectMinMaxIndexGPUaxis0,
|
||||
__global const uint2* objectMinMaxIndexGPUaxis1,
|
||||
__global const uint2* objectMinMaxIndexGPUaxis2,
|
||||
__global const uint2* objectMinMaxIndexGPUaxis0prev,
|
||||
__global const uint2* objectMinMaxIndexGPUaxis1prev,
|
||||
__global const uint2* objectMinMaxIndexGPUaxis2prev,
|
||||
__global const b3SortData* sortedAxisGPU0,
|
||||
__global const b3SortData* sortedAxisGPU1,
|
||||
__global const b3SortData* sortedAxisGPU2,
|
||||
__global const b3SortData* sortedAxisGPU0prev,
|
||||
__global const b3SortData* sortedAxisGPU1prev,
|
||||
__global const b3SortData* sortedAxisGPU2prev,
|
||||
__global int4* addedHostPairsGPU,
|
||||
__global int4* removedHostPairsGPU,
|
||||
volatile __global int* addedHostPairsCount,
|
||||
volatile __global int* removedHostPairsCount,
|
||||
int maxCapacity,
|
||||
int numObjects)
|
||||
{
|
||||
int i = get_global_id(0);
|
||||
if (i>=numObjects)
|
||||
return;
|
||||
|
||||
__global const uint2* objectMinMaxIndexGPU[3][2];
|
||||
objectMinMaxIndexGPU[0][0]=objectMinMaxIndexGPUaxis0;
|
||||
objectMinMaxIndexGPU[1][0]=objectMinMaxIndexGPUaxis1;
|
||||
objectMinMaxIndexGPU[2][0]=objectMinMaxIndexGPUaxis2;
|
||||
objectMinMaxIndexGPU[0][1]=objectMinMaxIndexGPUaxis0prev;
|
||||
objectMinMaxIndexGPU[1][1]=objectMinMaxIndexGPUaxis1prev;
|
||||
objectMinMaxIndexGPU[2][1]=objectMinMaxIndexGPUaxis2prev;
|
||||
|
||||
__global const b3SortData* sortedAxisGPU[3][2];
|
||||
sortedAxisGPU[0][0] = sortedAxisGPU0;
|
||||
sortedAxisGPU[1][0] = sortedAxisGPU1;
|
||||
sortedAxisGPU[2][0] = sortedAxisGPU2;
|
||||
sortedAxisGPU[0][1] = sortedAxisGPU0prev;
|
||||
sortedAxisGPU[1][1] = sortedAxisGPU1prev;
|
||||
sortedAxisGPU[2][1] = sortedAxisGPU2prev;
|
||||
|
||||
int m_currentBuffer = 0;
|
||||
|
||||
for (int axis=0;axis<3;axis++)
|
||||
{
|
||||
//int i = checkObjects[a];
|
||||
|
||||
unsigned int curMinIndex = objectMinMaxIndexGPU[axis][m_currentBuffer][i].x;
|
||||
unsigned int curMaxIndex = objectMinMaxIndexGPU[axis][m_currentBuffer][i].y;
|
||||
unsigned int prevMinIndex = objectMinMaxIndexGPU[axis][1-m_currentBuffer][i].x;
|
||||
int dmin = curMinIndex - prevMinIndex;
|
||||
|
||||
unsigned int prevMaxIndex = objectMinMaxIndexGPU[axis][1-m_currentBuffer][i].y;
|
||||
|
||||
int dmax = curMaxIndex - prevMaxIndex;
|
||||
|
||||
for (int otherbuffer = 0;otherbuffer<2;otherbuffer++)
|
||||
{
|
||||
if (dmin!=0)
|
||||
{
|
||||
int stepMin = dmin<0 ? -1 : 1;
|
||||
for (int j=prevMinIndex;j!=curMinIndex;j+=stepMin)
|
||||
{
|
||||
int otherIndex2 = sortedAxisGPU[axis][otherbuffer][j].y;
|
||||
int otherIndex = otherIndex2/2;
|
||||
if (otherIndex!=i)
|
||||
{
|
||||
bool otherIsMax = ((otherIndex2&1)!=0);
|
||||
|
||||
if (otherIsMax)
|
||||
{
|
||||
|
||||
bool overlap = true;
|
||||
|
||||
for (int ax=0;ax<3;ax++)
|
||||
{
|
||||
if ((objectMinMaxIndexGPU[ax][m_currentBuffer][i].x > objectMinMaxIndexGPU[ax][m_currentBuffer][otherIndex].y) ||
|
||||
(objectMinMaxIndexGPU[ax][m_currentBuffer][i].y < objectMinMaxIndexGPU[ax][m_currentBuffer][otherIndex].x))
|
||||
overlap=false;
|
||||
}
|
||||
|
||||
// b3Assert(overlap2==overlap);
|
||||
|
||||
bool prevOverlap = true;
|
||||
|
||||
for (int ax=0;ax<3;ax++)
|
||||
{
|
||||
if ((objectMinMaxIndexGPU[ax][1-m_currentBuffer][i].x > objectMinMaxIndexGPU[ax][1-m_currentBuffer][otherIndex].y) ||
|
||||
(objectMinMaxIndexGPU[ax][1-m_currentBuffer][i].y < objectMinMaxIndexGPU[ax][1-m_currentBuffer][otherIndex].x))
|
||||
prevOverlap=false;
|
||||
}
|
||||
|
||||
|
||||
//b3Assert(overlap==overlap2);
|
||||
|
||||
|
||||
|
||||
if (dmin<0)
|
||||
{
|
||||
if (overlap && !prevOverlap)
|
||||
{
|
||||
//add a pair
|
||||
int4 newPair;
|
||||
if (i<=otherIndex)
|
||||
{
|
||||
newPair.x = i;
|
||||
newPair.y = otherIndex;
|
||||
} else
|
||||
{
|
||||
newPair.x = otherIndex;
|
||||
newPair.y = i;
|
||||
}
|
||||
|
||||
{
|
||||
int curPair = atomic_inc(addedHostPairsCount);
|
||||
if (curPair<maxCapacity)
|
||||
{
|
||||
addedHostPairsGPU[curPair].x = newPair.x;
|
||||
addedHostPairsGPU[curPair].y = newPair.y;
|
||||
addedHostPairsGPU[curPair].z = NEW_PAIR_MARKER;
|
||||
addedHostPairsGPU[curPair].w = NEW_PAIR_MARKER;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (!overlap && prevOverlap)
|
||||
{
|
||||
|
||||
//remove a pair
|
||||
int4 removedPair;
|
||||
if (i<=otherIndex)
|
||||
{
|
||||
removedPair.x = i;
|
||||
removedPair.y = otherIndex;
|
||||
} else
|
||||
{
|
||||
removedPair.x = otherIndex;
|
||||
removedPair.y = i;
|
||||
}
|
||||
{
|
||||
int curPair = atomic_inc(removedHostPairsCount);
|
||||
if (curPair<maxCapacity)
|
||||
{
|
||||
|
||||
removedHostPairsGPU[curPair].x = removedPair.x;
|
||||
removedHostPairsGPU[curPair].y = removedPair.y;
|
||||
removedHostPairsGPU[curPair].z = REMOVED_PAIR_MARKER;
|
||||
removedHostPairsGPU[curPair].w = REMOVED_PAIR_MARKER;
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
}//otherisMax
|
||||
}//if (dmin<0)
|
||||
}//if (otherIndex!=i)
|
||||
}//for (int j=
|
||||
}
|
||||
|
||||
if (dmax!=0)
|
||||
{
|
||||
int stepMax = dmax<0 ? -1 : 1;
|
||||
for (int j=prevMaxIndex;j!=curMaxIndex;j+=stepMax)
|
||||
{
|
||||
int otherIndex2 = sortedAxisGPU[axis][otherbuffer][j].y;
|
||||
int otherIndex = otherIndex2/2;
|
||||
if (otherIndex!=i)
|
||||
{
|
||||
bool otherIsMin = ((otherIndex2&1)==0);
|
||||
if (otherIsMin)
|
||||
{
|
||||
|
||||
bool overlap = true;
|
||||
|
||||
for (int ax=0;ax<3;ax++)
|
||||
{
|
||||
if ((objectMinMaxIndexGPU[ax][m_currentBuffer][i].x > objectMinMaxIndexGPU[ax][m_currentBuffer][otherIndex].y) ||
|
||||
(objectMinMaxIndexGPU[ax][m_currentBuffer][i].y < objectMinMaxIndexGPU[ax][m_currentBuffer][otherIndex].x))
|
||||
overlap=false;
|
||||
}
|
||||
//b3Assert(overlap2==overlap);
|
||||
|
||||
bool prevOverlap = true;
|
||||
|
||||
for (int ax=0;ax<3;ax++)
|
||||
{
|
||||
if ((objectMinMaxIndexGPU[ax][1-m_currentBuffer][i].x > objectMinMaxIndexGPU[ax][1-m_currentBuffer][otherIndex].y) ||
|
||||
(objectMinMaxIndexGPU[ax][1-m_currentBuffer][i].y < objectMinMaxIndexGPU[ax][1-m_currentBuffer][otherIndex].x))
|
||||
prevOverlap=false;
|
||||
}
|
||||
|
||||
|
||||
if (dmax>0)
|
||||
{
|
||||
if (overlap && !prevOverlap)
|
||||
{
|
||||
//add a pair
|
||||
int4 newPair;
|
||||
if (i<=otherIndex)
|
||||
{
|
||||
newPair.x = i;
|
||||
newPair.y = otherIndex;
|
||||
} else
|
||||
{
|
||||
newPair.x = otherIndex;
|
||||
newPair.y = i;
|
||||
}
|
||||
{
|
||||
int curPair = atomic_inc(addedHostPairsCount);
|
||||
if (curPair<maxCapacity)
|
||||
{
|
||||
|
||||
addedHostPairsGPU[curPair].x = newPair.x;
|
||||
addedHostPairsGPU[curPair].y = newPair.y;
|
||||
addedHostPairsGPU[curPair].z = NEW_PAIR_MARKER;
|
||||
addedHostPairsGPU[curPair].w = NEW_PAIR_MARKER;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (!overlap && prevOverlap)
|
||||
{
|
||||
//if (otherIndex2&1==0) -> min?
|
||||
//remove a pair
|
||||
int4 removedPair;
|
||||
if (i<=otherIndex)
|
||||
{
|
||||
removedPair.x = i;
|
||||
removedPair.y = otherIndex;
|
||||
} else
|
||||
{
|
||||
removedPair.x = otherIndex;
|
||||
removedPair.y = i;
|
||||
}
|
||||
{
|
||||
int curPair = atomic_inc(removedHostPairsCount);
|
||||
if (curPair<maxCapacity)
|
||||
{
|
||||
|
||||
removedHostPairsGPU[curPair].x = removedPair.x;
|
||||
removedHostPairsGPU[curPair].y = removedPair.y;
|
||||
removedHostPairsGPU[curPair].z = REMOVED_PAIR_MARKER;
|
||||
removedHostPairsGPU[curPair].w = REMOVED_PAIR_MARKER;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
}//if (dmin<0)
|
||||
}//if (otherIndex!=i)
|
||||
}//for (int j=
|
||||
}
|
||||
}//for (int otherbuffer
|
||||
}//for (int axis=0;
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
__kernel void computePairsKernelLocalSharedMemoryBatchWrite( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)
|
||||
{
|
||||
int i = get_global_id(0);
|
||||
int localId = get_local_id(0);
|
||||
|
||||
__local int numActiveWgItems[1];
|
||||
__local int breakRequest[1];
|
||||
__local btAabbCL localAabbs[128];// = aabbs[i];
|
||||
|
||||
int2 myPairs[64];
|
||||
|
||||
btAabbCL myAabb;
|
||||
|
||||
myAabb = (i<numObjects)? aabbs[i]:aabbs[0];
|
||||
float testValue = myAabb.m_maxElems[axis];
|
||||
|
||||
if (localId==0)
|
||||
{
|
||||
numActiveWgItems[0] = 0;
|
||||
breakRequest[0] = 0;
|
||||
}
|
||||
int localCount=0;
|
||||
int block=0;
|
||||
localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];
|
||||
localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
atomic_inc(numActiveWgItems);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
int localBreak = 0;
|
||||
int curNumPairs = 0;
|
||||
|
||||
int j=i+1;
|
||||
do
|
||||
{
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (j<numObjects)
|
||||
{
|
||||
if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis]))
|
||||
{
|
||||
if (!localBreak)
|
||||
{
|
||||
atomic_inc(breakRequest);
|
||||
localBreak = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (j>=numObjects && !localBreak)
|
||||
{
|
||||
atomic_inc(breakRequest);
|
||||
localBreak = 1;
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (!localBreak)
|
||||
{
|
||||
if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))
|
||||
{
|
||||
int2 myPair;
|
||||
myPair.x = myAabb.m_minIndices[3];
|
||||
myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];
|
||||
myPairs[curNumPairs] = myPair;
|
||||
curNumPairs++;
|
||||
if (curNumPairs==64)
|
||||
{
|
||||
int curPair = atomic_add(pairCount,curNumPairs);
|
||||
for (int p=0;p<curNumPairs;p++)
|
||||
{
|
||||
if ((curPair+p)<maxPairs)
|
||||
{
|
||||
int4 tmpPair;
|
||||
tmpPair.x = myPairs[p].x;
|
||||
tmpPair.y = myPairs[p].y;
|
||||
tmpPair.z = NEW_PAIR_MARKER;
|
||||
tmpPair.w = NEW_PAIR_MARKER;
|
||||
|
||||
|
||||
pairsOut[curPair+p] = tmpPair; //flush to main memory
|
||||
}
|
||||
}
|
||||
curNumPairs = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
localCount++;
|
||||
if (localCount==64)
|
||||
{
|
||||
localCount = 0;
|
||||
block+=64;
|
||||
localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];
|
||||
localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];
|
||||
}
|
||||
j++;
|
||||
|
||||
} while (breakRequest[0]<numActiveWgItems[0]);
|
||||
|
||||
|
||||
if (curNumPairs>0)
|
||||
{
|
||||
//avoid a buffer overrun
|
||||
int curPair = atomic_add(pairCount,curNumPairs);
|
||||
for (int p=0;p<curNumPairs;p++)
|
||||
{
|
||||
if ((curPair+p)<maxPairs)
|
||||
{
|
||||
int4 tmpPair;
|
||||
tmpPair.x = myPairs[p].x;
|
||||
tmpPair.y = myPairs[p].y;
|
||||
tmpPair.z = NEW_PAIR_MARKER;
|
||||
tmpPair.w = NEW_PAIR_MARKER;
|
||||
pairsOut[curPair+p] = tmpPair; //flush to main memory
|
||||
}
|
||||
}
|
||||
curNumPairs = 0;
|
||||
}
|
||||
}
|
||||
@@ -1,419 +0,0 @@
|
||||
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
|
||||
static const char* sapFastCL= \
|
||||
"/*\n"
|
||||
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
|
||||
"This software is provided 'as-is', without any express or implied warranty.\n"
|
||||
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
|
||||
"Permission is granted to anyone to use this software for any purpose, \n"
|
||||
"including commercial applications, and to alter it and redistribute it freely, \n"
|
||||
"subject to the following restrictions:\n"
|
||||
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
|
||||
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
|
||||
"3. This notice may not be removed or altered from any source distribution.\n"
|
||||
"*/\n"
|
||||
"//Originally written by Erwin Coumans\n"
|
||||
"#define NEW_PAIR_MARKER -1\n"
|
||||
"#define REMOVED_PAIR_MARKER -2\n"
|
||||
"typedef struct \n"
|
||||
"{\n"
|
||||
" union\n"
|
||||
" {\n"
|
||||
" float4 m_min;\n"
|
||||
" float m_minElems[4];\n"
|
||||
" int m_minIndices[4];\n"
|
||||
" };\n"
|
||||
" union\n"
|
||||
" {\n"
|
||||
" float4 m_max;\n"
|
||||
" float m_maxElems[4];\n"
|
||||
" int m_maxIndices[4];\n"
|
||||
" };\n"
|
||||
"} btAabbCL;\n"
|
||||
"typedef struct \n"
|
||||
"{\n"
|
||||
" union\n"
|
||||
" {\n"
|
||||
" unsigned int m_key;\n"
|
||||
" unsigned int x;\n"
|
||||
" };\n"
|
||||
" union\n"
|
||||
" {\n"
|
||||
" unsigned int m_value;\n"
|
||||
" unsigned int y;\n"
|
||||
" \n"
|
||||
" };\n"
|
||||
"}b3SortData;\n"
|
||||
"/// conservative test for overlap between two aabbs\n"
|
||||
"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);\n"
|
||||
"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)\n"
|
||||
"{\n"
|
||||
"//skip pairs between static (mass=0) objects\n"
|
||||
" if ((aabb1->m_maxIndices[3]==0) && (aabb2->m_maxIndices[3] == 0))\n"
|
||||
" return false;\n"
|
||||
" \n"
|
||||
" bool overlap = true;\n"
|
||||
" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
|
||||
" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
|
||||
" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
|
||||
" return overlap;\n"
|
||||
"}\n"
|
||||
"__kernel void computePairsIncremental3dSapKernel( __global const uint2* objectMinMaxIndexGPUaxis0,\n"
|
||||
" __global const uint2* objectMinMaxIndexGPUaxis1,\n"
|
||||
" __global const uint2* objectMinMaxIndexGPUaxis2,\n"
|
||||
" __global const uint2* objectMinMaxIndexGPUaxis0prev,\n"
|
||||
" __global const uint2* objectMinMaxIndexGPUaxis1prev,\n"
|
||||
" __global const uint2* objectMinMaxIndexGPUaxis2prev,\n"
|
||||
" __global const b3SortData* sortedAxisGPU0,\n"
|
||||
" __global const b3SortData* sortedAxisGPU1,\n"
|
||||
" __global const b3SortData* sortedAxisGPU2,\n"
|
||||
" __global const b3SortData* sortedAxisGPU0prev,\n"
|
||||
" __global const b3SortData* sortedAxisGPU1prev,\n"
|
||||
" __global const b3SortData* sortedAxisGPU2prev,\n"
|
||||
" __global int4* addedHostPairsGPU,\n"
|
||||
" __global int4* removedHostPairsGPU,\n"
|
||||
" volatile __global int* addedHostPairsCount,\n"
|
||||
" volatile __global int* removedHostPairsCount,\n"
|
||||
" int maxCapacity,\n"
|
||||
" int numObjects)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" if (i>=numObjects)\n"
|
||||
" return;\n"
|
||||
" __global const uint2* objectMinMaxIndexGPU[3][2];\n"
|
||||
" objectMinMaxIndexGPU[0][0]=objectMinMaxIndexGPUaxis0;\n"
|
||||
" objectMinMaxIndexGPU[1][0]=objectMinMaxIndexGPUaxis1;\n"
|
||||
" objectMinMaxIndexGPU[2][0]=objectMinMaxIndexGPUaxis2;\n"
|
||||
" objectMinMaxIndexGPU[0][1]=objectMinMaxIndexGPUaxis0prev;\n"
|
||||
" objectMinMaxIndexGPU[1][1]=objectMinMaxIndexGPUaxis1prev;\n"
|
||||
" objectMinMaxIndexGPU[2][1]=objectMinMaxIndexGPUaxis2prev;\n"
|
||||
" __global const b3SortData* sortedAxisGPU[3][2];\n"
|
||||
" sortedAxisGPU[0][0] = sortedAxisGPU0;\n"
|
||||
" sortedAxisGPU[1][0] = sortedAxisGPU1;\n"
|
||||
" sortedAxisGPU[2][0] = sortedAxisGPU2;\n"
|
||||
" sortedAxisGPU[0][1] = sortedAxisGPU0prev;\n"
|
||||
" sortedAxisGPU[1][1] = sortedAxisGPU1prev;\n"
|
||||
" sortedAxisGPU[2][1] = sortedAxisGPU2prev;\n"
|
||||
" int m_currentBuffer = 0;\n"
|
||||
" for (int axis=0;axis<3;axis++)\n"
|
||||
" {\n"
|
||||
" //int i = checkObjects[a];\n"
|
||||
" unsigned int curMinIndex = objectMinMaxIndexGPU[axis][m_currentBuffer][i].x;\n"
|
||||
" unsigned int curMaxIndex = objectMinMaxIndexGPU[axis][m_currentBuffer][i].y;\n"
|
||||
" unsigned int prevMinIndex = objectMinMaxIndexGPU[axis][1-m_currentBuffer][i].x;\n"
|
||||
" int dmin = curMinIndex - prevMinIndex;\n"
|
||||
" \n"
|
||||
" unsigned int prevMaxIndex = objectMinMaxIndexGPU[axis][1-m_currentBuffer][i].y;\n"
|
||||
" int dmax = curMaxIndex - prevMaxIndex;\n"
|
||||
" \n"
|
||||
" for (int otherbuffer = 0;otherbuffer<2;otherbuffer++)\n"
|
||||
" {\n"
|
||||
" if (dmin!=0)\n"
|
||||
" {\n"
|
||||
" int stepMin = dmin<0 ? -1 : 1;\n"
|
||||
" for (int j=prevMinIndex;j!=curMinIndex;j+=stepMin)\n"
|
||||
" {\n"
|
||||
" int otherIndex2 = sortedAxisGPU[axis][otherbuffer][j].y;\n"
|
||||
" int otherIndex = otherIndex2/2;\n"
|
||||
" if (otherIndex!=i)\n"
|
||||
" {\n"
|
||||
" bool otherIsMax = ((otherIndex2&1)!=0);\n"
|
||||
" if (otherIsMax)\n"
|
||||
" {\n"
|
||||
" \n"
|
||||
" bool overlap = true;\n"
|
||||
" for (int ax=0;ax<3;ax++)\n"
|
||||
" {\n"
|
||||
" if ((objectMinMaxIndexGPU[ax][m_currentBuffer][i].x > objectMinMaxIndexGPU[ax][m_currentBuffer][otherIndex].y) ||\n"
|
||||
" (objectMinMaxIndexGPU[ax][m_currentBuffer][i].y < objectMinMaxIndexGPU[ax][m_currentBuffer][otherIndex].x))\n"
|
||||
" overlap=false;\n"
|
||||
" }\n"
|
||||
" // b3Assert(overlap2==overlap);\n"
|
||||
" bool prevOverlap = true;\n"
|
||||
" for (int ax=0;ax<3;ax++)\n"
|
||||
" {\n"
|
||||
" if ((objectMinMaxIndexGPU[ax][1-m_currentBuffer][i].x > objectMinMaxIndexGPU[ax][1-m_currentBuffer][otherIndex].y) ||\n"
|
||||
" (objectMinMaxIndexGPU[ax][1-m_currentBuffer][i].y < objectMinMaxIndexGPU[ax][1-m_currentBuffer][otherIndex].x))\n"
|
||||
" prevOverlap=false;\n"
|
||||
" }\n"
|
||||
" \n"
|
||||
" //b3Assert(overlap==overlap2);\n"
|
||||
" \n"
|
||||
" if (dmin<0)\n"
|
||||
" {\n"
|
||||
" if (overlap && !prevOverlap)\n"
|
||||
" {\n"
|
||||
" //add a pair\n"
|
||||
" int4 newPair;\n"
|
||||
" if (i<=otherIndex)\n"
|
||||
" {\n"
|
||||
" newPair.x = i;\n"
|
||||
" newPair.y = otherIndex;\n"
|
||||
" } else\n"
|
||||
" {\n"
|
||||
" newPair.x = otherIndex;\n"
|
||||
" newPair.y = i;\n"
|
||||
" }\n"
|
||||
" \n"
|
||||
" {\n"
|
||||
" int curPair = atomic_inc(addedHostPairsCount);\n"
|
||||
" if (curPair<maxCapacity)\n"
|
||||
" {\n"
|
||||
" addedHostPairsGPU[curPair].x = newPair.x;\n"
|
||||
" addedHostPairsGPU[curPair].y = newPair.y;\n"
|
||||
" addedHostPairsGPU[curPair].z = NEW_PAIR_MARKER;\n"
|
||||
" addedHostPairsGPU[curPair].w = NEW_PAIR_MARKER;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" } \n"
|
||||
" else\n"
|
||||
" {\n"
|
||||
" if (!overlap && prevOverlap)\n"
|
||||
" {\n"
|
||||
" \n"
|
||||
" //remove a pair\n"
|
||||
" int4 removedPair;\n"
|
||||
" if (i<=otherIndex)\n"
|
||||
" {\n"
|
||||
" removedPair.x = i;\n"
|
||||
" removedPair.y = otherIndex;\n"
|
||||
" } else\n"
|
||||
" {\n"
|
||||
" removedPair.x = otherIndex;\n"
|
||||
" removedPair.y = i;\n"
|
||||
" }\n"
|
||||
" {\n"
|
||||
" int curPair = atomic_inc(removedHostPairsCount);\n"
|
||||
" if (curPair<maxCapacity)\n"
|
||||
" {\n"
|
||||
" \n"
|
||||
" removedHostPairsGPU[curPair].x = removedPair.x;\n"
|
||||
" removedHostPairsGPU[curPair].y = removedPair.y;\n"
|
||||
" removedHostPairsGPU[curPair].z = REMOVED_PAIR_MARKER;\n"
|
||||
" removedHostPairsGPU[curPair].w = REMOVED_PAIR_MARKER;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" }//otherisMax\n"
|
||||
" }//if (dmin<0)\n"
|
||||
" }//if (otherIndex!=i)\n"
|
||||
" }//for (int j=\n"
|
||||
" }\n"
|
||||
" \n"
|
||||
" if (dmax!=0)\n"
|
||||
" {\n"
|
||||
" int stepMax = dmax<0 ? -1 : 1;\n"
|
||||
" for (int j=prevMaxIndex;j!=curMaxIndex;j+=stepMax)\n"
|
||||
" {\n"
|
||||
" int otherIndex2 = sortedAxisGPU[axis][otherbuffer][j].y;\n"
|
||||
" int otherIndex = otherIndex2/2;\n"
|
||||
" if (otherIndex!=i)\n"
|
||||
" {\n"
|
||||
" bool otherIsMin = ((otherIndex2&1)==0);\n"
|
||||
" if (otherIsMin)\n"
|
||||
" {\n"
|
||||
" \n"
|
||||
" bool overlap = true;\n"
|
||||
" for (int ax=0;ax<3;ax++)\n"
|
||||
" {\n"
|
||||
" if ((objectMinMaxIndexGPU[ax][m_currentBuffer][i].x > objectMinMaxIndexGPU[ax][m_currentBuffer][otherIndex].y) ||\n"
|
||||
" (objectMinMaxIndexGPU[ax][m_currentBuffer][i].y < objectMinMaxIndexGPU[ax][m_currentBuffer][otherIndex].x))\n"
|
||||
" overlap=false;\n"
|
||||
" }\n"
|
||||
" //b3Assert(overlap2==overlap);\n"
|
||||
" bool prevOverlap = true;\n"
|
||||
" for (int ax=0;ax<3;ax++)\n"
|
||||
" {\n"
|
||||
" if ((objectMinMaxIndexGPU[ax][1-m_currentBuffer][i].x > objectMinMaxIndexGPU[ax][1-m_currentBuffer][otherIndex].y) ||\n"
|
||||
" (objectMinMaxIndexGPU[ax][1-m_currentBuffer][i].y < objectMinMaxIndexGPU[ax][1-m_currentBuffer][otherIndex].x))\n"
|
||||
" prevOverlap=false;\n"
|
||||
" }\n"
|
||||
" \n"
|
||||
" if (dmax>0)\n"
|
||||
" {\n"
|
||||
" if (overlap && !prevOverlap)\n"
|
||||
" {\n"
|
||||
" //add a pair\n"
|
||||
" int4 newPair;\n"
|
||||
" if (i<=otherIndex)\n"
|
||||
" {\n"
|
||||
" newPair.x = i;\n"
|
||||
" newPair.y = otherIndex;\n"
|
||||
" } else\n"
|
||||
" {\n"
|
||||
" newPair.x = otherIndex;\n"
|
||||
" newPair.y = i;\n"
|
||||
" }\n"
|
||||
" {\n"
|
||||
" int curPair = atomic_inc(addedHostPairsCount);\n"
|
||||
" if (curPair<maxCapacity)\n"
|
||||
" {\n"
|
||||
" \n"
|
||||
" addedHostPairsGPU[curPair].x = newPair.x;\n"
|
||||
" addedHostPairsGPU[curPair].y = newPair.y;\n"
|
||||
" addedHostPairsGPU[curPair].z = NEW_PAIR_MARKER;\n"
|
||||
" addedHostPairsGPU[curPair].w = NEW_PAIR_MARKER;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" \n"
|
||||
" }\n"
|
||||
" } \n"
|
||||
" else\n"
|
||||
" {\n"
|
||||
" if (!overlap && prevOverlap)\n"
|
||||
" {\n"
|
||||
" //if (otherIndex2&1==0) -> min?\n"
|
||||
" //remove a pair\n"
|
||||
" int4 removedPair;\n"
|
||||
" if (i<=otherIndex)\n"
|
||||
" {\n"
|
||||
" removedPair.x = i;\n"
|
||||
" removedPair.y = otherIndex;\n"
|
||||
" } else\n"
|
||||
" {\n"
|
||||
" removedPair.x = otherIndex;\n"
|
||||
" removedPair.y = i;\n"
|
||||
" }\n"
|
||||
" {\n"
|
||||
" int curPair = atomic_inc(removedHostPairsCount);\n"
|
||||
" if (curPair<maxCapacity)\n"
|
||||
" {\n"
|
||||
" \n"
|
||||
" removedHostPairsGPU[curPair].x = removedPair.x;\n"
|
||||
" removedHostPairsGPU[curPair].y = removedPair.y;\n"
|
||||
" removedHostPairsGPU[curPair].z = REMOVED_PAIR_MARKER;\n"
|
||||
" removedHostPairsGPU[curPair].w = REMOVED_PAIR_MARKER;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" \n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" \n"
|
||||
" }//if (dmin<0)\n"
|
||||
" }//if (otherIndex!=i)\n"
|
||||
" }//for (int j=\n"
|
||||
" }\n"
|
||||
" }//for (int otherbuffer\n"
|
||||
" }//for (int axis=0;\n"
|
||||
"}\n"
|
||||
"__kernel void computePairsKernelLocalSharedMemoryBatchWrite( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" int localId = get_local_id(0);\n"
|
||||
" __local int numActiveWgItems[1];\n"
|
||||
" __local int breakRequest[1];\n"
|
||||
" __local btAabbCL localAabbs[128];// = aabbs[i];\n"
|
||||
" \n"
|
||||
" int2 myPairs[64];\n"
|
||||
" \n"
|
||||
" btAabbCL myAabb;\n"
|
||||
" \n"
|
||||
" myAabb = (i<numObjects)? aabbs[i]:aabbs[0];\n"
|
||||
" float testValue = myAabb.m_maxElems[axis];\n"
|
||||
" \n"
|
||||
" if (localId==0)\n"
|
||||
" {\n"
|
||||
" numActiveWgItems[0] = 0;\n"
|
||||
" breakRequest[0] = 0;\n"
|
||||
" }\n"
|
||||
" int localCount=0;\n"
|
||||
" int block=0;\n"
|
||||
" localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];\n"
|
||||
" localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];\n"
|
||||
" \n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" atomic_inc(numActiveWgItems);\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" int localBreak = 0;\n"
|
||||
" int curNumPairs = 0;\n"
|
||||
" \n"
|
||||
" int j=i+1;\n"
|
||||
" do\n"
|
||||
" {\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" \n"
|
||||
" if (j<numObjects)\n"
|
||||
" {\n"
|
||||
" if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis])) \n"
|
||||
" {\n"
|
||||
" if (!localBreak)\n"
|
||||
" {\n"
|
||||
" atomic_inc(breakRequest);\n"
|
||||
" localBreak = 1;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" \n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" \n"
|
||||
" if (j>=numObjects && !localBreak)\n"
|
||||
" {\n"
|
||||
" atomic_inc(breakRequest);\n"
|
||||
" localBreak = 1;\n"
|
||||
" }\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" \n"
|
||||
" if (!localBreak)\n"
|
||||
" {\n"
|
||||
" if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))\n"
|
||||
" {\n"
|
||||
" int2 myPair;\n"
|
||||
" myPair.x = myAabb.m_minIndices[3];\n"
|
||||
" myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];\n"
|
||||
" myPairs[curNumPairs] = myPair;\n"
|
||||
" curNumPairs++;\n"
|
||||
" if (curNumPairs==64)\n"
|
||||
" {\n"
|
||||
" int curPair = atomic_add(pairCount,curNumPairs);\n"
|
||||
" for (int p=0;p<curNumPairs;p++)\n"
|
||||
" {\n"
|
||||
" if ((curPair+p)<maxPairs)\n"
|
||||
" {\n"
|
||||
" int4 tmpPair;\n"
|
||||
" tmpPair.x = myPairs[p].x;\n"
|
||||
" tmpPair.y = myPairs[p].y;\n"
|
||||
" tmpPair.z = NEW_PAIR_MARKER;\n"
|
||||
" tmpPair.w = NEW_PAIR_MARKER;\n"
|
||||
" \n"
|
||||
" pairsOut[curPair+p] = tmpPair; //flush to main memory\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" curNumPairs = 0;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" \n"
|
||||
" localCount++;\n"
|
||||
" if (localCount==64)\n"
|
||||
" {\n"
|
||||
" localCount = 0;\n"
|
||||
" block+=64; \n"
|
||||
" localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];\n"
|
||||
" localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];\n"
|
||||
" }\n"
|
||||
" j++;\n"
|
||||
" \n"
|
||||
" } while (breakRequest[0]<numActiveWgItems[0]);\n"
|
||||
" \n"
|
||||
" \n"
|
||||
" if (curNumPairs>0)\n"
|
||||
" {\n"
|
||||
" //avoid a buffer overrun\n"
|
||||
" int curPair = atomic_add(pairCount,curNumPairs);\n"
|
||||
" for (int p=0;p<curNumPairs;p++)\n"
|
||||
" {\n"
|
||||
" if ((curPair+p)<maxPairs)\n"
|
||||
" {\n"
|
||||
" int4 tmpPair;\n"
|
||||
" tmpPair.x = myPairs[p].x;\n"
|
||||
" tmpPair.y = myPairs[p].y;\n"
|
||||
" tmpPair.z = NEW_PAIR_MARKER;\n"
|
||||
" tmpPair.w = NEW_PAIR_MARKER;\n"
|
||||
" pairsOut[curPair+p] = tmpPair; //flush to main memory\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" curNumPairs = 0;\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
;
|
||||
Reference in New Issue
Block a user