made the simulation deterministic
disable 'simdwidth' optimization for determinism (need to double-check) made the spatial batching 3D
This commit is contained in:
@@ -1,7 +1,10 @@
|
||||
|
||||
bool b3GpuBatchContacts = true;
|
||||
bool b3GpuSolveConstraint = true;
|
||||
|
||||
bool gpuRadixSort=true;
|
||||
bool gpuSetSortData = true;
|
||||
bool gpuSortContacts = true;
|
||||
bool optionalSortContactsDeterminism = true;
|
||||
|
||||
#include "b3GpuBatchingPgsSolver.h"
|
||||
#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
|
||||
@@ -31,15 +34,6 @@ bool b3GpuSolveConstraint = true;
|
||||
|
||||
|
||||
|
||||
enum
|
||||
{
|
||||
B3_SOLVER_N_SPLIT = 16,
|
||||
B3_SOLVER_N_BATCHES = 4,
|
||||
B3_SOLVER_N_OBJ_PER_SPLIT = 10,
|
||||
B3_SOLVER_N_TASKS_PER_BATCH = B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT,
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
struct b3GpuBatchingPgsSolverInternalData
|
||||
@@ -65,6 +59,10 @@ struct b3GpuBatchingPgsSolverInternalData
|
||||
cl_kernel m_reorderContactKernel;
|
||||
cl_kernel m_copyConstraintKernel;
|
||||
|
||||
cl_kernel m_setDeterminismSortDataBodyAKernel;
|
||||
cl_kernel m_setDeterminismSortDataBodyBKernel;
|
||||
|
||||
|
||||
class b3RadixSort32CL* m_sort32;
|
||||
class b3BoundSearchCL* m_search;
|
||||
class b3PrefixScanCL* m_scan;
|
||||
@@ -75,6 +73,9 @@ struct b3GpuBatchingPgsSolverInternalData
|
||||
b3OpenCLArray<b3RigidBodyCL>* m_bodyBufferGPU;
|
||||
b3OpenCLArray<b3InertiaCL>* m_inertiaBufferGPU;
|
||||
b3OpenCLArray<b3Contact4>* m_pBufContactOutGPU;
|
||||
|
||||
b3OpenCLArray<b3Contact4>* m_pBufContactOutGPUCopy;
|
||||
b3OpenCLArray<b3SortData>* m_contactKeyValues;
|
||||
|
||||
|
||||
b3AlignedObjectArray<unsigned int> m_idxBuffer;
|
||||
@@ -86,6 +87,7 @@ struct b3GpuBatchingPgsSolverInternalData
|
||||
|
||||
b3GpuBatchingPgsSolver::b3GpuBatchingPgsSolver(cl_context ctx,cl_device_id device, cl_command_queue q,int pairCapacity)
|
||||
{
|
||||
m_debugOutput=0;
|
||||
m_data = new b3GpuBatchingPgsSolverInternalData;
|
||||
m_data->m_context = ctx;
|
||||
m_data->m_device = device;
|
||||
@@ -97,24 +99,28 @@ b3GpuBatchingPgsSolver::b3GpuBatchingPgsSolver(cl_context ctx,cl_device_id devic
|
||||
m_data->m_inertiaBufferGPU = new b3OpenCLArray<b3InertiaCL>(ctx,q);
|
||||
m_data->m_pBufContactOutGPU = new b3OpenCLArray<b3Contact4>(ctx,q);
|
||||
|
||||
m_data->m_pBufContactOutGPUCopy = new b3OpenCLArray<b3Contact4>(ctx,q);
|
||||
m_data->m_contactKeyValues = new b3OpenCLArray<b3SortData>(ctx,q);
|
||||
|
||||
|
||||
m_data->m_solverGPU = new b3Solver(ctx,device,q,512*1024);
|
||||
|
||||
m_data->m_sort32 = new b3RadixSort32CL(ctx,device,m_data->m_queue);
|
||||
m_data->m_scan = new b3PrefixScanCL(ctx,device,m_data->m_queue,B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT);
|
||||
m_data->m_search = new b3BoundSearchCL(ctx,device,m_data->m_queue,B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT);
|
||||
m_data->m_scan = new b3PrefixScanCL(ctx,device,m_data->m_queue,B3_SOLVER_N_CELLS);
|
||||
m_data->m_search = new b3BoundSearchCL(ctx,device,m_data->m_queue,B3_SOLVER_N_CELLS);
|
||||
|
||||
const int sortSize = B3NEXTMULTIPLEOF( pairCapacity, 512 );
|
||||
|
||||
m_data->m_sortDataBuffer = new b3OpenCLArray<b3SortData>(ctx,m_data->m_queue,sortSize);
|
||||
m_data->m_contactBuffer = new b3OpenCLArray<b3Contact4>(ctx,m_data->m_queue);
|
||||
|
||||
m_data->m_numConstraints = new b3OpenCLArray<unsigned int>(ctx,m_data->m_queue,B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT );
|
||||
m_data->m_numConstraints->resize(B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT);
|
||||
m_data->m_numConstraints = new b3OpenCLArray<unsigned int>(ctx,m_data->m_queue,B3_SOLVER_N_CELLS);
|
||||
m_data->m_numConstraints->resize(B3_SOLVER_N_CELLS);
|
||||
|
||||
m_data->m_contactCGPU = new b3OpenCLArray<b3GpuConstraint4>(ctx,q,pairCapacity);
|
||||
|
||||
m_data->m_offsets = new b3OpenCLArray<unsigned int>( ctx,m_data->m_queue, B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT );
|
||||
m_data->m_offsets->resize(B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT);
|
||||
m_data->m_offsets = new b3OpenCLArray<unsigned int>( ctx,m_data->m_queue,B3_SOLVER_N_CELLS);
|
||||
m_data->m_offsets->resize(B3_SOLVER_N_CELLS);
|
||||
const char* additionalMacros = "";
|
||||
const char* srcFileNameForCaching="";
|
||||
|
||||
@@ -132,7 +138,7 @@ b3GpuBatchingPgsSolver::b3GpuBatchingPgsSolver(cl_context ctx,cl_device_id devic
|
||||
|
||||
{
|
||||
|
||||
cl_program solveContactProg= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solveContactSource, &pErrNum,additionalMacros, B3_SOLVER_CONTACT_KERNEL_PATH);
|
||||
cl_program solveContactProg= b3OpenCLUtils::compileCLProgramFromString( ctx, device, 0, &pErrNum,additionalMacros, B3_SOLVER_CONTACT_KERNEL_PATH,true);
|
||||
b3Assert(solveContactProg);
|
||||
|
||||
cl_program solveFrictionProg= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solveFrictionSource, &pErrNum,additionalMacros, B3_SOLVER_FRICTION_KERNEL_PATH);
|
||||
@@ -149,7 +155,7 @@ b3GpuBatchingPgsSolver::b3GpuBatchingPgsSolver(cl_context ctx,cl_device_id devic
|
||||
m_data->m_solveFrictionKernel= b3OpenCLUtils::compileCLKernelFromString( ctx, device, solveFrictionSource, "BatchSolveKernelFriction", &pErrNum, solveFrictionProg,additionalMacros );
|
||||
b3Assert(m_data->m_solveFrictionKernel);
|
||||
|
||||
m_data->m_solveContactKernel= b3OpenCLUtils::compileCLKernelFromString( ctx, device, solveContactSource, "BatchSolveKernelContact", &pErrNum, solveContactProg,additionalMacros );
|
||||
m_data->m_solveContactKernel= b3OpenCLUtils::compileCLKernelFromString( ctx, device, 0, "BatchSolveKernelContact", &pErrNum, solveContactProg,additionalMacros );
|
||||
b3Assert(m_data->m_solveContactKernel);
|
||||
|
||||
m_data->m_contactToConstraintKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetupSource, "ContactToConstraintKernel", &pErrNum, solverSetupProg,additionalMacros );
|
||||
@@ -157,7 +163,13 @@ b3GpuBatchingPgsSolver::b3GpuBatchingPgsSolver(cl_context ctx,cl_device_id devic
|
||||
|
||||
m_data->m_setSortDataKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "SetSortDataKernel", &pErrNum, solverSetup2Prog,additionalMacros );
|
||||
b3Assert(m_data->m_setSortDataKernel);
|
||||
|
||||
|
||||
m_data->m_setDeterminismSortDataBodyAKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "SetDeterminismSortDataBodyA", &pErrNum, solverSetup2Prog,additionalMacros );
|
||||
b3Assert(m_data->m_setDeterminismSortDataBodyAKernel);
|
||||
|
||||
m_data->m_setDeterminismSortDataBodyBKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "SetDeterminismSortDataBodyB", &pErrNum, solverSetup2Prog,additionalMacros );
|
||||
b3Assert(m_data->m_setDeterminismSortDataBodyBKernel);
|
||||
|
||||
m_data->m_reorderContactKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "ReorderContactKernel", &pErrNum, solverSetup2Prog,additionalMacros );
|
||||
b3Assert(m_data->m_reorderContactKernel);
|
||||
|
||||
@@ -196,6 +208,10 @@ b3GpuBatchingPgsSolver::~b3GpuBatchingPgsSolver()
|
||||
delete m_data->m_bodyBufferGPU;
|
||||
delete m_data->m_inertiaBufferGPU;
|
||||
delete m_data->m_pBufContactOutGPU;
|
||||
delete m_data->m_pBufContactOutGPUCopy;
|
||||
delete m_data->m_contactKeyValues;
|
||||
|
||||
|
||||
|
||||
delete m_data->m_contactCGPU;
|
||||
delete m_data->m_numConstraints;
|
||||
@@ -232,23 +248,24 @@ struct b3ConstraintCfg
|
||||
float m_positionConstraintCoeff;
|
||||
float m_dt;
|
||||
bool m_enableParallelSolve;
|
||||
float m_averageExtent;
|
||||
float m_batchCellSize;
|
||||
int m_staticIdx;
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
void b3GpuBatchingPgsSolver::solveContactConstraint( const b3OpenCLArray<b3RigidBodyCL>* bodyBuf, const b3OpenCLArray<b3InertiaCL>* shapeBuf,
|
||||
b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n ,int maxNumBatches,int numIterations)
|
||||
{
|
||||
|
||||
//sort the contacts
|
||||
|
||||
|
||||
b3Int4 cdata = b3MakeInt4( n, 0, 0, 0 );
|
||||
{
|
||||
|
||||
const int nn = B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT;
|
||||
const int nn = B3_SOLVER_N_CELLS;
|
||||
|
||||
cdata.x = 0;
|
||||
cdata.y = maxNumBatches;//250;
|
||||
@@ -276,7 +293,7 @@ void b3GpuBatchingPgsSolver::solveContactConstraint( const b3OpenCLArray<b3Rigi
|
||||
|
||||
|
||||
cdata.z = ib;
|
||||
cdata.w = B3_SOLVER_N_SPLIT;
|
||||
|
||||
|
||||
b3LauncherCL launcher( m_data->m_queue, m_data->m_solveContactKernel );
|
||||
#if 1
|
||||
@@ -286,8 +303,8 @@ void b3GpuBatchingPgsSolver::solveContactConstraint( const b3OpenCLArray<b3Rigi
|
||||
b3BufferInfoCL( bodyBuf->getBufferCL() ),
|
||||
b3BufferInfoCL( shapeBuf->getBufferCL() ),
|
||||
b3BufferInfoCL( constraint->getBufferCL() ),
|
||||
b3BufferInfoCL( m_data->m_numConstraints->getBufferCL() ),
|
||||
b3BufferInfoCL( m_data->m_offsets->getBufferCL() )
|
||||
b3BufferInfoCL( m_data->m_solverGPU->m_numConstraints->getBufferCL() ),
|
||||
b3BufferInfoCL( m_data->m_solverGPU->m_offsets->getBufferCL() )
|
||||
#ifdef DEBUG_ME
|
||||
, b3BufferInfoCL(&gpuDebugInfo)
|
||||
#endif
|
||||
@@ -299,7 +316,12 @@ void b3GpuBatchingPgsSolver::solveContactConstraint( const b3OpenCLArray<b3Rigi
|
||||
//launcher.setConst( cdata.x );
|
||||
launcher.setConst( cdata.y );
|
||||
launcher.setConst( cdata.z );
|
||||
launcher.setConst( cdata.w );
|
||||
b3Int4 nSplit;
|
||||
nSplit.x = B3_SOLVER_N_SPLIT_X;
|
||||
nSplit.y = B3_SOLVER_N_SPLIT_Y;
|
||||
nSplit.z = B3_SOLVER_N_SPLIT_Z;
|
||||
|
||||
launcher.setConst( nSplit );
|
||||
launcher.launch1D( numWorkItems, 64 );
|
||||
|
||||
|
||||
@@ -366,14 +388,14 @@ void b3GpuBatchingPgsSolver::solveContactConstraint( const b3OpenCLArray<b3Rigi
|
||||
for(int ib=0; ib<B3_SOLVER_N_BATCHES; ib++)
|
||||
{
|
||||
cdata.z = ib;
|
||||
cdata.w = B3_SOLVER_N_SPLIT;
|
||||
|
||||
|
||||
b3BufferInfoCL bInfo[] = {
|
||||
b3BufferInfoCL( bodyBuf->getBufferCL() ),
|
||||
b3BufferInfoCL( shapeBuf->getBufferCL() ),
|
||||
b3BufferInfoCL( constraint->getBufferCL() ),
|
||||
b3BufferInfoCL( m_data->m_numConstraints->getBufferCL() ),
|
||||
b3BufferInfoCL( m_data->m_offsets->getBufferCL() )
|
||||
b3BufferInfoCL( m_data->m_solverGPU->m_numConstraints->getBufferCL() ),
|
||||
b3BufferInfoCL( m_data->m_solverGPU->m_offsets->getBufferCL() )
|
||||
#ifdef DEBUG_ME
|
||||
,b3BufferInfoCL(&gpuDebugInfo)
|
||||
#endif //DEBUG_ME
|
||||
@@ -383,7 +405,13 @@ void b3GpuBatchingPgsSolver::solveContactConstraint( const b3OpenCLArray<b3Rigi
|
||||
//launcher.setConst( cdata.x );
|
||||
launcher.setConst( cdata.y );
|
||||
launcher.setConst( cdata.z );
|
||||
launcher.setConst( cdata.w );
|
||||
|
||||
b3Int4 nSplit;
|
||||
nSplit.x = B3_SOLVER_N_SPLIT_X;
|
||||
nSplit.y = B3_SOLVER_N_SPLIT_Y;
|
||||
nSplit.z = B3_SOLVER_N_SPLIT_Z;
|
||||
|
||||
launcher.setConst( nSplit );
|
||||
|
||||
launcher.launch1D( 64*nn/B3_SOLVER_N_BATCHES, 64 );
|
||||
}
|
||||
@@ -414,14 +442,196 @@ static bool sortfnc(const b3SortData& a,const b3SortData& b)
|
||||
return (a.m_key<b.m_key);
|
||||
}
|
||||
|
||||
static bool b3ContactCmp(const b3Contact4& p, const b3Contact4& q)
|
||||
{
|
||||
return ((p.m_bodyAPtrAndSignBit<q.m_bodyAPtrAndSignBit) ||
|
||||
(p.m_bodyAPtrAndSignBit==q.m_bodyAPtrAndSignBit) && (p.m_bodyBPtrAndSignBit<q.m_bodyBPtrAndSignBit));
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
static const int gridTable4x4[] =
|
||||
{
|
||||
0,1,17,16,
|
||||
1,2,18,19,
|
||||
17,18,32,3,
|
||||
16,19,3,34
|
||||
};
|
||||
|
||||
static const int gridTable8x8[] =
|
||||
{
|
||||
0, 2, 3, 16, 17, 18, 19, 1,
|
||||
66, 64, 80, 67, 82, 81, 65, 83,
|
||||
131,144,128,130,147,129,145,146,
|
||||
208,195,194,192,193,211,210,209,
|
||||
21, 22, 23, 5, 4, 6, 7, 20,
|
||||
86, 85, 69, 87, 70, 68, 84, 71,
|
||||
151,133,149,150,135,148,132,134,
|
||||
197,27,214,213,212,199,198,196
|
||||
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
#define USE_SPATIAL_BATCHING 1
|
||||
#define USE_4x4_GRID 1
|
||||
|
||||
|
||||
void SetSortDataCPU(b3Contact4* gContact, b3RigidBodyCL* gBodies, b3SortData* gSortDataOut, int nContacts,float scale,const b3Int4& nSplit,int staticIdx)
|
||||
{
|
||||
for (int gIdx=0;gIdx<nContacts;gIdx++)
|
||||
{
|
||||
if( gIdx < nContacts )
|
||||
{
|
||||
int aPtrAndSignBit = gContact[gIdx].m_bodyAPtrAndSignBit;
|
||||
int bPtrAndSignBit = gContact[gIdx].m_bodyBPtrAndSignBit;
|
||||
|
||||
int aIdx = abs(aPtrAndSignBit );
|
||||
int bIdx = abs(bPtrAndSignBit);
|
||||
|
||||
bool aStatic = (aPtrAndSignBit<0) ||(aPtrAndSignBit==staticIdx);
|
||||
bool bStatic = (bPtrAndSignBit<0) ||(bPtrAndSignBit==staticIdx);
|
||||
|
||||
#if USE_SPATIAL_BATCHING
|
||||
int idx = (aStatic)? bIdx: aIdx;
|
||||
b3Vector3 p = gBodies[idx].m_pos;
|
||||
int xIdx = (int)((p.x-((p.x<0.f)?1.f:0.f))*scale) & (nSplit.x-1);
|
||||
int yIdx = (int)((p.y-((p.y<0.f)?1.f:0.f))*scale) & (nSplit.y-1);
|
||||
int zIdx = (int)((p.z-((p.z<0.f)?1.f:0.f))*scale) & (nSplit.z-1);
|
||||
|
||||
int newIndex = (xIdx+yIdx*nSplit.x+zIdx*nSplit.x*nSplit.y);
|
||||
|
||||
#else//USE_SPATIAL_BATCHING
|
||||
#if USE_4x4_GRID
|
||||
int aa = aIdx&3;
|
||||
int bb = bIdx&3;
|
||||
if (aStatic)
|
||||
aa = bb;
|
||||
if (bStatic)
|
||||
bb = aa;
|
||||
|
||||
int gridIndex = aa + bb*4;
|
||||
int newIndex = gridTable4x4[gridIndex];
|
||||
#else//USE_4x4_GRID
|
||||
int aa = aIdx&7;
|
||||
int bb = bIdx&7;
|
||||
if (aStatic)
|
||||
aa = bb;
|
||||
if (bStatic)
|
||||
bb = aa;
|
||||
|
||||
int gridIndex = aa + bb*8;
|
||||
int newIndex = gridTable8x8[gridIndex];
|
||||
#endif//USE_4x4_GRID
|
||||
#endif//USE_SPATIAL_BATCHING
|
||||
|
||||
|
||||
gSortDataOut[gIdx].x = newIndex;
|
||||
gSortDataOut[gIdx].y = gIdx;
|
||||
}
|
||||
else
|
||||
{
|
||||
gSortDataOut[gIdx].x = 0xffffffff;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
void b3GpuBatchingPgsSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem inertiaBuf, int numContacts, cl_mem contactBuf, const b3Config& config, int static0Index)
|
||||
{
|
||||
B3_PROFILE("solveContacts");
|
||||
m_data->m_bodyBufferGPU->setFromOpenCLBuffer(bodyBuf,numBodies);
|
||||
m_data->m_inertiaBufferGPU->setFromOpenCLBuffer(inertiaBuf,numBodies);
|
||||
m_data->m_pBufContactOutGPU->setFromOpenCLBuffer(contactBuf,numContacts);
|
||||
|
||||
if (optionalSortContactsDeterminism)
|
||||
{
|
||||
if (gpuSortContacts)
|
||||
{
|
||||
B3_PROFILE("GPU Sort contact constraints (determinism)");
|
||||
|
||||
m_data->m_pBufContactOutGPUCopy->resize(numContacts);
|
||||
m_data->m_contactKeyValues->resize(numContacts);
|
||||
|
||||
m_data->m_pBufContactOutGPU->copyToCL(m_data->m_pBufContactOutGPUCopy->getBufferCL(),numContacts,0,0);
|
||||
|
||||
|
||||
{
|
||||
b3LauncherCL launcher(m_data->m_queue, m_data->m_setDeterminismSortDataBodyAKernel);
|
||||
launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL());
|
||||
launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL());
|
||||
launcher.setConst(numContacts);
|
||||
launcher.launch1D( numContacts, 64 );
|
||||
}
|
||||
|
||||
m_data->m_solverGPU->m_sort32->execute(*m_data->m_contactKeyValues);
|
||||
|
||||
{
|
||||
b3LauncherCL launcher(m_data->m_queue, m_data->m_setDeterminismSortDataBodyBKernel);
|
||||
launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL());
|
||||
launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL());
|
||||
launcher.setConst(numContacts);
|
||||
launcher.launch1D( numContacts, 64 );
|
||||
}
|
||||
|
||||
m_data->m_solverGPU->m_sort32->execute(*m_data->m_contactKeyValues);
|
||||
|
||||
//__global Contact4* in, __global Contact4* out, __global int2* sortData, int4 cb )
|
||||
|
||||
{
|
||||
B3_PROFILE("gpu reorderContactKernel (determinism)");
|
||||
|
||||
b3Int4 cdata;
|
||||
cdata.x = numContacts;
|
||||
|
||||
//b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_data->m_pBufContactOutGPU->getBufferCL() ), b3BufferInfoCL( m_data->m_solverGPU->m_contactBuffer2->getBufferCL())
|
||||
// , b3BufferInfoCL( m_data->m_solverGPU->m_sortDataBuffer->getBufferCL()) };
|
||||
b3LauncherCL launcher(m_data->m_queue,m_data->m_solverGPU->m_reorderContactKernel);
|
||||
launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL());
|
||||
launcher.setBuffer(m_data->m_pBufContactOutGPU->getBufferCL());
|
||||
launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL());
|
||||
launcher.setConst( cdata );
|
||||
launcher.launch1D( numContacts, 64 );
|
||||
}
|
||||
|
||||
} else
|
||||
{
|
||||
B3_PROFILE("CPU Sort contact constraints (determinism)");
|
||||
b3AlignedObjectArray<b3Contact4> cpuConstraints;
|
||||
m_data->m_pBufContactOutGPU->copyToHost(cpuConstraints);
|
||||
bool sort = true;
|
||||
if (sort)
|
||||
{
|
||||
cpuConstraints.quickSort(b3ContactCmp);
|
||||
|
||||
for (int i=0;i<cpuConstraints.size();i++)
|
||||
{
|
||||
cpuConstraints[i].m_batchIdx = i;
|
||||
}
|
||||
}
|
||||
m_data->m_pBufContactOutGPU->copyFromHost(cpuConstraints);
|
||||
if (m_debugOutput==100)
|
||||
{
|
||||
for (int i=0;i<cpuConstraints.size();i++)
|
||||
{
|
||||
printf("c[%d].m_bodyA = %d, m_bodyB = %d, batchId = %d\n",i,cpuConstraints[i].m_bodyAPtrAndSignBit,cpuConstraints[i].m_bodyBPtrAndSignBit, cpuConstraints[i].m_batchIdx);
|
||||
}
|
||||
}
|
||||
|
||||
m_debugOutput++;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
int nContactOut = m_data->m_pBufContactOutGPU->size();
|
||||
|
||||
bool useSolver = true;
|
||||
@@ -431,7 +641,7 @@ void b3GpuBatchingPgsSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem
|
||||
float dt=1./60.;
|
||||
b3ConstraintCfg csCfg( dt );
|
||||
csCfg.m_enableParallelSolve = true;
|
||||
csCfg.m_averageExtent = 0.3;//0.1;//2;//.2f;//@TODO m_averageObjExtent;
|
||||
csCfg.m_batchCellSize = 6;
|
||||
csCfg.m_staticIdx = static0Index;
|
||||
|
||||
|
||||
@@ -485,7 +695,9 @@ void b3GpuBatchingPgsSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem
|
||||
|
||||
b3OpenCLArray<unsigned int>* countsNative = m_data->m_solverGPU->m_numConstraints;
|
||||
b3OpenCLArray<unsigned int>* offsetsNative = m_data->m_solverGPU->m_offsets;
|
||||
|
||||
|
||||
|
||||
if (gpuSetSortData)
|
||||
{ // 2. set cell idx
|
||||
B3_PROFILE("GPU set cell idx");
|
||||
struct CB
|
||||
@@ -493,15 +705,17 @@ void b3GpuBatchingPgsSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem
|
||||
int m_nContacts;
|
||||
int m_staticIdx;
|
||||
float m_scale;
|
||||
int m_nSplit;
|
||||
b3Int4 m_nSplit;
|
||||
};
|
||||
|
||||
b3Assert( sortSize%64 == 0 );
|
||||
CB cdata;
|
||||
cdata.m_nContacts = nContacts;
|
||||
cdata.m_staticIdx = csCfg.m_staticIdx;
|
||||
cdata.m_scale = 1.f/(B3_SOLVER_N_OBJ_PER_SPLIT*csCfg.m_averageExtent);
|
||||
cdata.m_nSplit = B3_SOLVER_N_SPLIT;
|
||||
cdata.m_scale = 1.f/csCfg.m_batchCellSize;
|
||||
cdata.m_nSplit.x = B3_SOLVER_N_SPLIT_X;
|
||||
cdata.m_nSplit.y = B3_SOLVER_N_SPLIT_Y;
|
||||
cdata.m_nSplit.z = B3_SOLVER_N_SPLIT_Z;
|
||||
|
||||
m_data->m_solverGPU->m_sortDataBuffer->resize(nContacts);
|
||||
|
||||
@@ -516,10 +730,30 @@ void b3GpuBatchingPgsSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem
|
||||
|
||||
|
||||
launcher.launch1D( sortSize, 64 );
|
||||
}
|
||||
} else
|
||||
{
|
||||
m_data->m_solverGPU->m_sortDataBuffer->resize(nContacts);
|
||||
b3AlignedObjectArray<b3SortData> sortDataCPU;
|
||||
m_data->m_solverGPU->m_sortDataBuffer->copyToHost(sortDataCPU);
|
||||
|
||||
b3AlignedObjectArray<b3Contact4> contactCPU;
|
||||
m_data->m_pBufContactOutGPU->copyToHost(contactCPU);
|
||||
b3AlignedObjectArray<b3RigidBodyCL> bodiesCPU;
|
||||
bodyBuf->copyToHost(bodiesCPU);
|
||||
float scale = 1.f/csCfg.m_batchCellSize;
|
||||
b3Int4 nSplit;
|
||||
nSplit.x = B3_SOLVER_N_SPLIT_X;
|
||||
nSplit.y = B3_SOLVER_N_SPLIT_Y;
|
||||
nSplit.z = B3_SOLVER_N_SPLIT_Z;
|
||||
|
||||
SetSortDataCPU(&contactCPU[0], &bodiesCPU[0], &sortDataCPU[0], nContacts,scale,nSplit,csCfg.m_staticIdx);
|
||||
|
||||
|
||||
m_data->m_solverGPU->m_sortDataBuffer->copyFromHost(sortDataCPU);
|
||||
}
|
||||
|
||||
|
||||
|
||||
bool gpuRadixSort=true;
|
||||
if (gpuRadixSort)
|
||||
{ // 3. sort by cell idx
|
||||
B3_PROFILE("gpuRadixSort");
|
||||
@@ -543,21 +777,21 @@ void b3GpuBatchingPgsSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem
|
||||
keyValuesInOut.copyFromHost(hostValues);
|
||||
}
|
||||
|
||||
|
||||
{
|
||||
// 4. find entries
|
||||
B3_PROFILE("gpuBoundSearch");
|
||||
|
||||
m_data->m_solverGPU->m_search->execute(*m_data->m_solverGPU->m_sortDataBuffer,nContacts,*countsNative,
|
||||
B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT,b3BoundSearchCL::COUNT);
|
||||
|
||||
m_data->m_solverGPU->m_search->execute(*m_data->m_solverGPU->m_sortDataBuffer,nContacts,*countsNative,B3_SOLVER_N_CELLS,b3BoundSearchCL::COUNT);
|
||||
|
||||
|
||||
//adl::BoundSearch<adl::TYPE_CL>::execute( data->m_search, *data->m_sortDataBuffer, nContacts, *countsNative,
|
||||
// B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT, adl::BoundSearchBase::COUNT );
|
||||
|
||||
//unsigned int sum;
|
||||
m_data->m_solverGPU->m_scan->execute(*countsNative,*offsetsNative, B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT);//,&sum );
|
||||
m_data->m_solverGPU->m_scan->execute(*countsNative,*offsetsNative, B3_SOLVER_N_CELLS);//,&sum );
|
||||
//printf("sum = %d\n",sum);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -640,7 +874,7 @@ void b3GpuBatchingPgsSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem
|
||||
|
||||
{
|
||||
B3_PROFILE("batch grid");
|
||||
for(int i=0; i<B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT; i++)
|
||||
for(int i=0; i<B3_SOLVER_N_CELLS; i++)
|
||||
{
|
||||
int n = (nNativeHost)[i];
|
||||
int offset = (offsetsNativeHost)[i];
|
||||
@@ -648,12 +882,12 @@ void b3GpuBatchingPgsSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem
|
||||
if( n )
|
||||
{
|
||||
numNonzeroGrid++;
|
||||
//printf("cpu batch\n");
|
||||
//printf("cpu batch cell %d\n",i);
|
||||
|
||||
|
||||
int simdWidth =64;//-1;//32;
|
||||
//int numBatches = sortConstraintByBatch( &cpuContacts[0]+offset, n, simdWidth,csCfg.m_staticIdx ,numBodies); // on GPU
|
||||
int numBatches = sortConstraintByBatch2( &cpuContacts[0]+offset, n, simdWidth,csCfg.m_staticIdx ,numBodies); // on GPU
|
||||
int simdWidth =numBodies+1;//-1;//64;//-1;//32;
|
||||
int numBatches = sortConstraintByBatch( &cpuContacts[0]+offset, n, simdWidth,csCfg.m_staticIdx ,numBodies); // on GPU
|
||||
//int numBatches = sortConstraintByBatch2( &cpuContacts[0]+offset, n, simdWidth,csCfg.m_staticIdx ,numBodies); // on GPU
|
||||
//int numBatches = sortConstraintByBatch3( &cpuContacts[0]+offset, n, simdWidth,csCfg.m_staticIdx ,numBodies); // on GPU
|
||||
|
||||
|
||||
@@ -704,17 +938,28 @@ void b3GpuBatchingPgsSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem
|
||||
|
||||
if (1)
|
||||
{
|
||||
m_data->m_solverGPU->m_nIterations = 4;//10
|
||||
int numIter = 4;
|
||||
|
||||
m_data->m_solverGPU->m_nIterations = numIter;//10
|
||||
if (b3GpuSolveConstraint)
|
||||
{
|
||||
B3_PROFILE("GPU solveContactConstraint");
|
||||
|
||||
m_data->m_solverGPU->solveContactConstraint(
|
||||
/*m_data->m_solverGPU->solveContactConstraint(
|
||||
m_data->m_bodyBufferGPU,
|
||||
m_data->m_inertiaBufferGPU,
|
||||
m_data->m_contactCGPU,0,
|
||||
nContactOut ,
|
||||
maxNumBatches);
|
||||
*/
|
||||
|
||||
solveContactConstraint(
|
||||
m_data->m_bodyBufferGPU,
|
||||
m_data->m_inertiaBufferGPU,
|
||||
m_data->m_contactCGPU,0,
|
||||
nContactOut ,
|
||||
maxNumBatches,numIter);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -803,6 +1048,8 @@ inline int b3GpuBatchingPgsSolver::sortConstraintByBatch( b3Contact4* cs, int n,
|
||||
for(int i=0; i<nIdxSrc; i++)
|
||||
{
|
||||
int idx = idxSrc[i];
|
||||
|
||||
|
||||
b3Assert( idx < n );
|
||||
// check if it can go
|
||||
int bodyAS = cs[idx].m_bodyAPtrAndSignBit;
|
||||
|
||||
Reference in New Issue
Block a user