batching reports the size of the batches, and solver uses this as termination condition, giving a good speedup
This commit is contained in:
@@ -97,6 +97,7 @@ int GpuConvexScene::createDynamicsObjects2(const ConstructionInfo& ci, const flo
|
||||
{
|
||||
int strideInBytes = 9*sizeof(float);
|
||||
int textureIndex = -1;
|
||||
if (0)
|
||||
{
|
||||
int width,height,n;
|
||||
|
||||
|
||||
@@ -411,6 +411,7 @@ void b3GpuPgsContactSolver::solveContactConstraint( const b3OpenCLArray<b3Rigid
|
||||
|
||||
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
|
||||
launcher.setBuffer( m_data->m_solverGPU->m_batchSizes.getBufferCL());
|
||||
//launcher.setConst( cdata.x );
|
||||
launcher.setConst( cdata.y );
|
||||
launcher.setConst( cdata.z );
|
||||
@@ -500,6 +501,7 @@ void b3GpuPgsContactSolver::solveContactConstraint( const b3OpenCLArray<b3Rigid
|
||||
};
|
||||
b3LauncherCL launcher( m_data->m_queue, m_data->m_solveFrictionKernel,"m_solveFrictionKernel" );
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
|
||||
launcher.setBuffer( m_data->m_solverGPU->m_batchSizes.getBufferCL());
|
||||
//launcher.setConst( cdata.x );
|
||||
launcher.setConst( cdata.y );
|
||||
launcher.setConst( cdata.z );
|
||||
@@ -1037,7 +1039,7 @@ void b3GpuPgsContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem
|
||||
if (!gCpuBatchContacts)
|
||||
{
|
||||
B3_PROFILE("gpu batchContacts");
|
||||
maxNumBatches = 150;//250;
|
||||
maxNumBatches = 250;//250;
|
||||
m_data->m_solverGPU->batchContacts( m_data->m_pBufContactOutGPU, nContacts, m_data->m_solverGPU->m_numConstraints, m_data->m_solverGPU->m_offsets, csCfg.m_staticIdx );
|
||||
clFinish(m_data->m_queue);
|
||||
} else
|
||||
|
||||
@@ -90,7 +90,8 @@ b3Solver::b3Solver(cl_context ctx, cl_device_id device, cl_command_queue queue,
|
||||
:m_nIterations(4),
|
||||
m_context(ctx),
|
||||
m_device(device),
|
||||
m_queue(queue)
|
||||
m_queue(queue),
|
||||
m_batchSizes(ctx,queue)
|
||||
{
|
||||
m_sort32 = new b3RadixSort32CL(ctx,device,queue);
|
||||
m_scan = new b3PrefixScanCL(ctx,device,queue,B3_SOLVER_N_CELLS);
|
||||
@@ -1136,6 +1137,7 @@ void b3Solver::batchContacts( b3OpenCLArray<b3Contact4>* contacts, int nContact
|
||||
|
||||
|
||||
{
|
||||
m_batchSizes.resize(nNative->size());
|
||||
B3_PROFILE("batchingKernel");
|
||||
//b3LauncherCL launcher( m_queue, m_batchingKernel);
|
||||
cl_kernel k = useNewBatchingKernel ? m_batchingKernelNew : m_batchingKernel;
|
||||
@@ -1148,12 +1150,18 @@ void b3Solver::batchContacts( b3OpenCLArray<b3Contact4>* contacts, int nContact
|
||||
launcher.setBuffer( m_contactBuffer2->getBufferCL() );
|
||||
launcher.setBuffer( nNative->getBufferCL());
|
||||
launcher.setBuffer( offsetsNative->getBufferCL());
|
||||
|
||||
launcher.setBuffer(m_batchSizes.getBufferCL());
|
||||
|
||||
|
||||
//launcher.setConst( cdata );
|
||||
launcher.setConst(staticIdx);
|
||||
|
||||
launcher.launch1D( numWorkItems, 64 );
|
||||
//clFinish(m_queue);
|
||||
//b3AlignedObjectArray<int> batchSizesCPU;
|
||||
//m_batchSizes.copyToHost(batchSizesCPU);
|
||||
//printf(".\n");
|
||||
}
|
||||
|
||||
#ifdef BATCH_DEBUG
|
||||
|
||||
@@ -72,6 +72,7 @@ class b3Solver : public b3SolverBase
|
||||
|
||||
b3OpenCLArray<unsigned int>* m_numConstraints;
|
||||
b3OpenCLArray<unsigned int>* m_offsets;
|
||||
b3OpenCLArray<int> m_batchSizes;
|
||||
|
||||
|
||||
int m_nIterations;
|
||||
|
||||
@@ -120,7 +120,7 @@ u32 tryWrite(__local u32* buff, int idx)
|
||||
|
||||
// batching on the GPU
|
||||
__kernel void CreateBatches( __global const struct b3Contact4Data* gConstraints, __global struct b3Contact4Data* gConstraintsOut,
|
||||
__global const u32* gN, __global const u32* gStart,
|
||||
__global const u32* gN, __global const u32* gStart, __global int* batchSizes,
|
||||
int m_staticIdx )
|
||||
{
|
||||
__local u32 ldsStackIdx[STACK_SIZE];
|
||||
@@ -147,9 +147,13 @@ __kernel void CreateBatches( __global const struct b3Contact4Data* gConstraints,
|
||||
ldsDstEnd = m_start;
|
||||
}
|
||||
|
||||
|
||||
|
||||
// while(1)
|
||||
//was 250
|
||||
for(int ie=0; ie<50; ie++)
|
||||
int ie=0;
|
||||
int maxBatch = 0;
|
||||
for(ie=0; ie<50; ie++)
|
||||
{
|
||||
ldsFixedBuffer[lIdx] = 0;
|
||||
|
||||
@@ -297,7 +301,12 @@ __kernel void CreateBatches( __global const struct b3Contact4Data* gConstraints,
|
||||
int idx = m_start + ldsRingElem[i].m_idx;
|
||||
int dstIdx; AtomInc1( ldsDstEnd, dstIdx );
|
||||
gConstraintsOut[ dstIdx ] = gConstraints[ idx ];
|
||||
gConstraintsOut[ dstIdx ].m_batchIdx = 100+i;
|
||||
int curBatch = 100+i;
|
||||
if (maxBatch < curBatch)
|
||||
maxBatch = curBatch;
|
||||
|
||||
gConstraintsOut[ dstIdx ].m_batchIdx = curBatch;
|
||||
|
||||
}
|
||||
GROUP_LDS_BARRIER;
|
||||
if( lIdx == 0 ) ldsRingEnd = 0;
|
||||
@@ -312,6 +321,12 @@ __kernel void CreateBatches( __global const struct b3Contact4Data* gConstraints,
|
||||
break;
|
||||
}
|
||||
|
||||
if( lIdx == 0 )
|
||||
{
|
||||
if (maxBatch < ie)
|
||||
maxBatch=ie;
|
||||
batchSizes[wgIdx]=maxBatch;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -196,7 +196,7 @@ static const char* batchingKernelsCL= \
|
||||
"}\n"
|
||||
"// batching on the GPU\n"
|
||||
"__kernel void CreateBatches( __global const struct b3Contact4Data* gConstraints, __global struct b3Contact4Data* gConstraintsOut,\n"
|
||||
" __global const u32* gN, __global const u32* gStart, \n"
|
||||
" __global const u32* gN, __global const u32* gStart, __global int* batchSizes, \n"
|
||||
" int m_staticIdx )\n"
|
||||
"{\n"
|
||||
" __local u32 ldsStackIdx[STACK_SIZE];\n"
|
||||
@@ -222,9 +222,13 @@ static const char* batchingKernelsCL= \
|
||||
" ldsDstEnd = m_start;\n"
|
||||
" }\n"
|
||||
" \n"
|
||||
" \n"
|
||||
" \n"
|
||||
"// while(1)\n"
|
||||
"//was 250\n"
|
||||
" for(int ie=0; ie<50; ie++)\n"
|
||||
" int ie=0;\n"
|
||||
" int maxBatch = 0;\n"
|
||||
" for(ie=0; ie<50; ie++)\n"
|
||||
" {\n"
|
||||
" ldsFixedBuffer[lIdx] = 0;\n"
|
||||
" for(int giter=0; giter<4; giter++)\n"
|
||||
@@ -357,7 +361,12 @@ static const char* batchingKernelsCL= \
|
||||
" int idx = m_start + ldsRingElem[i].m_idx;\n"
|
||||
" int dstIdx; AtomInc1( ldsDstEnd, dstIdx );\n"
|
||||
" gConstraintsOut[ dstIdx ] = gConstraints[ idx ];\n"
|
||||
" gConstraintsOut[ dstIdx ].m_batchIdx = 100+i;\n"
|
||||
" int curBatch = 100+i;\n"
|
||||
" if (maxBatch < curBatch)\n"
|
||||
" maxBatch = curBatch;\n"
|
||||
" \n"
|
||||
" gConstraintsOut[ dstIdx ].m_batchIdx = curBatch;\n"
|
||||
" \n"
|
||||
" }\n"
|
||||
" GROUP_LDS_BARRIER;\n"
|
||||
" if( lIdx == 0 ) ldsRingEnd = 0;\n"
|
||||
@@ -368,5 +377,11 @@ static const char* batchingKernelsCL= \
|
||||
" if( ldsGEnd == m_n && ldsRingEnd == 0 )\n"
|
||||
" break;\n"
|
||||
" }\n"
|
||||
" if( lIdx == 0 )\n"
|
||||
" {\n"
|
||||
" if (maxBatch < ie)\n"
|
||||
" maxBatch=ie;\n"
|
||||
" batchSizes[wgIdx]=maxBatch;\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
;
|
||||
|
||||
@@ -141,7 +141,7 @@ u32 tryWrite(__local u32* buff, int idx)
|
||||
|
||||
|
||||
// batching on the GPU
|
||||
__kernel void CreateBatchesNew( __global struct b3Contact4Data* gConstraints, __global const u32* gN, __global const u32* gStart, int staticIdx )
|
||||
__kernel void CreateBatchesNew( __global struct b3Contact4Data* gConstraints, __global const u32* gN, __global const u32* gStart, __global int* batchSizes, int staticIdx )
|
||||
{
|
||||
int wgIdx = GET_GROUP_IDX;
|
||||
int lIdx = GET_LOCAL_IDX;
|
||||
@@ -222,6 +222,9 @@ __kernel void CreateBatchesNew( __global struct b3Contact4Data* gConstraints, __
|
||||
}//for
|
||||
batchIdx ++;
|
||||
}//while
|
||||
|
||||
batchSizes[wgIdx] = batchIdx;
|
||||
|
||||
}//if( lIdx == 0 )
|
||||
|
||||
//return batchIdx;
|
||||
|
||||
@@ -209,7 +209,7 @@ static const char* batchingKernelsNewCL= \
|
||||
" return ((ans >> bitIdx)&1) == 0;\n"
|
||||
"}\n"
|
||||
"// batching on the GPU\n"
|
||||
"__kernel void CreateBatchesNew( __global struct b3Contact4Data* gConstraints, __global const u32* gN, __global const u32* gStart, int staticIdx )\n"
|
||||
"__kernel void CreateBatchesNew( __global struct b3Contact4Data* gConstraints, __global const u32* gN, __global const u32* gStart, __global int* batchSizes, int staticIdx )\n"
|
||||
"{\n"
|
||||
" int wgIdx = GET_GROUP_IDX;\n"
|
||||
" int lIdx = GET_LOCAL_IDX;\n"
|
||||
@@ -281,6 +281,8 @@ static const char* batchingKernelsNewCL= \
|
||||
" }//for\n"
|
||||
" batchIdx ++;\n"
|
||||
" }//while\n"
|
||||
" \n"
|
||||
" batchSizes[wgIdx] = batchIdx;\n"
|
||||
" }//if( lIdx == 0 )\n"
|
||||
" \n"
|
||||
" //return batchIdx;\n"
|
||||
|
||||
@@ -401,7 +401,8 @@ void BatchSolveKernelContact(__global Body* gBodies,
|
||||
__global Constraint4* gConstraints,
|
||||
__global int* gN,
|
||||
__global int* gOffsets,
|
||||
int maxBatch,
|
||||
__global int* batchSizes,
|
||||
int maxBatch1,
|
||||
int cellBatch,
|
||||
int4 nSplit
|
||||
)
|
||||
@@ -418,6 +419,8 @@ void BatchSolveKernelContact(__global Body* gBodies,
|
||||
// debugInfo[gIdx].m_valInt0 = gIdx;
|
||||
//debugInfo[gIdx].m_valInt1 = GET_GROUP_SIZE;
|
||||
|
||||
|
||||
|
||||
|
||||
int zIdx = (wgIdx/((nSplit.x*nSplit.y)/4))*2+((cellBatch&4)>>2);
|
||||
int remain= (wgIdx%((nSplit.x*nSplit.y)/4));
|
||||
@@ -432,6 +435,7 @@ void BatchSolveKernelContact(__global Body* gBodies,
|
||||
if( gN[cellIdx] == 0 )
|
||||
return;
|
||||
|
||||
int maxBatch = batchSizes[cellIdx];
|
||||
|
||||
|
||||
const int start = gOffsets[cellIdx];
|
||||
|
||||
@@ -307,7 +307,8 @@ static const char* solveContactCL= \
|
||||
" __global Constraint4* gConstraints,\n"
|
||||
" __global int* gN,\n"
|
||||
" __global int* gOffsets,\n"
|
||||
" int maxBatch,\n"
|
||||
" __global int* batchSizes,\n"
|
||||
" int maxBatch1,\n"
|
||||
" int cellBatch,\n"
|
||||
" int4 nSplit\n"
|
||||
" )\n"
|
||||
@@ -321,6 +322,8 @@ static const char* solveContactCL= \
|
||||
"// int gIdx = GET_GLOBAL_IDX;\n"
|
||||
"// debugInfo[gIdx].m_valInt0 = gIdx;\n"
|
||||
" //debugInfo[gIdx].m_valInt1 = GET_GROUP_SIZE;\n"
|
||||
" \n"
|
||||
" \n"
|
||||
" int zIdx = (wgIdx/((nSplit.x*nSplit.y)/4))*2+((cellBatch&4)>>2);\n"
|
||||
" int remain= (wgIdx%((nSplit.x*nSplit.y)/4));\n"
|
||||
" int yIdx = (remain/(nSplit.x/2))*2 + ((cellBatch&2)>>1);\n"
|
||||
@@ -332,6 +335,7 @@ static const char* solveContactCL= \
|
||||
" \n"
|
||||
" if( gN[cellIdx] == 0 ) \n"
|
||||
" return;\n"
|
||||
" int maxBatch = batchSizes[cellIdx];\n"
|
||||
" \n"
|
||||
" \n"
|
||||
" const int start = gOffsets[cellIdx];\n"
|
||||
|
||||
@@ -430,7 +430,8 @@ void BatchSolveKernelFriction(__global Body* gBodies,
|
||||
__global Constraint4* gConstraints,
|
||||
__global int* gN,
|
||||
__global int* gOffsets,
|
||||
int maxBatch,
|
||||
__global int* batchSizes,
|
||||
int maxBatch1,
|
||||
int cellBatch,
|
||||
int4 nSplit
|
||||
)
|
||||
@@ -458,6 +459,8 @@ void BatchSolveKernelFriction(__global Body* gBodies,
|
||||
if( gN[cellIdx] == 0 )
|
||||
return;
|
||||
|
||||
int maxBatch = batchSizes[cellIdx];
|
||||
|
||||
const int start = gOffsets[cellIdx];
|
||||
const int end = start + gN[cellIdx];
|
||||
|
||||
|
||||
@@ -343,7 +343,8 @@ static const char* solveFrictionCL= \
|
||||
" __global Constraint4* gConstraints,\n"
|
||||
" __global int* gN,\n"
|
||||
" __global int* gOffsets,\n"
|
||||
" int maxBatch,\n"
|
||||
" __global int* batchSizes,\n"
|
||||
" int maxBatch1,\n"
|
||||
" int cellBatch,\n"
|
||||
" int4 nSplit\n"
|
||||
" )\n"
|
||||
@@ -365,6 +366,7 @@ static const char* solveFrictionCL= \
|
||||
" \n"
|
||||
" if( gN[cellIdx] == 0 ) \n"
|
||||
" return;\n"
|
||||
" int maxBatch = batchSizes[cellIdx];\n"
|
||||
" const int start = gOffsets[cellIdx];\n"
|
||||
" const int end = start + gN[cellIdx];\n"
|
||||
" \n"
|
||||
|
||||
Reference in New Issue
Block a user