batching reports the size of the batches, and solver uses this as termination condition, giving a good speedup

This commit is contained in:
erwincoumans
2014-02-07 06:28:15 -08:00
parent 3768a30bb2
commit bd5c2ff5ec
12 changed files with 74 additions and 14 deletions

View File

@@ -97,6 +97,7 @@ int GpuConvexScene::createDynamicsObjects2(const ConstructionInfo& ci, const flo
{
int strideInBytes = 9*sizeof(float);
int textureIndex = -1;
if (0)
{
int width,height,n;

View File

@@ -411,6 +411,7 @@ void b3GpuPgsContactSolver::solveContactConstraint( const b3OpenCLArray<b3Rigid
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setBuffer( m_data->m_solverGPU->m_batchSizes.getBufferCL());
//launcher.setConst( cdata.x );
launcher.setConst( cdata.y );
launcher.setConst( cdata.z );
@@ -500,6 +501,7 @@ void b3GpuPgsContactSolver::solveContactConstraint( const b3OpenCLArray<b3Rigid
};
b3LauncherCL launcher( m_data->m_queue, m_data->m_solveFrictionKernel,"m_solveFrictionKernel" );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setBuffer( m_data->m_solverGPU->m_batchSizes.getBufferCL());
//launcher.setConst( cdata.x );
launcher.setConst( cdata.y );
launcher.setConst( cdata.z );
@@ -1037,7 +1039,7 @@ void b3GpuPgsContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem
if (!gCpuBatchContacts)
{
B3_PROFILE("gpu batchContacts");
maxNumBatches = 150;//250;
maxNumBatches = 250;//250;
m_data->m_solverGPU->batchContacts( m_data->m_pBufContactOutGPU, nContacts, m_data->m_solverGPU->m_numConstraints, m_data->m_solverGPU->m_offsets, csCfg.m_staticIdx );
clFinish(m_data->m_queue);
} else

View File

@@ -90,7 +90,8 @@ b3Solver::b3Solver(cl_context ctx, cl_device_id device, cl_command_queue queue,
:m_nIterations(4),
m_context(ctx),
m_device(device),
m_queue(queue)
m_queue(queue),
m_batchSizes(ctx,queue)
{
m_sort32 = new b3RadixSort32CL(ctx,device,queue);
m_scan = new b3PrefixScanCL(ctx,device,queue,B3_SOLVER_N_CELLS);
@@ -1136,6 +1137,7 @@ void b3Solver::batchContacts( b3OpenCLArray<b3Contact4>* contacts, int nContact
{
m_batchSizes.resize(nNative->size());
B3_PROFILE("batchingKernel");
//b3LauncherCL launcher( m_queue, m_batchingKernel);
cl_kernel k = useNewBatchingKernel ? m_batchingKernelNew : m_batchingKernel;
@@ -1148,12 +1150,18 @@ void b3Solver::batchContacts( b3OpenCLArray<b3Contact4>* contacts, int nContact
launcher.setBuffer( m_contactBuffer2->getBufferCL() );
launcher.setBuffer( nNative->getBufferCL());
launcher.setBuffer( offsetsNative->getBufferCL());
launcher.setBuffer(m_batchSizes.getBufferCL());
//launcher.setConst( cdata );
launcher.setConst(staticIdx);
launcher.launch1D( numWorkItems, 64 );
//clFinish(m_queue);
//b3AlignedObjectArray<int> batchSizesCPU;
//m_batchSizes.copyToHost(batchSizesCPU);
//printf(".\n");
}
#ifdef BATCH_DEBUG

View File

@@ -72,6 +72,7 @@ class b3Solver : public b3SolverBase
b3OpenCLArray<unsigned int>* m_numConstraints;
b3OpenCLArray<unsigned int>* m_offsets;
b3OpenCLArray<int> m_batchSizes;
int m_nIterations;

View File

@@ -120,7 +120,7 @@ u32 tryWrite(__local u32* buff, int idx)
// batching on the GPU
__kernel void CreateBatches( __global const struct b3Contact4Data* gConstraints, __global struct b3Contact4Data* gConstraintsOut,
__global const u32* gN, __global const u32* gStart,
__global const u32* gN, __global const u32* gStart, __global int* batchSizes,
int m_staticIdx )
{
__local u32 ldsStackIdx[STACK_SIZE];
@@ -147,9 +147,13 @@ __kernel void CreateBatches( __global const struct b3Contact4Data* gConstraints,
ldsDstEnd = m_start;
}
// while(1)
//was 250
for(int ie=0; ie<50; ie++)
int ie=0;
int maxBatch = 0;
for(ie=0; ie<50; ie++)
{
ldsFixedBuffer[lIdx] = 0;
@@ -297,7 +301,12 @@ __kernel void CreateBatches( __global const struct b3Contact4Data* gConstraints,
int idx = m_start + ldsRingElem[i].m_idx;
int dstIdx; AtomInc1( ldsDstEnd, dstIdx );
gConstraintsOut[ dstIdx ] = gConstraints[ idx ];
gConstraintsOut[ dstIdx ].m_batchIdx = 100+i;
int curBatch = 100+i;
if (maxBatch < curBatch)
maxBatch = curBatch;
gConstraintsOut[ dstIdx ].m_batchIdx = curBatch;
}
GROUP_LDS_BARRIER;
if( lIdx == 0 ) ldsRingEnd = 0;
@@ -312,6 +321,12 @@ __kernel void CreateBatches( __global const struct b3Contact4Data* gConstraints,
break;
}
if( lIdx == 0 )
{
if (maxBatch < ie)
maxBatch=ie;
batchSizes[wgIdx]=maxBatch;
}
}

View File

@@ -196,7 +196,7 @@ static const char* batchingKernelsCL= \
"}\n"
"// batching on the GPU\n"
"__kernel void CreateBatches( __global const struct b3Contact4Data* gConstraints, __global struct b3Contact4Data* gConstraintsOut,\n"
" __global const u32* gN, __global const u32* gStart, \n"
" __global const u32* gN, __global const u32* gStart, __global int* batchSizes, \n"
" int m_staticIdx )\n"
"{\n"
" __local u32 ldsStackIdx[STACK_SIZE];\n"
@@ -222,9 +222,13 @@ static const char* batchingKernelsCL= \
" ldsDstEnd = m_start;\n"
" }\n"
" \n"
" \n"
" \n"
"// while(1)\n"
"//was 250\n"
" for(int ie=0; ie<50; ie++)\n"
" int ie=0;\n"
" int maxBatch = 0;\n"
" for(ie=0; ie<50; ie++)\n"
" {\n"
" ldsFixedBuffer[lIdx] = 0;\n"
" for(int giter=0; giter<4; giter++)\n"
@@ -357,7 +361,12 @@ static const char* batchingKernelsCL= \
" int idx = m_start + ldsRingElem[i].m_idx;\n"
" int dstIdx; AtomInc1( ldsDstEnd, dstIdx );\n"
" gConstraintsOut[ dstIdx ] = gConstraints[ idx ];\n"
" gConstraintsOut[ dstIdx ].m_batchIdx = 100+i;\n"
" int curBatch = 100+i;\n"
" if (maxBatch < curBatch)\n"
" maxBatch = curBatch;\n"
" \n"
" gConstraintsOut[ dstIdx ].m_batchIdx = curBatch;\n"
" \n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" if( lIdx == 0 ) ldsRingEnd = 0;\n"
@@ -368,5 +377,11 @@ static const char* batchingKernelsCL= \
" if( ldsGEnd == m_n && ldsRingEnd == 0 )\n"
" break;\n"
" }\n"
" if( lIdx == 0 )\n"
" {\n"
" if (maxBatch < ie)\n"
" maxBatch=ie;\n"
" batchSizes[wgIdx]=maxBatch;\n"
" }\n"
"}\n"
;

View File

@@ -141,7 +141,7 @@ u32 tryWrite(__local u32* buff, int idx)
// batching on the GPU
__kernel void CreateBatchesNew( __global struct b3Contact4Data* gConstraints, __global const u32* gN, __global const u32* gStart, int staticIdx )
__kernel void CreateBatchesNew( __global struct b3Contact4Data* gConstraints, __global const u32* gN, __global const u32* gStart, __global int* batchSizes, int staticIdx )
{
int wgIdx = GET_GROUP_IDX;
int lIdx = GET_LOCAL_IDX;
@@ -222,6 +222,9 @@ __kernel void CreateBatchesNew( __global struct b3Contact4Data* gConstraints, __
}//for
batchIdx ++;
}//while
batchSizes[wgIdx] = batchIdx;
}//if( lIdx == 0 )
//return batchIdx;

View File

@@ -209,7 +209,7 @@ static const char* batchingKernelsNewCL= \
" return ((ans >> bitIdx)&1) == 0;\n"
"}\n"
"// batching on the GPU\n"
"__kernel void CreateBatchesNew( __global struct b3Contact4Data* gConstraints, __global const u32* gN, __global const u32* gStart, int staticIdx )\n"
"__kernel void CreateBatchesNew( __global struct b3Contact4Data* gConstraints, __global const u32* gN, __global const u32* gStart, __global int* batchSizes, int staticIdx )\n"
"{\n"
" int wgIdx = GET_GROUP_IDX;\n"
" int lIdx = GET_LOCAL_IDX;\n"
@@ -281,6 +281,8 @@ static const char* batchingKernelsNewCL= \
" }//for\n"
" batchIdx ++;\n"
" }//while\n"
" \n"
" batchSizes[wgIdx] = batchIdx;\n"
" }//if( lIdx == 0 )\n"
" \n"
" //return batchIdx;\n"

View File

@@ -401,7 +401,8 @@ void BatchSolveKernelContact(__global Body* gBodies,
__global Constraint4* gConstraints,
__global int* gN,
__global int* gOffsets,
int maxBatch,
__global int* batchSizes,
int maxBatch1,
int cellBatch,
int4 nSplit
)
@@ -418,6 +419,8 @@ void BatchSolveKernelContact(__global Body* gBodies,
// debugInfo[gIdx].m_valInt0 = gIdx;
//debugInfo[gIdx].m_valInt1 = GET_GROUP_SIZE;
int zIdx = (wgIdx/((nSplit.x*nSplit.y)/4))*2+((cellBatch&4)>>2);
int remain= (wgIdx%((nSplit.x*nSplit.y)/4));
@@ -432,6 +435,7 @@ void BatchSolveKernelContact(__global Body* gBodies,
if( gN[cellIdx] == 0 )
return;
int maxBatch = batchSizes[cellIdx];
const int start = gOffsets[cellIdx];

View File

@@ -307,7 +307,8 @@ static const char* solveContactCL= \
" __global Constraint4* gConstraints,\n"
" __global int* gN,\n"
" __global int* gOffsets,\n"
" int maxBatch,\n"
" __global int* batchSizes,\n"
" int maxBatch1,\n"
" int cellBatch,\n"
" int4 nSplit\n"
" )\n"
@@ -321,6 +322,8 @@ static const char* solveContactCL= \
"// int gIdx = GET_GLOBAL_IDX;\n"
"// debugInfo[gIdx].m_valInt0 = gIdx;\n"
" //debugInfo[gIdx].m_valInt1 = GET_GROUP_SIZE;\n"
" \n"
" \n"
" int zIdx = (wgIdx/((nSplit.x*nSplit.y)/4))*2+((cellBatch&4)>>2);\n"
" int remain= (wgIdx%((nSplit.x*nSplit.y)/4));\n"
" int yIdx = (remain/(nSplit.x/2))*2 + ((cellBatch&2)>>1);\n"
@@ -332,6 +335,7 @@ static const char* solveContactCL= \
" \n"
" if( gN[cellIdx] == 0 ) \n"
" return;\n"
" int maxBatch = batchSizes[cellIdx];\n"
" \n"
" \n"
" const int start = gOffsets[cellIdx];\n"

View File

@@ -430,7 +430,8 @@ void BatchSolveKernelFriction(__global Body* gBodies,
__global Constraint4* gConstraints,
__global int* gN,
__global int* gOffsets,
int maxBatch,
__global int* batchSizes,
int maxBatch1,
int cellBatch,
int4 nSplit
)
@@ -458,6 +459,8 @@ void BatchSolveKernelFriction(__global Body* gBodies,
if( gN[cellIdx] == 0 )
return;
int maxBatch = batchSizes[cellIdx];
const int start = gOffsets[cellIdx];
const int end = start + gN[cellIdx];

View File

@@ -343,7 +343,8 @@ static const char* solveFrictionCL= \
" __global Constraint4* gConstraints,\n"
" __global int* gN,\n"
" __global int* gOffsets,\n"
" int maxBatch,\n"
" __global int* batchSizes,\n"
" int maxBatch1,\n"
" int cellBatch,\n"
" int4 nSplit\n"
" )\n"
@@ -365,6 +366,7 @@ static const char* solveFrictionCL= \
" \n"
" if( gN[cellIdx] == 0 ) \n"
" return;\n"
" int maxBatch = batchSizes[cellIdx];\n"
" const int start = gOffsets[cellIdx];\n"
" const int end = start + gN[cellIdx];\n"
" \n"