Add clFinish calls for more accurate profile timings

Improved parallel batching: don't call tryWrite for static objects.
This fixes a bug where the hash of a static object was identical to the hash of a dynamic object, causing it to be assigned a bogus 100+i batching number.
The parallel batching is still not enabled, because we need to measure the batching size (TODO).
This commit is contained in:
erwincoumans
2013-07-15 10:22:40 -07:00
parent 05ed1fdbcf
commit a5d00c8219
4 changed files with 28 additions and 20 deletions

View File

@@ -373,7 +373,7 @@ void b3GpuBatchingPgsSolver::solveContactConstraint( const b3OpenCLArray<b3Rigi
} }
} }
//clFinish(m_data->m_queue); clFinish(m_data->m_queue);
} }
@@ -416,7 +416,7 @@ void b3GpuBatchingPgsSolver::solveContactConstraint( const b3OpenCLArray<b3Rigi
launcher.launch1D( 64*nn/B3_SOLVER_N_BATCHES, 64 ); launcher.launch1D( 64*nn/B3_SOLVER_N_BATCHES, 64 );
} }
} }
//clFinish(m_data->m_queue); clFinish(m_data->m_queue);
} }
#ifdef DEBUG_ME #ifdef DEBUG_ME
@@ -838,7 +838,8 @@ void b3GpuBatchingPgsSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst( cdata ); launcher.setConst( cdata );
launcher.launch1D( nContacts, 64 ); launcher.launch1D( nContacts, 64 );
//clFinish(m_data->m_queue); //we use the clFinish for proper benchmark/profile
clFinish(m_data->m_queue);
} }
@@ -900,7 +901,9 @@ void b3GpuBatchingPgsSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem
b3Printf("maxNumBatches = %d\n",maxNumBatches); b3Printf("maxNumBatches = %d\n",maxNumBatches);
} }
//clFinish(m_data->m_queue); //we use the clFinish for proper benchmark/profile
clFinish(m_data->m_queue);
} }
} }
@@ -925,7 +928,7 @@ void b3GpuBatchingPgsSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem
contactConstraintOut, contactConstraintOut,
additionalData, nContacts, additionalData, nContacts,
(b3SolverBase::ConstraintCfg&) csCfg ); (b3SolverBase::ConstraintCfg&) csCfg );
//clFinish(m_data->m_queue); clFinish(m_data->m_queue);
} }
@@ -968,7 +971,7 @@ void b3GpuBatchingPgsSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem
m_data->m_solverGPU->solveContactConstraintHost(m_data->m_bodyBufferGPU, m_data->m_inertiaBufferGPU, m_data->m_contactCGPU,0, nContactOut ,maxNumBatches); m_data->m_solverGPU->solveContactConstraintHost(m_data->m_bodyBufferGPU, m_data->m_inertiaBufferGPU, m_data->m_contactCGPU,0, nContactOut ,maxNumBatches);
} }
//clFinish(m_data->m_queue);
} }

View File

@@ -159,6 +159,7 @@ b3Solver::b3Solver(cl_context ctx, cl_device_id device, cl_command_queue queue,
{ {
cl_program batchingProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, batchKernelSource, &pErrNum,additionalMacros, B3_BATCHING_PATH); cl_program batchingProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, batchKernelSource, &pErrNum,additionalMacros, B3_BATCHING_PATH);
//cl_program batchingProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, 0, &pErrNum,additionalMacros, B3_BATCHING_PATH,true);
b3Assert(batchingProg); b3Assert(batchingProg);
m_batchingKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, batchKernelSource, "CreateBatches", &pErrNum, batchingProg,additionalMacros ); m_batchingKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, batchKernelSource, "CreateBatches", &pErrNum, batchingProg,additionalMacros );

View File

@@ -216,19 +216,21 @@ __kernel void CreateBatches( __global const Contact4* gConstraints, __global Con
if( aUsed==0 && bUsed==0 ) if( aUsed==0 && bUsed==0 )
{ {
int aAvailable; int aAvailable=1;
int bAvailable; int bAvailable=1;
int ea = abs(e.m_a); int ea = abs(e.m_a);
int eb = abs(e.m_b); int eb = abs(e.m_b);
aAvailable = tryWrite( ldsCheckBuffer, ea );
bAvailable = tryWrite( ldsCheckBuffer, eb );
bool aStatic = (e.m_a<0) ||(ea==m_staticIdx); bool aStatic = (e.m_a<0) ||(ea==m_staticIdx);
bool bStatic = (e.m_b<0) ||(eb==m_staticIdx); bool bStatic = (e.m_b<0) ||(eb==m_staticIdx);
aAvailable = aStatic? 1: aAvailable; if (!aStatic)
bAvailable = bStatic? 1: bAvailable; aAvailable = tryWrite( ldsCheckBuffer, ea );
if (!bStatic)
bAvailable = tryWrite( ldsCheckBuffer, eb );
//aAvailable = aStatic? 1: aAvailable;
//bAvailable = bStatic? 1: bAvailable;
bool success = (aAvailable && bAvailable); bool success = (aAvailable && bAvailable);
if(success) if(success)

View File

@@ -218,19 +218,21 @@ static const char* batchingKernelsCL= \
"\n" "\n"
" if( aUsed==0 && bUsed==0 )\n" " if( aUsed==0 && bUsed==0 )\n"
" {\n" " {\n"
" int aAvailable;\n" " int aAvailable=1;\n"
" int bAvailable;\n" " int bAvailable=1;\n"
" int ea = abs(e.m_a);\n" " int ea = abs(e.m_a);\n"
" int eb = abs(e.m_b);\n" " int eb = abs(e.m_b);\n"
"\n" "\n"
" aAvailable = tryWrite( ldsCheckBuffer, ea );\n"
" bAvailable = tryWrite( ldsCheckBuffer, eb );\n"
"\n"
" bool aStatic = (e.m_a<0) ||(ea==m_staticIdx);\n" " bool aStatic = (e.m_a<0) ||(ea==m_staticIdx);\n"
" bool bStatic = (e.m_b<0) ||(eb==m_staticIdx);\n" " bool bStatic = (e.m_b<0) ||(eb==m_staticIdx);\n"
" \n" " \n"
" aAvailable = aStatic? 1: aAvailable;\n" " if (!aStatic)\n"
" bAvailable = bStatic? 1: bAvailable;\n" " aAvailable = tryWrite( ldsCheckBuffer, ea );\n"
" if (!bStatic)\n"
" bAvailable = tryWrite( ldsCheckBuffer, eb );\n"
" \n"
" //aAvailable = aStatic? 1: aAvailable;\n"
" //bAvailable = bStatic? 1: bAvailable;\n"
"\n" "\n"
" bool success = (aAvailable && bAvailable);\n" " bool success = (aAvailable && bAvailable);\n"
" if(success)\n" " if(success)\n"