Accidentally left some very slow copyToHost calls in the batching code; removing them makes it faster :-)

This commit is contained in:
erwincoumans
2013-11-12 09:03:30 -08:00
parent f7abea1b2c
commit a78cbcf354
10 changed files with 187 additions and 196 deletions

View File

@@ -895,7 +895,7 @@ void b3GpuPgsContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem
} }
m_data->m_solverGPU->m_contactBuffer2->copyFromHost(outContacts); m_data->m_solverGPU->m_contactBuffer2->copyFromHost(outContacts);
/* "void ReorderContactKernel(__global struct b3Contact4Data* in, __global struct b3Contact4Data* out, __global int2* sortData, int4 cb )\n" /* "void ReorderContactKernel(__global struct b3Contact4Data* in, __global struct b3Contact4Data* out, __global int2* sortData, int4 cb )\n"
"{\n" "{\n"
" int nContacts = cb.x;\n" " int nContacts = cb.x;\n"
" int gIdx = GET_GLOBAL_IDX;\n" " int gIdx = GET_GLOBAL_IDX;\n"
@@ -934,11 +934,11 @@ void b3GpuPgsContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem
//clFinish(m_data->m_queue); //clFinish(m_data->m_queue);
// { // {
// b3AlignedObjectArray<unsigned int> histogram; // b3AlignedObjectArray<unsigned int> histogram;
// m_data->m_solverGPU->m_numConstraints->copyToHost(histogram); // m_data->m_solverGPU->m_numConstraints->copyToHost(histogram);
// printf(",,,\n"); // printf(",,,\n");
// } // }
if (nContacts) if (nContacts)
@@ -949,7 +949,7 @@ void b3GpuPgsContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem
for (int i=0;i<nContacts;i++) for (int i=0;i<nContacts;i++)
{ {
m_data->m_pBufContactOutGPU->copyFromOpenCLArray(*m_data->m_solverGPU->m_contactBuffer2); m_data->m_pBufContactOutGPU->copyFromOpenCLArray(*m_data->m_solverGPU->m_contactBuffer2);
// m_data->m_solverGPU->m_contactBuffer2->getBufferCL(); // m_data->m_solverGPU->m_contactBuffer2->getBufferCL();
// m_data->m_pBufContactOutGPU->getBufferCL() // m_data->m_pBufContactOutGPU->getBufferCL()
} }
@@ -980,6 +980,7 @@ void b3GpuPgsContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem
B3_PROFILE("gpu batchContacts"); B3_PROFILE("gpu batchContacts");
maxNumBatches = 150;//250; maxNumBatches = 150;//250;
m_data->m_solverGPU->batchContacts( m_data->m_pBufContactOutGPU, nContacts, m_data->m_solverGPU->m_numConstraints, m_data->m_solverGPU->m_offsets, csCfg.m_staticIdx ); m_data->m_solverGPU->batchContacts( m_data->m_pBufContactOutGPU, nContacts, m_data->m_solverGPU->m_numConstraints, m_data->m_solverGPU->m_offsets, csCfg.m_staticIdx );
clFinish(m_data->m_queue);
} else } else
{ {
B3_PROFILE("cpu batchContacts"); B3_PROFILE("cpu batchContacts");
@@ -1003,25 +1004,16 @@ void b3GpuPgsContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem
int numNonzeroGrid=0; int numNonzeroGrid=0;
{ {
B3_PROFILE("batch grid"); B3_PROFILE("cpu batch grid");
for(int i=0; i<B3_SOLVER_N_CELLS; i++) for(int i=0; i<B3_SOLVER_N_CELLS; i++)
{ {
int n = (nNativeHost)[i]; int n = (nNativeHost)[i];
int offset = (offsetsNativeHost)[i]; int offset = (offsetsNativeHost)[i];
if( n ) if( n )
{ {
numNonzeroGrid++; numNonzeroGrid++;
//printf("cpu batch cell %d\n",i);
int simdWidth =numBodies+1;//-1;//64;//-1;//32; int simdWidth =numBodies+1;//-1;//64;//-1;//32;
//int numBatches = sortConstraintByBatch( &cpuContacts[0]+offset, n, simdWidth,csCfg.m_staticIdx ,numBodies); // on GPU
//int numBatches = sortConstraintByBatch2( &cpuContacts[0]+offset, n, simdWidth,csCfg.m_staticIdx ,numBodies); // on GPU
int numBatches = sortConstraintByBatch3( &cpuContacts[0]+offset, n, simdWidth,csCfg.m_staticIdx ,numBodies); // on GPU int numBatches = sortConstraintByBatch3( &cpuContacts[0]+offset, n, simdWidth,csCfg.m_staticIdx ,numBodies); // on GPU
maxNumBatches = b3Max(numBatches,maxNumBatches); maxNumBatches = b3Max(numBatches,maxNumBatches);
static int globalMaxBatch = 0; static int globalMaxBatch = 0;
if (maxNumBatches>globalMaxBatch ) if (maxNumBatches>globalMaxBatch )
@@ -1029,13 +1021,11 @@ void b3GpuPgsContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem
globalMaxBatch = maxNumBatches; globalMaxBatch = maxNumBatches;
b3Printf("maxNumBatches = %d\n",maxNumBatches); b3Printf("maxNumBatches = %d\n",maxNumBatches);
} }
//we use the clFinish for proper benchmark/profile //we use the clFinish for proper benchmark/profile
}
}
clFinish(m_data->m_queue); clFinish(m_data->m_queue);
}
}
} }
{ {
B3_PROFILE("m_contactBuffer->copyFromHost"); B3_PROFILE("m_contactBuffer->copyFromHost");
@@ -1051,7 +1041,7 @@ void b3GpuPgsContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem
if (nContacts) if (nContacts)
{ {
//B3_PROFILE("gpu convertToConstraints"); B3_PROFILE("gpu convertToConstraints");
m_data->m_solverGPU->convertToConstraints( bodyBuf, m_data->m_solverGPU->convertToConstraints( bodyBuf,
shapeBuf, m_data->m_solverGPU->m_contactBuffer2, shapeBuf, m_data->m_solverGPU->m_contactBuffer2,
contactConstraintOut, contactConstraintOut,

View File

@@ -978,6 +978,9 @@ void b3Solver::convertToConstraints( const b3OpenCLArray<b3RigidBodyCL>* bodyBuf
cdata.m_positionDrift = cfg.m_positionDrift; cdata.m_positionDrift = cfg.m_positionDrift;
cdata.m_positionConstraintCoeff = cfg.m_positionConstraintCoeff; cdata.m_positionConstraintCoeff = cfg.m_positionConstraintCoeff;
if (convertConstraintOnCpu)
{
b3AlignedObjectArray<b3RigidBodyCL> gBodies; b3AlignedObjectArray<b3RigidBodyCL> gBodies;
bodyBuf->copyToHost(gBodies); bodyBuf->copyToHost(gBodies);
@@ -990,8 +993,6 @@ void b3Solver::convertToConstraints( const b3OpenCLArray<b3RigidBodyCL>* bodyBuf
b3AlignedObjectArray<b3GpuConstraint4> gConstraintOut; b3AlignedObjectArray<b3GpuConstraint4> gConstraintOut;
gConstraintOut.resize(nContacts); gConstraintOut.resize(nContacts);
if (convertConstraintOnCpu)
{
B3_PROFILE("cpu contactToConstraintKernel"); B3_PROFILE("cpu contactToConstraintKernel");
for (int gIdx=0;gIdx<nContacts;gIdx++) for (int gIdx=0;gIdx<nContacts;gIdx++)
{ {

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.