Accidentally left some very slow copyToHost calls in the batching code; removing them makes it faster :-)

This commit is contained in:
erwincoumans
2013-11-12 09:03:30 -08:00
parent f7abea1b2c
commit a78cbcf354
10 changed files with 187 additions and 196 deletions

View File

@@ -730,65 +730,65 @@ void b3GpuPgsContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem
{ {
B3_PROFILE("batching"); B3_PROFILE("batching");
//@todo: just reserve it, without copy of original contact (unless we use warmstarting) //@todo: just reserve it, without copy of original contact (unless we use warmstarting)
const b3OpenCLArray<b3RigidBodyCL>* bodyNative = bodyBuf; const b3OpenCLArray<b3RigidBodyCL>* bodyNative = bodyBuf;
{ {
//b3OpenCLArray<b3RigidBodyCL>* bodyNative = b3OpenCLArrayUtils::map<adl::TYPE_CL, true>( data->m_device, bodyBuf ); //b3OpenCLArray<b3RigidBodyCL>* bodyNative = b3OpenCLArrayUtils::map<adl::TYPE_CL, true>( data->m_device, bodyBuf );
//b3OpenCLArray<b3Contact4>* contactNative = b3OpenCLArrayUtils::map<adl::TYPE_CL, true>( data->m_device, contactsIn ); //b3OpenCLArray<b3Contact4>* contactNative = b3OpenCLArrayUtils::map<adl::TYPE_CL, true>( data->m_device, contactsIn );
const int sortAlignment = 512; // todo. get this out of sort const int sortAlignment = 512; // todo. get this out of sort
if( csCfg.m_enableParallelSolve ) if( csCfg.m_enableParallelSolve )
{ {
int sortSize = B3NEXTMULTIPLEOF( nContacts, sortAlignment ); int sortSize = B3NEXTMULTIPLEOF( nContacts, sortAlignment );
b3OpenCLArray<unsigned int>* countsNative = m_data->m_solverGPU->m_numConstraints; b3OpenCLArray<unsigned int>* countsNative = m_data->m_solverGPU->m_numConstraints;
b3OpenCLArray<unsigned int>* offsetsNative = m_data->m_solverGPU->m_offsets; b3OpenCLArray<unsigned int>* offsetsNative = m_data->m_solverGPU->m_offsets;
if (gpuSetSortData) if (gpuSetSortData)
{ // 2. set cell idx { // 2. set cell idx
B3_PROFILE("GPU set cell idx"); B3_PROFILE("GPU set cell idx");
struct CB struct CB
{ {
int m_nContacts; int m_nContacts;
int m_staticIdx; int m_staticIdx;
float m_scale; float m_scale;
b3Int4 m_nSplit; b3Int4 m_nSplit;
}; };
b3Assert( sortSize%64 == 0 ); b3Assert( sortSize%64 == 0 );
CB cdata; CB cdata;
cdata.m_nContacts = nContacts; cdata.m_nContacts = nContacts;
cdata.m_staticIdx = csCfg.m_staticIdx; cdata.m_staticIdx = csCfg.m_staticIdx;
cdata.m_scale = 1.f/csCfg.m_batchCellSize; cdata.m_scale = 1.f/csCfg.m_batchCellSize;
cdata.m_nSplit.x = B3_SOLVER_N_SPLIT_X; cdata.m_nSplit.x = B3_SOLVER_N_SPLIT_X;
cdata.m_nSplit.y = B3_SOLVER_N_SPLIT_Y; cdata.m_nSplit.y = B3_SOLVER_N_SPLIT_Y;
cdata.m_nSplit.z = B3_SOLVER_N_SPLIT_Z; cdata.m_nSplit.z = B3_SOLVER_N_SPLIT_Z;
m_data->m_solverGPU->m_sortDataBuffer->resize(nContacts); m_data->m_solverGPU->m_sortDataBuffer->resize(nContacts);
b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_data->m_pBufContactOutGPU->getBufferCL() ), b3BufferInfoCL( bodyBuf->getBufferCL()), b3BufferInfoCL( m_data->m_solverGPU->m_sortDataBuffer->getBufferCL()) }; b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_data->m_pBufContactOutGPU->getBufferCL() ), b3BufferInfoCL( bodyBuf->getBufferCL()), b3BufferInfoCL( m_data->m_solverGPU->m_sortDataBuffer->getBufferCL()) };
b3LauncherCL launcher(m_data->m_queue, m_data->m_solverGPU->m_setSortDataKernel ); b3LauncherCL launcher(m_data->m_queue, m_data->m_solverGPU->m_setSortDataKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst( cdata.m_nContacts ); launcher.setConst( cdata.m_nContacts );
launcher.setConst( cdata.m_scale ); launcher.setConst( cdata.m_scale );
launcher.setConst(cdata.m_nSplit); launcher.setConst(cdata.m_nSplit);
launcher.setConst(cdata.m_staticIdx); launcher.setConst(cdata.m_staticIdx);
launcher.launch1D( sortSize, 64 ); launcher.launch1D( sortSize, 64 );
} else } else
{ {
m_data->m_solverGPU->m_sortDataBuffer->resize(nContacts); m_data->m_solverGPU->m_sortDataBuffer->resize(nContacts);
b3AlignedObjectArray<b3SortData> sortDataCPU; b3AlignedObjectArray<b3SortData> sortDataCPU;
@@ -812,34 +812,34 @@ void b3GpuPgsContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem
if (gpuRadixSort) if (gpuRadixSort)
{ // 3. sort by cell idx { // 3. sort by cell idx
B3_PROFILE("gpuRadixSort"); B3_PROFILE("gpuRadixSort");
//int n = B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT; //int n = B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT;
//int sortBit = 32; //int sortBit = 32;
//if( n <= 0xffff ) sortBit = 16; //if( n <= 0xffff ) sortBit = 16;
//if( n <= 0xff ) sortBit = 8; //if( n <= 0xff ) sortBit = 8;
//adl::RadixSort<adl::TYPE_CL>::execute( data->m_sort, *data->m_sortDataBuffer, sortSize ); //adl::RadixSort<adl::TYPE_CL>::execute( data->m_sort, *data->m_sortDataBuffer, sortSize );
//adl::RadixSort32<adl::TYPE_CL>::execute( data->m_sort32, *data->m_sortDataBuffer, sortSize ); //adl::RadixSort32<adl::TYPE_CL>::execute( data->m_sort32, *data->m_sortDataBuffer, sortSize );
b3OpenCLArray<b3SortData>& keyValuesInOut = *(m_data->m_solverGPU->m_sortDataBuffer); b3OpenCLArray<b3SortData>& keyValuesInOut = *(m_data->m_solverGPU->m_sortDataBuffer);
this->m_data->m_solverGPU->m_sort32->execute(keyValuesInOut); this->m_data->m_solverGPU->m_sort32->execute(keyValuesInOut);
} else } else
{ {
b3OpenCLArray<b3SortData>& keyValuesInOut = *(m_data->m_solverGPU->m_sortDataBuffer); b3OpenCLArray<b3SortData>& keyValuesInOut = *(m_data->m_solverGPU->m_sortDataBuffer);
b3AlignedObjectArray<b3SortData> hostValues; b3AlignedObjectArray<b3SortData> hostValues;
keyValuesInOut.copyToHost(hostValues); keyValuesInOut.copyToHost(hostValues);
hostValues.quickSort(sortfnc); hostValues.quickSort(sortfnc);
keyValuesInOut.copyFromHost(hostValues); keyValuesInOut.copyFromHost(hostValues);
} }
if (useScanHost) if (useScanHost)
{ {
// 4. find entries // 4. find entries
B3_PROFILE("cpuBoundSearch"); B3_PROFILE("cpuBoundSearch");
b3AlignedObjectArray<unsigned int> countsHost; b3AlignedObjectArray<unsigned int> countsHost;
countsNative->copyToHost(countsHost); countsNative->copyToHost(countsHost);
@@ -853,32 +853,32 @@ void b3GpuPgsContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem
countsNative->copyFromHost(countsHost); countsNative->copyFromHost(countsHost);
//adl::BoundSearch<adl::TYPE_CL>::execute( data->m_search, *data->m_sortDataBuffer, nContacts, *countsNative, //adl::BoundSearch<adl::TYPE_CL>::execute( data->m_search, *data->m_sortDataBuffer, nContacts, *countsNative,
// B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT, adl::BoundSearchBase::COUNT ); // B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT, adl::BoundSearchBase::COUNT );
//unsigned int sum; //unsigned int sum;
//m_data->m_solverGPU->m_scan->execute(*countsNative,*offsetsNative, B3_SOLVER_N_CELLS);//,&sum ); //m_data->m_solverGPU->m_scan->execute(*countsNative,*offsetsNative, B3_SOLVER_N_CELLS);//,&sum );
b3AlignedObjectArray<unsigned int> offsetsHost; b3AlignedObjectArray<unsigned int> offsetsHost;
offsetsHost.resize(offsetsNative->size()); offsetsHost.resize(offsetsNative->size());
m_data->m_solverGPU->m_scan->executeHost(countsHost,offsetsHost, B3_SOLVER_N_CELLS);//,&sum ); m_data->m_solverGPU->m_scan->executeHost(countsHost,offsetsHost, B3_SOLVER_N_CELLS);//,&sum );
offsetsNative->copyFromHost(offsetsHost); offsetsNative->copyFromHost(offsetsHost);
//printf("sum = %d\n",sum); //printf("sum = %d\n",sum);
} else } else
{ {
// 4. find entries // 4. find entries
B3_PROFILE("gpuBoundSearch"); B3_PROFILE("gpuBoundSearch");
m_data->m_solverGPU->m_search->execute(*m_data->m_solverGPU->m_sortDataBuffer,nContacts,*countsNative,B3_SOLVER_N_CELLS,b3BoundSearchCL::COUNT); m_data->m_solverGPU->m_search->execute(*m_data->m_solverGPU->m_sortDataBuffer,nContacts,*countsNative,B3_SOLVER_N_CELLS,b3BoundSearchCL::COUNT);
m_data->m_solverGPU->m_scan->execute(*countsNative,*offsetsNative, B3_SOLVER_N_CELLS);//,&sum ); m_data->m_solverGPU->m_scan->execute(*countsNative,*offsetsNative, B3_SOLVER_N_CELLS);//,&sum );
} }
if (nContacts) if (nContacts)
{ // 5. sort constraints by cellIdx { // 5. sort constraints by cellIdx
if (reorderContactsOnCpu) if (reorderContactsOnCpu)
{ {
B3_PROFILE("cpu m_reorderContactKernel"); B3_PROFILE("cpu m_reorderContactKernel");
@@ -895,7 +895,7 @@ void b3GpuPgsContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem
} }
m_data->m_solverGPU->m_contactBuffer2->copyFromHost(outContacts); m_data->m_solverGPU->m_contactBuffer2->copyFromHost(outContacts);
/* "void ReorderContactKernel(__global struct b3Contact4Data* in, __global struct b3Contact4Data* out, __global int2* sortData, int4 cb )\n" /* "void ReorderContactKernel(__global struct b3Contact4Data* in, __global struct b3Contact4Data* out, __global int2* sortData, int4 cb )\n"
"{\n" "{\n"
" int nContacts = cb.x;\n" " int nContacts = cb.x;\n"
" int gIdx = GET_GLOBAL_IDX;\n" " int gIdx = GET_GLOBAL_IDX;\n"
@@ -907,38 +907,38 @@ void b3GpuPgsContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem
"}\n" "}\n"
*/ */
} else } else
{ {
B3_PROFILE("gpu m_reorderContactKernel"); B3_PROFILE("gpu m_reorderContactKernel");
b3Int4 cdata; b3Int4 cdata;
cdata.x = nContacts; cdata.x = nContacts;
b3BufferInfoCL bInfo[] = { b3BufferInfoCL bInfo[] = {
b3BufferInfoCL( m_data->m_pBufContactOutGPU->getBufferCL() ), b3BufferInfoCL( m_data->m_pBufContactOutGPU->getBufferCL() ),
b3BufferInfoCL( m_data->m_solverGPU->m_contactBuffer2->getBufferCL()) b3BufferInfoCL( m_data->m_solverGPU->m_contactBuffer2->getBufferCL())
, b3BufferInfoCL( m_data->m_solverGPU->m_sortDataBuffer->getBufferCL()) }; , b3BufferInfoCL( m_data->m_solverGPU->m_sortDataBuffer->getBufferCL()) };
b3LauncherCL launcher(m_data->m_queue,m_data->m_solverGPU->m_reorderContactKernel); b3LauncherCL launcher(m_data->m_queue,m_data->m_solverGPU->m_reorderContactKernel);
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst( cdata ); launcher.setConst( cdata );
launcher.launch1D( nContacts, 64 ); launcher.launch1D( nContacts, 64 );
} }
} }
} }
} }
//clFinish(m_data->m_queue); //clFinish(m_data->m_queue);
// { // {
// b3AlignedObjectArray<unsigned int> histogram; // b3AlignedObjectArray<unsigned int> histogram;
// m_data->m_solverGPU->m_numConstraints->copyToHost(histogram); // m_data->m_solverGPU->m_numConstraints->copyToHost(histogram);
// printf(",,,\n"); // printf(",,,\n");
// } // }
if (nContacts) if (nContacts)
@@ -949,8 +949,8 @@ void b3GpuPgsContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem
for (int i=0;i<nContacts;i++) for (int i=0;i<nContacts;i++)
{ {
m_data->m_pBufContactOutGPU->copyFromOpenCLArray(*m_data->m_solverGPU->m_contactBuffer2); m_data->m_pBufContactOutGPU->copyFromOpenCLArray(*m_data->m_solverGPU->m_contactBuffer2);
// m_data->m_solverGPU->m_contactBuffer2->getBufferCL(); // m_data->m_solverGPU->m_contactBuffer2->getBufferCL();
// m_data->m_pBufContactOutGPU->getBufferCL() // m_data->m_pBufContactOutGPU->getBufferCL()
} }
} else } else
@@ -972,7 +972,7 @@ void b3GpuPgsContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem
} }
bool compareGPU = false; bool compareGPU = false;
if (nContacts) if (nContacts)
{ {
if (b3GpuBatchContacts) if (b3GpuBatchContacts)
@@ -980,6 +980,7 @@ void b3GpuPgsContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem
B3_PROFILE("gpu batchContacts"); B3_PROFILE("gpu batchContacts");
maxNumBatches = 150;//250; maxNumBatches = 150;//250;
m_data->m_solverGPU->batchContacts( m_data->m_pBufContactOutGPU, nContacts, m_data->m_solverGPU->m_numConstraints, m_data->m_solverGPU->m_offsets, csCfg.m_staticIdx ); m_data->m_solverGPU->batchContacts( m_data->m_pBufContactOutGPU, nContacts, m_data->m_solverGPU->m_numConstraints, m_data->m_solverGPU->m_offsets, csCfg.m_staticIdx );
clFinish(m_data->m_queue);
} else } else
{ {
B3_PROFILE("cpu batchContacts"); B3_PROFILE("cpu batchContacts");
@@ -1003,25 +1004,16 @@ void b3GpuPgsContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem
int numNonzeroGrid=0; int numNonzeroGrid=0;
{ {
B3_PROFILE("batch grid"); B3_PROFILE("cpu batch grid");
for(int i=0; i<B3_SOLVER_N_CELLS; i++) for(int i=0; i<B3_SOLVER_N_CELLS; i++)
{ {
int n = (nNativeHost)[i]; int n = (nNativeHost)[i];
int offset = (offsetsNativeHost)[i]; int offset = (offsetsNativeHost)[i];
if( n ) if( n )
{ {
numNonzeroGrid++; numNonzeroGrid++;
//printf("cpu batch cell %d\n",i);
int simdWidth =numBodies+1;//-1;//64;//-1;//32; int simdWidth =numBodies+1;//-1;//64;//-1;//32;
//int numBatches = sortConstraintByBatch( &cpuContacts[0]+offset, n, simdWidth,csCfg.m_staticIdx ,numBodies); // on GPU
//int numBatches = sortConstraintByBatch2( &cpuContacts[0]+offset, n, simdWidth,csCfg.m_staticIdx ,numBodies); // on GPU
int numBatches = sortConstraintByBatch3( &cpuContacts[0]+offset, n, simdWidth,csCfg.m_staticIdx ,numBodies); // on GPU int numBatches = sortConstraintByBatch3( &cpuContacts[0]+offset, n, simdWidth,csCfg.m_staticIdx ,numBodies); // on GPU
maxNumBatches = b3Max(numBatches,maxNumBatches); maxNumBatches = b3Max(numBatches,maxNumBatches);
static int globalMaxBatch = 0; static int globalMaxBatch = 0;
if (maxNumBatches>globalMaxBatch ) if (maxNumBatches>globalMaxBatch )
@@ -1029,13 +1021,11 @@ void b3GpuPgsContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem
globalMaxBatch = maxNumBatches; globalMaxBatch = maxNumBatches;
b3Printf("maxNumBatches = %d\n",maxNumBatches); b3Printf("maxNumBatches = %d\n",maxNumBatches);
} }
//we use the clFinish for proper benchmark/profile //we use the clFinish for proper benchmark/profile
clFinish(m_data->m_queue);
} }
} }
clFinish(m_data->m_queue);
} }
{ {
B3_PROFILE("m_contactBuffer->copyFromHost"); B3_PROFILE("m_contactBuffer->copyFromHost");
@@ -1044,45 +1034,45 @@ void b3GpuPgsContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem
} }
} }
//printf("maxNumBatches = %d\n", maxNumBatches); //printf("maxNumBatches = %d\n", maxNumBatches);
if (nContacts) if (nContacts)
{ {
//B3_PROFILE("gpu convertToConstraints"); B3_PROFILE("gpu convertToConstraints");
m_data->m_solverGPU->convertToConstraints( bodyBuf, m_data->m_solverGPU->convertToConstraints( bodyBuf,
shapeBuf, m_data->m_solverGPU->m_contactBuffer2, shapeBuf, m_data->m_solverGPU->m_contactBuffer2,
contactConstraintOut, contactConstraintOut,
additionalData, nContacts, additionalData, nContacts,
(b3SolverBase::ConstraintCfg&) csCfg ); (b3SolverBase::ConstraintCfg&) csCfg );
clFinish(m_data->m_queue); clFinish(m_data->m_queue);
} }
} }
} }
if (1) if (1)
{ {
int numIter = 4; int numIter = 4;
m_data->m_solverGPU->m_nIterations = numIter;//10 m_data->m_solverGPU->m_nIterations = numIter;//10
if (b3GpuSolveConstraint) if (b3GpuSolveConstraint)
{ {
B3_PROFILE("GPU solveContactConstraint"); B3_PROFILE("GPU solveContactConstraint");
/*m_data->m_solverGPU->solveContactConstraint( /*m_data->m_solverGPU->solveContactConstraint(
m_data->m_bodyBufferGPU, m_data->m_bodyBufferGPU,
m_data->m_inertiaBufferGPU, m_data->m_inertiaBufferGPU,
m_data->m_contactCGPU,0, m_data->m_contactCGPU,0,
nContactOut , nContactOut ,
maxNumBatches); maxNumBatches);
*/ */
solveContactConstraint( solveContactConstraint(

View File

@@ -978,7 +978,10 @@ void b3Solver::convertToConstraints( const b3OpenCLArray<b3RigidBodyCL>* bodyBuf
cdata.m_positionDrift = cfg.m_positionDrift; cdata.m_positionDrift = cfg.m_positionDrift;
cdata.m_positionConstraintCoeff = cfg.m_positionConstraintCoeff; cdata.m_positionConstraintCoeff = cfg.m_positionConstraintCoeff;
b3AlignedObjectArray<b3RigidBodyCL> gBodies;
if (convertConstraintOnCpu)
{
b3AlignedObjectArray<b3RigidBodyCL> gBodies;
bodyBuf->copyToHost(gBodies); bodyBuf->copyToHost(gBodies);
b3AlignedObjectArray<b3Contact4> gContact; b3AlignedObjectArray<b3Contact4> gContact;
@@ -990,8 +993,6 @@ void b3Solver::convertToConstraints( const b3OpenCLArray<b3RigidBodyCL>* bodyBuf
b3AlignedObjectArray<b3GpuConstraint4> gConstraintOut; b3AlignedObjectArray<b3GpuConstraint4> gConstraintOut;
gConstraintOut.resize(nContacts); gConstraintOut.resize(nContacts);
if (convertConstraintOnCpu)
{
B3_PROFILE("cpu contactToConstraintKernel"); B3_PROFILE("cpu contactToConstraintKernel");
for (int gIdx=0;gIdx<nContacts;gIdx++) for (int gIdx=0;gIdx<nContacts;gIdx++)
{ {

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.