Remove clFinish and copyToHost calls from the GPU joint solver; performance is looking better now.

This commit is contained in:
erwin coumans
2013-07-09 11:19:16 -07:00
parent c4375a09e4
commit bd2cd7b2a7
3 changed files with 22 additions and 22 deletions

View File

@@ -187,7 +187,7 @@ int GpuConstraintsDemo::createDynamicsObjects2(const ConstructionInfo& ci, const
{ {
case 0: case 0:
{ {
///enable next line to force CPU constraint solving
//c = new b3Point2PointConstraint(pid,prevBody,b3Vector3(-1.1,0,0),b3Vector3(1.1,0,0)); //c = new b3Point2PointConstraint(pid,prevBody,b3Vector3(-1.1,0,0),b3Vector3(1.1,0,0));
// c->setBreakingImpulseThreshold(14); // c->setBreakingImpulseThreshold(14);
b3Vector3 pivotInA(-1.1,0,0); b3Vector3 pivotInA(-1.1,0,0);

View File

@@ -351,7 +351,7 @@ void b3GpuBatchingPgsSolver::solveContactConstraint( const b3OpenCLArray<b3Rigi
} }
} }
clFinish(m_data->m_queue); //clFinish(m_data->m_queue);
} }
@@ -388,7 +388,7 @@ void b3GpuBatchingPgsSolver::solveContactConstraint( const b3OpenCLArray<b3Rigi
launcher.launch1D( 64*nn/B3_SOLVER_N_BATCHES, 64 ); launcher.launch1D( 64*nn/B3_SOLVER_N_BATCHES, 64 );
} }
} }
clFinish(m_data->m_queue); //clFinish(m_data->m_queue);
} }
#ifdef DEBUG_ME #ifdef DEBUG_ME
@@ -458,7 +458,7 @@ void b3GpuBatchingPgsSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem
m_data->m_solverGPU->m_contactBuffer2->resize(nContacts); m_data->m_solverGPU->m_contactBuffer2->resize(nContacts);
} }
clFinish(m_data->m_queue); //clFinish(m_data->m_queue);
@@ -586,7 +586,7 @@ void b3GpuBatchingPgsSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem
} }
clFinish(m_data->m_queue); //clFinish(m_data->m_queue);
// { // {
// b3AlignedObjectArray<unsigned int> histogram; // b3AlignedObjectArray<unsigned int> histogram;
@@ -604,7 +604,7 @@ void b3GpuBatchingPgsSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst( cdata ); launcher.setConst( cdata );
launcher.launch1D( nContacts, 64 ); launcher.launch1D( nContacts, 64 );
clFinish(m_data->m_queue); //clFinish(m_data->m_queue);
} }
@@ -666,7 +666,7 @@ void b3GpuBatchingPgsSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem
b3Printf("maxNumBatches = %d\n",maxNumBatches); b3Printf("maxNumBatches = %d\n",maxNumBatches);
} }
clFinish(m_data->m_queue); //clFinish(m_data->m_queue);
} }
} }
@@ -691,7 +691,7 @@ void b3GpuBatchingPgsSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem
contactConstraintOut, contactConstraintOut,
additionalData, nContacts, additionalData, nContacts,
(b3SolverBase::ConstraintCfg&) csCfg ); (b3SolverBase::ConstraintCfg&) csCfg );
clFinish(m_data->m_queue); //clFinish(m_data->m_queue);
} }
@@ -723,7 +723,7 @@ void b3GpuBatchingPgsSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem
m_data->m_solverGPU->solveContactConstraintHost(m_data->m_bodyBufferGPU, m_data->m_inertiaBufferGPU, m_data->m_contactCGPU,0, nContactOut ,maxNumBatches); m_data->m_solverGPU->solveContactConstraintHost(m_data->m_bodyBufferGPU, m_data->m_inertiaBufferGPU, m_data->m_contactCGPU,0, nContactOut ,maxNumBatches);
} }
clFinish(m_data->m_queue); //clFinish(m_data->m_queue);
} }

View File

@@ -207,7 +207,7 @@ b3Scalar b3GpuPgsJacobiSolver::solveGroupCacheFriendlySetup(b3OpenCLArray<b3Rigi
launcher.setBuffer(gpuBodies->getBufferCL()); launcher.setBuffer(gpuBodies->getBufferCL());
launcher.setConst(numBodies); launcher.setConst(numBodies);
launcher.launch1D(numBodies); launcher.launch1D(numBodies);
clFinish(m_gpuData->m_queue); //clFinish(m_gpuData->m_queue);
// m_gpuData->m_gpuSolverBodies->copyToHost(m_tmpSolverBodyPool); // m_gpuData->m_gpuSolverBodies->copyToHost(m_tmpSolverBodyPool);
} else } else
@@ -264,20 +264,20 @@ b3Scalar b3GpuPgsJacobiSolver::solveGroupCacheFriendlySetup(b3OpenCLArray<b3Rigi
launcher.setConst(numConstraints); launcher.setConst(numConstraints);
launcher.launch1D(numConstraints); launcher.launch1D(numConstraints);
} }
clFinish(m_gpuData->m_queue); //clFinish(m_gpuData->m_queue);
if (batches.size()==0) if (batches.size()==0)
m_gpuData->m_gpuBatchConstraints->copyToHost(batchConstraints); m_gpuData->m_gpuBatchConstraints->copyToHost(batchConstraints);
if (1) if (1)
{ {
m_gpuData->m_gpuConstraintInfo1->copyToHost(m_tmpConstraintSizesPool); //m_gpuData->m_gpuConstraintInfo1->copyToHost(m_tmpConstraintSizesPool);
b3OpenCLArray<unsigned int> dst(m_gpuData->m_context,m_gpuData->m_queue); b3OpenCLArray<unsigned int> dst(m_gpuData->m_context,m_gpuData->m_queue);
dst.resize(numConstraints); dst.resize(numConstraints);
unsigned int total=0; unsigned int total=0;
m_gpuData->m_prefixScan->execute(*m_gpuData->m_gpuConstraintInfo1,dst,numConstraints,&total); m_gpuData->m_prefixScan->execute(*m_gpuData->m_gpuConstraintInfo1,dst,numConstraints,&total);
unsigned int lastElem = m_gpuData->m_gpuConstraintInfo1->at(numConstraints-1); unsigned int lastElem = m_gpuData->m_gpuConstraintInfo1->at(numConstraints-1);
b3AlignedObjectArray<unsigned int> dstHost; //b3AlignedObjectArray<unsigned int> dstHost;
dst.copyToHost(dstHost); //dst.copyToHost(dstHost);
totalNumRows = total+lastElem; totalNumRows = total+lastElem;
{ {
@@ -287,7 +287,7 @@ b3Scalar b3GpuPgsJacobiSolver::solveGroupCacheFriendlySetup(b3OpenCLArray<b3Rigi
launcher.setBuffer(m_gpuData->m_gpuBatchConstraints->getBufferCL()); launcher.setBuffer(m_gpuData->m_gpuBatchConstraints->getBufferCL());
launcher.setConst(numConstraints); launcher.setConst(numConstraints);
launcher.launch1D(numConstraints); launcher.launch1D(numConstraints);
clFinish(m_gpuData->m_queue); //clFinish(m_gpuData->m_queue);
} }
if (batches.size()==0) if (batches.size()==0)
m_gpuData->m_gpuBatchConstraints->copyToHost(batchConstraints); m_gpuData->m_gpuBatchConstraints->copyToHost(batchConstraints);
@@ -346,12 +346,12 @@ b3Scalar b3GpuPgsJacobiSolver::solveGroupCacheFriendlySetup(b3OpenCLArray<b3Rigi
launcher.setConst(infoGlobal.m_numIterations); launcher.setConst(infoGlobal.m_numIterations);
launcher.setConst(numConstraints); launcher.setConst(numConstraints);
launcher.launch1D(numConstraints); launcher.launch1D(numConstraints);
clFinish(m_gpuData->m_queue); //clFinish(m_gpuData->m_queue);
if (batches.size()==0) if (batches.size()==0)
m_gpuData->m_gpuBatchConstraints->copyToHost(batchConstraints); m_gpuData->m_gpuBatchConstraints->copyToHost(batchConstraints);
//m_gpuData->m_gpuConstraintRows->copyToHost(verify); //m_gpuData->m_gpuConstraintRows->copyToHost(verify);
m_gpuData->m_gpuConstraintRows->copyToHost(m_tmpSolverNonContactConstraintPool); //m_gpuData->m_gpuConstraintRows->copyToHost(m_tmpSolverNonContactConstraintPool);
@@ -645,7 +645,7 @@ b3Scalar b3GpuPgsJacobiSolver::solveGroupCacheFriendlyIterations(b3OpenCLArray<b
{ {
//only create the batches once. //only create the batches once.
//@todo: incrementally update batches when constraints are added/activated and/or removed/deactivated //@todo: incrementally update batches when constraints are added/activated and/or removed/deactivated
bool createBatches = true;//batches.size()==0; bool createBatches = batches.size()==0;
{ {
B3_PROFILE("GpuSolveGroupCacheFriendlyIterations"); B3_PROFILE("GpuSolveGroupCacheFriendlyIterations");
if (createBatches) if (createBatches)
@@ -701,7 +701,7 @@ b3Scalar b3GpuPgsJacobiSolver::solveGroupCacheFriendlyIterations(b3OpenCLArray<b
launcher.setConst(numConstraintsInBatch); launcher.setConst(numConstraintsInBatch);
launcher.launch1D(numConstraintsInBatch); launcher.launch1D(numConstraintsInBatch);
clFinish(m_gpuData->m_queue); //clFinish(m_gpuData->m_queue);
} else } else
{ {
@@ -740,8 +740,8 @@ b3Scalar b3GpuPgsJacobiSolver::solveGroupCacheFriendlyIterations(b3OpenCLArray<b
if (useGpu) if (useGpu)
{ {
B3_PROFILE("copy to host"); //B3_PROFILE("copy to host");
m_gpuData->m_gpuSolverBodies->copyToHost(m_tmpSolverBodyPool); //m_gpuData->m_gpuSolverBodies->copyToHost(m_tmpSolverBodyPool);
} }
//int sz = sizeof(b3GpuSolverBody); //int sz = sizeof(b3GpuSolverBody);
//printf("cpu sizeof(b3GpuSolverBody)=%d\n",sz); //printf("cpu sizeof(b3GpuSolverBody)=%d\n",sz);
@@ -971,7 +971,7 @@ b3Scalar b3GpuPgsJacobiSolver::solveGroupCacheFriendlyFinish(b3OpenCLArray<b3Rig
launcher.setBuffer(m_gpuData->m_gpuSolverBodies->getBufferCL()); launcher.setBuffer(m_gpuData->m_gpuSolverBodies->getBufferCL());
launcher.setConst(numBodies); launcher.setConst(numBodies);
launcher.launch1D(numBodies); launcher.launch1D(numBodies);
clFinish(m_gpuData->m_queue); //clFinish(m_gpuData->m_queue);
// m_gpuData->m_gpuSolverBodies->copyToHost(m_tmpSolverBodyPool); // m_gpuData->m_gpuSolverBodies->copyToHost(m_tmpSolverBodyPool);
// m_gpuData->m_gpuBodies->copyToHostPointer(bodies,numBodies); // m_gpuData->m_gpuBodies->copyToHostPointer(bodies,numBodies);
//m_gpuData->m_gpuBodies->copyToHost(testBodies); //m_gpuData->m_gpuBodies->copyToHost(testBodies);