From 1e31073f4b0b31501a655273359377354eacf09c Mon Sep 17 00:00:00 2001 From: erwin coumans Date: Sun, 14 Jul 2013 19:16:33 -0700 Subject: [PATCH] made the simulation deterministic disable 'simdwidth' optimization for determinism (need to double-check) made the spatial batching 3D --- Demos3/GpuDemos/rigidbody/GpuConvexScene.cpp | 5 +- btgui/OpenGLWindow/GLInstancingRenderer.cpp | 5 +- .../ParallelPrimitives/b3RadixSort32CL.h | 13 +- .../RigidBody/b3GpuBatchingPgsSolver.cpp | 345 ++++++++++++--- .../RigidBody/b3GpuBatchingPgsSolver.h | 2 +- .../RigidBody/b3GpuRigidBodyPipeline.cpp | 1 + src/Bullet3OpenCL/RigidBody/b3Solver.cpp | 413 ++++++++++++++---- src/Bullet3OpenCL/RigidBody/b3Solver.h | 20 +- .../RigidBody/kernels/solveContact.cl | 27 +- .../RigidBody/kernels/solveContact.h | 27 +- .../RigidBody/kernels/solveFriction.cl | 13 +- .../RigidBody/kernels/solveFriction.h | 13 +- .../RigidBody/kernels/solverSetup2.cl | 45 +- .../RigidBody/kernels/solverSetup2.h | 45 +- 14 files changed, 805 insertions(+), 169 deletions(-) diff --git a/Demos3/GpuDemos/rigidbody/GpuConvexScene.cpp b/Demos3/GpuDemos/rigidbody/GpuConvexScene.cpp index a73a376ef..48d4e3e8a 100644 --- a/Demos3/GpuDemos/rigidbody/GpuConvexScene.cpp +++ b/Demos3/GpuDemos/rigidbody/GpuConvexScene.cpp @@ -40,7 +40,10 @@ void GpuConvexScene::setupScene(const ConstructionInfo& ci) //float camPos[4]={1,12.5,1.5,0}; m_instancingRenderer->setCameraTargetPosition(camPos); - m_instancingRenderer->setCameraDistance(100); + m_instancingRenderer->setCameraDistance(120); + //m_instancingRenderer->setCameraYaw(85); + m_instancingRenderer->setCameraYaw(30); + m_instancingRenderer->setCameraPitch(225); m_instancingRenderer->updateCamera(); diff --git a/btgui/OpenGLWindow/GLInstancingRenderer.cpp b/btgui/OpenGLWindow/GLInstancingRenderer.cpp index e57306f7d..997022b55 100644 --- a/btgui/OpenGLWindow/GLInstancingRenderer.cpp +++ b/btgui/OpenGLWindow/GLInstancingRenderer.cpp @@ -195,7 +195,10 @@ struct InternalDataRenderer : public GLInstanceRendererInternalData m_ele += yDelta*0.1f; // } } - + + //printf("m_azi/pitch = %f\n", m_azi); +// printf("m_ele/yaw = %f\n", m_ele); + m_mouseXpos = x; m_mouseYpos = y; m_mouseInitialized = true; diff --git a/src/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h b/src/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h index 61d4573bc..0adebb947 100644 --- a/src/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h +++ b/src/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h @@ -6,8 +6,17 @@ struct b3SortData { - int m_key; - int m_value; + union + { + int m_key; + int x; + }; + + union + { + int m_value; + int y; + }; }; #include "b3BufferInfoCL.h" diff --git a/src/Bullet3OpenCL/RigidBody/b3GpuBatchingPgsSolver.cpp b/src/Bullet3OpenCL/RigidBody/b3GpuBatchingPgsSolver.cpp index 267b33a5a..120f596ff 100644 --- a/src/Bullet3OpenCL/RigidBody/b3GpuBatchingPgsSolver.cpp +++ b/src/Bullet3OpenCL/RigidBody/b3GpuBatchingPgsSolver.cpp @@ -1,7 +1,10 @@ bool b3GpuBatchContacts = true; bool b3GpuSolveConstraint = true; - +bool gpuRadixSort=true; +bool gpuSetSortData = true; +bool gpuSortContacts = true; +bool optionalSortContactsDeterminism = true; #include "b3GpuBatchingPgsSolver.h" #include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h" @@ -31,15 +34,6 @@ bool b3GpuSolveConstraint = true; -enum -{ - B3_SOLVER_N_SPLIT = 16, - B3_SOLVER_N_BATCHES = 4, - B3_SOLVER_N_OBJ_PER_SPLIT = 10, - B3_SOLVER_N_TASKS_PER_BATCH = B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT, -}; - - struct b3GpuBatchingPgsSolverInternalData @@ -65,6 +59,10 @@ struct b3GpuBatchingPgsSolverInternalData cl_kernel m_reorderContactKernel; cl_kernel m_copyConstraintKernel; + cl_kernel m_setDeterminismSortDataBodyAKernel; + cl_kernel m_setDeterminismSortDataBodyBKernel; + + class b3RadixSort32CL* m_sort32; class b3BoundSearchCL* m_search; class b3PrefixScanCL* m_scan; @@ -75,6 +73,9 @@ struct b3GpuBatchingPgsSolverInternalData b3OpenCLArray* m_bodyBufferGPU; b3OpenCLArray* m_inertiaBufferGPU; b3OpenCLArray* m_pBufContactOutGPU; + + b3OpenCLArray* m_pBufContactOutGPUCopy; + b3OpenCLArray* m_contactKeyValues; b3AlignedObjectArray m_idxBuffer; @@ -86,6 +87,7 @@ struct b3GpuBatchingPgsSolverInternalData b3GpuBatchingPgsSolver::b3GpuBatchingPgsSolver(cl_context ctx,cl_device_id device, cl_command_queue q,int pairCapacity) { + m_debugOutput=0; m_data = new b3GpuBatchingPgsSolverInternalData; m_data->m_context = ctx; m_data->m_device = device; @@ -97,24 +99,28 @@ b3GpuBatchingPgsSolver::b3GpuBatchingPgsSolver(cl_context ctx,cl_device_id devic m_data->m_inertiaBufferGPU = new b3OpenCLArray(ctx,q); m_data->m_pBufContactOutGPU = new b3OpenCLArray(ctx,q); + m_data->m_pBufContactOutGPUCopy = new b3OpenCLArray(ctx,q); + m_data->m_contactKeyValues = new b3OpenCLArray(ctx,q); + + m_data->m_solverGPU = new b3Solver(ctx,device,q,512*1024); m_data->m_sort32 = new b3RadixSort32CL(ctx,device,m_data->m_queue); - m_data->m_scan = new b3PrefixScanCL(ctx,device,m_data->m_queue,B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT); - m_data->m_search = new b3BoundSearchCL(ctx,device,m_data->m_queue,B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT); + m_data->m_scan = new b3PrefixScanCL(ctx,device,m_data->m_queue,B3_SOLVER_N_CELLS); + m_data->m_search = new b3BoundSearchCL(ctx,device,m_data->m_queue,B3_SOLVER_N_CELLS); const int sortSize = B3NEXTMULTIPLEOF( pairCapacity, 512 ); m_data->m_sortDataBuffer = new b3OpenCLArray(ctx,m_data->m_queue,sortSize); m_data->m_contactBuffer = new b3OpenCLArray(ctx,m_data->m_queue); - m_data->m_numConstraints = new b3OpenCLArray(ctx,m_data->m_queue,B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT ); - m_data->m_numConstraints->resize(B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT); + m_data->m_numConstraints = new b3OpenCLArray(ctx,m_data->m_queue,B3_SOLVER_N_CELLS); + m_data->m_numConstraints->resize(B3_SOLVER_N_CELLS); m_data->m_contactCGPU = new b3OpenCLArray(ctx,q,pairCapacity); - m_data->m_offsets = new b3OpenCLArray( ctx,m_data->m_queue, B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT ); - m_data->m_offsets->resize(B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT); + m_data->m_offsets = new b3OpenCLArray( ctx,m_data->m_queue,B3_SOLVER_N_CELLS); + m_data->m_offsets->resize(B3_SOLVER_N_CELLS); const char* additionalMacros = ""; const char* srcFileNameForCaching=""; @@ -132,7 +138,7 @@ b3GpuBatchingPgsSolver::b3GpuBatchingPgsSolver(cl_context ctx,cl_device_id devic { - cl_program solveContactProg= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solveContactSource, &pErrNum,additionalMacros, B3_SOLVER_CONTACT_KERNEL_PATH); + cl_program solveContactProg= b3OpenCLUtils::compileCLProgramFromString( ctx, device, 0, &pErrNum,additionalMacros, B3_SOLVER_CONTACT_KERNEL_PATH,true); b3Assert(solveContactProg); cl_program solveFrictionProg= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solveFrictionSource, &pErrNum,additionalMacros, B3_SOLVER_FRICTION_KERNEL_PATH); @@ -149,7 +155,7 @@ b3GpuBatchingPgsSolver::b3GpuBatchingPgsSolver(cl_context ctx,cl_device_id devic m_data->m_solveFrictionKernel= b3OpenCLUtils::compileCLKernelFromString( ctx, device, solveFrictionSource, "BatchSolveKernelFriction", &pErrNum, solveFrictionProg,additionalMacros ); b3Assert(m_data->m_solveFrictionKernel); - m_data->m_solveContactKernel= b3OpenCLUtils::compileCLKernelFromString( ctx, device, solveContactSource, "BatchSolveKernelContact", &pErrNum, solveContactProg,additionalMacros ); + m_data->m_solveContactKernel= b3OpenCLUtils::compileCLKernelFromString( ctx, device, 0, "BatchSolveKernelContact", &pErrNum, solveContactProg,additionalMacros ); b3Assert(m_data->m_solveContactKernel); m_data->m_contactToConstraintKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetupSource, "ContactToConstraintKernel", &pErrNum, solverSetupProg,additionalMacros ); @@ -157,7 +163,13 @@ b3GpuBatchingPgsSolver::b3GpuBatchingPgsSolver(cl_context ctx,cl_device_id devic m_data->m_setSortDataKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "SetSortDataKernel", &pErrNum, solverSetup2Prog,additionalMacros ); b3Assert(m_data->m_setSortDataKernel); - + + m_data->m_setDeterminismSortDataBodyAKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "SetDeterminismSortDataBodyA", &pErrNum, solverSetup2Prog,additionalMacros ); + b3Assert(m_data->m_setDeterminismSortDataBodyAKernel); + + m_data->m_setDeterminismSortDataBodyBKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "SetDeterminismSortDataBodyB", &pErrNum, solverSetup2Prog,additionalMacros ); + b3Assert(m_data->m_setDeterminismSortDataBodyBKernel); + m_data->m_reorderContactKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "ReorderContactKernel", &pErrNum, solverSetup2Prog,additionalMacros ); b3Assert(m_data->m_reorderContactKernel); @@ -196,6 +208,10 @@ b3GpuBatchingPgsSolver::~b3GpuBatchingPgsSolver() delete m_data->m_bodyBufferGPU; delete m_data->m_inertiaBufferGPU; delete m_data->m_pBufContactOutGPU; + delete m_data->m_pBufContactOutGPUCopy; + delete m_data->m_contactKeyValues; + + delete m_data->m_contactCGPU; delete m_data->m_numConstraints; @@ -232,23 +248,24 @@ struct b3ConstraintCfg float m_positionConstraintCoeff; float m_dt; bool m_enableParallelSolve; - float m_averageExtent; + float m_batchCellSize; int m_staticIdx; }; - void b3GpuBatchingPgsSolver::solveContactConstraint( const b3OpenCLArray* bodyBuf, const b3OpenCLArray* shapeBuf, b3OpenCLArray* constraint, void* additionalData, int n ,int maxNumBatches,int numIterations) { + //sort the contacts + b3Int4 cdata = b3MakeInt4( n, 0, 0, 0 ); { - const int nn = B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT; + const int nn = B3_SOLVER_N_CELLS; cdata.x = 0; cdata.y = maxNumBatches;//250; @@ -276,7 +293,7 @@ void b3GpuBatchingPgsSolver::solveContactConstraint( const b3OpenCLArraym_queue, m_data->m_solveContactKernel ); #if 1 @@ -286,8 +303,8 @@ void b3GpuBatchingPgsSolver::solveContactConstraint( const b3OpenCLArraygetBufferCL() ), b3BufferInfoCL( shapeBuf->getBufferCL() ), b3BufferInfoCL( constraint->getBufferCL() ), - b3BufferInfoCL( m_data->m_numConstraints->getBufferCL() ), - b3BufferInfoCL( m_data->m_offsets->getBufferCL() ) + b3BufferInfoCL( m_data->m_solverGPU->m_numConstraints->getBufferCL() ), + b3BufferInfoCL( m_data->m_solverGPU->m_offsets->getBufferCL() ) #ifdef DEBUG_ME , b3BufferInfoCL(&gpuDebugInfo) #endif @@ -299,7 +316,12 @@ void b3GpuBatchingPgsSolver::solveContactConstraint( const b3OpenCLArraygetBufferCL() ), b3BufferInfoCL( shapeBuf->getBufferCL() ), b3BufferInfoCL( constraint->getBufferCL() ), - b3BufferInfoCL( m_data->m_numConstraints->getBufferCL() ), - b3BufferInfoCL( m_data->m_offsets->getBufferCL() ) + b3BufferInfoCL( m_data->m_solverGPU->m_numConstraints->getBufferCL() ), + b3BufferInfoCL( m_data->m_solverGPU->m_offsets->getBufferCL() ) #ifdef DEBUG_ME ,b3BufferInfoCL(&gpuDebugInfo) #endif //DEBUG_ME @@ -383,7 +405,13 @@ void b3GpuBatchingPgsSolver::solveContactConstraint( const b3OpenCLArraym_bodyBufferGPU->setFromOpenCLBuffer(bodyBuf,numBodies); m_data->m_inertiaBufferGPU->setFromOpenCLBuffer(inertiaBuf,numBodies); m_data->m_pBufContactOutGPU->setFromOpenCLBuffer(contactBuf,numContacts); + if (optionalSortContactsDeterminism) + { + if (gpuSortContacts) + { + B3_PROFILE("GPU Sort contact constraints (determinism)"); + + m_data->m_pBufContactOutGPUCopy->resize(numContacts); + m_data->m_contactKeyValues->resize(numContacts); + + m_data->m_pBufContactOutGPU->copyToCL(m_data->m_pBufContactOutGPUCopy->getBufferCL(),numContacts,0,0); + + + { + b3LauncherCL launcher(m_data->m_queue, m_data->m_setDeterminismSortDataBodyAKernel); + launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL()); + launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL()); + launcher.setConst(numContacts); + launcher.launch1D( numContacts, 64 ); + } + + m_data->m_solverGPU->m_sort32->execute(*m_data->m_contactKeyValues); + + { + b3LauncherCL launcher(m_data->m_queue, m_data->m_setDeterminismSortDataBodyBKernel); + launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL()); + launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL()); + launcher.setConst(numContacts); + launcher.launch1D( numContacts, 64 ); + } + + m_data->m_solverGPU->m_sort32->execute(*m_data->m_contactKeyValues); + + //__global Contact4* in, __global Contact4* out, __global int2* sortData, int4 cb ) + + { + B3_PROFILE("gpu reorderContactKernel (determinism)"); + + b3Int4 cdata; + cdata.x = numContacts; + + //b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_data->m_pBufContactOutGPU->getBufferCL() ), b3BufferInfoCL( m_data->m_solverGPU->m_contactBuffer2->getBufferCL()) + // , b3BufferInfoCL( m_data->m_solverGPU->m_sortDataBuffer->getBufferCL()) }; + b3LauncherCL launcher(m_data->m_queue,m_data->m_solverGPU->m_reorderContactKernel); + launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL()); + launcher.setBuffer(m_data->m_pBufContactOutGPU->getBufferCL()); + launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL()); + launcher.setConst( cdata ); + launcher.launch1D( numContacts, 64 ); + } + + } else + { + B3_PROFILE("CPU Sort contact constraints (determinism)"); + b3AlignedObjectArray cpuConstraints; + m_data->m_pBufContactOutGPU->copyToHost(cpuConstraints); + bool sort = true; + if (sort) + { + cpuConstraints.quickSort(b3ContactCmp); + + for (int i=0;im_pBufContactOutGPU->copyFromHost(cpuConstraints); + if (m_debugOutput==100) + { + for (int i=0;im_pBufContactOutGPU->size(); bool useSolver = true; @@ -431,7 +641,7 @@ void b3GpuBatchingPgsSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem float dt=1./60.; b3ConstraintCfg csCfg( dt ); csCfg.m_enableParallelSolve = true; - csCfg.m_averageExtent = 0.3;//0.1;//2;//.2f;//@TODO m_averageObjExtent; + csCfg.m_batchCellSize = 6; csCfg.m_staticIdx = static0Index; @@ -485,7 +695,9 @@ void b3GpuBatchingPgsSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem b3OpenCLArray* countsNative = m_data->m_solverGPU->m_numConstraints; b3OpenCLArray* offsetsNative = m_data->m_solverGPU->m_offsets; - + + + if (gpuSetSortData) { // 2. set cell idx B3_PROFILE("GPU set cell idx"); struct CB @@ -493,15 +705,17 @@ void b3GpuBatchingPgsSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem int m_nContacts; int m_staticIdx; float m_scale; - int m_nSplit; + b3Int4 m_nSplit; }; b3Assert( sortSize%64 == 0 ); CB cdata; cdata.m_nContacts = nContacts; cdata.m_staticIdx = csCfg.m_staticIdx; - cdata.m_scale = 1.f/(B3_SOLVER_N_OBJ_PER_SPLIT*csCfg.m_averageExtent); - cdata.m_nSplit = B3_SOLVER_N_SPLIT; + cdata.m_scale = 1.f/csCfg.m_batchCellSize; + cdata.m_nSplit.x = B3_SOLVER_N_SPLIT_X; + cdata.m_nSplit.y = B3_SOLVER_N_SPLIT_Y; + cdata.m_nSplit.z = B3_SOLVER_N_SPLIT_Z; m_data->m_solverGPU->m_sortDataBuffer->resize(nContacts); @@ -516,10 +730,30 @@ void b3GpuBatchingPgsSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem launcher.launch1D( sortSize, 64 ); - } + } else + { + m_data->m_solverGPU->m_sortDataBuffer->resize(nContacts); + b3AlignedObjectArray sortDataCPU; + m_data->m_solverGPU->m_sortDataBuffer->copyToHost(sortDataCPU); + + b3AlignedObjectArray contactCPU; + m_data->m_pBufContactOutGPU->copyToHost(contactCPU); + b3AlignedObjectArray bodiesCPU; + bodyBuf->copyToHost(bodiesCPU); + float scale = 1.f/csCfg.m_batchCellSize; + b3Int4 nSplit; + nSplit.x = B3_SOLVER_N_SPLIT_X; + nSplit.y = B3_SOLVER_N_SPLIT_Y; + nSplit.z = B3_SOLVER_N_SPLIT_Z; + + SetSortDataCPU(&contactCPU[0], &bodiesCPU[0], &sortDataCPU[0], nContacts,scale,nSplit,csCfg.m_staticIdx); + + + m_data->m_solverGPU->m_sortDataBuffer->copyFromHost(sortDataCPU); + } + - bool gpuRadixSort=true; if (gpuRadixSort) { // 3. sort by cell idx B3_PROFILE("gpuRadixSort"); @@ -543,21 +777,21 @@ void b3GpuBatchingPgsSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem keyValuesInOut.copyFromHost(hostValues); } + { // 4. find entries B3_PROFILE("gpuBoundSearch"); - m_data->m_solverGPU->m_search->execute(*m_data->m_solverGPU->m_sortDataBuffer,nContacts,*countsNative, - B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT,b3BoundSearchCL::COUNT); - + m_data->m_solverGPU->m_search->execute(*m_data->m_solverGPU->m_sortDataBuffer,nContacts,*countsNative,B3_SOLVER_N_CELLS,b3BoundSearchCL::COUNT); + //adl::BoundSearch::execute( data->m_search, *data->m_sortDataBuffer, nContacts, *countsNative, // B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT, adl::BoundSearchBase::COUNT ); //unsigned int sum; - m_data->m_solverGPU->m_scan->execute(*countsNative,*offsetsNative, B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT);//,&sum ); + m_data->m_solverGPU->m_scan->execute(*countsNative,*offsetsNative, B3_SOLVER_N_CELLS);//,&sum ); //printf("sum = %d\n",sum); - } + } @@ -640,7 +874,7 @@ void b3GpuBatchingPgsSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem { B3_PROFILE("batch grid"); - for(int i=0; im_solverGPU->m_nIterations = 4;//10 + int numIter = 4; + + m_data->m_solverGPU->m_nIterations = numIter;//10 if (b3GpuSolveConstraint) { B3_PROFILE("GPU solveContactConstraint"); - m_data->m_solverGPU->solveContactConstraint( + /*m_data->m_solverGPU->solveContactConstraint( m_data->m_bodyBufferGPU, m_data->m_inertiaBufferGPU, m_data->m_contactCGPU,0, nContactOut , maxNumBatches); + */ + + solveContactConstraint( + m_data->m_bodyBufferGPU, + m_data->m_inertiaBufferGPU, + m_data->m_contactCGPU,0, + nContactOut , + maxNumBatches,numIter); + } else { @@ -803,6 +1048,8 @@ inline int b3GpuBatchingPgsSolver::sortConstraintByBatch( b3Contact4* cs, int n, for(int i=0; i(ctx,queue,sortSize); m_contactBuffer2 = new b3OpenCLArray(ctx,queue); - m_numConstraints = new b3OpenCLArray(ctx,queue,N_SPLIT*N_SPLIT ); - m_numConstraints->resize(N_SPLIT*N_SPLIT); + m_numConstraints = new b3OpenCLArray(ctx,queue,B3_SOLVER_N_CELLS ); + m_numConstraints->resize(B3_SOLVER_N_CELLS); - m_offsets = new b3OpenCLArray( ctx,queue, N_SPLIT*N_SPLIT ); - m_offsets->resize(N_SPLIT*N_SPLIT); + m_offsets = new b3OpenCLArray( ctx,queue,B3_SOLVER_N_CELLS); + m_offsets->resize(B3_SOLVER_N_CELLS); const char* additionalMacros = ""; const char* srcFileNameForCaching=""; @@ -122,7 +122,7 @@ b3Solver::b3Solver(cl_context ctx, cl_device_id device, cl_command_queue queue, { - cl_program solveContactProg= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solveContactSource, &pErrNum,additionalMacros, B3_SOLVER_CONTACT_KERNEL_PATH); + cl_program solveContactProg= b3OpenCLUtils::compileCLProgramFromString( ctx, device, 0, &pErrNum,additionalMacros, B3_SOLVER_CONTACT_KERNEL_PATH,false); b3Assert(solveContactProg); cl_program solveFrictionProg= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solveFrictionSource, &pErrNum,additionalMacros, B3_SOLVER_FRICTION_KERNEL_PATH); @@ -168,8 +168,8 @@ b3Solver::b3Solver(cl_context ctx, cl_device_id device, cl_command_queue queue, cl_program batchingNewProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, batchKernelNewSource, &pErrNum,additionalMacros, B3_BATCHING_NEW_PATH); b3Assert(batchingNewProg); - m_batchingKernelNew = b3OpenCLUtils::compileCLKernelFromString( ctx, device, batchKernelNewSource, "CreateBatchesNew", &pErrNum, batchingNewProg,additionalMacros ); - //m_batchingKernelNew = b3OpenCLUtils::compileCLKernelFromString( ctx, device, batchKernelNewSource, "CreateBatchesBruteForce", &pErrNum, batchingNewProg,additionalMacros ); + //m_batchingKernelNew = b3OpenCLUtils::compileCLKernelFromString( ctx, device, batchKernelNewSource, "CreateBatchesNew", &pErrNum, batchingNewProg,additionalMacros ); + m_batchingKernelNew = b3OpenCLUtils::compileCLKernelFromString( ctx, device, batchKernelNewSource, "CreateBatchesBruteForce", &pErrNum, batchingNewProg,additionalMacros ); b3Assert(m_batchingKernelNew); } } @@ -454,69 +454,160 @@ void solveContact(b3GpuConstraint4& cs, struct SolveTask// : public ThreadPool::Task { SolveTask(b3AlignedObjectArray& bodies, b3AlignedObjectArray& shapes, b3AlignedObjectArray& constraints, - int start, int nConstraints) + int start, int nConstraints,int maxNumBatches,b3AlignedObjectArray* wgUsedBodies, int curWgidx) : m_bodies( bodies ), m_shapes( shapes ), m_constraints( constraints ), m_start( start ), m_nConstraints( nConstraints ), - m_solveFriction( true ){} + m_solveFriction( true ),m_maxNumBatches(maxNumBatches), + m_wgUsedBodies(wgUsedBodies),m_curWgidx(curWgidx) + {} unsigned short int getType(){ return 0; } void run(int tIdx) { + b3AlignedObjectArray usedBodies; + //printf("run..............\n"); + - - for(int ic=0; ic=0; ic--) + //for(int ic=0; ic( m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass, (const b3Matrix3x3 &)m_shapes[aIdx].m_invInertiaWorld, - (b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass, (const b3Matrix3x3 &)m_shapes[bIdx].m_invInertiaWorld, - maxRambdaDt, minRambdaDt ); + float frictionCoeff = m_constraints[i].getFrictionCoeff(); + int aIdx = (int)m_constraints[i].m_bodyA; + int bIdx = (int)m_constraints[i].m_bodyB; + int localBatch = m_constraints[i].m_batchIdx; + b3RigidBodyCL& bodyA = m_bodies[aIdx]; + b3RigidBodyCL& bodyB = m_bodies[bIdx]; - } - else - { - float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; - float minRambdaDt[4] = {0.f,0.f,0.f,0.f}; - - float sum = 0; - for(int j=0; j<4; j++) + if ((bodyA.m_invMass) && (bodyB.m_invMass)) { - sum +=m_constraints[i].m_appliedRambdaDt[j]; + // printf("aIdx=%d, bIdx=%d\n", aIdx,bIdx); } - frictionCoeff = 0.7f; - for(int j=0; j<4; j++) + if (bIdx==10) { - maxRambdaDt[j] = frictionCoeff*sum; - minRambdaDt[j] = -maxRambdaDt[j]; + //printf("ic(b)=%d, localBatch=%d\n",ic,localBatch); } - solveFriction( m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass,(const b3Matrix3x3 &) m_shapes[aIdx].m_invInertiaWorld, - (b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass,(const b3Matrix3x3 &) m_shapes[bIdx].m_invInertiaWorld, - maxRambdaDt, minRambdaDt ); + if (aIdx==10) + { + //printf("ic(a)=%d, localBatch=%d\n",ic,localBatch); + } + if (usedBodies.size()<(aIdx+1)) + { + usedBodies.resize(aIdx+1,0); + } + + if (usedBodies.size()<(bIdx+1)) + { + usedBodies.resize(bIdx+1,0); + } + + if (bodyA.m_invMass) + { + b3Assert(usedBodies[aIdx]==0); + } + if (m_wgUsedBodies) + { + for (int w=0;waIdx) + { + b3Assert(m_wgUsedBodies[w][aIdx]==0); + } + } + if (bodyB.m_invMass) + { + if (m_wgUsedBodies[w].size()>bIdx) + { + b3Assert(m_wgUsedBodies[w][bIdx]==0); + } + } + } + } + } + usedBodies[aIdx]++; + if (bodyB.m_invMass) + { + b3Assert(usedBodies[bIdx]==0); + } + usedBodies[bIdx]++; + + if( !m_solveFriction ) + { + float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; + float minRambdaDt[4] = {0.f,0.f,0.f,0.f}; + + solveContact( m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass, (const b3Matrix3x3 &)m_shapes[aIdx].m_invInertiaWorld, + (b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass, (const b3Matrix3x3 &)m_shapes[bIdx].m_invInertiaWorld, + maxRambdaDt, minRambdaDt ); + + } + else + { + float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; + float minRambdaDt[4] = {0.f,0.f,0.f,0.f}; + + float sum = 0; + for(int j=0; j<4; j++) + { + sum +=m_constraints[i].m_appliedRambdaDt[j]; + } + frictionCoeff = 0.7f; + for(int j=0; j<4; j++) + { + maxRambdaDt[j] = frictionCoeff*sum; + minRambdaDt[j] = -maxRambdaDt[j]; + } + + solveFriction( m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass,(const b3Matrix3x3 &) m_shapes[aIdx].m_invInertiaWorld, + (b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass,(const b3Matrix3x3 &) m_shapes[bIdx].m_invInertiaWorld, + maxRambdaDt, minRambdaDt ); + } } + + if (m_wgUsedBodies) + { + if (m_wgUsedBodies[m_curWgidx].size()& m_bodies; b3AlignedObjectArray& m_shapes; b3AlignedObjectArray& m_constraints; + b3AlignedObjectArray* m_wgUsedBodies; + int m_curWgidx; int m_start; int m_nConstraints; bool m_solveFriction; + int m_maxNumBatches; }; @@ -524,6 +615,51 @@ void b3Solver::solveContactConstraintHost( b3OpenCLArray* bodyBu b3OpenCLArray* constraint, void* additionalData, int n ,int maxNumBatches) { +#if 0 + { + int nSplitX = B3_SOLVER_N_SPLIT_X; + int nSplitY = B3_SOLVER_N_SPLIT_Y; + int numWorkgroups = B3_SOLVER_N_CELLS/B3_SOLVER_N_BATCHES; + for (int z=0;z<4;z++) + { + for (int y=0;y<4;y++) + { + for (int x=0;x<4;x++) + { + int newIndex = (x+y*nSplitX+z*nSplitX*nSplitY); + // printf("newIndex=%d\n",newIndex); + + int zIdx = newIndex/(nSplitX*nSplitY); + int remain = newIndex%(nSplitX*nSplitY); + int yIdx = remain/nSplitX; + int xIdx = remain%nSplitX; + // printf("newIndex=%d\n",newIndex); + } + } + } + + //for (int wgIdx=numWorkgroups-1;wgIdx>=0;wgIdx--) + for (int cellBatch=0;cellBatch>2); + int remain= (wgIdx%((nSplitX*nSplitY)/4)); + int yIdx = (remain/(nSplitX/2))*2 + ((cellBatch&2)>>1); + int xIdx = (remain%(nSplitX/2))*2 + (cellBatch&1); + + /*int zIdx = newIndex/(nSplitX*nSplitY); + int remain = newIndex%(nSplitX*nSplitY); + int yIdx = remain/nSplitX; + int xIdx = remain%nSplitX; + */ + int cellIdx = xIdx+yIdx*nSplitX+zIdx*(nSplitX*nSplitY); + // printf("wgIdx %d: xIdx=%d, yIdx=%d, zIdx=%d, cellIdx=%d, cell Batch %d\n",wgIdx,xIdx,yIdx,zIdx,cellIdx,cellBatch); + } + } + } +#endif + b3AlignedObjectArray bodyNative; bodyBuf->copyToHost(bodyNative); b3AlignedObjectArray shapeNative; @@ -531,24 +667,129 @@ void b3Solver::solveContactConstraintHost( b3OpenCLArray* bodyBu b3AlignedObjectArray constraintNative; constraint->copyToHost(constraintNative); - for(int iter=0; iter numConstraintsHost; + m_numConstraints->copyToHost(numConstraintsHost); - for(int iter=0; iter offsetsHost; + m_offsets->copyToHost(offsetsHost); + static int frame=0; + bool useBatches=true; + if (useBatches) { - SolveTask task( bodyNative, shapeNative, constraintNative, 0, n ); - task.m_solveFriction = true; - task.run(0); + for(int iter=0; iter usedBodies[B3_SOLVER_N_CELLS]; + for (int i=0;i=0;wgIdx--) + for (int wgIdx=0;wgIdx>2); + int remain= (wgIdx%((nSplitX*nSplitY)/4)); + int yIdx = (remain/(nSplitX/2))*2 + ((cellBatch&2)>>1); + int xIdx = (remain%(nSplitX/2))*2 + (cellBatch&1); + int cellIdx = xIdx+yIdx*nSplitX+zIdx*(nSplitX*nSplitY); + + + if( numConstraintsHost[cellIdx] == 0 ) + continue; + + //printf("wgIdx %d: xIdx=%d, yIdx=%d, zIdx=%d, cellIdx=%d, cell Batch %d\n",wgIdx,xIdx,yIdx,zIdx,cellIdx,cellBatch); + //printf("cell %d has %d constraints\n", cellIdx,numConstraintsHost[cellIdx]); + if (zIdx) + { + //printf("?\n"); + } + + if (iter==0) + { + //printf("frame=%d, Cell xIdx=%x, yIdx=%d ",frame, xIdx,yIdx); + //printf("cellBatch=%d, wgIdx=%d, #constraints in cell=%d\n",cellBatch,wgIdx,numConstraintsHost[cellIdx]); + } + const int start = offsetsHost[cellIdx]; + int numConstraintsInCell = numConstraintsHost[cellIdx]; + const int end = start + numConstraintsInCell; + + SolveTask task( bodyNative, shapeNative, constraintNative, start, numConstraintsInCell ,maxNumBatches,usedBodies,wgIdx); + task.m_solveFriction = false; + task.run(0); + + } + } + } + + for(int iter=0; iter>2); + int remain= (wgIdx%((nSplitX*nSplitY)/4)); + int yIdx = (remain/(nSplitX/2))*2 + ((cellBatch&2)>>1); + int xIdx = (remain%(nSplitX/2))*2 + (cellBatch&1); + + int cellIdx = xIdx+yIdx*nSplitX+zIdx*(nSplitX*nSplitY); + + if( numConstraintsHost[cellIdx] == 0 ) + continue; + + //printf("yIdx=%d\n",yIdx); + + const int start = offsetsHost[cellIdx]; + int numConstraintsInCell = numConstraintsHost[cellIdx]; + const int end = start + numConstraintsInCell; + + SolveTask task( bodyNative, shapeNative, constraintNative, start, numConstraintsInCell,maxNumBatches, 0,0); + task.m_solveFriction = true; + task.run(0); + + } + } + } + + + } else + { + for(int iter=0; itercopyFromHost(bodyNative); shapeBuf->copyFromHost(shapeNative); constraint->copyFromHost(constraintNative); - + frame++; } @@ -563,14 +804,17 @@ void checkConstraintBatch(const b3OpenCLArray* bodyBuf, // b3BufferInfoCL( m_numConstraints->getBufferCL() ), // b3BufferInfoCL( m_offsets->getBufferCL() ) - const int nn = b3SolverBase::N_SPLIT*b3SolverBase::N_SPLIT; - int numWorkItems = 64*nn/b3SolverBase::N_BATCHES; + int cellBatch = batchId; + const int nn = B3_SOLVER_N_CELLS; + int numWorkItems = 64*nn/B3_SOLVER_N_BATCHES; b3AlignedObjectArray gN; m_numConstraints->copyToHost(gN); b3AlignedObjectArray gOffsets; m_offsets->copyToHost(gOffsets); - int nSplit = b3SolverBase::N_SPLIT; + int nSplitX = B3_SOLVER_N_SPLIT_X; + int nSplitY = B3_SOLVER_N_SPLIT_Y; + int bIdx = batchId; b3AlignedObjectArray cpuConstraints; @@ -578,16 +822,21 @@ void checkConstraintBatch(const b3OpenCLArray* bodyBuf, printf("batch = %d\n", batchId); - int numWorkgroups = nn/b3SolverBase::N_BATCHES; + int numWorkgroups = nn/B3_SOLVER_N_BATCHES; b3AlignedObjectArray usedBodies; for (int wgIdx=0;wgIdx>1); - int cellIdx = xIdx+yIdx*nSplit; + + int zIdx = (wgIdx/((nSplitX*nSplitY))/2)*2+((cellBatch&4)>>2); + int remain = wgIdx%((nSplitX*nSplitY)); + int yIdx = (remain%(nSplitX/2))*2 + ((cellBatch&2)>>1); + int xIdx = (remain/(nSplitX/2))*2 + (cellBatch&1); + + + int cellIdx = xIdx+yIdx*nSplitX+zIdx*(nSplitX*nSplitY); printf("cellIdx=%d\n",cellIdx); if( gN[cellIdx] == 0 ) continue; @@ -629,13 +878,13 @@ void b3Solver::solveContactConstraint( const b3OpenCLArray* body b3Int4 cdata = b3MakeInt4( n, 0, 0, 0 ); { - const int nn = N_SPLIT*N_SPLIT; + const int nn = B3_SOLVER_N_CELLS; cdata.x = 0; cdata.y = maxNumBatches;//250; - int numWorkItems = 64*nn/N_BATCHES; + int numWorkItems = 64*nn/B3_SOLVER_N_BATCHES; #ifdef DEBUG_ME SolverDebugInfo* debugInfo = new SolverDebugInfo[numWorkItems]; adl::b3OpenCLArray gpuDebugInfo(data->m_device,numWorkItems); @@ -648,7 +897,7 @@ void b3Solver::solveContactConstraint( const b3OpenCLArray* body B3_PROFILE("m_batchSolveKernel iterations"); for(int iter=0; iter* body cdata.z = ib; - cdata.w = N_SPLIT; + b3LauncherCL launcher( m_queue, m_solveContactKernel ); #if 1 @@ -686,7 +935,12 @@ void b3Solver::solveContactConstraint( const b3OpenCLArray* body //launcher.setConst( cdata.x ); launcher.setConst( cdata.y ); launcher.setConst( cdata.z ); - launcher.setConst( cdata.w ); + b3Int4 nSplit; + nSplit.x = B3_SOLVER_N_SPLIT_X; + nSplit.y = B3_SOLVER_N_SPLIT_Y; + nSplit.z = B3_SOLVER_N_SPLIT_Z; + + launcher.setConst( nSplit ); launcher.launch1D( numWorkItems, 64 ); @@ -750,10 +1004,10 @@ void b3Solver::solveContactConstraint( const b3OpenCLArray* body B3_PROFILE("m_batchSolveKernel iterations2"); for(int iter=0; itergetBufferCL() ), @@ -770,9 +1024,14 @@ void b3Solver::solveContactConstraint( const b3OpenCLArray* body //launcher.setConst( cdata.x ); launcher.setConst( cdata.y ); launcher.setConst( cdata.z ); - launcher.setConst( cdata.w ); + b3Int4 nSplit; + nSplit.x = B3_SOLVER_N_SPLIT_X; + nSplit.y = B3_SOLVER_N_SPLIT_Y; + nSplit.z = B3_SOLVER_N_SPLIT_Z; + + launcher.setConst( nSplit ); - launcher.launch1D( 64*nn/N_BATCHES, 64 ); + launcher.launch1D( 64*nn/B3_SOLVER_N_BATCHES, 64 ); } } clFinish(m_queue); @@ -861,7 +1120,7 @@ void b3Solver::sortContacts( const b3OpenCLArray* bodyBuf, cdata.m_nContacts = nContacts; cdata.m_staticIdx = cfg.m_staticIdx; cdata.m_scale = 1.f/(N_OBJ_PER_SPLIT*cfg.m_averageExtent); - cdata.m_nSplit = N_SPLIT; + cdata.m_nSplit = B3_SOLVER_N_SPLIT; b3BufferInfoCL bInfo[] = { b3BufferInfoCL( contactsIn->getBufferCL() ), b3BufferInfoCL( bodyBuf->getBufferCL() ), b3BufferInfoCL( m_sortDataBuffer->getBufferCL() ) }; @@ -872,16 +1131,16 @@ void b3Solver::sortContacts( const b3OpenCLArray* bodyBuf, } { // 3. sort by cell idx - int n = N_SPLIT*N_SPLIT; + int n = B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT; int sortBit = 32; //if( n <= 0xffff ) sortBit = 16; //if( n <= 0xff ) sortBit = 8; m_sort32->execute(*m_sortDataBuffer,sortSize); } { // 4. find entries - m_search->execute( *m_sortDataBuffer, nContacts, *countsNative, N_SPLIT*N_SPLIT, b3BoundSearchCL::COUNT); + m_search->execute( *m_sortDataBuffer, nContacts, *countsNative, B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT, b3BoundSearchCL::COUNT); - m_scan->execute( *countsNative, *offsetsNative, N_SPLIT*N_SPLIT ); + m_scan->execute( *countsNative, *offsetsNative, B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT ); } { // 5. sort constraints by cellIdx @@ -911,7 +1170,7 @@ void b3Solver::sortContacts( const b3OpenCLArray* bodyBuf, void b3Solver::batchContacts( b3OpenCLArray* contacts, int nContacts, b3OpenCLArray* nNative, b3OpenCLArray* offsetsNative, int staticIdx ) { - int numWorkItems = 64*N_SPLIT*N_SPLIT; + int numWorkItems = 64*B3_SOLVER_N_CELLS; { B3_PROFILE("batch generation"); @@ -962,7 +1221,7 @@ void b3Solver::batchContacts( b3OpenCLArray* contacts, int nContact launcher.setConst(staticIdx); launcher.launch1D( numWorkItems, 64 ); - clFinish(m_queue); + //clFinish(m_queue); } #ifdef BATCH_DEBUG diff --git a/src/Bullet3OpenCL/RigidBody/b3Solver.h b/src/Bullet3OpenCL/RigidBody/b3Solver.h index 312057d76..9f483b992 100644 --- a/src/Bullet3OpenCL/RigidBody/b3Solver.h +++ b/src/Bullet3OpenCL/RigidBody/b3Solver.h @@ -32,6 +32,15 @@ subject to the following restrictions: #define B3NEXTMULTIPLEOF(num, alignment) (((num)/(alignment) + (((num)%(alignment)==0)?0:1))*(alignment)) +enum +{ + B3_SOLVER_N_SPLIT_X = 8,//16,//4, + B3_SOLVER_N_SPLIT_Y = 4,//16,//4, + B3_SOLVER_N_SPLIT_Z = 8,//, + B3_SOLVER_N_CELLS = B3_SOLVER_N_SPLIT_X*B3_SOLVER_N_SPLIT_Y*B3_SOLVER_N_SPLIT_Z, + B3_SOLVER_N_BATCHES = 8,//4,//8,//4, +}; + class b3SolverBase { public: @@ -45,19 +54,10 @@ class b3SolverBase float m_positionConstraintCoeff; float m_dt; bool m_enableParallelSolve; - float m_averageExtent; + float m_batchCellSize; int m_staticIdx; }; - - - enum - { - N_SPLIT = 16, - N_BATCHES = 4,//8,//4, - N_OBJ_PER_SPLIT = 10, - N_TASKS_PER_BATCH = N_SPLIT*N_SPLIT, - }; }; class b3Solver : public b3SolverBase diff --git a/src/Bullet3OpenCL/RigidBody/kernels/solveContact.cl b/src/Bullet3OpenCL/RigidBody/kernels/solveContact.cl index 4b7cb769b..0c971f403 100644 --- a/src/Bullet3OpenCL/RigidBody/kernels/solveContact.cl +++ b/src/Bullet3OpenCL/RigidBody/kernels/solveContact.cl @@ -411,8 +411,8 @@ void BatchSolveKernelContact(__global Body* gBodies, __global int* gN, __global int* gOffsets, int maxBatch, - int bIdx, - int nSplit + int cellBatch, + int4 nSplit ) { //__local int ldsBatchIdx[WG_SIZE+1]; @@ -428,17 +428,29 @@ void BatchSolveKernelContact(__global Body* gBodies, //debugInfo[gIdx].m_valInt1 = GET_GROUP_SIZE; - int xIdx = (wgIdx/(nSplit/2))*2 + (bIdx&1); - int yIdx = (wgIdx%(nSplit/2))*2 + (bIdx>>1); - int cellIdx = xIdx+yIdx*nSplit; + int zIdx = (wgIdx/((nSplit.x*nSplit.y)/4))*2+((cellBatch&4)>>2); + int remain= (wgIdx%((nSplit.x*nSplit.y)/4)); + int yIdx = (remain/(nSplit.x/2))*2 + ((cellBatch&2)>>1); + int xIdx = (remain%(nSplit.x/2))*2 + (cellBatch&1); + int cellIdx = xIdx+yIdx*nSplit.x+zIdx*(nSplit.x*nSplit.y); + + //int xIdx = (wgIdx/(nSplit/2))*2 + (bIdx&1); + //int yIdx = (wgIdx%(nSplit/2))*2 + (bIdx>>1); + //int cellIdx = xIdx+yIdx*nSplit; if( gN[cellIdx] == 0 ) return; + + const int start = gOffsets[cellIdx]; const int end = start + gN[cellIdx]; + //if (lIdx==0) + //printf("wgIdx = %d, start = %d, end=%d\n",wgIdx,start,end); + + if( lIdx == 0 ) { ldsCurBatch = 0; @@ -456,6 +468,9 @@ void BatchSolveKernelContact(__global Body* gBodies, { if (gConstraints[idx].m_batchIdx == ldsCurBatch) { + //if (wgIdx==0 && lIdx==0) + //printf("solved wgIdx=%d, ldsCurBatch=%d idx=%d \n", wgIdx, ldsCurBatch,idx); + solveContactConstraint( gBodies, gShapes, &gConstraints[idx] ); idx+=64; @@ -465,6 +480,8 @@ void BatchSolveKernelContact(__global Body* gBodies, } } GROUP_LDS_BARRIER; + // if (wgIdx==0 && lIdx==0) + // printf("-----------------------\n"); if( lIdx == 0 ) { ldsCurBatch++; diff --git a/src/Bullet3OpenCL/RigidBody/kernels/solveContact.h b/src/Bullet3OpenCL/RigidBody/kernels/solveContact.h index b758f43d8..b19af6054 100644 --- a/src/Bullet3OpenCL/RigidBody/kernels/solveContact.h +++ b/src/Bullet3OpenCL/RigidBody/kernels/solveContact.h @@ -413,8 +413,8 @@ static const char* solveContactCL= \ " __global int* gN,\n" " __global int* gOffsets,\n" " int maxBatch,\n" -" int bIdx,\n" -" int nSplit\n" +" int cellBatch,\n" +" int4 nSplit\n" " )\n" "{\n" " //__local int ldsBatchIdx[WG_SIZE+1];\n" @@ -430,17 +430,29 @@ static const char* solveContactCL= \ " //debugInfo[gIdx].m_valInt1 = GET_GROUP_SIZE;\n" "\n" "\n" -" int xIdx = (wgIdx/(nSplit/2))*2 + (bIdx&1);\n" -" int yIdx = (wgIdx%(nSplit/2))*2 + (bIdx>>1);\n" -" int cellIdx = xIdx+yIdx*nSplit;\n" +" int zIdx = (wgIdx/((nSplit.x*nSplit.y)/4))*2+((cellBatch&4)>>2);\n" +" int remain= (wgIdx%((nSplit.x*nSplit.y)/4));\n" +" int yIdx = (remain/(nSplit.x/2))*2 + ((cellBatch&2)>>1);\n" +" int xIdx = (remain%(nSplit.x/2))*2 + (cellBatch&1);\n" +" int cellIdx = xIdx+yIdx*nSplit.x+zIdx*(nSplit.x*nSplit.y);\n" +"\n" +" //int xIdx = (wgIdx/(nSplit/2))*2 + (bIdx&1);\n" +" //int yIdx = (wgIdx%(nSplit/2))*2 + (bIdx>>1);\n" +" //int cellIdx = xIdx+yIdx*nSplit;\n" " \n" " if( gN[cellIdx] == 0 ) \n" " return;\n" "\n" +" \n" +" \n" " const int start = gOffsets[cellIdx];\n" " const int end = start + gN[cellIdx];\n" "\n" " \n" +" //if (lIdx==0)\n" +" //printf(\"wgIdx = %d, start = %d, end=%d\n\",wgIdx,start,end);\n" +"\n" +" \n" " if( lIdx == 0 )\n" " {\n" " ldsCurBatch = 0;\n" @@ -458,6 +470,9 @@ static const char* solveContactCL= \ " {\n" " if (gConstraints[idx].m_batchIdx == ldsCurBatch)\n" " {\n" +" //if (wgIdx==0 && lIdx==0)\n" +" //printf(\"solved wgIdx=%d, ldsCurBatch=%d idx=%d \n\", wgIdx, ldsCurBatch,idx);\n" +" \n" " solveContactConstraint( gBodies, gShapes, &gConstraints[idx] );\n" "\n" " idx+=64;\n" @@ -467,6 +482,8 @@ static const char* solveContactCL= \ " }\n" " }\n" " GROUP_LDS_BARRIER;\n" +" // if (wgIdx==0 && lIdx==0)\n" +" // printf(\"-----------------------\n\");\n" " if( lIdx == 0 )\n" " {\n" " ldsCurBatch++;\n" diff --git a/src/Bullet3OpenCL/RigidBody/kernels/solveFriction.cl b/src/Bullet3OpenCL/RigidBody/kernels/solveFriction.cl index d4276c24b..c0067b0ff 100644 --- a/src/Bullet3OpenCL/RigidBody/kernels/solveFriction.cl +++ b/src/Bullet3OpenCL/RigidBody/kernels/solveFriction.cl @@ -440,8 +440,8 @@ void BatchSolveKernelFriction(__global Body* gBodies, __global int* gN, __global int* gOffsets, int maxBatch, - int bIdx, - int nSplit + int cellBatch, + int4 nSplit ) { //__local int ldsBatchIdx[WG_SIZE+1]; @@ -457,9 +457,12 @@ void BatchSolveKernelFriction(__global Body* gBodies, //debugInfo[gIdx].m_valInt1 = GET_GROUP_SIZE; - int xIdx = (wgIdx/(nSplit/2))*2 + (bIdx&1); - int yIdx = (wgIdx%(nSplit/2))*2 + (bIdx>>1); - int cellIdx = xIdx+yIdx*nSplit; + int zIdx = (wgIdx/((nSplit.x*nSplit.y)/4))*2+((cellBatch&4)>>2); + int remain= (wgIdx%((nSplit.x*nSplit.y)/4)); + int yIdx = (remain/(nSplit.x/2))*2 + ((cellBatch&2)>>1); + int xIdx = (remain%(nSplit.x/2))*2 + (cellBatch&1); + int cellIdx = xIdx+yIdx*nSplit.x+zIdx*(nSplit.x*nSplit.y); + if( gN[cellIdx] == 0 ) return; diff --git a/src/Bullet3OpenCL/RigidBody/kernels/solveFriction.h b/src/Bullet3OpenCL/RigidBody/kernels/solveFriction.h index 9d6de6ccc..26d41a4d5 100644 --- a/src/Bullet3OpenCL/RigidBody/kernels/solveFriction.h +++ b/src/Bullet3OpenCL/RigidBody/kernels/solveFriction.h @@ -442,8 +442,8 @@ static const char* solveFrictionCL= \ " __global int* gN,\n" " __global int* gOffsets,\n" " int maxBatch,\n" -" int bIdx,\n" -" int nSplit\n" +" int cellBatch,\n" +" int4 nSplit\n" " )\n" "{\n" " //__local int ldsBatchIdx[WG_SIZE+1];\n" @@ -459,9 +459,12 @@ static const char* solveFrictionCL= \ " //debugInfo[gIdx].m_valInt1 = GET_GROUP_SIZE;\n" "\n" "\n" -" int xIdx = (wgIdx/(nSplit/2))*2 + (bIdx&1);\n" -" int yIdx = (wgIdx%(nSplit/2))*2 + (bIdx>>1);\n" -" int cellIdx = xIdx+yIdx*nSplit;\n" +" int zIdx = (wgIdx/((nSplit.x*nSplit.y)/4))*2+((cellBatch&4)>>2);\n" +" int remain= (wgIdx%((nSplit.x*nSplit.y)/4));\n" +" int yIdx = (remain/(nSplit.x/2))*2 + ((cellBatch&2)>>1);\n" +" int xIdx = (remain%(nSplit.x/2))*2 + (cellBatch&1);\n" +" int cellIdx = xIdx+yIdx*nSplit.x+zIdx*(nSplit.x*nSplit.y);\n" +"\n" " \n" " if( gN[cellIdx] == 0 ) \n" " return;\n" diff --git a/src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.cl b/src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.cl index 5074ffb17..0af8bafaa 100644 --- a/src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.cl +++ b/src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.cl @@ -441,6 +441,42 @@ void ReorderContactKernel(__global Contact4* in, __global Contact4* out, __globa } } + +__kernel +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +void SetDeterminismSortDataBodyA(__global Contact4* contactsIn, __global int2* sortDataOut, int nContacts) +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < nContacts ) + { + int2 sd; + sd.x = contactsIn[gIdx].m_bodyAPtrAndSignBit; + sd.y = gIdx; + sortDataOut[gIdx] = sd; + } +} + +__kernel +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +void SetDeterminismSortDataBodyB(__global Contact4* contactsIn, __global int2* sortDataInOut, int nContacts) +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < nContacts ) + { + int2 sdIn; + sdIn = sortDataInOut[gIdx]; + int2 sdOut; + sdOut.x = contactsIn[sdIn.y].m_bodyBPtrAndSignBit; + sdOut.y = sdIn.y; + sortDataInOut[gIdx] = sdOut; + } +} + + + + typedef struct { int m_nContacts; @@ -480,7 +516,7 @@ static __constant const int gridTable8x8[] = __kernel __attribute__((reqd_work_group_size(WG_SIZE,1,1))) void SetSortDataKernel(__global Contact4* gContact, __global Body* gBodies, __global int2* gSortDataOut, -int nContacts,float scale,int N_SPLIT, int staticIdx) +int nContacts,float scale,int4 nSplit,int staticIdx) { int gIdx = GET_GLOBAL_IDX; @@ -499,9 +535,10 @@ int nContacts,float scale,int N_SPLIT, int staticIdx) #if USE_SPATIAL_BATCHING int idx = (aStatic)? bIdx: aIdx; float4 p = gBodies[idx].m_pos; - int xIdx = (int)((p.x-((p.x<0.f)?1.f:0.f))*scale) & (N_SPLIT-1); - int zIdx = (int)((p.z-((p.z<0.f)?1.f:0.f))*scale) & (N_SPLIT-1); - int newIndex = (xIdx+zIdx*N_SPLIT); + int xIdx = (int)((p.x-((p.x<0.f)?1.f:0.f))*scale) & (nSplit.x-1); + int yIdx = (int)((p.y-((p.y<0.f)?1.f:0.f))*scale) & (nSplit.y-1); + int zIdx = (int)((p.z-((p.z<0.f)?1.f:0.f))*scale) & (nSplit.z-1); + int newIndex = (xIdx+yIdx*nSplit.x+zIdx*nSplit.x*nSplit.y); #else//USE_SPATIAL_BATCHING #if USE_4x4_GRID diff --git a/src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.h b/src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.h index 73545ac85..39b1e158b 100644 --- a/src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.h +++ b/src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.h @@ -443,6 +443,42 @@ static const char* solverSetup2CL= \ " }\n" "}\n" "\n" +"\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"void SetDeterminismSortDataBodyA(__global Contact4* contactsIn, __global int2* sortDataOut, int nContacts)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +"\n" +" if( gIdx < nContacts )\n" +" {\n" +" int2 sd;\n" +" sd.x = contactsIn[gIdx].m_bodyAPtrAndSignBit;\n" +" sd.y = gIdx;\n" +" sortDataOut[gIdx] = sd;\n" +" }\n" +"}\n" +"\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"void SetDeterminismSortDataBodyB(__global Contact4* contactsIn, __global int2* sortDataInOut, int nContacts)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +"\n" +" if( gIdx < nContacts )\n" +" {\n" +" int2 sdIn;\n" +" sdIn = sortDataInOut[gIdx];\n" +" int2 sdOut;\n" +" sdOut.x = contactsIn[sdIn.y].m_bodyBPtrAndSignBit;\n" +" sdOut.y = sdIn.y;\n" +" sortDataInOut[gIdx] = sdOut;\n" +" }\n" +"}\n" +"\n" +"\n" +"\n" +"\n" "typedef struct\n" "{\n" " int m_nContacts;\n" @@ -482,7 +518,7 @@ static const char* solverSetup2CL= \ "__kernel\n" "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" "void SetSortDataKernel(__global Contact4* gContact, __global Body* gBodies, __global int2* gSortDataOut, \n" -"int nContacts,float scale,int N_SPLIT, int staticIdx)\n" +"int nContacts,float scale,int4 nSplit,int staticIdx)\n" "\n" "{\n" " int gIdx = GET_GLOBAL_IDX;\n" @@ -501,9 +537,10 @@ static const char* solverSetup2CL= \ "#if USE_SPATIAL_BATCHING \n" " int idx = (aStatic)? bIdx: aIdx;\n" " float4 p = gBodies[idx].m_pos;\n" -" int xIdx = (int)((p.x-((p.x<0.f)?1.f:0.f))*scale) & (N_SPLIT-1);\n" -" int zIdx = (int)((p.z-((p.z<0.f)?1.f:0.f))*scale) & (N_SPLIT-1);\n" -" int newIndex = (xIdx+zIdx*N_SPLIT);\n" +" int xIdx = (int)((p.x-((p.x<0.f)?1.f:0.f))*scale) & (nSplit.x-1);\n" +" int yIdx = (int)((p.y-((p.y<0.f)?1.f:0.f))*scale) & (nSplit.y-1);\n" +" int zIdx = (int)((p.z-((p.z<0.f)?1.f:0.f))*scale) & (nSplit.z-1);\n" +" int newIndex = (xIdx+yIdx*nSplit.x+zIdx*nSplit.x*nSplit.y);\n" " \n" "#else//USE_SPATIAL_BATCHING\n" " #if USE_4x4_GRID\n"