diff --git a/Demos3/GpuDemos/rigidbody/GpuConvexScene.cpp b/Demos3/GpuDemos/rigidbody/GpuConvexScene.cpp index 95946a84a..b1e4218fb 100644 --- a/Demos3/GpuDemos/rigidbody/GpuConvexScene.cpp +++ b/Demos3/GpuDemos/rigidbody/GpuConvexScene.cpp @@ -97,6 +97,7 @@ int GpuConvexScene::createDynamicsObjects2(const ConstructionInfo& ci, const flo { int strideInBytes = 9*sizeof(float); int textureIndex = -1; + if (0) { int width,height,n; diff --git a/src/Bullet3OpenCL/RigidBody/b3GpuPgsContactSolver.cpp b/src/Bullet3OpenCL/RigidBody/b3GpuPgsContactSolver.cpp index ba9fffa50..694e7c1bb 100644 --- a/src/Bullet3OpenCL/RigidBody/b3GpuPgsContactSolver.cpp +++ b/src/Bullet3OpenCL/RigidBody/b3GpuPgsContactSolver.cpp @@ -411,6 +411,7 @@ void b3GpuPgsContactSolver::solveContactConstraint( const b3OpenCLArraym_solverGPU->m_batchSizes.getBufferCL()); //launcher.setConst( cdata.x ); launcher.setConst( cdata.y ); launcher.setConst( cdata.z ); @@ -500,6 +501,7 @@ void b3GpuPgsContactSolver::solveContactConstraint( const b3OpenCLArraym_queue, m_data->m_solveFrictionKernel,"m_solveFrictionKernel" ); launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setBuffer( m_data->m_solverGPU->m_batchSizes.getBufferCL()); //launcher.setConst( cdata.x ); launcher.setConst( cdata.y ); launcher.setConst( cdata.z ); @@ -1037,7 +1039,7 @@ void b3GpuPgsContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem if (!gCpuBatchContacts) { B3_PROFILE("gpu batchContacts"); - maxNumBatches = 150;//250; + maxNumBatches = 250;//250; m_data->m_solverGPU->batchContacts( m_data->m_pBufContactOutGPU, nContacts, m_data->m_solverGPU->m_numConstraints, m_data->m_solverGPU->m_offsets, csCfg.m_staticIdx ); clFinish(m_data->m_queue); } else diff --git a/src/Bullet3OpenCL/RigidBody/b3Solver.cpp b/src/Bullet3OpenCL/RigidBody/b3Solver.cpp index 5aa7eb70b..c5bdf49c2 100644 --- a/src/Bullet3OpenCL/RigidBody/b3Solver.cpp +++ b/src/Bullet3OpenCL/RigidBody/b3Solver.cpp @@ -90,7 +90,8 @@ b3Solver::b3Solver(cl_context ctx, cl_device_id device, cl_command_queue queue, :m_nIterations(4), m_context(ctx), m_device(device), - m_queue(queue) + m_queue(queue), + m_batchSizes(ctx,queue) { m_sort32 = new b3RadixSort32CL(ctx,device,queue); m_scan = new b3PrefixScanCL(ctx,device,queue,B3_SOLVER_N_CELLS); @@ -1136,6 +1137,7 @@ void b3Solver::batchContacts( b3OpenCLArray* contacts, int nContact { + m_batchSizes.resize(nNative->size()); B3_PROFILE("batchingKernel"); //b3LauncherCL launcher( m_queue, m_batchingKernel); cl_kernel k = useNewBatchingKernel ? m_batchingKernelNew : m_batchingKernel; @@ -1148,12 +1150,18 @@ void b3Solver::batchContacts( b3OpenCLArray* contacts, int nContact launcher.setBuffer( m_contactBuffer2->getBufferCL() ); launcher.setBuffer( nNative->getBufferCL()); launcher.setBuffer( offsetsNative->getBufferCL()); + + launcher.setBuffer(m_batchSizes.getBufferCL()); + //launcher.setConst( cdata ); launcher.setConst(staticIdx); launcher.launch1D( numWorkItems, 64 ); //clFinish(m_queue); + //b3AlignedObjectArray batchSizesCPU; + //m_batchSizes.copyToHost(batchSizesCPU); + //printf(".\n"); } #ifdef BATCH_DEBUG diff --git a/src/Bullet3OpenCL/RigidBody/b3Solver.h b/src/Bullet3OpenCL/RigidBody/b3Solver.h index ce77c798d..b37f2f1be 100644 --- a/src/Bullet3OpenCL/RigidBody/b3Solver.h +++ b/src/Bullet3OpenCL/RigidBody/b3Solver.h @@ -72,6 +72,7 @@ class b3Solver : public b3SolverBase b3OpenCLArray* m_numConstraints; b3OpenCLArray* m_offsets; + b3OpenCLArray m_batchSizes; int m_nIterations; diff --git a/src/Bullet3OpenCL/RigidBody/kernels/batchingKernels.cl b/src/Bullet3OpenCL/RigidBody/kernels/batchingKernels.cl index 1c8c48a58..3b891b863 100644 --- a/src/Bullet3OpenCL/RigidBody/kernels/batchingKernels.cl +++ b/src/Bullet3OpenCL/RigidBody/kernels/batchingKernels.cl @@ -120,7 +120,7 @@ u32 tryWrite(__local u32* buff, int idx) // batching on the GPU __kernel void CreateBatches( __global const struct b3Contact4Data* gConstraints, __global struct b3Contact4Data* gConstraintsOut, - __global const u32* gN, __global const u32* gStart, + __global const u32* gN, __global const u32* gStart, __global int* batchSizes, int m_staticIdx ) { __local u32 ldsStackIdx[STACK_SIZE]; @@ -147,9 +147,13 @@ __kernel void CreateBatches( __global const struct b3Contact4Data* gConstraints, ldsDstEnd = m_start; } + + // while(1) //was 250 - for(int ie=0; ie<50; ie++) + int ie=0; + int maxBatch = 0; + for(ie=0; ie<50; ie++) { ldsFixedBuffer[lIdx] = 0; @@ -297,7 +301,12 @@ __kernel void CreateBatches( __global const struct b3Contact4Data* gConstraints, int idx = m_start + ldsRingElem[i].m_idx; int dstIdx; AtomInc1( ldsDstEnd, dstIdx ); gConstraintsOut[ dstIdx ] = gConstraints[ idx ]; - gConstraintsOut[ dstIdx ].m_batchIdx = 100+i; + int curBatch = 100+i; + if (maxBatch < curBatch) + maxBatch = curBatch; + + gConstraintsOut[ dstIdx ].m_batchIdx = curBatch; + } GROUP_LDS_BARRIER; if( lIdx == 0 ) ldsRingEnd = 0; @@ -312,6 +321,12 @@ __kernel void CreateBatches( __global const struct b3Contact4Data* gConstraints, break; } + if( lIdx == 0 ) + { + if (maxBatch < ie) + maxBatch=ie; + batchSizes[wgIdx]=maxBatch; + } } diff --git a/src/Bullet3OpenCL/RigidBody/kernels/batchingKernels.h b/src/Bullet3OpenCL/RigidBody/kernels/batchingKernels.h index dca5fd030..f52131bc4 100644 --- a/src/Bullet3OpenCL/RigidBody/kernels/batchingKernels.h +++ b/src/Bullet3OpenCL/RigidBody/kernels/batchingKernels.h @@ -196,7 +196,7 @@ static const char* batchingKernelsCL= \ "}\n" "// batching on the GPU\n" "__kernel void CreateBatches( __global const struct b3Contact4Data* gConstraints, __global struct b3Contact4Data* gConstraintsOut,\n" -" __global const u32* gN, __global const u32* gStart, \n" +" __global const u32* gN, __global const u32* gStart, __global int* batchSizes, \n" " int m_staticIdx )\n" "{\n" " __local u32 ldsStackIdx[STACK_SIZE];\n" @@ -222,9 +222,13 @@ static const char* batchingKernelsCL= \ " ldsDstEnd = m_start;\n" " }\n" " \n" +" \n" +" \n" "// while(1)\n" "//was 250\n" -" for(int ie=0; ie<50; ie++)\n" +" int ie=0;\n" +" int maxBatch = 0;\n" +" for(ie=0; ie<50; ie++)\n" " {\n" " ldsFixedBuffer[lIdx] = 0;\n" " for(int giter=0; giter<4; giter++)\n" @@ -357,7 +361,12 @@ static const char* batchingKernelsCL= \ " int idx = m_start + ldsRingElem[i].m_idx;\n" " int dstIdx; AtomInc1( ldsDstEnd, dstIdx );\n" " gConstraintsOut[ dstIdx ] = gConstraints[ idx ];\n" -" gConstraintsOut[ dstIdx ].m_batchIdx = 100+i;\n" +" int curBatch = 100+i;\n" +" if (maxBatch < curBatch)\n" +" maxBatch = curBatch;\n" +" \n" +" gConstraintsOut[ dstIdx ].m_batchIdx = curBatch;\n" +" \n" " }\n" " GROUP_LDS_BARRIER;\n" " if( lIdx == 0 ) ldsRingEnd = 0;\n" @@ -368,5 +377,11 @@ static const char* batchingKernelsCL= \ " if( ldsGEnd == m_n && ldsRingEnd == 0 )\n" " break;\n" " }\n" +" if( lIdx == 0 )\n" +" {\n" +" if (maxBatch < ie)\n" +" maxBatch=ie;\n" +" batchSizes[wgIdx]=maxBatch;\n" +" }\n" "}\n" ; diff --git a/src/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.cl b/src/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.cl index 8a52c6817..ba1b66d2c 100644 --- a/src/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.cl +++ b/src/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.cl @@ -141,7 +141,7 @@ u32 tryWrite(__local u32* buff, int idx) // batching on the GPU -__kernel void CreateBatchesNew( __global struct b3Contact4Data* gConstraints, __global const u32* gN, __global const u32* gStart, int staticIdx ) +__kernel void CreateBatchesNew( __global struct b3Contact4Data* gConstraints, __global const u32* gN, __global const u32* gStart, __global int* batchSizes, int staticIdx ) { int wgIdx = GET_GROUP_IDX; int lIdx = GET_LOCAL_IDX; @@ -222,6 +222,9 @@ __kernel void CreateBatchesNew( __global struct b3Contact4Data* gConstraints, __ }//for batchIdx ++; }//while + + batchSizes[wgIdx] = batchIdx; + }//if( lIdx == 0 ) //return batchIdx; diff --git a/src/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.h b/src/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.h index 2d987a3ed..b7bf3345b 100644 --- a/src/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.h +++ b/src/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.h @@ -209,7 +209,7 @@ static const char* batchingKernelsNewCL= \ " return ((ans >> bitIdx)&1) == 0;\n" "}\n" "// batching on the GPU\n" -"__kernel void CreateBatchesNew( __global struct b3Contact4Data* gConstraints, __global const u32* gN, __global const u32* gStart, int staticIdx )\n" +"__kernel void CreateBatchesNew( __global struct b3Contact4Data* gConstraints, __global const u32* gN, __global const u32* gStart, __global int* batchSizes, int staticIdx )\n" "{\n" " int wgIdx = GET_GROUP_IDX;\n" " int lIdx = GET_LOCAL_IDX;\n" @@ -281,6 +281,8 @@ static const char* batchingKernelsNewCL= \ " }//for\n" " batchIdx ++;\n" " }//while\n" +" \n" +" batchSizes[wgIdx] = batchIdx;\n" " }//if( lIdx == 0 )\n" " \n" " //return batchIdx;\n" diff --git a/src/Bullet3OpenCL/RigidBody/kernels/solveContact.cl b/src/Bullet3OpenCL/RigidBody/kernels/solveContact.cl index 19c7f4d37..5c4d62e4e 100644 --- a/src/Bullet3OpenCL/RigidBody/kernels/solveContact.cl +++ b/src/Bullet3OpenCL/RigidBody/kernels/solveContact.cl @@ -401,7 +401,8 @@ void BatchSolveKernelContact(__global Body* gBodies, __global Constraint4* gConstraints, __global int* gN, __global int* gOffsets, - int maxBatch, + __global int* batchSizes, + int maxBatch1, int cellBatch, int4 nSplit ) @@ -418,6 +419,8 @@ void BatchSolveKernelContact(__global Body* gBodies, // debugInfo[gIdx].m_valInt0 = gIdx; //debugInfo[gIdx].m_valInt1 = GET_GROUP_SIZE; + + int zIdx = (wgIdx/((nSplit.x*nSplit.y)/4))*2+((cellBatch&4)>>2); int remain= (wgIdx%((nSplit.x*nSplit.y)/4)); @@ -432,6 +435,7 @@ void BatchSolveKernelContact(__global Body* gBodies, if( gN[cellIdx] == 0 ) return; + int maxBatch = batchSizes[cellIdx]; const int start = gOffsets[cellIdx]; diff --git a/src/Bullet3OpenCL/RigidBody/kernels/solveContact.h b/src/Bullet3OpenCL/RigidBody/kernels/solveContact.h index 61634326d..15a049992 100644 --- a/src/Bullet3OpenCL/RigidBody/kernels/solveContact.h +++ b/src/Bullet3OpenCL/RigidBody/kernels/solveContact.h @@ -307,7 +307,8 @@ static const char* solveContactCL= \ " __global Constraint4* gConstraints,\n" " __global int* gN,\n" " __global int* gOffsets,\n" -" int maxBatch,\n" +" __global int* batchSizes,\n" +" int maxBatch1,\n" " int cellBatch,\n" " int4 nSplit\n" " )\n" @@ -321,6 +322,8 @@ static const char* solveContactCL= \ "// int gIdx = GET_GLOBAL_IDX;\n" "// debugInfo[gIdx].m_valInt0 = gIdx;\n" " //debugInfo[gIdx].m_valInt1 = GET_GROUP_SIZE;\n" +" \n" +" \n" " int zIdx = (wgIdx/((nSplit.x*nSplit.y)/4))*2+((cellBatch&4)>>2);\n" " int remain= (wgIdx%((nSplit.x*nSplit.y)/4));\n" " int yIdx = (remain/(nSplit.x/2))*2 + ((cellBatch&2)>>1);\n" @@ -332,6 +335,7 @@ static const char* solveContactCL= \ " \n" " if( gN[cellIdx] == 0 ) \n" " return;\n" +" int maxBatch = batchSizes[cellIdx];\n" " \n" " \n" " const int start = gOffsets[cellIdx];\n" diff --git a/src/Bullet3OpenCL/RigidBody/kernels/solveFriction.cl b/src/Bullet3OpenCL/RigidBody/kernels/solveFriction.cl index f64e5252b..1d70fbbae 100644 --- a/src/Bullet3OpenCL/RigidBody/kernels/solveFriction.cl +++ b/src/Bullet3OpenCL/RigidBody/kernels/solveFriction.cl @@ -430,7 +430,8 @@ void BatchSolveKernelFriction(__global Body* gBodies, __global Constraint4* gConstraints, __global int* gN, __global int* gOffsets, - int maxBatch, + __global int* batchSizes, + int maxBatch1, int cellBatch, int4 nSplit ) @@ -458,6 +459,8 @@ void BatchSolveKernelFriction(__global Body* gBodies, if( gN[cellIdx] == 0 ) return; + int maxBatch = batchSizes[cellIdx]; + const int start = gOffsets[cellIdx]; const int end = start + gN[cellIdx]; diff --git a/src/Bullet3OpenCL/RigidBody/kernels/solveFriction.h b/src/Bullet3OpenCL/RigidBody/kernels/solveFriction.h index a4804f64f..eb58674f2 100644 --- a/src/Bullet3OpenCL/RigidBody/kernels/solveFriction.h +++ b/src/Bullet3OpenCL/RigidBody/kernels/solveFriction.h @@ -343,7 +343,8 @@ static const char* solveFrictionCL= \ " __global Constraint4* gConstraints,\n" " __global int* gN,\n" " __global int* gOffsets,\n" -" int maxBatch,\n" +" __global int* batchSizes,\n" +" int maxBatch1,\n" " int cellBatch,\n" " int4 nSplit\n" " )\n" @@ -365,6 +366,7 @@ static const char* solveFrictionCL= \ " \n" " if( gN[cellIdx] == 0 ) \n" " return;\n" +" int maxBatch = batchSizes[cellIdx];\n" " const int start = gOffsets[cellIdx];\n" " const int end = start + gN[cellIdx];\n" " \n"