batching reports the size of the batches, and solver uses this as termination condition, giving a good speedup

This commit is contained in:
erwincoumans
2014-02-07 06:28:15 -08:00
parent 3768a30bb2
commit bd5c2ff5ec
12 changed files with 74 additions and 14 deletions

View File

@@ -120,7 +120,7 @@ u32 tryWrite(__local u32* buff, int idx)
// batching on the GPU
__kernel void CreateBatches( __global const struct b3Contact4Data* gConstraints, __global struct b3Contact4Data* gConstraintsOut,
__global const u32* gN, __global const u32* gStart,
__global const u32* gN, __global const u32* gStart, __global int* batchSizes,
int m_staticIdx )
{
__local u32 ldsStackIdx[STACK_SIZE];
@@ -147,9 +147,13 @@ __kernel void CreateBatches( __global const struct b3Contact4Data* gConstraints,
ldsDstEnd = m_start;
}
// while(1)
//was 250
for(int ie=0; ie<50; ie++)
int ie=0;
int maxBatch = 0;
for(ie=0; ie<50; ie++)
{
ldsFixedBuffer[lIdx] = 0;
@@ -297,7 +301,12 @@ __kernel void CreateBatches( __global const struct b3Contact4Data* gConstraints,
int idx = m_start + ldsRingElem[i].m_idx;
int dstIdx; AtomInc1( ldsDstEnd, dstIdx );
gConstraintsOut[ dstIdx ] = gConstraints[ idx ];
gConstraintsOut[ dstIdx ].m_batchIdx = 100+i;
int curBatch = 100+i;
if (maxBatch < curBatch)
maxBatch = curBatch;
gConstraintsOut[ dstIdx ].m_batchIdx = curBatch;
}
GROUP_LDS_BARRIER;
if( lIdx == 0 ) ldsRingEnd = 0;
@@ -312,6 +321,12 @@ __kernel void CreateBatches( __global const struct b3Contact4Data* gConstraints,
break;
}
if( lIdx == 0 )
{
if (maxBatch < ie)
maxBatch=ie;
batchSizes[wgIdx]=maxBatch;
}
}

View File

@@ -196,7 +196,7 @@ static const char* batchingKernelsCL= \
"}\n"
"// batching on the GPU\n"
"__kernel void CreateBatches( __global const struct b3Contact4Data* gConstraints, __global struct b3Contact4Data* gConstraintsOut,\n"
" __global const u32* gN, __global const u32* gStart, \n"
" __global const u32* gN, __global const u32* gStart, __global int* batchSizes, \n"
" int m_staticIdx )\n"
"{\n"
" __local u32 ldsStackIdx[STACK_SIZE];\n"
@@ -222,9 +222,13 @@ static const char* batchingKernelsCL= \
" ldsDstEnd = m_start;\n"
" }\n"
" \n"
" \n"
" \n"
"// while(1)\n"
"//was 250\n"
" for(int ie=0; ie<50; ie++)\n"
" int ie=0;\n"
" int maxBatch = 0;\n"
" for(ie=0; ie<50; ie++)\n"
" {\n"
" ldsFixedBuffer[lIdx] = 0;\n"
" for(int giter=0; giter<4; giter++)\n"
@@ -357,7 +361,12 @@ static const char* batchingKernelsCL= \
" int idx = m_start + ldsRingElem[i].m_idx;\n"
" int dstIdx; AtomInc1( ldsDstEnd, dstIdx );\n"
" gConstraintsOut[ dstIdx ] = gConstraints[ idx ];\n"
" gConstraintsOut[ dstIdx ].m_batchIdx = 100+i;\n"
" int curBatch = 100+i;\n"
" if (maxBatch < curBatch)\n"
" maxBatch = curBatch;\n"
" \n"
" gConstraintsOut[ dstIdx ].m_batchIdx = curBatch;\n"
" \n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" if( lIdx == 0 ) ldsRingEnd = 0;\n"
@@ -368,5 +377,11 @@ static const char* batchingKernelsCL= \
" if( ldsGEnd == m_n && ldsRingEnd == 0 )\n"
" break;\n"
" }\n"
" if( lIdx == 0 )\n"
" {\n"
" if (maxBatch < ie)\n"
" maxBatch=ie;\n"
" batchSizes[wgIdx]=maxBatch;\n"
" }\n"
"}\n"
;

View File

@@ -141,7 +141,7 @@ u32 tryWrite(__local u32* buff, int idx)
// batching on the GPU
__kernel void CreateBatchesNew( __global struct b3Contact4Data* gConstraints, __global const u32* gN, __global const u32* gStart, int staticIdx )
__kernel void CreateBatchesNew( __global struct b3Contact4Data* gConstraints, __global const u32* gN, __global const u32* gStart, __global int* batchSizes, int staticIdx )
{
int wgIdx = GET_GROUP_IDX;
int lIdx = GET_LOCAL_IDX;
@@ -222,6 +222,9 @@ __kernel void CreateBatchesNew( __global struct b3Contact4Data* gConstraints, __
}//for
batchIdx ++;
}//while
batchSizes[wgIdx] = batchIdx;
}//if( lIdx == 0 )
//return batchIdx;

View File

@@ -209,7 +209,7 @@ static const char* batchingKernelsNewCL= \
" return ((ans >> bitIdx)&1) == 0;\n"
"}\n"
"// batching on the GPU\n"
"__kernel void CreateBatchesNew( __global struct b3Contact4Data* gConstraints, __global const u32* gN, __global const u32* gStart, int staticIdx )\n"
"__kernel void CreateBatchesNew( __global struct b3Contact4Data* gConstraints, __global const u32* gN, __global const u32* gStart, __global int* batchSizes, int staticIdx )\n"
"{\n"
" int wgIdx = GET_GROUP_IDX;\n"
" int lIdx = GET_LOCAL_IDX;\n"
@@ -281,6 +281,8 @@ static const char* batchingKernelsNewCL= \
" }//for\n"
" batchIdx ++;\n"
" }//while\n"
" \n"
" batchSizes[wgIdx] = batchIdx;\n"
" }//if( lIdx == 0 )\n"
" \n"
" //return batchIdx;\n"

View File

@@ -401,7 +401,8 @@ void BatchSolveKernelContact(__global Body* gBodies,
__global Constraint4* gConstraints,
__global int* gN,
__global int* gOffsets,
int maxBatch,
__global int* batchSizes,
int maxBatch1,
int cellBatch,
int4 nSplit
)
@@ -418,6 +419,8 @@ void BatchSolveKernelContact(__global Body* gBodies,
// debugInfo[gIdx].m_valInt0 = gIdx;
//debugInfo[gIdx].m_valInt1 = GET_GROUP_SIZE;
int zIdx = (wgIdx/((nSplit.x*nSplit.y)/4))*2+((cellBatch&4)>>2);
int remain= (wgIdx%((nSplit.x*nSplit.y)/4));
@@ -432,6 +435,7 @@ void BatchSolveKernelContact(__global Body* gBodies,
if( gN[cellIdx] == 0 )
return;
int maxBatch = batchSizes[cellIdx];
const int start = gOffsets[cellIdx];

View File

@@ -307,7 +307,8 @@ static const char* solveContactCL= \
" __global Constraint4* gConstraints,\n"
" __global int* gN,\n"
" __global int* gOffsets,\n"
" int maxBatch,\n"
" __global int* batchSizes,\n"
" int maxBatch1,\n"
" int cellBatch,\n"
" int4 nSplit\n"
" )\n"
@@ -321,6 +322,8 @@ static const char* solveContactCL= \
"// int gIdx = GET_GLOBAL_IDX;\n"
"// debugInfo[gIdx].m_valInt0 = gIdx;\n"
" //debugInfo[gIdx].m_valInt1 = GET_GROUP_SIZE;\n"
" \n"
" \n"
" int zIdx = (wgIdx/((nSplit.x*nSplit.y)/4))*2+((cellBatch&4)>>2);\n"
" int remain= (wgIdx%((nSplit.x*nSplit.y)/4));\n"
" int yIdx = (remain/(nSplit.x/2))*2 + ((cellBatch&2)>>1);\n"
@@ -332,6 +335,7 @@ static const char* solveContactCL= \
" \n"
" if( gN[cellIdx] == 0 ) \n"
" return;\n"
" int maxBatch = batchSizes[cellIdx];\n"
" \n"
" \n"
" const int start = gOffsets[cellIdx];\n"

View File

@@ -430,7 +430,8 @@ void BatchSolveKernelFriction(__global Body* gBodies,
__global Constraint4* gConstraints,
__global int* gN,
__global int* gOffsets,
int maxBatch,
__global int* batchSizes,
int maxBatch1,
int cellBatch,
int4 nSplit
)
@@ -458,6 +459,8 @@ void BatchSolveKernelFriction(__global Body* gBodies,
if( gN[cellIdx] == 0 )
return;
int maxBatch = batchSizes[cellIdx];
const int start = gOffsets[cellIdx];
const int end = start + gN[cellIdx];

View File

@@ -343,7 +343,8 @@ static const char* solveFrictionCL= \
" __global Constraint4* gConstraints,\n"
" __global int* gN,\n"
" __global int* gOffsets,\n"
" int maxBatch,\n"
" __global int* batchSizes,\n"
" int maxBatch1,\n"
" int cellBatch,\n"
" int4 nSplit\n"
" )\n"
@@ -365,6 +366,7 @@ static const char* solveFrictionCL= \
" \n"
" if( gN[cellIdx] == 0 ) \n"
" return;\n"
" int maxBatch = batchSizes[cellIdx];\n"
" const int start = gOffsets[cellIdx];\n"
" const int end = start + gN[cellIdx];\n"
" \n"