Files
bullet3/Extras/RigidBodyGpuPipeline/opencl/primitives/AdlPrimitives/Scan/PrefixScanKernelsDX11.h

148 lines
3.9 KiB
C

/*
 * prefixScanKernelsDX11 -- source text of the prefix-scan compute shaders,
 * stored as one concatenated C string literal (one quoted piece per shader
 * source line, each ending in a newline escape).
 *
 * The embedded text is HLSL: it declares a cbuffer, RWStructuredBuffer
 * resources and [numthreads] entry points. How and where the string is
 * compiled is not visible in this header.
 *
 * Contents of the shader text:
 *   - SortCB (b0): m_numElems, m_numBlocks, m_numScanBlocks.
 *   - dst (u0), src (u1), sumBuffer (u2); blockSum2 is also declared on u1,
 *     but only AddOffsetKernel reads it and that kernel does not touch src.
 *   - ldsData: 2048 groupshared u32 values used as the scan scratch area.
 *   - ScanExclusive(): helper; in-place exclusive scan of ldsData[0..n-1].
 *   - LocalScanKernel, TopLevelScanKernel, AddOffsetKernel: entry points,
 *     each declared with WG_SIZE (128) threads per group.
 *
 * NOTE(review): the three kernels look like the stages of a multi-block scan
 * (per-block scan, scan of the block sums, add block offsets); the dispatch
 * order and the buffer bindings live in host code -- confirm there.
 *
 * The literal is runtime data: do not reformat the quoted lines.
 */
static const char* prefixScanKernelsDX11= \
"/*\n"
" 2011 Takahiro Harada\n"
"*/\n"
"\n"
"typedef uint u32;\n"
"\n"
"#define GET_GROUP_IDX groupIdx.x\n"
"#define GET_LOCAL_IDX localIdx.x\n"
"#define GET_GLOBAL_IDX globalIdx.x\n"
"#define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync()\n"
"\n"
"// takahiro end\n"
"#define WG_SIZE 128\n"
"\n"
"#define GET_GROUP_SIZE WG_SIZE\n"
"\n"
"\n"
"cbuffer SortCB : register( b0 )\n"
"{\n"
" int m_numElems;\n"
" int m_numBlocks;\n"
" int m_numScanBlocks;\n"
"};\n"
" \n"
"RWStructuredBuffer<uint> dst : register( u0 );\n"
"RWStructuredBuffer<uint> src : register( u1 );\n"
"RWStructuredBuffer<uint> sumBuffer : register( u2 );\n"
"\n"
"\n"
/* Scratch array shared by a thread group; both scan kernels stage their data here. */
"groupshared u32 ldsData[2048];\n"
"\n"
/*
 * ScanExclusive(n, lIdx, lSize): exclusive prefix sum over ldsData[0..n-1],
 * done in place. First loop: pairwise partial sums with a doubling stride.
 * Then the thread with lIdx == 0 saves ldsData[n-1] (the total of all n
 * inputs) and zeroes it. Second loop: walks back with a halving stride,
 * swapping and adding. Each thread covers indices lIdx, lIdx + lSize, ...
 * of the active range, with a group barrier before every pass.
 * Returns the saved total; blocksum is assigned only where lIdx == 0, so
 * the return value is meaningful on that thread only.
 * NOTE(review): the stride arithmetic appears to assume n is a power of
 * two -- confirm against the host-side value of m_numScanBlocks.
 */
"u32 ScanExclusive(u32 n, int lIdx, int lSize)\n"
"{\n"
" u32 blocksum;\n"
" int offset = 1;\n"
" for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)\n"
" {\n"
" GROUP_LDS_BARRIER;\n"
" for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)\n"
" {\n"
" int ai = offset*(2*iIdx+1)-1;\n"
" int bi = offset*(2*iIdx+2)-1;\n"
" ldsData[bi] += ldsData[ai];\n"
" }\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" if( lIdx == 0 )\n"
" {\n"
" blocksum = ldsData[ n-1 ];\n"
" ldsData[ n-1 ] = 0;\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" offset >>= 1;\n"
" for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )\n"
" {\n"
" GROUP_LDS_BARRIER;\n"
" for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )\n"
" {\n"
" int ai = offset*(2*iIdx+1)-1;\n"
" int bi = offset*(2*iIdx+2)-1;\n"
" u32 temp = ldsData[ai];\n"
" ldsData[ai] = ldsData[bi];\n"
" ldsData[bi] += temp;\n"
" }\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" return blocksum;\n"
"}\n"
"\n"
/*
 * LocalScanKernel: every thread loads two consecutive src elements
 * (0 when the index is >= m_numElems) into ldsData, the group scans
 * WG_SIZE*2 values, the thread with lIdx == 0 stores the group total in
 * sumBuffer[group index], and the scanned values that fall inside
 * m_numElems are written to dst.
 */
"[numthreads(WG_SIZE, 1, 1)]\n"
"void LocalScanKernel(uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" int lIdx = GET_LOCAL_IDX;\n"
"\n"
" ldsData[2*lIdx] = ( 2*gIdx < m_numElems )? src[2*gIdx]: 0;\n"
" ldsData[2*lIdx + 1] = ( 2*gIdx+1 < m_numElems )? src[2*gIdx + 1]: 0;\n"
"\n"
" u32 sum = ScanExclusive(WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
"\n"
" if( lIdx == 0 ) sumBuffer[GET_GROUP_IDX] = sum;\n"
"\n"
" if( (2*gIdx) < m_numElems )\n"
" {\n"
" dst[2*gIdx] = ldsData[2*lIdx];\n"
" }\n"
" if( (2*gIdx + 1) < m_numElems )\n"
" {\n"
" dst[2*gIdx + 1] = ldsData[2*lIdx + 1];\n"
" }\n"
"}\n"
"\n"
/*
 * TopLevelScanKernel: copies dst[0..m_numBlocks-1] into ldsData, padding
 * with 0 up to m_numScanBlocks, scans m_numScanBlocks values, writes the
 * first m_numBlocks results back to dst, and the thread with global
 * index 0 stores the total in dst[m_numBlocks].
 * NOTE(review): m_numScanBlocks presumably must not exceed the 2048-entry
 * ldsData array, and dst needs room for m_numBlocks + 1 values -- verify
 * in the caller.
 */
"[numthreads(WG_SIZE, 1, 1)]\n"
"void TopLevelScanKernel(uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" int lIdx = GET_LOCAL_IDX;\n"
" int lSize = GET_GROUP_SIZE;\n"
"\n"
" for(int i=lIdx; i<m_numScanBlocks; i+=lSize )\n"
" {\n"
" ldsData[i] = (i<m_numBlocks)? dst[i]:0;\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" u32 sum = ScanExclusive(m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
"\n"
" for(int i=lIdx; i<m_numBlocks; i+=lSize )\n"
" {\n"
" dst[i] = ldsData[i];\n"
" }\n"
"\n"
" if( gIdx == 0 )\n"
" {\n"
" dst[m_numBlocks] = sum;\n"
" }\n"
"}\n"
"\n"
"\n"
" \n"
/*
 * AddOffsetKernel: group g works on block g + 1. It reads
 * blockSum2[g + 1] and adds that value to every dst element of the block
 * (WG_SIZE*2 elements per block, clipped to m_numElems). No group is
 * assigned to block 0.
 */
"RWStructuredBuffer<uint> blockSum2 : register( u1 );\n"
"\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void AddOffsetKernel(uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID)\n"
"{\n"
" const u32 blockSize = WG_SIZE*2;\n"
"\n"
" int myIdx = GET_GROUP_IDX+1;\n"
" int llIdx = GET_LOCAL_IDX;\n"
"\n"
" u32 iBlockSum = blockSum2[myIdx];\n"
"\n"
" int endValue = min((myIdx+1)*(blockSize), m_numElems);\n"
" for(int i=myIdx*blockSize+llIdx; i<endValue; i+=GET_GROUP_SIZE)\n"
" {\n"
" dst[i] += iBlockSum;\n"
" }\n"
"}\n"
"\n"
;