static const char* narrowphaseKernelsCL= \ "\n" "#pragma OPENCL EXTENSION cl_amd_printf : enable\n" "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" "#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n" "#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n" "\n" "#ifdef cl_ext_atomic_counters_32\n" "#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n" "#else\n" "#define counter32_t volatile global int*\n" "#endif\n" "\n" "\n" "typedef unsigned int u32;\n" "typedef unsigned short u16;\n" "typedef unsigned char u8;\n" "\n" "#define GET_GROUP_IDX get_group_id(0)\n" "#define GET_LOCAL_IDX get_local_id(0)\n" "#define GET_GLOBAL_IDX get_global_id(0)\n" "#define GET_GROUP_SIZE get_local_size(0)\n" "#define GET_NUM_GROUPS get_num_groups(0)\n" "#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" "#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" "#define AtomInc(x) atom_inc(&(x))\n" "#define AtomInc1(x, out) out = atom_inc(&(x))\n" "#define AppendInc(x, out) out = atomic_inc(x)\n" "#define AtomAdd(x, value) atom_add(&(x), value)\n" "#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n" "#define AtomXhg(x, value) atom_xchg ( &(x), value )\n" "\n" "\n" "#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n" "\n" "#define make_float4 (float4)\n" "#define make_float2 (float2)\n" "#define make_uint4 (uint4)\n" "#define make_int4 (int4)\n" "#define make_uint2 (uint2)\n" "#define make_int2 (int2)\n" "\n" "\n" "#define max2 max\n" "#define min2 min\n" "\n" "\n" "///////////////////////////////////////\n" "// Vector\n" "///////////////////////////////////////\n" "__inline\n" "float fastDiv(float numerator, float denominator)\n" "{\n" " return native_divide(numerator, denominator); \n" "// return numerator/denominator; \n" "}\n" "\n" "__inline\n" "float4 fastDiv4(float4 numerator, float4 denominator)\n" "{\n" " return native_divide(numerator, denominator); \n" "}\n" "\n" "__inline\n" "float fastSqrtf(float f2)\n" "{\n" " return native_sqrt(f2);\n" "// return sqrt(f2);\n" "}\n" "\n" "__inline\n" "float fastRSqrt(float f2)\n" "{\n" " return native_rsqrt(f2);\n" "}\n" "\n" "__inline\n" "float fastLength4(float4 v)\n" "{\n" " return fast_length(v);\n" "}\n" "\n" "__inline\n" "float4 fastNormalize4(float4 v)\n" "{\n" " return fast_normalize(v);\n" "}\n" "\n" "\n" "__inline\n" "float sqrtf(float a)\n" "{\n" "// return sqrt(a);\n" " return native_sqrt(a);\n" "}\n" "\n" "__inline\n" "float4 cross3(float4 a, float4 b)\n" "{\n" " return cross(a,b);\n" "}\n" "\n" "__inline\n" "float dot3F4(float4 a, float4 b)\n" "{\n" " float4 a1 = make_float4(a.xyz,0.f);\n" " float4 b1 = make_float4(b.xyz,0.f);\n" " return dot(a1, b1);\n" "}\n" "\n" "__inline\n" "float length3(const float4 a)\n" "{\n" " return sqrtf(dot3F4(a,a));\n" "}\n" "\n" "__inline\n" "float dot4(const float4 a, const float4 b)\n" "{\n" " return dot( a, b );\n" "}\n" "\n" "// for height\n" "__inline\n" "float dot3w1(const float4 point, const float4 eqn)\n" "{\n" " return dot3F4(point,eqn) + eqn.w;\n" "}\n" "\n" "__inline\n" "float4 normalize3(const float4 a)\n" "{\n" " float4 n = make_float4(a.x, a.y, a.z, 0.f);\n" " return fastNormalize4( n );\n" "// float length = sqrtf(dot3F4(a, a));\n" "// return 1.f/length * a;\n" "}\n" "\n" "__inline\n" "float4 normalize4(const float4 a)\n" "{\n" " float length = sqrtf(dot4(a, a));\n" " return 1.f/length * a;\n" "}\n" "\n" "__inline\n" "float4 createEquation(const float4 a, const float4 b, const float4 c)\n" "{\n" " float4 eqn;\n" " float4 ab = b-a;\n" " float4 ac = c-a;\n" " eqn = normalize3( cross3(ab, ac) );\n" " eqn.w = -dot3F4(eqn,a);\n" " return eqn;\n" "}\n" "\n" "///////////////////////////////////////\n" "// Matrix3x3\n" "///////////////////////////////////////\n" "\n" "typedef struct\n" "{\n" " float4 m_row[3];\n" "}Matrix3x3;\n" "\n" "__inline\n" "Matrix3x3 mtZero();\n" "\n" "__inline\n" "Matrix3x3 mtIdentity();\n" "\n" "__inline\n" "Matrix3x3 mtTranspose(Matrix3x3 m);\n" "\n" "__inline\n" "Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b);\n" "\n" "__inline\n" "float4 mtMul1(Matrix3x3 a, float4 b);\n" "\n" "__inline\n" "float4 mtMul3(float4 a, Matrix3x3 b);\n" "\n" "__inline\n" "Matrix3x3 mtZero()\n" "{\n" " Matrix3x3 m;\n" " m.m_row[0] = (float4)(0.f);\n" " m.m_row[1] = (float4)(0.f);\n" " m.m_row[2] = (float4)(0.f);\n" " return m;\n" "}\n" "\n" "__inline\n" "Matrix3x3 mtIdentity()\n" "{\n" " Matrix3x3 m;\n" " m.m_row[0] = (float4)(1,0,0,0);\n" " m.m_row[1] = (float4)(0,1,0,0);\n" " m.m_row[2] = (float4)(0,0,1,0);\n" " return m;\n" "}\n" "\n" "__inline\n" "Matrix3x3 mtTranspose(Matrix3x3 m)\n" "{\n" " Matrix3x3 out;\n" " out.m_row[0] = (float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n" " out.m_row[1] = (float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n" " out.m_row[2] = (float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n" " return out;\n" "}\n" "\n" "__inline\n" "Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b)\n" "{\n" " Matrix3x3 transB;\n" " transB = mtTranspose( b );\n" " Matrix3x3 ans;\n" " // why this doesn't run when 0ing in the for{}\n" " a.m_row[0].w = 0.f;\n" " a.m_row[1].w = 0.f;\n" " a.m_row[2].w = 0.f;\n" " for(int i=0; i<3; i++)\n" " {\n" "// a.m_row[i].w = 0.f;\n" " ans.m_row[i].x = dot3F4(a.m_row[i],transB.m_row[0]);\n" " ans.m_row[i].y = dot3F4(a.m_row[i],transB.m_row[1]);\n" " ans.m_row[i].z = dot3F4(a.m_row[i],transB.m_row[2]);\n" " ans.m_row[i].w = 0.f;\n" " }\n" " return ans;\n" "}\n" "\n" "__inline\n" "float4 mtMul1(Matrix3x3 a, float4 b)\n" "{\n" " float4 ans;\n" " ans.x = dot3F4( a.m_row[0], b );\n" " ans.y = dot3F4( a.m_row[1], b );\n" " ans.z = dot3F4( a.m_row[2], b );\n" " ans.w = 0.f;\n" " return ans;\n" "}\n" "\n" "__inline\n" "float4 mtMul3(float4 a, Matrix3x3 b)\n" "{\n" " float4 colx = make_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n" " float4 coly = make_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n" " float4 colz = make_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n" "\n" " float4 ans;\n" " ans.x = dot3F4( a, colx );\n" " ans.y = dot3F4( a, coly );\n" " ans.z = dot3F4( a, colz );\n" " return ans;\n" "}\n" "\n" "///////////////////////////////////////\n" "// Quaternion\n" "///////////////////////////////////////\n" "\n" "typedef float4 Quaternion;\n" "\n" "__inline\n" "Quaternion qtMul(Quaternion a, Quaternion b);\n" "\n" "__inline\n" "Quaternion qtNormalize(Quaternion in);\n" "\n" "__inline\n" "float4 qtRotate(Quaternion q, float4 vec);\n" "\n" "__inline\n" "Quaternion qtInvert(Quaternion q);\n" "\n" "__inline\n" "Matrix3x3 qtGetRotationMatrix(Quaternion q);\n" "\n" "\n" "\n" "__inline\n" "Quaternion qtMul(Quaternion a, Quaternion b)\n" "{\n" " Quaternion ans;\n" " ans = cross3( a, b );\n" " ans += a.w*b+b.w*a;\n" "// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" " ans.w = a.w*b.w - dot3F4(a, b);\n" " return ans;\n" "}\n" "\n" "__inline\n" "Quaternion qtNormalize(Quaternion in)\n" "{\n" " return fastNormalize4(in);\n" "// in /= length( in );\n" "// return in;\n" "}\n" "__inline\n" "float4 qtRotate(Quaternion q, float4 vec)\n" "{\n" " Quaternion qInv = qtInvert( q );\n" " float4 vcpy = vec;\n" " vcpy.w = 0.f;\n" " float4 out = qtMul(qtMul(q,vcpy),qInv);\n" " return out;\n" "}\n" "\n" "__inline\n" "Quaternion qtInvert(Quaternion q)\n" "{\n" " return (Quaternion)(-q.xyz, q.w);\n" "}\n" "\n" "__inline\n" "float4 qtInvRotate(const Quaternion q, float4 vec)\n" "{\n" " return qtRotate( qtInvert( q ), vec );\n" "}\n" "\n" "__inline\n" "Matrix3x3 qtGetRotationMatrix(Quaternion quat)\n" "{\n" " float4 quat2 = (float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n" " Matrix3x3 out;\n" "\n" " out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n" " out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n" " out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;\n" " out.m_row[0].w = 0.f;\n" "\n" " out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;\n" " out.m_row[1].y=1-2*quat2.x-2*quat2.z;\n" " out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;\n" " out.m_row[1].w = 0.f;\n" "\n" " out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;\n" " out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;\n" " out.m_row[2].z=1-2*quat2.x-2*quat2.y;\n" " out.m_row[2].w = 0.f;\n" "\n" " return out;\n" "}\n" "\n" "\n" "#define WG_SIZE 64\n" "#define HEIGHT_RES 4\n" "#define SHAPE_CONVEX_HEIGHT_FIELD 1//keep this in sync with AdlCollisionShape.h!\n" "\n" "typedef struct\n" "{\n" " float4 m_normal[HEIGHT_RES*HEIGHT_RES*6];\n" " u32 m_height4[HEIGHT_RES*HEIGHT_RES*6];\n" " u32 m_supportHeight4[HEIGHT_RES*HEIGHT_RES*6];\n" "\n" " float m_scale;\n" " float m_padding0;\n" " float m_padding1;\n" " float m_padding2;\n" "} ShapeData;\n" "\n" "typedef struct\n" "{\n" " u32 m_height4[HEIGHT_RES*HEIGHT_RES*6/4];\n" "\n" " float m_scale;\n" "} ShapeDeviceData;\n" "\n" "typedef struct\n" "{\n" " float4 m_pos;\n" " float4 m_quat;\n" " float4 m_linVel;\n" " float4 m_angVel;\n" "\n" " u32 m_shapeIdx;\n" " u32 m_shapeType;\n" " \n" " float m_invMass;\n" " float m_restituitionCoeff;\n" " float m_frictionCoeff;\n" "} BodyData;\n" "\n" "typedef struct\n" "{\n" " float4 m_worldPos[4];\n" " float4 m_worldNormal; // w: m_nPoints\n" "// float m_restituitionCoeff;\n" "// float m_frictionCoeff;\n" " u32 m_coeffs;\n" " u32 m_batchIdx;\n" "// int m_nPoints;\n" "// int m_padding0;\n" "\n" " u32 m_bodyAPtr;//x:m_bodyAPtr, y:m_bodyBPtr\n" " u32 m_bodyBPtr;\n" "} Contact4;\n" "\n" "#define GET_NPOINTS(x) (x).m_worldNormal.w\n" "\n" "\n" "typedef struct\n" "{\n" " int m_nPairs;\n" " float m_collisionMargin;\n" " int m_capacity;\n" " int m_paddings[1];\n" "} ConstBuffer;\n" "\n" "__inline\n" "float4 transform(const float4* p, const float4* translation, const Quaternion* orientation)\n" "{\n" " return qtRotate( *orientation, *p ) + (*translation);\n" "}\n" "\n" "__inline\n" "float4 invTransform(const float4* p, const float4* translation, const Quaternion* orientation)\n" "{\n" " return qtRotate( qtInvert( *orientation ), (*p)-(*translation) ); // use qtInvRotate\n" "}\n" "\n" "void CubeMapUtilsCalcCrd(const float4 p, int* faceIdxOut, float* x, float* y)\n" "{\n" " {\n" " int idx;\n" " float r2[] = {p.x*p.x, p.y*p.y, p.z*p.z};\n" "\n" " if (r2[1]>r2[0])\n" " {\n" " if (r2[2]>r2[1])\n" " {\n" " idx = 2;\n" " \n" " } else\n" " {\n" " idx = 1;\n" " }\n" " \n" " } else\n" " {\n" " if (r2[2]>r2[0])\n" " {\n" " idx = 2;\n" " } else\n" " {\n" " idx = 0;\n" " }\n" " }\n" "\n" " *faceIdxOut = (idx*2);\n" "//==\n" " float4 abs = make_float4( fabs(p.x), fabs(p.y), fabs(p.z), 0.f );\n" "\n" " float d;\n" " if( idx == 0 )\n" " {\n" " *x = p.y;\n" " *y = p.z;\n" " d = abs.x;\n" " *faceIdxOut += (p.x < 0.f)? 0: 1.f;\n" " }\n" " else if( idx == 1 )\n" " {\n" " *x = p.z;\n" " *y = p.x;\n" " d = abs.y;\n" " *faceIdxOut += (p.y < 0.f)? 0: 1.f;\n" " }\n" " else\n" " {\n" " *x = p.x;\n" " *y = p.y;\n" " d = abs.z;\n" " *faceIdxOut += (p.z < 0.f)? 0: 1.f;\n" " }\n" "\n" " float dInv = (d==0.f)? 0.f: fastDiv(1.f,d);\n" " *x = (*x*dInv+1.f)*0.5f;\n" " *y = (*y*dInv+1.f)*0.5f;\n" " }\n" "}\n" "\n" "float4 CubeMapUtilsCalcVector(int faceIdx, float x, float y)\n" "{\n" " int dir = faceIdx/2;\n" " float z = (faceIdx%2 == 0)? -1.f:1.f;\n" "\n" " x = x*2.f-1.f;\n" " y = y*2.f-1.f;\n" " \n" " if( dir == 0 )\n" " {\n" " return make_float4(z, x, y, 0.f);\n" " }\n" " else if( dir == 1 )\n" " {\n" " return make_float4(y,z,x, 0.f);\n" " }\n" " else\n" " {\n" " return make_float4(x,y,z, 0.f);\n" " }\n" "}\n" "\n" "typedef int Face;\n" "\n" "u32 sample(__local ShapeDeviceData* shape, int face, int x, int y)\n" "{\n" "\n" " int idx = HEIGHT_RES*HEIGHT_RES*face + x + y*HEIGHT_RES;\n" " __local u8* height = (__local u8*)shape->m_height4;\n" " return height[idx];\n" "}\n" "\n" "u32 sampleSupportGlobal(__global ShapeData* shape, int face, int x, int y)\n" "{\n" "\n" " int idx = HEIGHT_RES*HEIGHT_RES*face + x + y*HEIGHT_RES;\n" " __global u8* height = (__global u8*)shape->m_supportHeight4;\n" " return height[idx];\n" "}\n" "\n" "float4 sampleNormal(__local ShapeData* shape, int face, int x, int y)\n" "{\n" " return shape->m_normal[HEIGHT_RES*HEIGHT_RES*face + x + y*HEIGHT_RES];\n" "}\n" "\n" "float4 sampleNormalGlobal(const __global ShapeData* shape, int face, int x, int y)\n" "{\n" " return shape->m_normal[HEIGHT_RES*HEIGHT_RES*face + x + y*HEIGHT_RES];\n" "}\n" "\n" "float4 ShapeDataCalcSamplePoint( __local const ShapeDeviceData* shape, int sIdx )//u8 height, int sIdx, float scale )\n" "{\n" " const float oneOver255 = 1.f/255.f;\n" "\n" " int faceIdx = fastDiv(sIdx,(HEIGHT_RES*HEIGHT_RES));\n" " int r = (sIdx%(HEIGHT_RES*HEIGHT_RES));\n" " int i = r/HEIGHT_RES;\n" " int j = r%HEIGHT_RES;\n" "\n" " float4 v;\n" " float x = fastDiv((i+0.5f),(float)HEIGHT_RES);\n" " float y = fastDiv((j+0.5f),(float)HEIGHT_RES);\n" " v = CubeMapUtilsCalcVector(faceIdx, x, y);\n" " v = normalize3( v );\n" "\n" " int quantizedHeight = sample( shape, faceIdx, i, j );\n" " float rheight = quantizedHeight*oneOver255*shape->m_scale;\n" " return rheight*v;\n" "}\n" "\n" "float ShapeDataQueryDistance(__local const ShapeDeviceData* shape, float4 p )\n" "{\n" " if( dot3F4( p, p ) >= shape->m_scale*shape->m_scale ) return FLT_MAX;\n" "\n" " const float oneOver255 = 1.f/255.f;\n" "\n" " int faceIdx;\n" " float x, y;\n" " CubeMapUtilsCalcCrd( p, &faceIdx, &x, &y );\n" " x = (x*HEIGHT_RES) - 0.5f;\n" " y = (y*HEIGHT_RES) - 0.5f;\n" "\n" " float height;\n" " {\n" " int xi = (int)(x);\n" " int yi = (int)(y);\n" " float dx = x-xi;\n" " float dy = y-yi;\n" "\n" " {\n" " int xip = min2((int)(HEIGHT_RES-1), xi+1);\n" " int yip = min2((int)(HEIGHT_RES-1), yi+1);\n" "\n" " u32 xy = sample( shape, faceIdx, xi, yi );\n" " u32 xpy = sample( shape, faceIdx, xip, yi );\n" " u32 xpyp = sample( shape, faceIdx, xip, yip );\n" " u32 xyp = sample( shape, faceIdx, xi, yip );\n" "\n" " height = (xy*(1.f-dx)+xpy*dx)*(1.f-dy) + (xyp*(1.f-dx)+xpyp*dx)*dy;\n" " height = height*oneOver255*shape->m_scale;\n" "\n" " p.w = 0.f;\n" "\n" " height = fastLength4( p ) - height;\n" " }\n" " }\n" "\n" " return height;\n" "}\n" "\n" "float ShapeDataQuerySupportHeight(__global ShapeData* shape, float4 p )\n" "{\n" " int faceIdx;\n" " float x, y;\n" " CubeMapUtilsCalcCrd( p, &faceIdx, &x, &y );\n" " x = (x*HEIGHT_RES) - 0.5f;\n" " y = (y*HEIGHT_RES) - 0.5f;\n" "\n" " float height;\n" " {\n" " int xi = (int)(x);\n" " int yi = (int)(y);\n" "\n" " {\n" " int xip = min2((int)(HEIGHT_RES-1), xi+1);\n" " int yip = min2((int)(HEIGHT_RES-1), yi+1);\n" "\n" " u32 xy = sampleSupportGlobal( shape, faceIdx, xi, yi );\n" " u32 xpy = sampleSupportGlobal( shape, faceIdx, xip, yi );\n" " u32 xpyp = sampleSupportGlobal( shape, faceIdx, xip, yip );\n" " u32 xyp = sampleSupportGlobal( shape, faceIdx, xi, yip );\n" "\n" " height = max2( xy, max2( xpy, max2( xpyp, xyp ) ) );\n" " height = height/255.f*shape->m_scale;\n" " }\n" " }\n" "\n" " return height;\n" "\n" "}\n" "\n" "float4 ShapeDataQueryNormal(__global const ShapeData* shape, float4 p )\n" "{\n" " int faceIdx;\n" " float x, y;\n" " CubeMapUtilsCalcCrd( p, &faceIdx, &x, &y );\n" " x = (x*HEIGHT_RES) - 0.5f;\n" " y = (y*HEIGHT_RES) - 0.5f;\n" "\n" " float4 normalOut;\n" " {\n" " int xi = (int)(x);\n" " int yi = (int)(y);\n" "\n" " normalOut = sampleNormalGlobal( shape, faceIdx, xi, yi );\n" " }\n" " return normalOut;\n" "}\n" "\n" "\n" "\n" "// kernels\n" "\n" "\n" "__kernel\n" "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" "void SupportCullingKernel( __global int2* restrict gPairsIn, __global ShapeData* gShapes, \n" " __global BodyData* gBodies, \n" " __global int2* gPairsOut, \n" " counter32_t gNPairs,\n" " ConstBuffer cb )\n" "{\n" " int gIdx = GET_GLOBAL_IDX;\n" " if( gIdx >= cb.m_nPairs ) return;\n" "\n" " const float collisionMargin = cb.m_collisionMargin;\n" " const int capacity = cb.m_capacity;\n" "\n" " int2 pair = gPairsIn[gIdx];\n" " BodyData bodyA = gBodies[pair.x];\n" " BodyData bodyB = gBodies[pair.y];\n" " int shapeAIdx = bodyA.m_shapeIdx;\n" " int shapeBIdx = bodyB.m_shapeIdx;\n" "\n" "\n" " bool collide = false;\n" " \n" " //only collide if one of the two bodies has a non-zero mass\n" " if (bodyA.m_invMass==0.f && bodyB.m_invMass==0.f)\n" " return;\n" " \n" " \n" " if (bodyA.m_shapeType == SHAPE_CONVEX_HEIGHT_FIELD && bodyB.m_shapeType==SHAPE_CONVEX_HEIGHT_FIELD)\n" " {\n" " float4 abInA, baInB;\n" " float4 ab = bodyB.m_pos - bodyA.m_pos;\n" " {\n" " abInA = qtInvRotate( bodyA.m_quat, ab );\n" " baInB = qtInvRotate( bodyB.m_quat, -ab );\n" " }\n" " float hA = ShapeDataQuerySupportHeight( gShapes+shapeAIdx, abInA );\n" " float hB = ShapeDataQuerySupportHeight( gShapes+shapeBIdx, baInB );\n" "\n" " float h2 = dot3F4( ab, ab );\n" "\n" " collide = ( hA + hB + collisionMargin > sqrtf(h2) );\n" " }\n" "\n" " if( collide )\n" " {\n" " int dstIdx;\n" " AppendInc( gNPairs, dstIdx );\n" " if( dstIdx < capacity )\n" " gPairsOut[dstIdx] = pair;\n" " }\n" "}\n" "\n" "\n" "#define PARALLEL_DO(execution, n) for(int ie=0; ie h[lIdx+1].y)? h[lIdx]: h[lIdx+1];" " mem_fence( CLK_LOCAL_MEM_FENCE );" " h[lIdx] = (h[lIdx].y > h[lIdx+2].y)? h[lIdx]: h[lIdx+2];" " mem_fence( CLK_LOCAL_MEM_FENCE );" " h[lIdx] = (h[lIdx].y > h[lIdx+4].y)? h[lIdx]: h[lIdx+4];" " mem_fence( CLK_LOCAL_MEM_FENCE );" " h[lIdx] = (h[lIdx].y > h[lIdx+8].y)? h[lIdx]: h[lIdx+8];" " mem_fence( CLK_LOCAL_MEM_FENCE );" " h[lIdx] = (h[lIdx].y > h[lIdx+16].y)? h[lIdx]: h[lIdx+16];" " }}\n" "\n" "#define PARALLEL_REDUCE32(h) " " {int lIdx = GET_LOCAL_IDX;" " if( lIdx < 32 )" " {" " h[lIdx] += h[lIdx+1];" " mem_fence( CLK_LOCAL_MEM_FENCE );" " h[lIdx] += h[lIdx+2];" " mem_fence( CLK_LOCAL_MEM_FENCE );" " h[lIdx] += h[lIdx+4];" " mem_fence( CLK_LOCAL_MEM_FENCE );" " h[lIdx] += h[lIdx+8];" " mem_fence( CLK_LOCAL_MEM_FENCE );" " h[lIdx] += h[lIdx+16];" " }}\n" "\n" "\n" "float4 extractManifold(__local float4* p, __local float4* h, __local int* nPointsPtr, float4 nearNormal)\n" "{\n" " int nPoints = *nPointsPtr;\n" " float4 center = make_float4(0,0,0,0);\n" " { // calculate center\n" " nPoints = min2( nPoints, 32 );\n" " {\n" " int lIdx = GET_LOCAL_IDX;\n" " h[lIdx] = p[lIdx];\n" " h[lIdx] = (lIdx= nPoints ) a[ie] = make_int4(-0xfffffff, -0xfffffff, -0xfffffff, -0xfffffff);\n" " }\n" " }\n" "\n" " GROUP_LDS_BARRIER;\n" "\n" " { // vector reduce, h[64]\n" " int lIdx = GET_LOCAL_IDX;\n" " if( lIdx < 32 )\n" " {\n" " h[lIdx] = max2( h[lIdx], h[lIdx+1] );\n" " mem_fence( CLK_LOCAL_MEM_FENCE );\n" " h[lIdx] = max2( h[lIdx], h[lIdx+2] );\n" " mem_fence( CLK_LOCAL_MEM_FENCE );\n" " h[lIdx] = max2( h[lIdx], h[lIdx+4] );\n" " mem_fence( CLK_LOCAL_MEM_FENCE );\n" " h[lIdx] = max2( h[lIdx], h[lIdx+8] );\n" " mem_fence( CLK_LOCAL_MEM_FENCE );\n" " h[lIdx] = max2( h[lIdx], h[lIdx+16] );\n" " }\n" " }\n" "\n" " GROUP_LDS_BARRIER;\n" " }\n" " {\n" " { // set to idx\n" " idx[0] = (int)a[0].x & 0xff;\n" " idx[1] = (int)a[0].y & 0xff;\n" " idx[2] = (int)a[0].z & 0xff;\n" " idx[3] = (int)a[0].w & 0xff;\n" " }\n" "\n" " GROUP_LDS_BARRIER;\n" " float4 selection;\n" " if( GET_LOCAL_IDX < 4 ) selection = p[idx[GET_LOCAL_IDX]];\n" "\n" " GROUP_LDS_BARRIER;\n" " if( GET_LOCAL_IDX < 4 ) p[GET_LOCAL_IDX] = selection;\n" " }\n" "\n" "\n" " return center;\n" "}\n" "\n" "void extractManifold1(__local float4* p, __local float4* h, __local int* nPointsPtr, float4 center)\n" "{\n" " __local int* a = (__local int*)h;\n" " {\n" " GROUP_LDS_BARRIER;\n" " float4 selection;\n" " if( GET_LOCAL_IDX < 4 )\n" " {\n" " int idx = (int)a[GET_LOCAL_IDX] & 0xff;\n" " selection = p[idx];\n" " }\n" "\n" " GROUP_LDS_BARRIER;\n" " if( GET_LOCAL_IDX < 4 ) p[GET_LOCAL_IDX] = selection;\n" " }\n" "\n" "}\n" "\n" "void extractManifold2( __local float4* p0, __local int* nPointsPtr0, float4 nearNormal0,\n" " __local float4* p1, __local int* nPointsPtr1, float4 nearNormal1,\n" " __local float4* h, float4 centerOut[2])\n" "{\n" "\n" " int nPoints[2];\n" " nPoints[0] = *nPointsPtr0;\n" " nPoints[1] = *nPointsPtr1;\n" " float4 center[2];\n" " center[0] = make_float4(0,0,0,0);\n" " center[1] = make_float4(0,0,0,0);\n" " { // calculate center\n" " nPoints[0] = min2( nPoints[0], 32 );\n" " nPoints[1] = min2( nPoints[1], 32 );\n" " {\n" " int lIdx = GET_LOCAL_IDX;\n" " h[lIdx] = (lIdx= nPoints[setIdx] ) a[ie + setIdx*64] = make_int4(-0xfffffff, -0xfffffff, -0xfffffff, -0xfffffff);\n" "\n" " a[ie + 32] = make_int4(-0xfffffff, -0xfffffff, -0xfffffff, -0xfffffff);\n" " }\n" " }\n" " }\n" " GROUP_LDS_BARRIER;\n" "\n" " { // vector reduce, h[64]\n" " int bIdx = GET_LOCAL_IDX/32;\n" " int eIdx = GET_LOCAL_IDX%32;\n" " int lIdx = eIdx + bIdx*64;\n" " {\n" " h[lIdx] = max2( h[lIdx], h[lIdx+1] );\n" " mem_fence( CLK_LOCAL_MEM_FENCE );\n" " h[lIdx] = max2( h[lIdx], h[lIdx+2] );\n" " mem_fence( CLK_LOCAL_MEM_FENCE );\n" " h[lIdx] = max2( h[lIdx], h[lIdx+4] );\n" " mem_fence( CLK_LOCAL_MEM_FENCE );\n" " h[lIdx] = max2( h[lIdx], h[lIdx+8] );\n" " mem_fence( CLK_LOCAL_MEM_FENCE );\n" " h[lIdx] = max2( h[lIdx], h[lIdx+16] );\n" " }\n" " }\n" "\n" " GROUP_LDS_BARRIER;\n" " }\n" " __local int* a = (__local int*)h;\n" " {\n" " GROUP_LDS_BARRIER;\n" " \n" " float4 selection;\n" "\n" " int bIdx = GET_LOCAL_IDX/32;\n" " int eIdx = GET_LOCAL_IDX%32;\n" "\n" " if( eIdx < 4 )\n" " {\n" " int idx = (int)a[eIdx+64*4*bIdx] & 0xff;\n" " selection = p0[idx+32*bIdx];\n" " }\n" "\n" " GROUP_LDS_BARRIER;\n" " if( eIdx < 4 ) p0[eIdx+32*bIdx] = selection;\n" " }\n" "}\n" "\n" "/*\n" "1. Query Normal\n" "2. Fill Normal\n" "3. A->B, B->A\n" "*/\n" "\n" "void testVtx(__local BodyData* bodyAPtr, __local BodyData* bodyBPtr,\n" " __local ShapeDeviceData* shapeAPtr, __local ShapeDeviceData* shapeBPtr,\n" " __local int* lNContacts, __local float4* lCPoints)\n" "{\n" " int pIdx = GET_LOCAL_IDX;\n" " float4 bodyAPos = bodyAPtr->m_pos;\n" " float4 bodyBPos = bodyBPtr->m_pos;\n" " Quaternion bodyAQuat = bodyAPtr->m_quat;\n" " Quaternion bodyBQuat = bodyBPtr->m_quat;\n" " while( pIdx < HEIGHT_RES*HEIGHT_RES*6 )\n" " {\n" " float4 pInB = ShapeDataCalcSamplePoint( shapeBPtr, pIdx );\n" "\n" " float4 pInW = transform( &pInB, &bodyBPos, &bodyBQuat );\n" "// Aabb bodyAAabb = bodyAPtr->m_aabb;\n" "// if( AabbOverlapsPoint( &bodyAAabb, pInW ) )\n" " {\n" " float4 pInA = invTransform( &pInW, &bodyAPos, &bodyAQuat );\n" "\n" " float dist = ShapeDataQueryDistance( shapeAPtr, pInA );\n" " if( dist < 0.010f )\n" " {\n" " int dstIdx = atom_add( lNContacts, 1 );\n" " if( dstIdx < 32 )\n" " {\n" " lCPoints[ dstIdx ] = make_float4( pInA.x, pInA.y, pInA.z, dist );\n" " }\n" " }\n" " }\n" "\n" " pIdx += GET_GROUP_SIZE;\n" " }\n" "}\n" "\n" "void testVtx2(__local const BodyData* bodyA, __local const BodyData* bodyB,\n" " __local const ShapeDeviceData* shapeA, __local const ShapeDeviceData* shapeB,\n" " __local int* lNContactsA, __local float4* lCPointsA,\n" " __local int* lNContactsB, __local float4* lCPointsB, float collisionMargin )\n" "{\n" " int pIdx = GET_LOCAL_IDX;\n" "\n" " while( pIdx < HEIGHT_RES*HEIGHT_RES*6*2 )\n" " {\n" " __local const BodyData* bodyAPtr =( pIdx < HEIGHT_RES*HEIGHT_RES*6 )?bodyA:bodyB;\n" " __local const BodyData* bodyBPtr =( pIdx < HEIGHT_RES*HEIGHT_RES*6 )?bodyB:bodyA;\n" " __local const ShapeDeviceData* shapeAPtr =( pIdx < HEIGHT_RES*HEIGHT_RES*6 )?shapeA:shapeB;\n" " __local const ShapeDeviceData* shapeBPtr =( pIdx < HEIGHT_RES*HEIGHT_RES*6 )?shapeB:shapeA;\n" " __local int* lNContacts =( pIdx < HEIGHT_RES*HEIGHT_RES*6 )?lNContactsA:lNContactsB;\n" " __local float4* lCPoints =( pIdx < HEIGHT_RES*HEIGHT_RES*6 )?lCPointsA:lCPointsB;\n" "\n" " float4 bodyAPos = bodyAPtr->m_pos;\n" " float4 bodyBPos = bodyBPtr->m_pos;\n" " Quaternion bodyAQuat = bodyAPtr->m_quat;\n" " Quaternion bodyBQuat = bodyBPtr->m_quat;\n" "\n" " float4 pInB = ShapeDataCalcSamplePoint( shapeBPtr, pIdx%(HEIGHT_RES*HEIGHT_RES*6) );\n" "\n" " float4 pInW = transform( &pInB, &bodyBPos, &bodyBQuat );\n" "// Aabb bodyAAabb = bodyAPtr->m_aabb;\n" "// if( AabbOverlapsPoint( &bodyAAabb, pInW ) )\n" " {\n" " float4 pInA = invTransform( &pInW, &bodyAPos, &bodyAQuat );\n" "\n" " float dist = ShapeDataQueryDistance( shapeAPtr, pInA );\n" " if( dist < collisionMargin )\n" " {\n" " int dstIdx = atom_add( lNContacts, 1 );\n" " if( dstIdx < 32 )\n" " {\n" " lCPoints[ dstIdx ] = make_float4( pInA.x, pInA.y, pInA.z, dist );\n" " }\n" " }\n" " }\n" "\n" " pIdx += GET_GROUP_SIZE;\n" " }\n" "}\n" "\n" "void testVtxWithPlane(__local BodyData* bodyA, __local BodyData* bodyB,\n" " float4 nA, __local ShapeDeviceData* shapeB,\n" " __local int* lNContactsA, __local float4* lCPointsA, float collisionMargin)\n" "{\n" " int pIdx = GET_LOCAL_IDX;\n" "\n" " while( pIdx < HEIGHT_RES*HEIGHT_RES*6 )\n" " {\n" " __local BodyData* bodyAPtr =bodyA;\n" " __local BodyData* bodyBPtr =bodyB;\n" " __local ShapeDeviceData* shapeBPtr =shapeB;\n" " __local int* lNContacts =lNContactsA;\n" " __local float4* lCPoints =lCPointsA;\n" "\n" " float4 bodyAPos = bodyAPtr->m_pos;\n" " float4 bodyBPos = bodyBPtr->m_pos;\n" " Quaternion bodyAQuat = bodyAPtr->m_quat;\n" " Quaternion bodyBQuat = bodyBPtr->m_quat;\n" "\n" " float4 pInB = ShapeDataCalcSamplePoint( shapeBPtr, pIdx%(HEIGHT_RES*HEIGHT_RES*6) );\n" "\n" " float4 pInW = transform( &pInB, &bodyBPos, &bodyBQuat );\n" " {\n" " float4 pInA = invTransform( &pInW, &bodyAPos, &bodyAQuat );\n" "\n" " float dist = dot3w1( pInA, nA );//ShapeDataQueryDistance( shapeAPtr, pInA );\n" " if( dist < collisionMargin )\n" " {\n" " int dstIdx = atom_add( lNContacts, 1 );\n" " if( dstIdx < 32 )\n" " {\n" " lCPoints[ dstIdx ] = make_float4( pInA.x, pInA.y, pInA.z, dist );\n" " }\n" " }\n" " }\n" "\n" " pIdx += GET_GROUP_SIZE;\n" " }\n" "}\n" "\n" "#define GET_SHAPE_IDX(x) (int)((x).m_shapeIdx)\n" "\n" "void output(__local BodyData* bodyAPtr, __local BodyData* bodyBPtr,\n" " __local int2* iPair,\n" " __local int* lNContacts, __local float4* lCPoints,\n" " float4 center, \n" " __global ShapeData* shapeData, __global Contact4* contactsOut, float collisionMargin)\n" "{\n" " if( *lNContacts != 0 )\n" " {\n" " int nContacts = min2( *lNContacts, 4 );\n" "\n" " __global Contact4* c = contactsOut;\n" "\n" " if( GET_LOCAL_IDX < nContacts )\n" " {\n" " int i = GET_LOCAL_IDX;\n" " float4 p = lCPoints[i];\n" " float4 bodyAPos = bodyAPtr->m_pos;\n" " Quaternion bodyAQuat = bodyAPtr->m_quat;\n" "\n" " c->m_worldPos[i] = transform( &p, &bodyAPos, &bodyAQuat );\n" " c->m_worldPos[i].w = lCPoints[i].w - collisionMargin;\n" " }\n" "\n" " if( GET_LOCAL_IDX == 0 )\n" " {\n" " float4 contactNormal;\n" " contactNormal = ShapeDataQueryNormal( &shapeData[GET_SHAPE_IDX(*bodyAPtr)], center );\n" " contactNormal = normalize3( qtRotate( bodyAPtr->m_quat, contactNormal ) );\n" "\n" " c->m_worldNormal = contactNormal;\n" "// c->m_restituitionCoeff = 0.f;\n" "// c->m_frictionCoeff = 0.7f;\n" " c->m_coeffs = (u32)(0.f*0xffff) | ((u32)(0.7f*0xffff)<<16);\n" " GET_NPOINTS(*c) = nContacts;\n" " c->m_bodyAPtr = iPair[0].x;\n" " c->m_bodyBPtr = iPair[0].y;\n" " }\n" " }\n" " else\n" " {\n" " if( GET_LOCAL_IDX == 0 )\n" " GET_NPOINTS(contactsOut[0]) = 0;\n" " }\n" "}\n" "\n" "// todo. make it better\n" "void output2(__local BodyData* bodyAPtr, __local BodyData* bodyBPtr,\n" " int pair0, int pair1,\n" " __local int* lNContacts, __local float4* lCPoints,\n" " float4 center, \n" " const __global ShapeData* shapeData, __global Contact4* contactsOut, counter32_t nContactsOut, int capacity,\n" " float collisionMargin )\n" "{\n" " int lIdx = GET_LOCAL_IDX%32;\n" " int nContacts = min2( *lNContacts, 4 );\n" " \n" " GROUP_LDS_BARRIER;\n" "\n" " if( lIdx == 0 && nContacts)\n" " {\n" " int dstIdx;\n" " AppendInc( nContactsOut, dstIdx );\n" " *lNContacts = dstIdx;\n" "\n" " if( dstIdx >= capacity )\n" " *lNContacts = -1;\n" " }\n" "\n" " GROUP_LDS_BARRIER;\n" "\n" " bool canWrite = (*lNContacts!=-1);\n" "\n" " if( nContacts && canWrite )\n" " {\n" " __global Contact4* c = contactsOut + (*lNContacts);\n" "\n" " if( lIdx < nContacts )\n" " {\n" " int i = lIdx;\n" " float4 p = lCPoints[i];\n" " float4 bodyAPos = bodyAPtr->m_pos;\n" " Quaternion bodyAQuat = bodyAPtr->m_quat;\n" "\n" " p = transform( &p, &bodyAPos, &bodyAQuat );\n" " p.w = lCPoints[i].w - collisionMargin;\n" " c->m_worldPos[i] = p;\n" " }\n" "\n" " if( lIdx == 0 )\n" " {\n" " if( nContacts )\n" " {\n" " float4 contactNormal;\n" " contactNormal = ShapeDataQueryNormal( &shapeData[GET_SHAPE_IDX(*bodyAPtr)], center );\n" " contactNormal = normalize3( qtRotate( bodyAPtr->m_quat, contactNormal ) );\n" "\n" " c->m_worldNormal = contactNormal;\n" "// c->m_restituitionCoeff = 0.f;\n" "// c->m_frictionCoeff = 0.7f;\n" " c->m_coeffs = (u32)(0.f*0xffff) | ((u32)(0.7f*0xffff)<<16);\n" " c->m_bodyAPtr = pair0;\n" " c->m_bodyBPtr = pair1;\n" " }\n" " GET_NPOINTS(*c) = nContacts;\n" " }\n" " }\n" "}\n" "\n" "__inline\n" "void output2LDS(__local BodyData* bodyAPtr, __local BodyData* bodyBPtr,\n" " int pair0, int pair1,\n" " int lNContacts, __local float4* lCPoints,\n" " float4 center, \n" " const __global ShapeData* shapeData, __local Contact4* contactsOut,\n" " float collisionMargin )\n" "{\n" " int lIdx = GET_LOCAL_IDX%32;\n" "// int lIdx = GET_LOCAL_IDX;\n" "// int groupIdx = 0;\n" "\n" " int nContacts = min2( lNContacts, 4 );\n" " \n" " GROUP_LDS_BARRIER;\n" "\n" " if( nContacts != 0 )\n" " {\n" " if( lIdx < nContacts )\n" " {\n" " int i = lIdx;\n" " float4 p = lCPoints[i];\n" " float4 bodyAPos = bodyAPtr->m_pos;\n" " Quaternion bodyAQuat = bodyAPtr->m_quat;\n" "\n" " p = transform( &p, &bodyAPos, &bodyAQuat );\n" " p.w = lCPoints[i].w - collisionMargin;\n" " contactsOut->m_worldPos[i] = p;\n" " }\n" " }\n" "\n" " if( lIdx == 0 )\n" " {\n" " if( nContacts != 0 )\n" " {\n" " float4 contactNormal;\n" " contactNormal = ShapeDataQueryNormal( &shapeData[GET_SHAPE_IDX(*bodyAPtr)], center );\n" " contactNormal = normalize3( qtRotate( bodyAPtr->m_quat, contactNormal ) );\n" "\n" " contactsOut->m_worldNormal = contactNormal;\n" "// contactsOut->m_worldNormal = make_float4(1.5f,1.4f,1.3f,0.f);\n" "// contactsOut->m_restituitionCoeff = 0.f;\n" "// contactsOut->m_frictionCoeff = 0.7f;\n" " contactsOut->m_coeffs = (u32)(0.f*0xffff) | ((u32)(0.7f*0xffff)<<16);\n" " contactsOut->m_bodyAPtr = pair0;\n" " contactsOut->m_bodyBPtr = pair1;\n" " }\n" " GET_NPOINTS(*contactsOut) = nContacts;//nContacts;\n" " }\n" "\n" "// contactsOut[groupIdx].m_worldNormal = make_float4(1.5f,1.4f,1.3f,0.f);\n" "}\n" "\n" "void output2_1(__local BodyData* bodyAPtr, __local BodyData* bodyBPtr,\n" " int pair0, int pair1,\n" " __local int* lNContacts, __local float4* lCPoints,\n" " float4 center, float4 nA, \n" " const __global ShapeData* shapeData, __global Contact4* contactsOut, counter32_t nContactsOut, int capacity, float collisionMargin )\n" "{\n" " int lIdx = GET_LOCAL_IDX;\n" " int nContacts = min2( *lNContacts, 4 );\n" " \n" " GROUP_LDS_BARRIER;\n" "\n" " if( lIdx == 0 && nContacts)\n" " {\n" " int dstIdx;\n" " AppendInc( nContactsOut, dstIdx );\n" " *lNContacts = dstIdx;\n" "\n" " if( dstIdx >= capacity )\n" " *lNContacts = -1;\n" " }\n" "\n" " GROUP_LDS_BARRIER;\n" "\n" " bool canWrite = (*lNContacts!=-1);\n" "\n" " if( nContacts && canWrite )\n" " {\n" " __global Contact4* c = contactsOut + (*lNContacts);\n" "\n" " if( lIdx < nContacts )\n" " {\n" " int i = lIdx;\n" " float4 p = lCPoints[i];\n" " float4 bodyAPos = bodyAPtr->m_pos;\n" " Quaternion bodyAQuat = bodyAPtr->m_quat;\n" "\n" " p = transform( &p, &bodyAPos, &bodyAQuat );\n" " p.w = lCPoints[i].w - collisionMargin;\n" " c->m_worldPos[i] = p;\n" " }\n" "\n" " if( lIdx == 0 )\n" " {\n" " if( nContacts )\n" " {\n" " float4 contactNormal;\n" " contactNormal = nA;//ShapeDataQueryNormal( &shapeData[GET_SHAPE_IDX(*bodyAPtr)], center );\n" " contactNormal = normalize3( qtRotate( bodyAPtr->m_quat, contactNormal ) );\n" "\n" " c->m_worldNormal = contactNormal;\n" "// c->m_restituitionCoeff = 0.f;\n" "// c->m_frictionCoeff = 0.7f;\n" " c->m_coeffs = (u32)(0.f*0xffff) | ((u32)(0.7f*0xffff)<<16);\n" " c->m_bodyAPtr = pair0;\n" " c->m_bodyBPtr = pair1;\n" " }\n" " GET_NPOINTS(*c) = nContacts;\n" " }\n" " }\n" "}\n" "\n" "__kernel\n" "void manifold(__global float4* vIn, __global float4* vOut)\n" "{\n" " __local float4 lCPoints[32];\n" " __local float4 lManifoldBuffer[64];\n" " __local int lNContacts;\n" " __local float4 ab;\n" "\n" " if( GET_LOCAL_IDX<32 )\n" " {\n" " lCPoints[GET_LOCAL_IDX] = vIn[GET_GLOBAL_IDX];\n" " }\n" "\n" " if( GET_LOCAL_IDX == 0 ) \n" " {\n" " lNContacts = 32;\n" " ab = vIn[GET_GLOBAL_IDX];\n" " }\n" "\n" " GROUP_LDS_BARRIER;\n" "\n" " float4 center = extractManifold( lCPoints, lManifoldBuffer, &lNContacts, ab );\n" "\n" " if( GET_LOCAL_IDX < lNContacts )\n" " {\n" " vOut[4*GET_GROUP_IDX+GET_LOCAL_IDX] = lCPoints[GET_LOCAL_IDX];\n" " }\n" "\n" "}\n" "\n" "//#define COMBINE_REDUCTION \n" "\n" "__kernel\n" "__attribute__((reqd_work_group_size(64, 1, 1)))\n" "void NarrowphaseKernel( const __global int2* restrict pairs, const __global ShapeData* shapeData, const __global BodyData* restrict bodyDatas, \n" " __global Contact4* restrict contactsOut,\n" " counter32_t nContactsOut, ConstBuffer cb ) \n" "{\n" " // 2.5K LDS\n" " __local Contact4 ldsContacts[2];\n" " __local BodyData bodyA;\n" " __local BodyData bodyB;\n" " __local ShapeDeviceData shapeA;\n" " __local ShapeDeviceData shapeB;\n" " __local float4 lCPointsA[32*2];\n" " __local int lNContactsA;\n" " __local float4* lCPointsB = lCPointsA+32;\n" " __local int lNContactsB;\n" "#ifdef COMBINE_REDUCTION\n" " __local float4 lManifoldBuffer[64*2];\n" "#else\n" " __local float4 lManifoldBuffer[64];\n" "#endif\n" " __local int2 iPairAB;\n" "\n" " const int capacity = cb.m_capacity;\n" " const float collisionMargin = cb.m_collisionMargin;\n" "\n" "\n" " int pairIdx = GET_GROUP_IDX;\n" "// for(int pairIdx = GET_GROUP_IDX; pairIdxm_height4[idx] = shapeData[ myShapeIdx ].m_height4[idx];\n" "\n" " idx+=32;\n" " }\n" " }\n" "\n" " GROUP_LDS_BARRIER;\n" "\n" " testVtx2( &bodyA, &bodyB, &shapeA, &shapeB, &lNContactsA, lCPointsA, &lNContactsB, lCPointsB, collisionMargin );\n" "\n" " GROUP_LDS_BARRIER;\n" "\n" " float4 ab = bodyB.m_pos - bodyA.m_pos;\n" " float4 center[2];\n" "\n" " if( lNContactsA != 0 || lNContactsB != 0 )\n" " {\n" " float4 abInA;\n" " abInA = qtInvRotate( bodyA.m_quat, ab );\n" "\n" " float4 abInB;\n" " abInB = qtInvRotate( bodyB.m_quat, ab );\n" "\n" "#ifdef COMBINE_REDUCTION\n" " extractManifold2( lCPointsA, &lNContactsA, abInA,\n" " lCPointsB, &lNContactsB, abInB,\n" " lManifoldBuffer, center );\n" "#else\n" " if( lNContactsA != 0 )\n" " center[0] = extractManifold( lCPointsA, lManifoldBuffer, &lNContactsA, abInA );\n" " if( lNContactsB != 0 )\n" " center[1] = extractManifold( lCPointsB, lManifoldBuffer, &lNContactsB, abInB );\n" "#endif\n" " }\n" "\n" " int firstSet = GET_LOCAL_IDX/32;\n" "\n" "/*\n" " if( GET_LOCAL_IDX == 0 ) // for debug\n" " {\n" " ldsContacts[0].m_worldNormal = make_float4(-1,-1,-1,0);\n" " ldsContacts[0].m_bodyAPtr = 0;\n" " ldsContacts[0].m_bodyBPtr = 0;\n" " ldsContacts[0].m_batchIdx = 111;\n" " ldsContacts[1].m_worldNormal = make_float4(-1,-1,-1,0);\n" " ldsContacts[1].m_bodyAPtr = 0;\n" " ldsContacts[1].m_bodyBPtr = 0;\n" " ldsContacts[1].m_batchIdx = 111;\n" " }\n" "*/\n" " bool doReduction = true;\n" " if( doReduction )\n" " {\n" " GROUP_LDS_BARRIER;\n" "\n" " output2LDS( (firstSet)?&bodyA: &bodyB, (firstSet)?&bodyB : &bodyA, \n" " (firstSet)?iPairAB.x : iPairAB.y, (firstSet)?iPairAB.y : iPairAB.x, \n" " (firstSet)?lNContactsA : lNContactsB, (firstSet)?lCPointsA:lCPointsB, \n" " (firstSet)?center[0] : center[1], shapeData, (firstSet)?&ldsContacts[0]: &ldsContacts[1], collisionMargin );\n" "\n" " GROUP_LDS_BARRIER;\n" " \n" " if( GET_LOCAL_IDX == 0 )\n" " {\n" " if( lNContactsA && lNContactsB )\n" " {\n" " float nDotn = dot3F4( ldsContacts[0].m_worldNormal, ldsContacts[1].m_worldNormal );\n" " if( nDotn < -(1.f-0.01f) )\n" " {\n" " if( ldsContacts[0].m_bodyAPtr > ldsContacts[1].m_bodyAPtr )\n" " lNContactsA = 0;\n" " else\n" " lNContactsB = 0;\n" " }\n" " }\n" " }\n" " \n" " if( GET_LOCAL_IDX == 0 )\n" " {\n" " int n = lNContactsA;\n" " if( n != 0 )\n" " {\n" " int dstIdx;\n" " AppendInc( nContactsOut, dstIdx );\n" " if( dstIdx < capacity )\n" " { int idx = 0;\n" " contactsOut[ dstIdx ] = ldsContacts[idx];\n" " contactsOut[ dstIdx].m_batchIdx = pairIdx;\n" " }\n" " }\n" "\n" " n = lNContactsB;\n" " if( n != 0 )\n" " {\n" " int dstIdx;\n" " AppendInc( nContactsOut, dstIdx );\n" " if( dstIdx < capacity )\n" " { int idx = 1;\n" " contactsOut[ dstIdx ] = ldsContacts[idx];\n" " contactsOut[ dstIdx].m_batchIdx = pairIdx;\n" " }\n" " }\n" " }\n" "\n" " GROUP_LDS_BARRIER;\n" " }\n" " else\n" " {\n" " //output2( (firstSet)?&bodyA: &bodyB, (firstSet)?&bodyB : &bodyA, \n" " // (firstSet)?iPairAB.x : iPairAB.y, (firstSet)?iPairAB.y : iPairAB.x, \n" " // (firstSet)?&lNContactsA : &lNContactsB, (firstSet)?lCPointsA:lCPointsB, \n" " // (firstSet)?center[0] : center[1], shapeData, contactsOut, nContactsOut, capacity, collisionMargin );\n" " }\n" " }\n" "}\n" "\n" "\n" "__kernel\n" "__attribute__((reqd_work_group_size(64, 1, 1)))\n" "void NarrowphaseWithPlaneKernel( const __global int2* restrict pairs, const __global ShapeData* shapeData, const __global BodyData* restrict bodyDatas, \n" " __global Contact4* restrict contactsOut,\n" " counter32_t nContactsOut, ConstBuffer cb ) \n" "{\n" " // 2.5K LDS\n" " __local BodyData bodyA;\n" " __local BodyData bodyB;\n" " __local ShapeDeviceData shapeA;\n" " __local ShapeDeviceData shapeB;\n" " __local float4 lCPointsA[32*2];\n" " __local int lNContactsA;\n" "// __local float4* lCPointsB = lCPointsA+32;\n" "// __local int lNContactsB;\n" " __local float4 lManifoldBuffer[64];\n" " __local int2 iPairAB;\n" "\n" " const int capacity = cb.m_capacity;\n" " const float collisionMargin = cb.m_collisionMargin;\n" "\n" " int pairIdx = GET_GROUP_IDX;\n" " {\n" " if( GET_LOCAL_IDX == 0 ) // load Bodies\n" " {\n" " int2 pair = pairs[pairIdx];\n" " iPairAB = make_int2(pair.x, pair.y);\n" " bodyA = bodyDatas[ pair.x ];\n" " bodyB = bodyDatas[ pair.y ];\n" " shapeA.m_scale = shapeData[ GET_SHAPE_IDX(bodyA) ].m_scale;\n" " shapeB.m_scale = shapeData[ GET_SHAPE_IDX(bodyB) ].m_scale;\n" " lNContactsA = 0;\n" "// lNContactsB = 0;\n" " }\n" "\n" " GROUP_LDS_BARRIER;\n" "\n" " if (bodyB.m_invMass == 0.f)\n" " return;\n" " \n" " // todo. can check if the shape is the same to previous one. If same, dont read\n" " { // load shape data\n" " int idx = GET_LOCAL_IDX%32;\n" " int bIdx = GET_LOCAL_IDX/32;\n" " __local ShapeDeviceData* myShape = (bIdx==0)?&shapeA: &shapeB;\n" " int myShapeIdx = (bIdx==0)?GET_SHAPE_IDX(bodyA): GET_SHAPE_IDX(bodyB);\n" "\n" " while( idx < HEIGHT_RES*HEIGHT_RES*6/4 )\n" " {\n" " myShape->m_height4[idx] = shapeData[ myShapeIdx ].m_height4[idx];\n" "\n" " idx+=32;\n" " }\n" " }\n" "\n" " GROUP_LDS_BARRIER;\n" "\n" " float4 nA = make_float4(0,1,0,0);\n" "\n" "\n" "// testVtx2( &bodyA, &bodyB, &shapeA, &shapeB, &lNContactsA, lCPointsA, &lNContactsB, lCPointsB );\n" " testVtxWithPlane( &bodyA, &bodyB, nA, &shapeB, &lNContactsA, lCPointsA, collisionMargin );\n" "\n" " GROUP_LDS_BARRIER;\n" "\n" "// float4 ab = bodyB.m_pos - bodyA.m_pos;\n" " float4 center[2];\n" "\n" " if( lNContactsA != 0 )\n" " {\n" " float4 abInA;\n" " abInA = nA;//qtInvRotate( bodyA.m_quat, ab );\n" "\n" " if( lNContactsA != 0 )\n" " center[0] = extractManifold( lCPointsA, lManifoldBuffer, &lNContactsA, abInA );\n" " }\n" "\n" "// int firstSet = GET_LOCAL_IDX/32;\n" "\n" " output2_1( &bodyA, &bodyB, \n" " iPairAB.x, iPairAB.y, \n" " &lNContactsA, lCPointsA, \n" " center[0], nA, shapeData, contactsOut, nContactsOut, capacity, collisionMargin );\n" " }\n" "}\n" ;