Files
bullet3/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/ChNarrowphaseKernels.h

1617 lines
44 KiB
C

static const char* narrowphaseKernelsCL= \
"\n"
"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
"\n"
"#ifdef cl_ext_atomic_counters_32\n"
"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
"#else\n"
"#define counter32_t volatile global int*\n"
"#endif\n"
"\n"
"\n"
"typedef unsigned int u32;\n"
"typedef unsigned short u16;\n"
"typedef unsigned char u8;\n"
"\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GET_NUM_GROUPS get_num_groups(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
"#define AtomInc(x) atom_inc(&(x))\n"
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
"#define AppendInc(x, out) out = atomic_inc(x)\n"
"#define AtomAdd(x, value) atom_add(&(x), value)\n"
"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
"\n"
"\n"
"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
"\n"
"#define make_float4 (float4)\n"
"#define make_float2 (float2)\n"
"#define make_uint4 (uint4)\n"
"#define make_int4 (int4)\n"
"#define make_uint2 (uint2)\n"
"#define make_int2 (int2)\n"
"\n"
"\n"
"#define max2 max\n"
"#define min2 min\n"
"\n"
"\n"
"///////////////////////////////////////\n"
"// Vector\n"
"///////////////////////////////////////\n"
"__inline\n"
"float fastDiv(float numerator, float denominator)\n"
"{\n"
" return native_divide(numerator, denominator); \n"
"// return numerator/denominator; \n"
"}\n"
"\n"
"__inline\n"
"float4 fastDiv4(float4 numerator, float4 denominator)\n"
"{\n"
" return native_divide(numerator, denominator); \n"
"}\n"
"\n"
"__inline\n"
"float fastSqrtf(float f2)\n"
"{\n"
" return native_sqrt(f2);\n"
"// return sqrt(f2);\n"
"}\n"
"\n"
"__inline\n"
"float fastRSqrt(float f2)\n"
"{\n"
" return native_rsqrt(f2);\n"
"}\n"
"\n"
"__inline\n"
"float fastLength4(float4 v)\n"
"{\n"
" return fast_length(v);\n"
"}\n"
"\n"
"__inline\n"
"float4 fastNormalize4(float4 v)\n"
"{\n"
" return fast_normalize(v);\n"
"}\n"
"\n"
"\n"
"__inline\n"
"float sqrtf(float a)\n"
"{\n"
"// return sqrt(a);\n"
" return native_sqrt(a);\n"
"}\n"
"\n"
"__inline\n"
"float4 cross3(float4 a, float4 b)\n"
"{\n"
" return cross(a,b);\n"
"}\n"
"\n"
"__inline\n"
"float dot3F4(float4 a, float4 b)\n"
"{\n"
" float4 a1 = make_float4(a.xyz,0.f);\n"
" float4 b1 = make_float4(b.xyz,0.f);\n"
" return dot(a1, b1);\n"
"}\n"
"\n"
"__inline\n"
"float length3(const float4 a)\n"
"{\n"
" return sqrtf(dot3F4(a,a));\n"
"}\n"
"\n"
"__inline\n"
"float dot4(const float4 a, const float4 b)\n"
"{\n"
" return dot( a, b );\n"
"}\n"
"\n"
"// for height\n"
"__inline\n"
"float dot3w1(const float4 point, const float4 eqn)\n"
"{\n"
" return dot3F4(point,eqn) + eqn.w;\n"
"}\n"
"\n"
"__inline\n"
"float4 normalize3(const float4 a)\n"
"{\n"
" float4 n = make_float4(a.x, a.y, a.z, 0.f);\n"
" return fastNormalize4( n );\n"
"// float length = sqrtf(dot3F4(a, a));\n"
"// return 1.f/length * a;\n"
"}\n"
"\n"
"__inline\n"
"float4 normalize4(const float4 a)\n"
"{\n"
" float length = sqrtf(dot4(a, a));\n"
" return 1.f/length * a;\n"
"}\n"
"\n"
"__inline\n"
"float4 createEquation(const float4 a, const float4 b, const float4 c)\n"
"{\n"
" float4 eqn;\n"
" float4 ab = b-a;\n"
" float4 ac = c-a;\n"
" eqn = normalize3( cross3(ab, ac) );\n"
" eqn.w = -dot3F4(eqn,a);\n"
" return eqn;\n"
"}\n"
"\n"
"///////////////////////////////////////\n"
"// Matrix3x3\n"
"///////////////////////////////////////\n"
"\n"
"typedef struct\n"
"{\n"
" float4 m_row[3];\n"
"}Matrix3x3;\n"
"\n"
"__inline\n"
"Matrix3x3 mtZero();\n"
"\n"
"__inline\n"
"Matrix3x3 mtIdentity();\n"
"\n"
"__inline\n"
"Matrix3x3 mtTranspose(Matrix3x3 m);\n"
"\n"
"__inline\n"
"Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b);\n"
"\n"
"__inline\n"
"float4 mtMul1(Matrix3x3 a, float4 b);\n"
"\n"
"__inline\n"
"float4 mtMul3(float4 a, Matrix3x3 b);\n"
"\n"
"__inline\n"
"Matrix3x3 mtZero()\n"
"{\n"
" Matrix3x3 m;\n"
" m.m_row[0] = (float4)(0.f);\n"
" m.m_row[1] = (float4)(0.f);\n"
" m.m_row[2] = (float4)(0.f);\n"
" return m;\n"
"}\n"
"\n"
"__inline\n"
"Matrix3x3 mtIdentity()\n"
"{\n"
" Matrix3x3 m;\n"
" m.m_row[0] = (float4)(1,0,0,0);\n"
" m.m_row[1] = (float4)(0,1,0,0);\n"
" m.m_row[2] = (float4)(0,0,1,0);\n"
" return m;\n"
"}\n"
"\n"
"__inline\n"
"Matrix3x3 mtTranspose(Matrix3x3 m)\n"
"{\n"
" Matrix3x3 out;\n"
" out.m_row[0] = (float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n"
" out.m_row[1] = (float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n"
" out.m_row[2] = (float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n"
" return out;\n"
"}\n"
"\n"
"__inline\n"
"Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b)\n"
"{\n"
" Matrix3x3 transB;\n"
" transB = mtTranspose( b );\n"
" Matrix3x3 ans;\n"
" // why this doesn't run when 0ing in the for{}\n"
" a.m_row[0].w = 0.f;\n"
" a.m_row[1].w = 0.f;\n"
" a.m_row[2].w = 0.f;\n"
" for(int i=0; i<3; i++)\n"
" {\n"
"// a.m_row[i].w = 0.f;\n"
" ans.m_row[i].x = dot3F4(a.m_row[i],transB.m_row[0]);\n"
" ans.m_row[i].y = dot3F4(a.m_row[i],transB.m_row[1]);\n"
" ans.m_row[i].z = dot3F4(a.m_row[i],transB.m_row[2]);\n"
" ans.m_row[i].w = 0.f;\n"
" }\n"
" return ans;\n"
"}\n"
"\n"
"__inline\n"
"float4 mtMul1(Matrix3x3 a, float4 b)\n"
"{\n"
" float4 ans;\n"
" ans.x = dot3F4( a.m_row[0], b );\n"
" ans.y = dot3F4( a.m_row[1], b );\n"
" ans.z = dot3F4( a.m_row[2], b );\n"
" ans.w = 0.f;\n"
" return ans;\n"
"}\n"
"\n"
"__inline\n"
"float4 mtMul3(float4 a, Matrix3x3 b)\n"
"{\n"
" float4 colx = make_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n"
" float4 coly = make_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
" float4 colz = make_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
"\n"
" float4 ans;\n"
" ans.x = dot3F4( a, colx );\n"
" ans.y = dot3F4( a, coly );\n"
" ans.z = dot3F4( a, colz );\n"
" return ans;\n"
"}\n"
"\n"
"///////////////////////////////////////\n"
"// Quaternion\n"
"///////////////////////////////////////\n"
"\n"
"typedef float4 Quaternion;\n"
"\n"
"__inline\n"
"Quaternion qtMul(Quaternion a, Quaternion b);\n"
"\n"
"__inline\n"
"Quaternion qtNormalize(Quaternion in);\n"
"\n"
"__inline\n"
"float4 qtRotate(Quaternion q, float4 vec);\n"
"\n"
"__inline\n"
"Quaternion qtInvert(Quaternion q);\n"
"\n"
"__inline\n"
"Matrix3x3 qtGetRotationMatrix(Quaternion q);\n"
"\n"
"\n"
"\n"
"__inline\n"
"Quaternion qtMul(Quaternion a, Quaternion b)\n"
"{\n"
" Quaternion ans;\n"
" ans = cross3( a, b );\n"
" ans += a.w*b+b.w*a;\n"
"// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n"
" ans.w = a.w*b.w - dot3F4(a, b);\n"
" return ans;\n"
"}\n"
"\n"
"__inline\n"
"Quaternion qtNormalize(Quaternion in)\n"
"{\n"
" return fastNormalize4(in);\n"
"// in /= length( in );\n"
"// return in;\n"
"}\n"
"__inline\n"
"float4 qtRotate(Quaternion q, float4 vec)\n"
"{\n"
" Quaternion qInv = qtInvert( q );\n"
" float4 vcpy = vec;\n"
" vcpy.w = 0.f;\n"
" float4 out = qtMul(qtMul(q,vcpy),qInv);\n"
" return out;\n"
"}\n"
"\n"
"__inline\n"
"Quaternion qtInvert(Quaternion q)\n"
"{\n"
" return (Quaternion)(-q.xyz, q.w);\n"
"}\n"
"\n"
"__inline\n"
"float4 qtInvRotate(const Quaternion q, float4 vec)\n"
"{\n"
" return qtRotate( qtInvert( q ), vec );\n"
"}\n"
"\n"
"__inline\n"
"Matrix3x3 qtGetRotationMatrix(Quaternion quat)\n"
"{\n"
" float4 quat2 = (float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n"
" Matrix3x3 out;\n"
"\n"
" out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n"
" out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n"
" out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;\n"
" out.m_row[0].w = 0.f;\n"
"\n"
" out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;\n"
" out.m_row[1].y=1-2*quat2.x-2*quat2.z;\n"
" out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;\n"
" out.m_row[1].w = 0.f;\n"
"\n"
" out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;\n"
" out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;\n"
" out.m_row[2].z=1-2*quat2.x-2*quat2.y;\n"
" out.m_row[2].w = 0.f;\n"
"\n"
" return out;\n"
"}\n"
"\n"
"\n"
"#define WG_SIZE 64\n"
"#define HEIGHT_RES 4\n"
"#define SHAPE_CONVEX_HEIGHT_FIELD 1//keep this in sync with AdlCollisionShape.h!\n"
"\n"
"typedef struct\n"
"{\n"
" float4 m_normal[HEIGHT_RES*HEIGHT_RES*6];\n"
" u32 m_height4[HEIGHT_RES*HEIGHT_RES*6];\n"
" u32 m_supportHeight4[HEIGHT_RES*HEIGHT_RES*6];\n"
"\n"
" float m_scale;\n"
" float m_padding0;\n"
" float m_padding1;\n"
" float m_padding2;\n"
"} ShapeData;\n"
"\n"
"typedef struct\n"
"{\n"
" u32 m_height4[HEIGHT_RES*HEIGHT_RES*6/4];\n"
"\n"
" float m_scale;\n"
"} ShapeDeviceData;\n"
"\n"
"typedef struct\n"
"{\n"
" float4 m_pos;\n"
" float4 m_quat;\n"
" float4 m_linVel;\n"
" float4 m_angVel;\n"
"\n"
" u32 m_shapeIdx;\n"
" u32 m_shapeType;\n"
" \n"
" float m_invMass;\n"
" float m_restituitionCoeff;\n"
" float m_frictionCoeff;\n"
"} BodyData;\n"
"\n"
"typedef struct\n"
"{\n"
" float4 m_worldPos[4];\n"
" float4 m_worldNormal; // w: m_nPoints\n"
"// float m_restituitionCoeff;\n"
"// float m_frictionCoeff;\n"
" u32 m_coeffs;\n"
" u32 m_batchIdx;\n"
"// int m_nPoints;\n"
"// int m_padding0;\n"
"\n"
" u32 m_bodyAPtr;//x:m_bodyAPtr, y:m_bodyBPtr\n"
" u32 m_bodyBPtr;\n"
"} Contact4;\n"
"\n"
"#define GET_NPOINTS(x) (x).m_worldNormal.w\n"
"\n"
"\n"
"typedef struct\n"
"{\n"
" int m_nPairs;\n"
" float m_collisionMargin;\n"
" int m_capacity;\n"
" int m_paddings[1];\n"
"} ConstBuffer;\n"
"\n"
"__inline\n"
"float4 transform(const float4* p, const float4* translation, const Quaternion* orientation)\n"
"{\n"
" return qtRotate( *orientation, *p ) + (*translation);\n"
"}\n"
"\n"
"__inline\n"
"float4 invTransform(const float4* p, const float4* translation, const Quaternion* orientation)\n"
"{\n"
" return qtRotate( qtInvert( *orientation ), (*p)-(*translation) ); // use qtInvRotate\n"
"}\n"
"\n"
"void CubeMapUtilsCalcCrd(const float4 p, int* faceIdxOut, float* x, float* y)\n"
"{\n"
" {\n"
" int idx;\n"
" float r2[] = {p.x*p.x, p.y*p.y, p.z*p.z};\n"
"\n"
" if (r2[1]>r2[0])\n"
" {\n"
" if (r2[2]>r2[1])\n"
" {\n"
" idx = 2;\n"
" \n"
" } else\n"
" {\n"
" idx = 1;\n"
" }\n"
" \n"
" } else\n"
" {\n"
" if (r2[2]>r2[0])\n"
" {\n"
" idx = 2;\n"
" } else\n"
" {\n"
" idx = 0;\n"
" }\n"
" }\n"
"\n"
" *faceIdxOut = (idx*2);\n"
"//==\n"
" float4 abs = make_float4( fabs(p.x), fabs(p.y), fabs(p.z), 0.f );\n"
"\n"
" float d;\n"
" if( idx == 0 )\n"
" {\n"
" *x = p.y;\n"
" *y = p.z;\n"
" d = abs.x;\n"
" *faceIdxOut += (p.x < 0.f)? 0: 1.f;\n"
" }\n"
" else if( idx == 1 )\n"
" {\n"
" *x = p.z;\n"
" *y = p.x;\n"
" d = abs.y;\n"
" *faceIdxOut += (p.y < 0.f)? 0: 1.f;\n"
" }\n"
" else\n"
" {\n"
" *x = p.x;\n"
" *y = p.y;\n"
" d = abs.z;\n"
" *faceIdxOut += (p.z < 0.f)? 0: 1.f;\n"
" }\n"
"\n"
" float dInv = (d==0.f)? 0.f: fastDiv(1.f,d);\n"
" *x = (*x*dInv+1.f)*0.5f;\n"
" *y = (*y*dInv+1.f)*0.5f;\n"
" }\n"
"}\n"
"\n"
"float4 CubeMapUtilsCalcVector(int faceIdx, float x, float y)\n"
"{\n"
" int dir = faceIdx/2;\n"
" float z = (faceIdx%2 == 0)? -1.f:1.f;\n"
"\n"
" x = x*2.f-1.f;\n"
" y = y*2.f-1.f;\n"
" \n"
" if( dir == 0 )\n"
" {\n"
" return make_float4(z, x, y, 0.f);\n"
" }\n"
" else if( dir == 1 )\n"
" {\n"
" return make_float4(y,z,x, 0.f);\n"
" }\n"
" else\n"
" {\n"
" return make_float4(x,y,z, 0.f);\n"
" }\n"
"}\n"
"\n"
"typedef int Face;\n"
"\n"
"u32 sample(__local ShapeDeviceData* shape, int face, int x, int y)\n"
"{\n"
"\n"
" int idx = HEIGHT_RES*HEIGHT_RES*face + x + y*HEIGHT_RES;\n"
" __local u8* height = (__local u8*)shape->m_height4;\n"
" return height[idx];\n"
"}\n"
"\n"
"u32 sampleSupportGlobal(__global ShapeData* shape, int face, int x, int y)\n"
"{\n"
"\n"
" int idx = HEIGHT_RES*HEIGHT_RES*face + x + y*HEIGHT_RES;\n"
" __global u8* height = (__global u8*)shape->m_supportHeight4;\n"
" return height[idx];\n"
"}\n"
"\n"
"float4 sampleNormal(__local ShapeData* shape, int face, int x, int y)\n"
"{\n"
" return shape->m_normal[HEIGHT_RES*HEIGHT_RES*face + x + y*HEIGHT_RES];\n"
"}\n"
"\n"
"float4 sampleNormalGlobal(const __global ShapeData* shape, int face, int x, int y)\n"
"{\n"
" return shape->m_normal[HEIGHT_RES*HEIGHT_RES*face + x + y*HEIGHT_RES];\n"
"}\n"
"\n"
"float4 ShapeDataCalcSamplePoint( __local const ShapeDeviceData* shape, int sIdx )//u8 height, int sIdx, float scale )\n"
"{\n"
" const float oneOver255 = 1.f/255.f;\n"
"\n"
" int faceIdx = fastDiv(sIdx,(HEIGHT_RES*HEIGHT_RES));\n"
" int r = (sIdx%(HEIGHT_RES*HEIGHT_RES));\n"
" int i = r/HEIGHT_RES;\n"
" int j = r%HEIGHT_RES;\n"
"\n"
" float4 v;\n"
" float x = fastDiv((i+0.5f),(float)HEIGHT_RES);\n"
" float y = fastDiv((j+0.5f),(float)HEIGHT_RES);\n"
" v = CubeMapUtilsCalcVector(faceIdx, x, y);\n"
" v = normalize3( v );\n"
"\n"
" int quantizedHeight = sample( shape, faceIdx, i, j );\n"
" float rheight = quantizedHeight*oneOver255*shape->m_scale;\n"
" return rheight*v;\n"
"}\n"
"\n"
"float ShapeDataQueryDistance(__local const ShapeDeviceData* shape, float4 p )\n"
"{\n"
" if( dot3F4( p, p ) >= shape->m_scale*shape->m_scale ) return FLT_MAX;\n"
"\n"
" const float oneOver255 = 1.f/255.f;\n"
"\n"
" int faceIdx;\n"
" float x, y;\n"
" CubeMapUtilsCalcCrd( p, &faceIdx, &x, &y );\n"
" x = (x*HEIGHT_RES) - 0.5f;\n"
" y = (y*HEIGHT_RES) - 0.5f;\n"
"\n"
" float height;\n"
" {\n"
" int xi = (int)(x);\n"
" int yi = (int)(y);\n"
" float dx = x-xi;\n"
" float dy = y-yi;\n"
"\n"
" {\n"
" int xip = min2((int)(HEIGHT_RES-1), xi+1);\n"
" int yip = min2((int)(HEIGHT_RES-1), yi+1);\n"
"\n"
" u32 xy = sample( shape, faceIdx, xi, yi );\n"
" u32 xpy = sample( shape, faceIdx, xip, yi );\n"
" u32 xpyp = sample( shape, faceIdx, xip, yip );\n"
" u32 xyp = sample( shape, faceIdx, xi, yip );\n"
"\n"
" height = (xy*(1.f-dx)+xpy*dx)*(1.f-dy) + (xyp*(1.f-dx)+xpyp*dx)*dy;\n"
" height = height*oneOver255*shape->m_scale;\n"
"\n"
" p.w = 0.f;\n"
"\n"
" height = fastLength4( p ) - height;\n"
" }\n"
" }\n"
"\n"
" return height;\n"
"}\n"
"\n"
"float ShapeDataQuerySupportHeight(__global ShapeData* shape, float4 p )\n"
"{\n"
" int faceIdx;\n"
" float x, y;\n"
" CubeMapUtilsCalcCrd( p, &faceIdx, &x, &y );\n"
" x = (x*HEIGHT_RES) - 0.5f;\n"
" y = (y*HEIGHT_RES) - 0.5f;\n"
"\n"
" float height;\n"
" {\n"
" int xi = (int)(x);\n"
" int yi = (int)(y);\n"
"\n"
" {\n"
" int xip = min2((int)(HEIGHT_RES-1), xi+1);\n"
" int yip = min2((int)(HEIGHT_RES-1), yi+1);\n"
"\n"
" u32 xy = sampleSupportGlobal( shape, faceIdx, xi, yi );\n"
" u32 xpy = sampleSupportGlobal( shape, faceIdx, xip, yi );\n"
" u32 xpyp = sampleSupportGlobal( shape, faceIdx, xip, yip );\n"
" u32 xyp = sampleSupportGlobal( shape, faceIdx, xi, yip );\n"
"\n"
" height = max2( xy, max2( xpy, max2( xpyp, xyp ) ) );\n"
" height = height/255.f*shape->m_scale;\n"
" }\n"
" }\n"
"\n"
" return height;\n"
"\n"
"}\n"
"\n"
"float4 ShapeDataQueryNormal(__global const ShapeData* shape, float4 p )\n"
"{\n"
" int faceIdx;\n"
" float x, y;\n"
" CubeMapUtilsCalcCrd( p, &faceIdx, &x, &y );\n"
" x = (x*HEIGHT_RES) - 0.5f;\n"
" y = (y*HEIGHT_RES) - 0.5f;\n"
"\n"
" float4 normalOut;\n"
" {\n"
" int xi = (int)(x);\n"
" int yi = (int)(y);\n"
"\n"
" normalOut = sampleNormalGlobal( shape, faceIdx, xi, yi );\n"
" }\n"
" return normalOut;\n"
"}\n"
"\n"
"\n"
"\n"
"// kernels\n"
"\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"void SupportCullingKernel( __global int2* restrict gPairsIn, __global ShapeData* gShapes, \n"
" __global BodyData* gBodies, \n"
" __global int2* gPairsOut, \n"
" counter32_t gNPairs,\n"
" ConstBuffer cb )\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" if( gIdx >= cb.m_nPairs ) return;\n"
"\n"
" const float collisionMargin = cb.m_collisionMargin;\n"
" const int capacity = cb.m_capacity;\n"
"\n"
" int2 pair = gPairsIn[gIdx];\n"
" BodyData bodyA = gBodies[pair.x];\n"
" BodyData bodyB = gBodies[pair.y];\n"
" int shapeAIdx = bodyA.m_shapeIdx;\n"
" int shapeBIdx = bodyB.m_shapeIdx;\n"
"\n"
"\n"
" bool collide = false;\n"
" \n"
" //only collide if one of the two bodies has a non-zero mass\n"
" if (bodyA.m_invMass==0.f && bodyB.m_invMass==0.f)\n"
" return;\n"
" \n"
" \n"
" if (bodyA.m_shapeType == SHAPE_CONVEX_HEIGHT_FIELD && bodyB.m_shapeType==SHAPE_CONVEX_HEIGHT_FIELD)\n"
" {\n"
" float4 abInA, baInB;\n"
" float4 ab = bodyB.m_pos - bodyA.m_pos;\n"
" {\n"
" abInA = qtInvRotate( bodyA.m_quat, ab );\n"
" baInB = qtInvRotate( bodyB.m_quat, -ab );\n"
" }\n"
" float hA = ShapeDataQuerySupportHeight( gShapes+shapeAIdx, abInA );\n"
" float hB = ShapeDataQuerySupportHeight( gShapes+shapeBIdx, baInB );\n"
"\n"
" float h2 = dot3F4( ab, ab );\n"
"\n"
" collide = ( hA + hB + collisionMargin > sqrtf(h2) );\n"
" }\n"
"\n"
" if( collide )\n"
" {\n"
" int dstIdx;\n"
" AppendInc( gNPairs, dstIdx );\n"
" if( dstIdx < capacity )\n"
" gPairsOut[dstIdx] = pair;\n"
" }\n"
"}\n"
"\n"
"\n"
"#define PARALLEL_DO(execution, n) for(int ie=0; ie<n; ie++){execution;}\n"
"#define PARALLEL_REDUCE_MAX32(h) "
" {int lIdx = GET_LOCAL_IDX;"
" if( lIdx < 32 )"
" {"
" h[lIdx] = (h[lIdx].y > h[lIdx+1].y)? h[lIdx]: h[lIdx+1];"
" mem_fence( CLK_LOCAL_MEM_FENCE );"
" h[lIdx] = (h[lIdx].y > h[lIdx+2].y)? h[lIdx]: h[lIdx+2];"
" mem_fence( CLK_LOCAL_MEM_FENCE );"
" h[lIdx] = (h[lIdx].y > h[lIdx+4].y)? h[lIdx]: h[lIdx+4];"
" mem_fence( CLK_LOCAL_MEM_FENCE );"
" h[lIdx] = (h[lIdx].y > h[lIdx+8].y)? h[lIdx]: h[lIdx+8];"
" mem_fence( CLK_LOCAL_MEM_FENCE );"
" h[lIdx] = (h[lIdx].y > h[lIdx+16].y)? h[lIdx]: h[lIdx+16];"
" }}\n"
"\n"
"#define PARALLEL_REDUCE32(h) "
" {int lIdx = GET_LOCAL_IDX;"
" if( lIdx < 32 )"
" {"
" h[lIdx] += h[lIdx+1];"
" mem_fence( CLK_LOCAL_MEM_FENCE );"
" h[lIdx] += h[lIdx+2];"
" mem_fence( CLK_LOCAL_MEM_FENCE );"
" h[lIdx] += h[lIdx+4];"
" mem_fence( CLK_LOCAL_MEM_FENCE );"
" h[lIdx] += h[lIdx+8];"
" mem_fence( CLK_LOCAL_MEM_FENCE );"
" h[lIdx] += h[lIdx+16];"
" }}\n"
"\n"
"\n"
"float4 extractManifold(__local float4* p, __local float4* h, __local int* nPointsPtr, float4 nearNormal)\n"
"{\n"
" int nPoints = *nPointsPtr;\n"
" float4 center = make_float4(0,0,0,0);\n"
" { // calculate center\n"
" nPoints = min2( nPoints, 32 );\n"
" {\n"
" int lIdx = GET_LOCAL_IDX;\n"
" h[lIdx] = p[lIdx];\n"
" h[lIdx] = (lIdx<nPoints)? h[lIdx] : make_float4(0,0,0,0);\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" PARALLEL_REDUCE32( h );//working on h[64]\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
"// if( GET_LOCAL_IDX == 0 )\n"
" {\n"
" center = fastDiv4( h[0], make_float4(nPoints, nPoints, nPoints, 0.f) );\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" if( nPoints < 4 ) return center;\n"
" }\n"
" // is center set on all the WIs?\n"
" float4 aVector = p[0] - center;\n"
" float4 u = normalize3( cross3( nearNormal, aVector ) );\n"
" float4 v = normalize3( cross3( nearNormal, u ) );\n"
"\n"
" int idx[4];\n"
"\n"
" __local int4* a = (__local int4*)h;\n"
" { // select 4\n"
" { // set dot of 4 directions for xyzw\n"
" int ie = GET_LOCAL_IDX;\n"
" {\n"
" float f;\n"
" float4 r = p[ie]-center;\n"
" f = dot3F4( u, r );\n"
" a[ie].x = ((*(u32*)&f) & 0xffffff00) | (0xff & ie);\n"
"\n"
" f = dot3F4( -u, r );\n"
" a[ie].y = ((*(u32*)&f) & 0xffffff00) | (0xff & ie);\n"
"\n"
" f = dot3F4( v, r );\n"
" a[ie].z = ((*(u32*)&f) & 0xffffff00) | (0xff & ie);\n"
"\n"
" f = dot3F4( -v, r );\n"
" a[ie].w = ((*(u32*)&f) & 0xffffff00) | (0xff & ie);\n"
"\n"
" if( ie >= nPoints ) a[ie] = make_int4(-0xfffffff, -0xfffffff, -0xfffffff, -0xfffffff);\n"
" }\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" { // vector reduce, h[64]\n"
" int lIdx = GET_LOCAL_IDX;\n"
" if( lIdx < 32 )\n"
" {\n"
" h[lIdx] = max2( h[lIdx], h[lIdx+1] );\n"
" mem_fence( CLK_LOCAL_MEM_FENCE );\n"
" h[lIdx] = max2( h[lIdx], h[lIdx+2] );\n"
" mem_fence( CLK_LOCAL_MEM_FENCE );\n"
" h[lIdx] = max2( h[lIdx], h[lIdx+4] );\n"
" mem_fence( CLK_LOCAL_MEM_FENCE );\n"
" h[lIdx] = max2( h[lIdx], h[lIdx+8] );\n"
" mem_fence( CLK_LOCAL_MEM_FENCE );\n"
" h[lIdx] = max2( h[lIdx], h[lIdx+16] );\n"
" }\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
" }\n"
" {\n"
" { // set to idx\n"
" idx[0] = (int)a[0].x & 0xff;\n"
" idx[1] = (int)a[0].y & 0xff;\n"
" idx[2] = (int)a[0].z & 0xff;\n"
" idx[3] = (int)a[0].w & 0xff;\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
" float4 selection;\n"
" if( GET_LOCAL_IDX < 4 ) selection = p[idx[GET_LOCAL_IDX]];\n"
"\n"
" GROUP_LDS_BARRIER;\n"
" if( GET_LOCAL_IDX < 4 ) p[GET_LOCAL_IDX] = selection;\n"
" }\n"
"\n"
"\n"
" return center;\n"
"}\n"
"\n"
"void extractManifold1(__local float4* p, __local float4* h, __local int* nPointsPtr, float4 center)\n"
"{\n"
" __local int* a = (__local int*)h;\n"
" {\n"
" GROUP_LDS_BARRIER;\n"
" float4 selection;\n"
" if( GET_LOCAL_IDX < 4 )\n"
" {\n"
" int idx = (int)a[GET_LOCAL_IDX] & 0xff;\n"
" selection = p[idx];\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
" if( GET_LOCAL_IDX < 4 ) p[GET_LOCAL_IDX] = selection;\n"
" }\n"
"\n"
"}\n"
"\n"
"void extractManifold2( __local float4* p0, __local int* nPointsPtr0, float4 nearNormal0,\n"
" __local float4* p1, __local int* nPointsPtr1, float4 nearNormal1,\n"
" __local float4* h, float4 centerOut[2])\n"
"{\n"
"\n"
" int nPoints[2];\n"
" nPoints[0] = *nPointsPtr0;\n"
" nPoints[1] = *nPointsPtr1;\n"
" float4 center[2];\n"
" center[0] = make_float4(0,0,0,0);\n"
" center[1] = make_float4(0,0,0,0);\n"
" { // calculate center\n"
" nPoints[0] = min2( nPoints[0], 32 );\n"
" nPoints[1] = min2( nPoints[1], 32 );\n"
" {\n"
" int lIdx = GET_LOCAL_IDX;\n"
" h[lIdx] = (lIdx<nPoints[0])? p0[lIdx] : make_float4(0,0,0,0);\n"
" h[lIdx+64] = (lIdx<nPoints[1])? p1[lIdx] : make_float4(0,0,0,0);\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" {\n"
" int bIdx = GET_LOCAL_IDX/32;\n"
" int eIdx = GET_LOCAL_IDX%32;\n"
" int lIdx = eIdx + bIdx*64;\n"
" {\n"
" h[lIdx] += h[lIdx+1];\n"
" mem_fence( CLK_LOCAL_MEM_FENCE );\n"
" h[lIdx] += h[lIdx+2];\n"
" mem_fence( CLK_LOCAL_MEM_FENCE );\n"
" h[lIdx] += h[lIdx+4];\n"
" mem_fence( CLK_LOCAL_MEM_FENCE );\n"
" h[lIdx] += h[lIdx+8];\n"
" mem_fence( CLK_LOCAL_MEM_FENCE );\n"
" h[lIdx] += h[lIdx+16];\n"
" }\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" for(int bIdx=0; bIdx<2; bIdx++)\n"
" {\n"
" center[bIdx] = fastDiv4( h[bIdx*64], make_float4(nPoints[bIdx], nPoints[bIdx], nPoints[bIdx], 0.f) );\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" }\n"
"\n"
" centerOut[0] = center[0];\n"
" centerOut[1] = center[1];\n"
"\n"
" float4 u[2];\n"
" float4 v[2];\n"
"\n"
" {\n"
" float4 aVector = p0[0] - center[0];\n"
" u[0] = normalize3( cross3( nearNormal0, aVector ) );\n"
" v[0] = normalize3( cross3( nearNormal0, u[0] ) );\n"
" }\n"
" {\n"
" float4 aVector = p1[0] - center[1];\n"
" u[1] = normalize3( cross3( nearNormal1, aVector ) );\n"
" v[1] = normalize3( cross3( nearNormal1, u[1] ) );\n"
" }\n"
"\n"
" {\n"
" __local int4* a = (__local int4*)h;\n"
" { // select 4\n"
" { // set dot of 4 directions for xyzw\n"
" int ie = GET_LOCAL_IDX%32;\n"
" int setIdx = GET_LOCAL_IDX/32;\n"
" {\n"
" float f;\n"
" float4 r = p0[ie + setIdx*32]-center[setIdx];\n"
" f = dot3F4( u[setIdx], r );\n"
" a[ie + setIdx*64].x = ((*(u32*)&f) & 0xffffff00) | (0xff & ie);\n"
"\n"
" f = dot3F4( -u[setIdx], r );\n"
" a[ie + setIdx*64].y = ((*(u32*)&f) & 0xffffff00) | (0xff & ie);\n"
"\n"
" f = dot3F4( v[setIdx], r );\n"
" a[ie + setIdx*64].z = ((*(u32*)&f) & 0xffffff00) | (0xff & ie);\n"
"\n"
" f = dot3F4( -v[setIdx], r );\n"
" a[ie + setIdx*64].w = ((*(u32*)&f) & 0xffffff00) | (0xff & ie);\n"
"\n"
" if( ie >= nPoints[setIdx] ) a[ie + setIdx*64] = make_int4(-0xfffffff, -0xfffffff, -0xfffffff, -0xfffffff);\n"
"\n"
" a[ie + 32] = make_int4(-0xfffffff, -0xfffffff, -0xfffffff, -0xfffffff);\n"
" }\n"
" }\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" { // vector reduce, h[64]\n"
" int bIdx = GET_LOCAL_IDX/32;\n"
" int eIdx = GET_LOCAL_IDX%32;\n"
" int lIdx = eIdx + bIdx*64;\n"
" {\n"
" h[lIdx] = max2( h[lIdx], h[lIdx+1] );\n"
" mem_fence( CLK_LOCAL_MEM_FENCE );\n"
" h[lIdx] = max2( h[lIdx], h[lIdx+2] );\n"
" mem_fence( CLK_LOCAL_MEM_FENCE );\n"
" h[lIdx] = max2( h[lIdx], h[lIdx+4] );\n"
" mem_fence( CLK_LOCAL_MEM_FENCE );\n"
" h[lIdx] = max2( h[lIdx], h[lIdx+8] );\n"
" mem_fence( CLK_LOCAL_MEM_FENCE );\n"
" h[lIdx] = max2( h[lIdx], h[lIdx+16] );\n"
" }\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
" }\n"
" __local int* a = (__local int*)h;\n"
" {\n"
" GROUP_LDS_BARRIER;\n"
" \n"
" float4 selection;\n"
"\n"
" int bIdx = GET_LOCAL_IDX/32;\n"
" int eIdx = GET_LOCAL_IDX%32;\n"
"\n"
" if( eIdx < 4 )\n"
" {\n"
" int idx = (int)a[eIdx+64*4*bIdx] & 0xff;\n"
" selection = p0[idx+32*bIdx];\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
" if( eIdx < 4 ) p0[eIdx+32*bIdx] = selection;\n"
" }\n"
"}\n"
"\n"
"/*\n"
"1. Query Normal\n"
"2. Fill Normal\n"
"3. A->B, B->A\n"
"*/\n"
"\n"
"void testVtx(__local BodyData* bodyAPtr, __local BodyData* bodyBPtr,\n"
" __local ShapeDeviceData* shapeAPtr, __local ShapeDeviceData* shapeBPtr,\n"
" __local int* lNContacts, __local float4* lCPoints)\n"
"{\n"
" int pIdx = GET_LOCAL_IDX;\n"
" float4 bodyAPos = bodyAPtr->m_pos;\n"
" float4 bodyBPos = bodyBPtr->m_pos;\n"
" Quaternion bodyAQuat = bodyAPtr->m_quat;\n"
" Quaternion bodyBQuat = bodyBPtr->m_quat;\n"
" while( pIdx < HEIGHT_RES*HEIGHT_RES*6 )\n"
" {\n"
" float4 pInB = ShapeDataCalcSamplePoint( shapeBPtr, pIdx );\n"
"\n"
" float4 pInW = transform( &pInB, &bodyBPos, &bodyBQuat );\n"
"// Aabb bodyAAabb = bodyAPtr->m_aabb;\n"
"// if( AabbOverlapsPoint( &bodyAAabb, pInW ) )\n"
" {\n"
" float4 pInA = invTransform( &pInW, &bodyAPos, &bodyAQuat );\n"
"\n"
" float dist = ShapeDataQueryDistance( shapeAPtr, pInA );\n"
" if( dist < 0.010f )\n"
" {\n"
" int dstIdx = atom_add( lNContacts, 1 );\n"
" if( dstIdx < 32 )\n"
" {\n"
" lCPoints[ dstIdx ] = make_float4( pInA.x, pInA.y, pInA.z, dist );\n"
" }\n"
" }\n"
" }\n"
"\n"
" pIdx += GET_GROUP_SIZE;\n"
" }\n"
"}\n"
"\n"
"void testVtx2(__local const BodyData* bodyA, __local const BodyData* bodyB,\n"
" __local const ShapeDeviceData* shapeA, __local const ShapeDeviceData* shapeB,\n"
" __local int* lNContactsA, __local float4* lCPointsA,\n"
" __local int* lNContactsB, __local float4* lCPointsB, float collisionMargin )\n"
"{\n"
" int pIdx = GET_LOCAL_IDX;\n"
"\n"
" while( pIdx < HEIGHT_RES*HEIGHT_RES*6*2 )\n"
" {\n"
" __local const BodyData* bodyAPtr =( pIdx < HEIGHT_RES*HEIGHT_RES*6 )?bodyA:bodyB;\n"
" __local const BodyData* bodyBPtr =( pIdx < HEIGHT_RES*HEIGHT_RES*6 )?bodyB:bodyA;\n"
" __local const ShapeDeviceData* shapeAPtr =( pIdx < HEIGHT_RES*HEIGHT_RES*6 )?shapeA:shapeB;\n"
" __local const ShapeDeviceData* shapeBPtr =( pIdx < HEIGHT_RES*HEIGHT_RES*6 )?shapeB:shapeA;\n"
" __local int* lNContacts =( pIdx < HEIGHT_RES*HEIGHT_RES*6 )?lNContactsA:lNContactsB;\n"
" __local float4* lCPoints =( pIdx < HEIGHT_RES*HEIGHT_RES*6 )?lCPointsA:lCPointsB;\n"
"\n"
" float4 bodyAPos = bodyAPtr->m_pos;\n"
" float4 bodyBPos = bodyBPtr->m_pos;\n"
" Quaternion bodyAQuat = bodyAPtr->m_quat;\n"
" Quaternion bodyBQuat = bodyBPtr->m_quat;\n"
"\n"
" float4 pInB = ShapeDataCalcSamplePoint( shapeBPtr, pIdx%(HEIGHT_RES*HEIGHT_RES*6) );\n"
"\n"
" float4 pInW = transform( &pInB, &bodyBPos, &bodyBQuat );\n"
"// Aabb bodyAAabb = bodyAPtr->m_aabb;\n"
"// if( AabbOverlapsPoint( &bodyAAabb, pInW ) )\n"
" {\n"
" float4 pInA = invTransform( &pInW, &bodyAPos, &bodyAQuat );\n"
"\n"
" float dist = ShapeDataQueryDistance( shapeAPtr, pInA );\n"
" if( dist < collisionMargin )\n"
" {\n"
" int dstIdx = atom_add( lNContacts, 1 );\n"
" if( dstIdx < 32 )\n"
" {\n"
" lCPoints[ dstIdx ] = make_float4( pInA.x, pInA.y, pInA.z, dist );\n"
" }\n"
" }\n"
" }\n"
"\n"
" pIdx += GET_GROUP_SIZE;\n"
" }\n"
"}\n"
"\n"
"void testVtxWithPlane(__local BodyData* bodyA, __local BodyData* bodyB,\n"
" float4 nA, __local ShapeDeviceData* shapeB,\n"
" __local int* lNContactsA, __local float4* lCPointsA, float collisionMargin)\n"
"{\n"
" int pIdx = GET_LOCAL_IDX;\n"
"\n"
" while( pIdx < HEIGHT_RES*HEIGHT_RES*6 )\n"
" {\n"
" __local BodyData* bodyAPtr =bodyA;\n"
" __local BodyData* bodyBPtr =bodyB;\n"
" __local ShapeDeviceData* shapeBPtr =shapeB;\n"
" __local int* lNContacts =lNContactsA;\n"
" __local float4* lCPoints =lCPointsA;\n"
"\n"
" float4 bodyAPos = bodyAPtr->m_pos;\n"
" float4 bodyBPos = bodyBPtr->m_pos;\n"
" Quaternion bodyAQuat = bodyAPtr->m_quat;\n"
" Quaternion bodyBQuat = bodyBPtr->m_quat;\n"
"\n"
" float4 pInB = ShapeDataCalcSamplePoint( shapeBPtr, pIdx%(HEIGHT_RES*HEIGHT_RES*6) );\n"
"\n"
" float4 pInW = transform( &pInB, &bodyBPos, &bodyBQuat );\n"
" {\n"
" float4 pInA = invTransform( &pInW, &bodyAPos, &bodyAQuat );\n"
"\n"
" float dist = dot3w1( pInA, nA );//ShapeDataQueryDistance( shapeAPtr, pInA );\n"
" if( dist < collisionMargin )\n"
" {\n"
" int dstIdx = atom_add( lNContacts, 1 );\n"
" if( dstIdx < 32 )\n"
" {\n"
" lCPoints[ dstIdx ] = make_float4( pInA.x, pInA.y, pInA.z, dist );\n"
" }\n"
" }\n"
" }\n"
"\n"
" pIdx += GET_GROUP_SIZE;\n"
" }\n"
"}\n"
"\n"
"#define GET_SHAPE_IDX(x) (int)((x).m_shapeIdx)\n"
"\n"
"void output(__local BodyData* bodyAPtr, __local BodyData* bodyBPtr,\n"
" __local int2* iPair,\n"
" __local int* lNContacts, __local float4* lCPoints,\n"
" float4 center, \n"
" __global ShapeData* shapeData, __global Contact4* contactsOut, float collisionMargin)\n"
"{\n"
" if( *lNContacts != 0 )\n"
" {\n"
" int nContacts = min2( *lNContacts, 4 );\n"
"\n"
" __global Contact4* c = contactsOut;\n"
"\n"
" if( GET_LOCAL_IDX < nContacts )\n"
" {\n"
" int i = GET_LOCAL_IDX;\n"
" float4 p = lCPoints[i];\n"
" float4 bodyAPos = bodyAPtr->m_pos;\n"
" Quaternion bodyAQuat = bodyAPtr->m_quat;\n"
"\n"
" c->m_worldPos[i] = transform( &p, &bodyAPos, &bodyAQuat );\n"
" c->m_worldPos[i].w = lCPoints[i].w - collisionMargin;\n"
" }\n"
"\n"
" if( GET_LOCAL_IDX == 0 )\n"
" {\n"
" float4 contactNormal;\n"
" contactNormal = ShapeDataQueryNormal( &shapeData[GET_SHAPE_IDX(*bodyAPtr)], center );\n"
" contactNormal = normalize3( qtRotate( bodyAPtr->m_quat, contactNormal ) );\n"
"\n"
" c->m_worldNormal = contactNormal;\n"
"// c->m_restituitionCoeff = 0.f;\n"
"// c->m_frictionCoeff = 0.7f;\n"
" c->m_coeffs = (u32)(0.f*0xffff) | ((u32)(0.7f*0xffff)<<16);\n"
" GET_NPOINTS(*c) = nContacts;\n"
" c->m_bodyAPtr = iPair[0].x;\n"
" c->m_bodyBPtr = iPair[0].y;\n"
" }\n"
" }\n"
" else\n"
" {\n"
" if( GET_LOCAL_IDX == 0 )\n"
" GET_NPOINTS(contactsOut[0]) = 0;\n"
" }\n"
"}\n"
"\n"
"// todo. make it better\n"
"void output2(__local BodyData* bodyAPtr, __local BodyData* bodyBPtr,\n"
" int pair0, int pair1,\n"
" __local int* lNContacts, __local float4* lCPoints,\n"
" float4 center, \n"
" const __global ShapeData* shapeData, __global Contact4* contactsOut, counter32_t nContactsOut, int capacity,\n"
" float collisionMargin )\n"
"{\n"
" int lIdx = GET_LOCAL_IDX%32;\n"
" int nContacts = min2( *lNContacts, 4 );\n"
" \n"
" GROUP_LDS_BARRIER;\n"
"\n"
" if( lIdx == 0 && nContacts)\n"
" {\n"
" int dstIdx;\n"
" AppendInc( nContactsOut, dstIdx );\n"
" *lNContacts = dstIdx;\n"
"\n"
" if( dstIdx >= capacity )\n"
" *lNContacts = -1;\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" bool canWrite = (*lNContacts!=-1);\n"
"\n"
" if( nContacts && canWrite )\n"
" {\n"
" __global Contact4* c = contactsOut + (*lNContacts);\n"
"\n"
" if( lIdx < nContacts )\n"
" {\n"
" int i = lIdx;\n"
" float4 p = lCPoints[i];\n"
" float4 bodyAPos = bodyAPtr->m_pos;\n"
" Quaternion bodyAQuat = bodyAPtr->m_quat;\n"
"\n"
" p = transform( &p, &bodyAPos, &bodyAQuat );\n"
" p.w = lCPoints[i].w - collisionMargin;\n"
" c->m_worldPos[i] = p;\n"
" }\n"
"\n"
" if( lIdx == 0 )\n"
" {\n"
" if( nContacts )\n"
" {\n"
" float4 contactNormal;\n"
" contactNormal = ShapeDataQueryNormal( &shapeData[GET_SHAPE_IDX(*bodyAPtr)], center );\n"
" contactNormal = normalize3( qtRotate( bodyAPtr->m_quat, contactNormal ) );\n"
"\n"
" c->m_worldNormal = contactNormal;\n"
"// c->m_restituitionCoeff = 0.f;\n"
"// c->m_frictionCoeff = 0.7f;\n"
" c->m_coeffs = (u32)(0.f*0xffff) | ((u32)(0.7f*0xffff)<<16);\n"
" c->m_bodyAPtr = pair0;\n"
" c->m_bodyBPtr = pair1;\n"
" }\n"
" GET_NPOINTS(*c) = nContacts;\n"
" }\n"
" }\n"
"}\n"
"\n"
"__inline\n"
"void output2LDS(__local BodyData* bodyAPtr, __local BodyData* bodyBPtr,\n"
" int pair0, int pair1,\n"
" int lNContacts, __local float4* lCPoints,\n"
" float4 center, \n"
" const __global ShapeData* shapeData, __local Contact4* contactsOut,\n"
" float collisionMargin )\n"
"{\n"
" int lIdx = GET_LOCAL_IDX%32;\n"
"// int lIdx = GET_LOCAL_IDX;\n"
"// int groupIdx = 0;\n"
"\n"
" int nContacts = min2( lNContacts, 4 );\n"
" \n"
" GROUP_LDS_BARRIER;\n"
"\n"
" if( nContacts != 0 )\n"
" {\n"
" if( lIdx < nContacts )\n"
" {\n"
" int i = lIdx;\n"
" float4 p = lCPoints[i];\n"
" float4 bodyAPos = bodyAPtr->m_pos;\n"
" Quaternion bodyAQuat = bodyAPtr->m_quat;\n"
"\n"
" p = transform( &p, &bodyAPos, &bodyAQuat );\n"
" p.w = lCPoints[i].w - collisionMargin;\n"
" contactsOut->m_worldPos[i] = p;\n"
" }\n"
" }\n"
"\n"
" if( lIdx == 0 )\n"
" {\n"
" if( nContacts != 0 )\n"
" {\n"
" float4 contactNormal;\n"
" contactNormal = ShapeDataQueryNormal( &shapeData[GET_SHAPE_IDX(*bodyAPtr)], center );\n"
" contactNormal = normalize3( qtRotate( bodyAPtr->m_quat, contactNormal ) );\n"
"\n"
" contactsOut->m_worldNormal = contactNormal;\n"
"// contactsOut->m_worldNormal = make_float4(1.5f,1.4f,1.3f,0.f);\n"
"// contactsOut->m_restituitionCoeff = 0.f;\n"
"// contactsOut->m_frictionCoeff = 0.7f;\n"
" contactsOut->m_coeffs = (u32)(0.f*0xffff) | ((u32)(0.7f*0xffff)<<16);\n"
" contactsOut->m_bodyAPtr = pair0;\n"
" contactsOut->m_bodyBPtr = pair1;\n"
" }\n"
" GET_NPOINTS(*contactsOut) = nContacts;//nContacts;\n"
" }\n"
"\n"
"// contactsOut[groupIdx].m_worldNormal = make_float4(1.5f,1.4f,1.3f,0.f);\n"
"}\n"
"\n"
"void output2_1(__local BodyData* bodyAPtr, __local BodyData* bodyBPtr,\n"
" int pair0, int pair1,\n"
" __local int* lNContacts, __local float4* lCPoints,\n"
" float4 center, float4 nA, \n"
" const __global ShapeData* shapeData, __global Contact4* contactsOut, counter32_t nContactsOut, int capacity, float collisionMargin )\n"
"{\n"
" int lIdx = GET_LOCAL_IDX;\n"
" int nContacts = min2( *lNContacts, 4 );\n"
" \n"
" GROUP_LDS_BARRIER;\n"
"\n"
" if( lIdx == 0 && nContacts)\n"
" {\n"
" int dstIdx;\n"
" AppendInc( nContactsOut, dstIdx );\n"
" *lNContacts = dstIdx;\n"
"\n"
" if( dstIdx >= capacity )\n"
" *lNContacts = -1;\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" bool canWrite = (*lNContacts!=-1);\n"
"\n"
" if( nContacts && canWrite )\n"
" {\n"
" __global Contact4* c = contactsOut + (*lNContacts);\n"
"\n"
" if( lIdx < nContacts )\n"
" {\n"
" int i = lIdx;\n"
" float4 p = lCPoints[i];\n"
" float4 bodyAPos = bodyAPtr->m_pos;\n"
" Quaternion bodyAQuat = bodyAPtr->m_quat;\n"
"\n"
" p = transform( &p, &bodyAPos, &bodyAQuat );\n"
" p.w = lCPoints[i].w - collisionMargin;\n"
" c->m_worldPos[i] = p;\n"
" }\n"
"\n"
" if( lIdx == 0 )\n"
" {\n"
" if( nContacts )\n"
" {\n"
" float4 contactNormal;\n"
" contactNormal = nA;//ShapeDataQueryNormal( &shapeData[GET_SHAPE_IDX(*bodyAPtr)], center );\n"
" contactNormal = normalize3( qtRotate( bodyAPtr->m_quat, contactNormal ) );\n"
"\n"
" c->m_worldNormal = contactNormal;\n"
"// c->m_restituitionCoeff = 0.f;\n"
"// c->m_frictionCoeff = 0.7f;\n"
" c->m_coeffs = (u32)(0.f*0xffff) | ((u32)(0.7f*0xffff)<<16);\n"
" c->m_bodyAPtr = pair0;\n"
" c->m_bodyBPtr = pair1;\n"
" }\n"
" GET_NPOINTS(*c) = nContacts;\n"
" }\n"
" }\n"
"}\n"
"\n"
"__kernel\n"
"void manifold(__global float4* vIn, __global float4* vOut)\n"
"{\n"
" __local float4 lCPoints[32];\n"
" __local float4 lManifoldBuffer[64];\n"
" __local int lNContacts;\n"
" __local float4 ab;\n"
"\n"
" if( GET_LOCAL_IDX<32 )\n"
" {\n"
" lCPoints[GET_LOCAL_IDX] = vIn[GET_GLOBAL_IDX];\n"
" }\n"
"\n"
" if( GET_LOCAL_IDX == 0 ) \n"
" {\n"
" lNContacts = 32;\n"
" ab = vIn[GET_GLOBAL_IDX];\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" float4 center = extractManifold( lCPoints, lManifoldBuffer, &lNContacts, ab );\n"
"\n"
" if( GET_LOCAL_IDX < lNContacts )\n"
" {\n"
" vOut[4*GET_GROUP_IDX+GET_LOCAL_IDX] = lCPoints[GET_LOCAL_IDX];\n"
" }\n"
"\n"
"}\n"
"\n"
"//#define COMBINE_REDUCTION \n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64, 1, 1)))\n"
"void NarrowphaseKernel( const __global int2* restrict pairs, const __global ShapeData* shapeData, const __global BodyData* restrict bodyDatas, \n"
" __global Contact4* restrict contactsOut,\n"
" counter32_t nContactsOut, ConstBuffer cb ) \n"
"{\n"
" // 2.5K LDS\n"
" __local Contact4 ldsContacts[2];\n"
" __local BodyData bodyA;\n"
" __local BodyData bodyB;\n"
" __local ShapeDeviceData shapeA;\n"
" __local ShapeDeviceData shapeB;\n"
" __local float4 lCPointsA[32*2];\n"
" __local int lNContactsA;\n"
" __local float4* lCPointsB = lCPointsA+32;\n"
" __local int lNContactsB;\n"
"#ifdef COMBINE_REDUCTION\n"
" __local float4 lManifoldBuffer[64*2];\n"
"#else\n"
" __local float4 lManifoldBuffer[64];\n"
"#endif\n"
" __local int2 iPairAB;\n"
"\n"
" const int capacity = cb.m_capacity;\n"
" const float collisionMargin = cb.m_collisionMargin;\n"
"\n"
"\n"
" int pairIdx = GET_GROUP_IDX;\n"
"// for(int pairIdx = GET_GROUP_IDX; pairIdx<nPairs; pairIdx+=GET_NUM_GROUPS)\n"
" {\n"
" if( GET_LOCAL_IDX == 0 ) // load Bodies\n"
" {\n"
" int2 pair = pairs[pairIdx];\n"
" iPairAB = make_int2(pair.x, pair.y);\n"
" bodyA = bodyDatas[ pair.x ];\n"
" bodyB = bodyDatas[ pair.y ];\n"
" shapeA.m_scale = shapeData[ GET_SHAPE_IDX(bodyA) ].m_scale;\n"
" shapeB.m_scale = shapeData[ GET_SHAPE_IDX(bodyB) ].m_scale;\n"
" lNContactsA = 0;\n"
" lNContactsB = 0;\n"
" }\n"
" \n"
" GROUP_LDS_BARRIER;\n"
"\n"
" // todo. can check if the shape is the same to previous one. If same, dont read\n"
" { // load shape data\n"
" int idx = GET_LOCAL_IDX%32;\n"
" int bIdx = GET_LOCAL_IDX/32;\n"
" __local ShapeDeviceData* myShape = (bIdx==0)?&shapeA: &shapeB;\n"
" int myShapeIdx = (bIdx==0)?GET_SHAPE_IDX(bodyA): GET_SHAPE_IDX(bodyB);\n"
"\n"
" while( idx < HEIGHT_RES*HEIGHT_RES*6/4 )\n"
" {\n"
" myShape->m_height4[idx] = shapeData[ myShapeIdx ].m_height4[idx];\n"
"\n"
" idx+=32;\n"
" }\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" testVtx2( &bodyA, &bodyB, &shapeA, &shapeB, &lNContactsA, lCPointsA, &lNContactsB, lCPointsB, collisionMargin );\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" float4 ab = bodyB.m_pos - bodyA.m_pos;\n"
" float4 center[2];\n"
"\n"
" if( lNContactsA != 0 || lNContactsB != 0 )\n"
" {\n"
" float4 abInA;\n"
" abInA = qtInvRotate( bodyA.m_quat, ab );\n"
"\n"
" float4 abInB;\n"
" abInB = qtInvRotate( bodyB.m_quat, ab );\n"
"\n"
"#ifdef COMBINE_REDUCTION\n"
" extractManifold2( lCPointsA, &lNContactsA, abInA,\n"
" lCPointsB, &lNContactsB, abInB,\n"
" lManifoldBuffer, center );\n"
"#else\n"
" if( lNContactsA != 0 )\n"
" center[0] = extractManifold( lCPointsA, lManifoldBuffer, &lNContactsA, abInA );\n"
" if( lNContactsB != 0 )\n"
" center[1] = extractManifold( lCPointsB, lManifoldBuffer, &lNContactsB, abInB );\n"
"#endif\n"
" }\n"
"\n"
" int firstSet = GET_LOCAL_IDX/32;\n"
"\n"
"/*\n"
" if( GET_LOCAL_IDX == 0 ) // for debug\n"
" {\n"
" ldsContacts[0].m_worldNormal = make_float4(-1,-1,-1,0);\n"
" ldsContacts[0].m_bodyAPtr = 0;\n"
" ldsContacts[0].m_bodyBPtr = 0;\n"
" ldsContacts[0].m_batchIdx = 111;\n"
" ldsContacts[1].m_worldNormal = make_float4(-1,-1,-1,0);\n"
" ldsContacts[1].m_bodyAPtr = 0;\n"
" ldsContacts[1].m_bodyBPtr = 0;\n"
" ldsContacts[1].m_batchIdx = 111;\n"
" }\n"
"*/\n"
" bool doReduction = true;\n"
" if( doReduction )\n"
" {\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" output2LDS( (firstSet)?&bodyA: &bodyB, (firstSet)?&bodyB : &bodyA, \n"
" (firstSet)?iPairAB.x : iPairAB.y, (firstSet)?iPairAB.y : iPairAB.x, \n"
" (firstSet)?lNContactsA : lNContactsB, (firstSet)?lCPointsA:lCPointsB, \n"
" (firstSet)?center[0] : center[1], shapeData, (firstSet)?&ldsContacts[0]: &ldsContacts[1], collisionMargin );\n"
"\n"
" GROUP_LDS_BARRIER;\n"
" \n"
" if( GET_LOCAL_IDX == 0 )\n"
" {\n"
" if( lNContactsA && lNContactsB )\n"
" {\n"
" float nDotn = dot3F4( ldsContacts[0].m_worldNormal, ldsContacts[1].m_worldNormal );\n"
" if( nDotn < -(1.f-0.01f) )\n"
" {\n"
" if( ldsContacts[0].m_bodyAPtr > ldsContacts[1].m_bodyAPtr )\n"
" lNContactsA = 0;\n"
" else\n"
" lNContactsB = 0;\n"
" }\n"
" }\n"
" }\n"
" \n"
" if( GET_LOCAL_IDX == 0 )\n"
" {\n"
" int n = lNContactsA;\n"
" if( n != 0 )\n"
" {\n"
" int dstIdx;\n"
" AppendInc( nContactsOut, dstIdx );\n"
" if( dstIdx < capacity )\n"
" { int idx = 0;\n"
" contactsOut[ dstIdx ] = ldsContacts[idx];\n"
" contactsOut[ dstIdx].m_batchIdx = pairIdx;\n"
" }\n"
" }\n"
"\n"
" n = lNContactsB;\n"
" if( n != 0 )\n"
" {\n"
" int dstIdx;\n"
" AppendInc( nContactsOut, dstIdx );\n"
" if( dstIdx < capacity )\n"
" { int idx = 1;\n"
" contactsOut[ dstIdx ] = ldsContacts[idx];\n"
" contactsOut[ dstIdx].m_batchIdx = pairIdx;\n"
" }\n"
" }\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
" }\n"
" else\n"
" {\n"
" //output2( (firstSet)?&bodyA: &bodyB, (firstSet)?&bodyB : &bodyA, \n"
" // (firstSet)?iPairAB.x : iPairAB.y, (firstSet)?iPairAB.y : iPairAB.x, \n"
" // (firstSet)?&lNContactsA : &lNContactsB, (firstSet)?lCPointsA:lCPointsB, \n"
" // (firstSet)?center[0] : center[1], shapeData, contactsOut, nContactsOut, capacity, collisionMargin );\n"
" }\n"
" }\n"
"}\n"
"\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64, 1, 1)))\n"
"void NarrowphaseWithPlaneKernel( const __global int2* restrict pairs, const __global ShapeData* shapeData, const __global BodyData* restrict bodyDatas, \n"
" __global Contact4* restrict contactsOut,\n"
" counter32_t nContactsOut, ConstBuffer cb ) \n"
"{\n"
" // 2.5K LDS\n"
" __local BodyData bodyA;\n"
" __local BodyData bodyB;\n"
" __local ShapeDeviceData shapeA;\n"
" __local ShapeDeviceData shapeB;\n"
" __local float4 lCPointsA[32*2];\n"
" __local int lNContactsA;\n"
"// __local float4* lCPointsB = lCPointsA+32;\n"
"// __local int lNContactsB;\n"
" __local float4 lManifoldBuffer[64];\n"
" __local int2 iPairAB;\n"
"\n"
" const int capacity = cb.m_capacity;\n"
" const float collisionMargin = cb.m_collisionMargin;\n"
"\n"
" int pairIdx = GET_GROUP_IDX;\n"
" {\n"
" if( GET_LOCAL_IDX == 0 ) // load Bodies\n"
" {\n"
" int2 pair = pairs[pairIdx];\n"
" iPairAB = make_int2(pair.x, pair.y);\n"
" bodyA = bodyDatas[ pair.x ];\n"
" bodyB = bodyDatas[ pair.y ];\n"
" shapeA.m_scale = shapeData[ GET_SHAPE_IDX(bodyA) ].m_scale;\n"
" shapeB.m_scale = shapeData[ GET_SHAPE_IDX(bodyB) ].m_scale;\n"
" lNContactsA = 0;\n"
"// lNContactsB = 0;\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" if (bodyB.m_invMass == 0.f)\n"
" return;\n"
" \n"
" // todo. can check if the shape is the same to previous one. If same, dont read\n"
" { // load shape data\n"
" int idx = GET_LOCAL_IDX%32;\n"
" int bIdx = GET_LOCAL_IDX/32;\n"
" __local ShapeDeviceData* myShape = (bIdx==0)?&shapeA: &shapeB;\n"
" int myShapeIdx = (bIdx==0)?GET_SHAPE_IDX(bodyA): GET_SHAPE_IDX(bodyB);\n"
"\n"
" while( idx < HEIGHT_RES*HEIGHT_RES*6/4 )\n"
" {\n"
" myShape->m_height4[idx] = shapeData[ myShapeIdx ].m_height4[idx];\n"
"\n"
" idx+=32;\n"
" }\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" float4 nA = make_float4(0,1,0,0);\n"
"\n"
"\n"
"// testVtx2( &bodyA, &bodyB, &shapeA, &shapeB, &lNContactsA, lCPointsA, &lNContactsB, lCPointsB );\n"
" testVtxWithPlane( &bodyA, &bodyB, nA, &shapeB, &lNContactsA, lCPointsA, collisionMargin );\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
"// float4 ab = bodyB.m_pos - bodyA.m_pos;\n"
" float4 center[2];\n"
"\n"
" if( lNContactsA != 0 )\n"
" {\n"
" float4 abInA;\n"
" abInA = nA;//qtInvRotate( bodyA.m_quat, ab );\n"
"\n"
" if( lNContactsA != 0 )\n"
" center[0] = extractManifold( lCPointsA, lManifoldBuffer, &lNContactsA, abInA );\n"
" }\n"
"\n"
"// int firstSet = GET_LOCAL_IDX/32;\n"
"\n"
" output2_1( &bodyA, &bodyB, \n"
" iPairAB.x, iPairAB.y, \n"
" &lNContactsA, lCPointsA, \n"
" center[0], nA, shapeData, contactsOut, nContactsOut, capacity, collisionMargin );\n"
" }\n"
"}\n"
;