1629 lines
38 KiB
Common Lisp
1629 lines
38 KiB
Common Lisp
/*
|
|
Copyright (c) 2012 Advanced Micro Devices, Inc.
|
|
|
|
This software is provided 'as-is', without any express or implied warranty.
|
|
In no event will the authors be held liable for any damages arising from the use of this software.
|
|
Permission is granted to anyone to use this software for any purpose,
|
|
including commercial applications, and to alter it and redistribute it freely,
|
|
subject to the following restrictions:
|
|
|
|
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
|
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
|
3. This notice may not be removed or altered from any source distribution.
|
|
*/
|
|
//Originally written by Takahiro Harada
|
|
|
|
|
|
// Extensions enabled for this file: vendor printf plus 32-bit base and
// extended atomics on both local and global memory.
#pragma OPENCL EXTENSION cl_amd_printf : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable

// counter32_t: when the cl_ext_atomic_counters_32 feature macro is absent,
// fall back to a pointer to a volatile global int; otherwise enable the extension.
#ifndef cl_ext_atomic_counters_32
#define counter32_t volatile global int*
#else
#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable
#endif
|
|
|
|
|
|
// Short aliases for the unsigned integer types used throughout this file.
typedef unsigned char  u8;
typedef unsigned short u16;
typedef unsigned int   u32;
|
|
|
|
// Work-item / work-group queries, all along dimension 0.
#define GET_GROUP_IDX get_group_id(0)
#define GET_LOCAL_IDX get_local_id(0)
#define GET_GLOBAL_IDX get_global_id(0)
#define GET_GROUP_SIZE get_local_size(0)
#define GET_NUM_GROUPS get_num_groups(0)

// Local-memory synchronisation helpers.
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)

// Atomic wrappers. The ones taking `x` by name apply & themselves; AppendInc
// expects `x` to already be a pointer/counter. Arguments and whole expansions
// are parenthesized so that compound expressions expand safely.
#define AtomInc(x) atom_inc(&(x))
#define AtomInc1(x, out) ((out) = atom_inc(&(x)))
#define AppendInc(x, out) ((out) = atomic_inc(x))
#define AtomAdd(x, value) atom_add(&(x), (value))
#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), (cmp), (value) )
#define AtomXhg(x, value) atom_xchg ( &(x), (value) )

// Component-wise select; note the (b, a, condition) argument order is passed
// straight through to select().
#define SELECT_UINT4( b, a, condition ) select( (b), (a), (condition) )

// Constructor-style spellings: make_float4(x,y,z,w) expands to (float4)(x,y,z,w), etc.
#define make_float4 (float4)
#define make_float2 (float2)
#define make_uint4 (uint4)
#define make_int4 (int4)
#define make_uint2 (uint2)
#define make_int2 (int2)

// Aliases for the built-in max/min.
#define max2 max
#define min2 min
|
|
|
|
|
|
///////////////////////////////////////
|
|
// Vector
|
|
///////////////////////////////////////
|
|
// Scalar division via native_divide (the exact form would be numerator/denominator).
__inline
float fastDiv(float numerator, float denominator)
{
    const float quotient = native_divide(numerator, denominator);
    return quotient;
}
|
|
|
|
// Component-wise division of two float4 values via native_divide.
__inline
float4 fastDiv4(float4 numerator, float4 denominator)
{
    const float4 quotient = native_divide(numerator, denominator);
    return quotient;
}
|
|
|
|
// Square root via native_sqrt (the exact form would be sqrt(f2)).
__inline
float fastSqrtf(float f2)
{
    const float root = native_sqrt(f2);
    return root;
}
|
|
|
|
// Reciprocal square root via native_rsqrt.
__inline
float fastRSqrt(float f2)
{
    const float invRoot = native_rsqrt(f2);
    return invRoot;
}
|
|
|
|
// Length of all four components of v via fast_length.
__inline
float fastLength4(float4 v)
{
    const float len = fast_length(v);
    return len;
}
|
|
|
|
// Normalizes v over all four components via fast_normalize.
__inline
float4 fastNormalize4(float4 v)
{
    const float4 unit = fast_normalize(v);
    return unit;
}
|
|
|
|
|
|
// File-local sqrtf: forwards to native_sqrt (the exact form would be sqrt(a)).
__inline
float sqrtf(float a)
{
    const float root = native_sqrt(a);
    return root;
}
|
|
|
|
// Cross product of a and b using the built-in cross() on float4.
__inline
float4 cross3(float4 a, float4 b)
{
    const float4 product = cross(a, b);
    return product;
}
|
|
|
|
// Dot product of the xyz parts of a and b; both w components are replaced
// by zero before calling dot(), so w never contributes.
__inline
float dot3F4(float4 a, float4 b)
{
    return dot( make_float4(a.xyz, 0.f), make_float4(b.xyz, 0.f) );
}
|
|
|
|
// Length of the xyz part of a: sqrtf of its 3-component self dot product.
__inline
float length3(const float4 a)
{
    const float lengthSquared = dot3F4(a, a);
    return sqrtf(lengthSquared);
}
|
|
|
|
// Full four-component dot product of a and b.
__inline
float dot4(const float4 a, const float4 b)
{
    const float result = dot(a, b);
    return result;
}
|
|
|
|
// for height
|
|
// Evaluates a plane equation at a point: dot(point.xyz, eqn.xyz) + eqn.w.
__inline
float dot3w1(const float4 point, const float4 eqn)
{
    const float projected = dot3F4(point, eqn);
    return projected + eqn.w;
}
|
|
|
|
// Normalizes the xyz part of a. w is forced to zero first, so it neither
// affects the length nor survives into the result.
__inline
float4 normalize3(const float4 a)
{
    const float4 xyzOnly = make_float4(a.x, a.y, a.z, 0.f);
    return fastNormalize4( xyzOnly );
}
|
|
|
|
// Normalizes a over all four components: scales a by 1 / sqrtf(dot4(a, a)).
__inline
float4 normalize4(const float4 a)
{
    const float invLength = 1.f/sqrtf(dot4(a, a));
    return invLength * a;
}
|
|
|
|
// Builds the plane equation through points a, b and c.
// xyz is the normalized cross product of (b-a) and (c-a); w is set so that
// dot3w1(a, eqn) evaluates to zero.
__inline
float4 createEquation(const float4 a, const float4 b, const float4 c)
{
    float4 plane = normalize3( cross3(b-a, c-a) );
    plane.w = -dot3F4(plane, a);
    return plane;
}
|
|
|
|
///////////////////////////////////////
|
|
// Matrix3x3
|
|
///////////////////////////////////////
|
|
|
|
// 3x3 matrix stored as three float4 rows. The helpers below only ever use
// the xyz part of each row and write 0 into w where they build rows.
typedef struct
{
    float4 m_row[3];
} Matrix3x3;
|
|
|
|
// Forward declarations of the Matrix3x3 helpers defined below.
__inline Matrix3x3 mtZero();
__inline Matrix3x3 mtIdentity();
__inline Matrix3x3 mtTranspose(Matrix3x3 m);
__inline Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b);
__inline float4 mtMul1(Matrix3x3 a, float4 b);
__inline float4 mtMul3(float4 a, Matrix3x3 b);
|
|
|
|
// Returns a matrix with every component of every row set to zero.
__inline
Matrix3x3 mtZero()
{
    Matrix3x3 zero;
    for(int row=0; row<3; row++)
    {
        zero.m_row[row] = (float4)(0.f);
    }
    return zero;
}
|
|
|
|
// Returns the identity matrix (row w components are zero).
__inline
Matrix3x3 mtIdentity()
{
    Matrix3x3 identity;
    identity.m_row[0] = (float4)(1.f, 0.f, 0.f, 0.f);
    identity.m_row[1] = (float4)(0.f, 1.f, 0.f, 0.f);
    identity.m_row[2] = (float4)(0.f, 0.f, 1.f, 0.f);
    return identity;
}
|
|
|
|
// Returns the transpose of the 3x3 part of m; each output row gathers one
// column of m and gets w = 0.
__inline
Matrix3x3 mtTranspose(Matrix3x3 m)
{
    const float4 r0 = m.m_row[0];
    const float4 r1 = m.m_row[1];
    const float4 r2 = m.m_row[2];

    Matrix3x3 transposed;
    transposed.m_row[0] = (float4)(r0.x, r1.x, r2.x, 0.f);
    transposed.m_row[1] = (float4)(r0.y, r1.y, r2.y, 0.f);
    transposed.m_row[2] = (float4)(r0.z, r1.z, r2.z, 0.f);
    return transposed;
}
|
|
|
|
// Matrix product a*b of the 3x3 parts. b is transposed first so each result
// element is a 3-component dot product of a row of a with a row of b^T.
// Result rows get w = 0.
__inline
Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b)
{
    const Matrix3x3 bT = mtTranspose( b );

    // The original author noted that zeroing these inside the loop below did
    // not work for them, so the three writes stay unrolled ahead of the loop.
    a.m_row[0].w = 0.f;
    a.m_row[1].w = 0.f;
    a.m_row[2].w = 0.f;

    Matrix3x3 product;
    for(int row=0; row<3; row++)
    {
        const float4 lhs = a.m_row[row];
        product.m_row[row].x = dot3F4(lhs, bT.m_row[0]);
        product.m_row[row].y = dot3F4(lhs, bT.m_row[1]);
        product.m_row[row].z = dot3F4(lhs, bT.m_row[2]);
        product.m_row[row].w = 0.f;
    }
    return product;
}
|
|
|
|
// Matrix * column vector: each output component is the 3-component dot
// product of one row of a with b. The result's w is 0.
__inline
float4 mtMul1(Matrix3x3 a, float4 b)
{
    const float rx = dot3F4( a.m_row[0], b );
    const float ry = dot3F4( a.m_row[1], b );
    const float rz = dot3F4( a.m_row[2], b );
    return (float4)(rx, ry, rz, 0.f);
}
|
|
|
|
// Row vector * matrix: each output component is the 3-component dot product
// of a with one column of b.
// The result's w is explicitly set to 0 (matching mtMul1); previously it was
// left uninitialized, so callers received an indeterminate w.
__inline
float4 mtMul3(float4 a, Matrix3x3 b)
{
    // Gather the three columns of b.
    float4 colx = make_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);
    float4 coly = make_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);
    float4 colz = make_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);

    float4 ans;
    ans.x = dot3F4( a, colx );
    ans.y = dot3F4( a, coly );
    ans.z = dot3F4( a, colz );
    ans.w = 0.f;
    return ans;
}
|
|
|
|
///////////////////////////////////////
|
|
// Quaternion
|
|
///////////////////////////////////////
|
|
|
|
// Quaternion stored in a float4: xyz is the vector part, w the scalar part
// (see qtMul / qtInvert below).
typedef float4 Quaternion;

// Forward declarations of the quaternion helpers defined below.
__inline Quaternion qtMul(Quaternion a, Quaternion b);
__inline Quaternion qtNormalize(Quaternion in);
__inline float4 qtRotate(Quaternion q, float4 vec);
__inline Quaternion qtInvert(Quaternion q);
__inline Matrix3x3 qtGetRotationMatrix(Quaternion q);
|
|
|
|
|
|
|
|
// Quaternion product a*b.
// Vector part: cross(a, b) + a.w*b + b.w*a; scalar part: a.w*b.w - dot(a.xyz, b.xyz).
// The intermediate sum also touches w, but w is overwritten on the last step.
__inline
Quaternion qtMul(Quaternion a, Quaternion b)
{
    Quaternion product = cross3( a, b );
    product += a.w*b+b.w*a;
    product.w = a.w*b.w - dot3F4(a, b);
    return product;
}
|
|
|
|
// Normalizes a quaternion over all four components via fastNormalize4.
__inline
Quaternion qtNormalize(Quaternion in)
{
    const Quaternion unit = fastNormalize4(in);
    return unit;
}
|
|
// Rotates vec by q: computes q * (vec.xyz, 0) * qtInvert(q).
// vec.w is ignored (replaced by 0 before the products).
__inline
float4 qtRotate(Quaternion q, float4 vec)
{
    const Quaternion conj = qtInvert( q );
    const float4 pure = (float4)(vec.xyz, 0.f);
    const Quaternion halfway = qtMul(q, pure);
    return qtMul(halfway, conj);
}
|
|
|
|
// Returns q with its vector part negated and w unchanged (the conjugate).
// No division by the norm is performed here.
__inline
Quaternion qtInvert(Quaternion q)
{
    Quaternion conj = q;
    conj.xyz = -q.xyz;
    return conj;
}
|
|
|
|
// Rotates vec by the inverse of q, i.e. qtRotate with qtInvert(q).
__inline
float4 qtInvRotate(const Quaternion q, float4 vec)
{
    const Quaternion inverse = qtInvert( q );
    return qtRotate( inverse, vec );
}
|
|
|
|
// Expands a quaternion into a 3x3 rotation matrix, one row at a time.
// Every row's w is written as 0.
// NOTE(review): the formulas contain no normalization — presumably quat is
// expected to be unit length; confirm at the call sites.
__inline
Matrix3x3 qtGetRotationMatrix(Quaternion quat)
{
    // Squares of the vector components, shared by the diagonal terms.
    const float4 sq = (float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);

    Matrix3x3 rot;

    rot.m_row[0].x = 1.f - 2.f*sq.y - 2.f*sq.z;
    rot.m_row[0].y = 2.f*quat.x*quat.y - 2.f*quat.w*quat.z;
    rot.m_row[0].z = 2.f*quat.x*quat.z + 2.f*quat.w*quat.y;
    rot.m_row[0].w = 0.f;

    rot.m_row[1].x = 2.f*quat.x*quat.y + 2.f*quat.w*quat.z;
    rot.m_row[1].y = 1.f - 2.f*sq.x - 2.f*sq.z;
    rot.m_row[1].z = 2.f*quat.y*quat.z - 2.f*quat.w*quat.x;
    rot.m_row[1].w = 0.f;

    rot.m_row[2].x = 2.f*quat.x*quat.z - 2.f*quat.w*quat.y;
    rot.m_row[2].y = 2.f*quat.y*quat.z + 2.f*quat.w*quat.x;
    rot.m_row[2].z = 1.f - 2.f*sq.x - 2.f*sq.y;
    rot.m_row[2].w = 0.f;

    return rot;
}
|
|
|
|
|
|
// Work-group size required by the kernels in this file.
#define WG_SIZE 64
// Height-field resolution per cube-map face (HEIGHT_RES x HEIGHT_RES texels).
#define HEIGHT_RES 4
// Shape-type tag for convex height fields.
#define SHAPE_CONVEX_HEIGHT_FIELD 1//keep this in sync with AdlCollisionShape.h!
|
|
|
|
// Full convex height-field shape: one entry per texel of the six
// HEIGHT_RES x HEIGHT_RES cube-map faces.
typedef struct
{
    float4 m_normal[HEIGHT_RES*HEIGHT_RES*6];       // per-texel normal (see sampleNormal / sampleNormalGlobal)
    u32 m_height4[HEIGHT_RES*HEIGHT_RES*6];
    u32 m_supportHeight4[HEIGHT_RES*HEIGHT_RES*6];  // indexed byte-wise by sampleSupportGlobal

    float m_scale;      // quantized heights are converted as value/255 * m_scale
    float m_padding0;
    float m_padding1;
    float m_padding2;
} ShapeData;
|
|
|
|
// Compact height-field shape: only the quantized heights and the scale.
typedef struct
{
    // One byte per texel, four texels per u32; sample() reads it through a u8 pointer.
    u32 m_height4[HEIGHT_RES*HEIGHT_RES*6/4];

    float m_scale;      // quantized heights are converted as value/255 * m_scale
} ShapeDeviceData;
|
|
|
|
// Rigid-body state consumed by the collision kernels.
typedef struct
{
    float4 m_pos;       // position
    float4 m_quat;      // orientation, used as a Quaternion
    float4 m_linVel;
    float4 m_angVel;

    u32 m_shapeIdx;     // index into the ShapeData array
    u32 m_shapeType;    // compared against SHAPE_CONVEX_HEIGHT_FIELD

    float m_invMass;    // 0 is treated as "does not need collision" when both bodies have it
    float m_restituitionCoeff;
    float m_frictionCoeff;
} BodyData;
|
|
|
|
// Contact manifold holding up to four points for one body pair.
typedef struct
{
    float4 m_worldPos[4];   // xyz: contact point; w: written as distance minus collision margin
    float4 m_worldNormal;   // w: m_nPoints
    u32 m_coeffs;           // written as restitution in the low 16 bits, friction in the high 16 bits
    u32 m_batchIdx;

    u32 m_bodyAPtr;         // written from the pair's x entry
    u32 m_bodyBPtr;         // written from the pair's y entry
} Contact4;

// The point count of a Contact4 lives in the w component of its normal.
#define GET_NPOINTS(x) (x).m_worldNormal.w
|
|
|
|
|
|
// Per-dispatch constants passed by value to SupportCullingKernel.
typedef struct
{
    int m_nPairs;               // number of input pairs; work-items at or beyond it exit
    float m_collisionMargin;    // added to the support-height sum in the culling test
    int m_capacity;             // maximum number of entries written to the output pair buffer
    int m_paddings[1];
} ConstBuffer;
|
|
|
|
// Applies a rigid transform to *p: rotate by *orientation, then add *translation.
// The translation is added to all four components of the rotated vector.
__inline
float4 transform(const float4* p, const float4* translation, const Quaternion* orientation)
{
    const float4 rotated = qtRotate( *orientation, *p );
    return rotated + (*translation);
}
|
|
|
|
// Inverse of transform(): subtract *translation, then rotate by the inverse
// of *orientation.
__inline
float4 invTransform(const float4* p, const float4* translation, const Quaternion* orientation)
{
    const float4 offset = (*p)-(*translation);
    return qtInvRotate( *orientation, offset );
}
|
|
|
|
// Maps direction p onto a cube-map face and 2D coordinates.
//   faceIdxOut: 2*axis + (0 if the dominant component is negative, else 1),
//               where axis is the component of p with the largest square
//               (ties favour the lower axis index).
//   x, y:       the two other components, divided by the dominant magnitude
//               and remapped from [-1,1] to [0,1]. If the dominant magnitude
//               is zero the divisor is treated as 0, giving 0.5 for both.
void CubeMapUtilsCalcCrd(const float4 p, int* faceIdxOut, float* x, float* y)
{
    const float xx = p.x*p.x;
    const float yy = p.y*p.y;
    const float zz = p.z*p.z;

    // Pick the dominant axis.
    int axis;
    if (yy>xx)
    {
        axis = (zz>yy)? 2: 1;
    }
    else
    {
        axis = (zz>xx)? 2: 0;
    }

    // u/v are the in-face coordinates, major is the signed dominant component.
    float u, v, major;
    switch( axis )
    {
    case 0:
        u = p.y; v = p.z; major = p.x;
        break;
    case 1:
        u = p.z; v = p.x; major = p.y;
        break;
    default:
        u = p.x; v = p.y; major = p.z;
        break;
    }

    *faceIdxOut = axis*2 + ((major < 0.f)? 0: 1);

    const float d = fabs(major);
    const float dInv = (d==0.f)? 0.f: fastDiv(1.f,d);
    *x = (u*dInv+1.f)*0.5f;
    *y = (v*dInv+1.f)*0.5f;
}
|
|
|
|
// Inverse of CubeMapUtilsCalcCrd's mapping: turns a face index and [0,1]
// face coordinates into an (unnormalized) direction with w = 0.
// Even face indices put -1 on the face's axis, odd ones +1.
float4 CubeMapUtilsCalcVector(int faceIdx, float x, float y)
{
    const int axis = faceIdx/2;
    const float side = (faceIdx%2 == 0)? -1.f:1.f;

    // Remap [0,1] -> [-1,1].
    const float u = x*2.f-1.f;
    const float v = y*2.f-1.f;

    switch( axis )
    {
    case 0:
        return make_float4(side, u, v, 0.f);
    case 1:
        return make_float4(v, side, u, 0.f);
    default:
        return make_float4(u, v, side, 0.f);
    }
}
|
|
|
|
// Integer alias for a cube-map face index.
typedef int Face;
|
|
|
|
// Reads the quantized (one byte) height of texel (x, y) on the given face
// from a shape held in local memory. No bounds checking is done.
u32 sample(__local ShapeDeviceData* shape, int face, int x, int y)
{
    __local u8* bytes = (__local u8*)shape->m_height4;
    const int texel = HEIGHT_RES*HEIGHT_RES*face + x + y*HEIGHT_RES;
    return bytes[texel];
}
|
|
|
|
// Reads the quantized (one byte) support height of texel (x, y) on the given
// face from a shape held in global memory. No bounds checking is done.
u32 sampleSupportGlobal(__global ShapeData* shape, int face, int x, int y)
{
    __global u8* bytes = (__global u8*)shape->m_supportHeight4;
    const int texel = HEIGHT_RES*HEIGHT_RES*face + x + y*HEIGHT_RES;
    return bytes[texel];
}
|
|
|
|
// Returns the stored normal of texel (x, y) on the given face (local-memory shape).
float4 sampleNormal(__local ShapeData* shape, int face, int x, int y)
{
    const int texel = HEIGHT_RES*HEIGHT_RES*face + x + y*HEIGHT_RES;
    return shape->m_normal[texel];
}
|
|
|
|
// Returns the stored normal of texel (x, y) on the given face (global-memory shape).
float4 sampleNormalGlobal(const __global ShapeData* shape, int face, int x, int y)
{
    const int texel = HEIGHT_RES*HEIGHT_RES*face + x + y*HEIGHT_RES;
    return shape->m_normal[texel];
}
|
|
|
|
// Returns the surface point of height-field texel sIdx in shape space.
//   sIdx: flat texel index, face-major (HEIGHT_RES*HEIGHT_RES texels per face).
// The texel centre is mapped to a unit direction and scaled by the texel's
// quantized height (byte/255 * m_scale).
float4 ShapeDataCalcSamplePoint( __local const ShapeDeviceData* shape, int sIdx )//u8 height, int sIdx, float scale )
{
    const float oneOver255 = 1.f/255.f;

    // Integer division: the face index must be exact. The previous float
    // native_divide + truncation could land one face low if the approximate
    // quotient fell just below an integer.
    int faceIdx = sIdx/(HEIGHT_RES*HEIGHT_RES);
    int r = (sIdx%(HEIGHT_RES*HEIGHT_RES));
    int i = r/HEIGHT_RES;
    int j = r%HEIGHT_RES;

    // Texel centre in [0,1] face coordinates.
    float x = fastDiv((i+0.5f),(float)HEIGHT_RES);
    float y = fastDiv((j+0.5f),(float)HEIGHT_RES);

    float4 v = CubeMapUtilsCalcVector(faceIdx, x, y);
    v = normalize3( v );

    int quantizedHeight = sample( shape, faceIdx, i, j );
    float rheight = quantizedHeight*oneOver255*shape->m_scale;
    return rheight*v;
}
|
|
|
|
// Signed distance estimate from point p (shape space) to the height-field
// surface along p's direction: |p.xyz| minus the bilinearly interpolated height.
// Returns FLT_MAX when p lies at or beyond m_scale from the origin.
float ShapeDataQueryDistance(__local const ShapeDeviceData* shape, float4 p )
{
    const float scale = shape->m_scale;
    if( dot3F4( p, p ) >= scale*scale ) return FLT_MAX;

    const float oneOver255 = 1.f/255.f;

    int faceIdx;
    float x, y;
    CubeMapUtilsCalcCrd( p, &faceIdx, &x, &y );

    // Shift so that integer coordinates land on texel centres.
    x = (x*HEIGHT_RES) - 0.5f;
    y = (y*HEIGHT_RES) - 0.5f;

    const int xi = (int)(x);
    const int yi = (int)(y);
    const float dx = x-xi;
    const float dy = y-yi;

    // Neighbour texels, clamped to the last row/column of the face.
    const int xip = min2((int)(HEIGHT_RES-1), xi+1);
    const int yip = min2((int)(HEIGHT_RES-1), yi+1);

    const u32 h00 = sample( shape, faceIdx, xi, yi );
    const u32 h10 = sample( shape, faceIdx, xip, yi );
    const u32 h11 = sample( shape, faceIdx, xip, yip );
    const u32 h01 = sample( shape, faceIdx, xi, yip );

    // Bilinear blend of the four quantized heights, then de-quantize.
    float height = (h00*(1.f-dx)+h10*dx)*(1.f-dy) + (h01*(1.f-dx)+h11*dx)*dy;
    height = height*oneOver255*shape->m_scale;

    p.w = 0.f;
    return fastLength4( p ) - height;
}
|
|
|
|
// Support height of the shape along direction p: the largest of the four
// support-height texels surrounding p's cube-map coordinates, de-quantized
// as value/255 * m_scale.
float ShapeDataQuerySupportHeight(__global ShapeData* shape, float4 p )
{
    int faceIdx;
    float x, y;
    CubeMapUtilsCalcCrd( p, &faceIdx, &x, &y );

    // Shift so that integer coordinates land on texel centres.
    x = (x*HEIGHT_RES) - 0.5f;
    y = (y*HEIGHT_RES) - 0.5f;

    const int xi = (int)(x);
    const int yi = (int)(y);

    // Neighbour texels, clamped to the last row/column of the face.
    const int xip = min2((int)(HEIGHT_RES-1), xi+1);
    const int yip = min2((int)(HEIGHT_RES-1), yi+1);

    const u32 s00 = sampleSupportGlobal( shape, faceIdx, xi, yi );
    const u32 s10 = sampleSupportGlobal( shape, faceIdx, xip, yi );
    const u32 s11 = sampleSupportGlobal( shape, faceIdx, xip, yip );
    const u32 s01 = sampleSupportGlobal( shape, faceIdx, xi, yip );

    const float tallest = max2( s00, max2( s10, max2( s11, s01 ) ) );
    return tallest/255.f*shape->m_scale;
}
|
|
|
|
// Returns the stored normal of the texel that direction p maps to
// (nearest-lower texel, no interpolation).
float4 ShapeDataQueryNormal(__global const ShapeData* shape, float4 p )
{
    int faceIdx;
    float x, y;
    CubeMapUtilsCalcCrd( p, &faceIdx, &x, &y );

    // Shift so that integer coordinates land on texel centres, then truncate.
    const int xi = (int)((x*HEIGHT_RES) - 0.5f);
    const int yi = (int)((y*HEIGHT_RES) - 0.5f);

    return sampleNormalGlobal( shape, faceIdx, xi, yi );
}
|
|
|
|
|
|
|
|
// kernels
|
|
|
|
|
|
// Culls candidate body pairs with a support-height test, one pair per work-item.
// A pair is appended to gPairsOut when both bodies are convex height fields,
// at least one has non-zero inverse mass, and the sum of their support heights
// towards each other plus the collision margin exceeds their centre distance.
// gNPairs is incremented for every surviving pair, even when the output
// buffer is full; entries at or beyond cb.m_capacity are not written.
__kernel
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
void SupportCullingKernel( __global int2* restrict gPairsIn, __global ShapeData* gShapes,
        __global BodyData* gBodies,
        __global int2* gPairsOut,
        counter32_t gNPairs,
        ConstBuffer cb )
{
    const int pairIdx = GET_GLOBAL_IDX;
    if( pairIdx >= cb.m_nPairs ) return;

    const float margin = cb.m_collisionMargin;
    const int maxPairs = cb.m_capacity;

    const int2 pair = gPairsIn[pairIdx];
    const BodyData bodyA = gBodies[pair.x];
    const BodyData bodyB = gBodies[pair.y];
    const int shapeAIdx = bodyA.m_shapeIdx;
    const int shapeBIdx = bodyB.m_shapeIdx;

    // Only collide if one of the two bodies has a non-zero mass.
    if (bodyA.m_invMass==0.f && bodyB.m_invMass==0.f)
        return;

    // Only height-field vs height-field pairs are tested; anything else is dropped.
    if (bodyA.m_shapeType != SHAPE_CONVEX_HEIGHT_FIELD || bodyB.m_shapeType != SHAPE_CONVEX_HEIGHT_FIELD)
        return;

    // Direction to the other body, expressed in each body's own frame.
    const float4 ab = bodyB.m_pos - bodyA.m_pos;
    const float4 abInA = qtInvRotate( bodyA.m_quat, ab );
    const float4 baInB = qtInvRotate( bodyB.m_quat, -ab );

    const float hA = ShapeDataQuerySupportHeight( gShapes+shapeAIdx, abInA );
    const float hB = ShapeDataQuerySupportHeight( gShapes+shapeBIdx, baInB );
    const float centreDistance = sqrtf( dot3F4( ab, ab ) );

    if( !( hA + hB + margin > centreDistance ) )
        return;

    int dstIdx;
    AppendInc( gNPairs, dstIdx );
    if( dstIdx < maxPairs )
        gPairsOut[dstIdx] = pair;
}
|
|
|
|
|
|
// Runs `execution` n times with loop index `ie` in scope.
// `n` is parenthesized so an expression argument compares as a whole.
#define PARALLEL_DO(execution, n) for(int ie=0; ie<(n); ie++){execution;}

// Max-reduction over the .y component of h[], performed by local ids 0..31
// with strides 1,2,4,8,16; the winner ends up in h[0]. Reads up to h[47].
// NOTE(review): the steps are separated only by mem_fence, not by a barrier —
// presumably this relies on the participating work-items running in lockstep;
// confirm for the target hardware.
#define PARALLEL_REDUCE_MAX32(h) \
    {int lIdx = GET_LOCAL_IDX;\
    if( lIdx < 32 )\
    {\
        (h)[lIdx] = ((h)[lIdx].y > (h)[lIdx+1].y)? (h)[lIdx]: (h)[lIdx+1];\
        mem_fence( CLK_LOCAL_MEM_FENCE );\
        (h)[lIdx] = ((h)[lIdx].y > (h)[lIdx+2].y)? (h)[lIdx]: (h)[lIdx+2];\
        mem_fence( CLK_LOCAL_MEM_FENCE );\
        (h)[lIdx] = ((h)[lIdx].y > (h)[lIdx+4].y)? (h)[lIdx]: (h)[lIdx+4];\
        mem_fence( CLK_LOCAL_MEM_FENCE );\
        (h)[lIdx] = ((h)[lIdx].y > (h)[lIdx+8].y)? (h)[lIdx]: (h)[lIdx+8];\
        mem_fence( CLK_LOCAL_MEM_FENCE );\
        (h)[lIdx] = ((h)[lIdx].y > (h)[lIdx+16].y)? (h)[lIdx]: (h)[lIdx+16];\
    }}

// Sum-reduction of h[] with the same stride pattern and the same
// synchronisation caveat as above; the total ends up in h[0].
#define PARALLEL_REDUCE32(h) \
    {int lIdx = GET_LOCAL_IDX;\
    if( lIdx < 32 )\
    {\
        (h)[lIdx] += (h)[lIdx+1];\
        mem_fence( CLK_LOCAL_MEM_FENCE );\
        (h)[lIdx] += (h)[lIdx+2];\
        mem_fence( CLK_LOCAL_MEM_FENCE );\
        (h)[lIdx] += (h)[lIdx+4];\
        mem_fence( CLK_LOCAL_MEM_FENCE );\
        (h)[lIdx] += (h)[lIdx+8];\
        mem_fence( CLK_LOCAL_MEM_FENCE );\
        (h)[lIdx] += (h)[lIdx+16];\
    }}
|
|
|
|
|
|
// Reduces up to 32 contact points in p[] to a 4-point manifold, cooperatively
// across the work-group, and returns the centroid of the (clamped) point set.
//   p:           contact points; on return p[0..3] hold the selected points
//                (only when at least 4 points were present).
//   h:           scratch, indexed up to [63] here.
//   nPointsPtr:  number of valid points; values above 32 are clamped.
//   nearNormal:  normal used to build the two in-plane search directions.
// Selection: for directions +u, -u, +v, -v (u, v perpendicular to nearNormal)
// the point with the largest projection is chosen. Each projection's float
// bits have their low 8 bits replaced by the point index so that a plain max
// carries the index along.
// Every work-item must call this function: it contains work-group barriers.
float4 extractManifold(__local float4* p, __local float4* h, __local int* nPointsPtr, float4 nearNormal)
{
    const int lid = GET_LOCAL_IDX;
    const int nPoints = min2( *nPointsPtr, 32 );

    // --- centroid -----------------------------------------------------------
    {
        const float4 loaded = p[lid];
        h[lid] = (lid<nPoints)? loaded : make_float4(0,0,0,0);
    }
    GROUP_LDS_BARRIER;

    PARALLEL_REDUCE32( h );//working on h[64]

    GROUP_LDS_BARRIER;

    // Every work-item computes the same centre from h[0]; w is divided by 0.
    const float4 center = fastDiv4( h[0], make_float4(nPoints, nPoints, nPoints, 0.f) );

    GROUP_LDS_BARRIER;

    if( nPoints < 4 ) return center;

    // --- search directions ---------------------------------------------------
    const float4 aVector = p[0] - center;
    const float4 u = normalize3( cross3( nearNormal, aVector ) );
    const float4 v = normalize3( cross3( nearNormal, u ) );

    // --- build sortable keys: projection bits | point index ------------------
    __local int4* keys = (__local int4*)h;
    {
        const float4 r = p[lid]-center;
        float f;

        f = dot3F4( u, r );
        keys[lid].x = ((*(u32*)&f) & 0xffffff00) | (0xff & lid);

        f = dot3F4( -u, r );
        keys[lid].y = ((*(u32*)&f) & 0xffffff00) | (0xff & lid);

        f = dot3F4( v, r );
        keys[lid].z = ((*(u32*)&f) & 0xffffff00) | (0xff & lid);

        f = dot3F4( -v, r );
        keys[lid].w = ((*(u32*)&f) & 0xffffff00) | (0xff & lid);

        // Slots without a valid point get a sentinel so they lose every max.
        if( lid >= nPoints ) keys[lid] = make_int4(-0xfffffff, -0xfffffff, -0xfffffff, -0xfffffff);
    }

    GROUP_LDS_BARRIER;

    // --- component-wise max over h[64], compared as floats -------------------
    if( lid < 32 )
    {
        h[lid] = max2( h[lid], h[lid+1] );
        mem_fence( CLK_LOCAL_MEM_FENCE );
        h[lid] = max2( h[lid], h[lid+2] );
        mem_fence( CLK_LOCAL_MEM_FENCE );
        h[lid] = max2( h[lid], h[lid+4] );
        mem_fence( CLK_LOCAL_MEM_FENCE );
        h[lid] = max2( h[lid], h[lid+8] );
        mem_fence( CLK_LOCAL_MEM_FENCE );
        h[lid] = max2( h[lid], h[lid+16] );
    }

    GROUP_LDS_BARRIER;

    // --- recover the four winning indices and compact them into p[0..3] ------
    int winner[4];
    winner[0] = (int)keys[0].x & 0xff;
    winner[1] = (int)keys[0].y & 0xff;
    winner[2] = (int)keys[0].z & 0xff;
    winner[3] = (int)keys[0].w & 0xff;

    GROUP_LDS_BARRIER;

    // Read all selections before any is written back, since they may overlap p[0..3].
    float4 selection;
    if( lid < 4 ) selection = p[winner[lid]];

    GROUP_LDS_BARRIER;

    if( lid < 4 ) p[lid] = selection;

    return center;
}
|
|
|
|
// Final compaction step of the manifold extraction: the first four ints of h
// are treated as packed keys whose low 8 bits are point indices; the four
// referenced points are copied into p[0..3].
// nPointsPtr and center are not used by this function.
// Every work-item must call this function: it contains work-group barriers.
void extractManifold1(__local float4* p, __local float4* h, __local int* nPointsPtr, float4 center)
{
    __local int* keys = (__local int*)h;
    const int lid = GET_LOCAL_IDX;
    const bool picks = lid < 4;

    GROUP_LDS_BARRIER;

    // Read all selections before any is written back, since they may overlap p[0..3].
    float4 selection;
    if( picks )
    {
        const int src = (int)keys[lid] & 0xff;
        selection = p[src];
    }

    GROUP_LDS_BARRIER;

    if( picks ) p[lid] = selection;
}
|
|
|
|
// Two-set variant of extractManifold: work-items 0..31 handle point set 0 and
// work-items 32..63 handle point set 1, each reducing up to 32 contact points
// to 4 and writing them back to the first four slots of their set.
//   p0/p1:            contact points of set 0 / set 1.
//   nPointsPtr0/1:    valid point counts; values above 32 are clamped.
//   nearNormal0/1:    normals used to build the in-plane search directions.
//   h:                scratch; set 0 uses h[0..63], set 1 uses h[64..127].
//   centerOut:        receives the two centroids.
// NOTE(review): after the centroid stage, set 1 is addressed as p0 + 32 —
// presumably p1 == p0 + 32 at the call site; confirm.
// NOTE(review): reduction steps are separated only by mem_fence, presumably
// relying on each group of 32 work-items running in lockstep; confirm.
// Every work-item must call this function: it contains work-group barriers.
void extractManifold2( __local float4* p0, __local int* nPointsPtr0, float4 nearNormal0,
        __local float4* p1, __local int* nPointsPtr1, float4 nearNormal1,
        __local float4* h, float4 centerOut[2])
{
    int nPoints[2];
    nPoints[0] = *nPointsPtr0;
    nPoints[1] = *nPointsPtr1;
    float4 center[2];
    center[0] = make_float4(0,0,0,0);
    center[1] = make_float4(0,0,0,0);
    { // calculate center
        nPoints[0] = min2( nPoints[0], 32 );
        nPoints[1] = min2( nPoints[1], 32 );
        {
            int lIdx = GET_LOCAL_IDX;
            h[lIdx] = (lIdx<nPoints[0])? p0[lIdx] : make_float4(0,0,0,0);
            h[lIdx+64] = (lIdx<nPoints[1])? p1[lIdx] : make_float4(0,0,0,0);
        }
        GROUP_LDS_BARRIER;

        { // sum-reduce each set within its own 64-slot window
            int bIdx = GET_LOCAL_IDX/32;
            int eIdx = GET_LOCAL_IDX%32;
            int lIdx = eIdx + bIdx*64;
            {
                h[lIdx] += h[lIdx+1];
                mem_fence( CLK_LOCAL_MEM_FENCE );
                h[lIdx] += h[lIdx+2];
                mem_fence( CLK_LOCAL_MEM_FENCE );
                h[lIdx] += h[lIdx+4];
                mem_fence( CLK_LOCAL_MEM_FENCE );
                h[lIdx] += h[lIdx+8];
                mem_fence( CLK_LOCAL_MEM_FENCE );
                h[lIdx] += h[lIdx+16];
            }
        }

        GROUP_LDS_BARRIER;

        for(int bIdx=0; bIdx<2; bIdx++)
        {
            center[bIdx] = fastDiv4( h[bIdx*64], make_float4(nPoints[bIdx], nPoints[bIdx], nPoints[bIdx], 0.f) );
        }
        GROUP_LDS_BARRIER;
    }

    centerOut[0] = center[0];
    centerOut[1] = center[1];

    // In-plane search directions for each set.
    float4 u[2];
    float4 v[2];

    {
        float4 aVector = p0[0] - center[0];
        u[0] = normalize3( cross3( nearNormal0, aVector ) );
        v[0] = normalize3( cross3( nearNormal0, u[0] ) );
    }
    {
        float4 aVector = p1[0] - center[1];
        u[1] = normalize3( cross3( nearNormal1, aVector ) );
        v[1] = normalize3( cross3( nearNormal1, u[1] ) );
    }

    {
        __local int4* a = (__local int4*)h;
        { // select 4
            { // set dot of 4 directions for xyzw
                int ie = GET_LOCAL_IDX%32;
                int setIdx = GET_LOCAL_IDX/32;
                {
                    // Key = projection's float bits with the low 8 bits
                    // replaced by the point index.
                    float f;
                    float4 r = p0[ie + setIdx*32]-center[setIdx];
                    f = dot3F4( u[setIdx], r );
                    a[ie + setIdx*64].x = ((*(u32*)&f) & 0xffffff00) | (0xff & ie);

                    f = dot3F4( -u[setIdx], r );
                    a[ie + setIdx*64].y = ((*(u32*)&f) & 0xffffff00) | (0xff & ie);

                    f = dot3F4( v[setIdx], r );
                    a[ie + setIdx*64].z = ((*(u32*)&f) & 0xffffff00) | (0xff & ie);

                    f = dot3F4( -v[setIdx], r );
                    a[ie + setIdx*64].w = ((*(u32*)&f) & 0xffffff00) | (0xff & ie);

                    // Slots without a valid point lose every max.
                    if( ie >= nPoints[setIdx] ) a[ie + setIdx*64] = make_int4(-0xfffffff, -0xfffffff, -0xfffffff, -0xfffffff);

                    // Pad the upper half of THIS set's window. Previously the
                    // index ignored setIdx, so set 1's padding (slots 96..127)
                    // was never written although the reduction below reads it.
                    a[ie + 32 + setIdx*64] = make_int4(-0xfffffff, -0xfffffff, -0xfffffff, -0xfffffff);
                }
            }
        }
        GROUP_LDS_BARRIER;

        { // component-wise max per set, compared as floats
            int bIdx = GET_LOCAL_IDX/32;
            int eIdx = GET_LOCAL_IDX%32;
            int lIdx = eIdx + bIdx*64;
            {
                h[lIdx] = max2( h[lIdx], h[lIdx+1] );
                mem_fence( CLK_LOCAL_MEM_FENCE );
                h[lIdx] = max2( h[lIdx], h[lIdx+2] );
                mem_fence( CLK_LOCAL_MEM_FENCE );
                h[lIdx] = max2( h[lIdx], h[lIdx+4] );
                mem_fence( CLK_LOCAL_MEM_FENCE );
                h[lIdx] = max2( h[lIdx], h[lIdx+8] );
                mem_fence( CLK_LOCAL_MEM_FENCE );
                h[lIdx] = max2( h[lIdx], h[lIdx+16] );
            }
        }

        GROUP_LDS_BARRIER;
    }

    // The winners of set b sit in the four ints starting at h[64*b].
    __local int* a = (__local int*)h;
    {
        GROUP_LDS_BARRIER;

        float4 selection;

        int bIdx = GET_LOCAL_IDX/32;
        int eIdx = GET_LOCAL_IDX%32;

        // Read all selections before any is written back, since sources and
        // destinations may overlap.
        if( eIdx < 4 )
        {
            int idx = (int)a[eIdx+64*4*bIdx] & 0xff;
            selection = p0[idx+32*bIdx];
        }

        GROUP_LDS_BARRIER;
        if( eIdx < 4 ) p0[eIdx+32*bIdx] = selection;
    }
}
|
|
|
|
/*
|
|
1. Query Normal
|
|
2. Fill Normal
|
|
3. A->B, B->A
|
|
*/
|
|
|
|
// Tests every sample point of shape B against shape A and collects contacts.
// Work-items stride over the HEIGHT_RES*HEIGHT_RES*6 texels of B; each point
// is moved B-local -> world -> A-local and kept when its distance to A is
// below a fixed 0.010 threshold.
//   lNContacts: atomically incremented for every hit (may exceed 32).
//   lCPoints:   receives the first 32 hits as (A-local xyz, distance).
void testVtx(__local BodyData* bodyAPtr, __local BodyData* bodyBPtr,
        __local ShapeDeviceData* shapeAPtr, __local ShapeDeviceData* shapeBPtr,
        __local int* lNContacts, __local float4* lCPoints)
{
    float4 bodyAPos = bodyAPtr->m_pos;
    float4 bodyBPos = bodyBPtr->m_pos;
    Quaternion bodyAQuat = bodyAPtr->m_quat;
    Quaternion bodyBQuat = bodyBPtr->m_quat;

    for( int pIdx = GET_LOCAL_IDX; pIdx < HEIGHT_RES*HEIGHT_RES*6; pIdx += GET_GROUP_SIZE )
    {
        float4 pInB = ShapeDataCalcSamplePoint( shapeBPtr, pIdx );
        float4 pInW = transform( &pInB, &bodyBPos, &bodyBQuat );
        float4 pInA = invTransform( &pInW, &bodyAPos, &bodyAQuat );

        const float dist = ShapeDataQueryDistance( shapeAPtr, pInA );
        if( !( dist < 0.010f ) ) continue;

        const int dstIdx = atom_add( lNContacts, 1 );
        if( dstIdx < 32 )
        {
            lCPoints[ dstIdx ] = make_float4( pInA.x, pInA.y, pInA.z, dist );
        }
    }
}
|
|
|
|
// Two-directional vertex test in a single strided loop over
// 2 * HEIGHT_RES*HEIGHT_RES*6 indices:
//   first half:  sample points of shape B tested against shape A -> (lNContactsA, lCPointsA)
//   second half: sample points of shape A tested against shape B -> (lNContactsB, lCPointsB)
// A point is recorded when its distance to the tested shape is below
// collisionMargin. Counters are incremented for every hit; only the first 32
// hits per direction are stored, as (tested-shape-local xyz, distance).
void testVtx2(__local const BodyData* bodyA, __local const BodyData* bodyB,
        __local const ShapeDeviceData* shapeA, __local const ShapeDeviceData* shapeB,
        __local int* lNContactsA, __local float4* lCPointsA,
        __local int* lNContactsB, __local float4* lCPointsB, float collisionMargin )
{
    for( int pIdx = GET_LOCAL_IDX; pIdx < HEIGHT_RES*HEIGHT_RES*6*2; pIdx += GET_GROUP_SIZE )
    {
        // In the second half the roles of A and B are swapped.
        const bool bAgainstA = ( pIdx < HEIGHT_RES*HEIGHT_RES*6 );

        __local const BodyData* testedBody = bAgainstA? bodyA: bodyB;
        __local const BodyData* sampledBody = bAgainstA? bodyB: bodyA;
        __local const ShapeDeviceData* testedShape = bAgainstA? shapeA: shapeB;
        __local const ShapeDeviceData* sampledShape = bAgainstA? shapeB: shapeA;
        __local int* lNContacts = bAgainstA? lNContactsA: lNContactsB;
        __local float4* lCPoints = bAgainstA? lCPointsA: lCPointsB;

        float4 testedPos = testedBody->m_pos;
        float4 sampledPos = sampledBody->m_pos;
        Quaternion testedQuat = testedBody->m_quat;
        Quaternion sampledQuat = sampledBody->m_quat;

        float4 pLocal = ShapeDataCalcSamplePoint( sampledShape, pIdx%(HEIGHT_RES*HEIGHT_RES*6) );
        float4 pInW = transform( &pLocal, &sampledPos, &sampledQuat );
        float4 pInTested = invTransform( &pInW, &testedPos, &testedQuat );

        const float dist = ShapeDataQueryDistance( testedShape, pInTested );
        if( !( dist < collisionMargin ) ) continue;

        const int dstIdx = atom_add( lNContacts, 1 );
        if( dstIdx < 32 )
        {
            lCPoints[ dstIdx ] = make_float4( pInTested.x, pInTested.y, pInTested.z, dist );
        }
    }
}
|
|
|
|
// Tests every sample point of shape B against a plane given in body A's
// local frame. Each point is moved B-local -> world -> A-local and recorded
// when its plane distance dot3w1(point, nA) is below collisionMargin.
//   lNContactsA: atomically incremented for every hit (may exceed 32).
//   lCPointsA:   receives the first 32 hits as (A-local xyz, distance).
void testVtxWithPlane(__local BodyData* bodyA, __local BodyData* bodyB,
        float4 nA, __local ShapeDeviceData* shapeB,
        __local int* lNContactsA, __local float4* lCPointsA, float collisionMargin)
{
    for( int pIdx = GET_LOCAL_IDX; pIdx < HEIGHT_RES*HEIGHT_RES*6; pIdx += GET_GROUP_SIZE )
    {
        float4 bodyAPos = bodyA->m_pos;
        float4 bodyBPos = bodyB->m_pos;
        Quaternion bodyAQuat = bodyA->m_quat;
        Quaternion bodyBQuat = bodyB->m_quat;

        float4 pInB = ShapeDataCalcSamplePoint( shapeB, pIdx%(HEIGHT_RES*HEIGHT_RES*6) );
        float4 pInW = transform( &pInB, &bodyBPos, &bodyBQuat );
        float4 pInA = invTransform( &pInW, &bodyAPos, &bodyAQuat );

        const float dist = dot3w1( pInA, nA );
        if( !( dist < collisionMargin ) ) continue;

        const int dstIdx = atom_add( lNContactsA, 1 );
        if( dstIdx < 32 )
        {
            lCPointsA[ dstIdx ] = make_float4( pInA.x, pInA.y, pInA.z, dist );
        }
    }
}
|
|
|
|
// Shape index of a BodyData value, converted to int.
#define GET_SHAPE_IDX(x) (int)((x).m_shapeIdx)
|
|
|
|
// Writes the collected contacts of one pair into contactsOut[0].
//   lNContacts/lCPoints: contact count and points in body A's local frame
//                        (w = distance); at most 4 points are emitted.
//   center:              A-local position used to look up the contact normal.
// Work-items 0..n-1 each write one world-space point (w = distance minus
// collisionMargin); work-item 0 writes normal, coefficients, point count and
// body indices. With zero contacts only the point count (0) is written.
// bodyBPtr is not used by this function.
void output(__local BodyData* bodyAPtr, __local BodyData* bodyBPtr,
        __local int2* iPair,
        __local int* lNContacts, __local float4* lCPoints,
        float4 center,
        __global ShapeData* shapeData, __global Contact4* contactsOut, float collisionMargin)
{
    const int lid = GET_LOCAL_IDX;

    if( *lNContacts == 0 )
    {
        if( lid == 0 )
            GET_NPOINTS(contactsOut[0]) = 0;
        return;
    }

    const int nContacts = min2( *lNContacts, 4 );
    __global Contact4* c = contactsOut;

    if( lid < nContacts )
    {
        float4 local = lCPoints[lid];
        float4 bodyAPos = bodyAPtr->m_pos;
        Quaternion bodyAQuat = bodyAPtr->m_quat;

        float4 world = transform( &local, &bodyAPos, &bodyAQuat );
        world.w = lCPoints[lid].w - collisionMargin;
        c->m_worldPos[lid] = world;
    }

    if( lid == 0 )
    {
        float4 contactNormal = ShapeDataQueryNormal( &shapeData[GET_SHAPE_IDX(*bodyAPtr)], center );
        contactNormal = normalize3( qtRotate( bodyAPtr->m_quat, contactNormal ) );

        c->m_worldNormal = contactNormal;
        // Packed coefficients: restitution 0 in the low 16 bits, friction 0.7 in the high 16 bits.
        c->m_coeffs = (u32)(0.f*0xffff) | ((u32)(0.7f*0xffff)<<16);
        GET_NPOINTS(*c) = nContacts;
        c->m_bodyAPtr = iPair[0].x;
        c->m_bodyBPtr = iPair[0].y;
    }
}
|
|
|
|
// todo. make it better
|
|
// Appends the collected contacts of one pair to the global contact buffer.
// Lane indices are taken modulo 32. When there is at least one contact, lane 0
// reserves a slot via nContactsOut and publishes it through *lNContacts
// (-1 when the slot is at or beyond capacity, in which case nothing is written).
// Lanes 0..n-1 then each write one world-space point (w = distance minus
// collisionMargin, at most 4 points) and lane 0 writes normal, coefficients,
// body indices and the point count.
// NOTE(review): with a 64-wide work-group two work-items have lane 0 —
// presumably each half is called with its own lNContacts; confirm at the call site.
// bodyBPtr is not used by this function.
// Every work-item must call this function: it contains work-group barriers.
void output2(__local BodyData* bodyAPtr, __local BodyData* bodyBPtr,
        int pair0, int pair1,
        __local int* lNContacts, __local float4* lCPoints,
        float4 center,
        const __global ShapeData* shapeData, __global Contact4* contactsOut, counter32_t nContactsOut, int capacity,
        float collisionMargin )
{
    const int lane = GET_LOCAL_IDX%32;
    const int nContacts = min2( *lNContacts, 4 );

    // Everyone has read the count before lane 0 overwrites it with the slot.
    GROUP_LDS_BARRIER;

    if( lane == 0 && nContacts)
    {
        int dstIdx;
        AppendInc( nContactsOut, dstIdx );
        *lNContacts = ( dstIdx >= capacity )? -1: dstIdx;
    }

    GROUP_LDS_BARRIER;

    const int slot = *lNContacts;
    if( !nContacts || slot == -1 ) return;

    __global Contact4* c = contactsOut + slot;

    if( lane < nContacts )
    {
        float4 p = lCPoints[lane];
        float4 bodyAPos = bodyAPtr->m_pos;
        Quaternion bodyAQuat = bodyAPtr->m_quat;

        p = transform( &p, &bodyAPos, &bodyAQuat );
        p.w = lCPoints[lane].w - collisionMargin;
        c->m_worldPos[lane] = p;
    }

    if( lane == 0 )
    {
        float4 contactNormal = ShapeDataQueryNormal( &shapeData[GET_SHAPE_IDX(*bodyAPtr)], center );
        contactNormal = normalize3( qtRotate( bodyAPtr->m_quat, contactNormal ) );

        c->m_worldNormal = contactNormal;
        // Packed coefficients: restitution 0 in the low 16 bits, friction 0.7 in the high 16 bits.
        c->m_coeffs = (u32)(0.f*0xffff) | ((u32)(0.7f*0xffff)<<16);
        c->m_bodyAPtr = pair0;
        c->m_bodyBPtr = pair1;

        // Written after the normal because the count lives in the normal's w.
        GET_NPOINTS(*c) = nContacts;
    }
}
|
|
|
|
// Builds one Contact4 in local memory from up to four contact points.
//
//   bodyAPtr      body whose position/orientation is used to transform the points
//   bodyBPtr      not used by this function
//   pair0, pair1  values stored into m_bodyAPtr / m_bodyBPtr of the contact
//   lNContacts    number of points in lCPoints (clamped to 4 here)
//   lCPoints      points to emit; .w is copied through minus collisionMargin
//   center        position handed to ShapeDataQueryNormal to obtain the normal
//   contactsOut   the single local-memory contact that is filled in
//
// The lane index is the local id modulo 32. The point count is always written,
// even when it is zero; the remaining fields are only written when it is not.
// Contains a work-group barrier, so every work-item of the group has to call it.
__inline
void output2LDS(__local BodyData* bodyAPtr, __local BodyData* bodyBPtr,
	int pair0, int pair1,
	int lNContacts, __local float4* lCPoints,
	float4 center,
	const __global ShapeData* shapeData, __local Contact4* contactsOut,
	float collisionMargin )
{
	const int lane = GET_LOCAL_IDX%32;
	const int nPoints = min2( lNContacts, 4 );

	GROUP_LDS_BARRIER;

	// lane is never negative, so this test also rejects nPoints <= 0.
	if( lane < nPoints )
	{
		float4 localPt = lCPoints[lane];
		float4 posA = bodyAPtr->m_pos;
		Quaternion rotA = bodyAPtr->m_quat;

		float4 worldPt = transform( &localPt, &posA, &rotA );
		worldPt.w = lCPoints[lane].w - collisionMargin;
		contactsOut->m_worldPos[lane] = worldPt;
	}

	if( lane != 0 )
		return;

	if( nPoints != 0 )
	{
		float4 n = ShapeDataQueryNormal( &shapeData[GET_SHAPE_IDX(*bodyAPtr)], center );
		contactsOut->m_worldNormal = normalize3( qtRotate( bodyAPtr->m_quat, n ) );

		// restitution 0.0 in the low 16 bits, friction 0.7 in the high 16 bits (both scaled by 0xffff)
		contactsOut->m_coeffs = (u32)(0.f*0xffff) | ((u32)(0.7f*0xffff)<<16);
		contactsOut->m_bodyAPtr = pair0;
		contactsOut->m_bodyBPtr = pair1;
	}

	GET_NPOINTS(*contactsOut) = nPoints;
}
|
|
|
|
// Variant of output2 that takes the contact normal from the caller instead of
// querying the shape, and that uses the plain local id as the lane index.
//
//   bodyAPtr      body whose position/orientation is used to transform the
//                 points and to rotate nA
//   bodyBPtr      not used by this function
//   pair0, pair1  values stored into m_bodyAPtr / m_bodyBPtr of the contact
//   lNContacts    in: number of points in lCPoints (clamped to 4 here);
//                 overwritten with the reserved slot index, or -1 if the
//                 buffer is full
//   lCPoints      points to emit; .w is copied through minus collisionMargin
//   center        not used by this function
//   nA            normal, rotated by body A's orientation and normalized
//   shapeData     not used by this function
//   contactsOut / nContactsOut / capacity
//                 destination buffer, its append counter and its size
//
// Contains two work-group barriers, so every work-item of the group has to call it.
void output2_1(__local BodyData* bodyAPtr, __local BodyData* bodyBPtr,
	int pair0, int pair1,
	__local int* lNContacts, __local float4* lCPoints,
	float4 center, float4 nA,
	const __global ShapeData* shapeData, __global Contact4* contactsOut, counter32_t nContactsOut, int capacity, float collisionMargin )
{
	const int lane = GET_LOCAL_IDX;
	// Sample the count before the barrier: lane 0 reuses *lNContacts for the slot index below.
	const int nPoints = min2( *lNContacts, 4 );

	GROUP_LDS_BARRIER;

	if( lane == 0 && nPoints )
	{
		int slot;
		AppendInc( nContactsOut, slot );
		// -1 tells the other lanes that the output buffer is exhausted.
		*lNContacts = ( slot >= capacity ) ? -1 : slot;
	}

	GROUP_LDS_BARRIER;

	const int dstSlot = *lNContacts;
	if( nPoints == 0 || dstSlot == -1 )
		return;

	__global Contact4* dst = contactsOut + dstSlot;

	if( lane < nPoints )
	{
		// One lane per point: move it with body A's transform.
		float4 localPt = lCPoints[lane];
		float4 posA = bodyAPtr->m_pos;
		Quaternion rotA = bodyAPtr->m_quat;

		float4 worldPt = transform( &localPt, &posA, &rotA );
		worldPt.w = lCPoints[lane].w - collisionMargin;
		dst->m_worldPos[lane] = worldPt;
	}

	if( lane == 0 )
	{
		// nPoints is known to be non-zero here, so the header is always filled in.
		dst->m_worldNormal = normalize3( qtRotate( bodyAPtr->m_quat, nA ) );

		// restitution 0.0 in the low 16 bits, friction 0.7 in the high 16 bits (both scaled by 0xffff)
		dst->m_coeffs = (u32)(0.f*0xffff) | ((u32)(0.7f*0xffff)<<16);
		dst->m_bodyAPtr = pair0;
		dst->m_bodyBPtr = pair1;

		GET_NPOINTS(*dst) = nPoints;
	}
}
|
|
|
|
// Runs extractManifold on the points of one work-group.
//
// The first 32 work-items of each group copy one float4 each from vIn into
// local memory; work-item 0 also sets the point count to 32 and loads the
// direction argument (ab) from its own vIn element. After the reduction, the
// work-items whose local id is below the resulting count copy their point to
// vOut at 4*group + local id.
__kernel
void manifold(__global float4* vIn, __global float4* vOut)
{
	__local float4 lCPoints[32];
	__local float4 lManifoldBuffer[64];
	__local int lNContacts;
	__local float4 ab;

	// Kept as size_t so the comparisons below stay unsigned.
	const size_t lIdx = GET_LOCAL_IDX;
	const size_t gIdx = GET_GLOBAL_IDX;

	if( lIdx < 32 )
		lCPoints[lIdx] = vIn[gIdx];

	if( lIdx == 0 )
	{
		lNContacts = 32;
		ab = vIn[gIdx];
	}

	GROUP_LDS_BARRIER;

	// The returned center is not needed by this kernel.
	extractManifold( lCPoints, lManifoldBuffer, &lNContacts, ab );

	if( lIdx < lNContacts )
		vOut[4*GET_GROUP_IDX + lIdx] = lCPoints[lIdx];
}
|
|
|
|
//#define COMBINE_REDUCTION
|
|
|
|
// Narrow-phase collision kernel: one work-group of 64 work-items per body pair.
//
//   pairs         body index pairs; the group reads pairs[group id]
//   shapeData     per-shape data (scale and m_height4 field are read here)
//   bodyDatas     per-body data
//   contactsOut   destination contact buffer
//   nContactsOut  append counter for contactsOut
//   cb            supplies m_capacity (size of contactsOut) and m_collisionMargin
//
// Steps:
//   1. work-item 0 copies both bodies and the shape scales to local memory
//   2. each half of the group (32 work-items) copies one shape's m_height4
//   3. testVtx2 is handed both point lists and counters (A and B)
//   4. each non-empty list goes through extractManifold
//   5. output2LDS builds one Contact4 per list in local memory
//      (work-items 32..63 handle list A, work-items 0..31 handle list B)
//   6. work-item 0 drops one of the two contacts when their normals are almost
//      exactly opposite, then appends what remains to contactsOut, tagging each
//      with the pair index in m_batchIdx
__kernel
__attribute__((reqd_work_group_size(64, 1, 1)))
void NarrowphaseKernel( const __global int2* restrict pairs, const __global ShapeData* shapeData, const __global BodyData* restrict bodyDatas,
	__global Contact4* restrict contactsOut,
	counter32_t nContactsOut, ConstBuffer cb )
{
	//	2.5K LDS
	__local Contact4 ldsContacts[2];
	__local BodyData bodyA;
	__local BodyData bodyB;
	__local ShapeDeviceData shapeA;
	__local ShapeDeviceData shapeB;
	__local float4 lCPointsA[32*2];
	__local int lNContactsA;
	__local float4* lCPointsB = lCPointsA+32;
	__local int lNContactsB;
#ifdef COMBINE_REDUCTION
	__local float4 lManifoldBuffer[64*2];
#else
	__local float4 lManifoldBuffer[64];
#endif
	__local int2 iPairAB;

	const int capacity = cb.m_capacity;
	const float collisionMargin = cb.m_collisionMargin;

	// One pair per work-group.
	const int pairIdx = GET_GROUP_IDX;

	if( GET_LOCAL_IDX == 0 )	//	load Bodies
	{
		int2 pair = pairs[pairIdx];
		iPairAB = make_int2(pair.x, pair.y);
		bodyA = bodyDatas[ pair.x ];
		bodyB = bodyDatas[ pair.y ];
		shapeA.m_scale = shapeData[ GET_SHAPE_IDX(bodyA) ].m_scale;
		shapeB.m_scale = shapeData[ GET_SHAPE_IDX(bodyB) ].m_scale;
		lNContactsA = 0;
		lNContactsB = 0;
	}

	GROUP_LDS_BARRIER;

	//	todo. can check if the shape is the same to previous one. If same, dont read
	{	//	load shape data: work-items 0..31 copy shape A, 32..63 copy shape B
		int idx = GET_LOCAL_IDX%32;
		int bIdx = GET_LOCAL_IDX/32;
		__local ShapeDeviceData* myShape = (bIdx==0)?&shapeA: &shapeB;
		int myShapeIdx = (bIdx==0)?GET_SHAPE_IDX(bodyA): GET_SHAPE_IDX(bodyB);

		while( idx < HEIGHT_RES*HEIGHT_RES*6/4 )
		{
			myShape->m_height4[idx] = shapeData[ myShapeIdx ].m_height4[idx];
			idx+=32;
		}
	}

	GROUP_LDS_BARRIER;

	testVtx2( &bodyA, &bodyB, &shapeA, &shapeB, &lNContactsA, lCPointsA, &lNContactsB, lCPointsB, collisionMargin );

	GROUP_LDS_BARRIER;

	float4 ab = bodyB.m_pos - bodyA.m_pos;

	// Zero-initialized because a center is forwarded to output2LDS by value even
	// for a side that produced no contacts and therefore never gets one assigned.
	float4 center[2];
	center[0] = make_float4(0.f, 0.f, 0.f, 0.f);
	center[1] = make_float4(0.f, 0.f, 0.f, 0.f);

	if( lNContactsA != 0 || lNContactsB != 0 )
	{
		float4 abInA;
		abInA = qtInvRotate( bodyA.m_quat, ab );

		float4 abInB;
		abInB = qtInvRotate( bodyB.m_quat, ab );

#ifdef COMBINE_REDUCTION
		extractManifold2( lCPointsA, &lNContactsA, abInA,
			lCPointsB, &lNContactsB, abInB,
			lManifoldBuffer, center );
#else
		if( lNContactsA != 0 )
			center[0] = extractManifold( lCPointsA, lManifoldBuffer, &lNContactsA, abInA );
		if( lNContactsB != 0 )
			center[1] = extractManifold( lCPointsB, lManifoldBuffer, &lNContactsB, abInB );
#endif
	}

	// Non-zero for work-items 32..63, which build the contact for list A;
	// work-items 0..31 build the one for list B with the body roles swapped.
	const int firstSet = GET_LOCAL_IDX/32;

	GROUP_LDS_BARRIER;

	output2LDS( (firstSet)?&bodyA: &bodyB, (firstSet)?&bodyB : &bodyA,
			(firstSet)?iPairAB.x : iPairAB.y, (firstSet)?iPairAB.y : iPairAB.x,
			(firstSet)?lNContactsA : lNContactsB, (firstSet)?lCPointsA:lCPointsB,
			(firstSet)?center[0] : center[1], shapeData, (firstSet)?&ldsContacts[0]: &ldsContacts[1], collisionMargin );

	GROUP_LDS_BARRIER;

	if( GET_LOCAL_IDX == 0 )
	{
		// Both sides produced a contact with (almost) opposite normals:
		// keep only the one whose body A index is smaller or equal.
		if( lNContactsA && lNContactsB )
		{
			float nDotn = dot3F4( ldsContacts[0].m_worldNormal, ldsContacts[1].m_worldNormal );
			if( nDotn < -(1.f-0.01f) )
			{
				if( ldsContacts[0].m_bodyAPtr > ldsContacts[1].m_bodyAPtr )
					lNContactsA = 0;
				else
					lNContactsB = 0;
			}
		}

		// Append the surviving contacts, list A first, then list B.
		for( int set = 0; set < 2; set++ )
		{
			const int n = (set == 0) ? lNContactsA : lNContactsB;
			if( n == 0 )
				continue;

			int dstIdx;
			AppendInc( nContactsOut, dstIdx );
			if( dstIdx < capacity )
			{
				contactsOut[ dstIdx ] = ldsContacts[set];
				contactsOut[ dstIdx ].m_batchIdx = pairIdx;
			}
		}
	}

	GROUP_LDS_BARRIER;
}
|
|
|
|
|
|
// Narrow-phase kernel for a body against a plane: one work-group of 64
// work-items per body pair.
//
//   pairs         body index pairs; the group reads pairs[group id]
//   shapeData     per-shape data (scale and m_height4 of body B's shape are read)
//   bodyDatas     per-body data
//   contactsOut   destination contact buffer
//   nContactsOut  append counter for contactsOut
//   cb            supplies m_capacity (size of contactsOut) and m_collisionMargin
//
// The plane normal is fixed to (0,1,0). Pairs whose body B has zero inverse
// mass are skipped. Only body B's shape is staged in local memory, because it
// is the only shape passed on to testVtxWithPlane.
__kernel
__attribute__((reqd_work_group_size(64, 1, 1)))
void NarrowphaseWithPlaneKernel( const __global int2* restrict pairs, const __global ShapeData* shapeData, const __global BodyData* restrict bodyDatas,
	__global Contact4* restrict contactsOut,
	counter32_t nContactsOut, ConstBuffer cb )
{
	__local BodyData bodyA;
	__local BodyData bodyB;
	__local ShapeDeviceData shapeB;
	__local float4 lCPointsA[32*2];
	__local int lNContactsA;
	__local float4 lManifoldBuffer[64];
	__local int2 iPairAB;

	const int capacity = cb.m_capacity;
	const float collisionMargin = cb.m_collisionMargin;

	// One pair per work-group.
	const int pairIdx = GET_GROUP_IDX;

	if( GET_LOCAL_IDX == 0 )	//	load Bodies
	{
		int2 pair = pairs[pairIdx];
		iPairAB = make_int2(pair.x, pair.y);
		bodyA = bodyDatas[ pair.x ];
		bodyB = bodyDatas[ pair.y ];
		shapeB.m_scale = shapeData[ GET_SHAPE_IDX(bodyB) ].m_scale;
		lNContactsA = 0;
	}

	GROUP_LDS_BARRIER;

	// bodyB lives in local memory, so the whole group takes this exit together.
	if (bodyB.m_invMass == 0.f)
		return;

	//	todo. can check if the shape is the same to previous one. If same, dont read
	{	//	load shape data of body B, spread over all work-items of the group
		const int myShapeIdx = GET_SHAPE_IDX(bodyB);

		for( int idx = GET_LOCAL_IDX; idx < HEIGHT_RES*HEIGHT_RES*6/4; idx += (int)GET_GROUP_SIZE )
		{
			shapeB.m_height4[idx] = shapeData[ myShapeIdx ].m_height4[idx];
		}
	}

	GROUP_LDS_BARRIER;

	float4 nA = make_float4(0,1,0,0);

	testVtxWithPlane( &bodyA, &bodyB, nA, &shapeB, &lNContactsA, lCPointsA, collisionMargin );

	GROUP_LDS_BARRIER;

	// Zero-initialized because it is passed to output2_1 by value even when
	// no contact was found and extractManifold is never called.
	float4 center = make_float4(0.f, 0.f, 0.f, 0.f);

	if( lNContactsA != 0 )
	{
		// The plane normal doubles as the direction argument of the reduction.
		center = extractManifold( lCPointsA, lManifoldBuffer, &lNContactsA, nA );
	}

	output2_1( &bodyA, &bodyB,
		iPairAB.x, iPairAB.y,
		&lNContactsA, lCPointsA,
		center, nA, shapeData, contactsOut, nContactsOut, capacity, collisionMargin );
}