Gpu3D demo added

2009-05-22 02:41:22 +00:00
parent fb6146f0be
commit bc1662abee
12 changed files with 2598 additions and 0 deletions
--- a/Demos/Gpu3dDemo/btGpuDemo3dSharedCode.h
+++ b/Demos/Gpu3dDemo/btGpuDemo3dSharedCode.h
@@ -0,0 +1,542 @@
+/*
+Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
+Copyright (C) 2006, 2007 Sony Computer Entertainment Inc. 
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+//----------------------------------------------------------------------------------------
+
+#define USE_FRICTION 1 // non-zero compiles the friction terms into the collision kernels of this file
+#define FRICTION_BOX_GROUND_FACT 0.01f // scale of the friction impulse for box-vs-ground contacts
+#define FRICTION_BOX_BOX_FACT 0.01f // scale of the friction impulse for box-vs-box contacts
+//#define FRICTION_BOX_BOX_FACT 0.05f // alternative box-vs-box value, currently disabled
+#define USE_CENTERS 1 // not referenced in the code visible here; presumably read by code that includes this file - TODO confirm
+
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------   C o n s t r a i n t   s o l v e r    d e m o  3D --------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+
+// kernel functions
+
+
+// Kernel: resets the accumulated lambda*dt values to zero.
+//   lambdaDtBox    - numContPoints consecutive values per constraint
+//   numConstraints - number of constraints; each thread index below this clears one constraint
+//   numContPoints  - number of values stored per constraint
+BT_GPU___global__ void clearAccumulationOfLambdaDtD(float* lambdaDtBox, int numConstraints, int numContPoints)
+{
+	const int constraintId = BT_GPU___mul24(BT_GPU_blockIdx.x, BT_GPU_blockDim.x) + BT_GPU_threadIdx.x;
+	if(constraintId >= numConstraints)
+	{
+		return;	// thread index lies past the last constraint
+	}
+	const int firstSlot = numContPoints * constraintId;
+	for(int point = 0; point < numContPoints; point++)
+	{
+		lambdaDtBox[firstSlot + point] = 0.0f;
+	}
+} // clearAccumulationOfLambdaDtD()
+
+//----------------------------------------------------------------------------------------
+
+// Computes the scalar response (lambda*dt) to apply along cNormal for one contact point.
+//   rVel               - velocity at the contact (callers pass the relative velocity for body pairs)
+//   positionConstraint - signed separation along the normal; only negative values (penetration) produce a response
+//   cNormal            - contact normal
+//   dt                 - time step, divides the positional correction term
+// Returns 0 when positionConstraint >= 0, otherwise the sum of a term opposing the normal velocity
+// and a Baumgarte-style term proportional to the remaining penetration depth.
+BT_GPU___device__ float computeImpulse3D(float3 rVel,
+								 float positionConstraint,
+								 float3 cNormal,
+								 float dt)
+{
+	const float collisionConstant	=	0.1f;	// scale of the normal-velocity term
+	const float baumgarteConstant	=	0.1f;	// scale of the penetration-depth term
+	const float penetrationError	=	0.02f;	// penetration depth tolerated without positional correction
+
+	if(positionConstraint >= 0)
+		return 0.0f;	// not penetrating: nothing to apply
+
+	// discount the tolerated depth, but never let the constraint become positive
+	positionConstraint = min(0.0f,positionConstraint+penetrationError);
+
+	return -(BT_GPU_dot(cNormal,rVel)*(collisionConstant)) - (baumgarteConstant/dt*positionConstraint);
+} // computeImpulse3D()
+
+//----------------------------------------------------------------------------------------
+
+// Disabled helper (excluded from compilation by "#if 0"): resets to zero - it does not clamp -
+// every x/y/z component of *v that lies outside [-VLIM, VLIM]. The w component is left untouched.
+#if 0
+#define VLIM 1000.f
+void BT_GPU___device__ chk_vect(float4* v)
+{
+	if(v->x < -VLIM) v->x = 0.f;
+	if(v->x >  VLIM) v->x = 0.f;
+	if(v->y < -VLIM) v->y = 0.f;
+	if(v->y >  VLIM) v->y = 0.f;
+	if(v->z < -VLIM) v->z = 0.f;
+	if(v->z >  VLIM) v->z = 0.f;
+} // chk_vect()
+#endif
+
+//----------------------------------------------------------------------------------------
+
+// Applies the response of one box corner against one side plane of the enclosing box.
+//   vel, angVel        - per-body linear / angular velocity arrays; entry idx is updated in place
+//   vVel               - velocity of the corner as sampled by the caller
+//   rerVertex          - offset of the corner from the body position
+//   positionConstraint - signed distance of the corner to the plane; nothing happens unless it is negative
+//   normal             - plane normal pointing into the box
+//   restitution, dt    - impulse scale and time step used for the impulse computation
+BT_GPU___device__ void applyWallImpulse3D(float4* vel, float4* angVel, int idx,
+								   float3 vVel, float3 rerVertex,
+								   float positionConstraint, float3 normal,
+								   float restitution, float dt)
+{
+	if(positionConstraint < 0)
+	{
+		float3 impulse = normal * restitution * computeImpulse3D(vVel, positionConstraint, normal, dt);
+		vel[idx]	+=	BT_GPU_make_float42(impulse,0.0f);
+		angVel[idx]	+=	BT_GPU_make_float42(BT_GPU_cross(rerVertex,impulse), 0.0f);
+	}
+} // applyWallImpulse3D()
+
+//----------------------------------------------------------------------------------------
+
+// Kernel: collides the 8 corners of every box with the floor and the four side walls described
+// by gProp and updates the linear and angular velocity of the box in place.
+//   trans      - 4 float4 per body: entries 0..2 are combined with +/- signs into the corner offsets,
+//                entry 3 is used as the body position; only read by this kernel
+//   vel        - linear velocity per body, read and updated
+//   angVel     - angular velocity per body, read and updated
+//   pProp      - not used by this kernel
+//   gProp      - bounds of the enclosing box; minX, maxX, minY, minZ and maxZ are tested (no upper y bound)
+//   nParticles - number of bodies; each thread index below this handles one body
+//   dt         - time step
+// NOTE(review): impulses are added to the velocities without any mass or inertia factor -
+// presumably unit mass is assumed; confirm against the host-side setup.
+BT_GPU___global__ void collisionWithWallBox3DD(float4 *trans,
+								   float4 *vel,
+								   float4* angVel,
+								   btCudaPartProps pProp,
+								   btCudaBoxProps gProp,
+								   int nParticles,
+								   float dt)
+{
+    int idx = BT_GPU___mul24(BT_GPU_blockIdx.x, BT_GPU_blockDim.x) + BT_GPU_threadIdx.x;
+	if(idx >= nParticles)
+	{
+		return;
+	}
+	const float restitution = 0.5f;
+
+	// trans is never written by this kernel, so the transform is loaded once rather than once per corner
+	float3 aPos = BT_GPU_make_float34(trans[idx * 4 + 3]);
+	float3 dx = BT_GPU_make_float34(trans[idx * 4 + 0]);
+	float3 dy = BT_GPU_make_float34(trans[idx * 4 + 1]);
+	float3 dz = BT_GPU_make_float34(trans[idx * 4 + 2]);
+
+	for(int iVtx=0; iVtx < 8; iVtx++)
+	{
+		// bits 0..2 of iVtx select the sign of dx, dy and dz, enumerating the 8 corners
+		float3 rerVertex = ((iVtx & 1) ? dx : dx * (-1.f));
+		rerVertex += ((iVtx & 2) ? dy : dy * (-1.f));
+		rerVertex += ((iVtx & 4) ? dz : dz * (-1.f));
+		float3 vPos = aPos + rerVertex;
+		// velocities are re-read for every corner so that impulses applied for earlier corners are seen
+		float3 aVel	= BT_GPU_make_float3(vel[idx].x, vel[idx].y, vel[idx].z);
+		float3 aAngVel	= BT_GPU_make_float34(angVel[idx]);
+		float3 vVel	=aVel+BT_GPU_cross(aAngVel, rerVertex);
+
+		// floor (y = gProp.minY): the only plane that also applies friction
+		float positionConstraint = vPos.y - gProp.minY;
+		if(positionConstraint < 0)
+		{
+			float3 groundNormal = BT_GPU_make_float3(0.0f,1.0f,0.0f);
+			float3 impulse = groundNormal * restitution * computeImpulse3D(vVel, positionConstraint, groundNormal, dt);
+#if USE_FRICTION	// only with ground for now
+			// oppose the part of the corner velocity that is tangential to the floor
+			float3 lat_vel = vVel - groundNormal * BT_GPU_dot(groundNormal,vVel);
+			float lat_vel_len = BT_GPU_dot(lat_vel, lat_vel);
+			if (lat_vel_len > 0)
+			{
+				lat_vel_len = sqrtf(lat_vel_len);
+				lat_vel *= 1.f/lat_vel_len;
+				impulse	-= lat_vel * BT_GPU_dot(lat_vel, vVel) * FRICTION_BOX_GROUND_FACT;
+			}
+#endif //USE_FRICTION
+			vel[idx]	+=	BT_GPU_make_float42(impulse,0.0f);
+			angVel[idx]	+=	BT_GPU_make_float42(BT_GPU_cross(rerVertex,impulse), 0.0f);
+		}
+
+		// side walls, in the order minX, maxX, minZ, maxZ; each one uses vVel as sampled above,
+		// even when an earlier plane has already changed the stored velocity
+		applyWallImpulse3D(vel, angVel, idx, vVel, rerVertex, vPos.x - gProp.minX,
+						   BT_GPU_make_float3(1.0f,0.0f,0.0f), restitution, dt);
+		applyWallImpulse3D(vel, angVel, idx, vVel, rerVertex, gProp.maxX - vPos.x,
+						   BT_GPU_make_float3(-1.0f,0.0f,0.0f), restitution, dt);
+		applyWallImpulse3D(vel, angVel, idx, vVel, rerVertex, vPos.z - gProp.minZ,
+						   BT_GPU_make_float3(0.0f,0.0f,1.0f), restitution, dt);
+		applyWallImpulse3D(vel, angVel, idx, vVel, rerVertex, gProp.maxZ - vPos.z,
+						   BT_GPU_make_float3(0.0f,0.0f,-1.0f), restitution, dt);
+	}
+} // collisionWithWallBox3DD()
+
+//----------------------------------------------------------------------------------------
+
+// Kernel: resolves the box-box contact constraints listed in batch, one constraint per thread,
+// and writes the updated velocities of both bodies back.
+//   constraints         - body index pairs (x = body A, y = body B)
+//   batch               - indices into constraints that are processed by this launch
+//   nConstraints        - number of entries of batch to process
+//   trans               - 4 float4 per body; only entry 3 (the body position) is read here
+//   vel, angularVel     - linear / angular velocity per body, read and overwritten for A and B
+//   lambdaDtBox         - accumulated lambda*dt, 4 values per constraint, kept non-negative
+//   iPositionConstraint - 4 values per constraint; a contact is processed only when its value is > 0
+//   normal, contact     - contact normal and contact position, 4 of each per constraint
+//   pProp, iBatch       - not used by this kernel
+//   dt                  - time step
+// NOTE(review): the velocities of both bodies are stored without atomics, so this presumably relies
+// on the caller building batches in which no body appears twice - confirm against the host code.
+BT_GPU___global__ void collisionBatchResolutionBox3DD(int2 *constraints,
+										 int *batch,
+										 int nConstraints,
+										 float4 *trans,
+										 float4 *vel,
+										 float4 *angularVel,
+										 float *lambdaDtBox,
+										 float *iPositionConstraint,
+										 float3 *normal,
+										 float3 *contact,
+										 btCudaPartProps pProp,
+										 int iBatch,
+										 float dt)
+{
+    int k_idx = BT_GPU___mul24(BT_GPU_blockIdx.x, BT_GPU_blockDim.x) + BT_GPU_threadIdx.x;
+	if(k_idx >= nConstraints)
+	{
+		return;
+	}
+	int idx = batch[k_idx];
+	int aId=constraints[idx].x;
+	int bId=constraints[idx].y;
+	float3 aPos = BT_GPU_make_float34(trans[aId * 4 + 3]);
+	float3 bPos = BT_GPU_make_float34(trans[bId * 4 + 3]);
+	// velocities are kept in registers while the 4 contacts are processed and stored once at the end
+	float3 aVel = BT_GPU_make_float34(vel[aId]);
+	float3 bVel = BT_GPU_make_float34(vel[bId]);
+	float3 aAngVel = BT_GPU_make_float34(angularVel[aId]);
+	float3 bAngVel = BT_GPU_make_float34(angularVel[bId]);
+	for(int iVtx = 0; iVtx < 4; iVtx++)
+	{
+		float positionConstraint = iPositionConstraint[idx * 4 + iVtx];
+		if(positionConstraint > 0)
+		{
+			// contact position relative to body A and relative to body B
+			float3 contactPoint	= contact[idx * 4 + iVtx] - aPos;
+			float3 contactPointB = contactPoint+aPos-bPos;
+			float3 contactNormal = normal[idx * 4 + iVtx];
+			float3 relVel = (aVel + BT_GPU_cross(aAngVel, contactPoint))
+			 -(bVel + BT_GPU_cross(bAngVel, contactPointB));
+
+			// computeImpulse3D() reacts to negative constraint values only, hence the sign flip
+			float lambdaDt = computeImpulse3D(relVel, -positionConstraint, contactNormal, dt);
+			// clamp the accumulated value at zero and apply only the amount it actually changed by
+			float pLambdaDt = lambdaDtBox[idx * 4 + iVtx];
+			float rLambdaDt = max(pLambdaDt+lambdaDt,0.0f);
+			lambdaDt = rLambdaDt-pLambdaDt;
+			lambdaDtBox[idx * 4 + iVtx] = rLambdaDt;
+
+			float3 impulse = contactNormal*lambdaDt*0.5f;
+#if USE_FRICTION
+			// oppose the part of the relative velocity that is tangential to the contact
+			float3 lat_vel = relVel - contactNormal * BT_GPU_dot(contactNormal, relVel);
+			float lat_vel_len = BT_GPU_dot(lat_vel, lat_vel);
+			if (lat_vel_len > 0)
+			{
+				lat_vel_len = sqrtf(lat_vel_len);
+				lat_vel *= 1.f/lat_vel_len;
+				impulse	-= lat_vel * BT_GPU_dot(lat_vel , relVel) * FRICTION_BOX_BOX_FACT;
+			}
+#endif //USE_FRICTION
+			// equal and opposite impulses on the two bodies
+			aVel+=	impulse;
+			bVel-=	impulse;
+			aAngVel += BT_GPU_cross(contactPoint, impulse);
+			bAngVel -= BT_GPU_cross(contactPointB, impulse);
+		}
+	}
+	vel[aId]=BT_GPU_make_float42(aVel,0.0f);
+	vel[bId]=BT_GPU_make_float42(bVel,0.0f);
+	angularVel[aId]=BT_GPU_make_float42(aAngVel,0.0f);
+	angularVel[bId]=BT_GPU_make_float42(bAngVel,0.0f);
+} // collisionBatchResolutionBox3DD()
+
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+
+
+extern "C"
+{
+
+// global functions
+
+//----------------------------------------------------------------------------------------
+
+// Host entry point: launches clearAccumulationOfLambdaDtD to zero numContPoints values for each
+// of the numConstraints constraints. Returns without launching when numConstraints is 0.
+void BT_GPU_PREF(clearAccumulationOfLambdaDt(float* lambdaDtBox, int numConstraints, int numContPoints))
+{
+	if(numConstraints == 0)
+	{
+		return;
+	}
+	// launch configuration is obtained from computeGridSize() for the constraint count
+	// (256 is presumably the requested number of threads per block - TODO confirm)
+	int blockCount, threadCount;
+	BT_GPU_PREF(computeGridSize)(numConstraints, 256, blockCount, threadCount);
+	// run the kernel
+	BT_GPU_EXECKERNEL(blockCount, threadCount, clearAccumulationOfLambdaDtD, (lambdaDtBox, numConstraints, numContPoints));
+	// check whether the kernel invocation generated an error
+	BT_GPU_CHECK_ERROR("clearAccumulationOfLambdaDtD kernel execution failed");
+} // clearAccumulationOfLambdaDt()
+
+//----------------------------------------------------------------------------------------
+
+// Host entry point: launches collisionWithWallBox3DD for numObjs bodies.
+//   trans, vel, angVel - untyped pointers that are handed to the kernel as float4 arrays
+//   pProp, gProp, dt   - forwarded unchanged to the kernel
+// Returns without launching when numObjs is 0.
+void BT_GPU_PREF(collisionWithWallBox3D(void* trans,void* vel,void* angVel,btCudaPartProps pProp,	btCudaBoxProps gProp,int numObjs,float dt))
+{
+	if(numObjs == 0)
+	{
+		return;
+	}
+	int blockCount, threadCount;
+	BT_GPU_PREF(computeGridSize)(numObjs, 256, blockCount, threadCount);
+	// run the kernel, casting the untyped pointers at the call
+	BT_GPU_EXECKERNEL(blockCount, threadCount, collisionWithWallBox3DD, ((float4*)trans,(float4*)vel,(float4*)angVel,pProp,gProp,numObjs,dt));
+	// check whether the kernel invocation generated an error
+	BT_GPU_CHECK_ERROR("collisionWithWallBox3DD kernel execution failed");
+} // collisionWithWallBox3D()
+
+//----------------------------------------------------------------------------------------
+
+// Host entry point: launches collisionBatchResolutionBox3DD for numConstraints batch entries.
+//   constraints               - handed to the kernel as an int2 array
+//   trans, vel, angularVel    - handed to the kernel as float4 arrays
+//   normal, contact           - handed to the kernel as float3 arrays
+//   all remaining arguments   - forwarded unchanged
+// Returns without launching when numConstraints is 0.
+void BT_GPU_PREF(collisionBatchResolutionBox3D(void* constraints,int *batch,int numConstraints,void *trans,void *vel,
+											void *angularVel,float *lambdaDtBox,float *positionConstraint,void* normal,void* contact,
+											btCudaPartProps pProp,int iBatch,float dt))
+{
+	if(numConstraints == 0)
+	{
+		return;
+	}
+	// unlike the other wrappers of this file, 128 instead of 256 is passed to computeGridSize()
+	int blockCount, threadCount;
+	BT_GPU_PREF(computeGridSize)(numConstraints, 128, blockCount, threadCount);
+	// run the kernel, casting the untyped pointers at the call
+	BT_GPU_EXECKERNEL(blockCount, threadCount, collisionBatchResolutionBox3DD,
+		((int2*)constraints,batch,numConstraints,(float4*)trans,(float4*)vel,(float4*)angularVel,
+		 lambdaDtBox,positionConstraint,(float3*)normal,(float3*)contact,pProp,iBatch,dt));
+	// check whether the kernel invocation generated an error
+	BT_GPU_CHECK_ERROR("collisionBatchResolutionBox3DD kernel execution failed");
+} // collisionBatchResolutionBox3D()
+
+//----------------------------------------------------------------------------------------
+
+} // extern "C"
+
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------   M o t i o n   i n t e g r a t o r   d e m o   -----------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+
+// kernel functions
+
+// Kernel: integrates force and torque into the linear and angular velocity of each body, limits
+// the rotation performed per step and applies damping.
+//   pForceTorqueDamp - 2 float4 per body: (force.xyz, linear damping) and (torque.xyz, angular damping)
+//   pInvInertiaMass  - 3 float4 per body: the xyz parts are dotted with the torque to give the x, y and z
+//                      response; .w of the first entry multiplies the force
+//                      (presumably the inverse mass - TODO confirm against the host code)
+//   pVel, pAngVel    - linear / angular velocity per body, updated in place (w is written as 0)
+//   timeStep         - integration step
+//   numBodies        - number of bodies; each thread index below this handles one body
+BT_GPU___global__ void integrVelD(float4* pForceTorqueDamp, float4* pInvInertiaMass, float4* pVel, float4* pAngVel, float timeStep, unsigned int numBodies)
+{
+    int index = BT_GPU___mul24(BT_GPU_blockIdx.x, BT_GPU_blockDim.x) + BT_GPU_threadIdx.x;
+    if(index >= (int)numBodies)
+	{
+		return;
+	}
+	// unpack input data
+	float3 force =  BT_GPU_make_float34(pForceTorqueDamp[index * 2]);
+	float lin_damp = pForceTorqueDamp[index * 2].w;
+	float3 torque =  BT_GPU_make_float34(pForceTorqueDamp[index * 2 + 1]);
+	float ang_damp = pForceTorqueDamp[index * 2 + 1].w;
+	float3 linVel =  BT_GPU_make_float34(pVel[index]);
+	float3 angVel =  BT_GPU_make_float34(pAngVel[index]);
+	float3 in_mass_0 = BT_GPU_make_float34(pInvInertiaMass[index * 3]);
+	float3 in_mass_1 = BT_GPU_make_float34(pInvInertiaMass[index * 3 + 1]);
+	float3 in_mass_2 = BT_GPU_make_float34(pInvInertiaMass[index * 3 + 2]);
+	float mass = pInvInertiaMass[index * 3].w;
+	// integrate linear velocity
+	float3 outLinVel = linVel + force * mass * timeStep;
+	// integrate angular velocity: the torque response is an angular acceleration and therefore
+	// has to be scaled by the time step, exactly like the force term above
+	float3 angAccel;
+	angAccel.x = BT_GPU_dot(in_mass_0, torque);
+	angAccel.y = BT_GPU_dot(in_mass_1, torque);
+	angAccel.z = BT_GPU_dot(in_mass_2, torque);
+	float3 outAngVel = angVel + angAccel * timeStep;
+	/// clamp angular velocity. collision calculations will fail on higher angular velocities
+	#if(!defined(M_PI))
+	#define M_PI 3.1415926f
+	#endif
+	#define BT_CUDA_MAX_SQ_ANGVEL (M_PI*M_PI)
+	// M_PI may be the double-precision macro of the math header: convert once so that the
+	// comparison and the square root below stay in single precision
+	const float maxSqAngle = (float)BT_CUDA_MAX_SQ_ANGVEL;
+	// squared rotation angle that would be performed during this step
+	float sq_angvel = BT_GPU_dot(outAngVel, outAngVel) * (timeStep * timeStep);
+	if(sq_angvel > maxSqAngle)
+	{
+		// sq_angvel already contains timeStep^2, so this ratio alone brings |outAngVel| * timeStep
+		// down to the limit; dividing by timeStep once more would scale the velocity up instead
+		outAngVel *= sqrtf(maxSqAngle / sq_angvel);
+	}
+	// now apply damping
+	outLinVel *= powf(1.0f - lin_damp, timeStep);
+	outAngVel *= powf(1.0f - ang_damp, timeStep);
+	// pack results
+	pVel[index] = BT_GPU_make_float42(outLinVel, 0.f);
+	pAngVel[index] = BT_GPU_make_float42(outAngVel, 0.f);
+} // integrVelD()
+
+#define BT_GPU__ANGULAR_MOTION_THRESHOLD (0.25f * M_PI) // upper limit for fAngle*timeStep in integrTransD(); needs M_PI to be defined before use
+
+//----------------------------------------------------------------------------------------
+
+// Converts the rotation stored in trans[0..2] into a quaternion returned as (x, y, z, w).
+// The element layout is the one written by setRotation(). Only the xyz parts of the three
+// entries are read.
+BT_GPU___device__ float4 getRotation(float4* trans)
+{
+	float quat[4];	// x, y, z, w
+	float diagSum = trans[0].x + trans[1].y + trans[2].z;
+	if(diagSum > 0.0f)
+	{
+		// positive trace: solve for w first, then derive x, y, z from the off-diagonal differences
+		float root = sqrtf(diagSum + 1.0f);
+		quat[3] = root * 0.5f;
+		root = 0.5f / root;
+		quat[0] = (trans[1].z - trans[2].y) * root;
+		quat[1] = (trans[2].x - trans[0].z) * root;
+		quat[2] = (trans[0].y - trans[1].x) * root;
+	}
+	else
+	{
+		// otherwise solve first for the component that belongs to the largest diagonal element
+		typedef float btMatrRow[4];
+		// indexed view of the same data; relies on a float4 being four consecutive floats
+		btMatrRow* el = (btMatrRow*)trans;
+		int major;
+		if(el[0][0] < el[1][1])
+		{
+			major = (el[1][1] < el[2][2]) ? 2 : 1;
+		}
+		else
+		{
+			major = (el[0][0] < el[2][2]) ? 2 : 0;
+		}
+		int next = (major + 1) % 3;
+		int last = (major + 2) % 3;
+		float root = sqrtf(el[major][major] - el[next][next] - el[last][last] + 1.0f);
+		quat[major] = root * 0.5f;
+		root = 0.5f / root;
+		quat[3] = (el[next][last] - el[last][next]) * root;
+		quat[next] = (el[major][next] + el[next][major]) * root;
+		quat[last] = (el[major][last] + el[last][major]) * root;
+	}
+	return BT_GPU_make_float44(quat[0],quat[1],quat[2],quat[3]);
+} // getRotation()
+
+//----------------------------------------------------------------------------------------
+
+// Returns the quaternion product q1 * q2; xyz hold the vector part and w the scalar part.
+// Neither argument is modified.
+BT_GPU___device__ float4 quatMult(float4& q1, float4& q2)
+{
+	float x = q1.w * q2.x + q1.x * q2.w + q1.y * q2.z - q1.z * q2.y;
+	float y = q1.w * q2.y + q1.y * q2.w + q1.z * q2.x - q1.x * q2.z;
+	float z = q1.w * q2.z + q1.z * q2.w + q1.x * q2.y - q1.y * q2.x;
+	float w = q1.w * q2.w - q1.x * q2.x - q1.y * q2.y - q1.z * q2.z;
+	return BT_GPU_make_float44(x, y, z, w);
+} // quatMult()
+
+//----------------------------------------------------------------------------------------
+
+// Scales q in place by the reciprocal of its length.
+// There is no guard against a zero-length input.
+BT_GPU___device__ void quatNorm(float4& q)
+{
+	float invLen = 1.f / sqrtf(BT_GPU_dot4(q, q));
+	q *= invLen;
+} // quatNorm()
+
+//----------------------------------------------------------------------------------------
+
+// Writes the rotation matrix of quaternion q (xyz = vector part, w = scalar part) into
+// trans[0..2], one matrix column per entry, and sets the w component of those entries to 0.
+// q does not have to be normalized: the products are scaled by 2 / dot(q, q).
+BT_GPU___device__ void setRotation(float4& q, float4* trans) 
+{
+	float scale = 2.0f / BT_GPU_dot4(q, q);
+	// scaled components and their pairwise products
+	float xs = q.x * scale;
+	float ys = q.y * scale;
+	float zs = q.z * scale;
+	float wx = q.w * xs;
+	float wy = q.w * ys;
+	float wz = q.w * zs;
+	float xx = q.x * xs;
+	float xy = q.x * ys;
+	float xz = q.x * zs;
+	float yy = q.y * ys;
+	float yz = q.y * zs;
+	float zz = q.z * zs;
+	// first column
+	trans[0].x = 1.0f - (yy + zz);
+	trans[0].y = xy + wz;
+	trans[0].z = xz - wy;
+	trans[0].w = 0.0f;
+	// second column
+	trans[1].x = xy - wz;
+	trans[1].y = 1.0f - (xx + zz);
+	trans[1].z = yz + wx;
+	trans[1].w = 0.0f;
+	// third column
+	trans[2].x = xz + wy;
+	trans[2].y = yz - wx;
+	trans[2].z = 1.0f - (xx + yy);
+	trans[2].w = 0.0f;
+} // setRotation()
+
+//----------------------------------------------------------------------------------------
+
+// Kernel: advances the position and the orientation of each body by one time step.
+//   pTrans        - 4 float4 per body: entries 0..2 hold the rotation, entry 3 the position; updated in place
+//   pVel, pAngVel - linear / angular velocity per body, only read
+//   timeStep      - integration step
+//   numBodies     - number of bodies; each thread index below this handles one body
+BT_GPU___global__ void integrTransD(float4* pTrans, float4* pVel, float4* pAngVel, float timeStep, unsigned int numBodies)
+{
+    int index = BT_GPU___mul24(BT_GPU_blockIdx.x, BT_GPU_blockDim.x) + BT_GPU_threadIdx.x;
+    if(index >= (int)numBodies)
+	{
+		return;
+	}
+	float4* bodyTrans = pTrans + index * 4;
+
+	// position: advance by the linear velocity
+	float3 pos = BT_GPU_make_float34(bodyTrans[3]);
+	float3 linvel = BT_GPU_make_float34(pVel[index]);
+	pos += linvel * timeStep;
+
+	// orientation: build the quaternion of the rotation performed during this step
+	float3 angvel = BT_GPU_make_float34(pAngVel[index]);
+	float fAngle = sqrtf(BT_GPU_dot(angvel, angvel));
+	// limit the angular motion per step
+	if(fAngle*timeStep > BT_GPU__ANGULAR_MOTION_THRESHOLD)
+	{
+		fAngle = BT_GPU__ANGULAR_MOTION_THRESHOLD / timeStep;
+	}
+	// factor that turns angvel into the vector part of the step quaternion:
+	// sin(0.5 * fAngle * timeStep) / fAngle, a sinc-like term. For very small fAngle the first
+	// two terms of its Taylor expansion are used instead, which avoids dividing by a value near zero.
+	float vecScale = (fAngle < 0.001f)
+		? (0.5f*timeStep-(timeStep*timeStep*timeStep)*0.020833333333f * fAngle * fAngle)
+		: ( sinf(0.5f * fAngle * timeStep) / fAngle);
+	float3 axis = angvel * vecScale;
+	float4 dorn = BT_GPU_make_float42(axis, cosf(fAngle * timeStep * 0.5f));
+
+	// compose with the current orientation, renormalize and store rotation and position
+	float4 orn0 = getRotation(bodyTrans);
+	float4 predictedOrn = quatMult(dorn, orn0);
+	quatNorm(predictedOrn);
+	setRotation(predictedOrn, bodyTrans);
+	bodyTrans[3] = BT_GPU_make_float42(pos, 0.f);
+} // integrTransD()
+
+
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+
+// global functions
+
+extern "C"
+{
+
+//----------------------------------------------------------------------------------------
+
+// Host entry point: launches integrVelD for numBodies bodies.
+//   pForceTorqueDamp, pInvInertiaMass, pVel, pAngVel - handed to the kernel as float4 arrays
+//   timeStep                                         - forwarded unchanged
+// Returns without launching when numBodies is 0, like the collision wrappers of this file,
+// so that neither computeGridSize() nor the launch is asked to handle an empty body set.
+void BT_GPU_PREF(integrVel(float* pForceTorqueDamp, float* pInvInertiaMass, void* pVel, void* pAngVel, float timeStep, unsigned int numBodies))
+{
+	if(!numBodies)
+	{
+		return;
+	}
+    int numThreads, numBlocks;
+    BT_GPU_PREF(computeGridSize)(numBodies, 256, numBlocks, numThreads);
+    // execute the kernel
+    BT_GPU_EXECKERNEL(numBlocks, numThreads, integrVelD, ((float4*)pForceTorqueDamp, (float4*)pInvInertiaMass, (float4*)pVel, (float4*)pAngVel, timeStep, numBodies));
+    // check if kernel invocation generated an error
+    BT_GPU_CHECK_ERROR("Kernel execution failed: btCuda_integrVelD");
+} // integrVel()
+
+//----------------------------------------------------------------------------------------
+
+// Host entry point: launches integrTransD for numBodies bodies.
+//   trans, vel, angVel - untyped pointers handed to the kernel as float4 arrays
+//   timeStep           - forwarded unchanged
+// Returns without launching when numBodies is not positive: the kernel takes the count as
+// unsigned int, so a negative value would otherwise turn into a huge body count.
+void BT_GPU_PREF(integrTrans(void* trans, void* vel, void* angVel, float timeStep, int numBodies))
+{
+	if(numBodies <= 0)
+	{
+		return;
+	}
+    int numThreads, numBlocks;
+    BT_GPU_PREF(computeGridSize)(numBodies, 256, numBlocks, numThreads);
+    // execute the kernel
+    BT_GPU_EXECKERNEL(numBlocks, numThreads, integrTransD, ((float4*)trans, (float4*)vel, (float4*)angVel, timeStep, numBodies));
+    // check if kernel invocation generated an error
+    BT_GPU_CHECK_ERROR("Kernel execution failed: btCuda_integrTransD");
+} // integrTrans()
+
+//----------------------------------------------------------------------------------------
+
+} // extern "C"
+
+//------------------------------------------------------------------------------------------------
+//------------------------------------------------------------------------------------------------
+//------------------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------