From d3c80fe160bc8f82a8ef2639b1074130b87bb61e Mon Sep 17 00:00:00 2001 From: erwin coumans Date: Sun, 17 Mar 2013 01:19:27 -0700 Subject: [PATCH] add Takahiro's batching pgs solver --- build/stringify.bat | 6 + demo/gpudemo/GpuDemo.h | 16 +- demo/gpudemo/main_opengl3core.cpp | 5 +- demo/gpudemo/rigidbody/GpuRigidBodyDemo.cpp | 9 +- opencl/gpu_rigidbody/host/Solver.cpp | 927 ++++++++++++++++++ opencl/gpu_rigidbody/host/Solver.h | 168 ++++ .../host/btGpuBatchingPgsSolver.cpp | 824 ++++++++++++++++ .../host/btGpuBatchingPgsSolver.h | 32 + opencl/gpu_rigidbody/host/btGpuConstraint4.h | 31 + .../host/btGpuRigidBodyPipeline.cpp | 49 +- .../host/btGpuRigidBodyPipelineInternalData.h | 4 +- .../gpu_rigidbody/host/btPgsJacobiSolver.cpp | 2 +- .../gpu_rigidbody/kernels/batchingKernels.cl | 341 +++++++ .../gpu_rigidbody/kernels/batchingKernels.h | 345 +++++++ opencl/gpu_rigidbody/kernels/solveContact.cl | 476 +++++++++ opencl/gpu_rigidbody/kernels/solveContact.h | 480 +++++++++ opencl/gpu_rigidbody/kernels/solveFriction.cl | 506 ++++++++++ opencl/gpu_rigidbody/kernels/solveFriction.h | 510 ++++++++++ opencl/gpu_rigidbody/kernels/solverSetup.cl | 660 +++++++++++++ opencl/gpu_rigidbody/kernels/solverSetup.h | 664 +++++++++++++ opencl/gpu_rigidbody/kernels/solverSetup2.cl | 494 ++++++++++ opencl/gpu_rigidbody/kernels/solverSetup2.h | 498 ++++++++++ opencl/gpu_sat/host/btCollidable.h | 2 +- 23 files changed, 7020 insertions(+), 29 deletions(-) create mode 100644 opencl/gpu_rigidbody/host/Solver.cpp create mode 100644 opencl/gpu_rigidbody/host/Solver.h create mode 100644 opencl/gpu_rigidbody/host/btGpuBatchingPgsSolver.cpp create mode 100644 opencl/gpu_rigidbody/host/btGpuBatchingPgsSolver.h create mode 100644 opencl/gpu_rigidbody/host/btGpuConstraint4.h create mode 100644 opencl/gpu_rigidbody/kernels/batchingKernels.cl create mode 100644 opencl/gpu_rigidbody/kernels/batchingKernels.h create mode 100644 opencl/gpu_rigidbody/kernels/solveContact.cl create mode 100644 opencl/gpu_rigidbody/kernels/solveContact.h create mode 100644 opencl/gpu_rigidbody/kernels/solveFriction.cl create mode 100644 opencl/gpu_rigidbody/kernels/solveFriction.h create mode 100644 opencl/gpu_rigidbody/kernels/solverSetup.cl create mode 100644 opencl/gpu_rigidbody/kernels/solverSetup.h create mode 100644 opencl/gpu_rigidbody/kernels/solverSetup2.cl create mode 100644 opencl/gpu_rigidbody/kernels/solverSetup2.h diff --git a/build/stringify.bat b/build/stringify.bat index 04859cf99..5c45bd8a5 100644 --- a/build/stringify.bat +++ b/build/stringify.bat @@ -16,6 +16,12 @@ premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_sat/kernels/satC premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_rigidbody/kernels/integrateKernel.cl" --headerfile="../opencl/gpu_rigidbody/kernels/integrateKernel.h" --stringname="integrateKernelCL" stringify premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_rigidbody/kernels/updateAabbsKernel.cl" --headerfile="../opencl/gpu_rigidbody/kernels/updateAabbsKernel.h" --stringname="updateAabbsKernelCL" stringify +premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_rigidbody/kernels/solverSetup.cl" --headerfile="../opencl/gpu_rigidbody/kernels/solverSetup.h" --stringname="solverSetupCL" stringify +premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_rigidbody/kernels/solverSetup2.cl" --headerfile="../opencl/gpu_rigidbody/kernels/solverSetup2.h" --stringname="solverSetup2CL" stringify +premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_rigidbody/kernels/batchingKernels.cl" --headerfile="../opencl/gpu_rigidbody/kernels/batchingKernels.h" --stringname="batchingKernelsCL" stringify +premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_rigidbody/kernels/solveContact.cl" --headerfile="../opencl/gpu_rigidbody/kernels/solveContact.h" --stringname="solveContactCL" stringify +premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_rigidbody/kernels/solveFriction.cl" --headerfile="../opencl/gpu_rigidbody/kernels/solveFriction.h" --stringname="solveFrictionCL" stringify + diff --git a/demo/gpudemo/GpuDemo.h b/demo/gpudemo/GpuDemo.h index d493ff875..af4474b7a 100644 --- a/demo/gpudemo/GpuDemo.h +++ b/demo/gpudemo/GpuDemo.h @@ -31,16 +31,16 @@ public: class btgWindowInterface* m_window; ConstructionInfo() - :useOpenCL(false),//true), + :useOpenCL(true), preferredOpenCLPlatformIndex(-1), preferredOpenCLDeviceIndex(-1), - arraySizeX(22), - arraySizeY(22 ), - arraySizeZ(22), - m_useConcaveMesh(false), - gapX(4.3), - gapY(4.0), - gapZ(4.3), + arraySizeX(33), + arraySizeY(30 ), + arraySizeZ(33), + m_useConcaveMesh(false), + gapX(4.3), + gapY(2.0), + gapZ(4.3), m_instancingRenderer(0), m_window(0) { diff --git a/demo/gpudemo/main_opengl3core.cpp b/demo/gpudemo/main_opengl3core.cpp index 654165b42..54df8b292 100644 --- a/demo/gpudemo/main_opengl3core.cpp +++ b/demo/gpudemo/main_opengl3core.cpp @@ -61,10 +61,13 @@ btAlignedObjectArray demoNames; int selectedDemo = 0; GpuDemo::CreateFunc* allDemos[]= { + GpuRigidBodyDemo::MyCreateFunc, + //BroadphaseBenchmark::CreateFunc, //GpuBoxDemo::CreateFunc, + PairBench::MyCreateFunc, - GpuRigidBodyDemo::MyCreateFunc, + ParticleDemo::MyCreateFunc, diff --git a/demo/gpudemo/rigidbody/GpuRigidBodyDemo.cpp b/demo/gpudemo/rigidbody/GpuRigidBodyDemo.cpp index 45b4598ed..8405da0a7 100644 --- a/demo/gpudemo/rigidbody/GpuRigidBodyDemo.cpp +++ b/demo/gpudemo/rigidbody/GpuRigidBodyDemo.cpp @@ -136,9 +136,9 @@ void GpuRigidBodyDemo::initPhysics(const ConstructionInfo& ci) { for (int k=0;ksetCameraTargetPosition(camPos); - m_instancingRenderer->setCameraDistance(60); + m_instancingRenderer->setCameraDistance(90); m_instancingRenderer->writeTransforms(); diff --git a/opencl/gpu_rigidbody/host/Solver.cpp b/opencl/gpu_rigidbody/host/Solver.cpp new file mode 100644 index 000000000..7158c3ce6 --- /dev/null +++ b/opencl/gpu_rigidbody/host/Solver.cpp @@ -0,0 +1,927 @@ +/* +Copyright (c) 2012 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Takahiro Harada + + +#include "Solver.h" +#include "../Stubs/AdlMatrix3x3.h" + +#define SOLVER_SETUP_KERNEL_PATH "opencl/gpu_rigidbody/kernels/solverSetup.cl" +#define SOLVER_SETUP2_KERNEL_PATH "opencl/gpu_rigidbody/kernels/solverSetup2.cl" + +#define SOLVER_CONTACT_KERNEL_PATH "opencl/gpu_rigidbody/kernels/solveContact.cl" +#define SOLVER_FRICTION_KERNEL_PATH "opencl/gpu_rigidbody/kernels/solveFriction.cl" + +#define BATCHING_PATH "opencl/gpu_rigidbody/kernels/batchingKernels.cl" + + +#include "../kernels/solverSetup.h" +#include "../kernels/solverSetup2.h" + +#include "../kernels/solveContact.h" +#include "../kernels/solveFriction.h" + +#include "../kernels/batchingKernels.h" +#include "BulletCommon/btQuickprof.h" +#include "../../parallel_primitives/host/btLauncherCL.h" + + +struct SolverDebugInfo +{ + int m_valInt0; + int m_valInt1; + int m_valInt2; + int m_valInt3; + + int m_valInt4; + int m_valInt5; + int m_valInt6; + int m_valInt7; + + int m_valInt8; + int m_valInt9; + int m_valInt10; + int m_valInt11; + + int m_valInt12; + int m_valInt13; + int m_valInt14; + int m_valInt15; + + + float m_val0; + float m_val1; + float m_val2; + float m_val3; +}; + + + + +class SolverDeviceInl +{ +public: + struct ParallelSolveData + { + btOpenCLArray* m_numConstraints; + btOpenCLArray* m_offsets; + }; +}; + + + +Solver::Solver(cl_context ctx, cl_device_id device, cl_command_queue queue, int pairCapacity) + :m_nIterations(4), + m_context(ctx), + m_device(device), + m_queue(queue) +{ + m_sort32 = new btRadixSort32CL(ctx,device,queue); + m_scan = new btPrefixScanCL(ctx,device,queue,N_SPLIT*N_SPLIT); + m_search = new btBoundSearchCL(ctx,device,queue,N_SPLIT*N_SPLIT); + + const int sortSize = NEXTMULTIPLEOF( pairCapacity, 512 ); + + m_sortDataBuffer = new btOpenCLArray(ctx,queue,sortSize); + m_contactBuffer = new btOpenCLArray(ctx,queue); + + m_numConstraints = new btOpenCLArray(ctx,queue,N_SPLIT*N_SPLIT ); + m_numConstraints->resize(N_SPLIT*N_SPLIT); + + m_offsets = new btOpenCLArray( ctx,queue, N_SPLIT*N_SPLIT ); + m_offsets->resize(N_SPLIT*N_SPLIT); + const char* additionalMacros = ""; + const char* srcFileNameForCaching=""; + + + + cl_int pErrNum; + const char* batchKernelSource = batchingKernelsCL; + const char* solverSetupSource = solverSetupCL; + const char* solverSetup2Source = solverSetup2CL; + const char* solveContactSource = solveContactCL; + const char* solveFrictionSource = solveFrictionCL; + + + + { + + cl_program solveContactProg= btOpenCLUtils::compileCLProgramFromString( ctx, device, solveContactSource, &pErrNum,additionalMacros, SOLVER_CONTACT_KERNEL_PATH); + btAssert(solveContactProg); + + cl_program solveFrictionProg= btOpenCLUtils::compileCLProgramFromString( ctx, device, solveFrictionSource, &pErrNum,additionalMacros, SOLVER_FRICTION_KERNEL_PATH); + btAssert(solveFrictionProg); + + cl_program solverSetup2Prog= btOpenCLUtils::compileCLProgramFromString( ctx, device, solverSetup2Source, &pErrNum,additionalMacros, SOLVER_SETUP2_KERNEL_PATH); + btAssert(solverSetup2Prog); + + + cl_program solverSetupProg= btOpenCLUtils::compileCLProgramFromString( ctx, device, solverSetupSource, &pErrNum,additionalMacros, SOLVER_SETUP_KERNEL_PATH); + btAssert(solverSetupProg); + + + m_solveFrictionKernel= btOpenCLUtils::compileCLKernelFromString( ctx, device, solveFrictionSource, "BatchSolveKernelFriction", &pErrNum, solveFrictionProg,additionalMacros ); + btAssert(m_solveFrictionKernel); + + m_solveContactKernel= btOpenCLUtils::compileCLKernelFromString( ctx, device, solveContactSource, "BatchSolveKernelContact", &pErrNum, solveContactProg,additionalMacros ); + btAssert(m_solveContactKernel); + + m_contactToConstraintKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, solverSetupSource, "ContactToConstraintKernel", &pErrNum, solverSetupProg,additionalMacros ); + btAssert(m_contactToConstraintKernel); + + m_setSortDataKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "SetSortDataKernel", &pErrNum, solverSetup2Prog,additionalMacros ); + btAssert(m_setSortDataKernel); + + m_reorderContactKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "ReorderContactKernel", &pErrNum, solverSetup2Prog,additionalMacros ); + btAssert(m_reorderContactKernel); + + + m_copyConstraintKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "CopyConstraintKernel", &pErrNum, solverSetup2Prog,additionalMacros ); + btAssert(m_copyConstraintKernel); + + } + + { + cl_program batchingProg = btOpenCLUtils::compileCLProgramFromString( ctx, device, batchKernelSource, &pErrNum,additionalMacros, BATCHING_PATH); + btAssert(batchingProg); + + m_batchingKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, batchKernelSource, "CreateBatches", &pErrNum, batchingProg,additionalMacros ); + btAssert(m_batchingKernel); + } + +} + +Solver::~Solver() +{ + delete m_sortDataBuffer; + delete m_contactBuffer; + + delete m_sort32; + delete m_scan; + delete m_search; + + + clReleaseKernel(m_batchingKernel); + + clReleaseKernel( m_solveContactKernel); + clReleaseKernel( m_solveFrictionKernel); + + clReleaseKernel( m_contactToConstraintKernel); + clReleaseKernel( m_setSortDataKernel); + clReleaseKernel( m_reorderContactKernel); + clReleaseKernel( m_copyConstraintKernel); + +} + + + + + +/*void Solver::reorderConvertToConstraints( const btOpenCLArray* bodyBuf, + const btOpenCLArray* shapeBuf, + btOpenCLArray* contactsIn, btOpenCLArray* contactCOut, void* additionalData, + int nContacts, const Solver::ConstraintCfg& cfg ) +{ + if( m_contactBuffer ) + { + m_contactBuffer->resize(nContacts); + } + if( m_contactBuffer == 0 ) + { + BT_PROFILE("new m_contactBuffer;"); + m_contactBuffer = new btOpenCLArray(m_context,m_queue,nContacts ); + m_contactBuffer->resize(nContacts); + } + + + + + //DeviceUtils::Config dhCfg; + //Device* deviceHost = DeviceUtils::allocate( TYPE_HOST, dhCfg ); + if( cfg.m_enableParallelSolve ) + { + + + clFinish(m_queue); + + // contactsIn -> m_contactBuffer + { + BT_PROFILE("sortContacts"); + sortContacts( bodyBuf, contactsIn, additionalData, nContacts, cfg ); + clFinish(m_queue); + } + + + { + BT_PROFILE("m_copyConstraintKernel"); + + + + btInt4 cdata; cdata.x = nContacts; + btBufferInfoCL bInfo[] = { btBufferInfoCL( m_contactBuffer->getBufferCL() ), btBufferInfoCL( contactsIn->getBufferCL() ) }; +// btLauncherCL launcher( m_queue, data->m_device->getKernel( PATH, "CopyConstraintKernel", "-I ..\\..\\ -Wf,--c++", 0 ) ); + btLauncherCL launcher( m_queue, m_copyConstraintKernel ); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) ); + launcher.setConst( cdata ); + launcher.launch1D( nContacts, 64 ); + clFinish(m_queue); + } + + { + BT_PROFILE("batchContacts"); + Solver::batchContacts( contactsIn, nContacts, m_numConstraints, m_offsets, cfg.m_staticIdx ); + + } + } + { + BT_PROFILE("waitForCompletion (batchContacts)"); + clFinish(m_queue); + } + + //================ + + { + BT_PROFILE("convertToConstraints"); + Solver::convertToConstraints( bodyBuf, shapeBuf, contactsIn, contactCOut, additionalData, nContacts, cfg ); + } + + { + BT_PROFILE("convertToConstraints waitForCompletion"); + clFinish(m_queue); + } + +} +*/ + + static + inline + float calcRelVel(const float4& l0, const float4& l1, const float4& a0, const float4& a1, + const float4& linVel0, const float4& angVel0, const float4& linVel1, const float4& angVel1) + { + return dot3F4(l0, linVel0) + dot3F4(a0, angVel0) + dot3F4(l1, linVel1) + dot3F4(a1, angVel1); + } + + + static + inline + void setLinearAndAngular(const float4& n, const float4& r0, const float4& r1, + float4& linear, float4& angular0, float4& angular1) + { + linear = -n; + angular0 = -cross3(r0, n); + angular1 = cross3(r1, n); + } + + +template +static +__inline +void solveContact(btGpuConstraint4& cs, + const float4& posA, float4& linVelA, float4& angVelA, float invMassA, const Matrix3x3& invInertiaA, + const float4& posB, float4& linVelB, float4& angVelB, float invMassB, const Matrix3x3& invInertiaB, + float maxRambdaDt[4], float minRambdaDt[4]) +{ + float4 dLinVelA = make_float4(0.f); + float4 dAngVelA = make_float4(0.f); + float4 dLinVelB = make_float4(0.f); + float4 dAngVelB = make_float4(0.f); + + for(int ic=0; ic<4; ic++) + { + // dont necessary because this makes change to 0 + if( cs.m_jacCoeffInv[ic] == 0.f ) continue; + + { + float4 angular0, angular1, linear; + btVector3 r0 = cs.m_worldPos[ic] - (btVector3&)posA; + btVector3 r1 = cs.m_worldPos[ic] - (btVector3&)posB; + setLinearAndAngular( (const float4 &)-cs.m_linear, (const float4 &)r0, (const float4 &)r1, linear, angular0, angular1 ); + + float rambdaDt = calcRelVel((const float4 &)cs.m_linear,(const float4 &) -cs.m_linear, angular0, angular1, + linVelA, angVelA, linVelB, angVelB ) + cs.m_b[ic]; + rambdaDt *= cs.m_jacCoeffInv[ic]; + + { + float prevSum = cs.m_appliedRambdaDt[ic]; + float updated = prevSum; + updated += rambdaDt; + updated = max2( updated, minRambdaDt[ic] ); + updated = min2( updated, maxRambdaDt[ic] ); + rambdaDt = updated - prevSum; + cs.m_appliedRambdaDt[ic] = updated; + } + + float4 linImp0 = invMassA*linear*rambdaDt; + float4 linImp1 = invMassB*(-linear)*rambdaDt; + float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt; + float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt; +#ifdef _WIN32 + btAssert(_finite(linImp0.x)); + btAssert(_finite(linImp1.x)); +#endif + if( JACOBI ) + { + dLinVelA += linImp0; + dAngVelA += angImp0; + dLinVelB += linImp1; + dAngVelB += angImp1; + } + else + { + linVelA += linImp0; + angVelA += angImp0; + linVelB += linImp1; + angVelB += angImp1; + } + } + } + + if( JACOBI ) + { + linVelA += dLinVelA; + angVelA += dAngVelA; + linVelB += dLinVelB; + angVelB += dAngVelB; + } +} + + + void btPlaneSpace1 (const float4* n, float4* p, float4* q) +{ + if (btFabs(n->z) > SIMDSQRT12) { + // choose p in y-z plane + btScalar a = n->y*n->y + n->z*n->z; + btScalar k = btRecipSqrt (a); + p->x = 0; + p->y = -n->z*k; + p->z = n->y*k; + // set q = n x p + q->x = a*k; + q->y = -n->x*p->z; + q->z = n->x*p->y; + } + else { + // choose p in x-y plane + btScalar a = n->x*n->x + n->y*n->y; + btScalar k = btRecipSqrt (a); + p->x = -n->y*k; + p->y = n->x*k; + p->z = 0; + // set q = n x p + q->x = -n->z*p->y; + q->y = n->z*p->x; + q->z = a*k; + } +} + + + static + __inline + void solveFriction(btGpuConstraint4& cs, + const float4& posA, float4& linVelA, float4& angVelA, float invMassA, const Matrix3x3& invInertiaA, + const float4& posB, float4& linVelB, float4& angVelB, float invMassB, const Matrix3x3& invInertiaB, + float maxRambdaDt[4], float minRambdaDt[4]) + { + if( cs.m_fJacCoeffInv[0] == 0 && cs.m_fJacCoeffInv[0] == 0 ) return; + const float4& center = (const float4&)cs.m_center; + + float4 n = -(const float4&)cs.m_linear; + + float4 tangent[2]; +#if 1 + btPlaneSpace1 (&n, &tangent[0],&tangent[1]); +#else + float4 r = cs.m_worldPos[0]-center; + tangent[0] = cross3( n, r ); + tangent[1] = cross3( tangent[0], n ); + tangent[0] = normalize3( tangent[0] ); + tangent[1] = normalize3( tangent[1] ); +#endif + + float4 angular0, angular1, linear; + float4 r0 = center - posA; + float4 r1 = center - posB; + for(int i=0; i<2; i++) + { + setLinearAndAngular( tangent[i], r0, r1, linear, angular0, angular1 ); + float rambdaDt = calcRelVel(linear, -linear, angular0, angular1, + linVelA, angVelA, linVelB, angVelB ); + rambdaDt *= cs.m_fJacCoeffInv[i]; + + { + float prevSum = cs.m_fAppliedRambdaDt[i]; + float updated = prevSum; + updated += rambdaDt; + updated = max2( updated, minRambdaDt[i] ); + updated = min2( updated, maxRambdaDt[i] ); + rambdaDt = updated - prevSum; + cs.m_fAppliedRambdaDt[i] = updated; + } + + float4 linImp0 = invMassA*linear*rambdaDt; + float4 linImp1 = invMassB*(-linear)*rambdaDt; + float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt; + float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt; +#ifdef _WIN32 + btAssert(_finite(linImp0.x)); + btAssert(_finite(linImp1.x)); +#endif + linVelA += linImp0; + angVelA += angImp0; + linVelB += linImp1; + angVelB += angImp1; + } + + { // angular damping for point constraint + float4 ab = normalize3( posB - posA ); + float4 ac = normalize3( center - posA ); + if( dot3F4( ab, ac ) > 0.95f || (invMassA == 0.f || invMassB == 0.f)) + { + float angNA = dot3F4( n, angVelA ); + float angNB = dot3F4( n, angVelB ); + + angVelA -= (angNA*0.1f)*n; + angVelB -= (angNB*0.1f)*n; + } + } + } + + +struct SolveTask// : public ThreadPool::Task +{ + SolveTask(btAlignedObjectArray& bodies, btAlignedObjectArray& shapes, btAlignedObjectArray& constraints, + int start, int nConstraints) + : m_bodies( bodies ), m_shapes( shapes ), m_constraints( constraints ), m_start( start ), m_nConstraints( nConstraints ), + m_solveFriction( true ){} + + u16 getType(){ return 0; } + + void run(int tIdx) + { + + + for(int ic=0; ic( m_constraints[i], (float4&)bodyA.m_pos, (float4&)bodyA.m_linVel, (float4&)bodyA.m_angVel, bodyA.m_invMass, (const Matrix3x3 &)m_shapes[aIdx].m_invInertiaWorld, + (float4&)bodyB.m_pos, (float4&)bodyB.m_linVel, (float4&)bodyB.m_angVel, bodyB.m_invMass, (const Matrix3x3 &)m_shapes[bIdx].m_invInertiaWorld, + maxRambdaDt, minRambdaDt ); + + } + else + { + float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; + float minRambdaDt[4] = {0.f,0.f,0.f,0.f}; + + float sum = 0; + for(int j=0; j<4; j++) + { + sum +=m_constraints[i].m_appliedRambdaDt[j]; + } + frictionCoeff = 0.7f; + for(int j=0; j<4; j++) + { + maxRambdaDt[j] = frictionCoeff*sum; + minRambdaDt[j] = -maxRambdaDt[j]; + } + + solveFriction( m_constraints[i], (float4&)bodyA.m_pos, (float4&)bodyA.m_linVel, (float4&)bodyA.m_angVel, bodyA.m_invMass,(const Matrix3x3 &) m_shapes[aIdx].m_invInertiaWorld, + (float4&)bodyB.m_pos, (float4&)bodyB.m_linVel, (float4&)bodyB.m_angVel, bodyB.m_invMass,(const Matrix3x3 &) m_shapes[bIdx].m_invInertiaWorld, + maxRambdaDt, minRambdaDt ); + + } + } + + + } + + btAlignedObjectArray& m_bodies; + btAlignedObjectArray& m_shapes; + btAlignedObjectArray& m_constraints; + int m_start; + int m_nConstraints; + bool m_solveFriction; +}; + + +void Solver::solveContactConstraintHost( btOpenCLArray* bodyBuf, btOpenCLArray* shapeBuf, + btOpenCLArray* constraint, void* additionalData, int n ,int maxNumBatches) +{ + + btAlignedObjectArray bodyNative; + bodyBuf->copyToHost(bodyNative); + btAlignedObjectArray shapeNative; + shapeBuf->copyToHost(shapeNative); + btAlignedObjectArray constraintNative; + constraint->copyToHost(constraintNative); + + for(int iter=0; itercopyFromHost(bodyNative); + shapeBuf->copyFromHost(shapeNative); + constraint->copyFromHost(constraintNative); + + +} + +void Solver::solveContactConstraint( const btOpenCLArray* bodyBuf, const btOpenCLArray* shapeBuf, + btOpenCLArray* constraint, void* additionalData, int n ,int maxNumBatches) +{ + + + btInt4 cdata = btMakeInt4( n, 0, 0, 0 ); + { + + const int nn = N_SPLIT*N_SPLIT; + + cdata.x = 0; + cdata.y = maxNumBatches;//250; + + + int numWorkItems = 64*nn/N_BATCHES; +#ifdef DEBUG_ME + SolverDebugInfo* debugInfo = new SolverDebugInfo[numWorkItems]; + adl::btOpenCLArray gpuDebugInfo(data->m_device,numWorkItems); +#endif + + + + { + + BT_PROFILE("m_batchSolveKernel iterations"); + for(int iter=0; itergetBufferCL() ), + btBufferInfoCL( shapeBuf->getBufferCL() ), + btBufferInfoCL( constraint->getBufferCL() ), + btBufferInfoCL( m_numConstraints->getBufferCL() ), + btBufferInfoCL( m_offsets->getBufferCL() ) +#ifdef DEBUG_ME + , btBufferInfoCL(&gpuDebugInfo) +#endif + }; + + + + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) ); + //launcher.setConst( cdata.x ); + launcher.setConst( cdata.y ); + launcher.setConst( cdata.z ); + launcher.setConst( cdata.w ); + launcher.launch1D( numWorkItems, 64 ); + + +#else + const char* fileName = "m_batchSolveKernel.bin"; + FILE* f = fopen(fileName,"rb"); + if (f) + { + int sizeInBytes=0; + if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET)) + { + printf("error, cannot get file size\n"); + exit(0); + } + + unsigned char* buf = (unsigned char*) malloc(sizeInBytes); + fread(buf,sizeInBytes,1,f); + int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes,m_context); + int num = *(int*)&buf[serializedBytes]; + + launcher.launch1D( num); + + //this clFinish is for testing on errors + clFinish(m_queue); + } + +#endif + + +#ifdef DEBUG_ME + clFinish(m_queue); + gpuDebugInfo.read(debugInfo,numWorkItems); + clFinish(m_queue); + for (int i=0;i0) + { + printf("debugInfo[i].m_valInt2 = %d\n",i,debugInfo[i].m_valInt2); + } + + if (debugInfo[i].m_valInt3>0) + { + printf("debugInfo[i].m_valInt3 = %d\n",i,debugInfo[i].m_valInt3); + } + } +#endif //DEBUG_ME + + + } + } + + clFinish(m_queue); + + + } + + cdata.x = 1; + bool applyFriction=true; + if (applyFriction) + { + BT_PROFILE("m_batchSolveKernel iterations2"); + for(int iter=0; itergetBufferCL() ), + btBufferInfoCL( shapeBuf->getBufferCL() ), + btBufferInfoCL( constraint->getBufferCL() ), + btBufferInfoCL( m_numConstraints->getBufferCL() ), + btBufferInfoCL( m_offsets->getBufferCL() ) +#ifdef DEBUG_ME + ,btBufferInfoCL(&gpuDebugInfo) +#endif //DEBUG_ME + }; + btLauncherCL launcher( m_queue, m_solveFrictionKernel ); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) ); + //launcher.setConst( cdata.x ); + launcher.setConst( cdata.y ); + launcher.setConst( cdata.z ); + launcher.setConst( cdata.w ); + + launcher.launch1D( 64*nn/N_BATCHES, 64 ); + } + } + clFinish(m_queue); + + } +#ifdef DEBUG_ME + delete[] debugInfo; +#endif //DEBUG_ME + } + + +} + +void Solver::convertToConstraints( const btOpenCLArray* bodyBuf, + const btOpenCLArray* shapeBuf, + btOpenCLArray* contactsIn, btOpenCLArray* contactCOut, void* additionalData, + int nContacts, const ConstraintCfg& cfg ) +{ + btOpenCLArray* constraintNative =0; + + struct CB + { + int m_nContacts; + float m_dt; + float m_positionDrift; + float m_positionConstraintCoeff; + }; + + { + BT_PROFILE("m_contactToConstraintKernel"); + CB cdata; + cdata.m_nContacts = nContacts; + cdata.m_dt = cfg.m_dt; + cdata.m_positionDrift = cfg.m_positionDrift; + cdata.m_positionConstraintCoeff = cfg.m_positionConstraintCoeff; + + + btBufferInfoCL bInfo[] = { btBufferInfoCL( contactsIn->getBufferCL() ), btBufferInfoCL( bodyBuf->getBufferCL() ), btBufferInfoCL( shapeBuf->getBufferCL()), + btBufferInfoCL( contactCOut->getBufferCL() )}; + btLauncherCL launcher( m_queue, m_contactToConstraintKernel ); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) ); + //launcher.setConst( cdata ); + + launcher.setConst(cdata.m_nContacts); + launcher.setConst(cdata.m_dt); + launcher.setConst(cdata.m_positionDrift); + launcher.setConst(cdata.m_positionConstraintCoeff); + + launcher.launch1D( nContacts, 64 ); + clFinish(m_queue); + + } + + contactCOut->resize(nContacts); +} + +/* +void Solver::sortContacts( const btOpenCLArray* bodyBuf, + btOpenCLArray* contactsIn, void* additionalData, + int nContacts, const Solver::ConstraintCfg& cfg ) +{ + + + + const int sortAlignment = 512; // todo. get this out of sort + if( cfg.m_enableParallelSolve ) + { + + + int sortSize = NEXTMULTIPLEOF( nContacts, sortAlignment ); + + btOpenCLArray* countsNative = m_numConstraints;//BufferUtils::map( data->m_device, &countsHost ); + btOpenCLArray* offsetsNative = m_offsets;//BufferUtils::map( data->m_device, &offsetsHost ); + + { // 2. set cell idx + struct CB + { + int m_nContacts; + int m_staticIdx; + float m_scale; + int m_nSplit; + }; + + btAssert( sortSize%64 == 0 ); + CB cdata; + cdata.m_nContacts = nContacts; + cdata.m_staticIdx = cfg.m_staticIdx; + cdata.m_scale = 1.f/(N_OBJ_PER_SPLIT*cfg.m_averageExtent); + cdata.m_nSplit = N_SPLIT; + + + btBufferInfoCL bInfo[] = { btBufferInfoCL( contactsIn->getBufferCL() ), btBufferInfoCL( bodyBuf->getBufferCL() ), btBufferInfoCL( m_sortDataBuffer->getBufferCL() ) }; + btLauncherCL launcher( m_queue, m_setSortDataKernel ); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) ); + launcher.setConst( cdata ); + launcher.launch1D( sortSize, 64 ); + } + + { // 3. sort by cell idx + int n = N_SPLIT*N_SPLIT; + int sortBit = 32; + //if( n <= 0xffff ) sortBit = 16; + //if( n <= 0xff ) sortBit = 8; + m_sort32->execute(*m_sortDataBuffer,sortSize); + } + { // 4. find entries + m_search->execute( *m_sortDataBuffer, nContacts, *countsNative, N_SPLIT*N_SPLIT, btBoundSearchCL::COUNT); + + m_scan->execute( *countsNative, *offsetsNative, N_SPLIT*N_SPLIT ); + } + + { // 5. sort constraints by cellIdx + // todo. preallocate this +// btAssert( contactsIn->getType() == TYPE_HOST ); +// btOpenCLArray* out = BufferUtils::map( data->m_device, contactsIn ); // copying contacts to this buffer + + { + + + btInt4 cdata; cdata.x = nContacts; + btBufferInfoCL bInfo[] = { btBufferInfoCL( contactsIn->getBufferCL() ), btBufferInfoCL( m_contactBuffer->getBufferCL() ), btBufferInfoCL( m_sortDataBuffer->getBufferCL() ) }; + btLauncherCL launcher( m_queue, m_reorderContactKernel ); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) ); + launcher.setConst( cdata ); + launcher.launch1D( nContacts, 64 ); + } +// BufferUtils::unmap( out, contactsIn, nContacts ); + } + } + + +} + +*/ + +void Solver::batchContacts( btOpenCLArray* contacts, int nContacts, btOpenCLArray* nNative, btOpenCLArray* offsetsNative, int staticIdx ) +{ + + { + BT_PROFILE("batch generation"); + + btInt4 cdata; + cdata.x = nContacts; + cdata.y = 0; + cdata.z = staticIdx; + + int numWorkItems = 64*N_SPLIT*N_SPLIT; +#ifdef BATCH_DEBUG + SolverDebugInfo* debugInfo = new SolverDebugInfo[numWorkItems]; + adl::btOpenCLArray gpuDebugInfo(data->m_device,numWorkItems); + memset(debugInfo,0,sizeof(SolverDebugInfo)*numWorkItems); + gpuDebugInfo.write(debugInfo,numWorkItems); +#endif + + + btBufferInfoCL bInfo[] = { + btBufferInfoCL( contacts->getBufferCL() ), + btBufferInfoCL( m_contactBuffer->getBufferCL() ), + btBufferInfoCL( nNative->getBufferCL() ), + btBufferInfoCL( offsetsNative->getBufferCL() ) +#ifdef BATCH_DEBUG + , btBufferInfoCL(&gpuDebugInfo) +#endif + }; + + + { + BT_PROFILE("batchingKernel"); + btLauncherCL launcher( m_queue, m_batchingKernel); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) ); + //launcher.setConst( cdata ); + launcher.setConst(staticIdx); + + launcher.launch1D( numWorkItems, 64 ); + clFinish(m_queue); + } + +#ifdef BATCH_DEBUG + aaaa + btContact4* hostContacts = new btContact4[nContacts]; + m_contactBuffer->read(hostContacts,nContacts); + clFinish(m_queue); + + gpuDebugInfo.read(debugInfo,numWorkItems); + clFinish(m_queue); + + for (int i=0;i0) + { + printf("catch\n"); + } + if (debugInfo[i].m_valInt2>0) + { + printf("catch22\n"); + } + + if (debugInfo[i].m_valInt3>0) + { + printf("catch666\n"); + } + + if (debugInfo[i].m_valInt4>0) + { + printf("catch777\n"); + } + } + delete[] debugInfo; +#endif //BATCH_DEBUG + + } + +// copy buffer to buffer + btAssert(m_contactBuffer->size()==nContacts); + //contacts->copyFromOpenCLArray( *m_contactBuffer); + //clFinish(m_queue);//needed? + + +} + diff --git a/opencl/gpu_rigidbody/host/Solver.h b/opencl/gpu_rigidbody/host/Solver.h new file mode 100644 index 000000000..be1a3ab55 --- /dev/null +++ b/opencl/gpu_rigidbody/host/Solver.h @@ -0,0 +1,168 @@ +/* +Copyright (c) 2012 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Takahiro Harada + + +#ifndef __ADL_SOLVER_H +#define __ADL_SOLVER_H + +#include "../../parallel_primitives/host/btOpenCLArray.h" +#include "../host/btGpuConstraint4.h" + + +//#include +//#include "AdlRigidBody.h" +#include "../../gpu_sat/host/btRigidBodyCL.h" +#include "../../gpu_sat/host/btContact4.h" +#include "../Stubs/AdlMath.h" +#include "../Stubs/AdlMatrix3x3.h" + + +//#include "AdlPhysics/Batching/Batching.h> + + +#define MYF4 float4 +#define MAKE_MYF4 make_float4 + +//#define MYF4 float4sse +//#define MAKE_MYF4 make_float4sse + +#include "../host/btGpuConstraint4.h" +#include "../../parallel_primitives/host/btPrefixScanCL.h" +#include "../../parallel_primitives/host/btRadixSort32CL.h" +#include "../../parallel_primitives/host/btBoundSearchCL.h" + +#include "../../basic_initialize/btOpenCLUtils.h" + + + +class SolverBase +{ + public: + + + struct ConstraintData + { + ConstraintData(): m_b(0.f), m_appliedRambdaDt(0.f) {} + + float4 m_linear; // have to be normalized + float4 m_angular0; + float4 m_angular1; + float m_jacCoeffInv; + float m_b; + float m_appliedRambdaDt; + + unsigned int m_bodyAPtr; + unsigned int m_bodyBPtr; + + bool isInvalid() const { return ((unsigned int)m_bodyAPtr+(unsigned int)m_bodyBPtr) == 0; } + float getFrictionCoeff() const { return m_linear.w; } + void setFrictionCoeff(float coeff) { m_linear.w = coeff; } + }; + + struct ConstraintCfg + { + ConstraintCfg( float dt = 0.f ): m_positionDrift( 0.005f ), m_positionConstraintCoeff( 0.2f ), m_dt(dt), m_staticIdx(-1) {} + + float m_positionDrift; + float m_positionConstraintCoeff; + float m_dt; + bool m_enableParallelSolve; + float m_averageExtent; + int m_staticIdx; + }; + + + + enum + { + N_SPLIT = 16, + N_BATCHES = 4, + N_OBJ_PER_SPLIT = 10, + N_TASKS_PER_BATCH = N_SPLIT*N_SPLIT, + }; +}; + +class Solver : public SolverBase +{ + public: + + cl_context m_context; + cl_device_id m_device; + cl_command_queue m_queue; + + + btOpenCLArray* m_numConstraints; + btOpenCLArray* m_offsets; + + + int m_nIterations; + cl_kernel m_batchingKernel; + cl_kernel m_solveContactKernel; + cl_kernel m_solveFrictionKernel; + cl_kernel m_contactToConstraintKernel; + cl_kernel m_setSortDataKernel; + cl_kernel m_reorderContactKernel; + cl_kernel m_copyConstraintKernel; + + class btRadixSort32CL* m_sort32; + class btBoundSearchCL* m_search; + class btPrefixScanCL* m_scan; + + btOpenCLArray* m_sortDataBuffer; + btOpenCLArray* m_contactBuffer; + + enum + { + DYNAMIC_CONTACT_ALLOCATION_THRESHOLD = 2000000, + }; + + + + + Solver(cl_context ctx, cl_device_id device, cl_command_queue queue, int pairCapacity); + + virtual ~Solver(); + + /* void reorderConvertToConstraints( const btOpenCLArray* bodyBuf, + const btOpenCLArray* shapeBuf, + btOpenCLArray* contactsIn, btOpenCLArray* contactCOut, void* additionalData, + int nContacts, const ConstraintCfg& cfg ); + */ + + void solveContactConstraint( const btOpenCLArray* bodyBuf, const btOpenCLArray* inertiaBuf, + btOpenCLArray* constraint, void* additionalData, int n ,int maxNumBatches); + + void solveContactConstraintHost( btOpenCLArray* bodyBuf, btOpenCLArray* shapeBuf, + btOpenCLArray* constraint, void* additionalData, int n ,int maxNumBatches); + + + void convertToConstraints( const btOpenCLArray* bodyBuf, + const btOpenCLArray* shapeBuf, + btOpenCLArray* contactsIn, btOpenCLArray* contactCOut, void* additionalData, + int nContacts, const ConstraintCfg& cfg ); + + /* void sortContacts( const btOpenCLArray* bodyBuf, + btOpenCLArray* contactsIn, void* additionalData, + int nContacts, const ConstraintCfg& cfg ); + */ + void batchContacts( btOpenCLArray* contacts, int nContacts, btOpenCLArray* n, btOpenCLArray* offsets, int staticIdx ); + +}; + + +#undef MYF4 +#undef MAKE_MYF4 + +#endif //__ADL_SOLVER_H diff --git a/opencl/gpu_rigidbody/host/btGpuBatchingPgsSolver.cpp b/opencl/gpu_rigidbody/host/btGpuBatchingPgsSolver.cpp new file mode 100644 index 000000000..51948ea51 --- /dev/null +++ b/opencl/gpu_rigidbody/host/btGpuBatchingPgsSolver.cpp @@ -0,0 +1,824 @@ + + +#include "btGpuBatchingPgsSolver.h" +#include "../../parallel_primitives/host/btRadixSort32CL.h" +#include "BulletCommon/btQuickprof.h" +#include "../../parallel_primitives/host/btLauncherCL.h" +#include "../../parallel_primitives/host/btBoundSearchCL.h" +#include "../../parallel_primitives/host/btPrefixScanCL.h" +#include +#include "../../basic_initialize/btOpenCLUtils.h" +#include "../host/btConfig.h" +#include "../Stubs/Solver.h" + + +#define SOLVER_SETUP_KERNEL_PATH "opencl/gpu_rigidbody/kernels/solverSetup.cl" +#define SOLVER_SETUP2_KERNEL_PATH "opencl/gpu_rigidbody/kernels/solverSetup2.cl" +#define SOLVER_CONTACT_KERNEL_PATH "opencl/gpu_rigidbody/kernels/solveContact.cl" +#define SOLVER_FRICTION_KERNEL_PATH "opencl/gpu_rigidbody/kernels/solveFriction.cl" +#define BATCHING_PATH "opencl/gpu_rigidbody/kernels/batchingKernels.cl" + +#include "../kernels/solverSetup.h" +#include "../kernels/solverSetup2.h" +#include "../kernels/solveContact.h" +#include "../kernels/solveFriction.h" +#include "../kernels/batchingKernels.h" + + +#define BTNEXTMULTIPLEOF(num, alignment) (((num)/(alignment) + (((num)%(alignment)==0)?0:1))*(alignment)) +enum +{ + BT_SOLVER_N_SPLIT = 16, + BT_SOLVER_N_BATCHES = 4, + BT_SOLVER_N_OBJ_PER_SPLIT = 10, + BT_SOLVER_N_TASKS_PER_BATCH = BT_SOLVER_N_SPLIT*BT_SOLVER_N_SPLIT, +}; + + +bool gpuBatchContacts = true; +bool gpuSolveConstraint = true; + + +struct btGpuBatchingPgsSolverInternalData +{ + cl_context m_context; + cl_device_id m_device; + cl_command_queue m_queue; + int m_pairCapacity; + int m_nIterations; + + btOpenCLArray* m_contactCGPU; + + btOpenCLArray* m_numConstraints; + btOpenCLArray* m_offsets; + + Solver* m_solverGPU; + + cl_kernel m_batchingKernel; + cl_kernel m_solveContactKernel; + cl_kernel m_solveFrictionKernel; + cl_kernel m_contactToConstraintKernel; + cl_kernel m_setSortDataKernel; + cl_kernel m_reorderContactKernel; + cl_kernel m_copyConstraintKernel; + + class btRadixSort32CL* m_sort32; + class btBoundSearchCL* m_search; + class btPrefixScanCL* m_scan; + + btOpenCLArray* m_sortDataBuffer; + btOpenCLArray* m_contactBuffer; + + btOpenCLArray* m_bodyBufferGPU; + btOpenCLArray* m_inertiaBufferGPU; + btOpenCLArray* m_pBufContactOutGPU; +}; + + + +btGpuBatchingPgsSolver::btGpuBatchingPgsSolver(cl_context ctx,cl_device_id device, cl_command_queue q,int pairCapacity) +{ + m_data = new btGpuBatchingPgsSolverInternalData; + m_data->m_context = ctx; + m_data->m_device = device; + m_data->m_queue = q; + m_data->m_pairCapacity = pairCapacity; + m_data->m_nIterations = 4; + + m_data->m_bodyBufferGPU = new btOpenCLArray(ctx,q); + m_data->m_inertiaBufferGPU = new btOpenCLArray(ctx,q); + m_data->m_pBufContactOutGPU = new btOpenCLArray(ctx,q); + + m_data->m_solverGPU = new Solver(ctx,device,q,512*1024); + + m_data->m_sort32 = new btRadixSort32CL(ctx,device,m_data->m_queue); + m_data->m_scan = new btPrefixScanCL(ctx,device,m_data->m_queue,BT_SOLVER_N_SPLIT*BT_SOLVER_N_SPLIT); + m_data->m_search = new btBoundSearchCL(ctx,device,m_data->m_queue,BT_SOLVER_N_SPLIT*BT_SOLVER_N_SPLIT); + + const int sortSize = BTNEXTMULTIPLEOF( pairCapacity, 512 ); + + m_data->m_sortDataBuffer = new btOpenCLArray(ctx,m_data->m_queue,sortSize); + m_data->m_contactBuffer = new btOpenCLArray(ctx,m_data->m_queue); + + m_data->m_numConstraints = new btOpenCLArray(ctx,m_data->m_queue,BT_SOLVER_N_SPLIT*BT_SOLVER_N_SPLIT ); + m_data->m_numConstraints->resize(BT_SOLVER_N_SPLIT*BT_SOLVER_N_SPLIT); + + m_data->m_contactCGPU = new btOpenCLArray(ctx,q,pairCapacity); + + m_data->m_offsets = new btOpenCLArray( ctx,m_data->m_queue, BT_SOLVER_N_SPLIT*BT_SOLVER_N_SPLIT ); + m_data->m_offsets->resize(BT_SOLVER_N_SPLIT*BT_SOLVER_N_SPLIT); + const char* additionalMacros = ""; + const char* srcFileNameForCaching=""; + + + + cl_int pErrNum; + const char* batchKernelSource = batchingKernelsCL; + const char* solverSetupSource = solverSetupCL; + const char* solverSetup2Source = solverSetup2CL; + const char* solveContactSource = solveContactCL; + const char* solveFrictionSource = solveFrictionCL; + + + + { + + cl_program solveContactProg= btOpenCLUtils::compileCLProgramFromString( ctx, device, solveContactSource, &pErrNum,additionalMacros, SOLVER_CONTACT_KERNEL_PATH); + btAssert(solveContactProg); + + cl_program solveFrictionProg= btOpenCLUtils::compileCLProgramFromString( ctx, device, solveFrictionSource, &pErrNum,additionalMacros, SOLVER_FRICTION_KERNEL_PATH); + btAssert(solveFrictionProg); + + cl_program solverSetup2Prog= btOpenCLUtils::compileCLProgramFromString( ctx, device, solverSetup2Source, &pErrNum,additionalMacros, SOLVER_SETUP2_KERNEL_PATH); + btAssert(solverSetup2Prog); + + + cl_program solverSetupProg= btOpenCLUtils::compileCLProgramFromString( ctx, device, solverSetupSource, &pErrNum,additionalMacros, SOLVER_SETUP_KERNEL_PATH); + btAssert(solverSetupProg); + + + m_data->m_solveFrictionKernel= btOpenCLUtils::compileCLKernelFromString( ctx, device, solveFrictionSource, "BatchSolveKernelFriction", &pErrNum, solveFrictionProg,additionalMacros ); + btAssert(m_data->m_solveFrictionKernel); + + m_data->m_solveContactKernel= btOpenCLUtils::compileCLKernelFromString( ctx, device, solveContactSource, "BatchSolveKernelContact", &pErrNum, solveContactProg,additionalMacros ); + btAssert(m_data->m_solveContactKernel); + + m_data->m_contactToConstraintKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, solverSetupSource, "ContactToConstraintKernel", &pErrNum, solverSetupProg,additionalMacros ); + btAssert(m_data->m_contactToConstraintKernel); + + m_data->m_setSortDataKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "SetSortDataKernel", &pErrNum, solverSetup2Prog,additionalMacros ); + btAssert(m_data->m_setSortDataKernel); + + m_data->m_reorderContactKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "ReorderContactKernel", &pErrNum, solverSetup2Prog,additionalMacros ); + btAssert(m_data->m_reorderContactKernel); + + + m_data->m_copyConstraintKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "CopyConstraintKernel", &pErrNum, solverSetup2Prog,additionalMacros ); + btAssert(m_data->m_copyConstraintKernel); + + } + + { + cl_program batchingProg = btOpenCLUtils::compileCLProgramFromString( ctx, device, batchKernelSource, &pErrNum,additionalMacros, BATCHING_PATH); + btAssert(batchingProg); + + m_data->m_batchingKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, batchKernelSource, "CreateBatches", &pErrNum, batchingProg,additionalMacros ); + btAssert(m_data->m_batchingKernel); + } + + + + + + + + +} + +btGpuBatchingPgsSolver::~btGpuBatchingPgsSolver() +{ + delete m_data->m_sortDataBuffer; + delete m_data->m_contactBuffer; + + delete m_data->m_sort32; + delete m_data->m_scan; + delete m_data->m_search; + + + clReleaseKernel(m_data->m_batchingKernel); + + clReleaseKernel( m_data->m_solveContactKernel); + clReleaseKernel( m_data->m_solveFrictionKernel); + + clReleaseKernel( m_data->m_contactToConstraintKernel); + clReleaseKernel( m_data->m_setSortDataKernel); + clReleaseKernel( m_data->m_reorderContactKernel); + clReleaseKernel( m_data->m_copyConstraintKernel); + + delete m_data; +} + + + +struct btConstraintCfg +{ + btConstraintCfg( float dt = 0.f ): m_positionDrift( 0.005f ), m_positionConstraintCoeff( 0.2f ), m_dt(dt), m_staticIdx(-1) {} + + float m_positionDrift; + float m_positionConstraintCoeff; + float m_dt; + bool m_enableParallelSolve; + float m_averageExtent; + int m_staticIdx; +}; + + + + + +void btGpuBatchingPgsSolver::solveContactConstraint( const btOpenCLArray* bodyBuf, const btOpenCLArray* shapeBuf, + btOpenCLArray* constraint, void* additionalData, int n ,int maxNumBatches,int numIterations) +{ + + + btInt4 cdata = btMakeInt4( n, 0, 0, 0 ); + { + + const int nn = BT_SOLVER_N_SPLIT*BT_SOLVER_N_SPLIT; + + cdata.x = 0; + cdata.y = maxNumBatches;//250; + + + int numWorkItems = 64*nn/BT_SOLVER_N_BATCHES; +#ifdef DEBUG_ME + SolverDebugInfo* debugInfo = new SolverDebugInfo[numWorkItems]; + adl::btOpenCLArray gpuDebugInfo(data->m_device,numWorkItems); +#endif + + + + { + + BT_PROFILE("m_batchSolveKernel iterations"); + for(int iter=0; iterm_queue, m_data->m_solveContactKernel ); +#if 1 + + btBufferInfoCL bInfo[] = { + + btBufferInfoCL( bodyBuf->getBufferCL() ), + btBufferInfoCL( shapeBuf->getBufferCL() ), + btBufferInfoCL( constraint->getBufferCL() ), + btBufferInfoCL( m_data->m_numConstraints->getBufferCL() ), + btBufferInfoCL( m_data->m_offsets->getBufferCL() ) +#ifdef DEBUG_ME + , btBufferInfoCL(&gpuDebugInfo) +#endif + }; + + + + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) ); + //launcher.setConst( cdata.x ); + launcher.setConst( cdata.y ); + launcher.setConst( cdata.z ); + launcher.setConst( cdata.w ); + launcher.launch1D( numWorkItems, 64 ); + + +#else + const char* fileName = "m_batchSolveKernel.bin"; + FILE* f = fopen(fileName,"rb"); + if (f) + { + int sizeInBytes=0; + if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET)) + { + printf("error, cannot get file size\n"); + exit(0); + } + + unsigned char* buf = (unsigned char*) malloc(sizeInBytes); + fread(buf,sizeInBytes,1,f); + int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes,m_context); + int num = *(int*)&buf[serializedBytes]; + + launcher.launch1D( num); + + //this clFinish is for testing on errors + clFinish(m_queue); + } + +#endif + + +#ifdef DEBUG_ME + clFinish(m_queue); + gpuDebugInfo.read(debugInfo,numWorkItems); + clFinish(m_queue); + for (int i=0;i0) + { + printf("debugInfo[i].m_valInt2 = %d\n",i,debugInfo[i].m_valInt2); + } + + if (debugInfo[i].m_valInt3>0) + { + printf("debugInfo[i].m_valInt3 = %d\n",i,debugInfo[i].m_valInt3); + } + } +#endif //DEBUG_ME + + + } + } + + clFinish(m_data->m_queue); + + + } + + cdata.x = 1; + bool applyFriction=true; + if (applyFriction) + { + BT_PROFILE("m_batchSolveKernel iterations2"); + for(int iter=0; itergetBufferCL() ), + btBufferInfoCL( shapeBuf->getBufferCL() ), + btBufferInfoCL( constraint->getBufferCL() ), + btBufferInfoCL( m_data->m_numConstraints->getBufferCL() ), + btBufferInfoCL( m_data->m_offsets->getBufferCL() ) +#ifdef DEBUG_ME + ,btBufferInfoCL(&gpuDebugInfo) +#endif //DEBUG_ME + }; + btLauncherCL launcher( m_data->m_queue, m_data->m_solveFrictionKernel ); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) ); + //launcher.setConst( cdata.x ); + launcher.setConst( cdata.y ); + launcher.setConst( cdata.z ); + launcher.setConst( cdata.w ); + + launcher.launch1D( 64*nn/BT_SOLVER_N_BATCHES, 64 ); + } + } + clFinish(m_data->m_queue); + + } +#ifdef DEBUG_ME + delete[] debugInfo; +#endif //DEBUG_ME + } + + +} + + + + + + + + + + + + + + +void btGpuBatchingPgsSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem inertiaBuf, int numContacts, cl_mem contactBuf, const btConfig& config) +{ + m_data->m_bodyBufferGPU->setFromOpenCLBuffer(bodyBuf,numBodies); + m_data->m_inertiaBufferGPU->setFromOpenCLBuffer(inertiaBuf,numBodies); + m_data->m_pBufContactOutGPU->setFromOpenCLBuffer(contactBuf,numContacts); + + int nContactOut = m_data->m_pBufContactOutGPU->size(); + + bool useSolver = true; + + if (useSolver) + { + float dt=1./60.; + btConstraintCfg csCfg( dt ); + csCfg.m_enableParallelSolve = true; + csCfg.m_averageExtent = 0.2f;//@TODO m_averageObjExtent; + csCfg.m_staticIdx = -1;//m_static0Index;//m_planeBodyIndex; + + btOpenCLArray* contactsIn = m_data->m_pBufContactOutGPU; + btOpenCLArray* bodyBuf = m_data->m_bodyBufferGPU; + + void* additionalData = 0;//m_data->m_frictionCGPU; + const btOpenCLArray* shapeBuf = m_data->m_inertiaBufferGPU; + btOpenCLArray* contactConstraintOut = m_data->m_contactCGPU; + int nContacts = nContactOut; + + + int maxNumBatches = 0; + + { + + if( m_data->m_solverGPU->m_contactBuffer) + { + m_data->m_solverGPU->m_contactBuffer->resize(nContacts); + } + + if( m_data->m_solverGPU->m_contactBuffer == 0 ) + { + m_data->m_solverGPU->m_contactBuffer = new btOpenCLArray(m_data->m_context,m_data->m_queue, nContacts ); + m_data->m_solverGPU->m_contactBuffer->resize(nContacts); + } + clFinish(m_data->m_queue); + + + + { + BT_PROFILE("batching"); + //@todo: just reserve it, without copy of original contact (unless we use warmstarting) + + + btOpenCLArray* contactNative = contactsIn; + const btOpenCLArray* bodyNative = bodyBuf; + + + { + + //btOpenCLArray* bodyNative = btOpenCLArrayUtils::map( data->m_device, bodyBuf ); + //btOpenCLArray* contactNative = btOpenCLArrayUtils::map( data->m_device, contactsIn ); + + const int sortAlignment = 512; // todo. get this out of sort + if( csCfg.m_enableParallelSolve ) + { + + + int sortSize = NEXTMULTIPLEOF( nContacts, sortAlignment ); + + btOpenCLArray* countsNative = m_data->m_solverGPU->m_numConstraints; + btOpenCLArray* offsetsNative = m_data->m_solverGPU->m_offsets; + + { // 2. set cell idx + BT_PROFILE("GPU set cell idx"); + struct CB + { + int m_nContacts; + int m_staticIdx; + float m_scale; + int m_nSplit; + }; + + btAssert( sortSize%64 == 0 ); + CB cdata; + cdata.m_nContacts = nContacts; + cdata.m_staticIdx = csCfg.m_staticIdx; + cdata.m_scale = 1.f/(BT_SOLVER_N_OBJ_PER_SPLIT*csCfg.m_averageExtent); + cdata.m_nSplit = BT_SOLVER_N_SPLIT; + + m_data->m_solverGPU->m_sortDataBuffer->resize(nContacts); + + + btBufferInfoCL bInfo[] = { btBufferInfoCL( contactNative->getBufferCL() ), btBufferInfoCL( bodyBuf->getBufferCL()), btBufferInfoCL( m_data->m_solverGPU->m_sortDataBuffer->getBufferCL()) }; + btLauncherCL launcher(m_data->m_queue, m_data->m_solverGPU->m_setSortDataKernel ); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) ); + launcher.setConst( cdata.m_nContacts ); + launcher.setConst( cdata.m_scale ); + launcher.setConst(cdata.m_nSplit); + + + launcher.launch1D( sortSize, 64 ); + } + + + bool gpuRadixSort=true; + if (gpuRadixSort) + { // 3. sort by cell idx + BT_PROFILE("gpuRadixSort"); + int n = BT_SOLVER_N_SPLIT*BT_SOLVER_N_SPLIT; + int sortBit = 32; + //if( n <= 0xffff ) sortBit = 16; + //if( n <= 0xff ) sortBit = 8; + //adl::RadixSort::execute( data->m_sort, *data->m_sortDataBuffer, sortSize ); + //adl::RadixSort32::execute( data->m_sort32, *data->m_sortDataBuffer, sortSize ); + btOpenCLArray& keyValuesInOut = *(m_data->m_solverGPU->m_sortDataBuffer); + this->m_data->m_solverGPU->m_sort32->execute(keyValuesInOut); + + /*btAlignedObjectArray hostValues; + keyValuesInOut.copyToHost(hostValues); + printf("hostValues.size=%d\n",hostValues.size()); + */ + + } + + { + // 4. find entries + BT_PROFILE("gpuBoundSearch"); + + m_data->m_solverGPU->m_search->execute(*m_data->m_solverGPU->m_sortDataBuffer,nContacts,*countsNative, + BT_SOLVER_N_SPLIT*BT_SOLVER_N_SPLIT,btBoundSearchCL::COUNT); + + + //adl::BoundSearch::execute( data->m_search, *data->m_sortDataBuffer, nContacts, *countsNative, + // BT_SOLVER_N_SPLIT*BT_SOLVER_N_SPLIT, adl::BoundSearchBase::COUNT ); + + //unsigned int sum; + m_data->m_solverGPU->m_scan->execute(*countsNative,*offsetsNative, BT_SOLVER_N_SPLIT*BT_SOLVER_N_SPLIT);//,&sum ); + //printf("sum = %d\n",sum); + } + + + + + if (nContacts) + { // 5. sort constraints by cellIdx + { + BT_PROFILE("gpu m_reorderContactKernel"); + + btInt4 cdata; + cdata.x = nContacts; + + btBufferInfoCL bInfo[] = { btBufferInfoCL( contactNative->getBufferCL() ), btBufferInfoCL( m_data->m_solverGPU->m_contactBuffer->getBufferCL()) + , btBufferInfoCL( m_data->m_solverGPU->m_sortDataBuffer->getBufferCL()) }; + btLauncherCL launcher(m_data->m_queue,m_data->m_solverGPU->m_reorderContactKernel); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) ); + launcher.setConst( cdata ); + launcher.launch1D( nContacts, 64 ); + } + } + + + + + } + + } + + clFinish(m_data->m_queue); + + if (nContacts) + { + BT_PROFILE("gpu m_copyConstraintKernel"); + + btInt4 cdata; cdata.x = nContacts; + btBufferInfoCL bInfo[] = { btBufferInfoCL( m_data->m_solverGPU->m_contactBuffer->getBufferCL() ), btBufferInfoCL( contactNative->getBufferCL() ) }; + btLauncherCL launcher(m_data->m_queue, m_data->m_solverGPU->m_copyConstraintKernel ); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) ); + launcher.setConst( cdata ); + launcher.launch1D( nContacts, 64 ); + clFinish(m_data->m_queue); + } + + + + bool compareGPU = false; + if (nContacts) + { + if (gpuBatchContacts) + { + maxNumBatches=250;//for now + BT_PROFILE("gpu batchContacts"); + m_data->m_solverGPU->batchContacts( (btOpenCLArray*)contactNative, nContacts, m_data->m_solverGPU->m_numConstraints, m_data->m_solverGPU->m_offsets, csCfg.m_staticIdx ); + } else + { + BT_PROFILE("cpu batchContacts"); + btAlignedObjectArray cpuContacts; + btOpenCLArray* contactsIn = m_data->m_pBufContactOutGPU; + contactsIn->copyToHost(cpuContacts); + + btOpenCLArray* countsNative = m_data->m_solverGPU->m_numConstraints; + btOpenCLArray* offsetsNative = m_data->m_solverGPU->m_offsets; + + btAlignedObjectArray nNativeHost; + btAlignedObjectArray offsetsNativeHost; + + { + BT_PROFILE("countsNative/offsetsNative copyToHost"); + countsNative->copyToHost(nNativeHost); + offsetsNative->copyToHost(offsetsNativeHost); + } + + + int numNonzeroGrid=0; + + { + BT_PROFILE("batch grid"); + for(int i=0; im_queue); + + } + } + } + { + BT_PROFILE("m_contactBuffer->copyFromHost"); + m_data->m_solverGPU->m_contactBuffer->copyFromHost((btAlignedObjectArray&)cpuContacts); + } + // printf("maxNumBatches = %d\n", maxNumBatches); + } + } + + + + + if (nContacts) + { + //BT_PROFILE("gpu convertToConstraints"); + m_data->m_solverGPU->convertToConstraints( bodyBuf, + shapeBuf, m_data->m_solverGPU->m_contactBuffer /*contactNative*/, + contactConstraintOut, + additionalData, nContacts, + (SolverBase::ConstraintCfg&) csCfg ); + clFinish(m_data->m_queue); + } + + + + } + + + } + + + if (1) + { + BT_PROFILE("GPU solveContactConstraint"); + m_data->m_solverGPU->m_nIterations = 4;//10 + if (gpuSolveConstraint) + { + m_data->m_solverGPU->solveContactConstraint( + m_data->m_bodyBufferGPU, + m_data->m_inertiaBufferGPU, + m_data->m_contactCGPU,0, + nContactOut , + maxNumBatches); + } + else + { + //m_data->m_solverGPU->solveContactConstraintHost(m_data->m_bodyBufferGPU, m_data->m_inertiaBufferGPU, m_data->m_contactCGPU,0, nContactOut ,maxNumBatches); + } + + clFinish(m_data->m_queue); + } + + +#if 0 + if (0) + { + BT_PROFILE("read body velocities back to CPU"); + //read body updated linear/angular velocities back to CPU + m_data->m_bodyBufferGPU->read( + m_data->m_bodyBufferCPU->m_ptr,numOfConvexRBodies); + adl::DeviceUtils::waitForCompletion( m_data->m_deviceCL ); + } +#endif + + } + +} + + +void btGpuBatchingPgsSolver::batchContacts( btOpenCLArray* contacts, int nContacts, btOpenCLArray* n, btOpenCLArray* offsets, int staticIdx ) +{ +} + + + +static bool sortfnc(const btSortData& a,const btSortData& b) +{ + return (a.m_key idxBuffer; +btAlignedObjectArray sortData; +btAlignedObjectArray old; + + +inline int btGpuBatchingPgsSolver::sortConstraintByBatch( btContact4* cs, int n, int simdWidth , int staticIdx) +{ + int numIter = 0; + + sortData.resize(n); + idxBuffer.resize(n); + old.resize(n); + + unsigned int* idxSrc = &idxBuffer[0]; + unsigned int* idxDst = &idxBuffer[0]; + int nIdxSrc, nIdxDst; + + const int N_FLG = 256; + const int FLG_MASK = N_FLG-1; + unsigned int flg[N_FLG/32]; +#if defined(_DEBUG) + for(int i=0; i=0)&&bodyAS!=staticIdx? aUnavailable:0;// + bUnavailable = (bodyBS>=0)&&bodyBS!=staticIdx? bUnavailable:0; + + if( aUnavailable==0 && bUnavailable==0 ) // ok + { + flg[ aIdx/32 ] |= (1<<(aIdx&31)); + flg[ bIdx/32 ] |= (1<<(bIdx&31)); + cs[idx].getBatchIdx() = batchIdx; + sortData[idx].m_key = batchIdx; + sortData[idx].m_value = idx; + + { + nCurrentBatch++; + if( nCurrentBatch == simdWidth ) + { + nCurrentBatch = 0; + for(int i=0; i* contacts, int nContacts, btOpenCLArray* n, btOpenCLArray* offsets, int staticIdx ); + inline int sortConstraintByBatch( btContact4* cs, int n, int simdWidth , int staticIdx); + void solveContactConstraint( const btOpenCLArray* bodyBuf, const btOpenCLArray* shapeBuf, + btOpenCLArray* constraint, void* additionalData, int n ,int maxNumBatches, int numIterations); + +public: + + btGpuBatchingPgsSolver(cl_context ctx,cl_device_id device, cl_command_queue q,int pairCapacity); + virtual ~btGpuBatchingPgsSolver(); + + void solveContacts(int numBodies, cl_mem bodyBuf, cl_mem inertiaBuf, int numContacts, cl_mem contactBuf, const struct btConfig& config); + +}; + +#endif //BT_GPU_BATCHING_PGS_SOLVER_H + diff --git a/opencl/gpu_rigidbody/host/btGpuConstraint4.h b/opencl/gpu_rigidbody/host/btGpuConstraint4.h new file mode 100644 index 000000000..c00f0b23b --- /dev/null +++ b/opencl/gpu_rigidbody/host/btGpuConstraint4.h @@ -0,0 +1,31 @@ + +#ifndef BT_CONSTRAINT4_h +#define BT_CONSTRAINT4_h +#include "BulletCommon/btVector3.h" + +ATTRIBUTE_ALIGNED16(struct) btGpuConstraint4 +{ + BT_DECLARE_ALIGNED_ALLOCATOR(); + + btVector3 m_linear;//normal? + btVector3 m_worldPos[4]; + btVector3 m_center; // friction + float m_jacCoeffInv[4]; + float m_b[4]; + float m_appliedRambdaDt[4]; + + float m_fJacCoeffInv[2]; // friction + float m_fAppliedRambdaDt[2]; // friction + + unsigned int m_bodyA; + unsigned int m_bodyB; + + unsigned int m_batchIdx; + unsigned int m_paddings[1]; + + inline void setFrictionCoeff(float value) { m_linear[3] = value; } + inline float getFrictionCoeff() const { return m_linear[3]; } +}; + +#endif //BT_CONSTRAINT4_h + diff --git a/opencl/gpu_rigidbody/host/btGpuRigidBodyPipeline.cpp b/opencl/gpu_rigidbody/host/btGpuRigidBodyPipeline.cpp index a627d4867..f23b0574a 100644 --- a/opencl/gpu_rigidbody/host/btGpuRigidBodyPipeline.cpp +++ b/opencl/gpu_rigidbody/host/btGpuRigidBodyPipeline.cpp @@ -12,7 +12,10 @@ #include "btPgsJacobiSolver.h" #include "../../gpu_sat/host/btRigidBodyCL.h" #include "../../gpu_sat/host/btContact4.h" +#include "btGpuBatchingPgsSolver.h" +#include "../Stubs/Solver.h" +#include "btConfig.h" btGpuRigidBodyPipeline::btGpuRigidBodyPipeline(cl_context ctx,cl_device_id device, cl_command_queue q,class btGpuNarrowPhase* narrowphase, class btGpuSapBroadphase* broadphaseSap ) { @@ -21,7 +24,9 @@ btGpuRigidBodyPipeline::btGpuRigidBodyPipeline(cl_context ctx,cl_device_id devic m_data->m_device = device; m_data->m_queue = q; m_data->m_solver = new btPgsJacobiSolver(); - + m_data->m_solver2 = new btGpuBatchingPgsSolver(ctx,device,q,256*1024); + + m_data->m_broadphaseSap = broadphaseSap; m_data->m_narrowphase = narrowphase; @@ -48,7 +53,9 @@ btGpuRigidBodyPipeline::btGpuRigidBodyPipeline(cl_context ctx,cl_device_id devic btGpuRigidBodyPipeline::~btGpuRigidBodyPipeline() { clReleaseKernel(m_data->m_integrateTransformsKernel); - + + delete m_data->m_solver; + delete m_data->m_solver2; delete m_data; } @@ -66,11 +73,13 @@ void btGpuRigidBodyPipeline::stepSimulation(float deltaTime) int numPairs = m_data->m_broadphaseSap->getNumOverlap(); int numContacts = 0; + int numBodies = m_data->m_narrowphase->getNumBodiesGpu(); + if (numPairs) { cl_mem pairs = m_data->m_broadphaseSap->getOverlappingPairBuffer(); cl_mem aabbs = m_data->m_broadphaseSap->getAabbBuffer(); - int numBodies = m_data->m_narrowphase->getNumBodiesGpu(); + m_data->m_narrowphase->computeContacts(pairs,numPairs,aabbs,numBodies); numContacts = m_data->m_narrowphase->getNumContactsGpu(); @@ -85,27 +94,41 @@ void btGpuRigidBodyPipeline::stepSimulation(float deltaTime) if (numContacts) { - // m_data->m_solver->solveGroup(bodies, inertias,numBodies,contacts,numContacts,0,0,infoGlobal); - btAlignedObjectArray hostBodies; btOpenCLArray gpuBodies(m_data->m_context,m_data->m_queue,0,true); gpuBodies.setFromOpenCLBuffer(m_data->m_narrowphase->getBodiesGpu(),m_data->m_narrowphase->getNumBodiesGpu()); - gpuBodies.copyToHost(hostBodies); - - btAlignedObjectArray hostInertias; btOpenCLArray gpuInertias(m_data->m_context,m_data->m_queue,0,true); gpuInertias.setFromOpenCLBuffer(m_data->m_narrowphase->getBodyInertiasGpu(),m_data->m_narrowphase->getNumBodiesGpu()); - gpuInertias.copyToHost(hostInertias); - - btAlignedObjectArray hostContacts; btOpenCLArray gpuContacts(m_data->m_context,m_data->m_queue,0,true); gpuContacts.setFromOpenCLBuffer(m_data->m_narrowphase->getContactsGpu(),m_data->m_narrowphase->getNumContactsGpu()); - gpuContacts.copyToHost(hostContacts); + bool useJacobi = false; + if (useJacobi) + { + btAlignedObjectArray hostBodies; + gpuBodies.copyToHost(hostBodies); + btAlignedObjectArray hostInertias; + gpuInertias.copyToHost(hostInertias); + btAlignedObjectArray hostContacts; + gpuContacts.copyToHost(hostContacts); { m_data->m_solver->solveContacts(m_data->m_narrowphase->getNumBodiesGpu(),&hostBodies[0],&hostInertias[0],numContacts,&hostContacts[0]); } - gpuBodies.copyFromHost(hostBodies); + } else + { + btConfig config; + m_data->m_solver2->solveContacts(numBodies, gpuBodies.getBufferCL(),gpuInertias.getBufferCL(),numContacts, gpuContacts.getBufferCL(),config); + + //m_data->m_solver4->solveContacts(m_data->m_narrowphase->getNumBodiesGpu(), gpuBodies.getBufferCL(), gpuInertias.getBufferCL(), numContacts, gpuContacts.getBufferCL()); + + + /*m_data->m_solver3->solveContactConstraintHost( + (btOpenCLArray*)&gpuBodies, + (btOpenCLArray*)&gpuInertias, + (btOpenCLArray*) &gpuContacts, + 0,numContacts,256); + */ + } } integrate(deltaTime); diff --git a/opencl/gpu_rigidbody/host/btGpuRigidBodyPipelineInternalData.h b/opencl/gpu_rigidbody/host/btGpuRigidBodyPipelineInternalData.h index 19ad9ee35..ee08b59c2 100644 --- a/opencl/gpu_rigidbody/host/btGpuRigidBodyPipelineInternalData.h +++ b/opencl/gpu_rigidbody/host/btGpuRigidBodyPipelineInternalData.h @@ -7,6 +7,7 @@ #include "../../parallel_primitives/host/btOpenCLArray.h" #include "../../gpu_sat/host/btCollidable.h" + struct btGpuRigidBodyPipelineInternalData { @@ -18,7 +19,8 @@ struct btGpuRigidBodyPipelineInternalData cl_kernel m_updateAabbsKernel; class btPgsJacobiSolver* m_solver; - + class btGpuBatchingPgsSolver* m_solver2; + class btGpuSapBroadphase* m_broadphaseSap; class btGpuNarrowPhase* m_narrowphase; diff --git a/opencl/gpu_rigidbody/host/btPgsJacobiSolver.cpp b/opencl/gpu_rigidbody/host/btPgsJacobiSolver.cpp index 161dfee12..3795c679b 100644 --- a/opencl/gpu_rigidbody/host/btPgsJacobiSolver.cpp +++ b/opencl/gpu_rigidbody/host/btPgsJacobiSolver.cpp @@ -33,7 +33,7 @@ subject to the following restrictions: //#include "../../dynamics/basic_demo/Stubs/AdlContact4.h" #include "../../gpu_sat/host/btContact4.h" -bool usePgs = false; +bool usePgs = true; int gNumSplitImpulseRecoveries2 = 0; #include "btRigidBody.h" diff --git a/opencl/gpu_rigidbody/kernels/batchingKernels.cl b/opencl/gpu_rigidbody/kernels/batchingKernels.cl new file mode 100644 index 000000000..988d8c932 --- /dev/null +++ b/opencl/gpu_rigidbody/kernels/batchingKernels.cl @@ -0,0 +1,341 @@ +/* +Copyright (c) 2012 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Takahiro Harada + + +#pragma OPENCL EXTENSION cl_amd_printf : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable +#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable + +#ifdef cl_ext_atomic_counters_32 +#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable +#else +#define counter32_t volatile __global int* +#endif + + +typedef unsigned int u32; +typedef unsigned short u16; +typedef unsigned char u8; + +#define GET_GROUP_IDX get_group_id(0) +#define GET_LOCAL_IDX get_local_id(0) +#define GET_GLOBAL_IDX get_global_id(0) +#define GET_GROUP_SIZE get_local_size(0) +#define GET_NUM_GROUPS get_num_groups(0) +#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE) +#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE) +#define AtomInc(x) atom_inc(&(x)) +#define AtomInc1(x, out) out = atom_inc(&(x)) +#define AppendInc(x, out) out = atomic_inc(x) +#define AtomAdd(x, value) atom_add(&(x), value) +#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value ) +#define AtomXhg(x, value) atom_xchg ( &(x), value ) + + +#define SELECT_UINT4( b, a, condition ) select( b,a,condition ) + +#define make_float4 (float4) +#define make_float2 (float2) +#define make_uint4 (uint4) +#define make_int4 (int4) +#define make_uint2 (uint2) +#define make_int2 (int2) + + +#define max2 max +#define min2 min + + +#define WG_SIZE 64 + + + +typedef struct +{ + float4 m_worldPos[4]; + float4 m_worldNormal; + u32 m_coeffs; + int m_batchIdx; + + int m_bodyA;//sign bit set for fixed objects + int m_bodyB; +}Contact4; + +typedef struct +{ + int m_n; + int m_start; + int m_staticIdx; + int m_paddings[1]; +} ConstBuffer; + +typedef struct +{ + int m_a; + int m_b; + u32 m_idx; +}Elem; + +#define STACK_SIZE (WG_SIZE*10) +//#define STACK_SIZE (WG_SIZE) +#define RING_SIZE 1024 +#define RING_SIZE_MASK (RING_SIZE-1) +#define CHECK_SIZE (WG_SIZE) + + +#define GET_RING_CAPACITY (RING_SIZE - ldsRingEnd) +#define RING_END ldsTmp + +u32 readBuf(__local u32* buff, int idx) +{ + idx = idx % (32*CHECK_SIZE); + int bitIdx = idx%32; + int bufIdx = idx/32; + return buff[bufIdx] & (1<> bitIdx)&1) == 0; +} + +// batching on the GPU +__kernel void CreateBatches( __global const Contact4* gConstraints, __global Contact4* gConstraintsOut, + __global const u32* gN, __global const u32* gStart, + int m_staticIdx ) +{ + __local u32 ldsStackIdx[STACK_SIZE]; + __local u32 ldsStackEnd; + __local Elem ldsRingElem[RING_SIZE]; + __local u32 ldsRingEnd; + __local u32 ldsTmp; + __local u32 ldsCheckBuffer[CHECK_SIZE]; + __local u32 ldsFixedBuffer[CHECK_SIZE]; + __local u32 ldsGEnd; + __local u32 ldsDstEnd; + + int wgIdx = GET_GROUP_IDX; + int lIdx = GET_LOCAL_IDX; + + const int m_n = gN[wgIdx]; + const int m_start = gStart[wgIdx]; + + if( lIdx == 0 ) + { + ldsRingEnd = 0; + ldsGEnd = 0; + ldsStackEnd = 0; + ldsDstEnd = m_start; + } + +// while(1) +//was 250 + for(int ie=0; ie<50; ie++) + { + ldsFixedBuffer[lIdx] = 0; + + for(int giter=0; giter<4; giter++) + { + int ringCap = GET_RING_CAPACITY; + + // 1. fill ring + if( ldsGEnd < m_n ) + { + while( ringCap > WG_SIZE ) + { + if( ldsGEnd >= m_n ) break; + if( lIdx < ringCap - WG_SIZE ) + { + int srcIdx; + AtomInc1( ldsGEnd, srcIdx ); + if( srcIdx < m_n ) + { + int dstIdx; + AtomInc1( ldsRingEnd, dstIdx ); + + int a = gConstraints[m_start+srcIdx].m_bodyA; + int b = gConstraints[m_start+srcIdx].m_bodyB; + ldsRingElem[dstIdx].m_a = (a>b)? b:a; + ldsRingElem[dstIdx].m_b = (a>b)? a:b; + ldsRingElem[dstIdx].m_idx = srcIdx; + } + } + ringCap = GET_RING_CAPACITY; + } + } + + GROUP_LDS_BARRIER; + + // 2. fill stack + __local Elem* dst = ldsRingElem; + if( lIdx == 0 ) RING_END = 0; + + int srcIdx=lIdx; + int end = ldsRingEnd; + + { + for(int ii=0; ii> bitIdx)&1) == 0;\n" +"}\n" +"\n" +"// batching on the GPU\n" +"__kernel void CreateBatches( __global const Contact4* gConstraints, __global Contact4* gConstraintsOut,\n" +" __global const u32* gN, __global const u32* gStart, \n" +" int m_staticIdx )\n" +"{\n" +" __local u32 ldsStackIdx[STACK_SIZE];\n" +" __local u32 ldsStackEnd;\n" +" __local Elem ldsRingElem[RING_SIZE];\n" +" __local u32 ldsRingEnd;\n" +" __local u32 ldsTmp;\n" +" __local u32 ldsCheckBuffer[CHECK_SIZE];\n" +" __local u32 ldsFixedBuffer[CHECK_SIZE];\n" +" __local u32 ldsGEnd;\n" +" __local u32 ldsDstEnd;\n" +"\n" +" int wgIdx = GET_GROUP_IDX;\n" +" int lIdx = GET_LOCAL_IDX;\n" +" \n" +" const int m_n = gN[wgIdx];\n" +" const int m_start = gStart[wgIdx];\n" +" \n" +" if( lIdx == 0 )\n" +" {\n" +" ldsRingEnd = 0;\n" +" ldsGEnd = 0;\n" +" ldsStackEnd = 0;\n" +" ldsDstEnd = m_start;\n" +" }\n" +" \n" +"// while(1)\n" +"//was 250\n" +" for(int ie=0; ie<50; ie++)\n" +" {\n" +" ldsFixedBuffer[lIdx] = 0;\n" +"\n" +" for(int giter=0; giter<4; giter++)\n" +" {\n" +" int ringCap = GET_RING_CAPACITY;\n" +" \n" +" // 1. fill ring\n" +" if( ldsGEnd < m_n )\n" +" {\n" +" while( ringCap > WG_SIZE )\n" +" {\n" +" if( ldsGEnd >= m_n ) break;\n" +" if( lIdx < ringCap - WG_SIZE )\n" +" {\n" +" int srcIdx;\n" +" AtomInc1( ldsGEnd, srcIdx );\n" +" if( srcIdx < m_n )\n" +" {\n" +" int dstIdx;\n" +" AtomInc1( ldsRingEnd, dstIdx );\n" +" \n" +" int a = gConstraints[m_start+srcIdx].m_bodyA;\n" +" int b = gConstraints[m_start+srcIdx].m_bodyB;\n" +" ldsRingElem[dstIdx].m_a = (a>b)? b:a;\n" +" ldsRingElem[dstIdx].m_b = (a>b)? a:b;\n" +" ldsRingElem[dstIdx].m_idx = srcIdx;\n" +" }\n" +" }\n" +" ringCap = GET_RING_CAPACITY;\n" +" }\n" +" }\n" +"\n" +" GROUP_LDS_BARRIER;\n" +" \n" +" // 2. fill stack\n" +" __local Elem* dst = ldsRingElem;\n" +" if( lIdx == 0 ) RING_END = 0;\n" +"\n" +" int srcIdx=lIdx;\n" +" int end = ldsRingEnd;\n" +"\n" +" {\n" +" for(int ii=0; iim_jacCoeffInv[ic] == 0.f ) continue; + + float4 angular0, angular1, linear; + float4 r0 = cs->m_worldPos[ic] - posA; + float4 r1 = cs->m_worldPos[ic] - posB; + setLinearAndAngular( -cs->m_linear, r0, r1, &linear, &angular0, &angular1 ); + + float rambdaDt = calcRelVel( cs->m_linear, -cs->m_linear, angular0, angular1, + *linVelA, *angVelA, *linVelB, *angVelB ) + cs->m_b[ic]; + rambdaDt *= cs->m_jacCoeffInv[ic]; + + { + float prevSum = cs->m_appliedRambdaDt[ic]; + float updated = prevSum; + updated += rambdaDt; + updated = max2( updated, minRambdaDt ); + updated = min2( updated, maxRambdaDt ); + rambdaDt = updated - prevSum; + cs->m_appliedRambdaDt[ic] = updated; + } + + float4 linImp0 = invMassA*linear*rambdaDt; + float4 linImp1 = invMassB*(-linear)*rambdaDt; + float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt; + float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt; + + *linVelA += linImp0; + *angVelA += angImp0; + *linVelB += linImp1; + *angVelB += angImp1; + } +} + +void btPlaneSpace1 (const float4* n, float4* p, float4* q); + void btPlaneSpace1 (const float4* n, float4* p, float4* q) +{ + if (fabs(n[0].z) > 0.70710678f) { + // choose p in y-z plane + float a = n[0].y*n[0].y + n[0].z*n[0].z; + float k = 1.f/sqrt(a); + p[0].x = 0; + p[0].y = -n[0].z*k; + p[0].z = n[0].y*k; + // set q = n x p + q[0].x = a*k; + q[0].y = -n[0].x*p[0].z; + q[0].z = n[0].x*p[0].y; + } + else { + // choose p in x-y plane + float a = n[0].x*n[0].x + n[0].y*n[0].y; + float k = 1.f/sqrt(a); + p[0].x = -n[0].y*k; + p[0].y = n[0].x*k; + p[0].z = 0; + // set q = n x p + q[0].x = -n[0].z*p[0].y; + q[0].y = n[0].z*p[0].x; + q[0].z = a*k; + } +} + +void solveContactConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs); +void solveContactConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs) +{ + //float frictionCoeff = ldsCs[0].m_linear.w; + int aIdx = ldsCs[0].m_bodyA; + int bIdx = ldsCs[0].m_bodyB; + + float4 posA = gBodies[aIdx].m_pos; + float4 linVelA = gBodies[aIdx].m_linVel; + float4 angVelA = gBodies[aIdx].m_angVel; + float invMassA = gBodies[aIdx].m_invMass; + Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia; + + float4 posB = gBodies[bIdx].m_pos; + float4 linVelB = gBodies[bIdx].m_linVel; + float4 angVelB = gBodies[bIdx].m_angVel; + float invMassB = gBodies[bIdx].m_invMass; + Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia; + + solveContact( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA, + posB, &linVelB, &angVelB, invMassB, invInertiaB ); + + if (gBodies[aIdx].m_invMass) + { + gBodies[aIdx].m_linVel = linVelA; + gBodies[aIdx].m_angVel = angVelA; + } else + { + gBodies[aIdx].m_linVel = mymake_float4(0,0,0,0); + gBodies[aIdx].m_angVel = mymake_float4(0,0,0,0); + + } + if (gBodies[bIdx].m_invMass) + { + gBodies[bIdx].m_linVel = linVelB; + gBodies[bIdx].m_angVel = angVelB; + } else + { + gBodies[bIdx].m_linVel = mymake_float4(0,0,0,0); + gBodies[bIdx].m_angVel = mymake_float4(0,0,0,0); + + } + +} + + + +typedef struct +{ + int m_valInt0; + int m_valInt1; + int m_valInt2; + int m_valInt3; + + float m_val0; + float m_val1; + float m_val2; + float m_val3; +} SolverDebugInfo; + + + + +__kernel +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +void BatchSolveKernelContact(__global Body* gBodies, + __global Shape* gShapes, + __global Constraint4* gConstraints, + __global int* gN, + __global int* gOffsets, + int maxBatch, + int bIdx, + int nSplit + ) +{ + //__local int ldsBatchIdx[WG_SIZE+1]; + __local int ldsCurBatch; + __local int ldsNextBatch; + __local int ldsStart; + + int lIdx = GET_LOCAL_IDX; + int wgIdx = GET_GROUP_IDX; + +// int gIdx = GET_GLOBAL_IDX; +// debugInfo[gIdx].m_valInt0 = gIdx; + //debugInfo[gIdx].m_valInt1 = GET_GROUP_SIZE; + + + int xIdx = (wgIdx/(nSplit/2))*2 + (bIdx&1); + int yIdx = (wgIdx%(nSplit/2))*2 + (bIdx>>1); + int cellIdx = xIdx+yIdx*nSplit; + + if( gN[cellIdx] == 0 ) + return; + + const int start = gOffsets[cellIdx]; + const int end = start + gN[cellIdx]; + + + if( lIdx == 0 ) + { + ldsCurBatch = 0; + ldsNextBatch = 0; + ldsStart = start; + } + + + GROUP_LDS_BARRIER; + + int idx=ldsStart+lIdx; + while (ldsCurBatch < maxBatch) + { + for(; idxm_jacCoeffInv[ic] == 0.f ) continue;\n" +"\n" +" float4 angular0, angular1, linear;\n" +" float4 r0 = cs->m_worldPos[ic] - posA;\n" +" float4 r1 = cs->m_worldPos[ic] - posB;\n" +" setLinearAndAngular( -cs->m_linear, r0, r1, &linear, &angular0, &angular1 );\n" +"\n" +" float rambdaDt = calcRelVel( cs->m_linear, -cs->m_linear, angular0, angular1, \n" +" *linVelA, *angVelA, *linVelB, *angVelB ) + cs->m_b[ic];\n" +" rambdaDt *= cs->m_jacCoeffInv[ic];\n" +"\n" +" {\n" +" float prevSum = cs->m_appliedRambdaDt[ic];\n" +" float updated = prevSum;\n" +" updated += rambdaDt;\n" +" updated = max2( updated, minRambdaDt );\n" +" updated = min2( updated, maxRambdaDt );\n" +" rambdaDt = updated - prevSum;\n" +" cs->m_appliedRambdaDt[ic] = updated;\n" +" }\n" +"\n" +" float4 linImp0 = invMassA*linear*rambdaDt;\n" +" float4 linImp1 = invMassB*(-linear)*rambdaDt;\n" +" float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt;\n" +" float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt;\n" +"\n" +" *linVelA += linImp0;\n" +" *angVelA += angImp0;\n" +" *linVelB += linImp1;\n" +" *angVelB += angImp1;\n" +" }\n" +"}\n" +"\n" +"void btPlaneSpace1 (const float4* n, float4* p, float4* q);\n" +" void btPlaneSpace1 (const float4* n, float4* p, float4* q)\n" +"{\n" +" if (fabs(n[0].z) > 0.70710678f) {\n" +" // choose p in y-z plane\n" +" float a = n[0].y*n[0].y + n[0].z*n[0].z;\n" +" float k = 1.f/sqrt(a);\n" +" p[0].x = 0;\n" +" p[0].y = -n[0].z*k;\n" +" p[0].z = n[0].y*k;\n" +" // set q = n x p\n" +" q[0].x = a*k;\n" +" q[0].y = -n[0].x*p[0].z;\n" +" q[0].z = n[0].x*p[0].y;\n" +" }\n" +" else {\n" +" // choose p in x-y plane\n" +" float a = n[0].x*n[0].x + n[0].y*n[0].y;\n" +" float k = 1.f/sqrt(a);\n" +" p[0].x = -n[0].y*k;\n" +" p[0].y = n[0].x*k;\n" +" p[0].z = 0;\n" +" // set q = n x p\n" +" q[0].x = -n[0].z*p[0].y;\n" +" q[0].y = n[0].z*p[0].x;\n" +" q[0].z = a*k;\n" +" }\n" +"}\n" +"\n" +"void solveContactConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs);\n" +"void solveContactConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs)\n" +"{\n" +" //float frictionCoeff = ldsCs[0].m_linear.w;\n" +" int aIdx = ldsCs[0].m_bodyA;\n" +" int bIdx = ldsCs[0].m_bodyB;\n" +"\n" +" float4 posA = gBodies[aIdx].m_pos;\n" +" float4 linVelA = gBodies[aIdx].m_linVel;\n" +" float4 angVelA = gBodies[aIdx].m_angVel;\n" +" float invMassA = gBodies[aIdx].m_invMass;\n" +" Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n" +"\n" +" float4 posB = gBodies[bIdx].m_pos;\n" +" float4 linVelB = gBodies[bIdx].m_linVel;\n" +" float4 angVelB = gBodies[bIdx].m_angVel;\n" +" float invMassB = gBodies[bIdx].m_invMass;\n" +" Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n" +"\n" +" solveContact( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA,\n" +" posB, &linVelB, &angVelB, invMassB, invInertiaB );\n" +"\n" +" if (gBodies[aIdx].m_invMass)\n" +" {\n" +" gBodies[aIdx].m_linVel = linVelA;\n" +" gBodies[aIdx].m_angVel = angVelA;\n" +" } else\n" +" {\n" +" gBodies[aIdx].m_linVel = mymake_float4(0,0,0,0);\n" +" gBodies[aIdx].m_angVel = mymake_float4(0,0,0,0);\n" +" \n" +" }\n" +" if (gBodies[bIdx].m_invMass)\n" +" {\n" +" gBodies[bIdx].m_linVel = linVelB;\n" +" gBodies[bIdx].m_angVel = angVelB;\n" +" } else\n" +" {\n" +" gBodies[bIdx].m_linVel = mymake_float4(0,0,0,0);\n" +" gBodies[bIdx].m_angVel = mymake_float4(0,0,0,0);\n" +" \n" +" }\n" +"\n" +"}\n" +"\n" +"\n" +"\n" +"typedef struct \n" +"{\n" +" int m_valInt0;\n" +" int m_valInt1;\n" +" int m_valInt2;\n" +" int m_valInt3;\n" +"\n" +" float m_val0;\n" +" float m_val1;\n" +" float m_val2;\n" +" float m_val3;\n" +"} SolverDebugInfo;\n" +"\n" +"\n" +"\n" +"\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"void BatchSolveKernelContact(__global Body* gBodies,\n" +" __global Shape* gShapes,\n" +" __global Constraint4* gConstraints,\n" +" __global int* gN,\n" +" __global int* gOffsets,\n" +" int maxBatch,\n" +" int bIdx,\n" +" int nSplit\n" +" )\n" +"{\n" +" //__local int ldsBatchIdx[WG_SIZE+1];\n" +" __local int ldsCurBatch;\n" +" __local int ldsNextBatch;\n" +" __local int ldsStart;\n" +"\n" +" int lIdx = GET_LOCAL_IDX;\n" +" int wgIdx = GET_GROUP_IDX;\n" +"\n" +"// int gIdx = GET_GLOBAL_IDX;\n" +"// debugInfo[gIdx].m_valInt0 = gIdx;\n" +" //debugInfo[gIdx].m_valInt1 = GET_GROUP_SIZE;\n" +"\n" +"\n" +" int xIdx = (wgIdx/(nSplit/2))*2 + (bIdx&1);\n" +" int yIdx = (wgIdx%(nSplit/2))*2 + (bIdx>>1);\n" +" int cellIdx = xIdx+yIdx*nSplit;\n" +" \n" +" if( gN[cellIdx] == 0 ) \n" +" return;\n" +"\n" +" const int start = gOffsets[cellIdx];\n" +" const int end = start + gN[cellIdx];\n" +"\n" +" \n" +" if( lIdx == 0 )\n" +" {\n" +" ldsCurBatch = 0;\n" +" ldsNextBatch = 0;\n" +" ldsStart = start;\n" +" }\n" +"\n" +"\n" +" GROUP_LDS_BARRIER;\n" +"\n" +" int idx=ldsStart+lIdx;\n" +" while (ldsCurBatch < maxBatch)\n" +" {\n" +" for(; idx 0.70710678f) { + // choose p in y-z plane + float a = n[0].y*n[0].y + n[0].z*n[0].z; + float k = 1.f/sqrt(a); + p[0].x = 0; + p[0].y = -n[0].z*k; + p[0].z = n[0].y*k; + // set q = n x p + q[0].x = a*k; + q[0].y = -n[0].x*p[0].z; + q[0].z = n[0].x*p[0].y; + } + else { + // choose p in x-y plane + float a = n[0].x*n[0].x + n[0].y*n[0].y; + float k = 1.f/sqrt(a); + p[0].x = -n[0].y*k; + p[0].y = n[0].x*k; + p[0].z = 0; + // set q = n x p + q[0].x = -n[0].z*p[0].y; + q[0].y = n[0].z*p[0].x; + q[0].z = a*k; + } +} + + +void solveFrictionConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs); +void solveFrictionConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs) +{ + float frictionCoeff = ldsCs[0].m_linear.w; + int aIdx = ldsCs[0].m_bodyA; + int bIdx = ldsCs[0].m_bodyB; + + + float4 posA = gBodies[aIdx].m_pos; + float4 linVelA = gBodies[aIdx].m_linVel; + float4 angVelA = gBodies[aIdx].m_angVel; + float invMassA = gBodies[aIdx].m_invMass; + Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia; + + float4 posB = gBodies[bIdx].m_pos; + float4 linVelB = gBodies[bIdx].m_linVel; + float4 angVelB = gBodies[bIdx].m_angVel; + float invMassB = gBodies[bIdx].m_invMass; + Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia; + + + { + float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; + float minRambdaDt[4] = {0.f,0.f,0.f,0.f}; + + float sum = 0; + for(int j=0; j<4; j++) + { + sum +=ldsCs[0].m_appliedRambdaDt[j]; + } + frictionCoeff = 0.7f; + for(int j=0; j<4; j++) + { + maxRambdaDt[j] = frictionCoeff*sum; + minRambdaDt[j] = -maxRambdaDt[j]; + } + + +// solveFriction( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA, +// posB, &linVelB, &angVelB, invMassB, invInertiaB, maxRambdaDt, minRambdaDt ); + + + { + + __global Constraint4* cs = ldsCs; + + if( cs->m_fJacCoeffInv[0] == 0 && cs->m_fJacCoeffInv[0] == 0 ) return; + const float4 center = cs->m_center; + + float4 n = -cs->m_linear; + + float4 tangent[2]; + btPlaneSpace1(&n,&tangent[0],&tangent[1]); + float4 angular0, angular1, linear; + float4 r0 = center - posA; + float4 r1 = center - posB; + for(int i=0; i<2; i++) + { + setLinearAndAngular( tangent[i], r0, r1, &linear, &angular0, &angular1 ); + float rambdaDt = calcRelVel(linear, -linear, angular0, angular1, + linVelA, angVelA, linVelB, angVelB ); + rambdaDt *= cs->m_fJacCoeffInv[i]; + + { + float prevSum = cs->m_fAppliedRambdaDt[i]; + float updated = prevSum; + updated += rambdaDt; + updated = max2( updated, minRambdaDt[i] ); + updated = min2( updated, maxRambdaDt[i] ); + rambdaDt = updated - prevSum; + cs->m_fAppliedRambdaDt[i] = updated; + } + + float4 linImp0 = invMassA*linear*rambdaDt; + float4 linImp1 = invMassB*(-linear)*rambdaDt; + float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt; + float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt; + + linVelA += linImp0; + angVelA += angImp0; + linVelB += linImp1; + angVelB += angImp1; + } + { // angular damping for point constraint + float4 ab = normalize3( posB - posA ); + float4 ac = normalize3( center - posA ); + if( dot3F4( ab, ac ) > 0.95f || (invMassA == 0.f || invMassB == 0.f)) + { + float angNA = dot3F4( n, angVelA ); + float angNB = dot3F4( n, angVelB ); + + angVelA -= (angNA*0.1f)*n; + angVelB -= (angNB*0.1f)*n; + } + } + } + + + + } + + if (gBodies[aIdx].m_invMass) + { + gBodies[aIdx].m_linVel = linVelA; + gBodies[aIdx].m_angVel = angVelA; + } else + { + gBodies[aIdx].m_linVel = mymake_float4(0,0,0,0); + gBodies[aIdx].m_angVel = mymake_float4(0,0,0,0); + } + if (gBodies[bIdx].m_invMass) + { + gBodies[bIdx].m_linVel = linVelB; + gBodies[bIdx].m_angVel = angVelB; + } else + { + gBodies[bIdx].m_linVel = mymake_float4(0,0,0,0); + gBodies[bIdx].m_angVel = mymake_float4(0,0,0,0); + } + + +} + +typedef struct +{ + int m_valInt0; + int m_valInt1; + int m_valInt2; + int m_valInt3; + + float m_val0; + float m_val1; + float m_val2; + float m_val3; +} SolverDebugInfo; + + + + +__kernel +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +void BatchSolveKernelFriction(__global Body* gBodies, + __global Shape* gShapes, + __global Constraint4* gConstraints, + __global int* gN, + __global int* gOffsets, + int maxBatch, + int bIdx, + int nSplit + ) +{ + //__local int ldsBatchIdx[WG_SIZE+1]; + __local int ldsCurBatch; + __local int ldsNextBatch; + __local int ldsStart; + + int lIdx = GET_LOCAL_IDX; + int wgIdx = GET_GROUP_IDX; + +// int gIdx = GET_GLOBAL_IDX; +// debugInfo[gIdx].m_valInt0 = gIdx; + //debugInfo[gIdx].m_valInt1 = GET_GROUP_SIZE; + + + int xIdx = (wgIdx/(nSplit/2))*2 + (bIdx&1); + int yIdx = (wgIdx%(nSplit/2))*2 + (bIdx>>1); + int cellIdx = xIdx+yIdx*nSplit; + + if( gN[cellIdx] == 0 ) + return; + + const int start = gOffsets[cellIdx]; + const int end = start + gN[cellIdx]; + + + if( lIdx == 0 ) + { + ldsCurBatch = 0; + ldsNextBatch = 0; + ldsStart = start; + } + + + GROUP_LDS_BARRIER; + + int idx=ldsStart+lIdx; + while (ldsCurBatch < maxBatch) + { + for(; idx 0.70710678f) {\n" +" // choose p in y-z plane\n" +" float a = n[0].y*n[0].y + n[0].z*n[0].z;\n" +" float k = 1.f/sqrt(a);\n" +" p[0].x = 0;\n" +" p[0].y = -n[0].z*k;\n" +" p[0].z = n[0].y*k;\n" +" // set q = n x p\n" +" q[0].x = a*k;\n" +" q[0].y = -n[0].x*p[0].z;\n" +" q[0].z = n[0].x*p[0].y;\n" +" }\n" +" else {\n" +" // choose p in x-y plane\n" +" float a = n[0].x*n[0].x + n[0].y*n[0].y;\n" +" float k = 1.f/sqrt(a);\n" +" p[0].x = -n[0].y*k;\n" +" p[0].y = n[0].x*k;\n" +" p[0].z = 0;\n" +" // set q = n x p\n" +" q[0].x = -n[0].z*p[0].y;\n" +" q[0].y = n[0].z*p[0].x;\n" +" q[0].z = a*k;\n" +" }\n" +"}\n" +"\n" +"\n" +"void solveFrictionConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs);\n" +"void solveFrictionConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs)\n" +"{\n" +" float frictionCoeff = ldsCs[0].m_linear.w;\n" +" int aIdx = ldsCs[0].m_bodyA;\n" +" int bIdx = ldsCs[0].m_bodyB;\n" +"\n" +"\n" +" float4 posA = gBodies[aIdx].m_pos;\n" +" float4 linVelA = gBodies[aIdx].m_linVel;\n" +" float4 angVelA = gBodies[aIdx].m_angVel;\n" +" float invMassA = gBodies[aIdx].m_invMass;\n" +" Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n" +"\n" +" float4 posB = gBodies[bIdx].m_pos;\n" +" float4 linVelB = gBodies[bIdx].m_linVel;\n" +" float4 angVelB = gBodies[bIdx].m_angVel;\n" +" float invMassB = gBodies[bIdx].m_invMass;\n" +" Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n" +" \n" +"\n" +" {\n" +" float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};\n" +" float minRambdaDt[4] = {0.f,0.f,0.f,0.f};\n" +"\n" +" float sum = 0;\n" +" for(int j=0; j<4; j++)\n" +" {\n" +" sum +=ldsCs[0].m_appliedRambdaDt[j];\n" +" }\n" +" frictionCoeff = 0.7f;\n" +" for(int j=0; j<4; j++)\n" +" {\n" +" maxRambdaDt[j] = frictionCoeff*sum;\n" +" minRambdaDt[j] = -maxRambdaDt[j];\n" +" }\n" +"\n" +" \n" +"// solveFriction( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA,\n" +"// posB, &linVelB, &angVelB, invMassB, invInertiaB, maxRambdaDt, minRambdaDt );\n" +" \n" +" \n" +" {\n" +" \n" +" __global Constraint4* cs = ldsCs;\n" +" \n" +" if( cs->m_fJacCoeffInv[0] == 0 && cs->m_fJacCoeffInv[0] == 0 ) return;\n" +" const float4 center = cs->m_center;\n" +" \n" +" float4 n = -cs->m_linear;\n" +" \n" +" float4 tangent[2];\n" +" btPlaneSpace1(&n,&tangent[0],&tangent[1]);\n" +" float4 angular0, angular1, linear;\n" +" float4 r0 = center - posA;\n" +" float4 r1 = center - posB;\n" +" for(int i=0; i<2; i++)\n" +" {\n" +" setLinearAndAngular( tangent[i], r0, r1, &linear, &angular0, &angular1 );\n" +" float rambdaDt = calcRelVel(linear, -linear, angular0, angular1,\n" +" linVelA, angVelA, linVelB, angVelB );\n" +" rambdaDt *= cs->m_fJacCoeffInv[i];\n" +" \n" +" {\n" +" float prevSum = cs->m_fAppliedRambdaDt[i];\n" +" float updated = prevSum;\n" +" updated += rambdaDt;\n" +" updated = max2( updated, minRambdaDt[i] );\n" +" updated = min2( updated, maxRambdaDt[i] );\n" +" rambdaDt = updated - prevSum;\n" +" cs->m_fAppliedRambdaDt[i] = updated;\n" +" }\n" +" \n" +" float4 linImp0 = invMassA*linear*rambdaDt;\n" +" float4 linImp1 = invMassB*(-linear)*rambdaDt;\n" +" float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt;\n" +" float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt;\n" +" \n" +" linVelA += linImp0;\n" +" angVelA += angImp0;\n" +" linVelB += linImp1;\n" +" angVelB += angImp1;\n" +" }\n" +" { // angular damping for point constraint\n" +" float4 ab = normalize3( posB - posA );\n" +" float4 ac = normalize3( center - posA );\n" +" if( dot3F4( ab, ac ) > 0.95f || (invMassA == 0.f || invMassB == 0.f))\n" +" {\n" +" float angNA = dot3F4( n, angVelA );\n" +" float angNB = dot3F4( n, angVelB );\n" +" \n" +" angVelA -= (angNA*0.1f)*n;\n" +" angVelB -= (angNB*0.1f)*n;\n" +" }\n" +" }\n" +" }\n" +"\n" +" \n" +" \n" +" }\n" +"\n" +" if (gBodies[aIdx].m_invMass)\n" +" {\n" +" gBodies[aIdx].m_linVel = linVelA;\n" +" gBodies[aIdx].m_angVel = angVelA;\n" +" } else\n" +" {\n" +" gBodies[aIdx].m_linVel = mymake_float4(0,0,0,0);\n" +" gBodies[aIdx].m_angVel = mymake_float4(0,0,0,0);\n" +" }\n" +" if (gBodies[bIdx].m_invMass)\n" +" {\n" +" gBodies[bIdx].m_linVel = linVelB;\n" +" gBodies[bIdx].m_angVel = angVelB;\n" +" } else\n" +" {\n" +" gBodies[bIdx].m_linVel = mymake_float4(0,0,0,0);\n" +" gBodies[bIdx].m_angVel = mymake_float4(0,0,0,0);\n" +" }\n" +" \n" +"\n" +"}\n" +"\n" +"typedef struct \n" +"{\n" +" int m_valInt0;\n" +" int m_valInt1;\n" +" int m_valInt2;\n" +" int m_valInt3;\n" +"\n" +" float m_val0;\n" +" float m_val1;\n" +" float m_val2;\n" +" float m_val3;\n" +"} SolverDebugInfo;\n" +"\n" +"\n" +"\n" +"\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"void BatchSolveKernelFriction(__global Body* gBodies,\n" +" __global Shape* gShapes,\n" +" __global Constraint4* gConstraints,\n" +" __global int* gN,\n" +" __global int* gOffsets,\n" +" int maxBatch,\n" +" int bIdx,\n" +" int nSplit\n" +" )\n" +"{\n" +" //__local int ldsBatchIdx[WG_SIZE+1];\n" +" __local int ldsCurBatch;\n" +" __local int ldsNextBatch;\n" +" __local int ldsStart;\n" +"\n" +" int lIdx = GET_LOCAL_IDX;\n" +" int wgIdx = GET_GROUP_IDX;\n" +"\n" +"// int gIdx = GET_GLOBAL_IDX;\n" +"// debugInfo[gIdx].m_valInt0 = gIdx;\n" +" //debugInfo[gIdx].m_valInt1 = GET_GROUP_SIZE;\n" +"\n" +"\n" +" int xIdx = (wgIdx/(nSplit/2))*2 + (bIdx&1);\n" +" int yIdx = (wgIdx%(nSplit/2))*2 + (bIdx>>1);\n" +" int cellIdx = xIdx+yIdx*nSplit;\n" +" \n" +" if( gN[cellIdx] == 0 ) \n" +" return;\n" +"\n" +" const int start = gOffsets[cellIdx];\n" +" const int end = start + gN[cellIdx];\n" +"\n" +" \n" +" if( lIdx == 0 )\n" +" {\n" +" ldsCurBatch = 0;\n" +" ldsNextBatch = 0;\n" +" ldsStart = start;\n" +" }\n" +"\n" +"\n" +" GROUP_LDS_BARRIER;\n" +"\n" +" int idx=ldsStart+lIdx;\n" +" while (ldsCurBatch < maxBatch)\n" +" {\n" +" for(; idx 0.70710678f) { + // choose p in y-z plane + float a = n.y*n.y + n.z*n.z; + float k = 1.f/sqrt(a); + p[0].x = 0; + p[0].y = -n.z*k; + p[0].z = n.y*k; + // set q = n x p + q[0].x = a*k; + q[0].y = -n.x*p[0].z; + q[0].z = n.x*p[0].y; + } + else { + // choose p in x-y plane + float a = n.x*n.x + n.y*n.y; + float k = 1.f/sqrt(a); + p[0].x = -n.y*k; + p[0].y = n.x*k; + p[0].z = 0; + // set q = n x p + q[0].x = -n.z*p[0].y; + q[0].y = n.z*p[0].x; + q[0].z = a*k; + } +} + + +void setConstraint4( const float4 posA, const float4 linVelA, const float4 angVelA, float invMassA, const Matrix3x3 invInertiaA, + const float4 posB, const float4 linVelB, const float4 angVelB, float invMassB, const Matrix3x3 invInertiaB, + __global Contact4* src, float dt, float positionDrift, float positionConstraintCoeff, + Constraint4* dstC ) +{ + dstC->m_bodyA = abs(src->m_bodyAPtrAndSignBit); + dstC->m_bodyB = abs(src->m_bodyBPtrAndSignBit); + + float dtInv = 1.f/dt; + for(int ic=0; ic<4; ic++) + { + dstC->m_appliedRambdaDt[ic] = 0.f; + } + dstC->m_fJacCoeffInv[0] = dstC->m_fJacCoeffInv[1] = 0.f; + + + dstC->m_linear = -src->m_worldNormal; + dstC->m_linear.w = 0.7f ;//src->getFrictionCoeff() ); + for(int ic=0; ic<4; ic++) + { + float4 r0 = src->m_worldPos[ic] - posA; + float4 r1 = src->m_worldPos[ic] - posB; + + if( ic >= src->m_worldNormal.w )//npoints + { + dstC->m_jacCoeffInv[ic] = 0.f; + continue; + } + + float relVelN; + { + float4 linear, angular0, angular1; + setLinearAndAngular(src->m_worldNormal, r0, r1, &linear, &angular0, &angular1); + + dstC->m_jacCoeffInv[ic] = calcJacCoeff(linear, -linear, angular0, angular1, + invMassA, &invInertiaA, invMassB, &invInertiaB ); + + relVelN = calcRelVel(linear, -linear, angular0, angular1, + linVelA, angVelA, linVelB, angVelB); + + float e = 0.f;//src->getRestituitionCoeff(); + if( relVelN*relVelN < 0.004f ) e = 0.f; + + dstC->m_b[ic] = e*relVelN; + //float penetration = src->m_worldPos[ic].w; + dstC->m_b[ic] += (src->m_worldPos[ic].w + positionDrift)*positionConstraintCoeff*dtInv; + dstC->m_appliedRambdaDt[ic] = 0.f; + } + } + + if( src->m_worldNormal.w > 0 )//npoints + { // prepare friction + float4 center = make_float4(0.f); + for(int i=0; im_worldNormal.w; i++) + center += src->m_worldPos[i]; + center /= (float)src->m_worldNormal.w; + + float4 tangent[2]; + btPlaneSpace1(src->m_worldNormal,&tangent[0],&tangent[1]); + + float4 r[2]; + r[0] = center - posA; + r[1] = center - posB; + + for(int i=0; i<2; i++) + { + float4 linear, angular0, angular1; + setLinearAndAngular(tangent[i], r[0], r[1], &linear, &angular0, &angular1); + + dstC->m_fJacCoeffInv[i] = calcJacCoeff(linear, -linear, angular0, angular1, + invMassA, &invInertiaA, invMassB, &invInertiaB ); + dstC->m_fAppliedRambdaDt[i] = 0.f; + } + dstC->m_center = center; + } + + for(int i=0; i<4; i++) + { + if( im_worldNormal.w ) + { + dstC->m_worldPos[i] = src->m_worldPos[i]; + } + else + { + dstC->m_worldPos[i] = make_float4(0.f); + } + } +} + +typedef struct +{ + int m_nContacts; + float m_dt; + float m_positionDrift; + float m_positionConstraintCoeff; +} ConstBufferCTC; + +__kernel +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +void ContactToConstraintKernel(__global Contact4* gContact, __global Body* gBodies, __global Shape* gShapes, __global Constraint4* gConstraintOut, +int nContacts, +float dt, +float positionDrift, +float positionConstraintCoeff +) +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < nContacts ) + { + int aIdx = abs(gContact[gIdx].m_bodyAPtrAndSignBit); + int bIdx = abs(gContact[gIdx].m_bodyBPtrAndSignBit); + + float4 posA = gBodies[aIdx].m_pos; + float4 linVelA = gBodies[aIdx].m_linVel; + float4 angVelA = gBodies[aIdx].m_angVel; + float invMassA = gBodies[aIdx].m_invMass; + Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia; + + float4 posB = gBodies[bIdx].m_pos; + float4 linVelB = gBodies[bIdx].m_linVel; + float4 angVelB = gBodies[bIdx].m_angVel; + float invMassB = gBodies[bIdx].m_invMass; + Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia; + + Constraint4 cs; + + setConstraint4( posA, linVelA, angVelA, invMassA, invInertiaA, posB, linVelB, angVelB, invMassB, invInertiaB, + &gContact[gIdx], dt, positionDrift, positionConstraintCoeff, + &cs ); + + cs.m_batchIdx = gContact[gIdx].m_batchIdx; + + gConstraintOut[gIdx] = cs; + } +} + + + + + diff --git a/opencl/gpu_rigidbody/kernels/solverSetup.h b/opencl/gpu_rigidbody/kernels/solverSetup.h new file mode 100644 index 000000000..40839a8c7 --- /dev/null +++ b/opencl/gpu_rigidbody/kernels/solverSetup.h @@ -0,0 +1,664 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* solverSetupCL= \ +"\n" +"/*\n" +"Copyright (c) 2012 Advanced Micro Devices, Inc. \n" +"\n" +"This software is provided 'as-is', without any express or implied warranty.\n" +"In no event will the authors be held liable for any damages arising from the use of this software.\n" +"Permission is granted to anyone to use this software for any purpose, \n" +"including commercial applications, and to alter it and redistribute it freely, \n" +"subject to the following restrictions:\n" +"\n" +"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" +"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" +"3. This notice may not be removed or altered from any source distribution.\n" +"*/\n" +"//Originally written by Takahiro Harada\n" +"\n" +"\n" +"#pragma OPENCL EXTENSION cl_amd_printf : enable\n" +"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n" +"\n" +"\n" +"#ifdef cl_ext_atomic_counters_32\n" +"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n" +"#else\n" +"#define counter32_t volatile global int*\n" +"#endif\n" +"\n" +"typedef unsigned int u32;\n" +"typedef unsigned short u16;\n" +"typedef unsigned char u8;\n" +"\n" +"#define GET_GROUP_IDX get_group_id(0)\n" +"#define GET_LOCAL_IDX get_local_id(0)\n" +"#define GET_GLOBAL_IDX get_global_id(0)\n" +"#define GET_GROUP_SIZE get_local_size(0)\n" +"#define GET_NUM_GROUPS get_num_groups(0)\n" +"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" +"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" +"#define AtomInc(x) atom_inc(&(x))\n" +"#define AtomInc1(x, out) out = atom_inc(&(x))\n" +"#define AppendInc(x, out) out = atomic_inc(x)\n" +"#define AtomAdd(x, value) atom_add(&(x), value)\n" +"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n" +"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n" +"\n" +"\n" +"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n" +"\n" +"#define make_float4 (float4)\n" +"#define make_float2 (float2)\n" +"#define make_uint4 (uint4)\n" +"#define make_int4 (int4)\n" +"#define make_uint2 (uint2)\n" +"#define make_int2 (int2)\n" +"\n" +"\n" +"#define max2 max\n" +"#define min2 min\n" +"\n" +"\n" +"///////////////////////////////////////\n" +"// Vector\n" +"///////////////////////////////////////\n" +"__inline\n" +"float fastDiv(float numerator, float denominator)\n" +"{\n" +" return native_divide(numerator, denominator); \n" +"// return numerator/denominator; \n" +"}\n" +"\n" +"__inline\n" +"float4 fastDiv4(float4 numerator, float4 denominator)\n" +"{\n" +" return native_divide(numerator, denominator); \n" +"}\n" +"\n" +"__inline\n" +"float fastSqrtf(float f2)\n" +"{\n" +" return native_sqrt(f2);\n" +"// return sqrt(f2);\n" +"}\n" +"\n" +"__inline\n" +"float fastRSqrt(float f2)\n" +"{\n" +" return native_rsqrt(f2);\n" +"}\n" +"\n" +"__inline\n" +"float fastLength4(float4 v)\n" +"{\n" +" return fast_length(v);\n" +"}\n" +"\n" +"__inline\n" +"float4 fastNormalize4(float4 v)\n" +"{\n" +" return fast_normalize(v);\n" +"}\n" +"\n" +"\n" +"__inline\n" +"float sqrtf(float a)\n" +"{\n" +"// return sqrt(a);\n" +" return native_sqrt(a);\n" +"}\n" +"\n" +"__inline\n" +"float4 cross3(float4 a, float4 b)\n" +"{\n" +" return cross(a,b);\n" +"}\n" +"\n" +"__inline\n" +"float dot3F4(float4 a, float4 b)\n" +"{\n" +" float4 a1 = make_float4(a.xyz,0.f);\n" +" float4 b1 = make_float4(b.xyz,0.f);\n" +" return dot(a1, b1);\n" +"}\n" +"\n" +"__inline\n" +"float length3(const float4 a)\n" +"{\n" +" return sqrtf(dot3F4(a,a));\n" +"}\n" +"\n" +"__inline\n" +"float dot4(const float4 a, const float4 b)\n" +"{\n" +" return dot( a, b );\n" +"}\n" +"\n" +"// for height\n" +"__inline\n" +"float dot3w1(const float4 point, const float4 eqn)\n" +"{\n" +" return dot3F4(point,eqn) + eqn.w;\n" +"}\n" +"\n" +"__inline\n" +"float4 normalize3(const float4 a)\n" +"{\n" +" float4 n = make_float4(a.x, a.y, a.z, 0.f);\n" +" return fastNormalize4( n );\n" +"// float length = sqrtf(dot3F4(a, a));\n" +"// return 1.f/length * a;\n" +"}\n" +"\n" +"__inline\n" +"float4 normalize4(const float4 a)\n" +"{\n" +" float length = sqrtf(dot4(a, a));\n" +" return 1.f/length * a;\n" +"}\n" +"\n" +"__inline\n" +"float4 createEquation(const float4 a, const float4 b, const float4 c)\n" +"{\n" +" float4 eqn;\n" +" float4 ab = b-a;\n" +" float4 ac = c-a;\n" +" eqn = normalize3( cross3(ab, ac) );\n" +" eqn.w = -dot3F4(eqn,a);\n" +" return eqn;\n" +"}\n" +"\n" +"///////////////////////////////////////\n" +"// Matrix3x3\n" +"///////////////////////////////////////\n" +"\n" +"typedef struct\n" +"{\n" +" float4 m_row[3];\n" +"}Matrix3x3;\n" +"\n" +"__inline\n" +"Matrix3x3 mtZero();\n" +"\n" +"__inline\n" +"Matrix3x3 mtIdentity();\n" +"\n" +"__inline\n" +"Matrix3x3 mtTranspose(Matrix3x3 m);\n" +"\n" +"__inline\n" +"Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b);\n" +"\n" +"__inline\n" +"float4 mtMul1(Matrix3x3 a, float4 b);\n" +"\n" +"__inline\n" +"float4 mtMul3(float4 a, Matrix3x3 b);\n" +"\n" +"__inline\n" +"Matrix3x3 mtZero()\n" +"{\n" +" Matrix3x3 m;\n" +" m.m_row[0] = (float4)(0.f);\n" +" m.m_row[1] = (float4)(0.f);\n" +" m.m_row[2] = (float4)(0.f);\n" +" return m;\n" +"}\n" +"\n" +"__inline\n" +"Matrix3x3 mtIdentity()\n" +"{\n" +" Matrix3x3 m;\n" +" m.m_row[0] = (float4)(1,0,0,0);\n" +" m.m_row[1] = (float4)(0,1,0,0);\n" +" m.m_row[2] = (float4)(0,0,1,0);\n" +" return m;\n" +"}\n" +"\n" +"__inline\n" +"Matrix3x3 mtTranspose(Matrix3x3 m)\n" +"{\n" +" Matrix3x3 out;\n" +" out.m_row[0] = (float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n" +" out.m_row[1] = (float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n" +" out.m_row[2] = (float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n" +" return out;\n" +"}\n" +"\n" +"__inline\n" +"Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b)\n" +"{\n" +" Matrix3x3 transB;\n" +" transB = mtTranspose( b );\n" +" Matrix3x3 ans;\n" +" // why this doesn't run when 0ing in the for{}\n" +" a.m_row[0].w = 0.f;\n" +" a.m_row[1].w = 0.f;\n" +" a.m_row[2].w = 0.f;\n" +" for(int i=0; i<3; i++)\n" +" {\n" +"// a.m_row[i].w = 0.f;\n" +" ans.m_row[i].x = dot3F4(a.m_row[i],transB.m_row[0]);\n" +" ans.m_row[i].y = dot3F4(a.m_row[i],transB.m_row[1]);\n" +" ans.m_row[i].z = dot3F4(a.m_row[i],transB.m_row[2]);\n" +" ans.m_row[i].w = 0.f;\n" +" }\n" +" return ans;\n" +"}\n" +"\n" +"__inline\n" +"float4 mtMul1(Matrix3x3 a, float4 b)\n" +"{\n" +" float4 ans;\n" +" ans.x = dot3F4( a.m_row[0], b );\n" +" ans.y = dot3F4( a.m_row[1], b );\n" +" ans.z = dot3F4( a.m_row[2], b );\n" +" ans.w = 0.f;\n" +" return ans;\n" +"}\n" +"\n" +"__inline\n" +"float4 mtMul3(float4 a, Matrix3x3 b)\n" +"{\n" +" float4 colx = make_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n" +" float4 coly = make_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n" +" float4 colz = make_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n" +"\n" +" float4 ans;\n" +" ans.x = dot3F4( a, colx );\n" +" ans.y = dot3F4( a, coly );\n" +" ans.z = dot3F4( a, colz );\n" +" return ans;\n" +"}\n" +"\n" +"///////////////////////////////////////\n" +"// Quaternion\n" +"///////////////////////////////////////\n" +"\n" +"typedef float4 Quaternion;\n" +"\n" +"__inline\n" +"Quaternion qtMul(Quaternion a, Quaternion b);\n" +"\n" +"__inline\n" +"Quaternion qtNormalize(Quaternion in);\n" +"\n" +"__inline\n" +"float4 qtRotate(Quaternion q, float4 vec);\n" +"\n" +"__inline\n" +"Quaternion qtInvert(Quaternion q);\n" +"\n" +"__inline\n" +"Matrix3x3 qtGetRotationMatrix(Quaternion q);\n" +"\n" +"\n" +"\n" +"__inline\n" +"Quaternion qtMul(Quaternion a, Quaternion b)\n" +"{\n" +" Quaternion ans;\n" +" ans = cross3( a, b );\n" +" ans += a.w*b+b.w*a;\n" +"// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" +" ans.w = a.w*b.w - dot3F4(a, b);\n" +" return ans;\n" +"}\n" +"\n" +"__inline\n" +"Quaternion qtNormalize(Quaternion in)\n" +"{\n" +" return fastNormalize4(in);\n" +"// in /= length( in );\n" +"// return in;\n" +"}\n" +"__inline\n" +"float4 qtRotate(Quaternion q, float4 vec)\n" +"{\n" +" Quaternion qInv = qtInvert( q );\n" +" float4 vcpy = vec;\n" +" vcpy.w = 0.f;\n" +" float4 out = qtMul(qtMul(q,vcpy),qInv);\n" +" return out;\n" +"}\n" +"\n" +"__inline\n" +"Quaternion qtInvert(Quaternion q)\n" +"{\n" +" return (Quaternion)(-q.xyz, q.w);\n" +"}\n" +"\n" +"__inline\n" +"float4 qtInvRotate(const Quaternion q, float4 vec)\n" +"{\n" +" return qtRotate( qtInvert( q ), vec );\n" +"}\n" +"\n" +"__inline\n" +"Matrix3x3 qtGetRotationMatrix(Quaternion quat)\n" +"{\n" +" float4 quat2 = (float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n" +" Matrix3x3 out;\n" +"\n" +" out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n" +" out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n" +" out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;\n" +" out.m_row[0].w = 0.f;\n" +"\n" +" out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;\n" +" out.m_row[1].y=1-2*quat2.x-2*quat2.z;\n" +" out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;\n" +" out.m_row[1].w = 0.f;\n" +"\n" +" out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;\n" +" out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;\n" +" out.m_row[2].z=1-2*quat2.x-2*quat2.y;\n" +" out.m_row[2].w = 0.f;\n" +"\n" +" return out;\n" +"}\n" +"\n" +"\n" +"\n" +"\n" +"#define WG_SIZE 64\n" +"\n" +"typedef struct\n" +"{\n" +" float4 m_pos;\n" +" Quaternion m_quat;\n" +" float4 m_linVel;\n" +" float4 m_angVel;\n" +"\n" +" u32 m_shapeIdx;\n" +" float m_invMass;\n" +" float m_restituitionCoeff;\n" +" float m_frictionCoeff;\n" +"} Body;\n" +"\n" +"typedef struct\n" +"{\n" +" Matrix3x3 m_invInertia;\n" +" Matrix3x3 m_initInvInertia;\n" +"} Shape;\n" +"\n" +"typedef struct\n" +"{\n" +" float4 m_linear;\n" +" float4 m_worldPos[4];\n" +" float4 m_center; \n" +" float m_jacCoeffInv[4];\n" +" float m_b[4];\n" +" float m_appliedRambdaDt[4];\n" +"\n" +" float m_fJacCoeffInv[2]; \n" +" float m_fAppliedRambdaDt[2]; \n" +"\n" +" u32 m_bodyA;\n" +" u32 m_bodyB;\n" +"\n" +" int m_batchIdx;\n" +" u32 m_paddings[1];\n" +"} Constraint4;\n" +"\n" +"typedef struct\n" +"{\n" +" float4 m_worldPos[4];\n" +" float4 m_worldNormal;\n" +" u32 m_coeffs;\n" +" int m_batchIdx;\n" +"\n" +" int m_bodyAPtrAndSignBit;\n" +" int m_bodyBPtrAndSignBit;\n" +"} Contact4;\n" +"\n" +"typedef struct\n" +"{\n" +" int m_nConstraints;\n" +" int m_start;\n" +" int m_batchIdx;\n" +" int m_nSplit;\n" +"// int m_paddings[1];\n" +"} ConstBuffer;\n" +"\n" +"typedef struct\n" +"{\n" +" int m_solveFriction;\n" +" int m_maxBatch; // long batch really kills the performance\n" +" int m_batchIdx;\n" +" int m_nSplit;\n" +"// int m_paddings[1];\n" +"} ConstBufferBatchSolve;\n" +"\n" +"\n" +"void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1)\n" +"{\n" +" *linear = -n;\n" +" *angular0 = -cross3(r0, n);\n" +" *angular1 = cross3(r1, n);\n" +"}\n" +"\n" +"\n" +"float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 )\n" +"{\n" +" return dot3F4(l0, linVel0) + dot3F4(a0, angVel0) + dot3F4(l1, linVel1) + dot3F4(a1, angVel1);\n" +"}\n" +"\n" +"\n" +"float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n" +" float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1)\n" +"{\n" +" // linear0,1 are normlized\n" +" float jmj0 = invMass0;//dot3F4(linear0, linear0)*invMass0;\n" +" float jmj1 = dot3F4(mtMul3(angular0,*invInertia0), angular0);\n" +" float jmj2 = invMass1;//dot3F4(linear1, linear1)*invMass1;\n" +" float jmj3 = dot3F4(mtMul3(angular1,*invInertia1), angular1);\n" +" return -1.f/(jmj0+jmj1+jmj2+jmj3);\n" +"}\n" +"\n" +"\n" +"\n" +" \n" +"\n" +"\n" +"typedef struct \n" +"{\n" +" int m_valInt0;\n" +" int m_valInt1;\n" +" int m_valInt2;\n" +" int m_valInt3;\n" +"\n" +" float m_val0;\n" +" float m_val1;\n" +" float m_val2;\n" +" float m_val3;\n" +"} SolverDebugInfo;\n" +"\n" +"\n" +"\n" +"typedef struct\n" +"{\n" +" int m_nContacts;\n" +" int m_staticIdx;\n" +" float m_scale;\n" +" int m_nSplit;\n" +"} ConstBufferSSD;\n" +"\n" +"\n" +"void btPlaneSpace1 (float4 n, float4* p, float4* q);\n" +" void btPlaneSpace1 (float4 n, float4* p, float4* q)\n" +"{\n" +" if (fabs(n.z) > 0.70710678f) {\n" +" // choose p in y-z plane\n" +" float a = n.y*n.y + n.z*n.z;\n" +" float k = 1.f/sqrt(a);\n" +" p[0].x = 0;\n" +" p[0].y = -n.z*k;\n" +" p[0].z = n.y*k;\n" +" // set q = n x p\n" +" q[0].x = a*k;\n" +" q[0].y = -n.x*p[0].z;\n" +" q[0].z = n.x*p[0].y;\n" +" }\n" +" else {\n" +" // choose p in x-y plane\n" +" float a = n.x*n.x + n.y*n.y;\n" +" float k = 1.f/sqrt(a);\n" +" p[0].x = -n.y*k;\n" +" p[0].y = n.x*k;\n" +" p[0].z = 0;\n" +" // set q = n x p\n" +" q[0].x = -n.z*p[0].y;\n" +" q[0].y = n.z*p[0].x;\n" +" q[0].z = a*k;\n" +" }\n" +"}\n" +"\n" +"\n" +"void setConstraint4( const float4 posA, const float4 linVelA, const float4 angVelA, float invMassA, const Matrix3x3 invInertiaA,\n" +" const float4 posB, const float4 linVelB, const float4 angVelB, float invMassB, const Matrix3x3 invInertiaB, \n" +" __global Contact4* src, float dt, float positionDrift, float positionConstraintCoeff,\n" +" Constraint4* dstC )\n" +"{\n" +" dstC->m_bodyA = abs(src->m_bodyAPtrAndSignBit);\n" +" dstC->m_bodyB = abs(src->m_bodyBPtrAndSignBit);\n" +"\n" +" float dtInv = 1.f/dt;\n" +" for(int ic=0; ic<4; ic++)\n" +" {\n" +" dstC->m_appliedRambdaDt[ic] = 0.f;\n" +" }\n" +" dstC->m_fJacCoeffInv[0] = dstC->m_fJacCoeffInv[1] = 0.f;\n" +"\n" +"\n" +" dstC->m_linear = -src->m_worldNormal;\n" +" dstC->m_linear.w = 0.7f ;//src->getFrictionCoeff() );\n" +" for(int ic=0; ic<4; ic++)\n" +" {\n" +" float4 r0 = src->m_worldPos[ic] - posA;\n" +" float4 r1 = src->m_worldPos[ic] - posB;\n" +"\n" +" if( ic >= src->m_worldNormal.w )//npoints\n" +" {\n" +" dstC->m_jacCoeffInv[ic] = 0.f;\n" +" continue;\n" +" }\n" +"\n" +" float relVelN;\n" +" {\n" +" float4 linear, angular0, angular1;\n" +" setLinearAndAngular(src->m_worldNormal, r0, r1, &linear, &angular0, &angular1);\n" +"\n" +" dstC->m_jacCoeffInv[ic] = calcJacCoeff(linear, -linear, angular0, angular1,\n" +" invMassA, &invInertiaA, invMassB, &invInertiaB );\n" +"\n" +" relVelN = calcRelVel(linear, -linear, angular0, angular1,\n" +" linVelA, angVelA, linVelB, angVelB);\n" +"\n" +" float e = 0.f;//src->getRestituitionCoeff();\n" +" if( relVelN*relVelN < 0.004f ) e = 0.f;\n" +"\n" +" dstC->m_b[ic] = e*relVelN;\n" +" //float penetration = src->m_worldPos[ic].w;\n" +" dstC->m_b[ic] += (src->m_worldPos[ic].w + positionDrift)*positionConstraintCoeff*dtInv;\n" +" dstC->m_appliedRambdaDt[ic] = 0.f;\n" +" }\n" +" }\n" +"\n" +" if( src->m_worldNormal.w > 0 )//npoints\n" +" { // prepare friction\n" +" float4 center = make_float4(0.f);\n" +" for(int i=0; im_worldNormal.w; i++) \n" +" center += src->m_worldPos[i];\n" +" center /= (float)src->m_worldNormal.w;\n" +"\n" +" float4 tangent[2];\n" +" btPlaneSpace1(src->m_worldNormal,&tangent[0],&tangent[1]);\n" +" \n" +" float4 r[2];\n" +" r[0] = center - posA;\n" +" r[1] = center - posB;\n" +"\n" +" for(int i=0; i<2; i++)\n" +" {\n" +" float4 linear, angular0, angular1;\n" +" setLinearAndAngular(tangent[i], r[0], r[1], &linear, &angular0, &angular1);\n" +"\n" +" dstC->m_fJacCoeffInv[i] = calcJacCoeff(linear, -linear, angular0, angular1,\n" +" invMassA, &invInertiaA, invMassB, &invInertiaB );\n" +" dstC->m_fAppliedRambdaDt[i] = 0.f;\n" +" }\n" +" dstC->m_center = center;\n" +" }\n" +"\n" +" for(int i=0; i<4; i++)\n" +" {\n" +" if( im_worldNormal.w )\n" +" {\n" +" dstC->m_worldPos[i] = src->m_worldPos[i];\n" +" }\n" +" else\n" +" {\n" +" dstC->m_worldPos[i] = make_float4(0.f);\n" +" }\n" +" }\n" +"}\n" +"\n" +"typedef struct\n" +"{\n" +" int m_nContacts;\n" +" float m_dt;\n" +" float m_positionDrift;\n" +" float m_positionConstraintCoeff;\n" +"} ConstBufferCTC;\n" +"\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"void ContactToConstraintKernel(__global Contact4* gContact, __global Body* gBodies, __global Shape* gShapes, __global Constraint4* gConstraintOut, \n" +"int nContacts,\n" +"float dt,\n" +"float positionDrift,\n" +"float positionConstraintCoeff\n" +")\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" \n" +" if( gIdx < nContacts )\n" +" {\n" +" int aIdx = abs(gContact[gIdx].m_bodyAPtrAndSignBit);\n" +" int bIdx = abs(gContact[gIdx].m_bodyBPtrAndSignBit);\n" +"\n" +" float4 posA = gBodies[aIdx].m_pos;\n" +" float4 linVelA = gBodies[aIdx].m_linVel;\n" +" float4 angVelA = gBodies[aIdx].m_angVel;\n" +" float invMassA = gBodies[aIdx].m_invMass;\n" +" Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n" +"\n" +" float4 posB = gBodies[bIdx].m_pos;\n" +" float4 linVelB = gBodies[bIdx].m_linVel;\n" +" float4 angVelB = gBodies[bIdx].m_angVel;\n" +" float invMassB = gBodies[bIdx].m_invMass;\n" +" Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n" +"\n" +" Constraint4 cs;\n" +"\n" +" setConstraint4( posA, linVelA, angVelA, invMassA, invInertiaA, posB, linVelB, angVelB, invMassB, invInertiaB,\n" +" &gContact[gIdx], dt, positionDrift, positionConstraintCoeff,\n" +" &cs );\n" +" \n" +" cs.m_batchIdx = gContact[gIdx].m_batchIdx;\n" +"\n" +" gConstraintOut[gIdx] = cs;\n" +" }\n" +"}\n" +"\n" +"\n" +"\n" +"\n" +"\n" +"\n" +; diff --git a/opencl/gpu_rigidbody/kernels/solverSetup2.cl b/opencl/gpu_rigidbody/kernels/solverSetup2.cl new file mode 100644 index 000000000..0ab67919d --- /dev/null +++ b/opencl/gpu_rigidbody/kernels/solverSetup2.cl @@ -0,0 +1,494 @@ +/* +Copyright (c) 2012 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Takahiro Harada + + +#pragma OPENCL EXTENSION cl_amd_printf : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable +#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable + + +#ifdef cl_ext_atomic_counters_32 +#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable +#else +#define counter32_t volatile global int* +#endif + +typedef unsigned int u32; +typedef unsigned short u16; +typedef unsigned char u8; + +#define GET_GROUP_IDX get_group_id(0) +#define GET_LOCAL_IDX get_local_id(0) +#define GET_GLOBAL_IDX get_global_id(0) +#define GET_GROUP_SIZE get_local_size(0) +#define GET_NUM_GROUPS get_num_groups(0) +#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE) +#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE) +#define AtomInc(x) atom_inc(&(x)) +#define AtomInc1(x, out) out = atom_inc(&(x)) +#define AppendInc(x, out) out = atomic_inc(x) +#define AtomAdd(x, value) atom_add(&(x), value) +#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value ) +#define AtomXhg(x, value) atom_xchg ( &(x), value ) + + +#define SELECT_UINT4( b, a, condition ) select( b,a,condition ) + +#define make_float4 (float4) +#define make_float2 (float2) +#define make_uint4 (uint4) +#define make_int4 (int4) +#define make_uint2 (uint2) +#define make_int2 (int2) + + +#define max2 max +#define min2 min + + +/////////////////////////////////////// +// Vector +/////////////////////////////////////// +__inline +float fastDiv(float numerator, float denominator) +{ + return native_divide(numerator, denominator); +// return numerator/denominator; +} + +__inline +float4 fastDiv4(float4 numerator, float4 denominator) +{ + return native_divide(numerator, denominator); +} + +__inline +float fastSqrtf(float f2) +{ + return native_sqrt(f2); +// return sqrt(f2); +} + +__inline +float fastRSqrt(float f2) +{ + return native_rsqrt(f2); +} + +__inline +float fastLength4(float4 v) +{ + return fast_length(v); +} + +__inline +float4 fastNormalize4(float4 v) +{ + return fast_normalize(v); +} + + +__inline +float sqrtf(float a) +{ +// return sqrt(a); + return native_sqrt(a); +} + +__inline +float4 cross3(float4 a, float4 b) +{ + return cross(a,b); +} + +__inline +float dot3F4(float4 a, float4 b) +{ + float4 a1 = make_float4(a.xyz,0.f); + float4 b1 = make_float4(b.xyz,0.f); + return dot(a1, b1); +} + +__inline +float length3(const float4 a) +{ + return sqrtf(dot3F4(a,a)); +} + +__inline +float dot4(const float4 a, const float4 b) +{ + return dot( a, b ); +} + +// for height +__inline +float dot3w1(const float4 point, const float4 eqn) +{ + return dot3F4(point,eqn) + eqn.w; +} + +__inline +float4 normalize3(const float4 a) +{ + float4 n = make_float4(a.x, a.y, a.z, 0.f); + return fastNormalize4( n ); +// float length = sqrtf(dot3F4(a, a)); +// return 1.f/length * a; +} + +__inline +float4 normalize4(const float4 a) +{ + float length = sqrtf(dot4(a, a)); + return 1.f/length * a; +} + +__inline +float4 createEquation(const float4 a, const float4 b, const float4 c) +{ + float4 eqn; + float4 ab = b-a; + float4 ac = c-a; + eqn = normalize3( cross3(ab, ac) ); + eqn.w = -dot3F4(eqn,a); + return eqn; +} + +/////////////////////////////////////// +// Matrix3x3 +/////////////////////////////////////// + +typedef struct +{ + float4 m_row[3]; +}Matrix3x3; + +__inline +Matrix3x3 mtZero(); + +__inline +Matrix3x3 mtIdentity(); + +__inline +Matrix3x3 mtTranspose(Matrix3x3 m); + +__inline +Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b); + +__inline +float4 mtMul1(Matrix3x3 a, float4 b); + +__inline +float4 mtMul3(float4 a, Matrix3x3 b); + +__inline +Matrix3x3 mtZero() +{ + Matrix3x3 m; + m.m_row[0] = (float4)(0.f); + m.m_row[1] = (float4)(0.f); + m.m_row[2] = (float4)(0.f); + return m; +} + +__inline +Matrix3x3 mtIdentity() +{ + Matrix3x3 m; + m.m_row[0] = (float4)(1,0,0,0); + m.m_row[1] = (float4)(0,1,0,0); + m.m_row[2] = (float4)(0,0,1,0); + return m; +} + +__inline +Matrix3x3 mtTranspose(Matrix3x3 m) +{ + Matrix3x3 out; + out.m_row[0] = (float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f); + out.m_row[1] = (float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f); + out.m_row[2] = (float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f); + return out; +} + +__inline +Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b) +{ + Matrix3x3 transB; + transB = mtTranspose( b ); + Matrix3x3 ans; + // why this doesn't run when 0ing in the for{} + a.m_row[0].w = 0.f; + a.m_row[1].w = 0.f; + a.m_row[2].w = 0.f; + for(int i=0; i<3; i++) + { +// a.m_row[i].w = 0.f; + ans.m_row[i].x = dot3F4(a.m_row[i],transB.m_row[0]); + ans.m_row[i].y = dot3F4(a.m_row[i],transB.m_row[1]); + ans.m_row[i].z = dot3F4(a.m_row[i],transB.m_row[2]); + ans.m_row[i].w = 0.f; + } + return ans; +} + +__inline +float4 mtMul1(Matrix3x3 a, float4 b) +{ + float4 ans; + ans.x = dot3F4( a.m_row[0], b ); + ans.y = dot3F4( a.m_row[1], b ); + ans.z = dot3F4( a.m_row[2], b ); + ans.w = 0.f; + return ans; +} + +__inline +float4 mtMul3(float4 a, Matrix3x3 b) +{ + float4 colx = make_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0); + float4 coly = make_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0); + float4 colz = make_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0); + + float4 ans; + ans.x = dot3F4( a, colx ); + ans.y = dot3F4( a, coly ); + ans.z = dot3F4( a, colz ); + return ans; +} + +/////////////////////////////////////// +// Quaternion +/////////////////////////////////////// + +typedef float4 Quaternion; + +__inline +Quaternion qtMul(Quaternion a, Quaternion b); + +__inline +Quaternion qtNormalize(Quaternion in); + +__inline +float4 qtRotate(Quaternion q, float4 vec); + +__inline +Quaternion qtInvert(Quaternion q); + + + + + +__inline +Quaternion qtMul(Quaternion a, Quaternion b) +{ + Quaternion ans; + ans = cross3( a, b ); + ans += a.w*b+b.w*a; +// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z); + ans.w = a.w*b.w - dot3F4(a, b); + return ans; +} + +__inline +Quaternion qtNormalize(Quaternion in) +{ + return fastNormalize4(in); +// in /= length( in ); +// return in; +} +__inline +float4 qtRotate(Quaternion q, float4 vec) +{ + Quaternion qInv = qtInvert( q ); + float4 vcpy = vec; + vcpy.w = 0.f; + float4 out = qtMul(qtMul(q,vcpy),qInv); + return out; +} + +__inline +Quaternion qtInvert(Quaternion q) +{ + return (Quaternion)(-q.xyz, q.w); +} + +__inline +float4 qtInvRotate(const Quaternion q, float4 vec) +{ + return qtRotate( qtInvert( q ), vec ); +} + + + + +#define WG_SIZE 64 + +typedef struct +{ + float4 m_pos; + Quaternion m_quat; + float4 m_linVel; + float4 m_angVel; + + u32 m_shapeIdx; + float m_invMass; + float m_restituitionCoeff; + float m_frictionCoeff; +} Body; + +typedef struct +{ + Matrix3x3 m_invInertia; + Matrix3x3 m_initInvInertia; +} Shape; + +typedef struct +{ + float4 m_linear; + float4 m_worldPos[4]; + float4 m_center; + float m_jacCoeffInv[4]; + float m_b[4]; + float m_appliedRambdaDt[4]; + + float m_fJacCoeffInv[2]; + float m_fAppliedRambdaDt[2]; + + u32 m_bodyA; + u32 m_bodyB; + + int m_batchIdx; + u32 m_paddings[1]; +} Constraint4; + +typedef struct +{ + float4 m_worldPos[4]; + float4 m_worldNormal; + u32 m_coeffs; + int m_batchIdx; + + int m_bodyAPtrAndSignBit; + int m_bodyBPtrAndSignBit; +} Contact4; + +typedef struct +{ + int m_nConstraints; + int m_start; + int m_batchIdx; + int m_nSplit; +// int m_paddings[1]; +} ConstBuffer; + +typedef struct +{ + int m_solveFriction; + int m_maxBatch; // long batch really kills the performance + int m_batchIdx; + int m_nSplit; +// int m_paddings[1]; +} ConstBufferBatchSolve; + + + + + +typedef struct +{ + int m_valInt0; + int m_valInt1; + int m_valInt2; + int m_valInt3; + + float m_val0; + float m_val1; + float m_val2; + float m_val3; +} SolverDebugInfo; + + + + +// others +__kernel +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +void ReorderContactKernel(__global Contact4* in, __global Contact4* out, __global int2* sortData, int4 cb ) +{ + int nContacts = cb.x; + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < nContacts ) + { + int srcIdx = sortData[gIdx].y; + out[gIdx] = in[srcIdx]; + } +} + +typedef struct +{ + int m_nContacts; + int m_staticIdx; + float m_scale; + int m_nSplit; +} ConstBufferSSD; + +__kernel +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +void SetSortDataKernel(__global Contact4* gContact, __global Body* gBodies, __global int2* gSortDataOut, +int nContacts, +float scale, +int N_SPLIT +) + +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < nContacts ) + { + int aIdx = abs(gContact[gIdx].m_bodyAPtrAndSignBit); + int bIdx = abs(gContact[gIdx].m_bodyBPtrAndSignBit); + + int idx = (gContact[gIdx].m_bodyAPtrAndSignBit<0)? bIdx: aIdx; + float4 p = gBodies[idx].m_pos; + int xIdx = (int)((p.x-((p.x<0.f)?1.f:0.f))*scale) & (N_SPLIT-1); + int zIdx = (int)((p.z-((p.z<0.f)?1.f:0.f))*scale) & (N_SPLIT-1); + + gSortDataOut[gIdx].x = (xIdx+zIdx*N_SPLIT); + gSortDataOut[gIdx].y = gIdx; + } + else + { + gSortDataOut[gIdx].x = 0xffffffff; + } +} + +__kernel +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +void CopyConstraintKernel(__global Contact4* gIn, __global Contact4* gOut, int4 cb ) +{ + int gIdx = GET_GLOBAL_IDX; + if( gIdx < cb.x ) + { + gOut[gIdx] = gIn[gIdx]; + } +} + + + diff --git a/opencl/gpu_rigidbody/kernels/solverSetup2.h b/opencl/gpu_rigidbody/kernels/solverSetup2.h new file mode 100644 index 000000000..1796a6c5b --- /dev/null +++ b/opencl/gpu_rigidbody/kernels/solverSetup2.h @@ -0,0 +1,498 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* solverSetup2CL= \ +"/*\n" +"Copyright (c) 2012 Advanced Micro Devices, Inc. \n" +"\n" +"This software is provided 'as-is', without any express or implied warranty.\n" +"In no event will the authors be held liable for any damages arising from the use of this software.\n" +"Permission is granted to anyone to use this software for any purpose, \n" +"including commercial applications, and to alter it and redistribute it freely, \n" +"subject to the following restrictions:\n" +"\n" +"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" +"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" +"3. This notice may not be removed or altered from any source distribution.\n" +"*/\n" +"//Originally written by Takahiro Harada\n" +"\n" +"\n" +"#pragma OPENCL EXTENSION cl_amd_printf : enable\n" +"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n" +"\n" +"\n" +"#ifdef cl_ext_atomic_counters_32\n" +"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n" +"#else\n" +"#define counter32_t volatile global int*\n" +"#endif\n" +"\n" +"typedef unsigned int u32;\n" +"typedef unsigned short u16;\n" +"typedef unsigned char u8;\n" +"\n" +"#define GET_GROUP_IDX get_group_id(0)\n" +"#define GET_LOCAL_IDX get_local_id(0)\n" +"#define GET_GLOBAL_IDX get_global_id(0)\n" +"#define GET_GROUP_SIZE get_local_size(0)\n" +"#define GET_NUM_GROUPS get_num_groups(0)\n" +"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" +"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" +"#define AtomInc(x) atom_inc(&(x))\n" +"#define AtomInc1(x, out) out = atom_inc(&(x))\n" +"#define AppendInc(x, out) out = atomic_inc(x)\n" +"#define AtomAdd(x, value) atom_add(&(x), value)\n" +"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n" +"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n" +"\n" +"\n" +"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n" +"\n" +"#define make_float4 (float4)\n" +"#define make_float2 (float2)\n" +"#define make_uint4 (uint4)\n" +"#define make_int4 (int4)\n" +"#define make_uint2 (uint2)\n" +"#define make_int2 (int2)\n" +"\n" +"\n" +"#define max2 max\n" +"#define min2 min\n" +"\n" +"\n" +"///////////////////////////////////////\n" +"// Vector\n" +"///////////////////////////////////////\n" +"__inline\n" +"float fastDiv(float numerator, float denominator)\n" +"{\n" +" return native_divide(numerator, denominator); \n" +"// return numerator/denominator; \n" +"}\n" +"\n" +"__inline\n" +"float4 fastDiv4(float4 numerator, float4 denominator)\n" +"{\n" +" return native_divide(numerator, denominator); \n" +"}\n" +"\n" +"__inline\n" +"float fastSqrtf(float f2)\n" +"{\n" +" return native_sqrt(f2);\n" +"// return sqrt(f2);\n" +"}\n" +"\n" +"__inline\n" +"float fastRSqrt(float f2)\n" +"{\n" +" return native_rsqrt(f2);\n" +"}\n" +"\n" +"__inline\n" +"float fastLength4(float4 v)\n" +"{\n" +" return fast_length(v);\n" +"}\n" +"\n" +"__inline\n" +"float4 fastNormalize4(float4 v)\n" +"{\n" +" return fast_normalize(v);\n" +"}\n" +"\n" +"\n" +"__inline\n" +"float sqrtf(float a)\n" +"{\n" +"// return sqrt(a);\n" +" return native_sqrt(a);\n" +"}\n" +"\n" +"__inline\n" +"float4 cross3(float4 a, float4 b)\n" +"{\n" +" return cross(a,b);\n" +"}\n" +"\n" +"__inline\n" +"float dot3F4(float4 a, float4 b)\n" +"{\n" +" float4 a1 = make_float4(a.xyz,0.f);\n" +" float4 b1 = make_float4(b.xyz,0.f);\n" +" return dot(a1, b1);\n" +"}\n" +"\n" +"__inline\n" +"float length3(const float4 a)\n" +"{\n" +" return sqrtf(dot3F4(a,a));\n" +"}\n" +"\n" +"__inline\n" +"float dot4(const float4 a, const float4 b)\n" +"{\n" +" return dot( a, b );\n" +"}\n" +"\n" +"// for height\n" +"__inline\n" +"float dot3w1(const float4 point, const float4 eqn)\n" +"{\n" +" return dot3F4(point,eqn) + eqn.w;\n" +"}\n" +"\n" +"__inline\n" +"float4 normalize3(const float4 a)\n" +"{\n" +" float4 n = make_float4(a.x, a.y, a.z, 0.f);\n" +" return fastNormalize4( n );\n" +"// float length = sqrtf(dot3F4(a, a));\n" +"// return 1.f/length * a;\n" +"}\n" +"\n" +"__inline\n" +"float4 normalize4(const float4 a)\n" +"{\n" +" float length = sqrtf(dot4(a, a));\n" +" return 1.f/length * a;\n" +"}\n" +"\n" +"__inline\n" +"float4 createEquation(const float4 a, const float4 b, const float4 c)\n" +"{\n" +" float4 eqn;\n" +" float4 ab = b-a;\n" +" float4 ac = c-a;\n" +" eqn = normalize3( cross3(ab, ac) );\n" +" eqn.w = -dot3F4(eqn,a);\n" +" return eqn;\n" +"}\n" +"\n" +"///////////////////////////////////////\n" +"// Matrix3x3\n" +"///////////////////////////////////////\n" +"\n" +"typedef struct\n" +"{\n" +" float4 m_row[3];\n" +"}Matrix3x3;\n" +"\n" +"__inline\n" +"Matrix3x3 mtZero();\n" +"\n" +"__inline\n" +"Matrix3x3 mtIdentity();\n" +"\n" +"__inline\n" +"Matrix3x3 mtTranspose(Matrix3x3 m);\n" +"\n" +"__inline\n" +"Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b);\n" +"\n" +"__inline\n" +"float4 mtMul1(Matrix3x3 a, float4 b);\n" +"\n" +"__inline\n" +"float4 mtMul3(float4 a, Matrix3x3 b);\n" +"\n" +"__inline\n" +"Matrix3x3 mtZero()\n" +"{\n" +" Matrix3x3 m;\n" +" m.m_row[0] = (float4)(0.f);\n" +" m.m_row[1] = (float4)(0.f);\n" +" m.m_row[2] = (float4)(0.f);\n" +" return m;\n" +"}\n" +"\n" +"__inline\n" +"Matrix3x3 mtIdentity()\n" +"{\n" +" Matrix3x3 m;\n" +" m.m_row[0] = (float4)(1,0,0,0);\n" +" m.m_row[1] = (float4)(0,1,0,0);\n" +" m.m_row[2] = (float4)(0,0,1,0);\n" +" return m;\n" +"}\n" +"\n" +"__inline\n" +"Matrix3x3 mtTranspose(Matrix3x3 m)\n" +"{\n" +" Matrix3x3 out;\n" +" out.m_row[0] = (float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n" +" out.m_row[1] = (float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n" +" out.m_row[2] = (float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n" +" return out;\n" +"}\n" +"\n" +"__inline\n" +"Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b)\n" +"{\n" +" Matrix3x3 transB;\n" +" transB = mtTranspose( b );\n" +" Matrix3x3 ans;\n" +" // why this doesn't run when 0ing in the for{}\n" +" a.m_row[0].w = 0.f;\n" +" a.m_row[1].w = 0.f;\n" +" a.m_row[2].w = 0.f;\n" +" for(int i=0; i<3; i++)\n" +" {\n" +"// a.m_row[i].w = 0.f;\n" +" ans.m_row[i].x = dot3F4(a.m_row[i],transB.m_row[0]);\n" +" ans.m_row[i].y = dot3F4(a.m_row[i],transB.m_row[1]);\n" +" ans.m_row[i].z = dot3F4(a.m_row[i],transB.m_row[2]);\n" +" ans.m_row[i].w = 0.f;\n" +" }\n" +" return ans;\n" +"}\n" +"\n" +"__inline\n" +"float4 mtMul1(Matrix3x3 a, float4 b)\n" +"{\n" +" float4 ans;\n" +" ans.x = dot3F4( a.m_row[0], b );\n" +" ans.y = dot3F4( a.m_row[1], b );\n" +" ans.z = dot3F4( a.m_row[2], b );\n" +" ans.w = 0.f;\n" +" return ans;\n" +"}\n" +"\n" +"__inline\n" +"float4 mtMul3(float4 a, Matrix3x3 b)\n" +"{\n" +" float4 colx = make_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n" +" float4 coly = make_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n" +" float4 colz = make_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n" +"\n" +" float4 ans;\n" +" ans.x = dot3F4( a, colx );\n" +" ans.y = dot3F4( a, coly );\n" +" ans.z = dot3F4( a, colz );\n" +" return ans;\n" +"}\n" +"\n" +"///////////////////////////////////////\n" +"// Quaternion\n" +"///////////////////////////////////////\n" +"\n" +"typedef float4 Quaternion;\n" +"\n" +"__inline\n" +"Quaternion qtMul(Quaternion a, Quaternion b);\n" +"\n" +"__inline\n" +"Quaternion qtNormalize(Quaternion in);\n" +"\n" +"__inline\n" +"float4 qtRotate(Quaternion q, float4 vec);\n" +"\n" +"__inline\n" +"Quaternion qtInvert(Quaternion q);\n" +"\n" +"\n" +"\n" +"\n" +"\n" +"__inline\n" +"Quaternion qtMul(Quaternion a, Quaternion b)\n" +"{\n" +" Quaternion ans;\n" +" ans = cross3( a, b );\n" +" ans += a.w*b+b.w*a;\n" +"// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" +" ans.w = a.w*b.w - dot3F4(a, b);\n" +" return ans;\n" +"}\n" +"\n" +"__inline\n" +"Quaternion qtNormalize(Quaternion in)\n" +"{\n" +" return fastNormalize4(in);\n" +"// in /= length( in );\n" +"// return in;\n" +"}\n" +"__inline\n" +"float4 qtRotate(Quaternion q, float4 vec)\n" +"{\n" +" Quaternion qInv = qtInvert( q );\n" +" float4 vcpy = vec;\n" +" vcpy.w = 0.f;\n" +" float4 out = qtMul(qtMul(q,vcpy),qInv);\n" +" return out;\n" +"}\n" +"\n" +"__inline\n" +"Quaternion qtInvert(Quaternion q)\n" +"{\n" +" return (Quaternion)(-q.xyz, q.w);\n" +"}\n" +"\n" +"__inline\n" +"float4 qtInvRotate(const Quaternion q, float4 vec)\n" +"{\n" +" return qtRotate( qtInvert( q ), vec );\n" +"}\n" +"\n" +"\n" +"\n" +"\n" +"#define WG_SIZE 64\n" +"\n" +"typedef struct\n" +"{\n" +" float4 m_pos;\n" +" Quaternion m_quat;\n" +" float4 m_linVel;\n" +" float4 m_angVel;\n" +"\n" +" u32 m_shapeIdx;\n" +" float m_invMass;\n" +" float m_restituitionCoeff;\n" +" float m_frictionCoeff;\n" +"} Body;\n" +"\n" +"typedef struct\n" +"{\n" +" Matrix3x3 m_invInertia;\n" +" Matrix3x3 m_initInvInertia;\n" +"} Shape;\n" +"\n" +"typedef struct\n" +"{\n" +" float4 m_linear;\n" +" float4 m_worldPos[4];\n" +" float4 m_center; \n" +" float m_jacCoeffInv[4];\n" +" float m_b[4];\n" +" float m_appliedRambdaDt[4];\n" +"\n" +" float m_fJacCoeffInv[2]; \n" +" float m_fAppliedRambdaDt[2]; \n" +"\n" +" u32 m_bodyA;\n" +" u32 m_bodyB;\n" +"\n" +" int m_batchIdx;\n" +" u32 m_paddings[1];\n" +"} Constraint4;\n" +"\n" +"typedef struct\n" +"{\n" +" float4 m_worldPos[4];\n" +" float4 m_worldNormal;\n" +" u32 m_coeffs;\n" +" int m_batchIdx;\n" +"\n" +" int m_bodyAPtrAndSignBit;\n" +" int m_bodyBPtrAndSignBit;\n" +"} Contact4;\n" +"\n" +"typedef struct\n" +"{\n" +" int m_nConstraints;\n" +" int m_start;\n" +" int m_batchIdx;\n" +" int m_nSplit;\n" +"// int m_paddings[1];\n" +"} ConstBuffer;\n" +"\n" +"typedef struct\n" +"{\n" +" int m_solveFriction;\n" +" int m_maxBatch; // long batch really kills the performance\n" +" int m_batchIdx;\n" +" int m_nSplit;\n" +"// int m_paddings[1];\n" +"} ConstBufferBatchSolve;\n" +"\n" +"\n" +" \n" +"\n" +"\n" +"typedef struct \n" +"{\n" +" int m_valInt0;\n" +" int m_valInt1;\n" +" int m_valInt2;\n" +" int m_valInt3;\n" +"\n" +" float m_val0;\n" +" float m_val1;\n" +" float m_val2;\n" +" float m_val3;\n" +"} SolverDebugInfo;\n" +"\n" +"\n" +"\n" +"\n" +"// others\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"void ReorderContactKernel(__global Contact4* in, __global Contact4* out, __global int2* sortData, int4 cb )\n" +"{\n" +" int nContacts = cb.x;\n" +" int gIdx = GET_GLOBAL_IDX;\n" +"\n" +" if( gIdx < nContacts )\n" +" {\n" +" int srcIdx = sortData[gIdx].y;\n" +" out[gIdx] = in[srcIdx];\n" +" }\n" +"}\n" +"\n" +"typedef struct\n" +"{\n" +" int m_nContacts;\n" +" int m_staticIdx;\n" +" float m_scale;\n" +" int m_nSplit;\n" +"} ConstBufferSSD;\n" +"\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"void SetSortDataKernel(__global Contact4* gContact, __global Body* gBodies, __global int2* gSortDataOut, \n" +"int nContacts,\n" +"float scale,\n" +"int N_SPLIT\n" +")\n" +"\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" \n" +" if( gIdx < nContacts )\n" +" {\n" +" int aIdx = abs(gContact[gIdx].m_bodyAPtrAndSignBit);\n" +" int bIdx = abs(gContact[gIdx].m_bodyBPtrAndSignBit);\n" +"\n" +" int idx = (gContact[gIdx].m_bodyAPtrAndSignBit<0)? bIdx: aIdx;\n" +" float4 p = gBodies[idx].m_pos;\n" +" int xIdx = (int)((p.x-((p.x<0.f)?1.f:0.f))*scale) & (N_SPLIT-1);\n" +" int zIdx = (int)((p.z-((p.z<0.f)?1.f:0.f))*scale) & (N_SPLIT-1);\n" +"\n" +" gSortDataOut[gIdx].x = (xIdx+zIdx*N_SPLIT);\n" +" gSortDataOut[gIdx].y = gIdx;\n" +" }\n" +" else\n" +" {\n" +" gSortDataOut[gIdx].x = 0xffffffff;\n" +" }\n" +"}\n" +"\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"void CopyConstraintKernel(__global Contact4* gIn, __global Contact4* gOut, int4 cb )\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" if( gIdx < cb.x )\n" +" {\n" +" gOut[gIdx] = gIn[gIdx];\n" +" }\n" +"}\n" +"\n" +"\n" +"\n" +"\n" +; diff --git a/opencl/gpu_sat/host/btCollidable.h b/opencl/gpu_sat/host/btCollidable.h index e9633ca42..86ad51efe 100644 --- a/opencl/gpu_sat/host/btCollidable.h +++ b/opencl/gpu_sat/host/btCollidable.h @@ -5,7 +5,7 @@ enum btShapeTypes { SHAPE_HEIGHT_FIELD=1, - SHAPE_CONVEX_HEIGHT_FIELD=2, + SHAPE_CONVEX_HULL=3, SHAPE_PLANE=4, SHAPE_CONCAVE_TRIMESH=5,