/* Copyright (c) 2012 Advanced Micro Devices, Inc. This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages arising from the use of this software. Permission is granted to anyone to use this software for any purpose, including commercial applications, and to alter it and redistribute it freely, subject to the following restrictions: 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. 3. This notice may not be removed or altered from any source distribution. */ /* Implementation of CLPhysicsDemo: OpenCL context setup, shape and body registration, and the per-frame stepSimulation that runs broadphase, solver and transform integration on the contents of the cube_vbo vertex buffer. NOTE(review): the line breaks of this copy appear to have been collapsed, so each double-slash comment below swallows the rest of its physical line, and several spans look truncated (a bare #include with no header name, missing template argument lists, parts of registerCollisionShape and stepSimulation) - restore from the upstream source before building. */ //Originally written by Erwin Coumans #include "OpenGLInclude.h" #include "CLPhysicsDemo.h" #include "LinearMath/btAlignedObjectArray.h" #include "DemoSettings.h" #include "../basic_initialize/btOpenCLUtils.h" #include "../opengl_interop/btOpenCLGLInteropBuffer.h" #include "../broadphase_benchmark/findPairsOpenCL.h" #include "LinearMath/btVector3.h" #include "LinearMath/btQuaternion.h" #include "LinearMath/btMatrix3x3.h" #include "../../opencl/gpu_rigidbody_pipeline/btGpuNarrowPhaseAndSolver.h" #include "../../opencl/gpu_rigidbody_pipeline/btConvexUtility.h" #include "../../dynamics/basic_demo/ConvexHeightFieldShape.h" #include "../broadphase_benchmark/btGridBroadphaseCl.h" #include "LinearMath/btQuickprof.h" #define MSTRINGIFY(A) #A static char* interopKernelString = #include "../broadphase_benchmark/integrateKernel.cl" #define INTEROPKERNEL_SRC_PATH "../../opencl/broadphase_benchmark/integrateKernel.cl" /* Kernel handle built in CLPhysicsDemo::init and launched from stepSimulation. */ cl_kernel g_integrateTransformsKernel; /* Gates the kernel work and the final clFinish in stepSimulation. */ bool runOpenCLKernels = true; btGpuNarrowphaseAndSolver* narrowphaseAndSolver = 0; ConvexHeightField* s_convexHeightField = 0 ; btOpenCLGLInteropBuffer* g_interopBuffer = 0; extern 
GLuint cube_vbo; extern int VBOsize; /* CL buffer that holds the VBO contents during a step: the interop buffer when interop is on, otherwise a buffer of VBOsize bytes created on first use in stepSimulation. */ cl_mem clBuffer=0; char* hostPtr=0; cl_bool blocking= CL_TRUE; btFindPairsIO gFpIO; cl_context g_cxMainContext; cl_command_queue g_cqCommandQue; cl_device_id g_device; /* Raw cl_mem handles of the velocity and body-time buffers; assigned in CLPhysicsDemo::init. */ cl_mem gLinVelMem=0; cl_mem gAngVelMem=0; cl_mem gBodyTimes=0; #include adl::DeviceCL* g_deviceCL=0; /* Host-side AABB corner: three floats plus an unsigned word; registerCollisionShape stores the shape index in uw. */ struct btAABBHost //keep this in sync with btAABBCL! { float fx; float fy; float fz; unsigned int uw; }; /* Private per-demo state: device buffers, the grid broadphase and host staging arrays of MAX_CONVEX_BODIES_CL entries. The constructor allocates the host arrays and the destructor frees them; the device buffers and broadphase are created in init and deleted in cleanup. NOTE(review): m_localShapeAABB is not in the constructor initializer list. */ struct InternalData { adl::Buffer* m_linVelBuf; adl::Buffer* m_angVelBuf; adl::Buffer* m_bodyTimes; bool m_useInterop; btGridBroadphaseCl* m_Broadphase; adl::Buffer* m_localShapeAABB; btVector3* m_linVelHost; btVector3* m_angVelHost; float* m_bodyTimesHost; InternalData():m_linVelBuf(0),m_angVelBuf(0),m_bodyTimes(0),m_useInterop(0),m_Broadphase(0) { m_linVelHost= new btVector3[MAX_CONVEX_BODIES_CL]; m_angVelHost = new btVector3[MAX_CONVEX_BODIES_CL]; m_bodyTimesHost = new float[MAX_CONVEX_BODIES_CL]; } ~InternalData() { delete[] m_linVelHost; delete[] m_angVelHost; delete[] m_bodyTimesHost; } }; /* Creates the global OpenCL context: with the current GL context and DC when useInterop is set, otherwise with the preferred device and platform indices. If the context has at least one device, device 0 is selected, its info is printed and a command queue is created for it. NOTE(review): the non-Windows branch redeclares glCtx, and wglGetCurrentDC is called after the _WIN32 guard has closed - confirm this builds off Windows. */ void InitCL(int preferredDeviceIndex, int preferredPlatformIndex, bool useInterop) { void* glCtx=0; void* glDC = 0; #ifdef _WIN32 glCtx = wglGetCurrentContext(); #else //!_WIN32 GLXContext glCtx = glXGetCurrentContext(); #endif //!_WIN32 glDC = wglGetCurrentDC(); int ciErrNum = 0; #ifdef CL_PLATFORM_INTEL cl_device_type deviceType = CL_DEVICE_TYPE_ALL; #else cl_device_type deviceType = CL_DEVICE_TYPE_GPU; #endif if (useInterop) { g_cxMainContext = btOpenCLUtils::createContextFromType(deviceType, &ciErrNum, glCtx, glDC); } else { g_cxMainContext = btOpenCLUtils::createContextFromType(deviceType, &ciErrNum, 0,0,preferredDeviceIndex, preferredPlatformIndex); } oclCHECKERROR(ciErrNum, CL_SUCCESS); int numDev = btOpenCLUtils::getNumDevices(g_cxMainContext); if (numDev>0) { g_device= btOpenCLUtils::getDevice(g_cxMainContext,0); btOpenCLDeviceInfo clInfo; btOpenCLUtils::getDeviceInfo(g_device,clInfo); btOpenCLUtils::printDeviceInfo(g_device); g_cqCommandQue = 
clCreateCommandQueue(g_cxMainContext, g_device, 0, &ciErrNum); oclCHECKERROR(ciErrNum, CL_SUCCESS); } } /* Zeroes the shape and instance counters and allocates the InternalData; the renderer argument is not used in this body. */ CLPhysicsDemo::CLPhysicsDemo(Win32OpenGLWindow* renderer) { m_numCollisionShapes=0; m_numPhysicsInstances=0; m_data = new InternalData; } /* Empty destructor: nothing is released here, the deletes live in cleanup. */ CLPhysicsDemo::~CLPhysicsDemo() { } /* Forwards to writeAllBodiesToGpu on the solver when the solver exists. */ void CLPhysicsDemo::writeBodiesToGpu() { if (narrowphaseAndSolver) narrowphaseAndSolver->writeAllBodiesToGpu(); } /* Registers a collision shape and, when the returned shape index is non-negative, writes its local AABB into m_localShapeAABB (min corner at slot shapeIndex*2, max corner at shapeIndex*2+1, shape index stored in uw) and waits for completion. Increments m_numCollisionShapes and returns the shape index. NOTE(review): the middle of this function, from the vertex loop up to the registerShape call, is truncated in this copy. */ int CLPhysicsDemo::registerCollisionShape(const float* vertices, int strideInBytes, int numVertices, const float* scaling) { btAlignedObjectArray verts; unsigned char* vts = (unsigned char*) vertices; for (int i=0;iregisterShape(s_convexHeightField); if (shapeIndex>=0) { btAABBHost aabbMin, aabbMax; aabbMin.fx = s_convexHeightField->m_aabb.m_min.x; aabbMin.fy = s_convexHeightField->m_aabb.m_min.y; aabbMin.fz= s_convexHeightField->m_aabb.m_min.z; aabbMin.uw = shapeIndex; aabbMax.fx = s_convexHeightField->m_aabb.m_max.x; aabbMax.fy = s_convexHeightField->m_aabb.m_max.y; aabbMax.fz= s_convexHeightField->m_aabb.m_max.z; aabbMax.uw = shapeIndex; m_data->m_localShapeAABB->write(&aabbMin,1,shapeIndex*2); m_data->m_localShapeAABB->write(&aabbMax,1,shapeIndex*2+1); adl::DeviceUtils::waitForCompletion( g_deviceCL ); } m_numCollisionShapes++; delete[] eqn; return shapeIndex; } /* Creates a broadphase proxy with a box of half-extent 1 around the position (only when collisionShapeIndex is non-negative) and registers the rigid body with the solver with writeToGpu set to false. Increments m_numPhysicsInstances and returns the body index, or -1 when there is no solver. NOTE(review): the proxy AABB is built from position[0] for all three components - likely meant position[0], position[1], position[2]; confirm. */ int CLPhysicsDemo::registerPhysicsInstance(float mass, const float* position, const float* orientation, int collisionShapeIndex, void* userPointer) { btVector3 aabbMin(position[0],position[0],position[0]); btVector3 aabbMax = aabbMin; aabbMin -= btVector3(1.f,1.f,1.f); aabbMax += btVector3(1.f,1.f,1.f); if (collisionShapeIndex>=0) { btBroadphaseProxy* proxy = m_data->m_Broadphase->createProxy(aabbMin,aabbMax,collisionShapeIndex,userPointer,1,1,0,0);//m_dispatcher); } bool writeToGpu = false; int bodyIndex = -1; if (narrowphaseAndSolver) bodyIndex = narrowphaseAndSolver->registerRigidBody(collisionShapeIndex,mass,position,orientation,writeToGpu); m_numPhysicsInstances++; return bodyIndex; } /* Initialises OpenCL through InitCL, wraps the context, device and queue in an adl::DeviceCL, allocates the device buffers, solver and grid broadphase, builds the integrateTransformsKernel and sets up the find-pairs state. NOTE(review): preferredDevice and preferredPlatform are not forwarded - InitCL is called with -1,-1. */ void CLPhysicsDemo::init(int 
preferredDevice, int preferredPlatform, bool useInterop) { InitCL(-1,-1,useInterop); #define CUSTOM_CL_INITIALIZATION #ifdef CUSTOM_CL_INITIALIZATION g_deviceCL = new adl::DeviceCL(); g_deviceCL->m_deviceIdx = g_device; g_deviceCL->m_context = g_cxMainContext; g_deviceCL->m_commandQueue = g_cqCommandQue; g_deviceCL->m_kernelManager = new adl::KernelManager; #else DeviceUtils::Config cfg; cfg.m_type = DeviceUtils::Config::DEVICE_CPU; g_deviceCL = DeviceUtils::allocate( TYPE_CL, cfg ); #endif //adl::Solver::allocate(g_deviceCL->allocate( m_data->m_linVelBuf = new adl::Buffer(g_deviceCL,MAX_CONVEX_BODIES_CL); m_data->m_angVelBuf = new adl::Buffer(g_deviceCL,MAX_CONVEX_BODIES_CL); m_data->m_bodyTimes = new adl::Buffer(g_deviceCL,MAX_CONVEX_BODIES_CL); m_data->m_localShapeAABB = new adl::Buffer(g_deviceCL,MAX_CONVEX_SHAPES_CL); gLinVelMem = (cl_mem)m_data->m_linVelBuf->m_ptr; gAngVelMem = (cl_mem)m_data->m_angVelBuf->m_ptr; gBodyTimes = (cl_mem)m_data->m_bodyTimes->m_ptr; narrowphaseAndSolver = new btGpuNarrowphaseAndSolver(g_deviceCL); int maxObjects = btMax(256,MAX_CONVEX_BODIES_CL); int maxPairsSmallProxy = 32; btOverlappingPairCache* overlappingPairCache=0; m_data->m_Broadphase = new btGridBroadphaseCl(overlappingPairCache,btVector3(4.f, 4.f, 4.f), 128, 128, 128,maxObjects, maxObjects, maxPairsSmallProxy, 100.f, 128, g_cxMainContext ,g_device,g_cqCommandQue, g_deviceCL); cl_program prog = btOpenCLUtils::compileCLProgramFromString(g_cxMainContext,g_device,interopKernelString,0,"",INTEROPKERNEL_SRC_PATH); g_integrateTransformsKernel = btOpenCLUtils::compileCLKernelFromString(g_cxMainContext, g_device,interopKernelString, "integrateTransformsKernel" ,0,prog); initFindPairs(gFpIO, g_cxMainContext, g_device, g_cqCommandQue, MAX_CONVEX_BODIES_CL); } /* Uploads the host linear velocity, angular velocity and body-time arrays (MAX_CONVEX_BODIES_CL entries each) to their device buffers and waits for completion. */ void CLPhysicsDemo::writeVelocitiesToGpu() { m_data->m_linVelBuf->write(m_data->m_linVelHost,MAX_CONVEX_BODIES_CL); m_data->m_angVelBuf->write(m_data->m_angVelHost,MAX_CONVEX_BODIES_CL); 
m_data->m_bodyTimes->write(m_data->m_bodyTimesHost,MAX_CONVEX_BODIES_CL); adl::DeviceUtils::waitForCompletion( g_deviceCL ); } /* Marks the demo as using GL interop, wraps cube_vbo in a new btOpenCLGLInteropBuffer and calls clFinish on the command queue. */ void CLPhysicsDemo::setupInterop() { m_data->m_useInterop = true; g_interopBuffer = new btOpenCLGLInteropBuffer(g_cxMainContext,g_cqCommandQue,cube_vbo); clFinish(g_cqCommandQue); } /* Deletes the solver, the device buffers, the broadphase, the InternalData, the kernel manager, the device wrapper, the interop buffer and the height field. NOTE(review): m_data and g_deviceCL are dereferenced without null checks, and only those two pointers are reset afterwards - this appears to assume init ran and cleanup is called once; confirm. */ void CLPhysicsDemo::cleanup() { delete narrowphaseAndSolver; delete m_data->m_linVelBuf; delete m_data->m_angVelBuf; delete m_data->m_bodyTimes; delete m_data->m_localShapeAABB; delete m_data->m_Broadphase; delete m_data; delete g_deviceCL->m_kernelManager; delete g_deviceCL; m_data=0; g_deviceCL=0; delete g_interopBuffer; delete s_convexHeightField; } /* Runs one simulation step. After glFinish it obtains the VBO contents as a CL buffer (acquire through interop, or glMapBuffer plus clEnqueueWriteBuffer into clBuffer), runs AABB setup, broadphase pair finding, contact solving, velocity copy and transform integration, then returns the data to GL (release through interop, or clEnqueueReadBuffer back into the mapped pointer plus glUnmapBuffer). NOTE(review): part of the overlap-count check is truncated in this copy, so the exact branch structure cannot be read from here. */ void CLPhysicsDemo::stepSimulation() { BT_PROFILE("simulationLoop"); { BT_PROFILE("glFinish"); glFinish(); } cl_int ciErrNum = CL_SUCCESS; if(m_data->m_useInterop) { clBuffer = g_interopBuffer->getCLBUffer(); BT_PROFILE("clEnqueueAcquireGLObjects"); ciErrNum = clEnqueueAcquireGLObjects(g_cqCommandQue, 1, &clBuffer, 0, 0, NULL); adl::DeviceUtils::waitForCompletion( g_deviceCL ); } else { glBindBuffer(GL_ARRAY_BUFFER, cube_vbo); glFlush(); BT_PROFILE("glMapBuffer and clEnqueueWriteBuffer"); blocking= CL_TRUE; hostPtr= (char*)glMapBuffer( GL_ARRAY_BUFFER,GL_READ_WRITE);//GL_WRITE_ONLY if (!clBuffer) { clBuffer = clCreateBuffer(g_cxMainContext, CL_MEM_READ_WRITE, VBOsize, 0, &ciErrNum); } adl::DeviceUtils::waitForCompletion( g_deviceCL ); oclCHECKERROR(ciErrNum, CL_SUCCESS); ciErrNum = clEnqueueWriteBuffer ( g_cqCommandQue, clBuffer, blocking, 0, VBOsize, hostPtr,0,0,0 ); adl::DeviceUtils::waitForCompletion( g_deviceCL ); } oclCHECKERROR(ciErrNum, CL_SUCCESS); if (runOpenCLKernels && m_numPhysicsInstances) { gFpIO.m_numObjects = m_numPhysicsInstances; gFpIO.m_positionOffset = SHAPE_VERTEX_BUFFER_SIZE/4; gFpIO.m_clObjectsBuffer = clBuffer; gFpIO.m_dAABB = m_data->m_Broadphase->m_dAABB; gFpIO.m_dlocalShapeAABB = (cl_mem)m_data->m_localShapeAABB->m_ptr; gFpIO.m_numOverlap = 0; { BT_PROFILE("setupGpuAabbs"); 
setupGpuAabbsFull(gFpIO,narrowphaseAndSolver->getBodiesGpu() ); } if (1) { BT_PROFILE("calculateOverlappingPairs"); m_data->m_Broadphase->calculateOverlappingPairs(0, m_numPhysicsInstances); gFpIO.m_dAllOverlappingPairs = m_data->m_Broadphase->m_dAllOverlappingPairs; gFpIO.m_numOverlap = m_data->m_Broadphase->m_numPrefixSum; } //printf("gFpIO.m_numOverlap = %d\n",gFpIO.m_numOverlap ); if (gFpIO.m_numOverlap>=0 && gFpIO.m_numOverlapgetBodiesGpu(), narrowphaseAndSolver->getBodyInertiasGpu()); } if (gFpIO.m_numOverlap) { BT_PROFILE("computeContactsAndSolver"); if (narrowphaseAndSolver) narrowphaseAndSolver->computeContactsAndSolver(gFpIO.m_dAllOverlappingPairs,gFpIO.m_numOverlap); } { BT_PROFILE("copyBodyVelocities"); if (narrowphaseAndSolver) copyBodyVelocities(gFpIO, gLinVelMem, gAngVelMem, narrowphaseAndSolver->getBodiesGpu(), narrowphaseAndSolver->getBodyInertiasGpu()); } } } else { printf("error, gFpIO.m_numOverlap = %d\n",gFpIO.m_numOverlap); btAssert(0); } { BT_PROFILE("integrateTransforms"); if (runOpenCLKernels) { int numObjects = m_numPhysicsInstances; int offset = SHAPE_VERTEX_BUFFER_SIZE/4; /* NOTE(review): each clSetKernelArg result overwrites the previous one and none is checked before the launch. */ ciErrNum = clSetKernelArg(g_integrateTransformsKernel, 0, sizeof(int), &offset); ciErrNum = clSetKernelArg(g_integrateTransformsKernel, 1, sizeof(int), &numObjects); ciErrNum = clSetKernelArg(g_integrateTransformsKernel, 2, sizeof(cl_mem), (void*)&clBuffer ); ciErrNum = clSetKernelArg(g_integrateTransformsKernel, 3, sizeof(cl_mem), (void*)&gLinVelMem); ciErrNum = clSetKernelArg(g_integrateTransformsKernel, 4, sizeof(cl_mem), (void*)&gAngVelMem); ciErrNum = clSetKernelArg(g_integrateTransformsKernel, 5, sizeof(cl_mem), (void*)&gBodyTimes); /* Global size is rounded up to a multiple of the work-group size. NOTE(review): the round-up adds workGroupSize rather than workGroupSize-1, so an exact multiple launches one extra work-group - presumably the kernel bounds-checks against numObjects; confirm. */ size_t workGroupSize = 64; size_t numWorkItems = workGroupSize*((m_numPhysicsInstances + (workGroupSize)) / workGroupSize); if (workGroupSize>numWorkItems) workGroupSize=numWorkItems; ciErrNum = clEnqueueNDRangeKernel(g_cqCommandQue, g_integrateTransformsKernel, 1, NULL, &numWorkItems, &workGroupSize,0 ,0 ,0); oclCHECKERROR(ciErrNum, 
CL_SUCCESS); } } } if(m_data->m_useInterop) { BT_PROFILE("clEnqueueReleaseGLObjects"); ciErrNum = clEnqueueReleaseGLObjects(g_cqCommandQue, 1, &clBuffer, 0, 0, 0); adl::DeviceUtils::waitForCompletion( g_deviceCL ); } else { BT_PROFILE("clEnqueueReadBuffer clReleaseMemObject and glUnmapBuffer"); ciErrNum = clEnqueueReadBuffer ( g_cqCommandQue, clBuffer, blocking, 0, VBOsize, hostPtr,0,0,0); //clReleaseMemObject(clBuffer); adl::DeviceUtils::waitForCompletion( g_deviceCL ); glUnmapBuffer( GL_ARRAY_BUFFER); glFlush(); } oclCHECKERROR(ciErrNum, CL_SUCCESS); if (runOpenCLKernels) { BT_PROFILE("clFinish"); clFinish(g_cqCommandQue); } }