Added btCudaBroadphase, some early research & development work to accelerate Bullet using CUDA.
Re-uses the NVIDIA particle demo.
Extras/CUDA/particleSystem.cu (new file, +331 lines)
@@ -0,0 +1,331 @@
/*
 * Copyright 1993-2006 NVIDIA Corporation. All rights reserved.
 *
 * NOTICE TO USER:
 *
 * This source code is subject to NVIDIA ownership rights under U.S. and
 * international Copyright laws.
 *
 * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
 * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
 * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
 * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
 * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
 * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
 * OR PERFORMANCE OF THIS SOURCE CODE.
 *
 * U.S. Government End Users. This source code is a "commercial item" as
 * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
 * "commercial computer software" and "commercial computer software
 * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
 * and is provided to the U.S. Government only as a commercial end item.
 * Consistent with 48 C.F.R. 12.212 and 48 C.F.R. 227.7202-1 through
 * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
 * source code with only those rights set forth herein.
 */

//#include <cutil.h>
#include <cstdlib>
#include <cstdio>
#include <string.h>

#if defined(__APPLE__) || defined(MACOSX)
#include <GLUT/glut.h>
#else
#include <GL/glut.h>
#endif

#include <cuda_gl_interop.h>

#include "particles_kernel.cu"
#include "radixsort.cu"

//! Check for CUDA error
#define CUT_CHECK_ERROR(errorMessage) do {                                  \
    cudaError_t err = cudaGetLastError();                                   \
    if (cudaSuccess != err) {                                               \
        fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",   \
                errorMessage, __FILE__, __LINE__, cudaGetErrorString(err)); \
        exit(EXIT_FAILURE);                                                 \
    }                                                                       \
    err = cudaThreadSynchronize();                                          \
    if (cudaSuccess != err) {                                               \
        fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",   \
                errorMessage, __FILE__, __LINE__, cudaGetErrorString(err)); \
        exit(EXIT_FAILURE);                                                 \
    }                                                                       \
} while (0)
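
// Note: the check above runs twice by design. cudaGetLastError() immediately
// after a launch only reports launch/configuration failures; errors raised
// while the kernel executes become visible only after cudaThreadSynchronize().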

#define MY_CUDA_SAFE_CALL_NO_SYNC(call) do {                           \
    cudaError err = call;                                              \
    if (cudaSuccess != err) {                                          \
        fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",  \
                __FILE__, __LINE__, cudaGetErrorString(err));          \
        exit(EXIT_FAILURE);                                            \
    }                                                                  \
} while (0)

#define MY_CUDA_SAFE_CALL(call) do {                                   \
    MY_CUDA_SAFE_CALL_NO_SYNC(call);                                   \
    cudaError err = cudaThreadSynchronize();                           \
    if (cudaSuccess != err) {                                          \
        fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",  \
                __FILE__, __LINE__, cudaGetErrorString(err));          \
        exit(EXIT_FAILURE);                                            \
    }                                                                  \
} while (0)

extern "C"
{

void cudaInit(int argc, char **argv)
{
    //CUT_DEVICE_INIT(argc, argv);
}

void allocateArray(void **devPtr, size_t size)
{
    MY_CUDA_SAFE_CALL(cudaMalloc(devPtr, size));
}

void freeArray(void *devPtr)
{
    MY_CUDA_SAFE_CALL(cudaFree(devPtr));
}

void threadSync()
{
    MY_CUDA_SAFE_CALL(cudaThreadSynchronize());
}

void copyArrayFromDevice(void* host, const void* device, unsigned int vbo, int size)
{
    if (vbo)
        MY_CUDA_SAFE_CALL(cudaGLMapBufferObject((void**)&device, vbo));

    MY_CUDA_SAFE_CALL(cudaMemcpy(host, device, size, cudaMemcpyDeviceToHost));

    if (vbo)
        MY_CUDA_SAFE_CALL(cudaGLUnmapBufferObject(vbo));
}

void copyArrayToDevice(void* device, const void* host, int offset, int size)
{
    MY_CUDA_SAFE_CALL(cudaMemcpy((char *) device + offset, host, size, cudaMemcpyHostToDevice));
}

void registerGLBufferObject(uint vbo)
{
    MY_CUDA_SAFE_CALL(cudaGLRegisterBufferObject(vbo));
}

void unregisterGLBufferObject(uint vbo)
{
    MY_CUDA_SAFE_CALL(cudaGLUnregisterBufferObject(vbo));
}
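
// Note: cudaGLRegisterBufferObject() must have been called on a VBO before
// cudaGLMapBufferObject() can map it, so registerGLBufferObject() is expected
// to run once per buffer before any of the per-frame functions below.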

void setParameters(SimParams *hostParams)
{
    // copy parameters to constant memory
    MY_CUDA_SAFE_CALL(cudaMemcpyToSymbol(params, hostParams, sizeof(SimParams)));
}

// Round a / b to the nearest higher integer value
int iDivUp(int a, int b)
{
    return (a % b != 0) ? (a / b + 1) : (a / b);
}

// compute grid and thread block size for a given number of elements
void computeGridSize(int n, int blockSize, int &numBlocks, int &numThreads)
{
    numThreads = min(blockSize, n);
    numBlocks = iDivUp(n, numThreads);
}
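
// Example: n = 1000 particles with blockSize = 256 gives numThreads = 256 and
// numBlocks = iDivUp(1000, 256) = 4, i.e. 1024 threads in total; the surplus
// 24 threads are presumably masked off inside the kernels (not shown here).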

void
integrateSystem(uint vboOldPos, uint vboNewPos,
                float* oldVel, float* newVel,
                float deltaTime,
                int numBodies)
{
    int numThreads, numBlocks;
    computeGridSize(numBodies, 256, numBlocks, numThreads);

    float *oldPos, *newPos;
    MY_CUDA_SAFE_CALL(cudaGLMapBufferObject((void**)&oldPos, vboOldPos));
    MY_CUDA_SAFE_CALL(cudaGLMapBufferObject((void**)&newPos, vboNewPos));

    // execute the kernel
    integrate<<< numBlocks, numThreads >>>((float4*)newPos, (float4*)newVel,
                                           (float4*)oldPos, (float4*)oldVel,
                                           deltaTime);

    // check if kernel invocation generated an error
    CUT_CHECK_ERROR("integrate kernel execution failed");

    MY_CUDA_SAFE_CALL(cudaGLUnmapBufferObject(vboOldPos));
    MY_CUDA_SAFE_CALL(cudaGLUnmapBufferObject(vboNewPos));
}
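
// integrateSystem() shows the pattern each launcher in this file follows:
// map the GL VBOs into CUDA's address space, launch one thread per particle,
// check for errors, then unmap so OpenGL can use the buffers again.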

void
updateGrid(uint vboPos,
           uint* gridCounters,
           uint* gridCells,
           uint numBodies,
           uint numCells)
{
    int numThreads, numBlocks;
    computeGridSize(numBodies, 256, numBlocks, numThreads);

    float *pos;
    MY_CUDA_SAFE_CALL(cudaGLMapBufferObject((void**)&pos, vboPos));

    MY_CUDA_SAFE_CALL(cudaMemset(gridCounters, 0, numCells*sizeof(uint)));

    // execute the kernel
    updateGridD<<< numBlocks, numThreads >>>((float4 *) pos,
                                             gridCounters,
                                             gridCells);

    // check if kernel invocation generated an error
    CUT_CHECK_ERROR("Kernel execution failed");

    MY_CUDA_SAFE_CALL(cudaGLUnmapBufferObject(vboPos));
}


void
calcHash(uint vboPos,
         uint* particleHash,
         int numBodies)
{
    int numThreads, numBlocks;
    computeGridSize(numBodies, 256, numBlocks, numThreads);

    float *pos;
    MY_CUDA_SAFE_CALL(cudaGLMapBufferObject((void**)&pos, vboPos));

    // execute the kernel
    calcHashD<<< numBlocks, numThreads >>>((float4 *) pos,
                                           (uint2 *) particleHash);

    // check if kernel invocation generated an error
    CUT_CHECK_ERROR("Kernel execution failed");

    MY_CUDA_SAFE_CALL(cudaGLUnmapBufferObject(vboPos));
}

void
reorderDataAndFindCellStart(uint* particleHash,
                            uint vboOldPos,
                            float* oldVel,
                            float* sortedPos,
                            float* sortedVel,
                            uint* cellStart,
                            uint numBodies,
                            uint numCells)
{
    int numThreads, numBlocks;
    computeGridSize(numBodies, 256, numBlocks, numThreads);

    MY_CUDA_SAFE_CALL(cudaMemset(cellStart, 0xffffffff, numCells*sizeof(uint)));

    float *oldPos;
    MY_CUDA_SAFE_CALL(cudaGLMapBufferObject((void**)&oldPos, vboOldPos));

#if USE_TEX
    MY_CUDA_SAFE_CALL(cudaBindTexture(0, oldPosTex, oldPos, numBodies*sizeof(float4)));
    MY_CUDA_SAFE_CALL(cudaBindTexture(0, oldVelTex, oldVel, numBodies*sizeof(float4)));
#endif

    reorderDataAndFindCellStartD<<< numBlocks, numThreads >>>(
        (uint2 *)  particleHash,
        (float4 *) oldPos,
        (float4 *) oldVel,
        (float4 *) sortedPos,
        (float4 *) sortedVel,
        (uint *)   cellStart);
    CUT_CHECK_ERROR("Kernel execution failed: reorderDataAndFindCellStartD");

#if USE_TEX
    MY_CUDA_SAFE_CALL(cudaUnbindTexture(oldPosTex));
    MY_CUDA_SAFE_CALL(cudaUnbindTexture(oldVelTex));
#endif

    MY_CUDA_SAFE_CALL(cudaGLUnmapBufferObject(vboOldPos));
}
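
// reorderDataAndFindCellStart() clears cellStart with byte value 0xff, so each
// uint starts out as 0xffffffff -- presumably the sentinel value the kernel
// uses for cells that contain no particles.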

void
collide(uint vboOldPos, uint vboNewPos,
        float* sortedPos, float* sortedVel,
        float* oldVel, float* newVel,
        uint* gridCounters,
        uint* gridCells,
        uint* particleHash,
        uint* cellStart,
        uint numBodies,
        uint numCells,
        uint maxParticlesPerCell)
{
    float4 *oldPos, *newPos;
    MY_CUDA_SAFE_CALL(cudaGLMapBufferObject((void**)&oldPos, vboOldPos));
    MY_CUDA_SAFE_CALL(cudaGLMapBufferObject((void**)&newPos, vboNewPos));

#if USE_TEX

#if USE_SORT
    // use sorted arrays
    MY_CUDA_SAFE_CALL(cudaBindTexture(0, oldPosTex, sortedPos, numBodies*sizeof(float4)));
    MY_CUDA_SAFE_CALL(cudaBindTexture(0, oldVelTex, sortedVel, numBodies*sizeof(float4)));

    MY_CUDA_SAFE_CALL(cudaBindTexture(0, particleHashTex, particleHash, numBodies*sizeof(uint2)));
    MY_CUDA_SAFE_CALL(cudaBindTexture(0, cellStartTex, cellStart, numCells*sizeof(uint)));
#else
    MY_CUDA_SAFE_CALL(cudaBindTexture(0, oldPosTex, oldPos, numBodies*sizeof(float4)));
    MY_CUDA_SAFE_CALL(cudaBindTexture(0, oldVelTex, oldVel, numBodies*sizeof(float4)));

    MY_CUDA_SAFE_CALL(cudaBindTexture(0, gridCountersTex, gridCounters, numCells*sizeof(uint)));
    MY_CUDA_SAFE_CALL(cudaBindTexture(0, gridCellsTex, gridCells, numCells*maxParticlesPerCell*sizeof(uint)));
#endif

#endif

    // thread per particle
    int numThreads, numBlocks;
    computeGridSize(numBodies, BLOCKDIM, numBlocks, numThreads);

    // execute the kernel
    collideD<<< numBlocks, numThreads >>>((float4*)newPos, (float4*)newVel,
#if USE_SORT
                                          (float4*)sortedPos, (float4*)sortedVel,
                                          (uint2 *) particleHash,
                                          cellStart
#else
                                          (float4*)oldPos, (float4*)oldVel,
                                          gridCounters,
                                          gridCells
#endif
                                          );

    // check if kernel invocation generated an error
    CUT_CHECK_ERROR("Kernel execution failed");

    MY_CUDA_SAFE_CALL(cudaGLUnmapBufferObject(vboNewPos));
    MY_CUDA_SAFE_CALL(cudaGLUnmapBufferObject(vboOldPos));

#if USE_TEX
    MY_CUDA_SAFE_CALL(cudaUnbindTexture(oldPosTex));
    MY_CUDA_SAFE_CALL(cudaUnbindTexture(oldVelTex));

#if USE_SORT
    MY_CUDA_SAFE_CALL(cudaUnbindTexture(particleHashTex));
    MY_CUDA_SAFE_CALL(cudaUnbindTexture(cellStartTex));
#else
    MY_CUDA_SAFE_CALL(cudaUnbindTexture(gridCountersTex));
    MY_CUDA_SAFE_CALL(cudaUnbindTexture(gridCellsTex));
#endif
#endif
}

} // extern "C"
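
A minimal host-side sketch of how one simulation step might drive the entry points above, assuming the USE_SORT code path. All names, buffer sizes, and the time step are illustrative assumptions reconstructed from the signatures in this file, not code from the commit; the sort itself comes from radixsort.cu and is only indicated by a comment.

    // Hypothetical glue code -- names, sizes, and call order are assumptions.
    uint numBodies = 65536, numCells = 64*64*64, maxParticlesPerCell = 4;
    uint vboOldPos = /* GL position VBO, created elsewhere */ 0;
    uint vboNewPos = /* GL position VBO, created elsewhere */ 0;
    float *oldVel, *newVel, *sortedPos, *sortedVel;
    uint  *particleHash, *cellStart, *gridCounters, *gridCells;

    allocateArray((void**)&oldVel,       numBodies * 4 * sizeof(float));
    allocateArray((void**)&newVel,       numBodies * 4 * sizeof(float));
    allocateArray((void**)&sortedPos,    numBodies * 4 * sizeof(float));
    allocateArray((void**)&sortedVel,    numBodies * 4 * sizeof(float));
    allocateArray((void**)&particleHash, numBodies * 2 * sizeof(uint));
    allocateArray((void**)&cellStart,    numCells * sizeof(uint));
    allocateArray((void**)&gridCounters, numCells * sizeof(uint));
    allocateArray((void**)&gridCells,    numCells * maxParticlesPerCell * sizeof(uint));
    registerGLBufferObject(vboOldPos);
    registerGLBufferObject(vboNewPos);
    setParameters(&hostSimParams);   // SimParams assumed to be filled in elsewhere

    // One step on the USE_SORT path:
    integrateSystem(vboOldPos, vboNewPos, oldVel, newVel, 0.01f, numBodies);
    calcHash(vboNewPos, particleHash, numBodies);
    // ... sort particleHash by cell id here, using the radix sort from radixsort.cu ...
    reorderDataAndFindCellStart(particleHash, vboNewPos, newVel,
                                sortedPos, sortedVel, cellStart,
                                numBodies, numCells);
    collide(vboOldPos, vboNewPos, sortedPos, sortedVel, oldVel, newVel,
            gridCounters, gridCells, particleHash, cellStart,
            numBodies, numCells, maxParticlesPerCell);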