diff --git a/Extras/CUDA/README.txt b/Extras/CUDA/README.txt new file mode 100644 index 000000000..e60c21902 --- /dev/null +++ b/Extras/CUDA/README.txt @@ -0,0 +1,7 @@ +btCudaBroadphase is some R&D work, a first attempt to use CUDA in Bullet. +It uses the NVidia CUDA particle demo grid broadphase as Bullet broadphase. + +Press 's' to toggle between the original CUDA particle demo and using Bullet+btCudaBroadphase. + +See some related discussion here: +http://www.bulletphysics.com/Bullet/phpBB3/viewtopic.php?f=9&t=500&start=105 diff --git a/Extras/CUDA/btCudaBroadphase.cpp b/Extras/CUDA/btCudaBroadphase.cpp new file mode 100644 index 000000000..7b06d305a --- /dev/null +++ b/Extras/CUDA/btCudaBroadphase.cpp @@ -0,0 +1,1139 @@ + +/* +Bullet Continuous Collision Detection and Physics Library +Copyright (c) 2003-2008 Erwin Coumans http://continuousphysics.com/Bullet/ + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ + +#include "particles_kernel.cuh" +#include "particleSystem.cuh" +#include "radixsort.cuh" +#include "vector_functions.h" +#include + +#ifdef WIN32//for glut.h +#include +#endif + +#include +//think different +#if defined(__APPLE__) && !defined (VMDMESA) +#include +#include +#include +#else +#include +#endif + + + +#define USE_SORT 1 + +#include "btCudaBroadphase.h" +#include "LinearMath/btAlignedAllocator.h" +#include "BulletCollision/BroadphaseCollision/btOverlappingPairCache.h" + +btCudaBroadphase::btCudaBroadphase(SimParams& simParams,int maxProxies) : +btSimpleBroadphase(maxProxies, + new (btAlignedAlloc(sizeof(btSortedOverlappingPairCache),16)) btSortedOverlappingPairCache), + m_bInitialized(false), + m_numParticles(simParams.numBodies), + m_hPos(0), + m_hVel(0), + m_currentPosRead(0), + m_currentVelRead(0), + m_currentPosWrite(1), + m_currentVelWrite(1), + m_maxParticlesPerCell(4), + m_simParams(simParams) +{ + m_ownsPairCache = true; + + m_dPos[0] = m_dPos[1] = 0; + m_dVel[0] = m_dVel[1] = 0; + + m_simParams.gridSize.x = 64; + m_simParams.gridSize.y = 64; + m_simParams.gridSize.z = 64; + + + m_simParams.numCells = m_simParams.gridSize.x*m_simParams.gridSize.y*m_simParams.gridSize.z; + m_simParams.worldSize = make_float3(2.0f, 2.0f, 2.0f); + + // set simulation parameters + + m_simParams.numBodies = m_numParticles; + m_simParams.maxParticlesPerCell = m_maxParticlesPerCell; + + m_simParams.worldOrigin = make_float3(-1.0f, -1.0f, -1.0f); + m_simParams.cellSize = make_float3(m_simParams.worldSize.x / m_simParams.gridSize.x, m_simParams.worldSize.y / m_simParams.gridSize.y, m_simParams.worldSize.z / m_simParams.gridSize.z); + + m_simParams.particleRadius = m_simParams.cellSize.x * 0.5f; + m_simParams.colliderPos = make_float4(-1.2f, -0.8f, 0.8f, 1.0f); + m_simParams.colliderRadius = 0.2f; + + m_simParams.spring = 0.5f; + 
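As a rough illustration of how the grid parameters configured in this constructor (worldOrigin, cellSize, gridSize) are used downstream, the sketch below maps a world-space position to a cell and a linear cell hash. The function and struct names are illustrative only, not part of Bullet or the CUDA demo; the hash layout simply mirrors the (z * gridSize.y + y) * gridSize.x + x scheme used later in calculateOverlappingPairs().

// Illustrative host-side helper: map a world-space position to a grid cell
// and a linear cell hash, assuming the same axis-aligned uniform grid that
// the constructor above configures. All names here are hypothetical.
#include <math.h>

struct GridParams {                       // stand-in for the relevant SimParams fields
    float worldOriginX, worldOriginY, worldOriginZ;
    float cellSizeX, cellSizeY, cellSizeZ;
    int   gridSizeX, gridSizeY, gridSizeZ;
};

inline int clampCell(int v, int maxCell) { return v < 0 ? 0 : (v > maxCell ? maxCell : v); }

// Returns the linear hash of the cell containing (x, y, z).
inline unsigned int cellHashFromPosition(const GridParams& g, float x, float y, float z)
{
    int cx = clampCell((int)floorf((x - g.worldOriginX) / g.cellSizeX), g.gridSizeX - 1);
    int cy = clampCell((int)floorf((y - g.worldOriginY) / g.cellSizeY), g.gridSizeY - 1);
    int cz = clampCell((int)floorf((z - g.worldOriginZ) / g.cellSizeZ), g.gridSizeZ - 1);
    return (unsigned int)((cz * g.gridSizeY + cy) * g.gridSizeX + cx);
}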
m_simParams.damping = 0.02f; + m_simParams.shear = 0.1f; + m_simParams.attraction = 0.0f; + m_simParams.boundaryDamping = -0.5f; + + m_simParams.gravity = make_float3(0.0f, -0.0003f, 0.0f); + m_simParams.globalDamping = 1.0f; + + _initialize(m_numParticles); + +} + +static inline float lerp(float a, float b, float t) +{ + return a + t*(b-a); +} + +static void colorRamp(float t, float *r) +{ + const int ncolors = 7; + float c[ncolors][3] = { + { 1.0, 0.0, 0.0, }, + { 1.0, 0.5, 0.0, }, + { 1.0, 1.0, 0.0, }, + { 0.0, 1.0, 0.0, }, + { 0.0, 1.0, 1.0, }, + { 0.0, 0.0, 1.0, }, + { 1.0, 0.0, 1.0, }, + }; + t = t * (ncolors-1); + int i = (int) t; + float u = t - floor(t); + r[0] = lerp(c[i][0], c[i+1][0], u); + r[1] = lerp(c[i][1], c[i+1][1], u); + r[2] = lerp(c[i][2], c[i+1][2], u); +} + + +unsigned int btCudaBroadphase::createVBO(unsigned int size) +{ + GLuint vbo; + glGenBuffers(1, &vbo); + glBindBuffer(GL_ARRAY_BUFFER, vbo); + glBufferData(GL_ARRAY_BUFFER, size, 0, GL_DYNAMIC_DRAW); + glBindBuffer(GL_ARRAY_BUFFER, 0); + registerGLBufferObject(vbo); + return vbo; +} + + +void btCudaBroadphase::_initialize(int numParticles) +{ + assert(!m_bInitialized); + + // allocate host storage + m_hPos = new float[numParticles*4]; + m_hVel = new float[numParticles*4]; + m_hSortedPos = new float[numParticles*4]; + memset(m_hPos, 0, numParticles*4*sizeof(float)); + memset(m_hVel, 0, numParticles*4*sizeof(float)); + memset(m_hSortedPos, 0, numParticles*4*sizeof(float)); + + m_hGridCounters = new uint[m_simParams.numCells]; + m_hGridCells = new uint[m_simParams.numCells*m_maxParticlesPerCell]; + memset(m_hGridCounters, 0, m_simParams.numCells*sizeof(uint)); + memset(m_hGridCells, 0, m_simParams.numCells*m_maxParticlesPerCell*sizeof(uint)); + + m_hParticleHash = new uint[numParticles*2]; + memset(m_hParticleHash, 0, numParticles*2*sizeof(uint)); + + m_hCellStart = new uint[m_simParams.numCells]; + memset(m_hCellStart, 0, m_simParams.numCells*sizeof(uint)); + + // allocate GPU data + unsigned int memSize = sizeof(float) * 4 * m_numParticles; + + m_posVbo[0] = createVBO(memSize); + m_posVbo[1] = createVBO(memSize); + + allocateArray((void**)&m_dVel[0], memSize); + allocateArray((void**)&m_dVel[1], memSize); + + allocateArray((void**)&m_dSortedPos, memSize); + allocateArray((void**)&m_dSortedVel, memSize); + +#if USE_SORT + allocateArray((void**)&m_dParticleHash[0], m_numParticles*2*sizeof(uint)); + allocateArray((void**)&m_dParticleHash[1], m_numParticles*2*sizeof(uint)); + allocateArray((void**)&m_dCellStart, m_simParams.numCells*sizeof(uint)); +#else + allocateArray((void**)&m_dGridCounters, m_numGridCells*sizeof(uint)); + allocateArray((void**)&m_dGridCells, m_numGridCells*m_maxParticlesPerCell*sizeof(uint)); +#endif + + m_colorVBO = createVBO(m_numParticles*4*sizeof(float)); + +#if 1 + // fill color buffer + glBindBufferARB(GL_ARRAY_BUFFER, m_colorVBO); + float *data = (float *) glMapBufferARB(GL_ARRAY_BUFFER, GL_WRITE_ONLY); + float *ptr = data; + for(uint i=0; i params.gridSize.x-1) || + (gridPos.y < 0) || (gridPos.y > params.gridSize.y-1) || + (gridPos.z < 0) || (gridPos.z > params.gridSize.z-1)) { + return force; + } + + uint gridHash = calcGridHash(gridPos); + + // get start of bucket for this cell + uint bucketStart = FETCH(cellStart, gridHash); + if (bucketStart == 0xffffffff) + return force; // cell empty + + // iterate over particles in this cell + for(uint i=0; i= 0) + { + +//#define _USE_BRUTEFORCE_N 1 +#ifdef _USE_BRUTEFORCE_N + + int i; + for (i=0;ifindPair(proxy0,proxy1)) + { + 
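For context on the VBO handling above: the broadphase keeps particle positions in OpenGL vertex buffer objects and hands them to CUDA through the legacy cudaGLRegisterBufferObject / cudaGLMapBufferObject interop calls that the particleSystem.cu wrappers later in this patch use. A rough sketch of that pattern, with illustrative names and without the demo's error-checking macros:

// Sketch: create a GL buffer, register it with CUDA, then map it to obtain a
// device pointer a kernel can write to. Uses the legacy cudaGL* interop API
// this demo is written against; 'numParticles' and both functions are assumptions.
#include <GL/glew.h>
#include <cuda_runtime.h>
#include <cuda_gl_interop.h>

GLuint createParticleVbo(unsigned int numParticles)
{
    GLuint vbo = 0;
    glGenBuffers(1, &vbo);
    glBindBuffer(GL_ARRAY_BUFFER, vbo);
    glBufferData(GL_ARRAY_BUFFER, numParticles * 4 * sizeof(float), 0, GL_DYNAMIC_DRAW);
    glBindBuffer(GL_ARRAY_BUFFER, 0);
    cudaGLRegisterBufferObject(vbo);          // make the VBO visible to CUDA
    return vbo;
}

void writeFromCuda(GLuint vbo /*, kernel arguments ... */)
{
    float* devicePositions = 0;
    cudaGLMapBufferObject((void**)&devicePositions, vbo);  // borrow the VBO memory
    // ... launch a kernel that writes float4 positions to devicePositions ...
    cudaGLUnmapBufferObject(vbo);                           // hand it back to OpenGL
}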
m_pairCache->addOverlappingPair(proxy0,proxy1); + } + } else + { + if (!m_pairCache->hasDeferredRemoval()) + { + if ( m_pairCache->findPair(proxy0,proxy1)) + { + m_pairCache->removeOverlappingPair(proxy0,proxy1,dispatcher); + } + } + + } + } + proxy1 = &m_pHandles[proxy1->GetNextAllocated()]; + + } + proxy0 = &m_pHandles[proxy0->GetNextAllocated()]; + + } +#else //_USE_BRUTEFORCE_N + + // update constants + setParameters(&m_simParams); + + float deltaTime = 1./60.f; + + /* + + // integrate + integrateSystem(m_posVbo[m_currentPosRead], m_posVbo[m_currentPosWrite], + m_dVel[m_currentVelRead], m_dVel[m_currentVelWrite], + deltaTime, + m_numParticles); + + + + + btSwap(m_currentPosRead, m_currentPosWrite); + btSwap(m_currentVelRead, m_currentVelWrite); +*/ + +#if USE_SORT + // sort and search method + + // calculate hash + calcHash(m_posVbo[m_currentPosRead], + m_dParticleHash[0], + m_numParticles); + +#if DEBUG_GRID + copyArrayFromDevice((void *) m_hParticleHash, (void *) m_dParticleHash[0], 0, sizeof(uint)*2*m_numParticles); + printf("particle hash:\n"); + for(uint i=0; im_min+proxy0->m_max)*0.5f; + +// float4* p = (float4*)&m_hSortedPos[index*4]; + + + int3 particleGridPos; + particleGridPos.x = floor((mypos.x() - m_simParams.worldOrigin.x) / m_simParams.cellSize.x); + particleGridPos.y = floor((mypos.y() - m_simParams.worldOrigin.y) / m_simParams.cellSize.y); + particleGridPos.z = floor((mypos.z() - m_simParams.worldOrigin.z) / m_simParams.cellSize.z); + + + //for(int z=0; z<1; z++) + for(int z=-1; z<=1; z++) + { + // for(int y=0; y<1; y++) + for(int y=-1; y<=1; y++) + { + // for(int x=0; x<1; x++) + for(int x=-1; x<=1; x++) + { + int3 gridPos; + gridPos.x = particleGridPos.x + x; + gridPos.y = particleGridPos.y + y; + gridPos.z = particleGridPos.z + z; + + if ((gridPos.x < 0) || (gridPos.x > m_simParams.gridSize.x-1) || + (gridPos.y < 0) || (gridPos.y > m_simParams.gridSize.y-1) || + (gridPos.z < 0) || (gridPos.z > m_simParams.gridSize.z-1)) + { + continue; + } + + + gridPos.x = max(0, min(gridPos.x, m_simParams.gridSize.x-1)); + gridPos.y = max(0, min(gridPos.y, m_simParams.gridSize.y-1)); + gridPos.z = max(0, min(gridPos.z, m_simParams.gridSize.z-1)); + uint gridHash = ((gridPos.z*m_simParams.gridSize.y)* m_simParams.gridSize.x) + (gridPos.y* m_simParams.gridSize.x) + gridPos.x; + + // get start of bucket for this cell + unsigned int bucketStart = m_hCellStart[gridHash]; + if (bucketStart == 0xffffffff) + continue; + + // iterate over particles in this cell + for(uint q=0; qaddOverlappingPair(proxy0,proxy1); + else + { + numRejected++; + } + + } + } + + + + //int numOverlap += myCollideCell2(gridPos + make_int3(x, y, z), index, pos, vel, oldPos, oldVel, particleHash, cellStart); + } + } + } + } + + + + +#endif //_USE_BRUTEFORCE_N + + ///if this broadphase is used in a btMultiSapBroadphase, we shouldn't sort the overlapping paircache + if (m_ownsPairCache && m_pairCache->hasDeferredRemoval()) + { + + btBroadphasePairArray& overlappingPairArray = m_pairCache->getOverlappingPairArray(); + + //perform a sort, to find duplicates and to sort 'invalid' pairs to the end + //overlappingPairArray.quickSort(btBroadphasePairSortPredicate()); + overlappingPairArray.heapSort(btBroadphasePairSortPredicate()); + //printf("A) overlappingPairArray.size()=%d\n",overlappingPairArray.size()); + + overlappingPairArray.resize(overlappingPairArray.size() - m_invalidPair); + m_invalidPair = 0; + + + btBroadphasePair previousPair; + previousPair.m_pProxy0 = 0; + previousPair.m_pProxy1 = 0; + 
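The CPU-side pair search above walks the sorted (cellHash, particleIndex) array together with the cellStart table produced on the GPU. A compact sketch of that lookup, with hypothetical names and a placeholder callback, is:

// Sketch: after the particle hashes are sorted, particles that share a cell are
// contiguous, and cellStart[hash] holds the first slot of that run
// (0xffffffff marks an empty cell). 'visit' is a placeholder callback.
typedef unsigned int uint;

template <class Visitor>
void forEachParticleInCell(uint gridHash,
                           const uint* sortedHash,   // interleaved pairs: [cellHash, particleIndex]
                           const uint* cellStart,
                           uint numParticles,
                           Visitor visit)
{
    uint bucketStart = cellStart[gridHash];
    if (bucketStart == 0xffffffff)
        return;                                   // cell is empty
    for (uint q = bucketStart; q < numParticles; ++q) {
        uint cellHash      = sortedHash[2 * q];
        uint particleIndex = sortedHash[2 * q + 1];
        if (cellHash != gridHash)
            break;                                // run for this cell has ended
        visit(particleIndex);
    }
}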
previousPair.m_algorithm = 0; + + + int i; + for (i=0;iprocessOverlap(pair); + } else + { + needsRemoval = true; + } + } else + { + //remove duplicate + needsRemoval = true; + //should have no algorithm + btAssert(!pair.m_algorithm); + } + + if (needsRemoval) + { + m_pairCache->cleanOverlappingPair(pair,dispatcher); + + // m_overlappingPairArray.swap(i,m_overlappingPairArray.size()-1); + // m_overlappingPairArray.pop_back(); + pair.m_pProxy0 = 0; + pair.m_pProxy1 = 0; + m_invalidPair++; + + } + + } + + ///if you don't like to skip the invalid pairs in the array, execute following code: + #define CLEAN_INVALID_PAIRS 1 + #ifdef CLEAN_INVALID_PAIRS + + //perform a sort, to sort 'invalid' pairs to the end + //overlappingPairArray.quickSort(btBroadphasePairSortPredicate()); + overlappingPairArray.heapSort(btBroadphasePairSortPredicate()); + //printf("B) overlappingPairArray.size()=%d\n",overlappingPairArray.size()); + + overlappingPairArray.resize(overlappingPairArray.size() - m_invalidPair); +// printf("C) overlappingPairArray.size()=%d\n",overlappingPairArray.size()); + m_invalidPair = 0; + #endif//CLEAN_INVALID_PAIRS + + } + } + + //printf("numRejected=%d\n",numRejected); +} + +static inline float frand() +{ + return rand() / (float) RAND_MAX; +} + + +void btCudaBroadphase::initGrid(unsigned int* size, float spacing, float jitter, unsigned int numParticles) +{ + srand(1973); +#ifdef CONTROLLED_START + float extra=0.01f; + for(uint z=0; z maxPerCell) + maxPerCell = m_hGridCounters[i]; + if (m_hGridCounters[i] > 0) { + printf("%d (%d): ", i, m_hGridCounters[i]); + for(uint j=0; j + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/Extras/CUDA/cutil_gl_error.h b/Extras/CUDA/cutil_gl_error.h new file mode 100644 index 000000000..8f879ebc7 --- /dev/null +++ b/Extras/CUDA/cutil_gl_error.h @@ -0,0 +1,86 @@ +/* +* Copyright 1993-2006 NVIDIA Corporation. All rights reserved. +* +* NOTICE TO USER: +* +* This source code is subject to NVIDIA ownership rights under U.S. and +* international Copyright laws. +* +* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE +* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR +* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH +* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF +* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, +* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS +* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE +* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE +* OR PERFORMANCE OF THIS SOURCE CODE. +* +* U.S. Government End Users. This source code is a "commercial item" as +* that term is defined at 48 C.F.R. 
2.101 (OCT 1995), consisting of +* "commercial computer software" and "commercial computer software +* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) +* and is provided to the U.S. Government only as a commercial end item. +* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through +* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the +* source code with only those rights set forth herein. +*/ + +#ifndef CUTIL_GL_ERROR +#define CUTIL_GL_ERROR + +/* CUda UTility Library */ + +// includes, system +#ifdef _WIN32 +# define WINDOWS_LEAN_AND_MEAN +# include +# include +# undef min +# undef max +#endif + +// includes, graphics +#if defined (__APPLE__) || defined(MACOSX) +#include +#include +#else +#include +#include +#endif + +//////////////////////////////////////////////////////////////////////////// +//! Check for OpenGL error +//! @return CUTTrue if no GL error has been encountered, otherwise 0 +//! @param file __FILE__ macro +//! @param line __LINE__ macro +//! @note The GL error is listed on stderr +//! @note This function should be used via the CHECK_ERROR_GL() macro +//////////////////////////////////////////////////////////////////////////// +CUTBoolean CUTIL_API +cutCheckErrorGL( const char* file, const int line) +{ + CUTBoolean ret_val = CUTTrue; + + // check for error + GLenum gl_error = glGetError(); + if (gl_error != GL_NO_ERROR) + { + fprintf(stderr, "GL Error in file '%s' in line %d :\n", file, line); + fprintf(stderr, "%s\n", gluErrorString(gl_error)); + ret_val = CUTFalse; + } + return ret_val; +} + +#ifdef _DEBUG + +#define CUT_CHECK_ERROR_GL() \ + if( CUTFalse == cutCheckErrorGL( __FILE__, __LINE__)) { \ + exit(EXIT_FAILURE); \ + } + +#endif // _DEBUG + +#endif // CUTIL_GL_ERROR diff --git a/Extras/CUDA/cutil_math.h b/Extras/CUDA/cutil_math.h new file mode 100644 index 000000000..de5158d17 --- /dev/null +++ b/Extras/CUDA/cutil_math.h @@ -0,0 +1,767 @@ + /* + * Copyright 1993-2007 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO USER: + * + * This source code is subject to NVIDIA ownership rights under U.S. and + * international Copyright laws. Users and possessors of this source code + * are hereby granted a nonexclusive, royalty-free license to use this code + * in individual and commercial software. + * + * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE + * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR + * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE + * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE + * OR PERFORMANCE OF THIS SOURCE CODE. + * + * U.S. Government End Users. This source code is a "commercial item" as + * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of + * "commercial computer software" and "commercial computer software + * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) + * and is provided to the U.S. Government only as a commercial end item. + * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through + * 227.7202-4 (JUNE 1995), all U.S. 
Government End Users acquire the + * source code with only those rights set forth herein. + * + * Any use of this source code in individual and commercial software must + * include, in the user documentation and internal comments to the code, + * the above Disclaimer and U.S. Government End Users Notice. + */ + +/* + This file implements common mathematical operations on vector types + (float3, float4 etc.) since these are not provided as standard by CUDA. + + The syntax is modelled on the Cg standard library. +*/ + +#ifndef CUTIL_MATH_H +#define CUTIL_MATH_H + +#include "cuda_runtime.h" + +//////////////////////////////////////////////////////////////////////////////// +typedef unsigned int uint; +typedef unsigned short ushort; + +#ifndef __CUDACC__ +#include + +inline float fminf(float a, float b) +{ + return a < b ? a : b; +} + +inline float fmaxf(float a, float b) +{ + return a < b ? a : b; +} + +inline int max(int a, int b) +{ + return a > b ? a : b; +} + +inline int min(int a, int b) +{ + return a < b ? a : b; +} +#endif + +// float functions +//////////////////////////////////////////////////////////////////////////////// + +// lerp +inline __device__ __host__ float lerp(float a, float b, float t) +{ + return a + t*(b-a); +} + +// clamp +inline __device__ __host__ float clamp(float f, float a, float b) +{ + return fmaxf(a, fminf(f, b)); +} + +// int2 functions +//////////////////////////////////////////////////////////////////////////////// + +// negate +inline __host__ __device__ int2 operator-(int2 &a) +{ + return make_int2(-a.x, -a.y); +} + +// addition +inline __host__ __device__ int2 operator+(int2 a, int2 b) +{ + return make_int2(a.x + b.x, a.y + b.y); +} +inline __host__ __device__ void operator+=(int2 &a, int2 b) +{ + a.x += b.x; a.y += b.y; +} + +// subtract +inline __host__ __device__ int2 operator-(int2 a, int2 b) +{ + return make_int2(a.x - b.x, a.y - b.y); +} +inline __host__ __device__ void operator-=(int2 &a, int2 b) +{ + a.x -= b.x; a.y -= b.y; +} + +// multiply +inline __host__ __device__ int2 operator*(int2 a, int2 b) +{ + return make_int2(a.x * b.x, a.y * b.y); +} +inline __host__ __device__ int2 operator*(int2 a, int s) +{ + return make_int2(a.x * s, a.y * s); +} +inline __host__ __device__ int2 operator*(int s, int2 a) +{ + return make_int2(a.x * s, a.y * s); +} +inline __host__ __device__ void operator*=(int2 &a, int s) +{ + a.x *= s; a.y *= s; +} + +// float2 functions +//////////////////////////////////////////////////////////////////////////////// + +// additional constructors +inline __host__ __device__ float2 make_float2(float s) +{ + return make_float2(s, s); +} +inline __host__ __device__ float2 make_float2(int2 a) +{ + return make_float2(float(a.x), float(a.y)); +} + +// negate +inline __host__ __device__ float2 operator-(float2 &a) +{ + return make_float2(-a.x, -a.y); +} + +// addition +inline __host__ __device__ float2 operator+(float2 a, float2 b) +{ + return make_float2(a.x + b.x, a.y + b.y); +} +inline __host__ __device__ void operator+=(float2 &a, float2 b) +{ + a.x += b.x; a.y += b.y; +} + +// subtract +inline __host__ __device__ float2 operator-(float2 a, float2 b) +{ + return make_float2(a.x - b.x, a.y - b.y); +} +inline __host__ __device__ void operator-=(float2 &a, float2 b) +{ + a.x -= b.x; a.y -= b.y; +} + +// multiply +inline __host__ __device__ float2 operator*(float2 a, float2 b) +{ + return make_float2(a.x * b.x, a.y * b.y); +} +inline __host__ __device__ float2 operator*(float2 a, float s) +{ + return make_float2(a.x * s, a.y * s); +} 
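Since cutil_math.h exists only to make Cg-style vector expressions work in both host and device code, here is a tiny usage sketch of the helpers it defines; the function itself is illustrative and not part of the demo.

// Sketch: the overloads in this header let kernels and host code write
// Cg-style vector math. 'bounce' is a hypothetical helper.
#include "cutil_math.h"

inline __host__ __device__ float3 bounce(float3 velocity, float3 surfaceNormal, float restitution)
{
    float3 n         = normalize(surfaceNormal);        // normalize() from cutil_math.h
    float3 reflected = reflect(velocity, n);            // i - 2.0f * n * dot(n, i)
    return lerp(velocity, reflected, clamp(restitution, 0.0f, 1.0f));  // blend via lerp/clamp
}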
+inline __host__ __device__ float2 operator*(float s, float2 a) +{ + return make_float2(a.x * s, a.y * s); +} +inline __host__ __device__ void operator*=(float2 &a, float s) +{ + a.x *= s; a.y *= s; +} + +// divide +inline __host__ __device__ float2 operator/(float2 a, float2 b) +{ + return make_float2(a.x / b.x, a.y / b.y); +} +inline __host__ __device__ float2 operator/(float2 a, float s) +{ + float inv = 1.0f / s; + return a * inv; +} +inline __host__ __device__ float2 operator/(float s, float2 a) +{ + float inv = 1.0f / s; + return a * inv; +} +inline __host__ __device__ void operator/=(float2 &a, float s) +{ + float inv = 1.0f / s; + a *= inv; +} + +// lerp +inline __device__ __host__ float2 lerp(float2 a, float2 b, float t) +{ + return a + t*(b-a); +} + +// clamp +inline __device__ __host__ float2 clamp(float2 v, float a, float b) +{ + return make_float2(clamp(v.x, a, b), clamp(v.y, a, b)); +} + +inline __device__ __host__ float2 clamp(float2 v, float2 a, float2 b) +{ + return make_float2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y)); +} + +// dot product +inline __host__ __device__ float dot(float2 a, float2 b) +{ + return a.x * b.x + a.y * b.y; +} + +// length +inline __host__ __device__ float length(float2 v) +{ + return sqrtf(dot(v, v)); +} + +// normalize +inline __host__ __device__ float2 normalize(float2 v) +{ + float invLen = 1.0f / sqrtf(dot(v, v)); + return v * invLen; +} + +// floor +inline __host__ __device__ float2 floor(const float2 v) +{ + return make_float2(floor(v.x), floor(v.y)); +} + +// reflect +inline __host__ __device__ float2 reflect(float2 i, float2 n) +{ + return i - 2.0f * n * dot(n,i); +} + +// float3 functions +//////////////////////////////////////////////////////////////////////////////// + +// additional constructors +inline __host__ __device__ float3 make_float3(float s) +{ + return make_float3(s, s, s); +} +inline __host__ __device__ float3 make_float3(float2 a) +{ + return make_float3(a.x, a.y, 0.0f); +} +inline __host__ __device__ float3 make_float3(float2 a, float s) +{ + return make_float3(a.x, a.y, s); +} +inline __host__ __device__ float3 make_float3(float4 a) +{ + return make_float3(a.x, a.y, a.z); // discards w +} +inline __host__ __device__ float3 make_float3(int3 a) +{ + return make_float3(float(a.x), float(a.y), float(a.z)); +} + +// negate +inline __host__ __device__ float3 operator-(float3 &a) +{ + return make_float3(-a.x, -a.y, -a.z); +} + +// min +static __inline__ __host__ __device__ float3 fminf(float3 a, float3 b) +{ + return make_float3(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z)); +} + +// max +static __inline__ __host__ __device__ float3 fmaxf(float3 a, float3 b) +{ + return make_float3(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z)); +} + +// addition +inline __host__ __device__ float3 operator+(float3 a, float3 b) +{ + return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); +} +inline __host__ __device__ float3 operator+(float3 a, float b) +{ + return make_float3(a.x + b, a.y + b, a.z + b); +} +inline __host__ __device__ void operator+=(float3 &a, float3 b) +{ + a.x += b.x; a.y += b.y; a.z += b.z; +} + +// subtract +inline __host__ __device__ float3 operator-(float3 a, float3 b) +{ + return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); +} +inline __host__ __device__ float3 operator-(float3 a, float b) +{ + return make_float3(a.x - b, a.y - b, a.z - b); +} +inline __host__ __device__ void operator-=(float3 &a, float3 b) +{ + a.x -= b.x; a.y -= b.y; a.z -= b.z; +} + +// multiply +inline __host__ __device__ float3 operator*(float3 a, 
float3 b) +{ + return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); +} +inline __host__ __device__ float3 operator*(float3 a, float s) +{ + return make_float3(a.x * s, a.y * s, a.z * s); +} +inline __host__ __device__ float3 operator*(float s, float3 a) +{ + return make_float3(a.x * s, a.y * s, a.z * s); +} +inline __host__ __device__ void operator*=(float3 &a, float s) +{ + a.x *= s; a.y *= s; a.z *= s; +} + +// divide +inline __host__ __device__ float3 operator/(float3 a, float3 b) +{ + return make_float3(a.x / b.x, a.y / b.y, a.z / b.z); +} +inline __host__ __device__ float3 operator/(float3 a, float s) +{ + float inv = 1.0f / s; + return a * inv; +} +inline __host__ __device__ float3 operator/(float s, float3 a) +{ + float inv = 1.0f / s; + return a * inv; +} +inline __host__ __device__ void operator/=(float3 &a, float s) +{ + float inv = 1.0f / s; + a *= inv; +} + +// lerp +inline __device__ __host__ float3 lerp(float3 a, float3 b, float t) +{ + return a + t*(b-a); +} + +// clamp +inline __device__ __host__ float3 clamp(float3 v, float a, float b) +{ + return make_float3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b)); +} + +inline __device__ __host__ float3 clamp(float3 v, float3 a, float3 b) +{ + return make_float3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z)); +} + +// dot product +inline __host__ __device__ float dot(float3 a, float3 b) +{ + return a.x * b.x + a.y * b.y + a.z * b.z; +} + +// cross product +inline __host__ __device__ float3 cross(float3 a, float3 b) +{ + return make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x); +} + +// length +inline __host__ __device__ float length(float3 v) +{ + return sqrtf(dot(v, v)); +} + +// normalize +inline __host__ __device__ float3 normalize(float3 v) +{ + float invLen = 1.0f / sqrtf(dot(v, v)); + return v * invLen; +} + +// floor +inline __host__ __device__ float3 floor(const float3 v) +{ + return make_float3(floor(v.x), floor(v.y), floor(v.z)); +} + +// reflect +inline __host__ __device__ float3 reflect(float3 i, float3 n) +{ + return i - 2.0f * n * dot(n,i); +} + +// float4 functions +//////////////////////////////////////////////////////////////////////////////// + +// additional constructors +inline __host__ __device__ float4 make_float4(float s) +{ + return make_float4(s, s, s, s); +} +inline __host__ __device__ float4 make_float4(float3 a) +{ + return make_float4(a.x, a.y, a.z, 0.0f); +} +inline __host__ __device__ float4 make_float4(float3 a, float w) +{ + return make_float4(a.x, a.y, a.z, w); +} +inline __host__ __device__ float4 make_float4(int4 a) +{ + return make_float4(float(a.x), float(a.y), float(a.z), float(a.w)); +} + +// negate +inline __host__ __device__ float4 operator-(float4 &a) +{ + return make_float4(-a.x, -a.y, -a.z, -a.w); +} + +// min +static __inline__ __host__ __device__ float4 fminf(float4 a, float4 b) +{ + return make_float4(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z), fminf(a.w,b.w)); +} + +// max +static __inline__ __host__ __device__ float4 fmaxf(float4 a, float4 b) +{ + return make_float4(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z), fmaxf(a.w,b.w)); +} + +// addition +inline __host__ __device__ float4 operator+(float4 a, float4 b) +{ + return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} +inline __host__ __device__ void operator+=(float4 &a, float4 b) +{ + a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; +} + +// subtract +inline __host__ __device__ float4 operator-(float4 a, float4 b) +{ + return make_float4(a.x - b.x, a.y - b.y, a.z - 
b.z, a.w - b.w); +} +inline __host__ __device__ void operator-=(float4 &a, float4 b) +{ + a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w; +} + +// multiply +inline __host__ __device__ float4 operator*(float4 a, float s) +{ + return make_float4(a.x * s, a.y * s, a.z * s, a.w * s); +} +inline __host__ __device__ float4 operator*(float s, float4 a) +{ + return make_float4(a.x * s, a.y * s, a.z * s, a.w * s); +} +inline __host__ __device__ void operator*=(float4 &a, float s) +{ + a.x *= s; a.y *= s; a.z *= s; a.w *= s; +} + +// divide +inline __host__ __device__ float4 operator/(float4 a, float4 b) +{ + return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); +} +inline __host__ __device__ float4 operator/(float4 a, float s) +{ + float inv = 1.0f / s; + return a * inv; +} +inline __host__ __device__ float4 operator/(float s, float4 a) +{ + float inv = 1.0f / s; + return a * inv; +} +inline __host__ __device__ void operator/=(float4 &a, float s) +{ + float inv = 1.0f / s; + a *= inv; +} + +// lerp +inline __device__ __host__ float4 lerp(float4 a, float4 b, float t) +{ + return a + t*(b-a); +} + +// clamp +inline __device__ __host__ float4 clamp(float4 v, float a, float b) +{ + return make_float4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b)); +} + +inline __device__ __host__ float4 clamp(float4 v, float4 a, float4 b) +{ + return make_float4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w)); +} + +// dot product +inline __host__ __device__ float dot(float4 a, float4 b) +{ + return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w; +} + +// length +inline __host__ __device__ float length(float4 r) +{ + return sqrtf(dot(r, r)); +} + +// normalize +inline __host__ __device__ float4 normalize(float4 v) +{ + float invLen = 1.0f / sqrtf(dot(v, v)); + return v * invLen; +} + +// floor +inline __host__ __device__ float4 floor(const float4 v) +{ + return make_float4(floor(v.x), floor(v.y), floor(v.z), floor(v.w)); +} + +// int3 functions +//////////////////////////////////////////////////////////////////////////////// + +// additional constructors +inline __host__ __device__ int3 make_int3(int s) +{ + return make_int3(s, s, s); +} +inline __host__ __device__ int3 make_int3(float3 a) +{ + return make_int3(int(a.x), int(a.y), int(a.z)); +} + +// negate +inline __host__ __device__ int3 operator-(int3 &a) +{ + return make_int3(-a.x, -a.y, -a.z); +} + +// min +inline __host__ __device__ int3 min(int3 a, int3 b) +{ + return make_int3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z)); +} + +// max +inline __host__ __device__ int3 max(int3 a, int3 b) +{ + return make_int3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z)); +} + +// addition +inline __host__ __device__ int3 operator+(int3 a, int3 b) +{ + return make_int3(a.x + b.x, a.y + b.y, a.z + b.z); +} +inline __host__ __device__ void operator+=(int3 &a, int3 b) +{ + a.x += b.x; a.y += b.y; a.z += b.z; +} + +// subtract +inline __host__ __device__ int3 operator-(int3 a, int3 b) +{ + return make_int3(a.x - b.x, a.y - b.y, a.z - b.z); +} + +inline __host__ __device__ void operator-=(int3 &a, int3 b) +{ + a.x -= b.x; a.y -= b.y; a.z -= b.z; +} + +// multiply +inline __host__ __device__ int3 operator*(int3 a, int3 b) +{ + return make_int3(a.x * b.x, a.y * b.y, a.z * b.z); +} +inline __host__ __device__ int3 operator*(int3 a, int s) +{ + return make_int3(a.x * s, a.y * s, a.z * s); +} +inline __host__ __device__ int3 operator*(int s, int3 a) +{ + return make_int3(a.x * s, a.y * s, a.z * s); +} +inline __host__ 
__device__ void operator*=(int3 &a, int s) +{ + a.x *= s; a.y *= s; a.z *= s; +} + +// divide +inline __host__ __device__ int3 operator/(int3 a, int3 b) +{ + return make_int3(a.x / b.x, a.y / b.y, a.z / b.z); +} +inline __host__ __device__ int3 operator/(int3 a, int s) +{ + return make_int3(a.x / s, a.y / s, a.z / s); +} +inline __host__ __device__ int3 operator/(int s, int3 a) +{ + return make_int3(a.x / s, a.y / s, a.z / s); +} +inline __host__ __device__ void operator/=(int3 &a, int s) +{ + a.x /= s; a.y /= s; a.z /= s; +} + +// clamp +inline __device__ __host__ int clamp(int f, int a, int b) +{ + return max(a, min(f, b)); +} + +inline __device__ __host__ int3 clamp(int3 v, int a, int b) +{ + return make_int3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b)); +} + +inline __device__ __host__ int3 clamp(int3 v, int3 a, int3 b) +{ + return make_int3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z)); +} + + +// uint3 functions +//////////////////////////////////////////////////////////////////////////////// + +// additional constructors +inline __host__ __device__ uint3 make_uint3(uint s) +{ + return make_uint3(s, s, s); +} +inline __host__ __device__ uint3 make_uint3(float3 a) +{ + return make_uint3(uint(a.x), uint(a.y), uint(a.z)); +} + +// min +inline __host__ __device__ uint3 min(uint3 a, uint3 b) +{ + return make_uint3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z)); +} + +// max +inline __host__ __device__ uint3 max(uint3 a, uint3 b) +{ + return make_uint3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z)); +} + +// addition +inline __host__ __device__ uint3 operator+(uint3 a, uint3 b) +{ + return make_uint3(a.x + b.x, a.y + b.y, a.z + b.z); +} +inline __host__ __device__ void operator+=(uint3 &a, uint3 b) +{ + a.x += b.x; a.y += b.y; a.z += b.z; +} + +// subtract +inline __host__ __device__ uint3 operator-(uint3 a, uint3 b) +{ + return make_uint3(a.x - b.x, a.y - b.y, a.z - b.z); +} + +inline __host__ __device__ void operator-=(uint3 &a, uint3 b) +{ + a.x -= b.x; a.y -= b.y; a.z -= b.z; +} + +// multiply +inline __host__ __device__ uint3 operator*(uint3 a, uint3 b) +{ + return make_uint3(a.x * b.x, a.y * b.y, a.z * b.z); +} +inline __host__ __device__ uint3 operator*(uint3 a, uint s) +{ + return make_uint3(a.x * s, a.y * s, a.z * s); +} +inline __host__ __device__ uint3 operator*(uint s, uint3 a) +{ + return make_uint3(a.x * s, a.y * s, a.z * s); +} +inline __host__ __device__ void operator*=(uint3 &a, uint s) +{ + a.x *= s; a.y *= s; a.z *= s; +} + +// divide +inline __host__ __device__ uint3 operator/(uint3 a, uint3 b) +{ + return make_uint3(a.x / b.x, a.y / b.y, a.z / b.z); +} +inline __host__ __device__ uint3 operator/(uint3 a, uint s) +{ + return make_uint3(a.x / s, a.y / s, a.z / s); +} +inline __host__ __device__ uint3 operator/(uint s, uint3 a) +{ + return make_uint3(a.x / s, a.y / s, a.z / s); +} +inline __host__ __device__ void operator/=(uint3 &a, uint s) +{ + a.x /= s; a.y /= s; a.z /= s; +} + +// clamp +inline __device__ __host__ uint clamp(uint f, uint a, uint b) +{ + return max(a, min(f, b)); +} + +inline __device__ __host__ uint3 clamp(uint3 v, uint a, uint b) +{ + return make_uint3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b)); +} + +inline __device__ __host__ uint3 clamp(uint3 v, uint3 a, uint3 b) +{ + return make_uint3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z)); +} + +#endif diff --git a/Extras/CUDA/param.cpp b/Extras/CUDA/param.cpp new file mode 100644 index 000000000..bad5fecc4 --- /dev/null +++ b/Extras/CUDA/param.cpp @@ -0,0 
+1,3 @@ +#include + +const Param dummy("error"); diff --git a/Extras/CUDA/param.h b/Extras/CUDA/param.h new file mode 100644 index 000000000..0570af1dd --- /dev/null +++ b/Extras/CUDA/param.h @@ -0,0 +1,226 @@ +/* + Simple parameter system + sgreen@nvidia.com 4/2001 +*/ + +#ifndef PARAM_H +#define PARAM_H + +#ifdef _WIN32 +# pragma warning(disable:4786) // No stupid debug warnings +#endif + +#include +#include +#include +#include +#include +#include + +// base class for named parameter +class ParamBase { +public: + ParamBase(char *name) { m_name = name; } + + virtual ~ParamBase() { } + + std::string *GetName() { return &m_name; } + + virtual float GetFloatValue() = 0; + virtual int GetIntValue() = 0; + + virtual std::string GetValueString() = 0; + + virtual void Reset() = 0; + virtual void Increment() = 0; + virtual void Decrement() = 0; + + virtual float GetPercentage() = 0; + virtual void SetPercentage(float p) = 0; + + virtual void Write(std::ostream &stream) = 0; + virtual void Read(std::istream &stream) = 0; + + virtual bool IsList() = 0; + +protected: + std::string m_name; +}; + +// derived class for single-valued parameter +template class Param : public ParamBase { +public: + Param(char *name, T value = 0, T min = 0, T max = 10000, T step = 1, T* ptr = 0) + : ParamBase(name) + { + if (ptr) { + m_ptr = ptr; + } else { + m_ptr = &m_value; + } + *m_ptr = value; + m_default = value; + + m_min = min; + m_max = max; + m_step = step; + } + + virtual ~Param() { } + + virtual float GetFloatValue() { return (float) *m_ptr; } + virtual int GetIntValue() { return (int) *m_ptr; } + + T GetValue() const { return *m_ptr; } + T SetValue(const T value) { *m_ptr = value; } + + // inherited functions + std::string GetValueString() + { + std::ostringstream ost; + ost< m_max) + *m_ptr = m_max; + } + + void Decrement() + { + *m_ptr -= m_step; + if (*m_ptr < m_min) + *m_ptr = m_min; + } + + void Write(std::ostream &stream) { stream << m_name << " " << *m_ptr << '\n'; } + void Read(std::istream &stream) { stream >> m_name >> *m_ptr; } + + bool IsList() { return false; } + +private: + T m_value; + T *m_ptr; // pointer to value declared elsewhere + T m_default, m_min, m_max; + T m_step; +}; + + +extern const Param dummy; + +// list of parameters +class ParamList : public ParamBase { +public: + ParamList(char *name = "") + : ParamBase(name) + { + active = true; + } + + virtual ~ParamList() { } + + virtual float GetFloatValue() { return 0.0f; } + virtual int GetIntValue() { return 0; } + + void AddParam(ParamBase *param) + { + m_params.push_back(param); + m_map[*param->GetName()] = param; + m_current = m_params.begin(); + } + + // look-up parameter based on name + ParamBase *GetParam(char *name) + { + ParamBase *p = m_map[name]; + + if (p) + return p; + else + return (ParamBase *) &dummy; + } + + ParamBase *GetParam(int i) + { + return m_params[i]; + } + + ParamBase *GetCurrent() + { + return *m_current; + } + + int GetSize() { return (int)m_params.size(); } + + // inherited functions + std::string GetValueString() + { +// return m_name; + return "list"; + } + + void Reset() + { + m_current = m_params.begin(); + } + + void Increment() + { + m_current++; + if (m_current == m_params.end()) + m_current = m_params.begin(); + } + + void Decrement() + { + if (m_current == m_params.begin()) + m_current = m_params.end()-1; + else + m_current--; + + } + + float GetPercentage() { return 0.0f; } + void SetPercentage(float /*p*/) { ; } + + void Write(std::ostream &stream) + { + stream << m_name << '\n'; + 
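To make the role of this parameter system concrete, here is a small hypothetical usage sketch in the style of the NVIDIA demos; the variable names and slider ranges are illustrative, not taken from this patch.

// Sketch: build a parameter list whose sliders drive existing float variables.
// Param's constructor is (name, value, min, max, step, pointer-to-value).
#include "paramgl.h"

float gravityY = -0.0003f;   // illustrative simulation variables
float damping  = 0.02f;

ParamListGL* buildParamList()
{
    ParamListGL* params = new ParamListGL("simulation");
    params->AddParam(new Param<float>("gravity y", gravityY, -0.001f, 0.001f, 0.0001f, &gravityY));
    params->AddParam(new Param<float>("damping",   damping,   0.0f,   0.1f,   0.005f,  &damping));
    // In a GLUT app one would then call params->Render(0, 0) from the display
    // callback and params->Mouse()/Motion()/Special() from the input callbacks.
    return params;
}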
for(std::vector::const_iterator p = m_params.begin(); p != m_params.end(); ++p) { + (*p)->Write(stream); + } + } + + void Read(std::istream &stream) + { + stream >> m_name; + for(std::vector::const_iterator p = m_params.begin(); p != m_params.end(); ++p) { + (*p)->Read(stream); + } + } + + bool IsList() { return true; } + +protected: + bool active; + std::vector m_params; + std::map m_map; + std::vector::const_iterator m_current; +}; + +#endif diff --git a/Extras/CUDA/paramgl.cpp b/Extras/CUDA/paramgl.cpp new file mode 100644 index 000000000..39e74e5d6 --- /dev/null +++ b/Extras/CUDA/paramgl.cpp @@ -0,0 +1,209 @@ +/* + ParamListGL + - class derived from ParamList to do simple OpenGL rendering of a parameter list + sgg 8/2001 +*/ + +#include +#include + +void beginWinCoords(void) +{ + glMatrixMode(GL_MODELVIEW); + glPushMatrix(); + glLoadIdentity(); + glTranslatef(0.0, glutGet(GLUT_WINDOW_HEIGHT) - 1, 0.0); + glScalef(1.0, -1.0, 1.0); + + glMatrixMode(GL_PROJECTION); + glPushMatrix(); + glLoadIdentity(); + glOrtho(0, glutGet(GLUT_WINDOW_WIDTH), 0, glutGet(GLUT_WINDOW_HEIGHT), -1, 1); + + glMatrixMode(GL_MODELVIEW); +} + +void endWinCoords(void) +{ + glMatrixMode(GL_PROJECTION); + glPopMatrix(); + + glMatrixMode(GL_MODELVIEW); + glPopMatrix(); +} + +void glPrint(int x, int y, const char *s, void *font) +{ + int i, len; + + glRasterPos2f(x, y); + len = (int) strlen(s); + for (i = 0; i < len; i++) { + glutBitmapCharacter(font, s[i]); + } +} + +void glPrintShadowed(int x, int y, const char *s, void *font, float *color) +{ + glColor3f(0.0, 0.0, 0.0); + glPrint(x-1, y-1, s, font); + + glColor3fv((GLfloat *) color); + glPrint(x, y, s, font); +} + + +ParamListGL::ParamListGL(char *name) : ParamList(name) +{ + font = (void *) GLUT_BITMAP_9_BY_15; +// font = (void *) GLUT_BITMAP_8_BY_13; + + bar_x = 250; + bar_w = 250; + bar_h = 10; + bar_offset = 5; + text_x = 5; + separation = 15; + value_x = 200; + font_h = 15; + start_x = 0; + start_y = 0; + + text_col_selected[0] = 1.0; + text_col_selected[1] = 1.0; + text_col_selected[2] = 1.0; + + text_col_unselected[0] = 0.75; + text_col_unselected[1] = 0.75; + text_col_unselected[2] = 0.75; + + bar_col_outer[0] = 0.0; + bar_col_outer[1] = 0.0; + bar_col_outer[2] = 0.0; + + bar_col_inner[0] = 0.0; + bar_col_inner[1] = 0.0; + bar_col_inner[2] = 0.0; + + text_col_shadow[0] = 0.0; + text_col_shadow[1] = 0.0; + text_col_shadow[2] = 0.0; +} + +void +ParamListGL::Render(int x, int y, bool shadow) +{ + beginWinCoords(); + + start_x = x; start_y = y; + + for(std::vector::const_iterator p = m_params.begin(); p != m_params.end(); ++p) { + if ((*p)->IsList()) { + ParamListGL *list = (ParamListGL *) (*p); + list->Render(x+10, y); + y += separation*list->GetSize(); + + } else { + if (p == m_current) + glColor3fv(text_col_selected); + else + glColor3fv(text_col_unselected); + + if (shadow) { + glPrintShadowed(x + text_x, y + font_h, (*p)->GetName()->c_str(), font, (p == m_current) ? text_col_selected : text_col_unselected); + glPrintShadowed(x + value_x, y + font_h, (*p)->GetValueString().c_str(), font, (p == m_current) ? 
text_col_selected : text_col_unselected); + } + else { + glPrint(x + text_x, y + font_h, (*p)->GetName()->c_str(), font); + glPrint(x + value_x, y + font_h, (*p)->GetValueString().c_str(), font); + } + +// glColor3fv((GLfloat *) &bar_col_outer); + glBegin(GL_LINE_LOOP); + glVertex2f(x + bar_x, y + bar_offset); + glVertex2f(x + bar_x + bar_w, y + bar_offset); + glVertex2f(x + bar_x + bar_w, y + bar_offset + bar_h); + glVertex2f(x + bar_x, y + bar_offset + bar_h); + glEnd(); + +// glColor3fv((GLfloat *) &bar_col_inner); + glRectf(x + bar_x, y + bar_offset + bar_h, x + bar_x + (bar_w*(*p)->GetPercentage()), y + bar_offset); + + y += separation; + } + + } + + endWinCoords(); +} + + +bool +ParamListGL::Mouse(int x, int y, int button, int state) +{ + if ((y < start_y) || (y > (int)(start_y + (separation * m_params.size()) - 1))) + return false; + + int i = (y - start_y) / separation; + + if ((button==GLUT_LEFT_BUTTON) && (state==GLUT_DOWN)) { +#if defined(__GNUC__) && (__GNUC__ < 3) + m_current = &m_params[i]; +#else + // MJH: workaround since the version of vector::at used here is non-standard + for (m_current = m_params.begin(); m_current != m_params.end() && i > 0; m_current++, i--); + //m_current = (std::vector::const_iterator)&m_params.at(i); +#endif + + if ((x > bar_x) && (x < bar_x + bar_w)) { + Motion(x, y); + } + } + return true; +} + +bool +ParamListGL::Motion(int x, int y) +{ + if ((y < start_y) || (y > start_y + (separation * (int)m_params.size()) - 1) ) + return false; + + if (x < bar_x) { + (*m_current)->SetPercentage(0.0); + return true; + } + + if (x > bar_x + bar_w) { + (*m_current)->SetPercentage(1.0); + return true; + } + + (*m_current)->SetPercentage((x-bar_x) / (float) bar_w); + return true; +} + +void +ParamListGL::Special(int key, int /*x*/, int /*y*/) +{ + switch(key) { + case GLUT_KEY_DOWN: + Increment(); + break; + case GLUT_KEY_UP: + Decrement(); + break; + case GLUT_KEY_RIGHT: + GetCurrent()->Increment(); + break; + case GLUT_KEY_LEFT: + GetCurrent()->Decrement(); + break; + case GLUT_KEY_HOME: + GetCurrent()->Reset(); + break; + case GLUT_KEY_END: + GetCurrent()->SetPercentage(1.0); + break; + } + glutPostRedisplay(); +} diff --git a/Extras/CUDA/paramgl.h b/Extras/CUDA/paramgl.h new file mode 100644 index 000000000..31aa00a54 --- /dev/null +++ b/Extras/CUDA/paramgl.h @@ -0,0 +1,54 @@ +/* + ParamListGL + - class derived from ParamList to do simple OpenGL rendering of a parameter list + sgg 8/2001 +*/ + +#ifndef PARAMGL_H +#define PARAMGL_H + +#if defined(__APPLE__) || defined(MACOSX) +#include +#else +#include +#endif + +#include + +void beginWinCoords(); +void endWinCoords(); +void glPrint(int x, int y, const char *s, void *font); +void glPrintShadowed(int x, int y, const char *s, void *font, float *color); + +class ParamListGL : public ParamList { +public: + ParamListGL(char *name = ""); + + void Render(int x, int y, bool shadow = false); + bool Mouse(int x, int y, int button=GLUT_LEFT_BUTTON, int state=GLUT_DOWN); + bool Motion(int x, int y); + void Special(int key, int x, int y); + + void SetSelectedColor(float r, float g, float b) { text_col_selected[0] = r; text_col_selected[1] = g; text_col_selected[2] = b; } + void SetUnSelectedColor(float r, float g, float b) { text_col_unselected[0] = r; text_col_unselected[1] = g; text_col_unselected[2] = b; } + + int bar_x; + int bar_w; + int bar_h; + int text_x; + int separation; + int value_x; + int font_h; + int start_x, start_y; + int bar_offset; + + float text_col_selected[3]; + float text_col_unselected[3]; + float 
text_col_shadow[3]; + float bar_col_outer[3]; + float bar_col_inner[3]; + + void *font; +}; + +#endif diff --git a/Extras/CUDA/particleSystem.cpp b/Extras/CUDA/particleSystem.cpp new file mode 100644 index 000000000..8e6704cf6 --- /dev/null +++ b/Extras/CUDA/particleSystem.cpp @@ -0,0 +1,280 @@ +/* + * Copyright 1993-2006 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO USER: + * + * This source code is subject to NVIDIA ownership rights under U.S. and + * international Copyright laws. + * + * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE + * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR + * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE + * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE + * OR PERFORMANCE OF THIS SOURCE CODE. + * + * U.S. Government End Users. This source code is a "commercial item" as + * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of + * "commercial computer software" and "commercial computer software + * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) + * and is provided to the U.S. Government only as a commercial end item. + * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through + * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the + * source code with only those rights set forth herein. + */ + +#include "particleSystem.h" +#include "particleSystem.cuh" +#include "radixsort.cuh" +#include "particles_kernel.cuh" + + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "btCudaBroadphase.h" + + +#ifndef CUDART_PI_F +#define CUDART_PI_F 3.141592654f +#endif + + +ParticleSystem::ParticleSystem(uint numParticles, uint3 gridSize) : + m_simulationMode(SIMULATION_BULLET_CPU)//SIMULATION_CUDA) +{ + this->m_params.numBodies = numParticles; + this->m_params.m_gridSize = gridSize; + initializeBullet(); +} + +ParticleSystem::~ParticleSystem() +{ + finalizeBullet(); +} +#include "../../Demos/OpenGL/GLDebugDrawer.h" + +GLDebugDrawer debugDrawer; + +void ParticleSystem::initializeBullet() +{ + + m_collisionConfiguration = new btDefaultCollisionConfiguration(); + m_dispatcher = new btCollisionDispatcher(m_collisionConfiguration); + //m_broadphase = new btDbvtBroadphase(); + //m_broadphase = new btAxisSweep3(btVector3(-3,-3,-3),btVector3(3,3,3)); + m_broadphase = new btCudaBroadphase(m_params,m_params.numBodies+6); + + + + m_constraintSolver=new btSequentialImpulseConstraintSolver(); + m_dynamicsWorld = new btDiscreteDynamicsWorld(m_dispatcher,m_broadphase,m_constraintSolver,m_collisionConfiguration); + m_dynamicsWorld->setDebugDrawer(&debugDrawer); + //debugDrawer.setDebugMode(btIDebugDraw::DBG_DrawPairs); + + + m_dynamicsWorld->setGravity(100*btVector3(m_params.gravity.x,m_params.gravity.y,m_params.gravity.z)); + m_dynamicsWorld->getSolverInfo().m_numIterations=1; + + btBoxShape* worldBox = new btBoxShape(btVector3(m_params.worldSize.x/2,m_params.worldSize.y/2,m_params.worldSize.z/2)); + worldBox->setMargin(0.f); + + //create 6 static planes for the world cube + btStaticPlaneShape* planeShape; + btRigidBody* body; + 
btVector3 worldSize(); + + int i; + + + + btSphereShape* particleSphere = new btSphereShape(m_params.particleRadius); + particleSphere->setMargin(0.0); + btVector3 localInertia; + particleSphere->calculateLocalInertia(1,localInertia); + + float* m_hPos = m_broadphase->getHposPtr(); + + for (i=0;isetActivationState(DISABLE_DEACTIVATION); + m_bulletParticles.push_back(body); + m_dynamicsWorld->addRigidBody(body); + } + + reset(CONFIG_GRID); + +/* for (i=0;i<6;i++) + { + btVector4 planeEq; + worldBox->getPlaneEquation(planeEq,i); + + planeShape = new btStaticPlaneShape(-planeEq,planeEq.getW()); + planeShape->setMargin(0.f); + btRigidBody::btRigidBodyConstructionInfo rbci(0.f,0,planeShape); + body = new btRigidBody(rbci); + m_dynamicsWorld->addRigidBody(body); + } +*/ + +} + +void ParticleSystem::finalizeBullet() +{ + delete m_dynamicsWorld; + delete m_constraintSolver; + delete m_broadphase; + delete m_dispatcher ; + delete m_collisionConfiguration; +} + + + +void +ParticleSystem::update(float deltaTime) +{ + assert(m_bInitialized); + + switch (m_simulationMode) + { + case SIMULATION_CUDA: + { + m_broadphase->quickHack(deltaTime); + //todo + break; + } + case SIMULATION_BULLET_CPU: + { + m_broadphase->integrate(); + + + ///copy particles from device to main memory + { + float* hPosData = m_broadphase->copyBuffersFromDeviceToHost(); + float* m_hVel = m_broadphase->getHvelPtr(); + m_broadphase->copyBuffersFromHostToDevice(); + + + //sync transform and velocity from particle system to Bullet + + for (int i=0;igetWorldTransform(); + trans.setOrigin(btVector3(hPosData[i*4],hPosData[i*4+1],hPosData[i*4+2])); + m_bulletParticles[i]->setLinearVelocity(btVector3(m_hVel[i*4],m_hVel[i*4+1],m_hVel[i*4+2])*10.); + } + } + + m_dynamicsWorld->stepSimulation(deltaTime); + +/* for (int i=0;icopyBuffersFromDeviceToHost(); + float* m_hVel = m_broadphase->getHvelPtr(); + + //sync transform and velocity from Bullet to particle system + for (int i=0;igetWorldTransform(); + hPosData[i*4] = trans.getOrigin().getX(); + hPosData[i*4+1] = trans.getOrigin().getY(); + hPosData[i*4+2] = trans.getOrigin().getZ(); + + m_hVel[i*4] = m_bulletParticles[i]->getLinearVelocity().getX()/10.; + m_hVel[i*4+1] = m_bulletParticles[i]->getLinearVelocity().getY()/10.; + m_hVel[i*4+2] = m_bulletParticles[i]->getLinearVelocity().getZ()/10.; + } + + m_broadphase->copyBuffersFromHostToDevice(); + } + + break; + + } + + + + default: + { + printf("unknown simulation method\n"); + } + }; + +} + + + +float* ParticleSystem::getArray(ParticleArray array) +{ + return m_broadphase->getArray((btCudaBroadphase::ParticleArray)array); + +} +void ParticleSystem::debugDraw() +{ + glDisable(GL_DEPTH_TEST); + m_dynamicsWorld->debugDrawWorld(); + glEnable(GL_DEPTH_TEST); +} + + +void ParticleSystem::reset(ParticleConfig config) +{ + m_broadphase->reset((btCudaBroadphase::ParticleConfig)config); + for (int i=0;isetAngularVelocity(btVector3(0,0,0)); + } + +} + + +void ParticleSystem::addSphere(int start, float *pos, float *vel, int r, float spacing) +{ + m_broadphase->addSphere(start,pos,vel,r,spacing); +} + +unsigned int ParticleSystem::getCurrentReadBuffer() const +{ + return m_broadphase->getCurrentReadBuffer(); +} +unsigned int ParticleSystem::getColorBuffer() const +{ + return m_broadphase->getColorBuffer(); +} + +void ParticleSystem::dumpGrid() +{ + return m_broadphase->dumpGrid(); +} + +void ParticleSystem::dumpParticles(uint start, uint count) +{ + m_broadphase->dumpParticles(start,count); +} + +int ParticleSystem::getNumParticles() const +{ + return 
m_params.numBodies; +} \ No newline at end of file diff --git a/Extras/CUDA/particleSystem.cu b/Extras/CUDA/particleSystem.cu new file mode 100644 index 000000000..ce8117e86 --- /dev/null +++ b/Extras/CUDA/particleSystem.cu @@ -0,0 +1,331 @@ +/* + * Copyright 1993-2006 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO USER: + * + * This source code is subject to NVIDIA ownership rights under U.S. and + * international Copyright laws. + * + * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE + * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR + * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE + * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE + * OR PERFORMANCE OF THIS SOURCE CODE. + * + * U.S. Government End Users. This source code is a "commercial item" as + * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of + * "commercial computer software" and "commercial computer software + * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) + * and is provided to the U.S. Government only as a commercial end item. + * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through + * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the + * source code with only those rights set forth herein. + */ + +//#include +#include +#include +#include + +#if defined(__APPLE__) || defined(MACOSX) +#include +#else +#include +#endif + +#include + +#include "particles_kernel.cu" +#include "radixsort.cu" + + //! 
Check for CUDA error +# define CUT_CHECK_ERROR(errorMessage) do { \ + cudaError_t err = cudaGetLastError(); \ + if( cudaSuccess != err) { \ + fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \ + errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\ + exit(EXIT_FAILURE); \ + } \ + err = cudaThreadSynchronize(); \ + if( cudaSuccess != err) { \ + fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \ + errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\ + exit(EXIT_FAILURE); \ + } } while (0) + + +# define MY_CUDA_SAFE_CALL_NO_SYNC( call) do { \ + cudaError err = call; \ + if( cudaSuccess != err) { \ + fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \ + __FILE__, __LINE__, cudaGetErrorString( err) ); \ + exit(EXIT_FAILURE); \ + } } while (0) + +# define MY_CUDA_SAFE_CALL( call) do { \ + MY_CUDA_SAFE_CALL_NO_SYNC(call); \ + cudaError err = cudaThreadSynchronize(); \ + if( cudaSuccess != err) { \ + fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \ + __FILE__, __LINE__, cudaGetErrorString( err) ); \ + exit(EXIT_FAILURE); \ + } } while (0) + + +extern "C" +{ + +void cudaInit(int argc, char **argv) +{ + //CUT_DEVICE_INIT(argc, argv); +} + +void allocateArray(void **devPtr, size_t size) +{ + MY_CUDA_SAFE_CALL(cudaMalloc(devPtr, size)); +} + +void freeArray(void *devPtr) +{ + MY_CUDA_SAFE_CALL(cudaFree(devPtr)); +} + +void threadSync() +{ + MY_CUDA_SAFE_CALL(cudaThreadSynchronize()); +} + +void copyArrayFromDevice(void* host, const void* device, unsigned int vbo, int size) +{ + if (vbo) + MY_CUDA_SAFE_CALL(cudaGLMapBufferObject((void**)&device, vbo)); + MY_CUDA_SAFE_CALL(cudaMemcpy(host, device, size, cudaMemcpyDeviceToHost)); + if (vbo) + MY_CUDA_SAFE_CALL(cudaGLUnmapBufferObject(vbo)); +} + +void copyArrayToDevice(void* device, const void* host, int offset, int size) +{ + MY_CUDA_SAFE_CALL(cudaMemcpy((char *) device + offset, host, size, cudaMemcpyHostToDevice)); +} + +void registerGLBufferObject(uint vbo) +{ + MY_CUDA_SAFE_CALL(cudaGLRegisterBufferObject(vbo)); +} + +void unregisterGLBufferObject(uint vbo) +{ + MY_CUDA_SAFE_CALL(cudaGLUnregisterBufferObject(vbo)); +} + +void setParameters(SimParams *hostParams) +{ + // copy parameters to constant memory + MY_CUDA_SAFE_CALL( cudaMemcpyToSymbol(params, hostParams, sizeof(SimParams)) ); +} + +//Round a / b to nearest higher integer value +int iDivUp(int a, int b){ + return (a % b != 0) ? 
(a / b + 1) : (a / b); +} + +// compute grid and thread block size for a given number of elements +void computeGridSize(int n, int blockSize, int &numBlocks, int &numThreads) +{ + numThreads = min(blockSize, n); + numBlocks = iDivUp(n, numThreads); +} + +void +integrateSystem(uint vboOldPos, uint vboNewPos, + float* oldVel, float* newVel, + float deltaTime, + int numBodies) +{ + int numThreads, numBlocks; + computeGridSize(numBodies, 256, numBlocks, numThreads); + + float *oldPos, *newPos; + MY_CUDA_SAFE_CALL(cudaGLMapBufferObject((void**)&oldPos, vboOldPos)); + MY_CUDA_SAFE_CALL(cudaGLMapBufferObject((void**)&newPos, vboNewPos)); + + // execute the kernel + integrate<<< numBlocks, numThreads >>>((float4*)newPos, (float4*)newVel, + (float4*)oldPos, (float4*)oldVel, + deltaTime); + + // check if kernel invocation generated an error + CUT_CHECK_ERROR("integrate kernel execution failed"); + + MY_CUDA_SAFE_CALL(cudaGLUnmapBufferObject(vboOldPos)); + MY_CUDA_SAFE_CALL(cudaGLUnmapBufferObject(vboNewPos)); +} + +void +updateGrid(uint vboPos, + uint* gridCounters, + uint* gridCells, + uint numBodies, + uint numCells) +{ + int numThreads, numBlocks; + computeGridSize(numBodies, 256, numBlocks, numThreads); + + float *pos; + MY_CUDA_SAFE_CALL(cudaGLMapBufferObject((void**)&pos, vboPos)); + + MY_CUDA_SAFE_CALL(cudaMemset(gridCounters, 0, numCells*sizeof(uint))); + + // execute the kernel + updateGridD<<< numBlocks, numThreads >>>((float4 *) pos, + gridCounters, + gridCells); + + // check if kernel invocation generated an error + CUT_CHECK_ERROR("Kernel execution failed"); + + MY_CUDA_SAFE_CALL(cudaGLUnmapBufferObject(vboPos)); +} + + +void +calcHash(uint vboPos, + uint* particleHash, + int numBodies) +{ + int numThreads, numBlocks; + computeGridSize(numBodies, 256, numBlocks, numThreads); + + float *pos; + MY_CUDA_SAFE_CALL(cudaGLMapBufferObject((void**)&pos, vboPos)); + + // execute the kernel + calcHashD<<< numBlocks, numThreads >>>((float4 *) pos, + (uint2 *) particleHash); + + // check if kernel invocation generated an error + CUT_CHECK_ERROR("Kernel execution failed"); + + MY_CUDA_SAFE_CALL(cudaGLUnmapBufferObject(vboPos)); +} + +void +reorderDataAndFindCellStart(uint* particleHash, + uint vboOldPos, + float* oldVel, + float* sortedPos, + float* sortedVel, + uint* cellStart, + uint numBodies, + uint numCells) +{ + int numThreads, numBlocks; + computeGridSize(numBodies, 256, numBlocks, numThreads); + + MY_CUDA_SAFE_CALL(cudaMemset(cellStart, 0xffffffff, numCells*sizeof(uint))); + + float *oldPos; + MY_CUDA_SAFE_CALL(cudaGLMapBufferObject((void**)&oldPos, vboOldPos)); + +#if USE_TEX + MY_CUDA_SAFE_CALL(cudaBindTexture(0, oldPosTex, oldPos, numBodies*sizeof(float4))); + MY_CUDA_SAFE_CALL(cudaBindTexture(0, oldVelTex, oldVel, numBodies*sizeof(float4))); +#endif + + reorderDataAndFindCellStartD<<< numBlocks, numThreads >>>( + (uint2 *) particleHash, + (float4 *) oldPos, + (float4 *) oldVel, + (float4 *) sortedPos, + (float4 *) sortedVel, + (uint *) cellStart); + CUT_CHECK_ERROR("Kernel execution failed: reorderDataAndFindCellStartD"); + +#if USE_TEX + MY_CUDA_SAFE_CALL(cudaUnbindTexture(oldPosTex)); + MY_CUDA_SAFE_CALL(cudaUnbindTexture(oldVelTex)); +#endif + + MY_CUDA_SAFE_CALL(cudaGLUnmapBufferObject(vboOldPos)); +} + +void +collide(uint vboOldPos, uint vboNewPos, + float* sortedPos, float* sortedVel, + float* oldVel, float* newVel, + uint* gridCounters, + uint* gridCells, + uint* particleHash, + uint* cellStart, + uint numBodies, + uint numCells, + uint maxParticlesPerCell) +{ + float4 *oldPos, 
*newPos; + MY_CUDA_SAFE_CALL(cudaGLMapBufferObject((void**)&oldPos, vboOldPos)); + MY_CUDA_SAFE_CALL(cudaGLMapBufferObject((void**)&newPos, vboNewPos)); + +#if USE_TEX + +#if USE_SORT + // use sorted arrays + MY_CUDA_SAFE_CALL(cudaBindTexture(0, oldPosTex, sortedPos, numBodies*sizeof(float4))); + MY_CUDA_SAFE_CALL(cudaBindTexture(0, oldVelTex, sortedVel, numBodies*sizeof(float4))); + + MY_CUDA_SAFE_CALL(cudaBindTexture(0, particleHashTex, particleHash, numBodies*sizeof(uint2))); + MY_CUDA_SAFE_CALL(cudaBindTexture(0, cellStartTex, cellStart, numCells*sizeof(uint))); +#else + + MY_CUDA_SAFE_CALL(cudaBindTexture(0, oldPosTex, oldPos, numBodies*sizeof(float4))); + MY_CUDA_SAFE_CALL(cudaBindTexture(0, oldVelTex, oldVel, numBodies*sizeof(float4))); + + MY_CUDA_SAFE_CALL(cudaBindTexture(0, gridCountersTex, gridCounters,numCells*sizeof(uint))); + MY_CUDA_SAFE_CALL(cudaBindTexture(0, gridCellsTex, gridCells, numCells*maxParticlesPerCell*sizeof(uint))); +#endif + +#endif + + // thread per particle + int numThreads, numBlocks; + computeGridSize(numBodies, BLOCKDIM, numBlocks, numThreads); + + // execute the kernel + collideD<<< numBlocks, numThreads >>>((float4*)newPos, (float4*)newVel, +#if USE_SORT + (float4*)sortedPos, (float4*)sortedVel, + (uint2 *) particleHash, + cellStart +#else + (float4*)oldPos, (float4*)oldVel, + gridCounters, + gridCells +#endif + ); + + // check if kernel invocation generated an error + CUT_CHECK_ERROR("Kernel execution failed"); + + MY_CUDA_SAFE_CALL(cudaGLUnmapBufferObject(vboNewPos)); + MY_CUDA_SAFE_CALL(cudaGLUnmapBufferObject(vboOldPos)); + +#if USE_TEX + MY_CUDA_SAFE_CALL(cudaUnbindTexture(oldPosTex)); + MY_CUDA_SAFE_CALL(cudaUnbindTexture(oldVelTex)); + +#if USE_SORT + MY_CUDA_SAFE_CALL(cudaUnbindTexture(particleHashTex)); + MY_CUDA_SAFE_CALL(cudaUnbindTexture(cellStartTex)); +#else + MY_CUDA_SAFE_CALL(cudaUnbindTexture(gridCountersTex)); + MY_CUDA_SAFE_CALL(cudaUnbindTexture(gridCellsTex)); +#endif +#endif +} + +} // extern "C" diff --git a/Extras/CUDA/particleSystem.cuh b/Extras/CUDA/particleSystem.cuh new file mode 100644 index 000000000..e2b297fc6 --- /dev/null +++ b/Extras/CUDA/particleSystem.cuh @@ -0,0 +1,57 @@ +extern "C" +{ +void cudaInit(int argc, char **argv); + +void allocateArray(void **devPtr, int size); +void freeArray(void *devPtr); + +void threadSync(); + +void copyArrayFromDevice(void* host, const void* device, unsigned int vbo, int size); +void copyArrayToDevice(void* device, const void* host, int offset, int size); +void registerGLBufferObject(unsigned int vbo); +void unregisterGLBufferObject(unsigned int vbo); + +void setParameters(SimParams *hostParams); + +void +integrateSystem(uint vboOldPos, uint vboNewPos, + float* oldVel, float* newVel, + float deltaTime, + int numBodies); + +void +updateGrid(uint vboPos, + uint* gridCounters, + uint* gridCells, + uint numBodies, + uint numCells); + +void +calcHash(uint vboPos, + uint* particleHash, + int numBodies); + +void +reorderDataAndFindCellStart(uint* particleHash, + uint vboOldPos, + float* oldVel, + float* sortedPos, + float* sortedVel, + uint* cellStart, + uint numBodies, + uint numCells); + +void +collide(uint vboOldPos, uint vboNewPos, + float* sortedPos, float* sortedVel, + float* oldVel, float* newVel, + uint* gridCounters, + uint* gridCells, + uint* particleHash, + uint* cellStart, + uint numBodies, + uint numCells, + uint maxParticlesPerCell); + +} diff --git a/Extras/CUDA/particleSystem.h b/Extras/CUDA/particleSystem.h new file mode 100644 index 000000000..3abf09078 --- /dev/null +++ 
b/Extras/CUDA/particleSystem.h @@ -0,0 +1,153 @@ +/* + * Copyright 1993-2006 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO USER: + * + * This source code is subject to NVIDIA ownership rights under U.S. and + * international Copyright laws. + * + * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE + * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR + * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE + * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE + * OR PERFORMANCE OF THIS SOURCE CODE. + * + * U.S. Government End Users. This source code is a "commercial item" as + * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of + * "commercial computer software" and "commercial computer software + * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) + * and is provided to the U.S. Government only as a commercial end item. + * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through + * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the + * source code with only those rights set forth herein. + */ + +#ifndef __BODYSYSTEMCUDA_H__ +#define __BODYSYSTEMCUDA_H__ + +#define DEBUG_GRID 0 +#define DO_TIMING 0 + +#include "particles_kernel.cuh" +#include "vector_functions.h" +#include "LinearMath/btAlignedObjectArray.h" + +// CUDA BodySystem: runs on the GPU +class ParticleSystem +{ +public: + ParticleSystem(uint numParticles, uint3 gridSize); + ~ParticleSystem(); + + + enum ParticleArray + { + POSITION, + VELOCITY, + }; + + enum ParticleConfig + { + CONFIG_RANDOM, + CONFIG_GRID, + _NUM_CONFIGS + }; + + enum SimulationMode + { + SIMULATION_CUDA, + SIMULATION_BULLET_CPU, + SIMULATION_NUM_MODES + }; + + void reset(ParticleConfig config); + void debugDraw(); + + + /// + ///Bullet data + /// + + void initializeBullet(); + void finalizeBullet(); + class btDiscreteDynamicsWorld* m_dynamicsWorld; + class btDefaultCollisionConfiguration* m_collisionConfiguration; + class btCollisionDispatcher* m_dispatcher; + class btCudaBroadphase* m_broadphase; + class btSequentialImpulseConstraintSolver* m_constraintSolver; + btAlignedObjectArray m_bulletParticles; + + + void update(float deltaTime); + + float* getArray(ParticleArray array); + + int getNumParticles() const; + + unsigned int getCurrentReadBuffer() const; + unsigned int getColorBuffer() const; + + void dumpGrid(); + void dumpParticles(uint start, uint count); + + void setIterations(int i) { m_solverIterations = i; } + + void setDamping(float x) { m_params.globalDamping = x; } + void setGravity(float x) { m_params.gravity = make_float3(0.0f, x, 0.0f); } + + void setCollideSpring(float x) { m_params.spring = x; } + void setCollideDamping(float x) { m_params.damping = x; } + void setCollideShear(float x) { m_params.shear = x; } + void setCollideAttraction(float x) { m_params.attraction = x; } + + void setColliderPos(float4 x) { m_params.colliderPos = x; } + + float getParticleRadius() { return m_params.particleRadius; } + float4 getColliderPos() { return m_params.colliderPos; } + float getColliderRadius() { return m_params.colliderRadius; } + uint3 
getGridSize() { return m_params.gridSize; } + float3 getWorldOrigin() { return m_params.worldOrigin; } + float3 getCellSize() { return m_params.cellSize; } + + void addSphere(int index, float *pos, float *vel, int r, float spacing); + + SimulationMode getSimulationMode() const + { + return m_simulationMode; + } + + void setSimulationMode(SimulationMode mode) + { + m_simulationMode=mode; + } + +protected: // methods + ParticleSystem() + : m_simulationMode(SIMULATION_CUDA) + {} + + void initGrid(uint *size, float spacing, float jitter, uint numParticles); + + +protected: // data + bool m_bInitialized; + + + + // params + SimParams m_params; + uint m_maxParticlesPerCell; + + uint m_timer; + + uint m_solverIterations; + + SimulationMode m_simulationMode; +}; + +#endif // __BODYSYSTEMCUDA_H__ diff --git a/Extras/CUDA/particles.cpp b/Extras/CUDA/particles.cpp new file mode 100644 index 000000000..cd5ad40fd --- /dev/null +++ b/Extras/CUDA/particles.cpp @@ -0,0 +1,544 @@ +/* + * Copyright 1993-2007 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO USER: + * + * This source code is subject to NVIDIA ownership rights under U.S. and + * international Copyright laws. + * + * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE + * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR + * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE + * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE + * OR PERFORMANCE OF THIS SOURCE CODE. + * + * U.S. Government End Users. This source code is a "commercial item" as + * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of + * "commercial computer software" and "commercial computer software + * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) + * and is provided to the U.S. Government only as a commercial end item. + * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through + * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the + * source code with only those rights set forth herein. 
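A quick note on the launch-sizing helpers iDivUp()/computeGridSize() used by the host wrappers above: they round the body count up to a whole number of blocks, so the last block can contain idle threads when numBodies is not a multiple of the block size. A minimal stand-alone sketch of the same arithmetic (iDivUpRef and the values below are illustrative, not part of the demo):

#include <algorithm>
#include <cstdio>

// Same result as (a % b != 0) ? (a / b + 1) : (a / b)
static int iDivUpRef(int a, int b) { return (a + b - 1) / b; }

int main()
{
    int numBodies = 1024, blockSize = 256;
    int numThreads = std::min(blockSize, numBodies);    // 256
    int numBlocks  = iDivUpRef(numBodies, numThreads);  // 4
    std::printf("launch: %d blocks x %d threads\n", numBlocks, numThreads);
    return 0;
}

The kernels launched this way read pos[index] without a bounds check, which is fine for the default particle count of 1024 since it is an exact multiple of the 256-thread block size.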
+ */ + +/* + Particle system example with collisions using uniform grid +*/ + +#include +#include +#include +#include +//#include + +#include + +#if defined(__APPLE__) || defined(MACOSX) +#include +#else +#include +#endif + +#include "particleSystem.h" +#include "render_particles.h" +#include "paramgl.h" + +// view params +int ox, oy; +int buttonState = 0; +float camera_trans[] = {0, 0, -3}; +float camera_rot[] = {0, 0, 0}; +float camera_trans_lag[] = {0, 0, -3}; +float camera_rot_lag[] = {0, 0, 0}; +const float inertia = 0.1; +ParticleRenderer::DisplayMode displayMode = ParticleRenderer::PARTICLE_SPHERES; + + + +int mode = 0; +bool displayEnabled = true; +bool bPause = false; +bool displaySliders = false; +bool wireframe = false; + +enum { M_VIEW = 0, M_MOVE }; + +uint numParticles = 0; +uint3 gridSize; +int numIterations = 0; // run until exit + +// simulation parameters +float timestep = 0.5f; +float damping = 1.0f; +float gravity = 0.0003f; +int iterations = 1; +int ballr = 10; + +float collideSpring = 0.5f;; +float collideDamping = 0.02f;; +float collideShear = 0.1f; +float collideAttraction = 0.0f; + +ParticleSystem *psystem = 0; + +// fps + +ParticleRenderer *renderer = 0; + +float modelView[16]; + +ParamListGL *params; + +extern "C" void cudaInit(int argc, char **argv); + +void init(int numParticles, uint3 gridSize) +{ + psystem = new ParticleSystem(numParticles, gridSize); + psystem->reset(ParticleSystem::CONFIG_GRID); + + renderer = new ParticleRenderer; + renderer->setParticleRadius(psystem->getParticleRadius()); + renderer->setColorBuffer(psystem->getColorBuffer()); + + +} + +void initGL() +{ + glewInit(); + if (!glewIsSupported("GL_VERSION_2_0 GL_VERSION_1_5 GL_ARB_multitexture GL_ARB_vertex_buffer_object")) { + fprintf(stderr, "Required OpenGL extensions missing."); + exit(-1); + } + + glEnable(GL_DEPTH_TEST); + glClearColor(0.25, 0.25, 0.25, 1.0); + + glutReportErrors(); +} + + +void display() +{ + + // update the simulation + if (!bPause) + { + psystem->setIterations(iterations); + psystem->setDamping(damping); + psystem->setGravity(-gravity); + psystem->setCollideSpring(collideSpring); + psystem->setCollideDamping(collideDamping); + psystem->setCollideShear(collideShear); + psystem->setCollideAttraction(collideAttraction); + + psystem->update(timestep); + renderer->setVertexBuffer(psystem->getCurrentReadBuffer(), psystem->getNumParticles()); + float* posArray = psystem->getArray(ParticleSystem::POSITION); + renderer->setPositions(posArray,psystem->getNumParticles()); + } + + // render + glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); + + // view transform + glMatrixMode(GL_MODELVIEW); + glLoadIdentity(); + for (int c = 0; c < 3; ++c) + { + camera_trans_lag[c] += (camera_trans[c] - camera_trans_lag[c]) * inertia; + camera_rot_lag[c] += (camera_rot[c] - camera_rot_lag[c]) * inertia; + } + glTranslatef(camera_trans_lag[0], camera_trans_lag[1], camera_trans_lag[2]); + glRotatef(camera_rot_lag[0], 1.0, 0.0, 0.0); + glRotatef(camera_rot_lag[1], 0.0, 1.0, 0.0); + + glGetFloatv(GL_MODELVIEW_MATRIX, modelView); + + // cube + glColor3f(1.0, 1.0, 1.0); + glutWireCube(2.0); + + // collider + glPushMatrix(); + float4 p = psystem->getColliderPos(); + glTranslatef(p.x, p.y, p.z); + glColor3f(1.0, 0.0, 0.0); + glutSolidSphere(psystem->getColliderRadius(), 20, 10); + glPopMatrix(); + + if (displayEnabled) + { + renderer->display(displayMode); + } + + if (displaySliders) { + glDisable(GL_DEPTH_TEST); + glBlendFunc(GL_ONE_MINUS_DST_COLOR, GL_ZERO); // invert color + glEnable(GL_BLEND); + 
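The camera smoothing in display() is a simple first-order lag: every frame the lagged value moves a fixed fraction (inertia = 0.1) of the remaining distance toward the target, so it converges geometrically. A tiny stand-alone sketch of the same filter (variable names here are illustrative):

#include <cstdio>

int main()
{
    const float inertia = 0.1f;
    float target = -3.0f, lagged = 0.0f;
    for (int frame = 0; frame < 5; ++frame)
    {
        lagged += (target - lagged) * inertia;   // same update as camera_trans_lag[] / camera_rot_lag[]
        std::printf("frame %d: %.3f\n", frame, lagged);
    }
    return 0;   // after n frames the remaining error is (1 - inertia)^n of the original offset
}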
params->Render(0, 0); + glDisable(GL_BLEND); + glEnable(GL_DEPTH_TEST); + } + + psystem->debugDraw(); + + glutSwapBuffers(); + + { + char fps[256]; + //float ifps = 1.f / (cutGetAverageTimerValue(timer) / 1000.f); + switch (psystem->getSimulationMode()) + { + case ParticleSystem::SIMULATION_CUDA: + { + sprintf(fps, "CUDA particles (%d particles)", numParticles); + break; + } + case ParticleSystem::SIMULATION_BULLET_CPU: + { + sprintf(fps, "Bullet btCudaBroadphase (%d btSphereShapes)", numParticles); + break; + } + default: + { + sprintf(fps, "Unknown simulation mode"); + } + } + glutSetWindowTitle(fps); + } + + glutReportErrors(); +} + +void reshape(int w, int h) +{ + glMatrixMode(GL_PROJECTION); + glLoadIdentity(); + gluPerspective(60.0, (float) w / (float) h, 0.1, 10.0); + + glMatrixMode(GL_MODELVIEW); + glViewport(0, 0, w, h); + + renderer->setWindowSize(w, h); + renderer->setFOV(60.0); +} + +void mouse(int button, int state, int x, int y) +{ + int mods; + + if (state == GLUT_DOWN) + buttonState |= 1<Mouse(x, y, button, state)) { + glutPostRedisplay(); + return; + } + } + + glutPostRedisplay(); +} + +// transfrom vector by matrix +void xform(float *v, float *r, GLfloat *m) +{ + r[0] = v[0]*m[0] + v[1]*m[4] + v[2]*m[8] + m[12]; + r[1] = v[0]*m[1] + v[1]*m[5] + v[2]*m[9] + m[13]; + r[2] = v[0]*m[2] + v[1]*m[6] + v[2]*m[10] + m[14]; +} + +// transform vector by transpose of matrix +void ixform(float *v, float *r, GLfloat *m) +{ + r[0] = v[0]*m[0] + v[1]*m[1] + v[2]*m[2]; + r[1] = v[0]*m[4] + v[1]*m[5] + v[2]*m[6]; + r[2] = v[0]*m[8] + v[1]*m[9] + v[2]*m[10]; +} + +void ixformPoint(float *v, float *r, GLfloat *m) +{ + float x[4]; + x[0] = v[0] - m[12]; + x[1] = v[1] - m[13]; + x[2] = v[2] - m[14]; + x[3] = 1.0f; + ixform(x, r, m); +} + +void motion(int x, int y) +{ + float dx, dy; + dx = x - ox; + dy = y - oy; + + if (displaySliders) { + if (params->Motion(x, y)) { + ox = x; oy = y; + glutPostRedisplay(); + return; + } + } + + switch(mode) + { + case M_VIEW: + if (buttonState == 3) { + // left+middle = zoom + camera_trans[2] += (dy / 100.0) * 0.5 * fabs(camera_trans[2]); + } + else if (buttonState & 2) { + // middle = translate + camera_trans[0] += dx / 100.0; + camera_trans[1] -= dy / 100.0; + } + else if (buttonState & 1) { + // left = rotate + camera_rot[0] += dy / 5.0; + camera_rot[1] += dx / 5.0; + } + break; + + case M_MOVE: + { + float translateSpeed = 0.003f; + float4 p = psystem->getColliderPos(); + if (buttonState==1) { + float v[3], r[3]; + v[0] = dx*translateSpeed; + v[1] = -dy*translateSpeed; + v[2] = 0.0f; + ixform(v, r, modelView); + p.x += r[0]; + p.y += r[1]; + p.z += r[2]; + } else if (buttonState==2) { + float v[3], r[3]; + v[0] = 0.0f; + v[1] = 0.0f; + v[2] = dy*translateSpeed; + ixform(v, r, modelView); + p.x += r[0]; + p.y += r[1]; + p.z += r[2]; + } + psystem->setColliderPos(p); + } + break; + } + + ox = x; oy = y; + glutPostRedisplay(); +} + +inline float frand() +{ + return rand() / (float) RAND_MAX; +} + +// commented out to remove unused parameter warnings in Linux +void key(unsigned char key, int /*x*/, int /*y*/) +{ + switch (key) + { + case ' ': + bPause = !bPause; + break; + case 13: + psystem->update(timestep); + renderer->setVertexBuffer(psystem->getCurrentReadBuffer(), psystem->getNumParticles()); + break; + case '\033': + case 'q': + exit(0); + break; + case 'v': + mode = M_VIEW; + break; + case 'm': + mode = M_MOVE; + break; + case 's': + psystem->setSimulationMode((ParticleSystem::SimulationMode) ((psystem->getSimulationMode() + 1) % 
ParticleSystem::SIMULATION_NUM_MODES)); + break; + + case 'p': + displayMode = (ParticleRenderer::DisplayMode) ((displayMode + 1) % ParticleRenderer::PARTICLE_NUM_MODES); + break; + case 'd': + psystem->dumpGrid(); + break; + case 'u': + psystem->dumpParticles(0, 1); + break; + + case 'r': + displayEnabled = !displayEnabled; + break; + + case '1': + psystem->reset(ParticleSystem::CONFIG_GRID); + break; + case '2': + psystem->reset(ParticleSystem::CONFIG_RANDOM); + break; + case '3': + { + // inject a sphere of particles + float pr = psystem->getParticleRadius(); + float tr = pr+(pr*2.0f)*ballr; + float pos[4], vel[4]; + pos[0] = -1.0 + tr + frand()*(2.0f - tr*2.0f); + pos[1] = 1.0f - tr; + pos[2] = -1.0 + tr + frand()*(2.0f - tr*2.0f); + pos[3] = 0.0f; + vel[0] = vel[1] = vel[2] = vel[3] = 0.0f; + psystem->addSphere(0, pos, vel, ballr, pr*2.0f); + } + break; + case '4': + { + // shoot ball from camera + float pr = psystem->getParticleRadius(); + float vel[4], velw[4], pos[4], posw[4]; + vel[0] = 0.0f; + vel[1] = 0.0f; + vel[2] = -0.05f; + vel[3] = 0.0f; + ixform(vel, velw, modelView); + + pos[0] = 0.0f; + pos[1] = 0.0f; + pos[2] = -2.5f; + pos[3] = 1.0; + ixformPoint(pos, posw, modelView); + posw[3] = 0.0f; + + psystem->addSphere(0, posw, velw, ballr, pr*2.0f); + } + break; + + case 'w': + wireframe = !wireframe; + break; + + case 'h': + displaySliders = !displaySliders; + break; + } + + glutPostRedisplay(); +} + +void special(int k, int x, int y) +{ + if (displaySliders) { + params->Special(k, x, y); + } +} + +void idle(void) +{ + glutPostRedisplay(); +} + +void initParams() +{ + // create a new parameter list + params = new ParamListGL("misc"); + params->AddParam(new Param("time step", timestep, 0.0, 1.0, 0.01, ×tep)); + params->AddParam(new Param("iterations", iterations, 0, 10, 1, &iterations)); + params->AddParam(new Param("damping", damping, 0.0, 1.0, 0.001, &damping)); + params->AddParam(new Param("gravity", gravity, 0.0, 0.001, 0.0001, &gravity)); + params->AddParam(new Param("ball r", ballr, 1, 20, 1, &ballr)); + + params->AddParam(new Param("collide spring", collideSpring, 0.0, 1.0, 0.001, &collideSpring)); + params->AddParam(new Param("collide damping", collideDamping, 0.0, 0.1, 0.001, &collideDamping)); + params->AddParam(new Param("collide shear", collideShear, 0.0, 0.1, 0.001, &collideShear)); + params->AddParam(new Param("collide attract", collideAttraction, 0.0, 0.1, 0.001, &collideAttraction)); +} + +void mainMenu(int i) +{ + key((unsigned char) i, 0, 0); +} + +void initMenus() +{ + glutCreateMenu(mainMenu); + glutAddMenuEntry("Reset block [1]", '1'); + glutAddMenuEntry("Reset random [2]", '2'); + glutAddMenuEntry("Add sphere [3]", '3'); + glutAddMenuEntry("View mode [v]", 'v'); + glutAddMenuEntry("Move cursor mode [m]", 'm'); + glutAddMenuEntry("Toggle point rendering [p]", 'p'); + glutAddMenuEntry("Toggle Bullet simulation[s]", 's'); + glutAddMenuEntry("Toggle animation [ ]", ' '); + glutAddMenuEntry("Step animation [ret]", 13); + glutAddMenuEntry("Toggle sliders [h]", 'h'); + glutAddMenuEntry("Quit (esc)", '\033'); + glutAttachMenu(GLUT_RIGHT_BUTTON); +} + +//////////////////////////////////////////////////////////////////////////////// +// Program main +//////////////////////////////////////////////////////////////////////////////// +int +main(int argc, char** argv) +{ + numParticles =1024;//1024;//64;//16380;//32768; + uint gridDim = 64; + numIterations = 0; + + gridSize.x = gridSize.y = gridSize.z = gridDim; + printf("grid: %d x %d x %d = %d cells\n", gridSize.x, 
gridSize.y, gridSize.z, gridSize.x*gridSize.y*gridSize.z); + + bool benchmark = false; + + cudaInit(argc, argv); + + glutInit(&argc, argv); + glutInitDisplayMode(GLUT_RGB | GLUT_DEPTH | GLUT_DOUBLE); + glutInitWindowSize(640, 480); + glutCreateWindow("CUDA particles"); + + initGL(); + init(numParticles, gridSize); + initParams(); + initMenus(); + + + glutDisplayFunc(display); + glutReshapeFunc(reshape); + glutMouseFunc(mouse); + glutMotionFunc(motion); + glutKeyboardFunc(key); + glutSpecialFunc(special); + glutIdleFunc(idle); + + glutMainLoop(); + + + if (psystem) + delete psystem; + + return 0; +} diff --git a/Extras/CUDA/particles_kernel.cu b/Extras/CUDA/particles_kernel.cu new file mode 100644 index 000000000..9817cb384 --- /dev/null +++ b/Extras/CUDA/particles_kernel.cu @@ -0,0 +1,381 @@ +/* + * Copyright 1993-2006 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO USER: + * + * This source code is subject to NVIDIA ownership rights under U.S. and + * international Copyright laws. + * + * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE + * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR + * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE + * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE + * OR PERFORMANCE OF THIS SOURCE CODE. + * + * U.S. Government End Users. This source code is a "commercial item" as + * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of + * "commercial computer software" and "commercial computer software + * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) + * and is provided to the U.S. Government only as a commercial end item. + * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through + * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the + * source code with only those rights set forth herein. + */ + +/* + * Device code. 
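A note on ixform()/ixformPoint() used by the mouse handler and the "shoot ball from camera" key: for the rotation part of the modelview matrix the transpose equals the inverse, so multiplying a view-space direction by the transposed 3x3 block maps it back into world space. A small check with a 90-degree rotation about z, using the same column-major indexing as the functions above (values are illustrative; translation is omitted):

#include <cstdio>

int main()
{
    // Column-major 4x4 modelview holding a 90-degree rotation about z (no translation).
    float m[16] = { 0, 1, 0, 0,   -1, 0, 0, 0,   0, 0, 1, 0,   0, 0, 0, 1 };
    float v[3] = { 1, 0, 0 }, r[3], back[3];

    // rotation part of xform(): r = R * v
    r[0] = v[0]*m[0] + v[1]*m[4] + v[2]*m[8];
    r[1] = v[0]*m[1] + v[1]*m[5] + v[2]*m[9];
    r[2] = v[0]*m[2] + v[1]*m[6] + v[2]*m[10];

    // ixform(): back = R^T * r, which recovers v for a pure rotation
    back[0] = r[0]*m[0] + r[1]*m[1] + r[2]*m[2];
    back[1] = r[0]*m[4] + r[1]*m[5] + r[2]*m[6];
    back[2] = r[0]*m[8] + r[1]*m[9] + r[2]*m[10];

    std::printf("v=(%g,%g,%g)  R*v=(%g,%g,%g)  R^T*(R*v)=(%g,%g,%g)\n",
                v[0], v[1], v[2], r[0], r[1], r[2], back[0], back[1], back[2]);
    return 0;
}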
+ */ + +#ifndef _PARTICLES_KERNEL_H_ +#define _PARTICLES_KERNEL_H_ + +#include +#include +#include "cutil_math.h" +#include "math_constants.h" +#include "particles_kernel.cuh" + +#if USE_TEX +// textures for particle position and velocity +texture oldPosTex; +texture oldVelTex; + +texture particleHashTex; +texture cellStartTex; + +texture gridCountersTex; +texture gridCellsTex; +#endif + +__constant__ SimParams params; + +// integrate particle attributes +__global__ void +integrate(float4* newPos, float4* newVel, + float4* oldPos, float4* oldVel, + float deltaTime) +{ + int index = __mul24(blockIdx.x,blockDim.x) + threadIdx.x; + + float4 pos4 = oldPos[index]; + float4 vel4 = oldVel[index]; + float3 pos = make_float3(pos4); + float3 vel = make_float3(vel4); + + vel += params.gravity * deltaTime; + vel *= params.globalDamping; + + // new position = old position + velocity * deltaTime + pos += vel * deltaTime; + + // bounce off cube sides + if (pos.x > 1.0f - params.particleRadius) { pos.x = 1.0f - params.particleRadius; vel.x *= params.boundaryDamping; } + if (pos.x < -1.0f + params.particleRadius) { pos.x = -1.0f + params.particleRadius; vel.x *= params.boundaryDamping;} + if (pos.y > 1.0f - params.particleRadius) { pos.y = 1.0f - params.particleRadius; vel.y *= params.boundaryDamping; } + if (pos.y < -1.0f + params.particleRadius) { pos.y = -1.0f + params.particleRadius; vel.y *= params.boundaryDamping;} + if (pos.z > 1.0f - params.particleRadius) { pos.z = 1.0f - params.particleRadius; vel.z *= params.boundaryDamping; } + if (pos.z < -1.0f + params.particleRadius) { pos.z = -1.0f + params.particleRadius; vel.z *= params.boundaryDamping;} + + // store new position and velocity + newPos[index] = make_float4(pos, pos4.w); + newVel[index] = make_float4(vel, vel4.w); +} + +// calculate position in uniform grid +__device__ int3 calcGridPos(float4 p) +{ + int3 gridPos; + gridPos.x = floor((p.x - params.worldOrigin.x) / params.cellSize.x); + gridPos.y = floor((p.y - params.worldOrigin.y) / params.cellSize.y); + gridPos.z = floor((p.z - params.worldOrigin.z) / params.cellSize.z); + return gridPos; +} + +// calculate address in grid from position (clamping to edges) +__device__ uint calcGridHash(int3 gridPos) +{ + gridPos.x = max(0, min(gridPos.x, params.gridSize.x-1)); + gridPos.y = max(0, min(gridPos.y, params.gridSize.y-1)); + gridPos.z = max(0, min(gridPos.z, params.gridSize.z-1)); + return __mul24(__mul24(gridPos.z, params.gridSize.y), params.gridSize.x) + __mul24(gridPos.y, params.gridSize.x) + gridPos.x; +} + +// add particle to cell using atomics +__device__ void addParticleToCell(int3 gridPos, + uint index, + uint* gridCounters, + uint* gridCells) +{ + // calculate grid hash + uint gridHash = calcGridHash(gridPos); + + // increment cell counter using atomics +#if defined CUDA_NO_SM_11_ATOMIC_INTRINSICS + int counter = 0; +#else + int counter = atomicAdd(&gridCounters[gridHash], 1); // returns previous value + counter = min(counter, params.maxParticlesPerCell-1); +#endif + + // write particle index into this cell (very uncoalesced!) 
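The grid hash used by the kernels below is a clamped conversion from world position to integer cell coordinates, followed by a row-major linear index (z, then y, then x). A CPU reference of the same mapping, assuming the demo's 64-cells-per-axis grid over the [-1,1] cube (so a cell size of 2/64); the names are illustrative:

#include <algorithm>
#include <cmath>
#include <cstdio>

int main()
{
    const int   grid[3]   = { 64, 64, 64 };
    const float origin[3] = { -1.0f, -1.0f, -1.0f };
    const float cell[3]   = { 2.0f / 64, 2.0f / 64, 2.0f / 64 };

    float p[3] = { 0.1f, -0.2f, 0.9f };   // some particle position inside the cube
    int gp[3];
    for (int a = 0; a < 3; ++a)
    {
        gp[a] = (int)std::floor((p[a] - origin[a]) / cell[a]);
        gp[a] = std::max(0, std::min(gp[a], grid[a] - 1));   // clamp to the grid, as calcGridHash does
    }
    unsigned int hash = (gp[2] * grid[1] + gp[1]) * grid[0] + gp[0];   // z-major, then y, then x
    std::printf("cell (%d,%d,%d) -> hash %u\n", gp[0], gp[1], gp[2], hash);   // cell (35,25,60) -> hash 247395
    return 0;
}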
+ gridCells[gridHash*params.maxParticlesPerCell + counter] = index; +} + + +// update uniform grid +__global__ void +updateGridD(float4* pos, + uint* gridCounters, + uint* gridCells) +{ + int index = __mul24(blockIdx.x,blockDim.x) + threadIdx.x; + float4 p = pos[index]; + + // get address in grid + int3 gridPos = calcGridPos(p); + + addParticleToCell(gridPos, index, gridCounters, gridCells); +} + +// calculate grid hash value for each particle +__global__ void +calcHashD(float4* pos, + uint2* particleHash) +{ + int index = __mul24(blockIdx.x, blockDim.x) + threadIdx.x; + float4 p = pos[index]; + + // get address in grid + int3 gridPos = calcGridPos(p); + uint gridHash = calcGridHash(gridPos); + + // store grid hash and particle index + particleHash[index] = make_uint2(gridHash, index); +} + +// rearrange particle data into sorted order, and find the start of each cell in the +// sorted hash array +__global__ void +reorderDataAndFindCellStartD(uint2* particleHash, // particle id sorted by hash + float4* oldPos, + float4* oldVel, + float4* sortedPos, + float4* sortedVel, + uint* cellStart) +{ + int index = __mul24(blockIdx.x,blockDim.x) + threadIdx.x; + + uint2 sortedData = particleHash[index]; + + // Load hash data into shared memory so that we can look + // at neighboring particle's hash value without loading + // two hash values per thread + __shared__ uint sharedHash[257]; + sharedHash[threadIdx.x+1] = sortedData.x; + if (index > 0 && threadIdx.x == 0) + { + // first thread in block must load neighbor particle hash + volatile uint2 prevData = particleHash[index-1]; + sharedHash[0] = prevData.x; + } + + __syncthreads(); + if (index == 0 || sortedData.x != sharedHash[threadIdx.x]) + { + cellStart[sortedData.x] = index; + } + + // Now use the sorted index to reorder the pos and vel data + float4 pos = FETCH(oldPos, sortedData.y); // macro does either global read or texture fetch + float4 vel = FETCH(oldVel, sortedData.y); // see particles_kernel.cuh + + sortedPos[index] = pos; + sortedVel[index] = vel; + +} + +// collide two spheres using DEM method +__device__ float3 collideSpheres(float4 posA, float4 posB, + float4 velA, float4 velB, + float radiusA, float radiusB, + float attraction) +{ + // calculate relative position + float3 relPos; + relPos.x = posB.x - posA.x; + relPos.y = posB.y - posA.y; + relPos.z = posB.z - posA.z; + + float dist = length(relPos); + float collideDist = radiusA + radiusB; + + float3 force = make_float3(0.0f); + if (dist < collideDist) { + float3 norm = relPos / dist; + + // relative velocity + float3 relVel; + relVel.x = velB.x - velA.x; + relVel.y = velB.y - velA.y; + relVel.z = velB.z - velA.z; + + // relative tangential velocity + float3 tanVel = relVel - (dot(relVel, norm) * norm); + + // spring force + force = -params.spring*(collideDist - dist) * norm; + // dashpot (damping) force + force += params.damping*relVel; + // tangential shear force + force += params.shear*tanVel; + // attraction + force += attraction*relPos; + } + + return force; +} + + +// collide particle with all particles in a given cell +// version using grid built with atomics +__device__ +float3 collideCell(int3 gridPos, + uint index, + float4 pos, + float4 vel, + float4* oldPos, + float4* oldVel, + uint* gridCounters, + uint* gridCells) +{ + float3 force = make_float3(0.0f); + + if ((gridPos.x < 0) || (gridPos.x > params.gridSize.x-1) || + (gridPos.y < 0) || (gridPos.y > params.gridSize.y-1) || + (gridPos.z < 0) || (gridPos.z > params.gridSize.z-1)) { + return force; + } + + uint gridHash 
= calcGridHash(gridPos); + + // iterate over particles in this cell + uint particlesInCell = FETCH(gridCounters, gridHash); + particlesInCell = min(particlesInCell, params.maxParticlesPerCell-1); + + for(uint i=0; i params.gridSize.x-1) || + (gridPos.y < 0) || (gridPos.y > params.gridSize.y-1) || + (gridPos.z < 0) || (gridPos.z > params.gridSize.z-1)) { + return force; + } + + uint gridHash = calcGridHash(gridPos); + + // get start of bucket for this cell + uint bucketStart = FETCH(cellStart, gridHash); + if (bucketStart == 0xffffffff) + return force; // cell empty + + // iterate over particles in this cell + for(uint i=0; i>>(pData0, elements, elements_rounded_to_3072, shift); + // Prefix sum in radix groups, and then between groups throughout a block + RadixPrefixSum<<>>(); + // Sum the block offsets and then shuffle data into bins + RadixAddOffsetsAndShuffle<<>>(pData0, pData1, elements, elements_rounded_to_3072, shift); + + // Exchange data pointers + KeyValuePair* pTemp = pData0; + pData0 = pData1; + pData1 = pTemp; + } +} + +} diff --git a/Extras/CUDA/radixsort.cuh b/Extras/CUDA/radixsort.cuh new file mode 100644 index 000000000..f0e038d35 --- /dev/null +++ b/Extras/CUDA/radixsort.cuh @@ -0,0 +1,63 @@ +/* + * Copyright 1993-2006 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO USER: + * + * This source code is subject to NVIDIA ownership rights under U.S. and + * international Copyright laws. + * + * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE + * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR + * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE + * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE + * OR PERFORMANCE OF THIS SOURCE CODE. + * + * U.S. Government End Users. This source code is a "commercial item" as + * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of + * "commercial computer software" and "commercial computer software + * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) + * and is provided to the U.S. Government only as a commercial end item. + * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through + * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the + * source code with only those rights set forth herein. + */ + +/* Radixsort project which demonstrates the use of CUDA in a multi phase + * sorting computation. + * Type definitions. 
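The DEM contact model in collideSpheres() combines four terms: a linear spring on the overlap, a dashpot on the relative velocity, a shear force on the tangential velocity, and an optional attraction along the separation vector. A scalar 1-D sketch of the same force law, using the demo's default spring and damping sliders; the radius and velocities are illustrative, and the tangential shear term vanishes in 1-D:

#include <cstdio>

int main()
{
    const float spring = 0.5f, damping = 0.02f, attraction = 0.0f;
    const float radius = 0.016f;        // illustrative particle radius

    float relPos = 1.5f * radius;       // posB - posA along x, spheres overlapping
    float relVel = -0.01f;              // closing velocity
    float dist = relPos > 0 ? relPos : -relPos;
    float collideDist = 2.0f * radius;

    float force = 0.0f;                 // force applied to particle A
    if (dist < collideDist)
    {
        float norm = relPos / dist;                        // +1 or -1 in 1-D
        force  = -spring * (collideDist - dist) * norm;    // spring pushes A away from B
        force +=  damping * relVel;                        // dashpot damps the approach
        force +=  attraction * relPos;                     // attraction (0 by default)
    }
    std::printf("contact force on A: %f\n", force);        // negative: directed away from B
    return 0;
}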
+ */ + +#ifndef _RADIXSORT_H_ +#define _RADIXSORT_H_ + +#include + +#define SYNCIT __syncthreads() + +// Use 16 bit keys/values +#define SIXTEEN 0 + +typedef unsigned int uint; +typedef unsigned short ushort; + +#if SIXTEEN +typedef struct __align__(4) { + ushort key; + ushort value; +#else +typedef struct __align__(8) { + uint key; + uint value; +#endif +} KeyValuePair; + +extern "C" { + void RadixSort(KeyValuePair *pData0, KeyValuePair *pData1, uint elements, uint bits); +} + +#endif // #ifndef _RADIXSORT_H_ diff --git a/Extras/CUDA/radixsort_kernel.cu b/Extras/CUDA/radixsort_kernel.cu new file mode 100644 index 000000000..e12dca2a0 --- /dev/null +++ b/Extras/CUDA/radixsort_kernel.cu @@ -0,0 +1,577 @@ +/* + * Copyright 1993-2006 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO USER: + * + * This source code is subject to NVIDIA ownership rights under U.S. and + * international Copyright laws. + * + * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE + * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR + * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE + * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE + * OR PERFORMANCE OF THIS SOURCE CODE. + * + * U.S. Government End Users. This source code is a "commercial item" as + * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of + * "commercial computer software" and "commercial computer software + * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) + * and is provided to the U.S. Government only as a commercial end item. + * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through + * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the + * source code with only those rights set forth herein. + */ + +/* Radixsort project with key/value and arbitrary datset size support + * which demonstrates the use of CUDA in a multi phase sorting + * computation. + * Device code. 
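RadixSort() above is a least-significant-digit radix sort: for each 8-bit digit it counts keys per radix (RadixSum), prefix-sums the counts into output offsets (RadixPrefixSum), scatters key/value pairs to those offsets (RadixAddOffsetsAndShuffle), and swaps the ping-pong buffers. In the broadphase the key is the cell hash and the value the particle index, so the stable sort groups particles by cell. A compact CPU reference of the same scheme (the names below are illustrative, not the kernels):

#include <cstdio>
#include <vector>

struct KV { unsigned int key, value; };

static void radixSortRef(std::vector<KV>& a, unsigned int bits)
{
    std::vector<KV> tmp(a.size());
    for (unsigned int shift = 0; shift < bits; shift += 8)             // one pass per 8-bit digit
    {
        unsigned int count[256] = {0};
        for (const KV& kv : a) ++count[(kv.key >> shift) & 0xff];      // 1) histogram   (RadixSum)
        unsigned int offset[256], sum = 0;
        for (int r = 0; r < 256; ++r) { offset[r] = sum; sum += count[r]; }  // 2) exclusive scan (RadixPrefixSum)
        for (const KV& kv : a) tmp[offset[(kv.key >> shift) & 0xff]++] = kv; // 3) stable scatter (AddOffsetsAndShuffle)
        a.swap(tmp);                                                   // 4) ping-pong buffers
    }
}

int main()
{
    std::vector<KV> v = { {300, 0}, {7, 1}, {300, 2}, {2, 3} };
    radixSortRef(v, 32);
    for (const KV& kv : v) std::printf("%u:%u  ", kv.key, kv.value);   // 2:3  7:1  300:0  300:2
    std::printf("\n");
    return 0;
}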
+ */ + +#ifndef _RADIXSORT_KERNEL_H_ +#define _RADIXSORT_KERNEL_H_ + +#include +#include "radixsort.cuh" + +#define SYNCIT __syncthreads() + +static const int NUM_SMS = 16; +static const int NUM_THREADS_PER_SM = 192; +static const int NUM_THREADS_PER_BLOCK = 64; +//static const int NUM_THREADS = NUM_THREADS_PER_SM * NUM_SMS; +static const int NUM_BLOCKS = (NUM_THREADS_PER_SM / NUM_THREADS_PER_BLOCK) * NUM_SMS; +static const int RADIX = 8; // Number of bits per radix sort pass +static const int RADICES = 1 << RADIX; // Number of radices +static const int RADIXMASK = RADICES - 1; // Mask for each radix sort pass +#if SIXTEEN +static const int RADIXBITS = 16; // Number of bits to sort over +#else +static const int RADIXBITS = 32; // Number of bits to sort over +#endif +static const int RADIXTHREADS = 16; // Number of threads sharing each radix counter +static const int RADIXGROUPS = NUM_THREADS_PER_BLOCK / RADIXTHREADS; // Number of radix groups per CTA +static const int TOTALRADIXGROUPS = NUM_BLOCKS * RADIXGROUPS; // Number of radix groups for each radix +static const int SORTRADIXGROUPS = TOTALRADIXGROUPS * RADICES; // Total radix count +static const int GRFELEMENTS = (NUM_THREADS_PER_BLOCK / RADIXTHREADS) * RADICES; +static const int GRFSIZE = GRFELEMENTS * sizeof(uint); + +// Prefix sum variables +static const int PREFIX_NUM_THREADS_PER_SM = NUM_THREADS_PER_SM; +static const int PREFIX_NUM_THREADS_PER_BLOCK = PREFIX_NUM_THREADS_PER_SM; +static const int PREFIX_NUM_BLOCKS = (PREFIX_NUM_THREADS_PER_SM / PREFIX_NUM_THREADS_PER_BLOCK) * NUM_SMS; +static const int PREFIX_BLOCKSIZE = SORTRADIXGROUPS / PREFIX_NUM_BLOCKS; +static const int PREFIX_GRFELEMENTS = PREFIX_BLOCKSIZE + 2 * PREFIX_NUM_THREADS_PER_BLOCK; +static const int PREFIX_GRFSIZE = PREFIX_GRFELEMENTS * sizeof(uint); + +// Shuffle variables +static const int SHUFFLE_GRFOFFSET = RADIXGROUPS * RADICES; +static const int SHUFFLE_GRFELEMENTS = SHUFFLE_GRFOFFSET + PREFIX_NUM_BLOCKS; +static const int SHUFFLE_GRFSIZE = SHUFFLE_GRFELEMENTS * sizeof(uint); + + +#define SDATA( index) CUT_BANK_CHECKER(sdata, index) + +// Prefix sum data +uint gRadixSum[TOTALRADIXGROUPS * RADICES]; +__device__ uint dRadixSum[TOTALRADIXGROUPS * RADICES]; +uint gRadixBlockSum[PREFIX_NUM_BLOCKS]; +__device__ uint dRadixBlockSum[PREFIX_NUM_BLOCKS]; + +extern __shared__ uint sRadixSum[]; + + + +//////////////////////////////////////////////////////////////////////////////// +//! Perform a radix sum on the list to be sorted. Each SM holds a set of +//! radix counters for each group of RADIXGROUPS thread in the GRF. +//! +//! @param pData input data +//! @param elements total number of elements +//! @param elements_rounded_to_3072 total number of elements rounded up to the +//! nearest multiple of 3072 +//! @param shift the shift (0 to 24) that we are using to obtain the correct +//! byte +//////////////////////////////////////////////////////////////////////////////// +__global__ void RadixSum(KeyValuePair *pData, uint elements, uint elements_rounded_to_3072, uint shift) +{ + uint pos = threadIdx.x; + + // Zero radix counts + while (pos < GRFELEMENTS) + { + sRadixSum[pos] = 0; + pos += NUM_THREADS_PER_BLOCK; + } + + // Sum up data + // Source addresses computed so that each thread is reading from a block of + // consecutive addresses so there are no conflicts between threads + // They then loop over their combined region and the next batch works elsewhere. + // So threads 0 to 16 work on memory 0 to 320. 
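For reference, the constants above resolve (with RADIX = 8 and the 16-SM, 192-threads-per-SM parameterisation) to 48 blocks of 64 threads, 4 radix groups of 16 threads per block, and therefore 192 radix groups in total; 3072 is 192 groups times 16 threads, which is why the element count is rounded up to a multiple of 3072 before being split into per-group ranges. A small sketch that just recomputes them:

#include <cstdio>

int main()
{
    const int NUM_SMS = 16, NUM_THREADS_PER_SM = 192, NUM_THREADS_PER_BLOCK = 64;
    const int RADIX = 8, RADIXTHREADS = 16;

    int NUM_BLOCKS       = (NUM_THREADS_PER_SM / NUM_THREADS_PER_BLOCK) * NUM_SMS;  // 48
    int RADICES          = 1 << RADIX;                                              // 256
    int RADIXGROUPS      = NUM_THREADS_PER_BLOCK / RADIXTHREADS;                    // 4
    int TOTALRADIXGROUPS = NUM_BLOCKS * RADIXGROUPS;                                // 192
    int granularity      = TOTALRADIXGROUPS * RADIXTHREADS;                         // 3072

    std::printf("%d blocks, %d radices, %d radix groups, rounding granularity %d\n",
                NUM_BLOCKS, RADICES, TOTALRADIXGROUPS, granularity);
    return 0;
}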
+ // First reading 0,1,2,3...15 then 16,17,18,19...31 and so on + // optimising parallel access to shared memory by a thread accessing 16*threadID + // The next radix group runs from 320 to 640 and the same applies in that region + uint tmod = threadIdx.x % RADIXTHREADS; + uint tpos = threadIdx.x / RADIXTHREADS; + + // Take the rounded element list size so that all threads have a certain size dataset to work with + // and no zero size datasets confusing the issue + // By using a multiple of 3072 we ensure that all threads have elements + // to work with until the last phase, at which point we individually test + uint element_fraction = elements_rounded_to_3072 / TOTALRADIXGROUPS; + + // Generate range + // Note that it is possible for both pos and end to be past the end of the element set + // which will be caught later. + pos = (blockIdx.x * RADIXGROUPS + tpos) * element_fraction; + uint end = pos + element_fraction; + pos += tmod; + //printf("pos: %d\n", pos); + __syncthreads(); + + while (pos < end ) + { + uint key = 0; + + // Read first data element if we are in the set of elements + //if( pos < elements ) + //key = pData[pos].key; + KeyValuePair kvp; + // Read first data element, both items at once as the memory will want to coalesce like that anyway + if (pos < elements) + kvp = pData[pos]; + else + kvp.key = 0; + key = kvp.key; + + + // Calculate position of radix counter to increment + // There are RADICES radices in each pass (256) + // and hence this many counters for bin grouping + // Multiply by RADIXGROUPS (4) to spread through memory + // and into 4 radix groups + uint p = ((key >> shift) & RADIXMASK) * RADIXGROUPS; + + // Increment radix counters + // Each radix group has its own set of counters + // so we add the thread position [0-3], ie the group index. + // We slow down here and take at least 16 cycles to write to the summation boxes + // but other groups will only conflict with themselves and so can also be writing + // 16 cycles here at least avoids retries. + uint ppos = p + tpos; + + // If we are past the last element we don't want to do anything + // We do have to check each time, however, to ensure that all + // threads sync on each sync here. + if (tmod == 0 && pos < elements) + sRadixSum[ppos]++; + SYNCIT; + if (tmod == 1 && pos < elements) + sRadixSum[ppos]++; + SYNCIT; + if (tmod == 2 && pos < elements) + sRadixSum[ppos]++; + SYNCIT; + if (tmod == 3 && pos < elements) + sRadixSum[ppos]++; + SYNCIT; + if (tmod == 4 && pos < elements) + sRadixSum[ppos]++; + SYNCIT; + if (tmod == 5 && pos < elements) + sRadixSum[ppos]++; + SYNCIT; + if (tmod == 6 && pos < elements) + sRadixSum[ppos]++; + SYNCIT; + if (tmod == 7 && pos < elements) + sRadixSum[ppos]++; + SYNCIT; + if (tmod == 8 && pos < elements) + sRadixSum[ppos]++; + SYNCIT; + if (tmod == 9 && pos < elements) + sRadixSum[ppos]++; + SYNCIT; + if (tmod == 10 && pos < elements) + sRadixSum[ppos]++; + SYNCIT; + if (tmod == 11 && pos < elements) + sRadixSum[ppos]++; + SYNCIT; + if (tmod == 12 && pos < elements) + sRadixSum[ppos]++; + SYNCIT; + if (tmod == 13 && pos < elements) + sRadixSum[ppos]++; + SYNCIT; + if (tmod == 14 && pos < elements) + sRadixSum[ppos]++; + SYNCIT; + if (tmod == 15 && pos < elements) + sRadixSum[ppos]++; + SYNCIT; + + pos += RADIXTHREADS; + + } + + __syncthreads(); + + __syncthreads(); + + // Output radix sums into separate memory regions for each radix group + // So this memory then is layed out: + // 0...... 192..... 384 ................ 
192*256 + // ie all 256 bins for each radix group + // in there: + // 0.............192 + // 0 4 8 12... - block idx * 4 + // And in the block boxes we see the 4 radix groups for that block + // So 0-192 should contain bin 0 for each radix group, and so on + uint offset = blockIdx.x * RADIXGROUPS; + uint row = threadIdx.x / RADIXGROUPS; + uint column = threadIdx.x % RADIXGROUPS; + while (row < RADICES) + { + dRadixSum[offset + row * TOTALRADIXGROUPS + column] = sRadixSum[row * RADIXGROUPS + column]; + row += NUM_THREADS_PER_BLOCK / RADIXGROUPS; + } +} + +//////////////////////////////////////////////////////////////////////////////// +//! Performs first part of parallel prefix sum - individual sums of each radix +//! count. By the end of this we have prefix sums on a block level in dRadixSum +//! and totals for blocks in dRadixBlockSum. +//////////////////////////////////////////////////////////////////////////////// +__global__ void RadixPrefixSum() +{ + // Read radix groups in offset by one in the GRF so a zero can be inserted at the beginning + // and the final sum of all radix counts summed here is tacked onto the end for reading by + // the next stage + // Each block in this case is the full number of threads per SM (and hence the total number + // of radix groups), 192. We should then have the total set of offsets for an entire radix + // group by the end of this stage + // Device mem addressing + + uint brow = blockIdx.x * (RADICES / PREFIX_NUM_BLOCKS); + uint drow = threadIdx.x / TOTALRADIXGROUPS; // In default parameterisation this is always 0 + uint dcolumn = threadIdx.x % TOTALRADIXGROUPS; // And similarly this is always the same as threadIdx.x + uint dpos = (brow + drow) * TOTALRADIXGROUPS + dcolumn; + uint end = ((blockIdx.x + 1) * (RADICES / PREFIX_NUM_BLOCKS)) * TOTALRADIXGROUPS; + // Shared mem addressing + uint srow = threadIdx.x / (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK); + uint scolumn = threadIdx.x % (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK); + uint spos = srow * (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1) + scolumn; + + // Read (RADICES / PREFIX_NUM_BLOCKS) radix counts into the GRF alongside each other + while (dpos < end) + { + sRadixSum[spos] = dRadixSum[dpos]; + spos += (PREFIX_NUM_THREADS_PER_BLOCK / (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK)) * + (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1); + dpos += (TOTALRADIXGROUPS / PREFIX_NUM_THREADS_PER_BLOCK) * TOTALRADIXGROUPS; + } + __syncthreads(); + + // Perform preliminary sum on each thread's stretch of data + // Each thread having a block of 16, with spacers between 0...16 18...33 and so on + int pos = threadIdx.x * (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1); + end = pos + (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK); + uint sum = 0; + while (pos < end) + { + sum += sRadixSum[pos]; + sRadixSum[pos] = sum; + pos++; + } + __syncthreads(); + + + // Calculate internal offsets by performing a more traditional parallel + // prefix sum of the topmost member of each thread's work data. Right now, + // these are stored between the work data for each thread, allowing us to + // eliminate GRF conflicts as well as hold the offsets needed to complete the sum + // In other words we have: + // 0....15 16 17....32 33 34.... 
+ // Where this first stage updates the intermediate values (so 16=15, 33=32 etc) + int m = (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1); + pos = threadIdx.x * (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1) + + (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK); + sRadixSum[pos] = sRadixSum[pos - 1]; + __syncthreads(); + // This stage then performs a parallel prefix sum (ie use powers of 2 to propagate in log n stages) + // to update 17, 34 etc with the totals to that point (so 34 becomes [34] + [17]) and so on. + while (m < PREFIX_NUM_THREADS_PER_BLOCK * (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1)) + { + int p = pos - m; + uint t = ((p > 0) ? sRadixSum[p] : 0); + __syncthreads(); + sRadixSum[pos] += t; + __syncthreads(); + m *= 2; + } + __syncthreads(); + + + + // Add internal offsets to each thread's work data. + // So now we take 17 and add it to all values 18 to 33 so all offsets for that block + // are updated. + pos = threadIdx.x * (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1); + end = pos + (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK); + int p = pos - 1; + sum = ((p > 0) ? sRadixSum[p] : 0); + while (pos < end) + { + sRadixSum[pos] += sum; + pos++; + } + __syncthreads(); + + // Write summed data back out to global memory in the same way as we read it in + // We now have prefix sum values internal to groups + brow = blockIdx.x * (RADICES / PREFIX_NUM_BLOCKS); + drow = threadIdx.x / TOTALRADIXGROUPS; + dcolumn = threadIdx.x % TOTALRADIXGROUPS; + srow = threadIdx.x / (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK); + scolumn = threadIdx.x % (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK); + dpos = (brow + drow) * TOTALRADIXGROUPS + dcolumn + 1; + spos = srow * (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1) + scolumn; + end = ((blockIdx.x + 1) * RADICES / PREFIX_NUM_BLOCKS) * TOTALRADIXGROUPS; + while (dpos < end) + { + dRadixSum[dpos] = sRadixSum[spos]; + dpos += (TOTALRADIXGROUPS / PREFIX_NUM_THREADS_PER_BLOCK) * TOTALRADIXGROUPS; + spos += (PREFIX_NUM_THREADS_PER_BLOCK / (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK)) * + (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1); + } + + // Write last element to summation + // Storing block sums in a separate array + if (threadIdx.x == 0) { + dRadixBlockSum[blockIdx.x] = sRadixSum[PREFIX_NUM_THREADS_PER_BLOCK * (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1) - 1]; + dRadixSum[blockIdx.x * PREFIX_BLOCKSIZE] = 0; + } +} + + +//////////////////////////////////////////////////////////////////////////////// +//! Initially perform prefix sum of block totals to obtain final set of offsets. +//! Then make use of radix sums to perform a shuffling of the data into the +//! correct bins. +//! +//! @param pSrc input data +//! @param pDst output data +//! @param elements total number of elements +//! @param shift the shift (0 to 24) that we are using to obtain the correct +//! 
byte +//////////////////////////////////////////////////////////////////////////////// +__global__ void RadixAddOffsetsAndShuffle(KeyValuePair* pSrc, KeyValuePair* pDst, uint elements, uint elements_rounded_to_3072, int shift) +{ + // Read offsets from previous blocks + if (threadIdx.x == 0) + sRadixSum[SHUFFLE_GRFOFFSET] = 0; + + if (threadIdx.x < PREFIX_NUM_BLOCKS - 1) + sRadixSum[SHUFFLE_GRFOFFSET + threadIdx.x + 1] = dRadixBlockSum[threadIdx.x]; + __syncthreads(); + + // Parallel prefix sum over block sums + int pos = threadIdx.x; + int n = 1; + while (n < PREFIX_NUM_BLOCKS) + { + int ppos = pos - n; + uint t0 = ((pos < PREFIX_NUM_BLOCKS) && (ppos >= 0)) ? sRadixSum[SHUFFLE_GRFOFFSET + ppos] : 0; + __syncthreads(); + if (pos < PREFIX_NUM_BLOCKS) + sRadixSum[SHUFFLE_GRFOFFSET + pos] += t0; + __syncthreads(); + n *= 2; + } + + // Read radix count data and add appropriate block offset + // for each radix at the memory location for this thread + // (where the other threads in the block will be reading + // as well, hence the large stride). + // There is one counter box per radix group per radix + // per block (4*256*3) + // We use 64 threads to read the 4 radix groups set of radices + // for the block. + int row = threadIdx.x / RADIXGROUPS; + int column = threadIdx.x % RADIXGROUPS; + int spos = row * RADIXGROUPS + column; + int dpos = row * TOTALRADIXGROUPS + column + blockIdx.x * RADIXGROUPS; + while (spos < SHUFFLE_GRFOFFSET) + { + sRadixSum[spos] = dRadixSum[dpos] + sRadixSum[SHUFFLE_GRFOFFSET + dpos / (TOTALRADIXGROUPS * RADICES / PREFIX_NUM_BLOCKS)]; + spos += NUM_THREADS_PER_BLOCK; + dpos += (NUM_THREADS_PER_BLOCK / RADIXGROUPS) * TOTALRADIXGROUPS; + } + __syncthreads(); + + //int pos; + // Shuffle data + // Each of the subbins for a block should be filled via the counters, properly interleaved + // Then, as we now iterate over each data value, we increment the subbins (each thread in the + // radix group in turn to avoid miss writes due to conflicts) and set locations correctly. + uint element_fraction = elements_rounded_to_3072 / TOTALRADIXGROUPS; + int tmod = threadIdx.x % RADIXTHREADS; + int tpos = threadIdx.x / RADIXTHREADS; + + pos = (blockIdx.x * RADIXGROUPS + tpos) * element_fraction; + uint end = pos + element_fraction; //(blockIdx.x * RADIXGROUPS + tpos + 1) * element_fraction; + pos += tmod; + + __syncthreads(); + + while (pos < end ) + { + KeyValuePair kvp; +#if 1 // old load + // Read first data element, both items at once as the memory will want to coalesce like that anyway + if (pos < elements) + { + kvp = pSrc[pos]; + } + else + kvp.key = 0; + +#else // casting to float2 to get it to combine loads + int2 kvpf2; + + // Read first data element, both items at once as the memory will want to coalesce like that anyway + if (pos < elements) + { + // kvp = pSrc[pos]; + kvpf2 = ((int2*)pSrc)[pos]; + // printf("kvp: %f %f kvpf2: %f %f\n", kvp.key, kvp.value, kvpf2.x, kvpf2.y); + } + else + //kvp.key = 0; + kvpf2.x = 0; + + kvp.key = kvpf2.x; + kvp.value = kvpf2.y; +#endif + + uint index; + + // Calculate position of radix counter to increment + uint p = ((kvp.key >> shift) & RADIXMASK) * RADIXGROUPS; + + // Move data, keeping counts updated. + // Increment radix counters, relying on hexadecathread + // warp to prevent this code from stepping all over itself. 
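Both RadixPrefixSum above and the small block-sum scan at the top of this kernel compute the same thing: an exclusive prefix sum that turns per-radix counts into starting offsets in the output array (the kernels do it in log-steps across threads, but the result is identical). A serial reference of the operation, with illustrative counts:

#include <cstdio>

int main()
{
    unsigned int count[8]  = { 3, 0, 2, 5, 1, 0, 4, 1 };
    unsigned int offset[8];

    // Exclusive scan: offset[i] = sum of count[0..i-1]
    unsigned int sum = 0;
    for (int i = 0; i < 8; ++i) { offset[i] = sum; sum += count[i]; }

    for (int i = 0; i < 8; ++i) std::printf("%u ", offset[i]);   // 0 3 3 5 10 11 11 15
    std::printf("(total %u)\n", sum);                             // total 16
    return 0;
}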
+ uint ppos = p + tpos; + if (tmod == 0 && pos < elements) + { + index = sRadixSum[ppos]++; + pDst[index] = kvp; + } + SYNCIT; + if (tmod == 1 && pos < elements) + { + index = sRadixSum[ppos]++; + pDst[index] = kvp; + } + SYNCIT; + if (tmod == 2 && pos < elements) + { + index = sRadixSum[ppos]++; + pDst[index] = kvp; + } + SYNCIT; + if (tmod == 3 && pos < elements) + { + index = sRadixSum[ppos]++; + pDst[index] = kvp; + } + SYNCIT; + if (tmod == 4 && pos < elements) + { + index = sRadixSum[ppos]++; + pDst[index] = kvp; + } + SYNCIT; + if (tmod == 5 && pos < elements) + { + index = sRadixSum[ppos]++; + pDst[index] = kvp; + } + SYNCIT; + if (tmod == 6 && pos < elements) + { + index = sRadixSum[ppos]++; + pDst[index] = kvp; + } + SYNCIT; + if (tmod == 7 && pos < elements) + { + index = sRadixSum[ppos]++; + pDst[index] = kvp; + } + SYNCIT; + if (tmod == 8 && pos < elements) + { + index = sRadixSum[ppos]++; + pDst[index] = kvp; + } + SYNCIT; + if (tmod == 9 && pos < elements) + { + index = sRadixSum[ppos]++; + pDst[index] = kvp; + } + SYNCIT; + if (tmod == 10 && pos < elements) + { + index = sRadixSum[ppos]++; + pDst[index] = kvp; + } + SYNCIT; + if (tmod == 11 && pos < elements) + { + index = sRadixSum[ppos]++; + pDst[index] = kvp; + } + SYNCIT; + if (tmod == 12 && pos < elements) + { + index = sRadixSum[ppos]++; + pDst[index] = kvp; + } + SYNCIT; + if (tmod == 13 && pos < elements) + { + index = sRadixSum[ppos]++; + pDst[index] = kvp; + } + SYNCIT; + if (tmod == 14 && pos < elements) + { + index = sRadixSum[ppos]++; + pDst[index] = kvp; + } + SYNCIT; + if (tmod == 15 && pos < elements) + { + index = sRadixSum[ppos]++; + pDst[index] = kvp; + } + SYNCIT; + + pos += RADIXTHREADS; + } + + __syncthreads(); +} + +#endif // #ifndef _RADIXSORT_KERNEL_H_ diff --git a/Extras/CUDA/render_particles.cpp b/Extras/CUDA/render_particles.cpp new file mode 100644 index 000000000..6a419ccd3 --- /dev/null +++ b/Extras/CUDA/render_particles.cpp @@ -0,0 +1,180 @@ +/* + * Copyright 1993-2006 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO USER: + * + * This source code is subject to NVIDIA ownership rights under U.S. and + * international Copyright laws. + * + * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE + * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR + * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE + * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE + * OR PERFORMANCE OF THIS SOURCE CODE. + * + * U.S. Government End Users. This source code is a "commercial item" as + * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of + * "commercial computer software" and "commercial computer software + * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) + * and is provided to the U.S. Government only as a commercial end item. + * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through + * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the + * source code with only those rights set forth herein. 
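The sixteen tmod-guarded blocks above serialise the threads of one radix group so that the shared-counter read-modify-write (index = sRadixSum[ppos]++) happens for one thread of the group at a time; the counters are shared within a radix group and no atomics are used here, so the staggering prevents the increments from racing. Sequentially, the whole shuffle is just a stable counting-sort scatter; a CPU reference loop (values and names are illustrative):

#include <cstdio>

int main()
{
    struct KV { unsigned int key, value; };
    KV src[6] = { {2,0}, {0,1}, {2,2}, {1,3}, {0,4}, {2,5} };
    KV dst[6];

    // offsets produced by the count + prefix-sum stages for radices 0..2
    unsigned int offset[3] = { 0, 2, 3 };

    for (int i = 0; i < 6; ++i)
    {
        unsigned int index = offset[src[i].key]++;   // running offset, same role as sRadixSum[ppos]++
        dst[index] = src[i];                         // stable: equal keys keep their input order
    }
    for (int i = 0; i < 6; ++i) std::printf("%u:%u  ", dst[i].key, dst[i].value);
    std::printf("\n");   // 0:1  0:4  1:3  2:0  2:2  2:5
    return 0;
}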
+ */ + +#include + +#include +#include +#include + +#include "render_particles.h" +#include "shaders.h" + +#ifndef M_PI +#define M_PI 3.1415926535897932384626433832795 +#endif + +ParticleRenderer::ParticleRenderer() +: m_pos(0), + m_numParticles(0), + m_pointSize(1.0f), + m_particleRadius(0.125f * 0.5f), + m_program(0), + m_vbo(0), + m_colorVBO(0) +{ + _initGL(); +} + +ParticleRenderer::~ParticleRenderer() +{ + m_pos = 0; +} + +void ParticleRenderer::setPositions(float *pos, int numParticles) +{ + m_pos = pos; + m_numParticles = numParticles; +} + +void ParticleRenderer::setVertexBuffer(unsigned int vbo, int numParticles) +{ + m_vbo = vbo; + m_numParticles = numParticles; +} + +void ParticleRenderer::_drawPoints() +{ + if (!m_vbo) + { + glBegin(GL_POINTS); + { + int k = 0; + for (int i = 0; i < m_numParticles; ++i) + { + glVertex3fv(&m_pos[k]); + k += 4; + } + } + glEnd(); + } + else + { + glBindBufferARB(GL_ARRAY_BUFFER_ARB, m_vbo); + glVertexPointer(4, GL_FLOAT, 0, 0); + glEnableClientState(GL_VERTEX_ARRAY); + + if (m_colorVBO) { + glBindBufferARB(GL_ARRAY_BUFFER_ARB, m_colorVBO); + glColorPointer(4, GL_FLOAT, 0, 0); + glEnableClientState(GL_COLOR_ARRAY); + } + + glDrawArrays(GL_POINTS, 0, m_numParticles); + + glBindBufferARB(GL_ARRAY_BUFFER_ARB, 0); + glDisableClientState(GL_VERTEX_ARRAY); + glDisableClientState(GL_COLOR_ARRAY); + } +} + +void ParticleRenderer::display(DisplayMode mode /* = PARTICLE_POINTS */) +{ + switch (mode) + { + case PARTICLE_POINTS: + glColor3f(1, 1, 1); + glPointSize(m_pointSize); + _drawPoints(); + break; + + default: + case PARTICLE_SPHERES: + glEnable(GL_POINT_SPRITE_ARB); + glTexEnvi(GL_POINT_SPRITE_ARB, GL_COORD_REPLACE_ARB, GL_TRUE); + glEnable(GL_VERTEX_PROGRAM_POINT_SIZE_NV); + glDepthMask(GL_TRUE); + glEnable(GL_DEPTH_TEST); + + glUseProgram(m_program); + glUniform1f( glGetUniformLocation(m_program, "pointScale"), m_window_h / tanf(m_fov*0.5f*(float)M_PI/180.0f) ); + glUniform1f( glGetUniformLocation(m_program, "pointRadius"), m_particleRadius ); + + glColor3f(1, 1, 1); + _drawPoints(); + + glUseProgram(0); + glDisable(GL_POINT_SPRITE_ARB); + break; + } +} + +GLuint +ParticleRenderer::_compileProgram(const char *vsource, const char *fsource) +{ + GLuint vertexShader = glCreateShader(GL_VERTEX_SHADER); + GLuint fragmentShader = glCreateShader(GL_FRAGMENT_SHADER); + + glShaderSource(vertexShader, 1, &vsource, 0); + glShaderSource(fragmentShader, 1, &fsource, 0); + + glCompileShader(vertexShader); + glCompileShader(fragmentShader); + + GLuint program = glCreateProgram(); + + glAttachShader(program, vertexShader); + glAttachShader(program, fragmentShader); + + glLinkProgram(program); + + // check if program linked + GLint success = 0; + glGetProgramiv(program, GL_LINK_STATUS, &success); + + if (!success) { + char temp[256]; + glGetProgramInfoLog(program, 256, 0, temp); + printf("Failed to link program:\n%s\n", temp); + glDeleteProgram(program); + program = 0; + } + + return program; +} + +void ParticleRenderer::_initGL() +{ + m_program = _compileProgram(vertexShader, spherePixelShader); + +#if !defined(__APPLE__) && !defined(MACOSX) + glClampColorARB(GL_CLAMP_VERTEX_COLOR_ARB, GL_FALSE); + glClampColorARB(GL_CLAMP_FRAGMENT_COLOR_ARB, GL_FALSE); +#endif +} diff --git a/Extras/CUDA/render_particles.h b/Extras/CUDA/render_particles.h new file mode 100644 index 000000000..c60e6ef7c --- /dev/null +++ b/Extras/CUDA/render_particles.h @@ -0,0 +1,78 @@ +/* + * Copyright 1993-2006 NVIDIA Corporation. All rights reserved. 
+ * + * NOTICE TO USER: + * + * This source code is subject to NVIDIA ownership rights under U.S. and + * international Copyright laws. + * + * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE + * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR + * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE + * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE + * OR PERFORMANCE OF THIS SOURCE CODE. + * + * U.S. Government End Users. This source code is a "commercial item" as + * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of + * "commercial computer software" and "commercial computer software + * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) + * and is provided to the U.S. Government only as a commercial end item. + * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through + * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the + * source code with only those rights set forth herein. + */ + +#ifndef __RENDER_PARTICLES__ +#define __RENDER_PARTICLES__ + +class ParticleRenderer +{ +public: + ParticleRenderer(); + ~ParticleRenderer(); + + void setPositions(float *pos, int numParticles); + void setVertexBuffer(unsigned int vbo, int numParticles); + void setColorBuffer(unsigned int vbo) { m_colorVBO = vbo; } + + enum DisplayMode + { + PARTICLE_POINTS, + PARTICLE_SPHERES, + PARTICLE_NUM_MODES + }; + + void display(DisplayMode mode = PARTICLE_POINTS); + void displayGrid(); + + void setPointSize(float size) { m_pointSize = size; } + void setParticleRadius(float r) { m_particleRadius = r; } + void setFOV(float fov) { m_fov = fov; } + void setWindowSize(int w, int h) { m_window_w = w; m_window_h = h; } + +protected: // methods + void _initGL(); + void _drawPoints(); + GLuint _compileProgram(const char *vsource, const char *fsource); + +protected: // data + float *m_pos; + int m_numParticles; + + float m_pointSize; + float m_particleRadius; + float m_fov; + int m_window_w, m_window_h; + + GLuint m_program; + + GLuint m_vbo; + GLuint m_colorVBO; +}; + +#endif //__ RENDER_PARTICLES__ diff --git a/Extras/CUDA/shaders.cpp b/Extras/CUDA/shaders.cpp new file mode 100644 index 000000000..57a97572f --- /dev/null +++ b/Extras/CUDA/shaders.cpp @@ -0,0 +1,41 @@ +#define STRINGIFY(A) #A + +// vertex shader +const char *vertexShader = STRINGIFY( +uniform float pointRadius; // point size in world space +uniform float pointScale; // scale to calculate size in pixels +uniform float densityScale; +uniform float densityOffset; +void main() +{ + // calculate window-space point size + vec3 posEye = vec3(gl_ModelViewMatrix * vec4(gl_Vertex.xyz, 1.0)); + float dist = length(posEye); + gl_PointSize = pointRadius * (pointScale / dist); + + gl_TexCoord[0] = gl_MultiTexCoord0; + gl_Position = gl_ModelViewProjectionMatrix * vec4(gl_Vertex.xyz, 1.0); + + gl_FrontColor = gl_Color; +} +); + +// pixel shader for rendering points as shaded spheres +const char *spherePixelShader = STRINGIFY( +void main() +{ + const vec3 lightDir = vec3(0.577, 0.577, 0.577); + + // calculate normal from texture coordinates + vec3 N; + N.xy = 
gl_TexCoord[0].xy*vec2(2.0, -2.0) + vec2(-1.0, 1.0); + float mag = dot(N.xy, N.xy); + if (mag > 1.0) discard; // kill pixels outside circle + N.z = sqrt(1.0-mag); + + // calculate lighting + float diffuse = max(0.0, dot(lightDir, N)); + + gl_FragColor = gl_Color * diffuse; +} +); diff --git a/Extras/CUDA/shaders.h b/Extras/CUDA/shaders.h new file mode 100644 index 000000000..d90b40cef --- /dev/null +++ b/Extras/CUDA/shaders.h @@ -0,0 +1,2 @@ +extern const char *vertexShader; +extern const char *spherePixelShader;
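For orientation, the fragment below sketches how the renderer defined in render_particles.h/.cpp is typically driven each frame. It is an illustrative usage example only, not part of this patch: the VBO handles, particle count, field of view and window size are placeholder arguments, and it assumes the demo has already created an OpenGL context and initialised an extension loader (GLEW is assumed here) before the renderer is constructed.

// Hypothetical usage sketch (not part of this patch).
// posVbo/colorVbo are assumed to hold numParticles float4 entries each,
// e.g. the position and colour VBOs created by btCudaBroadphase::createVBO.
#include <GL/glew.h>           // assumed loader; render_particles.h expects GLuint to be defined
#include "render_particles.h"

void drawParticles(unsigned int posVbo, unsigned int colorVbo,
                   int numParticles, int windowW, int windowH)
{
    // Constructed once; the constructor compiles the point-sprite sphere
    // shaders from shaders.cpp via _initGL()/_compileProgram().
    static ParticleRenderer renderer;

    renderer.setVertexBuffer(posVbo, numParticles);   // positions read straight from the GPU buffer
    renderer.setColorBuffer(colorVbo);                // per-particle colours (optional)
    renderer.setParticleRadius(1.0f / 64.0f);         // placeholder; match the simulation's particle radius
    renderer.setFOV(60.0f);                           // placeholder camera FOV, used to scale point size
    renderer.setWindowSize(windowW, windowH);

    // PARTICLE_SPHERES draws each point as a shaded sphere impostor using the
    // shaders above; PARTICLE_POINTS falls back to plain GL points.
    renderer.display(ParticleRenderer::PARTICLE_SPHERES);
}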