Added btCudaBroadphase, some early research & development work to accelerate Bullet using CUDA

Re-uses the NVidia particle demo.
erwin.coumans
2008-09-04 23:24:11 +00:00
parent d8a5bf2c9c
commit aef74321d7
25 changed files with 6570 additions and 0 deletions

Extras/CUDA/README.txt Normal file

@@ -0,0 +1,7 @@
btCudaBroadphase is early research and development work: a first attempt to use CUDA in Bullet.
It re-uses the uniform grid broadphase from the NVidia CUDA particle demo as the Bullet broadphase.
Press 's' to toggle between the original CUDA particle demo and Bullet+btCudaBroadphase.
See some related discussion here:
http://www.bulletphysics.com/Bullet/phpBB3/viewtopic.php?f=9&t=500&start=105
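
A minimal usage sketch (mirroring ParticleSystem::initializeBullet from this commit; filling in SimParams is assumed to happen elsewhere in the demo):

  SimParams params;  // grid size, numBodies, gravity, ... set up by the demo
  btDefaultCollisionConfiguration* config = new btDefaultCollisionConfiguration();
  btCollisionDispatcher* dispatcher = new btCollisionDispatcher(config);
  btCudaBroadphase* broadphase = new btCudaBroadphase(params, params.numBodies + 6);
  btSequentialImpulseConstraintSolver* solver = new btSequentialImpulseConstraintSolver();
  btDiscreteDynamicsWorld* world = new btDiscreteDynamicsWorld(dispatcher, broadphase, solver, config);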

File diff suppressed because it is too large.


@@ -0,0 +1,124 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2003-2008 Erwin Coumans http://continuousphysics.com/Bullet/
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#ifndef CUDA_BROADPHASE_H
#define CUDA_BROADPHASE_H
#include "BulletCollision/BroadphaseCollision/btSimpleBroadphase.h"
///btCudaBroadphase uses CUDA to compute overlapping pairs on the GPU.
class btCudaBroadphase : public btSimpleBroadphase
{
bool m_bInitialized;
int m_numParticles;
// CPU data
float* m_hPos;
float* m_hVel;
float* m_hSortedPos;
unsigned int* m_hGridCounters;
unsigned int* m_hGridCells;
unsigned int* m_hParticleHash;
unsigned int* m_hCellStart;
// GPU data
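// Position and velocity live in double-buffered (ping-pong) device arrays;
// m_currentPosRead/m_currentPosWrite and m_currentVelRead/m_currentVelWrite
// below select which half is read and which is written each step.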
float* m_dPos[2];
float* m_dVel[2];
float* m_dSortedPos;
float* m_dSortedVel;
// uniform grid data
unsigned int* m_dGridCounters; // counts number of entries per grid cell
unsigned int* m_dGridCells; // contains indices of up to "m_maxParticlesPerCell" particles per cell
unsigned int* m_dParticleHash[2];
unsigned int* m_dCellStart;
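// Sorted-grid pipeline per step (see the accompanying .cu code in this commit):
// calcHash emits a (cellHash, particleIndex) pair per particle, a radix sort
// groups particles by cell, and reorderDataAndFindCellStart records where each
// cell begins in m_dCellStart, so collision tests only visit neighbouring cells.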
unsigned int m_posVbo[2];
unsigned int m_colorVBO;
unsigned int m_currentPosRead, m_currentVelRead;
unsigned int m_currentPosWrite, m_currentVelWrite;
// params
struct SimParams& m_simParams;
unsigned int m_maxParticlesPerCell;
protected:
unsigned int createVBO(unsigned int size);
void _initialize(int numParticles);
void _finalize();
public:
enum ParticleArray
{
POSITION,
VELOCITY,
};
enum ParticleConfig
{
CONFIG_RANDOM,
CONFIG_GRID,
_NUM_CONFIGS
};
btCudaBroadphase(SimParams& simParams,int maxProxies);
virtual ~btCudaBroadphase();
void initGrid(unsigned int* size, float spacing, float jitter, unsigned int numParticles);
void reset(ParticleConfig config);
void setArray(ParticleArray array, const float* data, int start, int count);
float* getArray(ParticleArray array);
void addSphere(int start, float *pos, float *vel, int r, float spacing);
virtual void calculateOverlappingPairs(btDispatcher* dispatcher);
unsigned int getCurrentReadBuffer() const { return m_posVbo[m_currentPosRead]; }
unsigned int getColorBuffer() const { return m_colorVBO; }
void dumpParticles(unsigned int start, unsigned int count);
void dumpGrid();
float* copyBuffersFromDeviceToHost();
void copyBuffersFromHostToDevice();
float* getHvelPtr();
float* getHposPtr();
void quickHack(float deltaTime);
void quickHack2();
void integrate();
};
#endif //CUDA_BROADPHASE_H


@@ -0,0 +1,113 @@

Microsoft Visual Studio Solution File, Format Version 9.00
# Visual Studio 2005
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "btCudaBroadphase", "btCudaBroadphase.vcproj", "{AF7F45C7-8545-4CA2-B835-FDE8823B7B09}"
ProjectSection(ProjectDependencies) = postProject
{6ADA430D-009C-2ED4-A787-2AC2D6FEB8CE} = {6ADA430D-009C-2ED4-A787-2AC2D6FEB8CE}
{61BD1097-CF2E-B296-DAA9-73A6FE135319} = {61BD1097-CF2E-B296-DAA9-73A6FE135319}
{7D6E339F-9C2C-31DA-FDB0-5EE50973CF2A} = {7D6E339F-9C2C-31DA-FDB0-5EE50973CF2A}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "libbulletmath", "..\..\msvc\8\libbulletmath.vcproj", "{7D6E339F-9C2C-31DA-FDB0-5EE50973CF2A}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "libbulletdynamics", "..\..\msvc\8\libbulletdynamics.vcproj", "{61BD1097-CF2E-B296-DAA9-73A6FE135319}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "libbulletcollision", "..\..\msvc\8\libbulletcollision.vcproj", "{6ADA430D-009C-2ED4-A787-2AC2D6FEB8CE}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Win32 = Debug|Win32
Debug|x64 = Debug|x64
DebugDoublePrecision|Win32 = DebugDoublePrecision|Win32
DebugDoublePrecision|x64 = DebugDoublePrecision|x64
EmuDebug|Win32 = EmuDebug|Win32
EmuDebug|x64 = EmuDebug|x64
EmuRelease|Win32 = EmuRelease|Win32
EmuRelease|x64 = EmuRelease|x64
Release|Win32 = Release|Win32
Release|x64 = Release|x64
ReleaseDoublePrecision|Win32 = ReleaseDoublePrecision|Win32
ReleaseDoublePrecision|x64 = ReleaseDoublePrecision|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{AF7F45C7-8545-4CA2-B835-FDE8823B7B09}.Debug|Win32.ActiveCfg = Debug|Win32
{AF7F45C7-8545-4CA2-B835-FDE8823B7B09}.Debug|Win32.Build.0 = Debug|Win32
{AF7F45C7-8545-4CA2-B835-FDE8823B7B09}.Debug|x64.ActiveCfg = Debug|x64
{AF7F45C7-8545-4CA2-B835-FDE8823B7B09}.Debug|x64.Build.0 = Debug|x64
{AF7F45C7-8545-4CA2-B835-FDE8823B7B09}.DebugDoublePrecision|Win32.ActiveCfg = Debug|x64
{AF7F45C7-8545-4CA2-B835-FDE8823B7B09}.DebugDoublePrecision|x64.ActiveCfg = Debug|x64
{AF7F45C7-8545-4CA2-B835-FDE8823B7B09}.DebugDoublePrecision|x64.Build.0 = Debug|x64
{AF7F45C7-8545-4CA2-B835-FDE8823B7B09}.EmuDebug|Win32.ActiveCfg = EmuDebug|Win32
{AF7F45C7-8545-4CA2-B835-FDE8823B7B09}.EmuDebug|Win32.Build.0 = EmuDebug|Win32
{AF7F45C7-8545-4CA2-B835-FDE8823B7B09}.EmuDebug|x64.ActiveCfg = EmuDebug|x64
{AF7F45C7-8545-4CA2-B835-FDE8823B7B09}.EmuDebug|x64.Build.0 = EmuDebug|x64
{AF7F45C7-8545-4CA2-B835-FDE8823B7B09}.EmuRelease|Win32.ActiveCfg = EmuRelease|Win32
{AF7F45C7-8545-4CA2-B835-FDE8823B7B09}.EmuRelease|Win32.Build.0 = EmuRelease|Win32
{AF7F45C7-8545-4CA2-B835-FDE8823B7B09}.EmuRelease|x64.ActiveCfg = EmuRelease|x64
{AF7F45C7-8545-4CA2-B835-FDE8823B7B09}.EmuRelease|x64.Build.0 = EmuRelease|x64
{AF7F45C7-8545-4CA2-B835-FDE8823B7B09}.Release|Win32.ActiveCfg = Release|Win32
{AF7F45C7-8545-4CA2-B835-FDE8823B7B09}.Release|Win32.Build.0 = Release|Win32
{AF7F45C7-8545-4CA2-B835-FDE8823B7B09}.Release|x64.ActiveCfg = Release|x64
{AF7F45C7-8545-4CA2-B835-FDE8823B7B09}.Release|x64.Build.0 = Release|x64
{AF7F45C7-8545-4CA2-B835-FDE8823B7B09}.ReleaseDoublePrecision|Win32.ActiveCfg = Release|x64
{AF7F45C7-8545-4CA2-B835-FDE8823B7B09}.ReleaseDoublePrecision|x64.ActiveCfg = Release|x64
{AF7F45C7-8545-4CA2-B835-FDE8823B7B09}.ReleaseDoublePrecision|x64.Build.0 = Release|x64
{7D6E339F-9C2C-31DA-FDB0-5EE50973CF2A}.Debug|Win32.ActiveCfg = Debug|Win32
{7D6E339F-9C2C-31DA-FDB0-5EE50973CF2A}.Debug|Win32.Build.0 = Debug|Win32
{7D6E339F-9C2C-31DA-FDB0-5EE50973CF2A}.Debug|x64.ActiveCfg = Debug|Win32
{7D6E339F-9C2C-31DA-FDB0-5EE50973CF2A}.DebugDoublePrecision|Win32.ActiveCfg = DebugDoublePrecision|Win32
{7D6E339F-9C2C-31DA-FDB0-5EE50973CF2A}.DebugDoublePrecision|Win32.Build.0 = DebugDoublePrecision|Win32
{7D6E339F-9C2C-31DA-FDB0-5EE50973CF2A}.DebugDoublePrecision|x64.ActiveCfg = DebugDoublePrecision|Win32
{7D6E339F-9C2C-31DA-FDB0-5EE50973CF2A}.EmuDebug|Win32.ActiveCfg = Debug|Win32
{7D6E339F-9C2C-31DA-FDB0-5EE50973CF2A}.EmuDebug|Win32.Build.0 = Debug|Win32
{7D6E339F-9C2C-31DA-FDB0-5EE50973CF2A}.EmuDebug|x64.ActiveCfg = Debug|Win32
{7D6E339F-9C2C-31DA-FDB0-5EE50973CF2A}.EmuRelease|Win32.ActiveCfg = Release|Win32
{7D6E339F-9C2C-31DA-FDB0-5EE50973CF2A}.EmuRelease|Win32.Build.0 = Release|Win32
{7D6E339F-9C2C-31DA-FDB0-5EE50973CF2A}.EmuRelease|x64.ActiveCfg = Release|Win32
{7D6E339F-9C2C-31DA-FDB0-5EE50973CF2A}.Release|Win32.ActiveCfg = Release|Win32
{7D6E339F-9C2C-31DA-FDB0-5EE50973CF2A}.Release|Win32.Build.0 = Release|Win32
{7D6E339F-9C2C-31DA-FDB0-5EE50973CF2A}.Release|x64.ActiveCfg = Release|Win32
{7D6E339F-9C2C-31DA-FDB0-5EE50973CF2A}.ReleaseDoublePrecision|Win32.ActiveCfg = ReleaseDoublePrecision|Win32
{7D6E339F-9C2C-31DA-FDB0-5EE50973CF2A}.ReleaseDoublePrecision|Win32.Build.0 = ReleaseDoublePrecision|Win32
{7D6E339F-9C2C-31DA-FDB0-5EE50973CF2A}.ReleaseDoublePrecision|x64.ActiveCfg = ReleaseDoublePrecision|Win32
{61BD1097-CF2E-B296-DAA9-73A6FE135319}.Debug|Win32.ActiveCfg = Debug|Win32
{61BD1097-CF2E-B296-DAA9-73A6FE135319}.Debug|Win32.Build.0 = Debug|Win32
{61BD1097-CF2E-B296-DAA9-73A6FE135319}.Debug|x64.ActiveCfg = Debug|Win32
{61BD1097-CF2E-B296-DAA9-73A6FE135319}.DebugDoublePrecision|Win32.ActiveCfg = DebugDoublePrecision|Win32
{61BD1097-CF2E-B296-DAA9-73A6FE135319}.DebugDoublePrecision|Win32.Build.0 = DebugDoublePrecision|Win32
{61BD1097-CF2E-B296-DAA9-73A6FE135319}.DebugDoublePrecision|x64.ActiveCfg = DebugDoublePrecision|Win32
{61BD1097-CF2E-B296-DAA9-73A6FE135319}.EmuDebug|Win32.ActiveCfg = Debug|Win32
{61BD1097-CF2E-B296-DAA9-73A6FE135319}.EmuDebug|Win32.Build.0 = Debug|Win32
{61BD1097-CF2E-B296-DAA9-73A6FE135319}.EmuDebug|x64.ActiveCfg = Debug|Win32
{61BD1097-CF2E-B296-DAA9-73A6FE135319}.EmuRelease|Win32.ActiveCfg = Release|Win32
{61BD1097-CF2E-B296-DAA9-73A6FE135319}.EmuRelease|Win32.Build.0 = Release|Win32
{61BD1097-CF2E-B296-DAA9-73A6FE135319}.EmuRelease|x64.ActiveCfg = Release|Win32
{61BD1097-CF2E-B296-DAA9-73A6FE135319}.Release|Win32.ActiveCfg = Release|Win32
{61BD1097-CF2E-B296-DAA9-73A6FE135319}.Release|Win32.Build.0 = Release|Win32
{61BD1097-CF2E-B296-DAA9-73A6FE135319}.Release|x64.ActiveCfg = Release|Win32
{61BD1097-CF2E-B296-DAA9-73A6FE135319}.ReleaseDoublePrecision|Win32.ActiveCfg = ReleaseDoublePrecision|Win32
{61BD1097-CF2E-B296-DAA9-73A6FE135319}.ReleaseDoublePrecision|Win32.Build.0 = ReleaseDoublePrecision|Win32
{61BD1097-CF2E-B296-DAA9-73A6FE135319}.ReleaseDoublePrecision|x64.ActiveCfg = ReleaseDoublePrecision|Win32
{6ADA430D-009C-2ED4-A787-2AC2D6FEB8CE}.Debug|Win32.ActiveCfg = Debug|Win32
{6ADA430D-009C-2ED4-A787-2AC2D6FEB8CE}.Debug|Win32.Build.0 = Debug|Win32
{6ADA430D-009C-2ED4-A787-2AC2D6FEB8CE}.Debug|x64.ActiveCfg = Debug|Win32
{6ADA430D-009C-2ED4-A787-2AC2D6FEB8CE}.DebugDoublePrecision|Win32.ActiveCfg = DebugDoublePrecision|Win32
{6ADA430D-009C-2ED4-A787-2AC2D6FEB8CE}.DebugDoublePrecision|Win32.Build.0 = DebugDoublePrecision|Win32
{6ADA430D-009C-2ED4-A787-2AC2D6FEB8CE}.DebugDoublePrecision|x64.ActiveCfg = DebugDoublePrecision|Win32
{6ADA430D-009C-2ED4-A787-2AC2D6FEB8CE}.EmuDebug|Win32.ActiveCfg = Debug|Win32
{6ADA430D-009C-2ED4-A787-2AC2D6FEB8CE}.EmuDebug|Win32.Build.0 = Debug|Win32
{6ADA430D-009C-2ED4-A787-2AC2D6FEB8CE}.EmuDebug|x64.ActiveCfg = Debug|Win32
{6ADA430D-009C-2ED4-A787-2AC2D6FEB8CE}.EmuRelease|Win32.ActiveCfg = Release|Win32
{6ADA430D-009C-2ED4-A787-2AC2D6FEB8CE}.EmuRelease|Win32.Build.0 = Release|Win32
{6ADA430D-009C-2ED4-A787-2AC2D6FEB8CE}.EmuRelease|x64.ActiveCfg = Release|Win32
{6ADA430D-009C-2ED4-A787-2AC2D6FEB8CE}.Release|Win32.ActiveCfg = Release|Win32
{6ADA430D-009C-2ED4-A787-2AC2D6FEB8CE}.Release|Win32.Build.0 = Release|Win32
{6ADA430D-009C-2ED4-A787-2AC2D6FEB8CE}.Release|x64.ActiveCfg = Release|Win32
{6ADA430D-009C-2ED4-A787-2AC2D6FEB8CE}.ReleaseDoublePrecision|Win32.ActiveCfg = ReleaseDoublePrecision|Win32
{6ADA430D-009C-2ED4-A787-2AC2D6FEB8CE}.ReleaseDoublePrecision|Win32.Build.0 = ReleaseDoublePrecision|Win32
{6ADA430D-009C-2ED4-A787-2AC2D6FEB8CE}.ReleaseDoublePrecision|x64.ActiveCfg = ReleaseDoublePrecision|Win32
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

File diff suppressed because it is too large.


@@ -0,0 +1,86 @@
/*
* Copyright 1993-2006 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO USER:
*
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws.
*
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
* OR PERFORMANCE OF THIS SOURCE CODE.
*
* U.S. Government End Users. This source code is a "commercial item" as
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
* "commercial computer software" and "commercial computer software
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
* and is provided to the U.S. Government only as a commercial end item.
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
* source code with only those rights set forth herein.
*/
#ifndef CUTIL_GL_ERROR
#define CUTIL_GL_ERROR
/* CUda UTility Library */
// includes, system
#include <stdio.h>
#include <stdlib.h>
#ifdef _WIN32
# define WINDOWS_LEAN_AND_MEAN
# include <windows.h>
# include <stdlib.h>
# undef min
# undef max
#endif
// includes, graphics
#if defined (__APPLE__) || defined(MACOSX)
#include <OpenGL/gl.h>
#include <OpenGL/glu.h>
#else
#include <GL/gl.h>
#include <GL/glu.h>
#endif
////////////////////////////////////////////////////////////////////////////
//! Check for OpenGL error
//! @return CUTTrue if no GL error has been encountered, otherwise CUTFalse
//! @param file __FILE__ macro
//! @param line __LINE__ macro
//! @note The GL error is listed on stderr
//! @note This function should be used via the CUT_CHECK_ERROR_GL() macro
////////////////////////////////////////////////////////////////////////////
CUTBoolean CUTIL_API
cutCheckErrorGL( const char* file, const int line)
{
CUTBoolean ret_val = CUTTrue;
// check for error
GLenum gl_error = glGetError();
if (gl_error != GL_NO_ERROR)
{
fprintf(stderr, "GL Error in file '%s' in line %d :\n", file, line);
fprintf(stderr, "%s\n", gluErrorString(gl_error));
ret_val = CUTFalse;
}
return ret_val;
}
#ifdef _DEBUG
#define CUT_CHECK_ERROR_GL() \
if( CUTFalse == cutCheckErrorGL( __FILE__, __LINE__)) { \
exit(EXIT_FAILURE); \
}
#endif // _DEBUG
#endif // CUTIL_GL_ERROR

Extras/CUDA/cutil_math.h Normal file

@@ -0,0 +1,767 @@
/*
* Copyright 1993-2007 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO USER:
*
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software.
*
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
* OR PERFORMANCE OF THIS SOURCE CODE.
*
* U.S. Government End Users. This source code is a "commercial item" as
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
* "commercial computer software" and "commercial computer software
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
* and is provided to the U.S. Government only as a commercial end item.
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
* source code with only those rights set forth herein.
*
* Any use of this source code in individual and commercial software must
* include, in the user documentation and internal comments to the code,
* the above Disclaimer and U.S. Government End Users Notice.
*/
/*
This file implements common mathematical operations on vector types
(float3, float4 etc.) since these are not provided as standard by CUDA.
The syntax is modelled on the Cg standard library.
*/
#ifndef CUTIL_MATH_H
#define CUTIL_MATH_H
#include "cuda_runtime.h"
////////////////////////////////////////////////////////////////////////////////
typedef unsigned int uint;
typedef unsigned short ushort;
#ifndef __CUDACC__
#include <math.h>
inline float fminf(float a, float b)
{
return a < b ? a : b;
}
inline float fmaxf(float a, float b)
{
return a > b ? a : b;
}
inline int max(int a, int b)
{
return a > b ? a : b;
}
inline int min(int a, int b)
{
return a < b ? a : b;
}
#endif
// float functions
////////////////////////////////////////////////////////////////////////////////
// lerp
inline __device__ __host__ float lerp(float a, float b, float t)
{
return a + t*(b-a);
}
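// e.g. lerp(2.0f, 4.0f, 0.25f) = 2.0f + 0.25f*(4.0f - 2.0f) = 2.5f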
// clamp
inline __device__ __host__ float clamp(float f, float a, float b)
{
return fmaxf(a, fminf(f, b));
}
// int2 functions
////////////////////////////////////////////////////////////////////////////////
// negate
inline __host__ __device__ int2 operator-(int2 &a)
{
return make_int2(-a.x, -a.y);
}
// addition
inline __host__ __device__ int2 operator+(int2 a, int2 b)
{
return make_int2(a.x + b.x, a.y + b.y);
}
inline __host__ __device__ void operator+=(int2 &a, int2 b)
{
a.x += b.x; a.y += b.y;
}
// subtract
inline __host__ __device__ int2 operator-(int2 a, int2 b)
{
return make_int2(a.x - b.x, a.y - b.y);
}
inline __host__ __device__ void operator-=(int2 &a, int2 b)
{
a.x -= b.x; a.y -= b.y;
}
// multiply
inline __host__ __device__ int2 operator*(int2 a, int2 b)
{
return make_int2(a.x * b.x, a.y * b.y);
}
inline __host__ __device__ int2 operator*(int2 a, int s)
{
return make_int2(a.x * s, a.y * s);
}
inline __host__ __device__ int2 operator*(int s, int2 a)
{
return make_int2(a.x * s, a.y * s);
}
inline __host__ __device__ void operator*=(int2 &a, int s)
{
a.x *= s; a.y *= s;
}
// float2 functions
////////////////////////////////////////////////////////////////////////////////
// additional constructors
inline __host__ __device__ float2 make_float2(float s)
{
return make_float2(s, s);
}
inline __host__ __device__ float2 make_float2(int2 a)
{
return make_float2(float(a.x), float(a.y));
}
// negate
inline __host__ __device__ float2 operator-(float2 &a)
{
return make_float2(-a.x, -a.y);
}
// addition
inline __host__ __device__ float2 operator+(float2 a, float2 b)
{
return make_float2(a.x + b.x, a.y + b.y);
}
inline __host__ __device__ void operator+=(float2 &a, float2 b)
{
a.x += b.x; a.y += b.y;
}
// subtract
inline __host__ __device__ float2 operator-(float2 a, float2 b)
{
return make_float2(a.x - b.x, a.y - b.y);
}
inline __host__ __device__ void operator-=(float2 &a, float2 b)
{
a.x -= b.x; a.y -= b.y;
}
// multiply
inline __host__ __device__ float2 operator*(float2 a, float2 b)
{
return make_float2(a.x * b.x, a.y * b.y);
}
inline __host__ __device__ float2 operator*(float2 a, float s)
{
return make_float2(a.x * s, a.y * s);
}
inline __host__ __device__ float2 operator*(float s, float2 a)
{
return make_float2(a.x * s, a.y * s);
}
inline __host__ __device__ void operator*=(float2 &a, float s)
{
a.x *= s; a.y *= s;
}
// divide
inline __host__ __device__ float2 operator/(float2 a, float2 b)
{
return make_float2(a.x / b.x, a.y / b.y);
}
inline __host__ __device__ float2 operator/(float2 a, float s)
{
float inv = 1.0f / s;
return a * inv;
}
inline __host__ __device__ float2 operator/(float s, float2 a)
{
float inv = 1.0f / s;
return a * inv;
}
inline __host__ __device__ void operator/=(float2 &a, float s)
{
float inv = 1.0f / s;
a *= inv;
}
// lerp
inline __device__ __host__ float2 lerp(float2 a, float2 b, float t)
{
return a + t*(b-a);
}
// clamp
inline __device__ __host__ float2 clamp(float2 v, float a, float b)
{
return make_float2(clamp(v.x, a, b), clamp(v.y, a, b));
}
inline __device__ __host__ float2 clamp(float2 v, float2 a, float2 b)
{
return make_float2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y));
}
// dot product
inline __host__ __device__ float dot(float2 a, float2 b)
{
return a.x * b.x + a.y * b.y;
}
// length
inline __host__ __device__ float length(float2 v)
{
return sqrtf(dot(v, v));
}
// normalize
inline __host__ __device__ float2 normalize(float2 v)
{
float invLen = 1.0f / sqrtf(dot(v, v));
return v * invLen;
}
// floor
inline __host__ __device__ float2 floor(const float2 v)
{
return make_float2(floor(v.x), floor(v.y));
}
// reflect
inline __host__ __device__ float2 reflect(float2 i, float2 n)
{
return i - 2.0f * n * dot(n,i);
}
// float3 functions
////////////////////////////////////////////////////////////////////////////////
// additional constructors
inline __host__ __device__ float3 make_float3(float s)
{
return make_float3(s, s, s);
}
inline __host__ __device__ float3 make_float3(float2 a)
{
return make_float3(a.x, a.y, 0.0f);
}
inline __host__ __device__ float3 make_float3(float2 a, float s)
{
return make_float3(a.x, a.y, s);
}
inline __host__ __device__ float3 make_float3(float4 a)
{
return make_float3(a.x, a.y, a.z); // discards w
}
inline __host__ __device__ float3 make_float3(int3 a)
{
return make_float3(float(a.x), float(a.y), float(a.z));
}
// negate
inline __host__ __device__ float3 operator-(float3 &a)
{
return make_float3(-a.x, -a.y, -a.z);
}
// min
static __inline__ __host__ __device__ float3 fminf(float3 a, float3 b)
{
return make_float3(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z));
}
// max
static __inline__ __host__ __device__ float3 fmaxf(float3 a, float3 b)
{
return make_float3(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z));
}
// addition
inline __host__ __device__ float3 operator+(float3 a, float3 b)
{
return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
}
inline __host__ __device__ float3 operator+(float3 a, float b)
{
return make_float3(a.x + b, a.y + b, a.z + b);
}
inline __host__ __device__ void operator+=(float3 &a, float3 b)
{
a.x += b.x; a.y += b.y; a.z += b.z;
}
// subtract
inline __host__ __device__ float3 operator-(float3 a, float3 b)
{
return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
}
inline __host__ __device__ float3 operator-(float3 a, float b)
{
return make_float3(a.x - b, a.y - b, a.z - b);
}
inline __host__ __device__ void operator-=(float3 &a, float3 b)
{
a.x -= b.x; a.y -= b.y; a.z -= b.z;
}
// multiply
inline __host__ __device__ float3 operator*(float3 a, float3 b)
{
return make_float3(a.x * b.x, a.y * b.y, a.z * b.z);
}
inline __host__ __device__ float3 operator*(float3 a, float s)
{
return make_float3(a.x * s, a.y * s, a.z * s);
}
inline __host__ __device__ float3 operator*(float s, float3 a)
{
return make_float3(a.x * s, a.y * s, a.z * s);
}
inline __host__ __device__ void operator*=(float3 &a, float s)
{
a.x *= s; a.y *= s; a.z *= s;
}
// divide
inline __host__ __device__ float3 operator/(float3 a, float3 b)
{
return make_float3(a.x / b.x, a.y / b.y, a.z / b.z);
}
inline __host__ __device__ float3 operator/(float3 a, float s)
{
float inv = 1.0f / s;
return a * inv;
}
inline __host__ __device__ float3 operator/(float s, float3 a)
{
float inv = 1.0f / s;
return a * inv;
}
inline __host__ __device__ void operator/=(float3 &a, float s)
{
float inv = 1.0f / s;
a *= inv;
}
// lerp
inline __device__ __host__ float3 lerp(float3 a, float3 b, float t)
{
return a + t*(b-a);
}
// clamp
inline __device__ __host__ float3 clamp(float3 v, float a, float b)
{
return make_float3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
}
inline __device__ __host__ float3 clamp(float3 v, float3 a, float3 b)
{
return make_float3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z));
}
// dot product
inline __host__ __device__ float dot(float3 a, float3 b)
{
return a.x * b.x + a.y * b.y + a.z * b.z;
}
// cross product
inline __host__ __device__ float3 cross(float3 a, float3 b)
{
return make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x);
}
// length
inline __host__ __device__ float length(float3 v)
{
return sqrtf(dot(v, v));
}
// normalize
inline __host__ __device__ float3 normalize(float3 v)
{
float invLen = 1.0f / sqrtf(dot(v, v));
return v * invLen;
}
// floor
inline __host__ __device__ float3 floor(const float3 v)
{
return make_float3(floor(v.x), floor(v.y), floor(v.z));
}
// reflect
inline __host__ __device__ float3 reflect(float3 i, float3 n)
{
return i - 2.0f * n * dot(n,i);
}
// float4 functions
////////////////////////////////////////////////////////////////////////////////
// additional constructors
inline __host__ __device__ float4 make_float4(float s)
{
return make_float4(s, s, s, s);
}
inline __host__ __device__ float4 make_float4(float3 a)
{
return make_float4(a.x, a.y, a.z, 0.0f);
}
inline __host__ __device__ float4 make_float4(float3 a, float w)
{
return make_float4(a.x, a.y, a.z, w);
}
inline __host__ __device__ float4 make_float4(int4 a)
{
return make_float4(float(a.x), float(a.y), float(a.z), float(a.w));
}
// negate
inline __host__ __device__ float4 operator-(float4 &a)
{
return make_float4(-a.x, -a.y, -a.z, -a.w);
}
// min
static __inline__ __host__ __device__ float4 fminf(float4 a, float4 b)
{
return make_float4(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z), fminf(a.w,b.w));
}
// max
static __inline__ __host__ __device__ float4 fmaxf(float4 a, float4 b)
{
return make_float4(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z), fmaxf(a.w,b.w));
}
// addition
inline __host__ __device__ float4 operator+(float4 a, float4 b)
{
return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
}
inline __host__ __device__ void operator+=(float4 &a, float4 b)
{
a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w;
}
// subtract
inline __host__ __device__ float4 operator-(float4 a, float4 b)
{
return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
}
inline __host__ __device__ void operator-=(float4 &a, float4 b)
{
a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w;
}
// multiply
inline __host__ __device__ float4 operator*(float4 a, float s)
{
return make_float4(a.x * s, a.y * s, a.z * s, a.w * s);
}
inline __host__ __device__ float4 operator*(float s, float4 a)
{
return make_float4(a.x * s, a.y * s, a.z * s, a.w * s);
}
inline __host__ __device__ void operator*=(float4 &a, float s)
{
a.x *= s; a.y *= s; a.z *= s; a.w *= s;
}
// divide
inline __host__ __device__ float4 operator/(float4 a, float4 b)
{
return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
}
inline __host__ __device__ float4 operator/(float4 a, float s)
{
float inv = 1.0f / s;
return a * inv;
}
inline __host__ __device__ float4 operator/(float s, float4 a)
{
float inv = 1.0f / s;
return a * inv;
}
inline __host__ __device__ void operator/=(float4 &a, float s)
{
float inv = 1.0f / s;
a *= inv;
}
// lerp
inline __device__ __host__ float4 lerp(float4 a, float4 b, float t)
{
return a + t*(b-a);
}
// clamp
inline __device__ __host__ float4 clamp(float4 v, float a, float b)
{
return make_float4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b));
}
inline __device__ __host__ float4 clamp(float4 v, float4 a, float4 b)
{
return make_float4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w));
}
// dot product
inline __host__ __device__ float dot(float4 a, float4 b)
{
return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
}
// length
inline __host__ __device__ float length(float4 r)
{
return sqrtf(dot(r, r));
}
// normalize
inline __host__ __device__ float4 normalize(float4 v)
{
float invLen = 1.0f / sqrtf(dot(v, v));
return v * invLen;
}
// floor
inline __host__ __device__ float4 floor(const float4 v)
{
return make_float4(floor(v.x), floor(v.y), floor(v.z), floor(v.w));
}
// int3 functions
////////////////////////////////////////////////////////////////////////////////
// additional constructors
inline __host__ __device__ int3 make_int3(int s)
{
return make_int3(s, s, s);
}
inline __host__ __device__ int3 make_int3(float3 a)
{
return make_int3(int(a.x), int(a.y), int(a.z));
}
// negate
inline __host__ __device__ int3 operator-(int3 &a)
{
return make_int3(-a.x, -a.y, -a.z);
}
// min
inline __host__ __device__ int3 min(int3 a, int3 b)
{
return make_int3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z));
}
// max
inline __host__ __device__ int3 max(int3 a, int3 b)
{
return make_int3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z));
}
// addition
inline __host__ __device__ int3 operator+(int3 a, int3 b)
{
return make_int3(a.x + b.x, a.y + b.y, a.z + b.z);
}
inline __host__ __device__ void operator+=(int3 &a, int3 b)
{
a.x += b.x; a.y += b.y; a.z += b.z;
}
// subtract
inline __host__ __device__ int3 operator-(int3 a, int3 b)
{
return make_int3(a.x - b.x, a.y - b.y, a.z - b.z);
}
inline __host__ __device__ void operator-=(int3 &a, int3 b)
{
a.x -= b.x; a.y -= b.y; a.z -= b.z;
}
// multiply
inline __host__ __device__ int3 operator*(int3 a, int3 b)
{
return make_int3(a.x * b.x, a.y * b.y, a.z * b.z);
}
inline __host__ __device__ int3 operator*(int3 a, int s)
{
return make_int3(a.x * s, a.y * s, a.z * s);
}
inline __host__ __device__ int3 operator*(int s, int3 a)
{
return make_int3(a.x * s, a.y * s, a.z * s);
}
inline __host__ __device__ void operator*=(int3 &a, int s)
{
a.x *= s; a.y *= s; a.z *= s;
}
// divide
inline __host__ __device__ int3 operator/(int3 a, int3 b)
{
return make_int3(a.x / b.x, a.y / b.y, a.z / b.z);
}
inline __host__ __device__ int3 operator/(int3 a, int s)
{
return make_int3(a.x / s, a.y / s, a.z / s);
}
inline __host__ __device__ int3 operator/(int s, int3 a)
{
return make_int3(a.x / s, a.y / s, a.z / s);
}
inline __host__ __device__ void operator/=(int3 &a, int s)
{
a.x /= s; a.y /= s; a.z /= s;
}
// clamp
inline __device__ __host__ int clamp(int f, int a, int b)
{
return max(a, min(f, b));
}
inline __device__ __host__ int3 clamp(int3 v, int a, int b)
{
return make_int3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
}
inline __device__ __host__ int3 clamp(int3 v, int3 a, int3 b)
{
return make_int3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z));
}
// uint3 functions
////////////////////////////////////////////////////////////////////////////////
// additional constructors
inline __host__ __device__ uint3 make_uint3(uint s)
{
return make_uint3(s, s, s);
}
inline __host__ __device__ uint3 make_uint3(float3 a)
{
return make_uint3(uint(a.x), uint(a.y), uint(a.z));
}
// min
inline __host__ __device__ uint3 min(uint3 a, uint3 b)
{
return make_uint3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z));
}
// max
inline __host__ __device__ uint3 max(uint3 a, uint3 b)
{
return make_uint3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z));
}
// addition
inline __host__ __device__ uint3 operator+(uint3 a, uint3 b)
{
return make_uint3(a.x + b.x, a.y + b.y, a.z + b.z);
}
inline __host__ __device__ void operator+=(uint3 &a, uint3 b)
{
a.x += b.x; a.y += b.y; a.z += b.z;
}
// subtract
inline __host__ __device__ uint3 operator-(uint3 a, uint3 b)
{
return make_uint3(a.x - b.x, a.y - b.y, a.z - b.z);
}
inline __host__ __device__ void operator-=(uint3 &a, uint3 b)
{
a.x -= b.x; a.y -= b.y; a.z -= b.z;
}
// multiply
inline __host__ __device__ uint3 operator*(uint3 a, uint3 b)
{
return make_uint3(a.x * b.x, a.y * b.y, a.z * b.z);
}
inline __host__ __device__ uint3 operator*(uint3 a, uint s)
{
return make_uint3(a.x * s, a.y * s, a.z * s);
}
inline __host__ __device__ uint3 operator*(uint s, uint3 a)
{
return make_uint3(a.x * s, a.y * s, a.z * s);
}
inline __host__ __device__ void operator*=(uint3 &a, uint s)
{
a.x *= s; a.y *= s; a.z *= s;
}
// divide
inline __host__ __device__ uint3 operator/(uint3 a, uint3 b)
{
return make_uint3(a.x / b.x, a.y / b.y, a.z / b.z);
}
inline __host__ __device__ uint3 operator/(uint3 a, uint s)
{
return make_uint3(a.x / s, a.y / s, a.z / s);
}
inline __host__ __device__ uint3 operator/(uint s, uint3 a)
{
return make_uint3(a.x / s, a.y / s, a.z / s);
}
inline __host__ __device__ void operator/=(uint3 &a, uint s)
{
a.x /= s; a.y /= s; a.z /= s;
}
// clamp
inline __device__ __host__ uint clamp(uint f, uint a, uint b)
{
return max(a, min(f, b));
}
inline __device__ __host__ uint3 clamp(uint3 v, uint a, uint b)
{
return make_uint3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
}
inline __device__ __host__ uint3 clamp(uint3 v, uint3 a, uint3 b)
{
return make_uint3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z));
}
#endif

Extras/CUDA/param.cpp Normal file

@@ -0,0 +1,3 @@
#include <param.h>
const Param<int> dummy("error");

Extras/CUDA/param.h Normal file

@@ -0,0 +1,226 @@
/*
Simple parameter system
sgreen@nvidia.com 4/2001
*/
#ifndef PARAM_H
#define PARAM_H
#ifdef _WIN32
# pragma warning(disable:4786) // No stupid debug warnings
#endif
#include <string>
#include <vector>
#include <map>
#include <iostream>
#include <sstream>
#include <iomanip>
// base class for named parameter
class ParamBase {
public:
ParamBase(char *name) { m_name = name; }
virtual ~ParamBase() { }
std::string *GetName() { return &m_name; }
virtual float GetFloatValue() = 0;
virtual int GetIntValue() = 0;
virtual std::string GetValueString() = 0;
virtual void Reset() = 0;
virtual void Increment() = 0;
virtual void Decrement() = 0;
virtual float GetPercentage() = 0;
virtual void SetPercentage(float p) = 0;
virtual void Write(std::ostream &stream) = 0;
virtual void Read(std::istream &stream) = 0;
virtual bool IsList() = 0;
protected:
std::string m_name;
};
// derived class for single-valued parameter
template<class T> class Param : public ParamBase {
public:
Param(char *name, T value = 0, T min = 0, T max = 10000, T step = 1, T* ptr = 0)
: ParamBase(name)
{
if (ptr) {
m_ptr = ptr;
} else {
m_ptr = &m_value;
}
*m_ptr = value;
m_default = value;
m_min = min;
m_max = max;
m_step = step;
}
virtual ~Param() { }
virtual float GetFloatValue() { return (float) *m_ptr; }
virtual int GetIntValue() { return (int) *m_ptr; }
T GetValue() const { return *m_ptr; }
void SetValue(const T value) { *m_ptr = value; }
// inherited functions
std::string GetValueString()
{
std::ostringstream ost;
ost<<std::setprecision(3)<<*m_ptr;
return ost.str();
}
float GetPercentage()
{
return (*m_ptr - m_min) / (float) (m_max - m_min);
}
void SetPercentage(float p)
{
*m_ptr = (T)(m_min + p * (m_max - m_min));
}
void Reset() { *m_ptr = m_default; }
void Increment()
{
*m_ptr += m_step;
if (*m_ptr > m_max)
*m_ptr = m_max;
}
void Decrement()
{
*m_ptr -= m_step;
if (*m_ptr < m_min)
*m_ptr = m_min;
}
void Write(std::ostream &stream) { stream << m_name << " " << *m_ptr << '\n'; }
void Read(std::istream &stream) { stream >> m_name >> *m_ptr; }
bool IsList() { return false; }
private:
T m_value;
T *m_ptr; // pointer to value declared elsewhere
T m_default, m_min, m_max;
T m_step;
};
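// Example usage (hypothetical names): expose a float with range [0,20] and
// step 0.1, backed by a variable declared elsewhere:
//   float gravity = 9.8f;
//   Param<float> gravityParam("gravity", 9.8f, 0.0f, 20.0f, 0.1f, &gravity);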
extern const Param<int> dummy;
// list of parameters
class ParamList : public ParamBase {
public:
ParamList(char *name = "")
: ParamBase(name)
{
active = true;
}
virtual ~ParamList() { }
virtual float GetFloatValue() { return 0.0f; }
virtual int GetIntValue() { return 0; }
void AddParam(ParamBase *param)
{
m_params.push_back(param);
m_map[*param->GetName()] = param;
m_current = m_params.begin();
}
// look-up parameter based on name
ParamBase *GetParam(char *name)
{
ParamBase *p = m_map[name];
if (p)
return p;
else
return (ParamBase *) &dummy;
}
ParamBase *GetParam(int i)
{
return m_params[i];
}
ParamBase *GetCurrent()
{
return *m_current;
}
int GetSize() { return (int)m_params.size(); }
// inherited functions
std::string GetValueString()
{
// return m_name;
return "list";
}
void Reset()
{
m_current = m_params.begin();
}
void Increment()
{
m_current++;
if (m_current == m_params.end())
m_current = m_params.begin();
}
void Decrement()
{
if (m_current == m_params.begin())
m_current = m_params.end()-1;
else
m_current--;
}
float GetPercentage() { return 0.0f; }
void SetPercentage(float /*p*/) { ; }
void Write(std::ostream &stream)
{
stream << m_name << '\n';
for(std::vector<ParamBase *>::const_iterator p = m_params.begin(); p != m_params.end(); ++p) {
(*p)->Write(stream);
}
}
void Read(std::istream &stream)
{
stream >> m_name;
for(std::vector<ParamBase *>::const_iterator p = m_params.begin(); p != m_params.end(); ++p) {
(*p)->Read(stream);
}
}
bool IsList() { return true; }
protected:
bool active;
std::vector<ParamBase *> m_params;
std::map<std::string, ParamBase *> m_map;
std::vector<ParamBase *>::const_iterator m_current;
};
#endif

Extras/CUDA/paramgl.cpp Normal file

@@ -0,0 +1,209 @@
/*
ParamListGL
- class derived from ParamList to do simple OpenGL rendering of a parameter list
sgg 8/2001
*/
#include <string.h>
#include <param.h>
#include <paramgl.h>
void beginWinCoords(void)
{
glMatrixMode(GL_MODELVIEW);
glPushMatrix();
glLoadIdentity();
glTranslatef(0.0, glutGet(GLUT_WINDOW_HEIGHT) - 1, 0.0);
glScalef(1.0, -1.0, 1.0);
glMatrixMode(GL_PROJECTION);
glPushMatrix();
glLoadIdentity();
glOrtho(0, glutGet(GLUT_WINDOW_WIDTH), 0, glutGet(GLUT_WINDOW_HEIGHT), -1, 1);
glMatrixMode(GL_MODELVIEW);
}
void endWinCoords(void)
{
glMatrixMode(GL_PROJECTION);
glPopMatrix();
glMatrixMode(GL_MODELVIEW);
glPopMatrix();
}
void glPrint(int x, int y, const char *s, void *font)
{
int i, len;
glRasterPos2f(x, y);
len = (int) strlen(s);
for (i = 0; i < len; i++) {
glutBitmapCharacter(font, s[i]);
}
}
void glPrintShadowed(int x, int y, const char *s, void *font, float *color)
{
glColor3f(0.0, 0.0, 0.0);
glPrint(x-1, y-1, s, font);
glColor3fv((GLfloat *) color);
glPrint(x, y, s, font);
}
ParamListGL::ParamListGL(char *name) : ParamList(name)
{
font = (void *) GLUT_BITMAP_9_BY_15;
// font = (void *) GLUT_BITMAP_8_BY_13;
bar_x = 250;
bar_w = 250;
bar_h = 10;
bar_offset = 5;
text_x = 5;
separation = 15;
value_x = 200;
font_h = 15;
start_x = 0;
start_y = 0;
text_col_selected[0] = 1.0;
text_col_selected[1] = 1.0;
text_col_selected[2] = 1.0;
text_col_unselected[0] = 0.75;
text_col_unselected[1] = 0.75;
text_col_unselected[2] = 0.75;
bar_col_outer[0] = 0.0;
bar_col_outer[1] = 0.0;
bar_col_outer[2] = 0.0;
bar_col_inner[0] = 0.0;
bar_col_inner[1] = 0.0;
bar_col_inner[2] = 0.0;
text_col_shadow[0] = 0.0;
text_col_shadow[1] = 0.0;
text_col_shadow[2] = 0.0;
}
void
ParamListGL::Render(int x, int y, bool shadow)
{
beginWinCoords();
start_x = x; start_y = y;
for(std::vector<ParamBase *>::const_iterator p = m_params.begin(); p != m_params.end(); ++p) {
if ((*p)->IsList()) {
ParamListGL *list = (ParamListGL *) (*p);
list->Render(x+10, y);
y += separation*list->GetSize();
} else {
if (p == m_current)
glColor3fv(text_col_selected);
else
glColor3fv(text_col_unselected);
if (shadow) {
glPrintShadowed(x + text_x, y + font_h, (*p)->GetName()->c_str(), font, (p == m_current) ? text_col_selected : text_col_unselected);
glPrintShadowed(x + value_x, y + font_h, (*p)->GetValueString().c_str(), font, (p == m_current) ? text_col_selected : text_col_unselected);
}
else {
glPrint(x + text_x, y + font_h, (*p)->GetName()->c_str(), font);
glPrint(x + value_x, y + font_h, (*p)->GetValueString().c_str(), font);
}
// glColor3fv((GLfloat *) &bar_col_outer);
glBegin(GL_LINE_LOOP);
glVertex2f(x + bar_x, y + bar_offset);
glVertex2f(x + bar_x + bar_w, y + bar_offset);
glVertex2f(x + bar_x + bar_w, y + bar_offset + bar_h);
glVertex2f(x + bar_x, y + bar_offset + bar_h);
glEnd();
// glColor3fv((GLfloat *) &bar_col_inner);
glRectf(x + bar_x, y + bar_offset + bar_h, x + bar_x + (bar_w*(*p)->GetPercentage()), y + bar_offset);
y += separation;
}
}
endWinCoords();
}
bool
ParamListGL::Mouse(int x, int y, int button, int state)
{
if ((y < start_y) || (y > (int)(start_y + (separation * m_params.size()) - 1)))
return false;
int i = (y - start_y) / separation;
if ((button==GLUT_LEFT_BUTTON) && (state==GLUT_DOWN)) {
#if defined(__GNUC__) && (__GNUC__ < 3)
m_current = &m_params[i];
#else
// MJH: workaround since the version of vector::at used here is non-standard
for (m_current = m_params.begin(); m_current != m_params.end() && i > 0; m_current++, i--);
//m_current = (std::vector<ParamBase *>::const_iterator)&m_params.at(i);
#endif
if ((x > bar_x) && (x < bar_x + bar_w)) {
Motion(x, y);
}
}
return true;
}
bool
ParamListGL::Motion(int x, int y)
{
if ((y < start_y) || (y > start_y + (separation * (int)m_params.size()) - 1) )
return false;
if (x < bar_x) {
(*m_current)->SetPercentage(0.0);
return true;
}
if (x > bar_x + bar_w) {
(*m_current)->SetPercentage(1.0);
return true;
}
(*m_current)->SetPercentage((x-bar_x) / (float) bar_w);
return true;
}
void
ParamListGL::Special(int key, int /*x*/, int /*y*/)
{
switch(key) {
case GLUT_KEY_DOWN:
Increment();
break;
case GLUT_KEY_UP:
Decrement();
break;
case GLUT_KEY_RIGHT:
GetCurrent()->Increment();
break;
case GLUT_KEY_LEFT:
GetCurrent()->Decrement();
break;
case GLUT_KEY_HOME:
GetCurrent()->Reset();
break;
case GLUT_KEY_END:
GetCurrent()->SetPercentage(1.0);
break;
}
glutPostRedisplay();
}

Extras/CUDA/paramgl.h Normal file

@@ -0,0 +1,54 @@
/*
ParamListGL
- class derived from ParamList to do simple OpenGL rendering of a parameter list
sgg 8/2001
*/
#ifndef PARAMGL_H
#define PARAMGL_H
#if defined(__APPLE__) || defined(MACOSX)
#include <GLUT/glut.h>
#else
#include <GL/glut.h>
#endif
#include <param.h>
void beginWinCoords();
void endWinCoords();
void glPrint(int x, int y, const char *s, void *font);
void glPrintShadowed(int x, int y, const char *s, void *font, float *color);
class ParamListGL : public ParamList {
public:
ParamListGL(char *name = "");
void Render(int x, int y, bool shadow = false);
bool Mouse(int x, int y, int button=GLUT_LEFT_BUTTON, int state=GLUT_DOWN);
bool Motion(int x, int y);
void Special(int key, int x, int y);
void SetSelectedColor(float r, float g, float b) { text_col_selected[0] = r; text_col_selected[1] = g; text_col_selected[2] = b; }
void SetUnSelectedColor(float r, float g, float b) { text_col_unselected[0] = r; text_col_unselected[1] = g; text_col_unselected[2] = b; }
int bar_x;
int bar_w;
int bar_h;
int text_x;
int separation;
int value_x;
int font_h;
int start_x, start_y;
int bar_offset;
float text_col_selected[3];
float text_col_unselected[3];
float text_col_shadow[3];
float bar_col_outer[3];
float bar_col_inner[3];
void *font;
};
#endif


@@ -0,0 +1,280 @@
/*
* Copyright 1993-2006 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO USER:
*
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws.
*
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
* OR PERFORMANCE OF THIS SOURCE CODE.
*
* U.S. Government End Users. This source code is a "commercial item" as
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
* "commercial computer software" and "commercial computer software
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
* and is provided to the U.S. Government only as a commercial end item.
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
* source code with only those rights set forth herein.
*/
#include "particleSystem.h"
#include "particleSystem.cuh"
#include "radixsort.cuh"
#include "particles_kernel.cuh"
#include <assert.h>
#include <math.h>
#include <memory.h>
#include <cstdio>
#include <cstdlib>
#include <algorithm>
#include <GL/glew.h>
#include <btBulletDynamicsCommon.h>
#include "btCudaBroadphase.h"
#ifndef CUDART_PI_F
#define CUDART_PI_F 3.141592654f
#endif
ParticleSystem::ParticleSystem(uint numParticles, uint3 gridSize) :
m_simulationMode(SIMULATION_BULLET_CPU) // alternative: SIMULATION_CUDA
{
this->m_params.numBodies = numParticles;
this->m_params.m_gridSize = gridSize;
initializeBullet();
}
ParticleSystem::~ParticleSystem()
{
finalizeBullet();
}
#include "../../Demos/OpenGL/GLDebugDrawer.h"
GLDebugDrawer debugDrawer;
void ParticleSystem::initializeBullet()
{
m_collisionConfiguration = new btDefaultCollisionConfiguration();
m_dispatcher = new btCollisionDispatcher(m_collisionConfiguration);
//m_broadphase = new btDbvtBroadphase();
//m_broadphase = new btAxisSweep3(btVector3(-3,-3,-3),btVector3(3,3,3));
m_broadphase = new btCudaBroadphase(m_params,m_params.numBodies+6);
m_constraintSolver=new btSequentialImpulseConstraintSolver();
m_dynamicsWorld = new btDiscreteDynamicsWorld(m_dispatcher,m_broadphase,m_constraintSolver,m_collisionConfiguration);
m_dynamicsWorld->setDebugDrawer(&debugDrawer);
//debugDrawer.setDebugMode(btIDebugDraw::DBG_DrawPairs);
m_dynamicsWorld->setGravity(100*btVector3(m_params.gravity.x,m_params.gravity.y,m_params.gravity.z));
m_dynamicsWorld->getSolverInfo().m_numIterations=1;
btBoxShape* worldBox = new btBoxShape(btVector3(m_params.worldSize.x/2,m_params.worldSize.y/2,m_params.worldSize.z/2));
worldBox->setMargin(0.f);
//create 6 static planes for the world cube
btStaticPlaneShape* planeShape;
btRigidBody* body;
btVector3 worldSize;
int i;
btSphereShape* particleSphere = new btSphereShape(m_params.particleRadius);
particleSphere->setMargin(0.0);
btVector3 localInertia;
particleSphere->calculateLocalInertia(1,localInertia);
float* m_hPos = m_broadphase->getHposPtr();
for (i=0;i<m_params.numBodies;i++)
{
btRigidBody::btRigidBodyConstructionInfo rbci(1.,0,particleSphere,localInertia);
rbci.m_startWorldTransform.setOrigin(btVector3(m_hPos[i*4],m_hPos[i*4+1],m_hPos[i*4+2]));
body = new btRigidBody(rbci);
body->setActivationState(DISABLE_DEACTIVATION);
m_bulletParticles.push_back(body);
m_dynamicsWorld->addRigidBody(body);
}
reset(CONFIG_GRID);
/* for (i=0;i<6;i++)
{
btVector4 planeEq;
worldBox->getPlaneEquation(planeEq,i);
planeShape = new btStaticPlaneShape(-planeEq,planeEq.getW());
planeShape->setMargin(0.f);
btRigidBody::btRigidBodyConstructionInfo rbci(0.f,0,planeShape);
body = new btRigidBody(rbci);
m_dynamicsWorld->addRigidBody(body);
}
*/
}
void ParticleSystem::finalizeBullet()
{
delete m_dynamicsWorld;
delete m_constraintSolver;
delete m_broadphase;
delete m_dispatcher ;
delete m_collisionConfiguration;
}
void
ParticleSystem::update(float deltaTime)
{
assert(m_bInitialized);
switch (m_simulationMode)
{
case SIMULATION_CUDA:
{
m_broadphase->quickHack(deltaTime);
//todo
break;
}
case SIMULATION_BULLET_CPU:
{
m_broadphase->integrate();
///copy particles from device to main memory
{
float* hPosData = m_broadphase->copyBuffersFromDeviceToHost();
float* m_hVel = m_broadphase->getHvelPtr();
m_broadphase->copyBuffersFromHostToDevice();
//sync transform and velocity from particle system to Bullet
for (int i=0;i<m_params.numBodies;i++)
{
btTransform& trans = m_bulletParticles[i]->getWorldTransform();
trans.setOrigin(btVector3(hPosData[i*4],hPosData[i*4+1],hPosData[i*4+2]));
m_bulletParticles[i]->setLinearVelocity(btVector3(m_hVel[i*4],m_hVel[i*4+1],m_hVel[i*4+2])*10.);
}
}
m_dynamicsWorld->stepSimulation(deltaTime);
/* for (int i=0;i<m_numParticles;i++)
{
data[i*4+1] -= 0.001f;
m_hVel[i*4]=0;
m_hVel[i*4+1]=0;
m_hVel[i*4+2]=0;
}
*/
{
float* hPosData = m_broadphase->copyBuffersFromDeviceToHost();
float* m_hVel = m_broadphase->getHvelPtr();
//sync transform and velocity from Bullet to particle system
for (int i=0;i<m_params.numBodies;i++)
{
btTransform& trans = m_bulletParticles[i]->getWorldTransform();
hPosData[i*4] = trans.getOrigin().getX();
hPosData[i*4+1] = trans.getOrigin().getY();
hPosData[i*4+2] = trans.getOrigin().getZ();
m_hVel[i*4] = m_bulletParticles[i]->getLinearVelocity().getX()/10.;
m_hVel[i*4+1] = m_bulletParticles[i]->getLinearVelocity().getY()/10.;
m_hVel[i*4+2] = m_bulletParticles[i]->getLinearVelocity().getZ()/10.;
}
m_broadphase->copyBuffersFromHostToDevice();
}
break;
}
default:
{
printf("unknown simulation method\n");
}
}
}
float* ParticleSystem::getArray(ParticleArray array)
{
return m_broadphase->getArray((btCudaBroadphase::ParticleArray)array);
}
void ParticleSystem::debugDraw()
{
glDisable(GL_DEPTH_TEST);
m_dynamicsWorld->debugDrawWorld();
glEnable(GL_DEPTH_TEST);
}
void ParticleSystem::reset(ParticleConfig config)
{
m_broadphase->reset((btCudaBroadphase::ParticleConfig)config);
for (int i=0;i<m_bulletParticles.size();i++)
{
m_bulletParticles[i]->setAngularVelocity(btVector3(0,0,0));
}
}
void ParticleSystem::addSphere(int start, float *pos, float *vel, int r, float spacing)
{
m_broadphase->addSphere(start,pos,vel,r,spacing);
}
unsigned int ParticleSystem::getCurrentReadBuffer() const
{
return m_broadphase->getCurrentReadBuffer();
}
unsigned int ParticleSystem::getColorBuffer() const
{
return m_broadphase->getColorBuffer();
}
void ParticleSystem::dumpGrid()
{
return m_broadphase->dumpGrid();
}
void ParticleSystem::dumpParticles(uint start, uint count)
{
m_broadphase->dumpParticles(start,count);
}
int ParticleSystem::getNumParticles() const
{
return m_params.numBodies;
}


@@ -0,0 +1,331 @@
/*
* Copyright 1993-2006 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO USER:
*
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws.
*
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
* OR PERFORMANCE OF THIS SOURCE CODE.
*
* U.S. Government End Users. This source code is a "commercial item" as
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
* "commercial computer software" and "commercial computer software
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
* and is provided to the U.S. Government only as a commercial end item.
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
* source code with only those rights set forth herein.
*/
//#include <cutil.h>
#include <cstdlib>
#include <cstdio>
#include <string.h>
#if defined(__APPLE__) || defined(MACOSX)
#include <GLUT/glut.h>
#else
#include <GL/glut.h>
#endif
#include <cuda_gl_interop.h>
#include "particles_kernel.cu"
#include "radixsort.cu"
//! Check for CUDA error
# define CUT_CHECK_ERROR(errorMessage) do { \
cudaError_t err = cudaGetLastError(); \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
exit(EXIT_FAILURE); \
} \
err = cudaThreadSynchronize(); \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
exit(EXIT_FAILURE); \
} } while (0)
# define MY_CUDA_SAFE_CALL_NO_SYNC( call) do { \
cudaError err = call; \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
__FILE__, __LINE__, cudaGetErrorString( err) ); \
exit(EXIT_FAILURE); \
} } while (0)
# define MY_CUDA_SAFE_CALL( call) do { \
MY_CUDA_SAFE_CALL_NO_SYNC(call); \
cudaError err = cudaThreadSynchronize(); \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
__FILE__, __LINE__, cudaGetErrorString( err) ); \
exit(EXIT_FAILURE); \
} } while (0)
extern "C"
{
void cudaInit(int argc, char **argv)
{
//CUT_DEVICE_INIT(argc, argv);
}
void allocateArray(void **devPtr, size_t size)
{
MY_CUDA_SAFE_CALL(cudaMalloc(devPtr, size));
}
void freeArray(void *devPtr)
{
MY_CUDA_SAFE_CALL(cudaFree(devPtr));
}
void threadSync()
{
MY_CUDA_SAFE_CALL(cudaThreadSynchronize());
}
void copyArrayFromDevice(void* host, const void* device, unsigned int vbo, int size)
{
if (vbo)
MY_CUDA_SAFE_CALL(cudaGLMapBufferObject((void**)&device, vbo));
MY_CUDA_SAFE_CALL(cudaMemcpy(host, device, size, cudaMemcpyDeviceToHost));
if (vbo)
MY_CUDA_SAFE_CALL(cudaGLUnmapBufferObject(vbo));
}
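// Note: when 'vbo' is non-zero the 'device' pointer passed in is ignored and
// the copy source is the mapped VBO instead; vbo == 0 copies straight from
// the given device pointer.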
void copyArrayToDevice(void* device, const void* host, int offset, int size)
{
MY_CUDA_SAFE_CALL(cudaMemcpy((char *) device + offset, host, size, cudaMemcpyHostToDevice));
}
void registerGLBufferObject(uint vbo)
{
MY_CUDA_SAFE_CALL(cudaGLRegisterBufferObject(vbo));
}
void unregisterGLBufferObject(uint vbo)
{
MY_CUDA_SAFE_CALL(cudaGLUnregisterBufferObject(vbo));
}
void setParameters(SimParams *hostParams)
{
// copy parameters to constant memory
MY_CUDA_SAFE_CALL( cudaMemcpyToSymbol(params, hostParams, sizeof(SimParams)) );
}
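// 'params' here is assumed to be the __constant__ SimParams symbol declared
// in particles_kernel.cu (included above), so kernels read the same values.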
//Round a / b to nearest higher integer value
int iDivUp(int a, int b){
return (a % b != 0) ? (a / b + 1) : (a / b);
}
// compute grid and thread block size for a given number of elements
void computeGridSize(int n, int blockSize, int &numBlocks, int &numThreads)
{
numThreads = min(blockSize, n);
numBlocks = iDivUp(n, numThreads);
}
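// e.g. n = 1000, blockSize = 256 gives numThreads = 256 and
// numBlocks = iDivUp(1000, 256) = 4, so 4*256 = 1024 threads cover 1000 elements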
void
integrateSystem(uint vboOldPos, uint vboNewPos,
float* oldVel, float* newVel,
float deltaTime,
int numBodies)
{
int numThreads, numBlocks;
computeGridSize(numBodies, 256, numBlocks, numThreads);
float *oldPos, *newPos;
MY_CUDA_SAFE_CALL(cudaGLMapBufferObject((void**)&oldPos, vboOldPos));
MY_CUDA_SAFE_CALL(cudaGLMapBufferObject((void**)&newPos, vboNewPos));
// execute the kernel
integrate<<< numBlocks, numThreads >>>((float4*)newPos, (float4*)newVel,
(float4*)oldPos, (float4*)oldVel,
deltaTime);
// check if kernel invocation generated an error
CUT_CHECK_ERROR("integrate kernel execution failed");
MY_CUDA_SAFE_CALL(cudaGLUnmapBufferObject(vboOldPos));
MY_CUDA_SAFE_CALL(cudaGLUnmapBufferObject(vboNewPos));
}
void
updateGrid(uint vboPos,
uint* gridCounters,
uint* gridCells,
uint numBodies,
uint numCells)
{
int numThreads, numBlocks;
computeGridSize(numBodies, 256, numBlocks, numThreads);
float *pos;
MY_CUDA_SAFE_CALL(cudaGLMapBufferObject((void**)&pos, vboPos));
MY_CUDA_SAFE_CALL(cudaMemset(gridCounters, 0, numCells*sizeof(uint)));
// execute the kernel
updateGridD<<< numBlocks, numThreads >>>((float4 *) pos,
gridCounters,
gridCells);
// check if kernel invocation generated an error
CUT_CHECK_ERROR("Kernel execution failed");
MY_CUDA_SAFE_CALL(cudaGLUnmapBufferObject(vboPos));
}
void
calcHash(uint vboPos,
uint* particleHash,
int numBodies)
{
int numThreads, numBlocks;
computeGridSize(numBodies, 256, numBlocks, numThreads);
float *pos;
MY_CUDA_SAFE_CALL(cudaGLMapBufferObject((void**)&pos, vboPos));
// execute the kernel
calcHashD<<< numBlocks, numThreads >>>((float4 *) pos,
(uint2 *) particleHash);
// check if kernel invocation generated an error
CUT_CHECK_ERROR("Kernel execution failed");
MY_CUDA_SAFE_CALL(cudaGLUnmapBufferObject(vboPos));
}
void
reorderDataAndFindCellStart(uint* particleHash,
uint vboOldPos,
float* oldVel,
float* sortedPos,
float* sortedVel,
uint* cellStart,
uint numBodies,
uint numCells)
{
int numThreads, numBlocks;
computeGridSize(numBodies, 256, numBlocks, numThreads);
MY_CUDA_SAFE_CALL(cudaMemset(cellStart, 0xffffffff, numCells*sizeof(uint)));
float *oldPos;
MY_CUDA_SAFE_CALL(cudaGLMapBufferObject((void**)&oldPos, vboOldPos));
#if USE_TEX
MY_CUDA_SAFE_CALL(cudaBindTexture(0, oldPosTex, oldPos, numBodies*sizeof(float4)));
MY_CUDA_SAFE_CALL(cudaBindTexture(0, oldVelTex, oldVel, numBodies*sizeof(float4)));
#endif
reorderDataAndFindCellStartD<<< numBlocks, numThreads >>>(
(uint2 *) particleHash,
(float4 *) oldPos,
(float4 *) oldVel,
(float4 *) sortedPos,
(float4 *) sortedVel,
(uint *) cellStart);
CUT_CHECK_ERROR("Kernel execution failed: reorderDataAndFindCellStartD");
#if USE_TEX
MY_CUDA_SAFE_CALL(cudaUnbindTexture(oldPosTex));
MY_CUDA_SAFE_CALL(cudaUnbindTexture(oldVelTex));
#endif
MY_CUDA_SAFE_CALL(cudaGLUnmapBufferObject(vboOldPos));
}
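// For reference, a CPU sketch of what reorderDataAndFindCellStartD computes
// from the sorted (cellHash, particleIndex) pairs; this is an illustrative
// helper (hPos/hVel are hypothetical host arrays), not part of the demo:
//   for (uint i = 0; i < numBodies; i++) {
//       if (i == 0 || particleHash[i].x != particleHash[i-1].x)
//           cellStart[particleHash[i].x] = i;      // first entry of this cell
//       sortedPos[i] = hPos[particleHash[i].y];    // gather into sorted order
//       sortedVel[i] = hVel[particleHash[i].y];
//   }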
void
collide(uint vboOldPos, uint vboNewPos,
float* sortedPos, float* sortedVel,
float* oldVel, float* newVel,
uint* gridCounters,
uint* gridCells,
uint* particleHash,
uint* cellStart,
uint numBodies,
uint numCells,
uint maxParticlesPerCell)
{
float4 *oldPos, *newPos;
MY_CUDA_SAFE_CALL(cudaGLMapBufferObject((void**)&oldPos, vboOldPos));
MY_CUDA_SAFE_CALL(cudaGLMapBufferObject((void**)&newPos, vboNewPos));
#if USE_TEX
#if USE_SORT
// use sorted arrays
MY_CUDA_SAFE_CALL(cudaBindTexture(0, oldPosTex, sortedPos, numBodies*sizeof(float4)));
MY_CUDA_SAFE_CALL(cudaBindTexture(0, oldVelTex, sortedVel, numBodies*sizeof(float4)));
MY_CUDA_SAFE_CALL(cudaBindTexture(0, particleHashTex, particleHash, numBodies*sizeof(uint2)));
MY_CUDA_SAFE_CALL(cudaBindTexture(0, cellStartTex, cellStart, numCells*sizeof(uint)));
#else
MY_CUDA_SAFE_CALL(cudaBindTexture(0, oldPosTex, oldPos, numBodies*sizeof(float4)));
MY_CUDA_SAFE_CALL(cudaBindTexture(0, oldVelTex, oldVel, numBodies*sizeof(float4)));
MY_CUDA_SAFE_CALL(cudaBindTexture(0, gridCountersTex, gridCounters,numCells*sizeof(uint)));
MY_CUDA_SAFE_CALL(cudaBindTexture(0, gridCellsTex, gridCells, numCells*maxParticlesPerCell*sizeof(uint)));
#endif
#endif
// thread per particle
int numThreads, numBlocks;
computeGridSize(numBodies, BLOCKDIM, numBlocks, numThreads);
// execute the kernel
collideD<<< numBlocks, numThreads >>>((float4*)newPos, (float4*)newVel,
#if USE_SORT
(float4*)sortedPos, (float4*)sortedVel,
(uint2 *) particleHash,
cellStart
#else
(float4*)oldPos, (float4*)oldVel,
gridCounters,
gridCells
#endif
);
// check if kernel invocation generated an error
CUT_CHECK_ERROR("Kernel execution failed");
MY_CUDA_SAFE_CALL(cudaGLUnmapBufferObject(vboNewPos));
MY_CUDA_SAFE_CALL(cudaGLUnmapBufferObject(vboOldPos));
#if USE_TEX
MY_CUDA_SAFE_CALL(cudaUnbindTexture(oldPosTex));
MY_CUDA_SAFE_CALL(cudaUnbindTexture(oldVelTex));
#if USE_SORT
MY_CUDA_SAFE_CALL(cudaUnbindTexture(particleHashTex));
MY_CUDA_SAFE_CALL(cudaUnbindTexture(cellStartTex));
#else
MY_CUDA_SAFE_CALL(cudaUnbindTexture(gridCountersTex));
MY_CUDA_SAFE_CALL(cudaUnbindTexture(gridCellsTex));
#endif
#endif
}
} // extern "C"


@@ -0,0 +1,57 @@
extern "C"
{
void cudaInit(int argc, char **argv);
void allocateArray(void **devPtr, int size);
void freeArray(void *devPtr);
void threadSync();
void copyArrayFromDevice(void* host, const void* device, unsigned int vbo, int size);
void copyArrayToDevice(void* device, const void* host, int offset, int size);
void registerGLBufferObject(unsigned int vbo);
void unregisterGLBufferObject(unsigned int vbo);
void setParameters(SimParams *hostParams);
void
integrateSystem(uint vboOldPos, uint vboNewPos,
float* oldVel, float* newVel,
float deltaTime,
int numBodies);
void
updateGrid(uint vboPos,
uint* gridCounters,
uint* gridCells,
uint numBodies,
uint numCells);
void
calcHash(uint vboPos,
uint* particleHash,
int numBodies);
void
reorderDataAndFindCellStart(uint* particleHash,
uint vboOldPos,
float* oldVel,
float* sortedPos,
float* sortedVel,
uint* cellStart,
uint numBodies,
uint numCells);
void
collide(uint vboOldPos, uint vboNewPos,
float* sortedPos, float* sortedVel,
float* oldVel, float* newVel,
uint* gridCounters,
uint* gridCells,
uint* particleHash,
uint* cellStart,
uint numBodies,
uint numCells,
uint maxParticlesPerCell);
}
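// Expected per-frame call order for the sorted-grid path (USE_SORT), as a
// sketch of how the particle system drives these entry points:
//   integrateSystem(...);             // advance positions and velocities
//   calcHash(...);                    // one (cellHash, index) pair per particle
//   RadixSort(...);                   // sort the pairs by cell hash (radixsort.cu)
//   reorderDataAndFindCellStart(...); // gather data + build cell start table
//   collide(...);                     // resolve collisions against neighbours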


@@ -0,0 +1,153 @@
/*
* Copyright 1993-2006 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO USER:
*
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws.
*
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
* OR PERFORMANCE OF THIS SOURCE CODE.
*
* U.S. Government End Users. This source code is a "commercial item" as
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
* "commercial computer software" and "commercial computer software
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
* and is provided to the U.S. Government only as a commercial end item.
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
* source code with only those rights set forth herein.
*/
#ifndef __BODYSYSTEMCUDA_H__
#define __BODYSYSTEMCUDA_H__
#define DEBUG_GRID 0
#define DO_TIMING 0
#include "particles_kernel.cuh"
#include "vector_functions.h"
#include "LinearMath/btAlignedObjectArray.h"
// CUDA BodySystem: runs on the GPU
class ParticleSystem
{
public:
ParticleSystem(uint numParticles, uint3 gridSize);
~ParticleSystem();
enum ParticleArray
{
POSITION,
VELOCITY,
};
enum ParticleConfig
{
CONFIG_RANDOM,
CONFIG_GRID,
_NUM_CONFIGS
};
enum SimulationMode
{
SIMULATION_CUDA,
SIMULATION_BULLET_CPU,
SIMULATION_NUM_MODES
};
void reset(ParticleConfig config);
void debugDraw();
///
///Bullet data
///
void initializeBullet();
void finalizeBullet();
class btDiscreteDynamicsWorld* m_dynamicsWorld;
class btDefaultCollisionConfiguration* m_collisionConfiguration;
class btCollisionDispatcher* m_dispatcher;
class btCudaBroadphase* m_broadphase;
class btSequentialImpulseConstraintSolver* m_constraintSolver;
btAlignedObjectArray<class btRigidBody*> m_bulletParticles;
void update(float deltaTime);
float* getArray(ParticleArray array);
int getNumParticles() const;
unsigned int getCurrentReadBuffer() const;
unsigned int getColorBuffer() const;
void dumpGrid();
void dumpParticles(uint start, uint count);
void setIterations(int i) { m_solverIterations = i; }
void setDamping(float x) { m_params.globalDamping = x; }
void setGravity(float x) { m_params.gravity = make_float3(0.0f, x, 0.0f); }
void setCollideSpring(float x) { m_params.spring = x; }
void setCollideDamping(float x) { m_params.damping = x; }
void setCollideShear(float x) { m_params.shear = x; }
void setCollideAttraction(float x) { m_params.attraction = x; }
void setColliderPos(float4 x) { m_params.colliderPos = x; }
float getParticleRadius() { return m_params.particleRadius; }
float4 getColliderPos() { return m_params.colliderPos; }
float getColliderRadius() { return m_params.colliderRadius; }
uint3 getGridSize() { return m_params.gridSize; }
float3 getWorldOrigin() { return m_params.worldOrigin; }
float3 getCellSize() { return m_params.cellSize; }
void addSphere(int index, float *pos, float *vel, int r, float spacing);
SimulationMode getSimulationMode() const
{
return m_simulationMode;
}
void setSimulationMode(SimulationMode mode)
{
m_simulationMode=mode;
}
protected: // methods
ParticleSystem()
: m_simulationMode(SIMULATION_CUDA)
{}
void initGrid(uint *size, float spacing, float jitter, uint numParticles);
protected: // data
bool m_bInitialized;
// params
SimParams m_params;
uint m_maxParticlesPerCell;
uint m_timer;
uint m_solverIterations;
SimulationMode m_simulationMode;
};
#endif // __BODYSYSTEMCUDA_H__
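// Minimal usage sketch, mirroring main() in particles.cpp below:
//   uint3 gridSize = make_uint3(64, 64, 64);
//   ParticleSystem* psystem = new ParticleSystem(1024, gridSize);
//   psystem->reset(ParticleSystem::CONFIG_GRID);
//   psystem->setSimulationMode(ParticleSystem::SIMULATION_BULLET_CPU);
//   psystem->update(0.5f);   // one step at the demo's default timestep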

544
Extras/CUDA/particles.cpp Normal file

@@ -0,0 +1,544 @@
/*
* Copyright 1993-2007 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO USER:
*
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws.
*
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
* OR PERFORMANCE OF THIS SOURCE CODE.
*
* U.S. Government End Users. This source code is a "commercial item" as
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
* "commercial computer software" and "commercial computer software
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
* and is provided to the U.S. Government only as a commercial end item.
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
* source code with only those rights set forth herein.
*/
/*
Particle system example with collisions using uniform grid
*/
#include <cstdlib>
#include <cstdio>
#include <algorithm>
#include <math.h>
//#include <cutil.h>
#include <GL/glew.h>
#if defined(__APPLE__) || defined(MACOSX)
#include <GLUT/glut.h>
#else
#include <GL/glut.h>
#endif
#include "particleSystem.h"
#include "render_particles.h"
#include "paramgl.h"
// view params
int ox, oy;
int buttonState = 0;
float camera_trans[] = {0, 0, -3};
float camera_rot[] = {0, 0, 0};
float camera_trans_lag[] = {0, 0, -3};
float camera_rot_lag[] = {0, 0, 0};
const float inertia = 0.1;
ParticleRenderer::DisplayMode displayMode = ParticleRenderer::PARTICLE_SPHERES;
int mode = 0;
bool displayEnabled = true;
bool bPause = false;
bool displaySliders = false;
bool wireframe = false;
enum { M_VIEW = 0, M_MOVE };
uint numParticles = 0;
uint3 gridSize;
int numIterations = 0; // run until exit
// simulation parameters
float timestep = 0.5f;
float damping = 1.0f;
float gravity = 0.0003f;
int iterations = 1;
int ballr = 10;
float collideSpring = 0.5f;
float collideDamping = 0.02f;
float collideShear = 0.1f;
float collideAttraction = 0.0f;
ParticleSystem *psystem = 0;
// fps
ParticleRenderer *renderer = 0;
float modelView[16];
ParamListGL *params;
extern "C" void cudaInit(int argc, char **argv);
void init(int numParticles, uint3 gridSize)
{
psystem = new ParticleSystem(numParticles, gridSize);
psystem->reset(ParticleSystem::CONFIG_GRID);
renderer = new ParticleRenderer;
renderer->setParticleRadius(psystem->getParticleRadius());
renderer->setColorBuffer(psystem->getColorBuffer());
}
void initGL()
{
glewInit();
if (!glewIsSupported("GL_VERSION_2_0 GL_VERSION_1_5 GL_ARB_multitexture GL_ARB_vertex_buffer_object")) {
fprintf(stderr, "Required OpenGL extensions missing.\n");
exit(-1);
}
glEnable(GL_DEPTH_TEST);
glClearColor(0.25, 0.25, 0.25, 1.0);
glutReportErrors();
}
void display()
{
// update the simulation
if (!bPause)
{
psystem->setIterations(iterations);
psystem->setDamping(damping);
psystem->setGravity(-gravity);
psystem->setCollideSpring(collideSpring);
psystem->setCollideDamping(collideDamping);
psystem->setCollideShear(collideShear);
psystem->setCollideAttraction(collideAttraction);
psystem->update(timestep);
renderer->setVertexBuffer(psystem->getCurrentReadBuffer(), psystem->getNumParticles());
float* posArray = psystem->getArray(ParticleSystem::POSITION);
renderer->setPositions(posArray,psystem->getNumParticles());
}
// render
glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
// view transform
glMatrixMode(GL_MODELVIEW);
glLoadIdentity();
for (int c = 0; c < 3; ++c)
{
camera_trans_lag[c] += (camera_trans[c] - camera_trans_lag[c]) * inertia;
camera_rot_lag[c] += (camera_rot[c] - camera_rot_lag[c]) * inertia;
}
glTranslatef(camera_trans_lag[0], camera_trans_lag[1], camera_trans_lag[2]);
glRotatef(camera_rot_lag[0], 1.0, 0.0, 0.0);
glRotatef(camera_rot_lag[1], 0.0, 1.0, 0.0);
glGetFloatv(GL_MODELVIEW_MATRIX, modelView);
// cube
glColor3f(1.0, 1.0, 1.0);
glutWireCube(2.0);
// collider
glPushMatrix();
float4 p = psystem->getColliderPos();
glTranslatef(p.x, p.y, p.z);
glColor3f(1.0, 0.0, 0.0);
glutSolidSphere(psystem->getColliderRadius(), 20, 10);
glPopMatrix();
if (displayEnabled)
{
renderer->display(displayMode);
}
if (displaySliders) {
glDisable(GL_DEPTH_TEST);
glBlendFunc(GL_ONE_MINUS_DST_COLOR, GL_ZERO); // invert color
glEnable(GL_BLEND);
params->Render(0, 0);
glDisable(GL_BLEND);
glEnable(GL_DEPTH_TEST);
}
psystem->debugDraw();
glutSwapBuffers();
{
char fps[256];
//float ifps = 1.f / (cutGetAverageTimerValue(timer) / 1000.f);
switch (psystem->getSimulationMode())
{
case ParticleSystem::SIMULATION_CUDA:
{
sprintf(fps, "CUDA particles (%d particles)", numParticles);
break;
}
case ParticleSystem::SIMULATION_BULLET_CPU:
{
sprintf(fps, "Bullet btCudaBroadphase (%d btSphereShapes)", numParticles);
break;
}
default:
{
sprintf(fps, "Unknown simulation mode");
}
}
glutSetWindowTitle(fps);
}
glutReportErrors();
}
void reshape(int w, int h)
{
glMatrixMode(GL_PROJECTION);
glLoadIdentity();
gluPerspective(60.0, (float) w / (float) h, 0.1, 10.0);
glMatrixMode(GL_MODELVIEW);
glViewport(0, 0, w, h);
renderer->setWindowSize(w, h);
renderer->setFOV(60.0);
}
void mouse(int button, int state, int x, int y)
{
int mods;
if (state == GLUT_DOWN)
buttonState |= 1<<button;
else if (state == GLUT_UP)
buttonState = 0;
mods = glutGetModifiers();
if (mods & GLUT_ACTIVE_SHIFT) {
buttonState = 2;
} else if (mods & GLUT_ACTIVE_CTRL) {
buttonState = 3;
}
ox = x; oy = y;
if (displaySliders) {
if (params->Mouse(x, y, button, state)) {
glutPostRedisplay();
return;
}
}
glutPostRedisplay();
}
// transform vector by matrix
void xform(float *v, float *r, GLfloat *m)
{
r[0] = v[0]*m[0] + v[1]*m[4] + v[2]*m[8] + m[12];
r[1] = v[0]*m[1] + v[1]*m[5] + v[2]*m[9] + m[13];
r[2] = v[0]*m[2] + v[1]*m[6] + v[2]*m[10] + m[14];
}
// transform vector by transpose of matrix
void ixform(float *v, float *r, GLfloat *m)
{
r[0] = v[0]*m[0] + v[1]*m[1] + v[2]*m[2];
r[1] = v[0]*m[4] + v[1]*m[5] + v[2]*m[6];
r[2] = v[0]*m[8] + v[1]*m[9] + v[2]*m[10];
}
void ixformPoint(float *v, float *r, GLfloat *m)
{
float x[4];
x[0] = v[0] - m[12];
x[1] = v[1] - m[13];
x[2] = v[2] - m[14];
x[3] = 1.0f;
ixform(x, r, m);
}
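// Note: with OpenGL's column-major modelView M = [R | t], xform computes
// r = R*v + t, ixform computes r = R^T*v, and ixformPoint computes
// r = R^T*(v - t), i.e. the inverse of a rigid transform. That is why the
// demo uses ixform/ixformPoint to map camera-space vectors into world space.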
void motion(int x, int y)
{
float dx, dy;
dx = x - ox;
dy = y - oy;
if (displaySliders) {
if (params->Motion(x, y)) {
ox = x; oy = y;
glutPostRedisplay();
return;
}
}
switch(mode)
{
case M_VIEW:
if (buttonState == 3) {
// left+middle = zoom
camera_trans[2] += (dy / 100.0) * 0.5 * fabs(camera_trans[2]);
}
else if (buttonState & 2) {
// middle = translate
camera_trans[0] += dx / 100.0;
camera_trans[1] -= dy / 100.0;
}
else if (buttonState & 1) {
// left = rotate
camera_rot[0] += dy / 5.0;
camera_rot[1] += dx / 5.0;
}
break;
case M_MOVE:
{
float translateSpeed = 0.003f;
float4 p = psystem->getColliderPos();
if (buttonState==1) {
float v[3], r[3];
v[0] = dx*translateSpeed;
v[1] = -dy*translateSpeed;
v[2] = 0.0f;
ixform(v, r, modelView);
p.x += r[0];
p.y += r[1];
p.z += r[2];
} else if (buttonState==2) {
float v[3], r[3];
v[0] = 0.0f;
v[1] = 0.0f;
v[2] = dy*translateSpeed;
ixform(v, r, modelView);
p.x += r[0];
p.y += r[1];
p.z += r[2];
}
psystem->setColliderPos(p);
}
break;
}
ox = x; oy = y;
glutPostRedisplay();
}
inline float frand()
{
return rand() / (float) RAND_MAX;
}
// x/y parameter names commented out to remove unused-parameter warnings on Linux
void key(unsigned char key, int /*x*/, int /*y*/)
{
switch (key)
{
case ' ':
bPause = !bPause;
break;
case 13:
psystem->update(timestep);
renderer->setVertexBuffer(psystem->getCurrentReadBuffer(), psystem->getNumParticles());
break;
case '\033':
case 'q':
exit(0);
break;
case 'v':
mode = M_VIEW;
break;
case 'm':
mode = M_MOVE;
break;
case 's':
psystem->setSimulationMode((ParticleSystem::SimulationMode) ((psystem->getSimulationMode() + 1) % ParticleSystem::SIMULATION_NUM_MODES));
break;
case 'p':
displayMode = (ParticleRenderer::DisplayMode) ((displayMode + 1) % ParticleRenderer::PARTICLE_NUM_MODES);
break;
case 'd':
psystem->dumpGrid();
break;
case 'u':
psystem->dumpParticles(0, 1);
break;
case 'r':
displayEnabled = !displayEnabled;
break;
case '1':
psystem->reset(ParticleSystem::CONFIG_GRID);
break;
case '2':
psystem->reset(ParticleSystem::CONFIG_RANDOM);
break;
case '3':
{
// inject a sphere of particles
float pr = psystem->getParticleRadius();
float tr = pr+(pr*2.0f)*ballr;
float pos[4], vel[4];
pos[0] = -1.0 + tr + frand()*(2.0f - tr*2.0f);
pos[1] = 1.0f - tr;
pos[2] = -1.0 + tr + frand()*(2.0f - tr*2.0f);
pos[3] = 0.0f;
vel[0] = vel[1] = vel[2] = vel[3] = 0.0f;
psystem->addSphere(0, pos, vel, ballr, pr*2.0f);
}
break;
case '4':
{
// shoot ball from camera
float pr = psystem->getParticleRadius();
float vel[4], velw[4], pos[4], posw[4];
vel[0] = 0.0f;
vel[1] = 0.0f;
vel[2] = -0.05f;
vel[3] = 0.0f;
ixform(vel, velw, modelView);
pos[0] = 0.0f;
pos[1] = 0.0f;
pos[2] = -2.5f;
pos[3] = 1.0;
ixformPoint(pos, posw, modelView);
posw[3] = 0.0f;
psystem->addSphere(0, posw, velw, ballr, pr*2.0f);
}
break;
case 'w':
wireframe = !wireframe;
break;
case 'h':
displaySliders = !displaySliders;
break;
}
glutPostRedisplay();
}
void special(int k, int x, int y)
{
if (displaySliders) {
params->Special(k, x, y);
}
}
void idle(void)
{
glutPostRedisplay();
}
void initParams()
{
// create a new parameter list
params = new ParamListGL("misc");
params->AddParam(new Param<float>("time step", timestep, 0.0, 1.0, 0.01, &timestep));
params->AddParam(new Param<int>("iterations", iterations, 0, 10, 1, &iterations));
params->AddParam(new Param<float>("damping", damping, 0.0, 1.0, 0.001, &damping));
params->AddParam(new Param<float>("gravity", gravity, 0.0, 0.001, 0.0001, &gravity));
params->AddParam(new Param<int>("ball r", ballr, 1, 20, 1, &ballr));
params->AddParam(new Param<float>("collide spring", collideSpring, 0.0, 1.0, 0.001, &collideSpring));
params->AddParam(new Param<float>("collide damping", collideDamping, 0.0, 0.1, 0.001, &collideDamping));
params->AddParam(new Param<float>("collide shear", collideShear, 0.0, 0.1, 0.001, &collideShear));
params->AddParam(new Param<float>("collide attract", collideAttraction, 0.0, 0.1, 0.001, &collideAttraction));
}
void mainMenu(int i)
{
key((unsigned char) i, 0, 0);
}
void initMenus()
{
glutCreateMenu(mainMenu);
glutAddMenuEntry("Reset block [1]", '1');
glutAddMenuEntry("Reset random [2]", '2');
glutAddMenuEntry("Add sphere [3]", '3');
glutAddMenuEntry("View mode [v]", 'v');
glutAddMenuEntry("Move cursor mode [m]", 'm');
glutAddMenuEntry("Toggle point rendering [p]", 'p');
glutAddMenuEntry("Toggle Bullet simulation[s]", 's');
glutAddMenuEntry("Toggle animation [ ]", ' ');
glutAddMenuEntry("Step animation [ret]", 13);
glutAddMenuEntry("Toggle sliders [h]", 'h');
glutAddMenuEntry("Quit (esc)", '\033');
glutAttachMenu(GLUT_RIGHT_BUTTON);
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char** argv)
{
numParticles = 1024; //1024;//64;//16380;//32768;
uint gridDim = 64;
numIterations = 0;
gridSize.x = gridSize.y = gridSize.z = gridDim;
printf("grid: %d x %d x %d = %d cells\n", gridSize.x, gridSize.y, gridSize.z, gridSize.x*gridSize.y*gridSize.z);
bool benchmark = false;
cudaInit(argc, argv);
glutInit(&argc, argv);
glutInitDisplayMode(GLUT_RGB | GLUT_DEPTH | GLUT_DOUBLE);
glutInitWindowSize(640, 480);
glutCreateWindow("CUDA particles");
initGL();
init(numParticles, gridSize);
initParams();
initMenus();
glutDisplayFunc(display);
glutReshapeFunc(reshape);
glutMouseFunc(mouse);
glutMotionFunc(motion);
glutKeyboardFunc(key);
glutSpecialFunc(special);
glutIdleFunc(idle);
glutMainLoop();
if (psystem)
delete psystem;
return 0;
}


@@ -0,0 +1,381 @@
/*
* Copyright 1993-2006 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO USER:
*
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws.
*
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
* OR PERFORMANCE OF THIS SOURCE CODE.
*
* U.S. Government End Users. This source code is a "commercial item" as
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
* "commercial computer software" and "commercial computer software
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
* and is provided to the U.S. Government only as a commercial end item.
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
* source code with only those rights set forth herein.
*/
/*
* Device code.
*/
#ifndef _PARTICLES_KERNEL_H_
#define _PARTICLES_KERNEL_H_
#include <stdio.h>
#include <math.h>
#include "cutil_math.h"
#include "math_constants.h"
#include "particles_kernel.cuh"
#if USE_TEX
// textures for particle position and velocity
texture<float4, 1, cudaReadModeElementType> oldPosTex;
texture<float4, 1, cudaReadModeElementType> oldVelTex;
texture<uint2, 1, cudaReadModeElementType> particleHashTex;
texture<uint, 1, cudaReadModeElementType> cellStartTex;
texture<uint, 1, cudaReadModeElementType> gridCountersTex;
texture<uint, 1, cudaReadModeElementType> gridCellsTex;
#endif
__constant__ SimParams params;
// integrate particle attributes
__global__ void
integrate(float4* newPos, float4* newVel,
float4* oldPos, float4* oldVel,
float deltaTime)
{
int index = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
float4 pos4 = oldPos[index];
float4 vel4 = oldVel[index];
float3 pos = make_float3(pos4);
float3 vel = make_float3(vel4);
vel += params.gravity * deltaTime;
vel *= params.globalDamping;
// new position = old position + velocity * deltaTime
pos += vel * deltaTime;
// bounce off cube sides
if (pos.x > 1.0f - params.particleRadius) { pos.x = 1.0f - params.particleRadius; vel.x *= params.boundaryDamping; }
if (pos.x < -1.0f + params.particleRadius) { pos.x = -1.0f + params.particleRadius; vel.x *= params.boundaryDamping;}
if (pos.y > 1.0f - params.particleRadius) { pos.y = 1.0f - params.particleRadius; vel.y *= params.boundaryDamping; }
if (pos.y < -1.0f + params.particleRadius) { pos.y = -1.0f + params.particleRadius; vel.y *= params.boundaryDamping;}
if (pos.z > 1.0f - params.particleRadius) { pos.z = 1.0f - params.particleRadius; vel.z *= params.boundaryDamping; }
if (pos.z < -1.0f + params.particleRadius) { pos.z = -1.0f + params.particleRadius; vel.z *= params.boundaryDamping;}
// store new position and velocity
newPos[index] = make_float4(pos, pos4.w);
newVel[index] = make_float4(vel, vel4.w);
}
// calculate position in uniform grid
__device__ int3 calcGridPos(float4 p)
{
int3 gridPos;
gridPos.x = floor((p.x - params.worldOrigin.x) / params.cellSize.x);
gridPos.y = floor((p.y - params.worldOrigin.y) / params.cellSize.y);
gridPos.z = floor((p.z - params.worldOrigin.z) / params.cellSize.z);
return gridPos;
}
// calculate address in grid from position (clamping to edges)
__device__ uint calcGridHash(int3 gridPos)
{
gridPos.x = max(0, min(gridPos.x, params.gridSize.x-1));
gridPos.y = max(0, min(gridPos.y, params.gridSize.y-1));
gridPos.z = max(0, min(gridPos.z, params.gridSize.z-1));
return __mul24(__mul24(gridPos.z, params.gridSize.y), params.gridSize.x) + __mul24(gridPos.y, params.gridSize.x) + gridPos.x;
}
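// Example: with gridSize = (64,64,64), cell (x,y,z) = (1,2,3) hashes to
// (3*64 + 2)*64 + 1 = 12417. The clamping above pins out-of-range positions
// to the boundary cells instead of wrapping around the grid.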
// add particle to cell using atomics
__device__ void addParticleToCell(int3 gridPos,
uint index,
uint* gridCounters,
uint* gridCells)
{
// calculate grid hash
uint gridHash = calcGridHash(gridPos);
// increment cell counter using atomics
#if defined CUDA_NO_SM_11_ATOMIC_INTRINSICS
int counter = 0;
#else
int counter = atomicAdd(&gridCounters[gridHash], 1); // returns previous value
counter = min(counter, params.maxParticlesPerCell-1);
#endif
// write particle index into this cell (very uncoalesced!)
gridCells[gridHash*params.maxParticlesPerCell + counter] = index;
}
// update uniform grid
__global__ void
updateGridD(float4* pos,
uint* gridCounters,
uint* gridCells)
{
int index = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
float4 p = pos[index];
// get address in grid
int3 gridPos = calcGridPos(p);
addParticleToCell(gridPos, index, gridCounters, gridCells);
}
// calculate grid hash value for each particle
__global__ void
calcHashD(float4* pos,
uint2* particleHash)
{
int index = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
float4 p = pos[index];
// get address in grid
int3 gridPos = calcGridPos(p);
uint gridHash = calcGridHash(gridPos);
// store grid hash and particle index
particleHash[index] = make_uint2(gridHash, index);
}
// rearrange particle data into sorted order, and find the start of each cell in the
// sorted hash array
__global__ void
reorderDataAndFindCellStartD(uint2* particleHash, // particle id sorted by hash
float4* oldPos,
float4* oldVel,
float4* sortedPos,
float4* sortedVel,
uint* cellStart)
{
int index = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
uint2 sortedData = particleHash[index];
// Load hash data into shared memory so that we can look
// at neighboring particle's hash value without loading
// two hash values per thread
__shared__ uint sharedHash[257];
sharedHash[threadIdx.x+1] = sortedData.x;
if (index > 0 && threadIdx.x == 0)
{
// first thread in block must load neighbor particle hash
volatile uint2 prevData = particleHash[index-1];
sharedHash[0] = prevData.x;
}
__syncthreads();
if (index == 0 || sortedData.x != sharedHash[threadIdx.x])
{
cellStart[sortedData.x] = index;
}
// Now use the sorted index to reorder the pos and vel data
float4 pos = FETCH(oldPos, sortedData.y); // macro does either global read or texture fetch
float4 vel = FETCH(oldVel, sortedData.y); // see particles_kernel.cuh
sortedPos[index] = pos;
sortedVel[index] = vel;
}
// collide two spheres using DEM method
__device__ float3 collideSpheres(float4 posA, float4 posB,
float4 velA, float4 velB,
float radiusA, float radiusB,
float attraction)
{
// calculate relative position
float3 relPos;
relPos.x = posB.x - posA.x;
relPos.y = posB.y - posA.y;
relPos.z = posB.z - posA.z;
float dist = length(relPos);
float collideDist = radiusA + radiusB;
float3 force = make_float3(0.0f);
if (dist < collideDist) {
float3 norm = relPos / dist;
// relative velocity
float3 relVel;
relVel.x = velB.x - velA.x;
relVel.y = velB.y - velA.y;
relVel.z = velB.z - velA.z;
// relative tangential velocity
float3 tanVel = relVel - (dot(relVel, norm) * norm);
// spring force
force = -params.spring*(collideDist - dist) * norm;
// dashpot (damping) force
force += params.damping*relVel;
// tangential shear force
force += params.shear*tanVel;
// attraction
force += attraction*relPos;
}
return force;
}
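// In equation form, the DEM contact model above is
//   F = -k_spring * (collideDist - |relPos|) * n  +  k_damp * relVel
//       +  k_shear * tanVel  +  attraction * relPos,   with n = relPos / |relPos|.
// The spring term pushes overlapping spheres apart; the dashpot and shear
// terms dissipate energy.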
// collide particle with all particles in a given cell
// version using grid built with atomics
__device__
float3 collideCell(int3 gridPos,
uint index,
float4 pos,
float4 vel,
float4* oldPos,
float4* oldVel,
uint* gridCounters,
uint* gridCells)
{
float3 force = make_float3(0.0f);
if ((gridPos.x < 0) || (gridPos.x > params.gridSize.x-1) ||
(gridPos.y < 0) || (gridPos.y > params.gridSize.y-1) ||
(gridPos.z < 0) || (gridPos.z > params.gridSize.z-1)) {
return force;
}
uint gridHash = calcGridHash(gridPos);
// iterate over particles in this cell
uint particlesInCell = FETCH(gridCounters, gridHash);
particlesInCell = min(particlesInCell, params.maxParticlesPerCell-1);
for(uint i=0; i<particlesInCell; i++) {
uint index2 = FETCH(gridCells, gridHash*params.maxParticlesPerCell + i);
if (index2 != index) { // check not colliding with self
float4 pos2 = FETCH(oldPos, index2);
float4 vel2 = FETCH(oldVel, index2);
// collide two spheres
float3 projVec = collideSpheres(pos, pos2, vel, vel2, params.particleRadius, params.particleRadius, params.attraction);
force += projVec;
}
}
return force;
}
// version using sorted grid
__device__
float3 collideCell2(int3 gridPos,
uint index,
float4 pos,
float4 vel,
float4* oldPos,
float4* oldVel,
uint2* particleHash,
uint* cellStart)
{
float3 force = make_float3(0.0f);
if ((gridPos.x < 0) || (gridPos.x > params.gridSize.x-1) ||
(gridPos.y < 0) || (gridPos.y > params.gridSize.y-1) ||
(gridPos.z < 0) || (gridPos.z > params.gridSize.z-1)) {
return force;
}
uint gridHash = calcGridHash(gridPos);
// get start of bucket for this cell
uint bucketStart = FETCH(cellStart, gridHash);
if (bucketStart == 0xffffffff)
return force; // cell empty
// iterate over particles in this cell
for(uint i=0; i<params.maxParticlesPerCell; i++) {
uint index2 = bucketStart + i;
uint2 cellData = FETCH(particleHash, index2);
if (cellData.x != gridHash) break; // no longer in same bucket
if (index2 != index) { // check not colliding with self
float4 pos2 = FETCH(oldPos, index2);
float4 vel2 = FETCH(oldVel, index2);
// collide two spheres
float3 projVec = collideSpheres(pos, pos2, vel, vel2, params.particleRadius, params.particleRadius, params.attraction);
force += projVec;
}
}
return force;
}
__global__ void
collideD(float4* newPos, float4* newVel,
float4* oldPos, float4* oldVel,
#if USE_SORT
uint2* particleHash,
uint* cellStart
#else
uint* gridCounters,
uint* gridCells
#endif
)
{
int index = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
// read particle data from sorted arrays
float4 pos = FETCH(oldPos, index);
float4 vel = FETCH(oldVel, index);
// get address in grid
int3 gridPos = calcGridPos(pos);
float3 force = make_float3(0.0f);
// examine only neighbouring cells
for(int z=-1; z<=1; z++) {
for(int y=-1; y<=1; y++) {
for(int x=-1; x<=1; x++) {
#if USE_SORT
force += collideCell2(gridPos + make_int3(x, y, z), index, pos, vel, oldPos, oldVel, particleHash, cellStart);
#else
force += collideCell(gridPos + make_int3(x, y, z), index, pos, vel, oldPos, oldVel, gridCounters, gridCells);
#endif
}
}
}
float3 projVec = collideSpheres(pos, params.colliderPos, vel, make_float4(0.0f, 0.0f, 0.0f, 0.0f), params.particleRadius, params.colliderRadius, 0.0f);
force += projVec;
#if USE_SORT
// write new velocity back to original unsorted location
volatile uint2 sortedData = particleHash[index];
newVel[sortedData.y] = vel + make_float4(force, 0.0f);
#else
newVel[index] = vel + make_float4(force, 0.0f);
#endif
}
#endif


@@ -0,0 +1,47 @@
#ifndef PARTICLES_KERNEL_H
#define PARTICLES_KERNEL_H
#define BLOCKDIM 64
#define USE_SORT 1
#ifndef __DEVICE_EMULATION__
#define USE_TEX 1
#endif
#ifdef USE_TEX
#define FETCH(t, i) tex1Dfetch(t##Tex, i)
#else
#define FETCH(t, i) t[i]
#endif
#include "vector_types.h"
typedef unsigned int uint;
struct SimParams {
float4 colliderPos;
float colliderRadius;
float3 gravity;
float globalDamping;
float particleRadius;
uint3 gridSize;
uint numCells;
float3 worldOrigin;
float3 cellSize;
float3 worldSize;
uint3 m_gridSize;
uint numBodies;
uint maxParticlesPerCell;
float spring;
float damping;
float shear;
float attraction;
float boundaryDamping;
};
#endif

79
Extras/CUDA/radixsort.cu Normal file

@@ -0,0 +1,79 @@
/*
* Copyright 1993-2006 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO USER:
*
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws.
*
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
* OR PERFORMANCE OF THIS SOURCE CODE.
*
* U.S. Government End Users. This source code is a "commercial item" as
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
* "commercial computer software" and "commercial computer software
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
* and is provided to the U.S. Government only as a commercial end item.
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
* source code with only those rights set forth herein.
*/
/* Radixsort project with key/value and arbitrary dataset size support
* which demonstrates the use of CUDA in a multi phase sorting
* computation.
* Host code.
*/
#include "radixsort.cuh"
#include "radixsort_kernel.cu"
extern "C"
{
////////////////////////////////////////////////////////////////////////////////
//! Perform a radix sort
//! Sorting performed in place on passed arrays.
//!
//! @param pData0 input and output array - data will be sorted
//! @param pData1 additional array to allow ping pong computation
//! @param elements number of elements to sort
////////////////////////////////////////////////////////////////////////////////
void RadixSort(KeyValuePair *pData0, KeyValuePair *pData1, uint elements, uint bits)
{
// Round element count to total number of threads for efficiency
uint elements_rounded_to_3072;
int modval = elements % 3072;
if( modval == 0 )
elements_rounded_to_3072 = elements;
else
elements_rounded_to_3072 = elements + (3072 - (modval));
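// Example: elements = 5000 gives modval = 1928, so
// elements_rounded_to_3072 = 5000 + (3072 - 1928) = 6144; 3072 is the
// total thread count (NUM_SMS * NUM_THREADS_PER_SM in radixsort_kernel.cu).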
// Iterate over the bytes of the key, using each byte to sort the list in turn
for (uint shift = 0; shift < bits; shift += RADIX)
{
// Perform one round of radix sorting
// Generate per radix group sums radix counts across a radix group
RadixSum<<<NUM_BLOCKS, NUM_THREADS_PER_BLOCK, GRFSIZE>>>(pData0, elements, elements_rounded_to_3072, shift);
// Prefix sum in radix groups, and then between groups throughout a block
RadixPrefixSum<<<PREFIX_NUM_BLOCKS, PREFIX_NUM_THREADS_PER_BLOCK, PREFIX_GRFSIZE>>>();
// Sum the block offsets and then shuffle data into bins
RadixAddOffsetsAndShuffle<<<NUM_BLOCKS, NUM_THREADS_PER_BLOCK, SHUFFLE_GRFSIZE>>>(pData0, pData1, elements, elements_rounded_to_3072, shift);
// Exchange data pointers
KeyValuePair* pTemp = pData0;
pData0 = pData1;
pData1 = pTemp;
}
}
}
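// Usage sketch (an assumption about the caller, not shown in this file):
// uint2 (cellHash, particleIndex) pairs are layout-compatible with
// KeyValuePair, so the broadphase can sort its grid hashes on the GPU:
//   RadixSort((KeyValuePair*)dParticleHash[0],
//             (KeyValuePair*)dParticleHash[1],
//             numParticles, 32 /* sort on the full 32-bit key */);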

63
Extras/CUDA/radixsort.cuh Normal file

@@ -0,0 +1,63 @@
/*
* Copyright 1993-2006 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO USER:
*
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws.
*
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
* OR PERFORMANCE OF THIS SOURCE CODE.
*
* U.S. Government End Users. This source code is a "commercial item" as
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
* "commercial computer software" and "commercial computer software
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
* and is provided to the U.S. Government only as a commercial end item.
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
* source code with only those rights set forth herein.
*/
/* Radixsort project which demonstrates the use of CUDA in a multi phase
* sorting computation.
* Type definitions.
*/
#ifndef _RADIXSORT_H_
#define _RADIXSORT_H_
#include <host_defines.h>
#define SYNCIT __syncthreads()
// Set SIXTEEN to 1 to use 16 bit keys/values
#define SIXTEEN 0
typedef unsigned int uint;
typedef unsigned short ushort;
#if SIXTEEN
typedef struct __align__(4) {
ushort key;
ushort value;
#else
typedef struct __align__(8) {
uint key;
uint value;
#endif
} KeyValuePair;
extern "C" {
void RadixSort(KeyValuePair *pData0, KeyValuePair *pData1, uint elements, uint bits);
}
#endif // #ifndef _RADIXSORT_H_


@@ -0,0 +1,577 @@
/*
* Copyright 1993-2006 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO USER:
*
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws.
*
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
* OR PERFORMANCE OF THIS SOURCE CODE.
*
* U.S. Government End Users. This source code is a "commercial item" as
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
* "commercial computer software" and "commercial computer software
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
* and is provided to the U.S. Government only as a commercial end item.
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
* source code with only those rights set forth herein.
*/
/* Radixsort project with key/value and arbitrary dataset size support
* which demonstrates the use of CUDA in a multi phase sorting
* computation.
* Device code.
*/
#ifndef _RADIXSORT_KERNEL_H_
#define _RADIXSORT_KERNEL_H_
#include <stdio.h>
#include "radixsort.cuh"
#define SYNCIT __syncthreads()
static const int NUM_SMS = 16;
static const int NUM_THREADS_PER_SM = 192;
static const int NUM_THREADS_PER_BLOCK = 64;
//static const int NUM_THREADS = NUM_THREADS_PER_SM * NUM_SMS;
static const int NUM_BLOCKS = (NUM_THREADS_PER_SM / NUM_THREADS_PER_BLOCK) * NUM_SMS;
static const int RADIX = 8; // Number of bits per radix sort pass
static const int RADICES = 1 << RADIX; // Number of radices
static const int RADIXMASK = RADICES - 1; // Mask for each radix sort pass
#if SIXTEEN
static const int RADIXBITS = 16; // Number of bits to sort over
#else
static const int RADIXBITS = 32; // Number of bits to sort over
#endif
static const int RADIXTHREADS = 16; // Number of threads sharing each radix counter
static const int RADIXGROUPS = NUM_THREADS_PER_BLOCK / RADIXTHREADS; // Number of radix groups per CTA
static const int TOTALRADIXGROUPS = NUM_BLOCKS * RADIXGROUPS; // Number of radix groups for each radix
static const int SORTRADIXGROUPS = TOTALRADIXGROUPS * RADICES; // Total radix count
static const int GRFELEMENTS = (NUM_THREADS_PER_BLOCK / RADIXTHREADS) * RADICES;
static const int GRFSIZE = GRFELEMENTS * sizeof(uint);
// Prefix sum variables
static const int PREFIX_NUM_THREADS_PER_SM = NUM_THREADS_PER_SM;
static const int PREFIX_NUM_THREADS_PER_BLOCK = PREFIX_NUM_THREADS_PER_SM;
static const int PREFIX_NUM_BLOCKS = (PREFIX_NUM_THREADS_PER_SM / PREFIX_NUM_THREADS_PER_BLOCK) * NUM_SMS;
static const int PREFIX_BLOCKSIZE = SORTRADIXGROUPS / PREFIX_NUM_BLOCKS;
static const int PREFIX_GRFELEMENTS = PREFIX_BLOCKSIZE + 2 * PREFIX_NUM_THREADS_PER_BLOCK;
static const int PREFIX_GRFSIZE = PREFIX_GRFELEMENTS * sizeof(uint);
// Shuffle variables
static const int SHUFFLE_GRFOFFSET = RADIXGROUPS * RADICES;
static const int SHUFFLE_GRFELEMENTS = SHUFFLE_GRFOFFSET + PREFIX_NUM_BLOCKS;
static const int SHUFFLE_GRFSIZE = SHUFFLE_GRFELEMENTS * sizeof(uint);
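// With the defaults above: NUM_BLOCKS = (192/64)*16 = 48, RADIXGROUPS =
// 64/16 = 4 radix groups per block, TOTALRADIXGROUPS = 192, and
// GRFELEMENTS = 4*256 = 1024 shared counters (4 KB) per block.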
#define SDATA( index) CUT_BANK_CHECKER(sdata, index)
// Prefix sum data
uint gRadixSum[TOTALRADIXGROUPS * RADICES];
__device__ uint dRadixSum[TOTALRADIXGROUPS * RADICES];
uint gRadixBlockSum[PREFIX_NUM_BLOCKS];
__device__ uint dRadixBlockSum[PREFIX_NUM_BLOCKS];
extern __shared__ uint sRadixSum[];
////////////////////////////////////////////////////////////////////////////////
//! Perform a radix sum on the list to be sorted. Each block holds a set of
//! radix counters in the GRF, one set for each of its RADIXGROUPS thread groups.
//!
//! @param pData input data
//! @param elements total number of elements
//! @param elements_rounded_to_3072 total number of elements rounded up to the
//! nearest multiple of 3072
//! @param shift the shift (0 to 24) that we are using to obtain the correct
//! byte
////////////////////////////////////////////////////////////////////////////////
__global__ void RadixSum(KeyValuePair *pData, uint elements, uint elements_rounded_to_3072, uint shift)
{
uint pos = threadIdx.x;
// Zero radix counts
while (pos < GRFELEMENTS)
{
sRadixSum[pos] = 0;
pos += NUM_THREADS_PER_BLOCK;
}
// Sum up data
// Source addresses computed so that each thread is reading from a block of
// consecutive addresses so there are no conflicts between threads
// They then loop over their combined region and the next batch works elsewhere.
// So threads 0 to 15 (one radix group) work on memory 0 to 320.
// First reading 0,1,2,3...15 then 16,17,18,19...31 and so on
// optimising parallel access to shared memory by a thread accessing 16*threadID
// The next radix group runs from 320 to 640 and the same applies in that region
uint tmod = threadIdx.x % RADIXTHREADS;
uint tpos = threadIdx.x / RADIXTHREADS;
// Take the rounded element list size so that all threads have a certain size dataset to work with
// and no zero size datasets confusing the issue
// By using a multiple of 3072 we ensure that all threads have elements
// to work with until the last phase, at which point we individually test
uint element_fraction = elements_rounded_to_3072 / TOTALRADIXGROUPS;
// Generate range
// Note that it is possible for both pos and end to be past the end of the element set
// which will be caught later.
pos = (blockIdx.x * RADIXGROUPS + tpos) * element_fraction;
uint end = pos + element_fraction;
pos += tmod;
//printf("pos: %d\n", pos);
__syncthreads();
while (pos < end )
{
uint key = 0;
// Read first data element if we are in the set of elements
//if( pos < elements )
//key = pData[pos].key;
KeyValuePair kvp;
// Read first data element, both items at once as the memory will want to coalesce like that anyway
if (pos < elements)
kvp = pData[pos];
else
kvp.key = 0;
key = kvp.key;
// Calculate position of radix counter to increment
// There are RADICES radices in each pass (256)
// and hence this many counters for bin grouping
// Multiply by RADIXGROUPS (4) to spread through memory
// and into 4 radix groups
uint p = ((key >> shift) & RADIXMASK) * RADIXGROUPS;
// Increment radix counters
// Each radix group has its own set of counters
// so we add the thread position [0-3], ie the group index.
// We slow down here and take at least 16 cycles to write to the summation boxes
// but other groups will only conflict with themselves and so can also be writing
// 16 cycles here at least avoids retries.
uint ppos = p + tpos;
// If we are past the last element we don't want to do anything
// We do have to check each time, however, to ensure that all
// threads sync on each sync here.
if (tmod == 0 && pos < elements)
sRadixSum[ppos]++;
SYNCIT;
if (tmod == 1 && pos < elements)
sRadixSum[ppos]++;
SYNCIT;
if (tmod == 2 && pos < elements)
sRadixSum[ppos]++;
SYNCIT;
if (tmod == 3 && pos < elements)
sRadixSum[ppos]++;
SYNCIT;
if (tmod == 4 && pos < elements)
sRadixSum[ppos]++;
SYNCIT;
if (tmod == 5 && pos < elements)
sRadixSum[ppos]++;
SYNCIT;
if (tmod == 6 && pos < elements)
sRadixSum[ppos]++;
SYNCIT;
if (tmod == 7 && pos < elements)
sRadixSum[ppos]++;
SYNCIT;
if (tmod == 8 && pos < elements)
sRadixSum[ppos]++;
SYNCIT;
if (tmod == 9 && pos < elements)
sRadixSum[ppos]++;
SYNCIT;
if (tmod == 10 && pos < elements)
sRadixSum[ppos]++;
SYNCIT;
if (tmod == 11 && pos < elements)
sRadixSum[ppos]++;
SYNCIT;
if (tmod == 12 && pos < elements)
sRadixSum[ppos]++;
SYNCIT;
if (tmod == 13 && pos < elements)
sRadixSum[ppos]++;
SYNCIT;
if (tmod == 14 && pos < elements)
sRadixSum[ppos]++;
SYNCIT;
if (tmod == 15 && pos < elements)
sRadixSum[ppos]++;
SYNCIT;
pos += RADIXTHREADS;
}
__syncthreads();
__syncthreads();
// Output radix sums into separate memory regions for each radix group
// So this memory then is laid out:
// 0...... 192..... 384 ................ 192*256
// ie all 256 bins for each radix group
// in there:
// 0.............192
// 0 4 8 12... - block idx * 4
// And in the block boxes we see the 4 radix groups for that block
// So 0-192 should contain bin 0 for each radix group, and so on
uint offset = blockIdx.x * RADIXGROUPS;
uint row = threadIdx.x / RADIXGROUPS;
uint column = threadIdx.x % RADIXGROUPS;
while (row < RADICES)
{
dRadixSum[offset + row * TOTALRADIXGROUPS + column] = sRadixSum[row * RADIXGROUPS + column];
row += NUM_THREADS_PER_BLOCK / RADIXGROUPS;
}
}
////////////////////////////////////////////////////////////////////////////////
//! Performs first part of parallel prefix sum - individual sums of each radix
//! count. By the end of this we have prefix sums on a block level in dRadixSum
//! and totals for blocks in dRadixBlockSum.
////////////////////////////////////////////////////////////////////////////////
__global__ void RadixPrefixSum()
{
// Read radix groups in offset by one in the GRF so a zero can be inserted at the beginning
// and the final sum of all radix counts summed here is tacked onto the end for reading by
// the next stage
// Each block in this case is the full number of threads per SM (and hence the total number
// of radix groups), 192. We should then have the total set of offsets for an entire radix
// group by the end of this stage
// Device mem addressing
uint brow = blockIdx.x * (RADICES / PREFIX_NUM_BLOCKS);
uint drow = threadIdx.x / TOTALRADIXGROUPS; // In default parameterisation this is always 0
uint dcolumn = threadIdx.x % TOTALRADIXGROUPS; // And similarly this is always the same as threadIdx.x
uint dpos = (brow + drow) * TOTALRADIXGROUPS + dcolumn;
uint end = ((blockIdx.x + 1) * (RADICES / PREFIX_NUM_BLOCKS)) * TOTALRADIXGROUPS;
// Shared mem addressing
uint srow = threadIdx.x / (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK);
uint scolumn = threadIdx.x % (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK);
uint spos = srow * (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1) + scolumn;
// Read (RADICES / PREFIX_NUM_BLOCKS) radix counts into the GRF alongside each other
while (dpos < end)
{
sRadixSum[spos] = dRadixSum[dpos];
spos += (PREFIX_NUM_THREADS_PER_BLOCK / (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK)) *
(PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1);
dpos += (TOTALRADIXGROUPS / PREFIX_NUM_THREADS_PER_BLOCK) * TOTALRADIXGROUPS;
}
__syncthreads();
// Perform preliminary sum on each thread's stretch of data
// Each thread having a block of 16, with spacers between 0...16 18...33 and so on
int pos = threadIdx.x * (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1);
end = pos + (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK);
uint sum = 0;
while (pos < end)
{
sum += sRadixSum[pos];
sRadixSum[pos] = sum;
pos++;
}
__syncthreads();
// Calculate internal offsets by performing a more traditional parallel
// prefix sum of the topmost member of each thread's work data. Right now,
// these are stored between the work data for each thread, allowing us to
// eliminate GRF conflicts as well as hold the offsets needed to complete the sum
// In other words we have:
// 0....15 16 17....32 33 34....
// Where this first stage updates the intermediate values (so 16=15, 33=32 etc)
int m = (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1);
pos = threadIdx.x * (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1) +
(PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK);
sRadixSum[pos] = sRadixSum[pos - 1];
__syncthreads();
// This stage then performs a parallel prefix sum (ie use powers of 2 to propagate in log n stages)
// to update 17, 34 etc with the totals to that point (so 34 becomes [34] + [17]) and so on.
while (m < PREFIX_NUM_THREADS_PER_BLOCK * (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1))
{
int p = pos - m;
uint t = ((p > 0) ? sRadixSum[p] : 0);
__syncthreads();
sRadixSum[pos] += t;
__syncthreads();
m *= 2;
}
__syncthreads();
// Add internal offsets to each thread's work data.
// So now we take 17 and add it to all values 18 to 33 so all offsets for that block
// are updated.
pos = threadIdx.x * (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1);
end = pos + (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK);
int p = pos - 1;
sum = ((p > 0) ? sRadixSum[p] : 0);
while (pos < end)
{
sRadixSum[pos] += sum;
pos++;
}
__syncthreads();
// Write summed data back out to global memory in the same way as we read it in
// We now have prefix sum values internal to groups
brow = blockIdx.x * (RADICES / PREFIX_NUM_BLOCKS);
drow = threadIdx.x / TOTALRADIXGROUPS;
dcolumn = threadIdx.x % TOTALRADIXGROUPS;
srow = threadIdx.x / (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK);
scolumn = threadIdx.x % (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK);
dpos = (brow + drow) * TOTALRADIXGROUPS + dcolumn + 1;
spos = srow * (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1) + scolumn;
end = ((blockIdx.x + 1) * RADICES / PREFIX_NUM_BLOCKS) * TOTALRADIXGROUPS;
while (dpos < end)
{
dRadixSum[dpos] = sRadixSum[spos];
dpos += (TOTALRADIXGROUPS / PREFIX_NUM_THREADS_PER_BLOCK) * TOTALRADIXGROUPS;
spos += (PREFIX_NUM_THREADS_PER_BLOCK / (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK)) *
(PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1);
}
// Write last element to summation
// Storing block sums in a separate array
if (threadIdx.x == 0) {
dRadixBlockSum[blockIdx.x] = sRadixSum[PREFIX_NUM_THREADS_PER_BLOCK * (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1) - 1];
dRadixSum[blockIdx.x * PREFIX_BLOCKSIZE] = 0;
}
}
////////////////////////////////////////////////////////////////////////////////
//! Initially perform prefix sum of block totals to obtain final set of offsets.
//! Then make use of radix sums to perform a shuffling of the data into the
//! correct bins.
//!
//! @param pSrc input data
//! @param pDst output data
//! @param elements total number of elements
//! @param shift the shift (0 to 24) that we are using to obtain the correct
//! byte
////////////////////////////////////////////////////////////////////////////////
__global__ void RadixAddOffsetsAndShuffle(KeyValuePair* pSrc, KeyValuePair* pDst, uint elements, uint elements_rounded_to_3072, int shift)
{
// Read offsets from previous blocks
if (threadIdx.x == 0)
sRadixSum[SHUFFLE_GRFOFFSET] = 0;
if (threadIdx.x < PREFIX_NUM_BLOCKS - 1)
sRadixSum[SHUFFLE_GRFOFFSET + threadIdx.x + 1] = dRadixBlockSum[threadIdx.x];
__syncthreads();
// Parallel prefix sum over block sums
int pos = threadIdx.x;
int n = 1;
while (n < PREFIX_NUM_BLOCKS)
{
int ppos = pos - n;
uint t0 = ((pos < PREFIX_NUM_BLOCKS) && (ppos >= 0)) ? sRadixSum[SHUFFLE_GRFOFFSET + ppos] : 0;
__syncthreads();
if (pos < PREFIX_NUM_BLOCKS)
sRadixSum[SHUFFLE_GRFOFFSET + pos] += t0;
__syncthreads();
n *= 2;
}
// Read radix count data and add appropriate block offset
// for each radix at the memory location for this thread
// (where the other threads in the block will be reading
// as well, hence the large stride).
// There is one counter box per radix group per radix
// per block (4*256*3)
// We use 64 threads to read the 4 radix groups set of radices
// for the block.
int row = threadIdx.x / RADIXGROUPS;
int column = threadIdx.x % RADIXGROUPS;
int spos = row * RADIXGROUPS + column;
int dpos = row * TOTALRADIXGROUPS + column + blockIdx.x * RADIXGROUPS;
while (spos < SHUFFLE_GRFOFFSET)
{
sRadixSum[spos] = dRadixSum[dpos] + sRadixSum[SHUFFLE_GRFOFFSET + dpos / (TOTALRADIXGROUPS * RADICES / PREFIX_NUM_BLOCKS)];
spos += NUM_THREADS_PER_BLOCK;
dpos += (NUM_THREADS_PER_BLOCK / RADIXGROUPS) * TOTALRADIXGROUPS;
}
__syncthreads();
//int pos;
// Shuffle data
// Each of the subbins for a block should be filled via the counters, properly interleaved
// Then, as we now iterate over each data value, we increment the subbins (each thread in the
// radix group in turn to avoid miss writes due to conflicts) and set locations correctly.
uint element_fraction = elements_rounded_to_3072 / TOTALRADIXGROUPS;
int tmod = threadIdx.x % RADIXTHREADS;
int tpos = threadIdx.x / RADIXTHREADS;
pos = (blockIdx.x * RADIXGROUPS + tpos) * element_fraction;
uint end = pos + element_fraction; //(blockIdx.x * RADIXGROUPS + tpos + 1) * element_fraction;
pos += tmod;
__syncthreads();
while (pos < end )
{
KeyValuePair kvp;
#if 1 // old load
// Read first data element, both items at once as the memory will want to coalesce like that anyway
if (pos < elements)
{
kvp = pSrc[pos];
}
else
kvp.key = 0;
#else // casting to float2 to get it to combine loads
int2 kvpf2;
// Read first data element, both items at once as the memory will want to coalesce like that anyway
if (pos < elements)
{
// kvp = pSrc[pos];
kvpf2 = ((int2*)pSrc)[pos];
// printf("kvp: %f %f kvpf2: %f %f\n", kvp.key, kvp.value, kvpf2.x, kvpf2.y);
}
else
//kvp.key = 0;
kvpf2.x = 0;
kvp.key = kvpf2.x;
kvp.value = kvpf2.y;
#endif
uint index;
// Calculate position of radix counter to increment
uint p = ((kvp.key >> shift) & RADIXMASK) * RADIXGROUPS;
// Move data, keeping counts updated.
// Increment radix counters, relying on hexadecathread
// warp to prevent this code from stepping all over itself.
uint ppos = p + tpos;
if (tmod == 0 && pos < elements)
{
index = sRadixSum[ppos]++;
pDst[index] = kvp;
}
SYNCIT;
if (tmod == 1 && pos < elements)
{
index = sRadixSum[ppos]++;
pDst[index] = kvp;
}
SYNCIT;
if (tmod == 2 && pos < elements)
{
index = sRadixSum[ppos]++;
pDst[index] = kvp;
}
SYNCIT;
if (tmod == 3 && pos < elements)
{
index = sRadixSum[ppos]++;
pDst[index] = kvp;
}
SYNCIT;
if (tmod == 4 && pos < elements)
{
index = sRadixSum[ppos]++;
pDst[index] = kvp;
}
SYNCIT;
if (tmod == 5 && pos < elements)
{
index = sRadixSum[ppos]++;
pDst[index] = kvp;
}
SYNCIT;
if (tmod == 6 && pos < elements)
{
index = sRadixSum[ppos]++;
pDst[index] = kvp;
}
SYNCIT;
if (tmod == 7 && pos < elements)
{
index = sRadixSum[ppos]++;
pDst[index] = kvp;
}
SYNCIT;
if (tmod == 8 && pos < elements)
{
index = sRadixSum[ppos]++;
pDst[index] = kvp;
}
SYNCIT;
if (tmod == 9 && pos < elements)
{
index = sRadixSum[ppos]++;
pDst[index] = kvp;
}
SYNCIT;
if (tmod == 10 && pos < elements)
{
index = sRadixSum[ppos]++;
pDst[index] = kvp;
}
SYNCIT;
if (tmod == 11 && pos < elements)
{
index = sRadixSum[ppos]++;
pDst[index] = kvp;
}
SYNCIT;
if (tmod == 12 && pos < elements)
{
index = sRadixSum[ppos]++;
pDst[index] = kvp;
}
SYNCIT;
if (tmod == 13 && pos < elements)
{
index = sRadixSum[ppos]++;
pDst[index] = kvp;
}
SYNCIT;
if (tmod == 14 && pos < elements)
{
index = sRadixSum[ppos]++;
pDst[index] = kvp;
}
SYNCIT;
if (tmod == 15 && pos < elements)
{
index = sRadixSum[ppos]++;
pDst[index] = kvp;
}
SYNCIT;
pos += RADIXTHREADS;
}
__syncthreads();
}
#endif // #ifndef _RADIXSORT_KERNEL_H_


@@ -0,0 +1,180 @@
/*
* Copyright 1993-2006 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO USER:
*
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws.
*
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
* OR PERFORMANCE OF THIS SOURCE CODE.
*
* U.S. Government End Users. This source code is a "commercial item" as
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
* "commercial computer software" and "commercial computer software
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
* and is provided to the U.S. Government only as a commercial end item.
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
* source code with only those rights set forth herein.
*/
#include <GL/glew.h>
#include <math.h>
#include <assert.h>
#include <stdio.h>
#include "render_particles.h"
#include "shaders.h"
#ifndef M_PI
#define M_PI 3.1415926535897932384626433832795
#endif
ParticleRenderer::ParticleRenderer()
: m_pos(0),
m_numParticles(0),
m_pointSize(1.0f),
m_particleRadius(0.125f * 0.5f),
m_program(0),
m_vbo(0),
m_colorVBO(0)
{
_initGL();
}
ParticleRenderer::~ParticleRenderer()
{
// m_pos points at externally owned data, so there is nothing to free here;
// note that the GL program created in _initGL() is not deleted.
m_pos = 0;
}
void ParticleRenderer::setPositions(float *pos, int numParticles)
{
m_pos = pos;
m_numParticles = numParticles;
}
void ParticleRenderer::setVertexBuffer(unsigned int vbo, int numParticles)
{
m_vbo = vbo;
m_numParticles = numParticles;
}
void ParticleRenderer::_drawPoints()
{
if (!m_vbo)
{
// No VBO bound: submit positions from host memory in immediate mode.
glBegin(GL_POINTS);
{
int k = 0;
for (int i = 0; i < m_numParticles; ++i)
{
glVertex3fv(&m_pos[k]);
k += 4; // positions are stored as float4 (x, y, z, w)
}
}
glEnd();
}
else
{
glBindBufferARB(GL_ARRAY_BUFFER_ARB, m_vbo);
glVertexPointer(4, GL_FLOAT, 0, 0);
glEnableClientState(GL_VERTEX_ARRAY);
if (m_colorVBO) {
glBindBufferARB(GL_ARRAY_BUFFER_ARB, m_colorVBO);
glColorPointer(4, GL_FLOAT, 0, 0);
glEnableClientState(GL_COLOR_ARRAY);
}
glDrawArrays(GL_POINTS, 0, m_numParticles);
glBindBufferARB(GL_ARRAY_BUFFER_ARB, 0);
glDisableClientState(GL_VERTEX_ARRAY);
glDisableClientState(GL_COLOR_ARRAY);
}
}
void ParticleRenderer::display(DisplayMode mode /* = PARTICLE_POINTS */)
{
switch (mode)
{
case PARTICLE_POINTS:
glColor3f(1, 1, 1);
glPointSize(m_pointSize);
_drawPoints();
break;
default: // fall through: unknown modes render as spheres
case PARTICLE_SPHERES:
glEnable(GL_POINT_SPRITE_ARB);
glTexEnvi(GL_POINT_SPRITE_ARB, GL_COORD_REPLACE_ARB, GL_TRUE);
glEnable(GL_VERTEX_PROGRAM_POINT_SIZE_NV);
glDepthMask(GL_TRUE);
glEnable(GL_DEPTH_TEST);
glUseProgram(m_program);
glUniform1f( glGetUniformLocation(m_program, "pointScale"), m_window_h / tanf(m_fov*0.5f*(float)M_PI/180.0f) );
glUniform1f( glGetUniformLocation(m_program, "pointRadius"), m_particleRadius );
glColor3f(1, 1, 1);
_drawPoints();
glUseProgram(0);
glDisable(GL_POINT_SPRITE_ARB);
break;
}
}
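// Note on the pointScale uniform above, a worked derivation rather than new
// behavior: for a perspective projection with vertical field of view fov and a
// viewport h pixels tall, a sphere of world radius r at eye-space distance d
// projects to roughly
//     pointSize = r * (h / tan(fov / 2)) / d
// The host passes r and h / tan(fov/2) as pointRadius and pointScale, and the
// vertex shader in shaders.cpp divides by the per-vertex eye-space distance.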
GLuint
ParticleRenderer::_compileProgram(const char *vsource, const char *fsource)
{
GLuint vertexShader = glCreateShader(GL_VERTEX_SHADER);
GLuint fragmentShader = glCreateShader(GL_FRAGMENT_SHADER);
glShaderSource(vertexShader, 1, &vsource, 0);
glShaderSource(fragmentShader, 1, &fsource, 0);
glCompileShader(vertexShader);
glCompileShader(fragmentShader);
GLuint program = glCreateProgram();
glAttachShader(program, vertexShader);
glAttachShader(program, fragmentShader);
glLinkProgram(program);
// check if program linked
GLint success = 0;
glGetProgramiv(program, GL_LINK_STATUS, &success);
if (!success) {
char temp[256];
glGetProgramInfoLog(program, 256, 0, temp);
printf("Failed to link program:\n%s\n", temp);
glDeleteProgram(program);
program = 0;
}
return program;
}
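// A hedged sketch, not part of this commit: _compileProgram() above only checks
// the link status, so GLSL compile errors surface indirectly. A hypothetical
// helper using only standard GL 2.0 entry points could report them directly:
static bool checkShaderCompiled(GLuint shader)
{
GLint compiled = 0;
glGetShaderiv(shader, GL_COMPILE_STATUS, &compiled);
if (!compiled)
{
char log[256];
glGetShaderInfoLog(shader, sizeof(log), 0, log);
printf("Failed to compile shader:\n%s\n", log);
}
return compiled != 0;
}
// Calling checkShaderCompiled(vertexShader) and checkShaderCompiled(fragmentShader)
// right after the glCompileShader() calls would pinpoint which stage failed.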
void ParticleRenderer::_initGL()
{
m_program = _compileProgram(vertexShader, spherePixelShader);
#if !defined(__APPLE__) && !defined(MACOSX)
// Disable vertex/fragment color clamping so color values outside [0,1] survive.
glClampColorARB(GL_CLAMP_VERTEX_COLOR_ARB, GL_FALSE);
glClampColorARB(GL_CLAMP_FRAGMENT_COLOR_ARB, GL_FALSE);
#endif
}

78
Extras/CUDA/render_particles.h Normal file
View File

@@ -0,0 +1,78 @@
/*
* Copyright 1993-2006 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO USER:
*
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws.
*
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
* OR PERFORMANCE OF THIS SOURCE CODE.
*
* U.S. Government End Users. This source code is a "commercial item" as
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
* "commercial computer software" and "commercial computer software
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
* and is provided to the U.S. Government only as a commercial end item.
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
* source code with only those rights set forth herein.
*/
#ifndef __RENDER_PARTICLES__
#define __RENDER_PARTICLES__
class ParticleRenderer
{
public:
ParticleRenderer();
~ParticleRenderer();
void setPositions(float *pos, int numParticles);
void setVertexBuffer(unsigned int vbo, int numParticles);
void setColorBuffer(unsigned int vbo) { m_colorVBO = vbo; }
enum DisplayMode
{
PARTICLE_POINTS,
PARTICLE_SPHERES,
PARTICLE_NUM_MODES
};
void display(DisplayMode mode = PARTICLE_POINTS);
void displayGrid();
void setPointSize(float size) { m_pointSize = size; }
void setParticleRadius(float r) { m_particleRadius = r; }
void setFOV(float fov) { m_fov = fov; }
void setWindowSize(int w, int h) { m_window_w = w; m_window_h = h; }
protected: // methods
void _initGL();
void _drawPoints();
GLuint _compileProgram(const char *vsource, const char *fsource);
protected: // data
float *m_pos;
int m_numParticles;
float m_pointSize;
float m_particleRadius;
float m_fov;
int m_window_w, m_window_h;
GLuint m_program;
GLuint m_vbo;
GLuint m_colorVBO;
};
#endif //__RENDER_PARTICLES__
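A minimal usage sketch of the ParticleRenderer interface above (posVbo and n are
hypothetical names for an existing position VBO and its particle count, not part
of this commit):

    ParticleRenderer renderer;
    renderer.setWindowSize(800, 600);
    renderer.setFOV(60.0f);
    renderer.setParticleRadius(0.0625f);
    renderer.setVertexBuffer(posVbo, n); // render straight from the GPU buffer
    renderer.display(ParticleRenderer::PARTICLE_SPHERES);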

41
Extras/CUDA/shaders.cpp Normal file
View File

@@ -0,0 +1,41 @@
#define STRINGIFY(A) #A
// vertex shader
const char *vertexShader = STRINGIFY(
uniform float pointRadius; // point size in world space
uniform float pointScale; // scale to calculate size in pixels
uniform float densityScale;
uniform float densityOffset;
void main()
{
// calculate window-space point size
vec3 posEye = vec3(gl_ModelViewMatrix * vec4(gl_Vertex.xyz, 1.0));
float dist = length(posEye);
gl_PointSize = pointRadius * (pointScale / dist);
gl_TexCoord[0] = gl_MultiTexCoord0;
gl_Position = gl_ModelViewProjectionMatrix * vec4(gl_Vertex.xyz, 1.0);
gl_FrontColor = gl_Color;
}
);
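// Note: STRINGIFY turns the block above into a C string at preprocessing time,
// so the embedded GLSL cannot contain preprocessor directives, and any commas
// must sit inside parentheses (as in vec2(2.0, -2.0)); the fixed-function-era
// GLSL used here fits within those limits.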
// pixel shader for rendering points as shaded spheres
const char *spherePixelShader = STRINGIFY(
void main()
{
const vec3 lightDir = vec3(0.577, 0.577, 0.577);
// calculate normal from texture coordinates
vec3 N;
N.xy = gl_TexCoord[0].xy*vec2(2.0, -2.0) + vec2(-1.0, 1.0);
float mag = dot(N.xy, N.xy);
if (mag > 1.0) discard; // kill pixels outside circle
N.z = sqrt(1.0-mag);
// calculate lighting
float diffuse = max(0.0, dot(lightDir, N));
gl_FragColor = gl_Color * diffuse;
}
);

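The sphere pixel shader above reconstructs an eye-space normal from the point-sprite
texture coordinate t in [0,1]^2: N.xy = t * (2, -2) + (-1, 1) remaps the sprite onto
[-1,1]^2 with y flipped, fragments with N.x^2 + N.y^2 > 1 lie outside the silhouette
and are discarded, and N.z = sqrt(1 - N.x^2 - N.y^2) completes the unit normal on the
front-facing hemisphere. The final color is a single Lambert term,
max(0, dot(lightDir, N)), with a fixed directional light.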
2
Extras/CUDA/shaders.h Normal file
View File

@@ -0,0 +1,2 @@
extern const char *vertexShader;
extern const char *spherePixelShader;