Added initial Smoothed Particle Hydrodynamics implementation (SPH), for CPU and CUDA.

This software is contributed under the ZLib license by Rama Hoetzlein, http://www.rchoetzlein.com
We plan to integrate the SPH into the core Bullet library, including interaction with rigid bodies and soft bodies.
This commit is contained in:
erwin.coumans
2009-05-13 22:28:03 +00:00
parent 4616b62686
commit e260bcd1b2
50 changed files with 61309 additions and 0 deletions

View File

@@ -0,0 +1,22 @@
/*
FLUIDS v.1 - SPH Fluid Simulator for CPU and GPU
Copyright (C) 2009. Rama Hoetzlein, http://www.rchoetzlein.com
ZLib license
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#include "fluid.h"

44
Extras/sph/fluids/fluid.h Normal file
View File

@@ -0,0 +1,44 @@
/*
FLUIDS v.1 - SPH Fluid Simulator for CPU and GPU
Copyright (C) 2008. Rama Hoetzlein, http://www.rchoetzlein.com
ZLib license
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#ifndef DEF_FLUID
#define DEF_FLUID
#include "vector.h"
#include "common_defs.h"
struct Fluid {
public:
Vector3DF pos; // Basic particle (must match Particle class)
DWORD clr;
int next;
Vector3DF vel;
Vector3DF vel_eval;
unsigned short age;
float pressure; // Smoothed Particle Hydrodynamics
float density;
Vector3DF sph_force;
};
#endif /*PARTICLE_H_*/

View File

@@ -0,0 +1,869 @@
/*
FLUIDS v.1 - SPH Fluid Simulator for CPU and GPU
Copyright (C) 2008. Rama Hoetzlein, http://www.rchoetzlein.com
ZLib license
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#include <conio.h>
#ifdef _MSC_VER
#include <gl/glut.h>
#else
#include <GL/glut.h>
#endif
#include "common_defs.h"
#include "mtime.h"
#include "fluid_system.h"
#ifdef BUILD_CUDA
#include "fluid_system_host.cuh"
#endif
#define EPSILON 0.00001f //for collision detection
FluidSystem::FluidSystem ()
{
}
void FluidSystem::Initialize ( int mode, int total )
{
if ( mode != BFLUID ) {
printf ( "ERROR: FluidSystem not initialized as BFLUID.\n");
}
PointSet::Initialize ( mode, total );
FreeBuffers ();
AddBuffer ( BFLUID, sizeof ( Fluid ), total );
AddAttribute ( 0, "pos", sizeof ( Vector3DF ), false );
AddAttribute ( 0, "color", sizeof ( DWORD ), false );
AddAttribute ( 0, "vel", sizeof ( Vector3DF ), false );
AddAttribute ( 0, "ndx", sizeof ( unsigned short ), false );
AddAttribute ( 0, "age", sizeof ( unsigned short ), false );
AddAttribute ( 0, "pressure", sizeof ( double ), false );
AddAttribute ( 0, "density", sizeof ( double ), false );
AddAttribute ( 0, "sph_force", sizeof ( Vector3DF ), false );
AddAttribute ( 0, "next", sizeof ( Fluid* ), false );
AddAttribute ( 0, "tag", sizeof ( bool ), false );
SPH_Setup ();
Reset ( total );
}
void FluidSystem::Reset ( int nmax )
{
ResetBuffer ( 0, nmax );
m_DT = 0.003; // 0.001; // .001 = for point grav
// Reset parameters
m_Param [ MAX_FRAC ] = 1.0;
m_Param [ POINT_GRAV ] = 0.0;
m_Param [ PLANE_GRAV ] = 1.0;
m_Param [ BOUND_ZMIN_SLOPE ] = 0.0;
m_Param [ FORCE_XMAX_SIN ] = 0.0;
m_Param [ FORCE_XMIN_SIN ] = 0.0;
m_Toggle [ WRAP_X ] = false;
m_Toggle [ WALL_BARRIER ] = false;
m_Toggle [ LEVY_BARRIER ] = false;
m_Toggle [ DRAIN_BARRIER ] = false;
m_Param [ SPH_INTSTIFF ] = 1.00;
m_Param [ SPH_VISC ] = 0.2;
m_Param [ SPH_INTSTIFF ] = 0.50;
m_Param [ SPH_EXTSTIFF ] = 20000;
m_Param [ SPH_SMOOTHRADIUS ] = 0.01;
m_Vec [ POINT_GRAV_POS ].Set ( 0, 0, 50 );
m_Vec [ PLANE_GRAV_DIR ].Set ( 0, 0, -9.8 );
m_Vec [ EMIT_POS ].Set ( 0, 0, 0 );
m_Vec [ EMIT_RATE ].Set ( 0, 0, 0 );
m_Vec [ EMIT_ANG ].Set ( 0, 90, 1.0 );
m_Vec [ EMIT_DANG ].Set ( 0, 0, 0 );
}
int FluidSystem::AddPoint ()
{
xref ndx;
Fluid* f = (Fluid*) AddElem ( 0, ndx );
f->sph_force.Set(0,0,0);
f->vel.Set(0,0,0);
f->vel_eval.Set(0,0,0);
f->next = 0x0;
f->pressure = 0;
f->density = 0;
return ndx;
}
int FluidSystem::AddPointReuse ()
{
xref ndx;
Fluid* f;
if ( NumPoints() <= mBuf[0].max-2 )
f = (Fluid*) AddElem ( 0, ndx );
else
f = (Fluid*) RandomElem ( 0, ndx );
f->sph_force.Set(0,0,0);
f->vel.Set(0,0,0);
f->vel_eval.Set(0,0,0);
f->next = 0x0;
f->pressure = 0;
f->density = 0;
return ndx;
}
void FluidSystem::Run ()
{
bool bTiming = true;
mint::Time start, stop;
float ss = m_Param [ SPH_PDIST ] / m_Param[ SPH_SIMSCALE ]; // simulation scale (not Schutzstaffel)
if ( m_Vec[EMIT_RATE].x > 0 && (++m_Frame) % (int) m_Vec[EMIT_RATE].x == 0 ) {
//m_Frame = 0;
Emit ( ss );
}
#ifdef NOGRID
// Slow method - O(n^2)
SPH_ComputePressureSlow ();
SPH_ComputeForceSlow ();
#else
if ( m_Toggle[USE_CUDA] ) {
#ifdef BUILD_CUDA
// -- GPU --
start.SetSystemTime ( ACC_NSEC );
TransferToCUDA ( mBuf[0].data, (int*) &m_Grid[0], NumPoints() );
if ( bTiming) { stop.SetSystemTime ( ACC_NSEC ); stop = stop - start; printf ( "TO: %s\n", stop.GetReadableTime().c_str() ); }
start.SetSystemTime ( ACC_NSEC );
Grid_InsertParticlesCUDA ();
if ( bTiming) { stop.SetSystemTime ( ACC_NSEC ); stop = stop - start; printf ( "INSERT (CUDA): %s\n", stop.GetReadableTime().c_str() ); }
start.SetSystemTime ( ACC_NSEC );
SPH_ComputePressureCUDA ();
if ( bTiming) { stop.SetSystemTime ( ACC_NSEC ); stop = stop - start; printf ( "PRESS (CUDA): %s\n", stop.GetReadableTime().c_str() ); }
start.SetSystemTime ( ACC_NSEC );
SPH_ComputeForceCUDA ();
if ( bTiming) { stop.SetSystemTime ( ACC_NSEC ); stop = stop - start; printf ( "FORCE (CUDA): %s\n", stop.GetReadableTime().c_str() ); }
//** CUDA integrator is incomplete..
// Once integrator is done, we can remove TransferTo/From steps
/*start.SetSystemTime ( ACC_NSEC );
SPH_AdvanceCUDA( m_DT, m_DT/m_Param[SPH_SIMSCALE] );
if ( bTiming) { stop.SetSystemTime ( ACC_NSEC ); stop = stop - start; printf ( "ADV (CUDA): %s\n", stop.GetReadableTime().c_str() ); }*/
start.SetSystemTime ( ACC_NSEC );
TransferFromCUDA ( mBuf[0].data, (int*) &m_Grid[0], NumPoints() );
if ( bTiming) { stop.SetSystemTime ( ACC_NSEC ); stop = stop - start; printf ( "FROM: %s\n", stop.GetReadableTime().c_str() ); }
// .. Do advance on CPU
Advance();
#endif
} else {
// -- CPU only --
start.SetSystemTime ( ACC_NSEC );
Grid_InsertParticles ();
if ( bTiming) { stop.SetSystemTime ( ACC_NSEC ); stop = stop - start; printf ( "INSERT: %s\n", stop.GetReadableTime().c_str() ); }
start.SetSystemTime ( ACC_NSEC );
SPH_ComputePressureGrid ();
if ( bTiming) { stop.SetSystemTime ( ACC_NSEC ); stop = stop - start; printf ( "PRESS: %s\n", stop.GetReadableTime().c_str() ); }
start.SetSystemTime ( ACC_NSEC );
SPH_ComputeForceGridNC ();
if ( bTiming) { stop.SetSystemTime ( ACC_NSEC ); stop = stop - start; printf ( "FORCE: %s\n", stop.GetReadableTime().c_str() ); }
start.SetSystemTime ( ACC_NSEC );
Advance();
if ( bTiming) { stop.SetSystemTime ( ACC_NSEC ); stop = stop - start; printf ( "ADV: %s\n", stop.GetReadableTime().c_str() ); }
}
#endif
}
void FluidSystem::SPH_DrawDomain ()
{
Vector3DF min, max;
min = m_Vec[SPH_VOLMIN];
max = m_Vec[SPH_VOLMAX];
min.z += 0.5;
glColor3f ( 0.0, 0.0, 1.0 );
glBegin ( GL_LINES );
glVertex3f ( min.x, min.y, min.z ); glVertex3f ( max.x, min.y, min.z );
glVertex3f ( min.x, max.y, min.z ); glVertex3f ( max.x, max.y, min.z );
glVertex3f ( min.x, min.y, min.z ); glVertex3f ( min.x, max.y, min.z );
glVertex3f ( max.x, min.y, min.z ); glVertex3f ( max.x, max.y, min.z );
glEnd ();
}
void FluidSystem::Advance ()
{
char *dat1, *dat1_end;
Fluid* p;
Vector3DF norm, z;
Vector3DF dir, accel;
Vector3DF vnext;
Vector3DF min, max;
double adj;
float SL, SL2, ss, radius;
float stiff, damp, speed, diff;
SL = m_Param[SPH_LIMIT];
SL2 = SL*SL;
stiff = m_Param[SPH_EXTSTIFF];
damp = m_Param[SPH_EXTDAMP];
radius = m_Param[SPH_PRADIUS];
min = m_Vec[SPH_VOLMIN];
max = m_Vec[SPH_VOLMAX];
ss = m_Param[SPH_SIMSCALE];
dat1_end = mBuf[0].data + NumPoints()*mBuf[0].stride;
for ( dat1 = mBuf[0].data; dat1 < dat1_end; dat1 += mBuf[0].stride ) {
p = (Fluid*) dat1;
// Compute Acceleration
accel = p->sph_force;
accel *= m_Param[SPH_PMASS];
// Velocity limiting
speed = accel.x*accel.x + accel.y*accel.y + accel.z*accel.z;
if ( speed > SL2 ) {
accel *= SL / sqrt(speed);
}
// Boundary Conditions
// Z-axis walls
diff = 2 * radius - ( p->pos.z - min.z - (p->pos.x - m_Vec[SPH_VOLMIN].x) * m_Param[BOUND_ZMIN_SLOPE] )*ss;
if (diff > EPSILON ) {
norm.Set ( -m_Param[BOUND_ZMIN_SLOPE], 0, 1.0 - m_Param[BOUND_ZMIN_SLOPE] );
adj = stiff * diff - damp * norm.Dot ( p->vel_eval );
accel.x += adj * norm.x; accel.y += adj * norm.y; accel.z += adj * norm.z;
}
diff = 2 * radius - ( max.z - p->pos.z )*ss;
if (diff > EPSILON) {
norm.Set ( 0, 0, -1 );
adj = stiff * diff - damp * norm.Dot ( p->vel_eval );
accel.x += adj * norm.x; accel.y += adj * norm.y; accel.z += adj * norm.z;
}
// X-axis walls
if ( !m_Toggle[WRAP_X] ) {
diff = 2 * radius - ( p->pos.x - min.x + (sin(m_Time*10.0)-1+(p->pos.y*0.025)*0.25) * m_Param[FORCE_XMIN_SIN] )*ss;
//diff = 2 * radius - ( p->pos.x - min.x + (sin(m_Time*10.0)-1) * m_Param[FORCE_XMIN_SIN] )*ss;
if (diff > EPSILON ) {
norm.Set ( 1.0, 0, 0 );
adj = (m_Param[ FORCE_XMIN_SIN ] + 1) * stiff * diff - damp * norm.Dot ( p->vel_eval ) ;
accel.x += adj * norm.x; accel.y += adj * norm.y; accel.z += adj * norm.z;
}
diff = 2 * radius - ( max.x - p->pos.x + (sin(m_Time*10.0)-1) * m_Param[FORCE_XMAX_SIN] )*ss;
if (diff > EPSILON) {
norm.Set ( -1, 0, 0 );
adj = (m_Param[ FORCE_XMAX_SIN ]+1) * stiff * diff - damp * norm.Dot ( p->vel_eval );
accel.x += adj * norm.x; accel.y += adj * norm.y; accel.z += adj * norm.z;
}
}
// Y-axis walls
diff = 2 * radius - ( p->pos.y - min.y )*ss;
if (diff > EPSILON) {
norm.Set ( 0, 1, 0 );
adj = stiff * diff - damp * norm.Dot ( p->vel_eval );
accel.x += adj * norm.x; accel.y += adj * norm.y; accel.z += adj * norm.z;
}
diff = 2 * radius - ( max.y - p->pos.y )*ss;
if (diff > EPSILON) {
norm.Set ( 0, -1, 0 );
adj = stiff * diff - damp * norm.Dot ( p->vel_eval );
accel.x += adj * norm.x; accel.y += adj * norm.y; accel.z += adj * norm.z;
}
// Wall barrier
if ( m_Toggle[WALL_BARRIER] ) {
diff = 2 * radius - ( p->pos.x - 0 )*ss;
if (diff < 2*radius && diff > EPSILON && fabs(p->pos.y) < 3 && p->pos.z < 10) {
norm.Set ( 1.0, 0, 0 );
adj = 2*stiff * diff - damp * norm.Dot ( p->vel_eval ) ;
accel.x += adj * norm.x; accel.y += adj * norm.y; accel.z += adj * norm.z;
}
}
// Levy barrier
if ( m_Toggle[LEVY_BARRIER] ) {
diff = 2 * radius - ( p->pos.x - 0 )*ss;
if (diff < 2*radius && diff > EPSILON && fabs(p->pos.y) > 5 && p->pos.z < 10) {
norm.Set ( 1.0, 0, 0 );
adj = 2*stiff * diff - damp * norm.Dot ( p->vel_eval ) ;
accel.x += adj * norm.x; accel.y += adj * norm.y; accel.z += adj * norm.z;
}
}
// Drain barrier
if ( m_Toggle[DRAIN_BARRIER] ) {
diff = 2 * radius - ( p->pos.z - min.z-15 )*ss;
if (diff < 2*radius && diff > EPSILON && (fabs(p->pos.x)>3 || fabs(p->pos.y)>3) ) {
norm.Set ( 0, 0, 1);
adj = stiff * diff - damp * norm.Dot ( p->vel_eval );
accel.x += adj * norm.x; accel.y += adj * norm.y; accel.z += adj * norm.z;
}
}
// Plane gravity
if ( m_Param[PLANE_GRAV] > 0)
accel += m_Vec[PLANE_GRAV_DIR];
// Point gravity
if ( m_Param[POINT_GRAV] > 0 ) {
norm.x = ( p->pos.x - m_Vec[POINT_GRAV_POS].x );
norm.y = ( p->pos.y - m_Vec[POINT_GRAV_POS].y );
norm.z = ( p->pos.z - m_Vec[POINT_GRAV_POS].z );
norm.Normalize ();
norm *= m_Param[POINT_GRAV];
accel -= norm;
}
// Leapfrog Integration ----------------------------
vnext = accel;
vnext *= m_DT;
vnext += p->vel; // v(t+1/2) = v(t-1/2) + a(t) dt
p->vel_eval = p->vel;
p->vel_eval += vnext;
p->vel_eval *= 0.5; // v(t+1) = [v(t-1/2) + v(t+1/2)] * 0.5 used to compute forces later
p->vel = vnext;
vnext *= m_DT/ss;
p->pos += vnext; // p(t+1) = p(t) + v(t+1/2) dt
if ( m_Param[CLR_MODE]==1.0 ) {
adj = fabs(vnext.x)+fabs(vnext.y)+fabs(vnext.z) / 7000.0;
adj = (adj > 1.0) ? 1.0 : adj;
p->clr = COLORA( 0, adj, adj, 1 );
}
if ( m_Param[CLR_MODE]==2.0 ) {
float v = 0.5 + ( p->pressure / 1500.0);
if ( v < 0.1 ) v = 0.1;
if ( v > 1.0 ) v = 1.0;
p->clr = COLORA ( v, 1-v, 0, 1 );
}
// Euler integration -------------------------------
/* accel += m_Gravity;
accel *= m_DT;
p->vel += accel; // v(t+1) = v(t) + a(t) dt
p->vel_eval += accel;
p->vel_eval *= m_DT/d;
p->pos += p->vel_eval;
p->vel_eval = p->vel; */
if ( m_Toggle[WRAP_X] ) {
diff = p->pos.x - (m_Vec[SPH_VOLMIN].x + 2); // -- Simulates object in center of flow
if ( diff <= 0 ) {
p->pos.x = (m_Vec[SPH_VOLMAX].x - 2) + diff*2;
p->pos.z = 10;
}
}
}
m_Time += m_DT;
}
//------------------------------------------------------ SPH Setup
//
// Range = +/- 10.0 * 0.006 (r) = 0.12 m (= 120 mm = 4.7 inch)
// Container Volume (Vc) = 0.001728 m^3
// Rest Density (D) = 1000.0 kg / m^3
// Particle Mass (Pm) = 0.00020543 kg (mass = vol * density)
// Number of Particles (N) = 4000.0
// Water Mass (M) = 0.821 kg (= 821 grams)
// Water Volume (V) = 0.000821 m^3 (= 3.4 cups, .21 gals)
// Smoothing Radius (R) = 0.02 m (= 20 mm = ~3/4 inch)
// Particle Radius (Pr) = 0.00366 m (= 4 mm = ~1/8 inch)
// Particle Volume (Pv) = 2.054e-7 m^3 (= .268 milliliters)
// Rest Distance (Pd) = 0.0059 m
//
// Given: D, Pm, N
// Pv = Pm / D 0.00020543 kg / 1000 kg/m^3 = 2.054e-7 m^3
// Pv = 4/3*pi*Pr^3 cuberoot( 2.054e-7 m^3 * 3/(4pi) ) = 0.00366 m
// M = Pm * N 0.00020543 kg * 4000.0 = 0.821 kg
// V = M / D 0.821 kg / 1000 kg/m^3 = 0.000821 m^3
// V = Pv * N 2.054e-7 m^3 * 4000 = 0.000821 m^3
// Pd = cuberoot(Pm/D) cuberoot(0.00020543/1000) = 0.0059 m
//
// Ideal grid cell size (gs) = 2 * smoothing radius = 0.02*2 = 0.04
// Ideal domain size = k*gs/d = k*0.02*2/0.005 = k*8 = {8, 16, 24, 32, 40, 48, ..}
// (k = number of cells, gs = cell size, d = simulation scale)
void FluidSystem::SPH_Setup ()
{
m_Param [ SPH_SIMSCALE ] = 0.004; // unit size
m_Param [ SPH_VISC ] = 0.2; // pascal-second (Pa.s) = 1 kg m^-1 s^-1 (see wikipedia page on viscosity)
m_Param [ SPH_RESTDENSITY ] = 600.0; // kg / m^3
m_Param [ SPH_PMASS ] = 0.00020543; // kg
m_Param [ SPH_PRADIUS ] = 0.004; // m
m_Param [ SPH_PDIST ] = 0.0059; // m
m_Param [ SPH_SMOOTHRADIUS ] = 0.01; // m
m_Param [ SPH_INTSTIFF ] = 1.00;
m_Param [ SPH_EXTSTIFF ] = 10000.0;
m_Param [ SPH_EXTDAMP ] = 256.0;
m_Param [ SPH_LIMIT ] = 200.0; // m / s
m_Toggle [ SPH_GRID ] = false;
m_Toggle [ SPH_DEBUG ] = false;
SPH_ComputeKernels ();
}
void FluidSystem::SPH_ComputeKernels ()
{
m_Param [ SPH_PDIST ] = pow ( m_Param[SPH_PMASS] / m_Param[SPH_RESTDENSITY], 1/3.0 );
m_R2 = m_Param [SPH_SMOOTHRADIUS] * m_Param[SPH_SMOOTHRADIUS];
m_Poly6Kern = 315.0f / (64.0f * 3.141592 * pow( m_Param[SPH_SMOOTHRADIUS], 9) ); // Wpoly6 kernel (denominator part) - 2003 Muller, p.4
m_SpikyKern = -45.0f / (3.141592 * pow( m_Param[SPH_SMOOTHRADIUS], 6) ); // Laplacian of viscocity (denominator): PI h^6
m_LapKern = 45.0f / (3.141592 * pow( m_Param[SPH_SMOOTHRADIUS], 6) );
}
void FluidSystem::SPH_CreateExample ( int n, int nmax )
{
Vector3DF pos;
Vector3DF min, max;
Reset ( nmax );
switch ( n ) {
case 0: // Wave pool
//-- TEST CASE: 2x2x2 grid, 32 particles. NOTE: Set PRADIUS to 0.0004 to reduce wall influence
// grid 0: 3*3*2 = 18 particles
// grid 1,2: 3*1*2 = 6 particles
// grid 3: 1*1*2 = 2 particles
// grid 4,5,6: 0 = 0 particles
/*m_Vec [ SPH_VOLMIN ].Set ( -2.5, -2.5, 0 );
m_Vec [ SPH_VOLMAX ].Set ( 2.5, 2.5, 5.0 );
m_Vec [ SPH_INITMIN ].Set ( -2.5, -2.5, 0 );
m_Vec [ SPH_INITMAX ].Set ( 2.5, 2.5, 1.6 );*/
m_Vec [ SPH_VOLMIN ].Set ( -30, -30, 0 );
m_Vec [ SPH_VOLMAX ].Set ( 30, 30, 40 );
//m_Vec [ SPH_INITMIN ].Set ( -5, -5, 10 );
//m_Vec [ SPH_INITMAX ].Set ( 5, 5, 20 );
m_Vec [ SPH_INITMIN ].Set ( -20, -26, 10 );
m_Vec [ SPH_INITMAX ].Set ( 20, 26, 40 );
m_Param [ FORCE_XMIN_SIN ] = 12.0;
m_Param [ BOUND_ZMIN_SLOPE ] = 0.05;
break;
case 1: // Dam break
m_Vec [ SPH_VOLMIN ].Set ( -30, -14, 0 );
m_Vec [ SPH_VOLMAX ].Set ( 30, 14, 60 );
m_Vec [ SPH_INITMIN ].Set ( 0, -13, 0 );
m_Vec [ SPH_INITMAX ].Set ( 29, 13, 30 );
m_Vec [ PLANE_GRAV_DIR ].Set ( 0.0, 0, -9.8 );
break;
case 2: // Dual-Wave pool
m_Vec [ SPH_VOLMIN ].Set ( -60, -5, 0 );
m_Vec [ SPH_VOLMAX ].Set ( 60, 5, 50 );
m_Vec [ SPH_INITMIN ].Set ( -46, -5, 0 );
m_Vec [ SPH_INITMAX ].Set ( 46, 5, 15 );
m_Param [ FORCE_XMIN_SIN ] = 8.0;
m_Param [ FORCE_XMAX_SIN ] = 8.0;
m_Vec [ PLANE_GRAV_DIR ].Set ( 0.0, 0, -9.8 );
break;
case 3: // Swirl Stream
m_Vec [ SPH_VOLMIN ].Set ( -30, -30, 0 );
m_Vec [ SPH_VOLMAX ].Set ( 30, 30, 50 );
m_Vec [ SPH_INITMIN ].Set ( -30, -30, 0 );
m_Vec [ SPH_INITMAX ].Set ( 30, 30, 40 );
m_Vec [ EMIT_POS ].Set ( -20, -20, 22 );
m_Vec [ EMIT_RATE ].Set ( 1, 4, 0 );
m_Vec [ EMIT_ANG ].Set ( 0, 120, 1.5 );
m_Vec [ EMIT_DANG ].Set ( 0, 0, 0 );
m_Vec [ PLANE_GRAV_DIR ].Set ( 0.0, 0, -9.8 );
break;
case 4: // Shockwave
m_Vec [ SPH_VOLMIN ].Set ( -60, -15, 0 );
m_Vec [ SPH_VOLMAX ].Set ( 60, 15, 50 );
m_Vec [ SPH_INITMIN ].Set ( -59, -14, 0 );
m_Vec [ SPH_INITMAX ].Set ( 59, 14, 30 );
m_Vec [ PLANE_GRAV_DIR ].Set ( 0.0, 0, -9.8 );
m_Toggle [ WALL_BARRIER ] = true;
m_Toggle [ WRAP_X ] = true;
break;
case 5: // Zero gravity
m_Vec [ SPH_VOLMIN ].Set ( -40, -40, 0 );
m_Vec [ SPH_VOLMAX ].Set ( 40, 40, 50 );
m_Vec [ SPH_INITMIN ].Set ( -20, -20, 20 );
m_Vec [ SPH_INITMAX ].Set ( 20, 20, 40 );
m_Vec [ EMIT_POS ].Set ( -20, 0, 40 );
m_Vec [ EMIT_RATE ].Set ( 2, 1, 0 );
m_Vec [ EMIT_ANG ].Set ( 0, 120, 0.25 );
m_Vec [ PLANE_GRAV_DIR ].Set ( 0, 0, 0 );
m_Param [ SPH_INTSTIFF ] = 0.20;
break;
case 6: // Point gravity
m_Vec [ SPH_VOLMIN ].Set ( -40, -40, 0 );
m_Vec [ SPH_VOLMAX ].Set ( 40, 40, 50 );
m_Vec [ SPH_INITMIN ].Set ( -20, -20, 20 );
m_Vec [ SPH_INITMAX ].Set ( 20, 20, 40 );
m_Param [ SPH_INTSTIFF ] = 0.50;
m_Vec [ EMIT_POS ].Set ( -20, 20, 25 );
m_Vec [ EMIT_RATE ].Set ( 1, 4, 0 );
m_Vec [ EMIT_ANG ].Set ( -20, 100, 2.0 );
m_Vec [ POINT_GRAV_POS ].Set ( 0, 0, 25 );
m_Vec [ PLANE_GRAV_DIR ].Set ( 0, 0, 0 );
m_Param [ POINT_GRAV ] = 3.5;
break;
case 7: // Levy break
m_Vec [ SPH_VOLMIN ].Set ( -40, -40, 0 );
m_Vec [ SPH_VOLMAX ].Set ( 40, 40, 50 );
m_Vec [ SPH_INITMIN ].Set ( 10, -40, 0 );
m_Vec [ SPH_INITMAX ].Set ( 40, 40, 50 );
m_Vec [ EMIT_POS ].Set ( 34, 27, 16.6 );
m_Vec [ EMIT_RATE ].Set ( 2, 9, 0 );
m_Vec [ EMIT_ANG ].Set ( 118, 200, 1.0 );
m_Toggle [ LEVY_BARRIER ] = true;
m_Param [ BOUND_ZMIN_SLOPE ] = 0.1;
m_Vec [ PLANE_GRAV_DIR ].Set ( 0.0, 0, -9.8 );
break;
case 8: // Drain
m_Vec [ SPH_VOLMIN ].Set ( -20, -20, 0 );
m_Vec [ SPH_VOLMAX ].Set ( 20, 20, 50 );
m_Vec [ SPH_INITMIN ].Set ( -15, -20, 20 );
m_Vec [ SPH_INITMAX ].Set ( 20, 20, 50 );
m_Vec [ EMIT_POS ].Set ( -16, -16, 30 );
m_Vec [ EMIT_RATE ].Set ( 1, 4, 0 );
m_Vec [ EMIT_ANG ].Set ( -20, 140, 1.8 );
m_Toggle [ DRAIN_BARRIER ] = true;
m_Vec [ PLANE_GRAV_DIR ].Set ( 0.0, 0, -9.8 );
break;
case 9: // Tumbler
m_Vec [ SPH_VOLMIN ].Set ( -30, -30, 0 );
m_Vec [ SPH_VOLMAX ].Set ( 30, 30, 50 );
m_Vec [ SPH_INITMIN ].Set ( 24, -29, 20 );
m_Vec [ SPH_INITMAX ].Set ( 29, 29, 40 );
m_Param [ SPH_VISC ] = 0.1;
m_Param [ SPH_INTSTIFF ] = 0.50;
m_Param [ SPH_EXTSTIFF ] = 8000;
//m_Param [ SPH_SMOOTHRADIUS ] = 0.01;
m_Param [ BOUND_ZMIN_SLOPE ] = 0.4;
m_Param [ FORCE_XMIN_SIN ] = 12.00;
m_Vec [ PLANE_GRAV_DIR ].Set ( 0.0, 0, -9.8 );
break;
case 10: // Large sim
m_Vec [ SPH_VOLMIN ].Set ( -35, -35, 0 );
m_Vec [ SPH_VOLMAX ].Set ( 35, 35, 60 );
m_Vec [ SPH_INITMIN ].Set ( -5, -35, 0 );
m_Vec [ SPH_INITMAX ].Set ( 30, 0, 60 );
m_Vec [ PLANE_GRAV_DIR ].Set ( 0.0, 0, -9.8 );
break;
}
SPH_ComputeKernels ();
m_Param [ SPH_SIMSIZE ] = m_Param [ SPH_SIMSCALE ] * (m_Vec[SPH_VOLMAX].z - m_Vec[SPH_VOLMIN].z);
m_Param [ SPH_PDIST ] = pow ( m_Param[SPH_PMASS] / m_Param[SPH_RESTDENSITY], 1/3.0 );
float ss = m_Param [ SPH_PDIST ]*0.87 / m_Param[ SPH_SIMSCALE ];
printf ( "Spacing: %f\n", ss);
AddVolume ( m_Vec[SPH_INITMIN], m_Vec[SPH_INITMAX], ss ); // Create the particles
float cell_size = m_Param[SPH_SMOOTHRADIUS]*2.0; // Grid cell size (2r)
Grid_Setup ( m_Vec[SPH_VOLMIN], m_Vec[SPH_VOLMAX], m_Param[SPH_SIMSCALE], cell_size, 1.0 ); // Setup grid
Grid_InsertParticles (); // Insert particles
Vector3DF vmin, vmax;
vmin = m_Vec[SPH_VOLMIN];
vmin -= Vector3DF(2,2,2);
vmax = m_Vec[SPH_VOLMAX];
vmax += Vector3DF(2,2,-2);
#ifdef BUILD_CUDA
FluidClearCUDA ();
Sleep ( 500 );
FluidSetupCUDA ( NumPoints(), sizeof(Fluid), *(float3*)& m_GridMin, *(float3*)& m_GridMax, *(float3*)& m_GridRes, *(float3*)& m_GridSize, (int) m_Vec[EMIT_RATE].x );
Sleep ( 500 );
FluidParamCUDA ( m_Param[SPH_SIMSCALE], m_Param[SPH_SMOOTHRADIUS], m_Param[SPH_PMASS], m_Param[SPH_RESTDENSITY], m_Param[SPH_INTSTIFF], m_Param[SPH_VISC] );
#endif
}
// Compute Pressures - Very slow yet simple. O(n^2)
void FluidSystem::SPH_ComputePressureSlow ()
{
char *dat1, *dat1_end;
char *dat2, *dat2_end;
Fluid *p, *q;
int cnt = 0;
double dx, dy, dz, sum, dsq, c;
double d, d2, mR, mR2;
d = m_Param[SPH_SIMSCALE];
d2 = d*d;
mR = m_Param[SPH_SMOOTHRADIUS];
mR2 = mR*mR;
dat1_end = mBuf[0].data + NumPoints()*mBuf[0].stride;
for ( dat1 = mBuf[0].data; dat1 < dat1_end; dat1 += mBuf[0].stride ) {
p = (Fluid*) dat1;
sum = 0.0;
cnt = 0;
dat2_end = mBuf[0].data + NumPoints()*mBuf[0].stride;
for ( dat2 = mBuf[0].data; dat2 < dat2_end; dat2 += mBuf[0].stride ) {
q = (Fluid*) dat2;
if ( p==q ) continue;
dx = ( p->pos.x - q->pos.x)*d; // dist in cm
dy = ( p->pos.y - q->pos.y)*d;
dz = ( p->pos.z - q->pos.z)*d;
dsq = (dx*dx + dy*dy + dz*dz);
if ( mR2 > dsq ) {
c = m_R2 - dsq;
sum += c * c * c;
cnt++;
//if ( p == m_CurrP ) q->tag = true;
}
}
p->density = sum * m_Param[SPH_PMASS] * m_Poly6Kern ;
p->pressure = ( p->density - m_Param[SPH_RESTDENSITY] ) * m_Param[SPH_INTSTIFF];
p->density = 1.0f / p->density;
}
}
// Compute Pressures - Using spatial grid, and also create neighbor table
void FluidSystem::SPH_ComputePressureGrid ()
{
char *dat1, *dat1_end;
Fluid* p;
Fluid* pcurr;
int pndx;
int i, cnt = 0;
float dx, dy, dz, sum, dsq, c;
float d, d2, mR, mR2;
float radius = m_Param[SPH_SMOOTHRADIUS] / m_Param[SPH_SIMSCALE];
d = m_Param[SPH_SIMSCALE];
d2 = d*d;
mR = m_Param[SPH_SMOOTHRADIUS];
mR2 = mR*mR;
dat1_end = mBuf[0].data + NumPoints()*mBuf[0].stride;
i = 0;
for ( dat1 = mBuf[0].data; dat1 < dat1_end; dat1 += mBuf[0].stride, i++ ) {
p = (Fluid*) dat1;
sum = 0.0;
m_NC[i] = 0;
Grid_FindCells ( p->pos, radius );
for (int cell=0; cell < 8; cell++) {
if ( m_GridCell[cell] != -1 ) {
pndx = m_Grid [ m_GridCell[cell] ];
while ( pndx != -1 ) {
pcurr = (Fluid*) (mBuf[0].data + pndx*mBuf[0].stride);
if ( pcurr == p ) {pndx = pcurr->next; continue; }
dx = ( p->pos.x - pcurr->pos.x)*d; // dist in cm
dy = ( p->pos.y - pcurr->pos.y)*d;
dz = ( p->pos.z - pcurr->pos.z)*d;
dsq = (dx*dx + dy*dy + dz*dz);
if ( mR2 > dsq ) {
c = m_R2 - dsq;
sum += c * c * c;
if ( m_NC[i] < MAX_NEIGHBOR ) {
m_Neighbor[i][ m_NC[i] ] = pndx;
m_NDist[i][ m_NC[i] ] = sqrt(dsq);
m_NC[i]++;
}
}
pndx = pcurr->next;
}
}
m_GridCell[cell] = -1;
}
p->density = sum * m_Param[SPH_PMASS] * m_Poly6Kern ;
p->pressure = ( p->density - m_Param[SPH_RESTDENSITY] ) * m_Param[SPH_INTSTIFF];
p->density = 1.0f / p->density;
}
}
// Compute Forces - Very slow, but simple. O(n^2)
void FluidSystem::SPH_ComputeForceSlow ()
{
char *dat1, *dat1_end;
char *dat2, *dat2_end;
Fluid *p, *q;
Vector3DF force, fcurr;
register double pterm, vterm, dterm;
double c, r, d, sum, dsq;
double dx, dy, dz;
double mR, mR2, visc;
d = m_Param[SPH_SIMSCALE];
mR = m_Param[SPH_SMOOTHRADIUS];
mR2 = (mR*mR);
visc = m_Param[SPH_VISC];
vterm = m_LapKern * visc;
dat1_end = mBuf[0].data + NumPoints()*mBuf[0].stride;
for ( dat1 = mBuf[0].data; dat1 < dat1_end; dat1 += mBuf[0].stride ) {
p = (Fluid*) dat1;
sum = 0.0;
force.Set ( 0, 0, 0 );
dat2_end = mBuf[0].data + NumPoints()*mBuf[0].stride;
for ( dat2 = mBuf[0].data; dat2 < dat2_end; dat2 += mBuf[0].stride ) {
q = (Fluid*) dat2;
if ( p == q ) continue;
dx = ( p->pos.x - q->pos.x )*d; // dist in cm
dy = ( p->pos.y - q->pos.y )*d;
dz = ( p->pos.z - q->pos.z )*d;
dsq = (dx*dx + dy*dy + dz*dz);
if ( mR2 > dsq ) {
r = sqrt ( dsq );
c = (mR - r);
pterm = -0.5f * c * m_SpikyKern * ( p->pressure + q->pressure) / r;
dterm = c * p->density * q->density;
force.x += ( pterm * dx + vterm * (q->vel_eval.x - p->vel_eval.x) ) * dterm;
force.y += ( pterm * dy + vterm * (q->vel_eval.y - p->vel_eval.y) ) * dterm;
force.z += ( pterm * dz + vterm * (q->vel_eval.z - p->vel_eval.z) ) * dterm;
}
}
p->sph_force = force;
}
}
// Compute Forces - Using spatial grid. Faster.
void FluidSystem::SPH_ComputeForceGrid ()
{
char *dat1, *dat1_end;
Fluid *p;
Fluid *pcurr;
int pndx;
Vector3DF force, fcurr;
register double pterm, vterm, dterm;
double c, d, dsq, r;
double dx, dy, dz;
double mR, mR2, visc;
float radius = m_Param[SPH_SMOOTHRADIUS] / m_Param[SPH_SIMSCALE];
d = m_Param[SPH_SIMSCALE];
mR = m_Param[SPH_SMOOTHRADIUS];
mR2 = (mR*mR);
visc = m_Param[SPH_VISC];
dat1_end = mBuf[0].data + NumPoints()*mBuf[0].stride;
for ( dat1 = mBuf[0].data; dat1 < dat1_end; dat1 += mBuf[0].stride ) {
p = (Fluid*) dat1;
force.Set ( 0, 0, 0 );
Grid_FindCells ( p->pos, radius );
for (int cell=0; cell < 8; cell++) {
if ( m_GridCell[cell] != -1 ) {
pndx = m_Grid [ m_GridCell[cell] ];
while ( pndx != -1 ) {
pcurr = (Fluid*) (mBuf[0].data + pndx*mBuf[0].stride);
if ( pcurr == p ) {pndx = pcurr->next; continue; }
dx = ( p->pos.x - pcurr->pos.x)*d; // dist in cm
dy = ( p->pos.y - pcurr->pos.y)*d;
dz = ( p->pos.z - pcurr->pos.z)*d;
dsq = (dx*dx + dy*dy + dz*dz);
if ( mR2 > dsq ) {
r = sqrt ( dsq );
c = (mR - r);
pterm = -0.5f * c * m_SpikyKern * ( p->pressure + pcurr->pressure) / r;
dterm = c * p->density * pcurr->density;
vterm = m_LapKern * visc;
force.x += ( pterm * dx + vterm * (pcurr->vel_eval.x - p->vel_eval.x) ) * dterm;
force.y += ( pterm * dy + vterm * (pcurr->vel_eval.y - p->vel_eval.y) ) * dterm;
force.z += ( pterm * dz + vterm * (pcurr->vel_eval.z - p->vel_eval.z) ) * dterm;
}
pndx = pcurr->next;
}
}
}
p->sph_force = force;
}
}
// Compute Forces - Using spatial grid with saved neighbor table. Fastest.
void FluidSystem::SPH_ComputeForceGridNC ()
{
char *dat1, *dat1_end;
Fluid *p;
Fluid *pcurr;
Vector3DF force, fcurr;
register float pterm, vterm, dterm;
int i;
float c, d;
float dx, dy, dz;
float mR, mR2, visc;
d = m_Param[SPH_SIMSCALE];
mR = m_Param[SPH_SMOOTHRADIUS];
mR2 = (mR*mR);
visc = m_Param[SPH_VISC];
dat1_end = mBuf[0].data + NumPoints()*mBuf[0].stride;
i = 0;
for ( dat1 = mBuf[0].data; dat1 < dat1_end; dat1 += mBuf[0].stride, i++ ) {
p = (Fluid*) dat1;
force.Set ( 0, 0, 0 );
for (int j=0; j < m_NC[i]; j++ ) {
pcurr = (Fluid*) (mBuf[0].data + m_Neighbor[i][j]*mBuf[0].stride);
dx = ( p->pos.x - pcurr->pos.x)*d; // dist in cm
dy = ( p->pos.y - pcurr->pos.y)*d;
dz = ( p->pos.z - pcurr->pos.z)*d;
c = ( mR - m_NDist[i][j] );
pterm = -0.5f * c * m_SpikyKern * ( p->pressure + pcurr->pressure) / m_NDist[i][j];
dterm = c * p->density * pcurr->density;
vterm = m_LapKern * visc;
force.x += ( pterm * dx + vterm * (pcurr->vel_eval.x - p->vel_eval.x) ) * dterm;
force.y += ( pterm * dy + vterm * (pcurr->vel_eval.y - p->vel_eval.y) ) * dterm;
force.z += ( pterm * dz + vterm * (pcurr->vel_eval.z - p->vel_eval.z) ) * dterm;
}
p->sph_force = force;
}
}

View File

@@ -0,0 +1,71 @@
/*
FLUIDS v.1 - SPH Fluid Simulator for CPU and GPU
Copyright (C) 2009. Rama Hoetzlein, http://www.rchoetzlein.com
ZLib license
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#include <cutil.h>
#include <cstdlib>
#include <cstdio>
#include <string.h>
#if defined(__APPLE__) || defined(MACOSX)
#include <GLUT/glut.h>
#else
#include <GL/glut.h>
#endif
#include <cuda_gl_interop.h>
#include "fluid_system_kern.cu"
extern "C"
{
// Compute number of blocks to create
int iDivUp (int a, int b) {
return (a % b != 0) ? (a / b + 1) : (a / b);
}
void computeNumBlocks (int numPnts, int minThreads, int &numBlocks, int &numThreads)
{
numThreads = min( minThreads, numPnts );
numBlocks = iDivUp ( numPnts, numThreads );
}
void Grid_InsertParticlesCUDA ( uchar* data, uint stride, uint numPoints )
{
int numThreads, numBlocks;
computeNumBlocks (numPoints, 256, numBlocks, numThreads);
// transfer point data to device
char* pntData;
size = numPoints * stride;
cudaMalloc( (void**) &pntData, size);
cudaMemcpy( pntData, data, size, cudaMemcpyHostToDevice);
// execute the kernel
insertParticles<<< numBlocks, numThreads >>> ( pntData, stride );
// transfer data back to host
cudaMemcpy( data, pntData, cudaMemcpyDeviceToHost);
// check if kernel invocation generated an error
CUT_CHECK_ERROR("Kernel execution failed");
CUDA_SAFE_CALL(cudaGLUnmapBufferObject(vboPos));
}

View File

@@ -0,0 +1,106 @@
/*
FLUIDS v.1 - SPH Fluid Simulator for CPU and GPU
Copyright (C) 2008. Rama Hoetzlein, http://www.rchoetzlein.com
ZLib license
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#ifndef DEF_FLUID_SYS
#define DEF_FLUID_SYS
#include <iostream>
#include <vector>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "point_set.h"
#include "fluid.h"
// Scalar params
#define SPH_SIMSIZE 4
#define SPH_SIMSCALE 5
#define SPH_VISC 6
#define SPH_RESTDENSITY 7
#define SPH_PMASS 8
#define SPH_PRADIUS 9
#define SPH_PDIST 10
#define SPH_SMOOTHRADIUS 11
#define SPH_INTSTIFF 12
#define SPH_EXTSTIFF 13
#define SPH_EXTDAMP 14
#define SPH_LIMIT 15
#define BOUND_ZMIN_SLOPE 16
#define FORCE_XMAX_SIN 17
#define FORCE_XMIN_SIN 18
#define MAX_FRAC 19
#define CLR_MODE 20
// Vector params
#define SPH_VOLMIN 7
#define SPH_VOLMAX 8
#define SPH_INITMIN 9
#define SPH_INITMAX 10
// Toggles
#define SPH_GRID 0
#define SPH_DEBUG 1
#define WRAP_X 2
#define WALL_BARRIER 3
#define LEVY_BARRIER 4
#define DRAIN_BARRIER 5
#define USE_CUDA 6
#define MAX_PARAM 21
#define BFLUID 2
class FluidSystem : public PointSet {
public:
FluidSystem ();
// Basic Particle System
virtual void Initialize ( int mode, int nmax );
virtual void Reset ( int nmax );
virtual void Run ();
virtual void Advance ();
virtual int AddPoint ();
virtual int AddPointReuse ();
Fluid* AddFluid () { return (Fluid*) GetElem(0, AddPointReuse()); }
Fluid* GetFluid (int n) { return (Fluid*) GetElem(0, n); }
// Smoothed Particle Hydrodynamics
void SPH_Setup ();
void SPH_CreateExample ( int n, int nmax );
void SPH_DrawDomain ();
void SPH_ComputeKernels ();
void SPH_ComputePressureSlow (); // O(n^2)
void SPH_ComputePressureGrid (); // O(kn) - spatial grid
void SPH_ComputeForceSlow (); // O(n^2)
void SPH_ComputeForceGrid (); // O(kn) - spatial grid
void SPH_ComputeForceGridNC (); // O(cn) - neighbor table
private:
// Smoothed Particle Hydrodynamics
double m_R2, m_Poly6Kern, m_LapKern, m_SpikyKern; // Kernel functions
};
#endif

View File

@@ -0,0 +1,247 @@
/*
FLUIDS v.1 - SPH Fluid Simulator for CPU and GPU
Copyright (C) 2008. Rama Hoetzlein, http://www.rchoetzlein.com
ZLib license
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#include "C:\CUDA\common\inc\cutil.h" // cutil32.lib
#include <string.h>
#if defined(__APPLE__) || defined(MACOSX)
#include <GLUT/glut.h>
#else
#include <GL/glut.h>
#endif
#include <cuda_gl_interop.h>
#include "radixsort.cu"
#include "fluid_system_kern.cu" // build kernel
FluidParams fcuda;
__device__ char* bufPnts; // point data (array of Fluid structs)
__device__ char* bufPntSort; // point data (array of Fluid structs)
__device__ uint* bufHash[2]; // point grid hash
__device__ int* bufGrid;
extern "C"
{
// Initialize CUDA
void cudaInit(int argc, char **argv)
{
CUT_DEVICE_INIT(argc, argv);
cudaDeviceProp p;
cudaGetDeviceProperties ( &p, 0);
printf ( "-- CUDA --\n" );
printf ( "Name: %s\n", p.name );
printf ( "Revision: %d.%d\n", p.major, p.minor );
printf ( "Global Mem: %d\n", p.totalGlobalMem );
printf ( "Shared/Blk: %d\n", p.sharedMemPerBlock );
printf ( "Regs/Blk: %d\n", p.regsPerBlock );
printf ( "Warp Size: %d\n", p.warpSize );
printf ( "Mem Pitch: %d\n", p.memPitch );
printf ( "Thrds/Blk: %d\n", p.maxThreadsPerBlock );
printf ( "Const Mem: %d\n", p.totalConstMem );
printf ( "Clock Rate: %d\n", p.clockRate );
CUDA_SAFE_CALL ( cudaMalloc ( (void**) &bufPnts, 10 ) );
CUDA_SAFE_CALL ( cudaMalloc ( (void**) &bufPntSort, 10 ) );
CUDA_SAFE_CALL ( cudaMalloc ( (void**) &bufHash, 10 ) );
CUDA_SAFE_CALL ( cudaMalloc ( (void**) &bufGrid, 10 ) );
};
// Compute number of blocks to create
int iDivUp (int a, int b) {
return (a % b != 0) ? (a / b + 1) : (a / b);
}
void computeNumBlocks (int numPnts, int maxThreads, int &numBlocks, int &numThreads)
{
numThreads = min( maxThreads, numPnts );
numBlocks = iDivUp ( numPnts, numThreads );
}
void FluidClearCUDA ()
{
CUDA_SAFE_CALL ( cudaFree ( bufPnts ) );
CUDA_SAFE_CALL ( cudaFree ( bufPntSort ) );
CUDA_SAFE_CALL ( cudaFree ( bufHash[0] ) );
CUDA_SAFE_CALL ( cudaFree ( bufHash[1] ) );
CUDA_SAFE_CALL ( cudaFree ( bufGrid ) );
}
void FluidSetupCUDA ( int num, int stride, float3 min, float3 max, float3 res, float3 size, int chk )
{
fcuda.min = make_float3(min.x, min.y, min.z);
fcuda.max = make_float3(max.x, max.y, max.z);
fcuda.res = make_float3(res.x, res.y, res.z);
fcuda.size = make_float3(size.x, size.y, size.z);
fcuda.pnts = num;
fcuda.delta.x = res.x / size.x;
fcuda.delta.y = res.y / size.y;
fcuda.delta.z = res.z / size.z;
fcuda.cells = res.x*res.y*res.z;
fcuda.chk = chk;
computeNumBlocks ( fcuda.pnts, 256, fcuda.numBlocks, fcuda.numThreads); // particles
computeNumBlocks ( fcuda.cells, 256, fcuda.gridBlocks, fcuda.gridThreads); // grid cell
fcuda.szPnts = (fcuda.numBlocks * fcuda.numThreads) * stride;
fcuda.szHash = (fcuda.numBlocks * fcuda.numThreads) * sizeof(uint2); // <cell, particle> pairs
fcuda.szGrid = (fcuda.gridBlocks * fcuda.gridThreads) * sizeof(uint);
fcuda.stride = stride;
printf ( "pnts: %d, t:%dx%d=%d, bufPnts:%d, bufHash:%d\n", fcuda.pnts, fcuda.numBlocks, fcuda.numThreads, fcuda.numBlocks*fcuda.numThreads, fcuda.szPnts, fcuda.szHash );
printf ( "grds: %d, t:%dx%d=%d, bufGrid:%d, Res: %dx%dx%d\n", fcuda.cells, fcuda.gridBlocks, fcuda.gridThreads, fcuda.gridBlocks*fcuda.gridThreads, fcuda.szGrid, (int) fcuda.res.x, (int) fcuda.res.y, (int) fcuda.res.z );
CUDA_SAFE_CALL ( cudaMalloc ( (void**) &bufPnts, fcuda.szPnts ) );
CUDA_SAFE_CALL ( cudaMalloc ( (void**) &bufPntSort, fcuda.szPnts ) );
CUDA_SAFE_CALL ( cudaMalloc ( (void**) &bufHash[0], fcuda.szHash ) );
CUDA_SAFE_CALL ( cudaMalloc ( (void**) &bufHash[1], fcuda.szHash ) );
CUDA_SAFE_CALL ( cudaMalloc ( (void**) &bufGrid, fcuda.szGrid ) );
printf ( "POINTERS\n");
printf ( "bufPnts: %p\n", bufPnts );
printf ( "bufPntSort: %p\n", bufPntSort );
printf ( "bufHash0: %p\n", bufHash[0] );
printf ( "bufHash1: %p\n", bufHash[1] );
printf ( "bufGrid: %p\n", bufGrid );
CUDA_SAFE_CALL ( cudaMemcpyToSymbol ( simData, &fcuda, sizeof(FluidParams) ) );
cudaThreadSynchronize ();
}
void FluidParamCUDA ( float sim_scale, float smooth_rad, float mass, float rest, float stiff, float visc )
{
fcuda.sim_scale = sim_scale;
fcuda.smooth_rad = smooth_rad;
fcuda.r2 = smooth_rad * smooth_rad;
fcuda.pmass = mass;
fcuda.rest_dens = rest;
fcuda.stiffness = stiff;
fcuda.visc = visc;
fcuda.pdist = pow ( fcuda.pmass / fcuda.rest_dens, 1/3.0f );
fcuda.poly6kern = 315.0f / (64.0f * 3.141592 * pow( smooth_rad, 9.0f) );
fcuda.spikykern = -45.0f / (3.141592 * pow( smooth_rad, 6.0f) );
fcuda.lapkern = 45.0f / (3.141592 * pow( smooth_rad, 6.0f) );
CUDA_SAFE_CALL( cudaMemcpyToSymbol ( simData, &fcuda, sizeof(FluidParams) ) );
cudaThreadSynchronize ();
}
void TransferToCUDA ( char* data, int* grid, int numPoints )
{
CUDA_SAFE_CALL( cudaMemcpy ( bufPnts, data, numPoints * fcuda.stride, cudaMemcpyHostToDevice ) );
cudaThreadSynchronize ();
}
void TransferFromCUDA ( char* data, int* grid, int numPoints )
{
CUDA_SAFE_CALL( cudaMemcpy ( data, bufPntSort, numPoints * fcuda.stride, cudaMemcpyDeviceToHost ) );
cudaThreadSynchronize ();
CUDA_SAFE_CALL( cudaMemcpy ( grid, bufGrid, fcuda.cells * sizeof(uint), cudaMemcpyDeviceToHost ) );
}
void Grid_InsertParticlesCUDA ()
{
CUDA_SAFE_CALL( cudaMemset ( bufHash[0], 0, fcuda.szHash ) );
hashParticles<<< fcuda.numBlocks, fcuda.numThreads>>> ( bufPnts, (uint2*) bufHash[0], fcuda.pnts );
CUT_CHECK_ERROR( "Kernel execution failed");
cudaThreadSynchronize ();
//int buf[20000];
/*printf ( "HASH: %d (%d)\n", fcuda.pnts, fcuda.numBlocks*fcuda.numThreads );
CUDA_SAFE_CALL( cudaMemcpy ( buf, bufHash[0], fcuda.pnts * 2*sizeof(uint), cudaMemcpyDeviceToHost ) );
//for (int n=0; n < fcuda.numBlocks*fcuda.numThreads; n++) {
for (int n=0; n < 100; n++) {
printf ( "%d: <%d,%d>\n", n, buf[n*2], buf[n*2+1] );
}*/
RadixSort( (KeyValuePair *) bufHash[0], (KeyValuePair *) bufHash[1], fcuda.pnts, 32);
CUT_CHECK_ERROR( "Kernel execution failed");
cudaThreadSynchronize ();
/*printf ( "HASH: %d (%d)\n", fcuda.pnts, fcuda.numBlocks*fcuda.numThreads );
CUDA_SAFE_CALL( cudaMemcpy ( buf, bufHash[0], fcuda.pnts * 2*sizeof(uint), cudaMemcpyDeviceToHost ) );
//for (int n=0; n < fcuda.numBlocks*fcuda.numThreads; n++) {
for (int n=0; n < 100; n++) {
printf ( "%d: <%d,%d>\n", n, buf[n*2], buf[n*2+1] );
}*/
// insertParticles<<< fcuda.gridBlocks, fcuda.gridThreads>>> ( bufPnts, (uint2*) bufHash[0], bufGrid, fcuda.pnts, fcuda.cells );
CUDA_SAFE_CALL( cudaMemset ( bufGrid, NULL_HASH, fcuda.cells * sizeof(uint) ) );
insertParticlesRadix<<< fcuda.numBlocks, fcuda.numThreads>>> ( bufPnts, (uint2*) bufHash[0], bufGrid, bufPntSort, fcuda.pnts, fcuda.cells );
CUT_CHECK_ERROR( "Kernel execution failed");
cudaThreadSynchronize ();
/*printf ( "GRID: %d\n", fcuda.cells );
CUDA_SAFE_CALL( cudaMemcpy ( buf, bufGrid, fcuda.cells * sizeof(uint), cudaMemcpyDeviceToHost ) );
*for (int n=0; n < 100; n++) {
printf ( "%d: %d\n", n, buf[n]);
}*/
}
void SPH_ComputePressureCUDA ()
{
computePressure<<< fcuda.numBlocks, fcuda.numThreads>>> ( bufPntSort, bufGrid, (uint2*) bufHash[0], fcuda.pnts );
CUT_CHECK_ERROR( "Kernel execution failed");
cudaThreadSynchronize ();
}
void SPH_ComputeForceCUDA ()
{
//-- standard force
//computeForce<<< fcuda.numBlocks, fcuda.numThreads>>> ( bufPntSort, bufGrid, (uint2*) bufHash[0], fcuda.pnts );
// Force using neighbor table
computeForceNbr<<< fcuda.numBlocks, fcuda.numThreads>>> ( bufPntSort, fcuda.pnts );
CUT_CHECK_ERROR( "Kernel execution failed");
cudaThreadSynchronize ();
}
void SPH_AdvanceCUDA ( float dt, float ss )
{
advanceParticles<<< fcuda.numBlocks, fcuda.numThreads>>> ( bufPntSort, fcuda.pnts, dt, ss );
CUT_CHECK_ERROR( "Kernel execution failed");
cudaThreadSynchronize ();
}
} // extern C
//----------- Per frame: Malloc/Free, Host<->Device
// transfer point data to device
/*char* pntData;
int size = (fcuda.numBlocks*fcuda.numThreads) * stride;
cudaMalloc( (void**) &pntData, size);
cudaMemcpy( pntData, data, numPoints*stride, cudaMemcpyHostToDevice);
insertParticles<<< fcuda.numBlocks, fcuda.numThreads >>> ( pntData, stride, numPoints );
cudaMemcpy( data, pntData, numPoints*stride, cudaMemcpyDeviceToHost);
cudaFree( pntData );*/

View File

@@ -0,0 +1,63 @@
/*
FLUIDS v.1 - SPH Fluid Simulator for CPU and GPU
Copyright (C) 2008. Rama Hoetzlein, http://www.rchoetzlein.com
ZLib license
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#include <vector_types.h>
#include <driver_types.h> // for cudaStream_t
typedef unsigned int uint; // should be 4-bytes on CUDA
typedef unsigned char uchar; // should be 1-bytes on CUDA
struct FluidParams {
int numThreads, numBlocks;
int gridThreads, gridBlocks;
int szPnts, szHash, szGrid;
int stride, pnts, cells;
int chk;
float smooth_rad, r2, sim_scale, visc;
float3 min, max, res, size, delta;
float pdist, pmass, rest_dens, stiffness;
float poly6kern, spikykern, lapkern;
};
extern "C"
{
void cudaInit(int argc, char **argv);
void FluidClearCUDA ();
void FluidSetupCUDA ( int num, int stride, float3 min, float3 max, float3 res, float3 size, int chk );
void FluidParamCUDA ( float sim_scale, float smooth_rad, float mass, float rest, float stiff, float visc );
void TransferToCUDA ( char* data, int* grid, int numPoints );
void TransferFromCUDA ( char* data, int* grid, int numPoints );
void Grid_InsertParticlesCUDA ();
void SPH_ComputePressureCUDA ();
void SPH_ComputeForceCUDA ();
void SPH_AdvanceCUDA ( float dt, float ss );
}

View File

@@ -0,0 +1,402 @@
/*
FLUIDS v.1 - SPH Fluid Simulator for CPU and GPU
Copyright (C) 2008. Rama Hoetzlein, http://www.rchoetzlein.com
ZLib license
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#ifndef _PARTICLES_KERNEL_H_
#define _PARTICLES_KERNEL_H_
#include <stdio.h>
#include <math.h>
#include "fluid_system_host.cuh"
#define TOTAL_THREADS 65536
#define BLOCK_THREADS 256
#define MAX_NBR 80
__constant__ FluidParams simData; // simulation data (on device)
__device__ int bufNeighbor[ TOTAL_THREADS*MAX_NBR ];
__device__ float bufNdist[ TOTAL_THREADS*MAX_NBR ];
#define COLOR(r,g,b) ( (uint((r)*255.0f)<<24) | (uint((g)*255.0f)<<16) | (uint((b)*255.0f)<<8) )
#define COLORA(r,g,b,a) ( (uint((r)*255.0f)<<24) | (uint((g)*255.0f)<<16) | (uint((b)*255.0f)<<8) | uint((a)*255.0f) )
#define NULL_HASH 333333
#define OFFSET_CLR 12
#define OFFSET_NEXT 16
#define OFFSET_VEL 20
#define OFFSET_VEVAL 32
#define OFFSET_PRESS 48
#define OFFSET_DENS 52
#define OFFSET_FORCE 56
__global__ void hashParticles ( char* bufPnts, uint2* bufHash, int numPnt )
{
uint ndx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x; // particle index
float3* pos = (float3*) (bufPnts + __mul24(ndx, simData.stride) );
int gz = (pos->z - simData.min.z) * simData.delta.z ;
int gy = (pos->y - simData.min.y) * simData.delta.y ;
int gx = (pos->x - simData.min.x) * simData.delta.x ;
if ( ndx >= numPnt || gx < 0 || gz > simData.res.x-1 || gy < 0 || gy > simData.res.y-1 || gz < 0 || gz > simData.res.z-1 )
bufHash[ndx] = make_uint2( NULL_HASH, ndx );
else
bufHash[ndx] = make_uint2( __mul24(__mul24(gz, (int) simData.res.y)+gy, (int) simData.res.x) + gx, ndx );
__syncthreads ();
}
__global__ void insertParticles ( char* bufPnts, uint2* bufHash, int* bufGrid, int numPnt, int numGrid )
{
uint grid_ndx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x; // grid cell index
bufPnts += OFFSET_NEXT;
bufGrid[grid_ndx] = -1;
for (int n=0; n < numPnt; n++) {
if ( bufHash[n].x == grid_ndx ) {
*(int*) (bufPnts + __mul24(bufHash[n].y, simData.stride)) = bufGrid[grid_ndx];
bufGrid[grid_ndx] = bufHash[n].y;
}
}
__syncthreads ();
}
__global__ void insertParticlesRadix ( char* bufPnts, uint2* bufHash, int* bufGrid, char* bufPntSort, int numPnt, int numGrid )
{
uint ndx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x; // particle index
uint2 bufHashSort = bufHash[ndx];
__shared__ uint sharedHash[257];
sharedHash[threadIdx.x+1] = bufHashSort.x;
if ( ndx > 0 && threadIdx.x == 0 ) {
volatile uint2 prevData = bufHash[ndx-1];
sharedHash[0] = prevData.x;
}
__syncthreads ();
if ( (ndx == 0 || bufHashSort.x != sharedHash[threadIdx.x]) && bufHashSort.x != NULL_HASH ) {
bufGrid [ bufHashSort.x ] = ndx;
}
if ( ndx < numPnt ) {
char* src = bufPnts + __mul24( bufHashSort.y, simData.stride );
char* dest = bufPntSort + __mul24( ndx, simData.stride );
*(float3*)(dest) = *(float3*)(src);
*(uint*) (dest + OFFSET_CLR) = *(uint*) (src + OFFSET_CLR);
*(float3*)(dest + OFFSET_VEL) = *(float3*)(src + OFFSET_VEL);
*(float3*)(dest + OFFSET_VEVAL) = *(float3*)(src + OFFSET_VEVAL);
*(float*) (dest + OFFSET_DENS) = 0.0;
*(float*) (dest + OFFSET_PRESS) = 0.0;
*(float3*) (dest + OFFSET_FORCE)= make_float3(0,0,0);
*(int*) (dest + OFFSET_NEXT) = bufHashSort.x;
}
__syncthreads ();
}
//__shared__ int ncount [ BLOCK_THREADS ];
__device__ float contributePressure ( int pndx, float3* p, int qndx, int grid_ndx, char* bufPnts, uint2* bufHash )
{
float3* qpos;
float3 dist;
float dsq, c, sum;
float d = simData.sim_scale;
int nbr = __mul24(pndx, MAX_NBR);
sum = 0.0;
for ( ; qndx < simData.pnts; qndx++ ) {
if ( bufHash[qndx].x != grid_ndx || qndx == NULL_HASH) break;
if ( qndx != pndx ) {
qpos = (float3*) ( bufPnts + __mul24(qndx, simData.stride ));
dist.x = ( p->x - qpos->x )*d; // dist in cm
dist.y = ( p->y - qpos->y )*d;
dist.z = ( p->z - qpos->z )*d;
dsq = (dist.x*dist.x + dist.y*dist.y + dist.z*dist.z);
if ( dsq < simData.r2 ) {
c = simData.r2 - dsq;
sum += c * c * c;
if ( bufNeighbor[nbr] < MAX_NBR ) {
bufNeighbor[ nbr+bufNeighbor[nbr] ] = qndx;
bufNdist[ nbr+bufNeighbor[nbr] ] = sqrt(dsq);
bufNeighbor[nbr]++;
}
}
}
//curr = *(int*) (bufPnts + __mul24(curr, simData.stride) + OFFSET_NEXT);
}
return sum;
}
/*if ( ncount[threadIdx.x] < MAX_NBR ) {
bufNeighbor [ nbr + ncount[threadIdx.x] ] = curr;
bufNdist [ nbr + ncount[threadIdx.x] ] = sqrt(dsq);
ncount[threadIdx.x]++;
}*/
__global__ void computePressure ( char* bufPntSort, int* bufGrid, uint2* bufHash, int numPnt )
{
uint ndx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x; // particle index
//if ( ndx < 1024 ) {
float3* pos = (float3*) (bufPntSort + __mul24(ndx, simData.stride));
// Find 2x2x2 grid cells
// - Use registers only, no arrays (local-memory too slow)
int3 cell;
int gc0, gc1, gc2, gc3, gc4, gc5, gc6, gc7;
float gs = simData.smooth_rad / simData.sim_scale;
cell.x = max(0, (int)((-gs + pos->x - simData.min.x) * simData.delta.x));
cell.y = max(0, (int)((-gs + pos->y - simData.min.y) * simData.delta.y));
cell.z = max(0, (int)((-gs + pos->z - simData.min.z) * simData.delta.z));
gc0 = __mul24(__mul24(cell.z, simData.res.y) + cell.y, simData.res.x) + cell.x;
gc1 = gc0 + 1;
gc2 = gc0 + simData.res.x;
gc3 = gc2 + 1;
if ( cell.z+1 < simData.res.z ) {
gc4 = gc0 + __mul24(simData.res.x, simData.res.y);
gc5 = gc4 + 1;
gc6 = gc4 + simData.res.x;
gc7 = gc6 + 1;
}
if ( cell.x+1 >= simData.res.x ) {
gc1 = -1; gc3 = -1;
gc5 = -1; gc7 = -1;
}
if ( cell.y+1 >= simData.res.y ) {
gc2 = -1; gc3 = -1;
gc6 = -1; gc7 = -1;
}
// Sum Pressure
float sum = 0.0;
bufNeighbor[ __mul24(ndx, MAX_NBR) ] = 1;
if (gc0 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc0], gc0, bufPntSort, bufHash );
if (gc1 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc1], gc1, bufPntSort, bufHash );
if (gc2 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc2], gc2, bufPntSort, bufHash );
if (gc3 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc3], gc3, bufPntSort, bufHash );
if (gc4 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc4], gc4, bufPntSort, bufHash );
if (gc5 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc5], gc5, bufPntSort, bufHash );
if (gc6 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc6], gc6, bufPntSort, bufHash );
if (gc7 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc7], gc7, bufPntSort, bufHash );
// Compute Density & Pressure
sum = sum * simData.pmass * simData.poly6kern;
if ( sum == 0.0 ) sum = 1.0;
*(float*) ((char*)pos + OFFSET_PRESS) = ( sum - simData.rest_dens ) * simData.stiffness;
*(float*) ((char*)pos + OFFSET_DENS) = 1.0f / sum;
//}
//__syncthreads ();
}
__device__ void contributeForce ( float3& force, int pndx, float3* p, int qndx, int grid_ndx, char* bufPnts, uint2* bufHash )
{
float press = *(float*) ((char*)p + OFFSET_PRESS);
float dens = *(float*) ((char*)p + OFFSET_DENS);
float3 veval = *(float3*) ((char*)p + OFFSET_VEVAL );
float3 qeval, dist;
float c, ndistj, dsq;
float pterm, dterm, vterm;
float3* qpos;
float d = simData.sim_scale;
vterm = simData.lapkern * simData.visc;
for ( ; qndx < simData.pnts; qndx++ ) {
if ( bufHash[qndx].x != grid_ndx || qndx == NULL_HASH) break;
if ( qndx != pndx ) {
qpos = (float3*) ( bufPnts + __mul24(qndx, simData.stride ));
dist.x = ( p->x - qpos->x )*d; // dist in cm
dist.y = ( p->y - qpos->y )*d;
dist.z = ( p->z - qpos->z )*d;
dsq = (dist.x*dist.x + dist.y*dist.y + dist.z*dist.z);
if ( dsq < simData.r2 ) {
ndistj = sqrt(dsq);
c = ( simData.smooth_rad - ndistj );
dist.x = ( p->x - qpos->x )*d; // dist in cm
dist.y = ( p->y - qpos->y )*d;
dist.z = ( p->z - qpos->z )*d;
pterm = -0.5f * c * simData.spikykern * ( press + *(float*)((char*)qpos+OFFSET_PRESS) ) / ndistj;
dterm = c * dens * *(float*)((char*)qpos+OFFSET_DENS);
qeval = *(float3*)((char*)qpos+OFFSET_VEVAL);
force.x += ( pterm * dist.x + vterm * ( qeval.x - veval.x )) * dterm;
force.y += ( pterm * dist.y + vterm * ( qeval.y - veval.y )) * dterm;
force.z += ( pterm * dist.z + vterm * ( qeval.z - veval.z )) * dterm;
}
}
}
}
__global__ void computeForce ( char* bufPntSort, int* bufGrid, uint2* bufHash, int numPnt )
{
uint ndx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x; // particle index
//if ( ndx < numPnt ) {
float3* pos = (float3*) (bufPntSort + __mul24(ndx, simData.stride));
// Find 2x2x2 grid cells
// - Use registers only, no arrays (local-memory too slow)
int3 cell;
int gc0, gc1, gc2, gc3, gc4, gc5, gc6, gc7;
float gs = simData.smooth_rad / simData.sim_scale;
cell.x = max(0, (int)((-gs + pos->x - simData.min.x) * simData.delta.x));
cell.y = max(0, (int)((-gs + pos->y - simData.min.y) * simData.delta.y));
cell.z = max(0, (int)((-gs + pos->z - simData.min.z) * simData.delta.z));
gc0 = __mul24(__mul24(cell.z, simData.res.y) + cell.y, simData.res.x) + cell.x;
gc1 = gc0 + 1;
gc2 = gc0 + simData.res.x;
gc3 = gc2 + 1;
if ( cell.z+1 < simData.res.z ) {
gc4 = gc0 + __mul24(simData.res.x, simData.res.y);
gc5 = gc4 + 1;
gc6 = gc4 + simData.res.x;
gc7 = gc6 + 1;
}
if ( cell.x+1 >= simData.res.x ) {
gc1 = -1; gc3 = -1;
gc5 = -1; gc7 = -1;
}
if ( cell.y+1 >= simData.res.y ) {
gc2 = -1; gc3 = -1;
gc6 = -1; gc7 = -1;
}
// Sum Pressure
float3 force = make_float3(0,0,0);
if (gc0 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc0], gc0, bufPntSort, bufHash );
if (gc1 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc1], gc1, bufPntSort, bufHash );
if (gc2 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc2], gc2, bufPntSort, bufHash );
if (gc3 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc3], gc3, bufPntSort, bufHash );
if (gc4 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc4], gc4, bufPntSort, bufHash );
if (gc5 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc5], gc5, bufPntSort, bufHash );
if (gc6 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc6], gc6, bufPntSort, bufHash );
if (gc7 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc7], gc7, bufPntSort, bufHash );
// Update Force
*(float3*) ((char*)pos + OFFSET_FORCE ) = force;
//}
//__syncthreads ();
}
__global__ void computeForceNbr ( char* bufPntSort, int numPnt )
{
uint ndx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x; // particle index
if ( ndx < numPnt ) {
float3* pos = (float3*) (bufPntSort + __mul24(ndx, simData.stride));
float3* qpos;
float press = *(float*) ((char*)pos + OFFSET_PRESS);
float dens = *(float*) ((char*)pos + OFFSET_DENS);
float3 veval = *(float3*) ((char*)pos + OFFSET_VEVAL );
float3 qeval, dist, force;
float d = simData.sim_scale;
float c, ndistj;
float pterm, dterm, vterm;
vterm = simData.lapkern * simData.visc;
int nbr = __mul24(ndx, MAX_NBR);
int ncnt = bufNeighbor[ nbr ];
force = make_float3(0,0,0);
for (int j=1; j < ncnt; j++) { // base 1, n[0] = count
ndistj = bufNdist[ nbr+j ];
qpos = (float3*) (bufPntSort + __mul24( bufNeighbor[ nbr+j ], simData.stride) );
c = ( simData.smooth_rad - ndistj );
dist.x = ( pos->x - qpos->x )*d; // dist in cm
dist.y = ( pos->y - qpos->y )*d;
dist.z = ( pos->z - qpos->z )*d;
pterm = -0.5f * c * simData.spikykern * ( press + *(float*)((char*)qpos+OFFSET_PRESS) ) / ndistj;
dterm = c * dens * *(float*)((char*)qpos+OFFSET_DENS);
qeval = *(float3*)((char*)qpos+OFFSET_VEVAL);
force.x += ( pterm * dist.x + vterm * ( qeval.x - veval.x )) * dterm;
force.y += ( pterm * dist.y + vterm * ( qeval.y - veval.y )) * dterm;
force.z += ( pterm * dist.z + vterm * ( qeval.z - veval.z )) * dterm;
}
*(float3*) ((char*)pos + OFFSET_FORCE ) = force;
}
}
__global__ void advanceParticles ( char* bufPntSort, int numPnt, float dt, float ss )
{
uint ndx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x; // particle index
if ( ndx < numPnt ) {
// Get particle vars
float3* pos = (float3*) (bufPntSort + __mul24(ndx, simData.stride));
float3* vel = (float3*) ((char*)pos + OFFSET_VEL );
float3* vel_eval = (float3*) ((char*)pos + OFFSET_VEVAL );
float3 accel = *(float3*) ((char*)pos + OFFSET_FORCE );
float3 vcurr, vnext;
// Leapfrog integration
accel.x *= 0.00020543; // NOTE - To do: SPH_PMASS should be passed in
accel.y *= 0.00020543;
accel.z *= 0.00020543;
accel.z -= 9.8;
vcurr = *vel;
vnext.x = accel.x*dt + vcurr.x;
vnext.y = accel.y*dt + vcurr.y;
vnext.z = accel.z*dt + vcurr.z; // v(t+1/2) = v(t-1/2) + a(t) dt
accel.x = (vcurr.x + vnext.x) * 0.5; // v(t+1) = [v(t-1/2) + v(t+1/2)] * 0.5 used to compute forces later
accel.y = (vcurr.y + vnext.y) * 0.5; // v(t+1) = [v(t-1/2) + v(t+1/2)] * 0.5 used to compute forces later
accel.z = (vcurr.z + vnext.z) * 0.5; // v(t+1) = [v(t-1/2) + v(t+1/2)] * 0.5 used to compute forces later
*vel_eval = accel;
*vel = vnext;
dt /= simData.sim_scale;
vnext.x = pos->x + vnext.x*dt;
vnext.y = pos->y + vnext.y*dt;
vnext.z = pos->z + vnext.z*dt;
*pos = vnext; // p(t+1) = p(t) + v(t+1/2) dt
}
__syncthreads ();
}
#endif

View File

@@ -0,0 +1,45 @@
/*
FLUIDS v.1 - SPH Fluid Simulator for CPU and GPU
Copyright (C) 2009. Rama Hoetzlein, http://www.rchoetzlein.com
ZLib license
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#ifndef _PARTICLES_KERNEL_H_
#define _PARTICLES_KERNEL_H_
#include <stdio.h>
#include <math.h>
#include "cutil_math.h"
#include "math_constants.h"
// Insert particles in grid
__global__ void insertParticles ( char* pntData, uint pntStride )
{
int index = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
float4 p = *(float4*) (pntData + index*pntStride);
// get address in grid
int3 gridPos = calcGridPos(p);
addParticleToCell(gridPos, index, gridCounters, gridCells);
}
#endif

View File

@@ -0,0 +1,79 @@
/*
* Copyright 1993-2006 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO USER:
*
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws.
*
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
* OR PERFORMANCE OF THIS SOURCE CODE.
*
* U.S. Government End Users. This source code is a "commercial item" as
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
* "commercial computer software" and "commercial computer software
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
* and is provided to the U.S. Government only as a commercial end item.
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
* source code with only those rights set forth herein.
*/
/* Radixsort project with key/value and arbitrary datset size support
* which demonstrates the use of CUDA in a multi phase sorting
* computation.
* Host code.
*/
#include "radixsort.cuh"
#include "radixsort_kernel.cu"
extern "C"
{
////////////////////////////////////////////////////////////////////////////////
//! Perform a radix sort
//! Sorting performed in place on passed arrays.
//!
//! @param pData0 input and output array - data will be sorted
//! @param pData1 additional array to allow ping pong computation
//! @param elements number of elements to sort
////////////////////////////////////////////////////////////////////////////////
void RadixSort(KeyValuePair *pData0, KeyValuePair *pData1, uint elements, uint bits)
{
// Round element count to total number of threads for efficiency
uint elements_rounded_to_3072;
int modval = elements % 3072;
if( modval == 0 )
elements_rounded_to_3072 = elements;
else
elements_rounded_to_3072 = elements + (3072 - (modval));
// Iterate over n bytes of y bit word, using each byte to sort the list in turn
for (uint shift = 0; shift < bits; shift += RADIX)
{
// Perform one round of radix sorting
// Generate per radix group sums radix counts across a radix group
RadixSum<<<NUM_BLOCKS, NUM_THREADS_PER_BLOCK, GRFSIZE>>>(pData0, elements, elements_rounded_to_3072, shift);
// Prefix sum in radix groups, and then between groups throughout a block
RadixPrefixSum<<<PREFIX_NUM_BLOCKS, PREFIX_NUM_THREADS_PER_BLOCK, PREFIX_GRFSIZE>>>();
// Sum the block offsets and then shuffle data into bins
RadixAddOffsetsAndShuffle<<<NUM_BLOCKS, NUM_THREADS_PER_BLOCK, SHUFFLE_GRFSIZE>>>(pData0, pData1, elements, elements_rounded_to_3072, shift);
// Exchange data pointers
KeyValuePair* pTemp = pData0;
pData0 = pData1;
pData1 = pTemp;
}
}
}

View File

@@ -0,0 +1,63 @@
/*
* Copyright 1993-2006 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO USER:
*
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws.
*
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
* OR PERFORMANCE OF THIS SOURCE CODE.
*
* U.S. Government End Users. This source code is a "commercial item" as
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
* "commercial computer software" and "commercial computer software
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
* and is provided to the U.S. Government only as a commercial end item.
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
* source code with only those rights set forth herein.
*/
/* Radixsort project which demonstrates the use of CUDA in a multi phase
* sorting computation.
* Type definitions.
*/
#ifndef _RADIXSORT_H_
#define _RADIXSORT_H_
#include <host_defines.h>
#define SYNCIT __syncthreads()
// Use 16 bit keys/values
#define SIXTEEN 0
typedef unsigned int uint;
typedef unsigned short ushort;
#if SIXTEEN
typedef struct __align__(4) {
ushort key;
ushort value;
#else
typedef struct __align__(8) {
uint key;
uint value;
#endif
} KeyValuePair;
extern "C" {
void RadixSort(KeyValuePair *pData0, KeyValuePair *pData1, uint elements, uint bits);
}
#endif // #ifndef _RADIXSORT_H_

View File

@@ -0,0 +1,577 @@
/*
* Copyright 1993-2006 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO USER:
*
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws.
*
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
* OR PERFORMANCE OF THIS SOURCE CODE.
*
* U.S. Government End Users. This source code is a "commercial item" as
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
* "commercial computer software" and "commercial computer software
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
* and is provided to the U.S. Government only as a commercial end item.
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
* source code with only those rights set forth herein.
*/
/* Radixsort project with key/value and arbitrary datset size support
* which demonstrates the use of CUDA in a multi phase sorting
* computation.
* Device code.
*/
#ifndef _RADIXSORT_KERNEL_H_
#define _RADIXSORT_KERNEL_H_
#include <stdio.h>
#include "radixsort.cuh"
#define SYNCIT __syncthreads()
static const int NUM_SMS = 16;
static const int NUM_THREADS_PER_SM = 192;
static const int NUM_THREADS_PER_BLOCK = 64;
//static const int NUM_THREADS = NUM_THREADS_PER_SM * NUM_SMS;
static const int NUM_BLOCKS = (NUM_THREADS_PER_SM / NUM_THREADS_PER_BLOCK) * NUM_SMS;
static const int RADIX = 8; // Number of bits per radix sort pass
static const int RADICES = 1 << RADIX; // Number of radices
static const int RADIXMASK = RADICES - 1; // Mask for each radix sort pass
#if SIXTEEN
static const int RADIXBITS = 16; // Number of bits to sort over
#else
static const int RADIXBITS = 32; // Number of bits to sort over
#endif
static const int RADIXTHREADS = 16; // Number of threads sharing each radix counter
static const int RADIXGROUPS = NUM_THREADS_PER_BLOCK / RADIXTHREADS; // Number of radix groups per CTA
static const int TOTALRADIXGROUPS = NUM_BLOCKS * RADIXGROUPS; // Number of radix groups for each radix
static const int SORTRADIXGROUPS = TOTALRADIXGROUPS * RADICES; // Total radix count
static const int GRFELEMENTS = (NUM_THREADS_PER_BLOCK / RADIXTHREADS) * RADICES;
static const int GRFSIZE = GRFELEMENTS * sizeof(uint);
// Prefix sum variables
static const int PREFIX_NUM_THREADS_PER_SM = NUM_THREADS_PER_SM;
static const int PREFIX_NUM_THREADS_PER_BLOCK = PREFIX_NUM_THREADS_PER_SM;
static const int PREFIX_NUM_BLOCKS = (PREFIX_NUM_THREADS_PER_SM / PREFIX_NUM_THREADS_PER_BLOCK) * NUM_SMS;
static const int PREFIX_BLOCKSIZE = SORTRADIXGROUPS / PREFIX_NUM_BLOCKS;
static const int PREFIX_GRFELEMENTS = PREFIX_BLOCKSIZE + 2 * PREFIX_NUM_THREADS_PER_BLOCK;
static const int PREFIX_GRFSIZE = PREFIX_GRFELEMENTS * sizeof(uint);
// Shuffle variables
static const int SHUFFLE_GRFOFFSET = RADIXGROUPS * RADICES;
static const int SHUFFLE_GRFELEMENTS = SHUFFLE_GRFOFFSET + PREFIX_NUM_BLOCKS;
static const int SHUFFLE_GRFSIZE = SHUFFLE_GRFELEMENTS * sizeof(uint);
#define SDATA( index) CUT_BANK_CHECKER(sdata, index)
// Prefix sum data
uint gRadixSum[TOTALRADIXGROUPS * RADICES];
__device__ uint dRadixSum[TOTALRADIXGROUPS * RADICES];
uint gRadixBlockSum[PREFIX_NUM_BLOCKS];
__device__ uint dRadixBlockSum[PREFIX_NUM_BLOCKS];
extern __shared__ uint sRadixSum[];
////////////////////////////////////////////////////////////////////////////////
//! Perform a radix sum on the list to be sorted. Each SM holds a set of
//! radix counters for each group of RADIXGROUPS thread in the GRF.
//!
//! @param pData input data
//! @param elements total number of elements
//! @param elements_rounded_to_3072 total number of elements rounded up to the
//! nearest multiple of 3072
//! @param shift the shift (0 to 24) that we are using to obtain the correct
//! byte
////////////////////////////////////////////////////////////////////////////////
__global__ void RadixSum(KeyValuePair *pData, uint elements, uint elements_rounded_to_3072, uint shift)
{
uint pos = threadIdx.x;
// Zero radix counts
while (pos < GRFELEMENTS)
{
sRadixSum[pos] = 0;
pos += NUM_THREADS_PER_BLOCK;
}
// Sum up data
// Source addresses computed so that each thread is reading from a block of
// consecutive addresses so there are no conflicts between threads
// They then loop over their combined region and the next batch works elsewhere.
// So threads 0 to 16 work on memory 0 to 320.
// First reading 0,1,2,3...15 then 16,17,18,19...31 and so on
// optimising parallel access to shared memory by a thread accessing 16*threadID
// The next radix group runs from 320 to 640 and the same applies in that region
uint tmod = threadIdx.x % RADIXTHREADS;
uint tpos = threadIdx.x / RADIXTHREADS;
// Take the rounded element list size so that all threads have a certain size dataset to work with
// and no zero size datasets confusing the issue
// By using a multiple of 3072 we ensure that all threads have elements
// to work with until the last phase, at which point we individually test
uint element_fraction = elements_rounded_to_3072 / TOTALRADIXGROUPS;
// Generate range
// Note that it is possible for both pos and end to be past the end of the element set
// which will be caught later.
pos = (blockIdx.x * RADIXGROUPS + tpos) * element_fraction;
uint end = pos + element_fraction;
pos += tmod;
//printf("pos: %d\n", pos);
__syncthreads();
while (pos < end )
{
uint key = 0;
// Read first data element if we are in the set of elements
//if( pos < elements )
//key = pData[pos].key;
KeyValuePair kvp;
// Read first data element, both items at once as the memory will want to coalesce like that anyway
if (pos < elements)
kvp = pData[pos];
else
kvp.key = 0;
key = kvp.key;
// Calculate position of radix counter to increment
// There are RADICES radices in each pass (256)
// and hence this many counters for bin grouping
// Multiply by RADIXGROUPS (4) to spread through memory
// and into 4 radix groups
uint p = ((key >> shift) & RADIXMASK) * RADIXGROUPS;
// Increment radix counters
// Each radix group has its own set of counters
// so we add the thread position [0-3], ie the group index.
// We slow down here and take at least 16 cycles to write to the summation boxes
// but other groups will only conflict with themselves and so can also be writing
// 16 cycles here at least avoids retries.
uint ppos = p + tpos;
// If we are past the last element we don't want to do anything
// We do have to check each time, however, to ensure that all
// threads sync on each sync here.
if (tmod == 0 && pos < elements)
sRadixSum[ppos]++;
SYNCIT;
if (tmod == 1 && pos < elements)
sRadixSum[ppos]++;
SYNCIT;
if (tmod == 2 && pos < elements)
sRadixSum[ppos]++;
SYNCIT;
if (tmod == 3 && pos < elements)
sRadixSum[ppos]++;
SYNCIT;
if (tmod == 4 && pos < elements)
sRadixSum[ppos]++;
SYNCIT;
if (tmod == 5 && pos < elements)
sRadixSum[ppos]++;
SYNCIT;
if (tmod == 6 && pos < elements)
sRadixSum[ppos]++;
SYNCIT;
if (tmod == 7 && pos < elements)
sRadixSum[ppos]++;
SYNCIT;
if (tmod == 8 && pos < elements)
sRadixSum[ppos]++;
SYNCIT;
if (tmod == 9 && pos < elements)
sRadixSum[ppos]++;
SYNCIT;
if (tmod == 10 && pos < elements)
sRadixSum[ppos]++;
SYNCIT;
if (tmod == 11 && pos < elements)
sRadixSum[ppos]++;
SYNCIT;
if (tmod == 12 && pos < elements)
sRadixSum[ppos]++;
SYNCIT;
if (tmod == 13 && pos < elements)
sRadixSum[ppos]++;
SYNCIT;
if (tmod == 14 && pos < elements)
sRadixSum[ppos]++;
SYNCIT;
if (tmod == 15 && pos < elements)
sRadixSum[ppos]++;
SYNCIT;
pos += RADIXTHREADS;
}
__syncthreads();
__syncthreads();
// Output radix sums into separate memory regions for each radix group
// So this memory then is layed out:
// 0...... 192..... 384 ................ 192*256
// ie all 256 bins for each radix group
// in there:
// 0.............192
// 0 4 8 12... - block idx * 4
// And in the block boxes we see the 4 radix groups for that block
// So 0-192 should contain bin 0 for each radix group, and so on
uint offset = blockIdx.x * RADIXGROUPS;
uint row = threadIdx.x / RADIXGROUPS;
uint column = threadIdx.x % RADIXGROUPS;
while (row < RADICES)
{
dRadixSum[offset + row * TOTALRADIXGROUPS + column] = sRadixSum[row * RADIXGROUPS + column];
row += NUM_THREADS_PER_BLOCK / RADIXGROUPS;
}
}
////////////////////////////////////////////////////////////////////////////////
//! Performs first part of parallel prefix sum - individual sums of each radix
//! count. By the end of this we have prefix sums on a block level in dRadixSum
//! and totals for blocks in dRadixBlockSum.
////////////////////////////////////////////////////////////////////////////////
__global__ void RadixPrefixSum()
{
// Read radix groups in offset by one in the GRF so a zero can be inserted at the beginning
// and the final sum of all radix counts summed here is tacked onto the end for reading by
// the next stage
// Each block in this case is the full number of threads per SM (and hence the total number
// of radix groups), 192. We should then have the total set of offsets for an entire radix
// group by the end of this stage
// Device mem addressing
uint brow = blockIdx.x * (RADICES / PREFIX_NUM_BLOCKS);
uint drow = threadIdx.x / TOTALRADIXGROUPS; // In default parameterisation this is always 0
uint dcolumn = threadIdx.x % TOTALRADIXGROUPS; // And similarly this is always the same as threadIdx.x
uint dpos = (brow + drow) * TOTALRADIXGROUPS + dcolumn;
uint end = ((blockIdx.x + 1) * (RADICES / PREFIX_NUM_BLOCKS)) * TOTALRADIXGROUPS;
// Shared mem addressing
uint srow = threadIdx.x / (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK);
uint scolumn = threadIdx.x % (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK);
uint spos = srow * (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1) + scolumn;
// Read (RADICES / PREFIX_NUM_BLOCKS) radix counts into the GRF alongside each other
while (dpos < end)
{
sRadixSum[spos] = dRadixSum[dpos];
spos += (PREFIX_NUM_THREADS_PER_BLOCK / (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK)) *
(PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1);
dpos += (TOTALRADIXGROUPS / PREFIX_NUM_THREADS_PER_BLOCK) * TOTALRADIXGROUPS;
}
__syncthreads();
// Perform preliminary sum on each thread's stretch of data
// Each thread having a block of 16, with spacers between 0...16 18...33 and so on
int pos = threadIdx.x * (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1);
end = pos + (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK);
uint sum = 0;
while (pos < end)
{
sum += sRadixSum[pos];
sRadixSum[pos] = sum;
pos++;
}
__syncthreads();
// Calculate internal offsets by performing a more traditional parallel
// prefix sum of the topmost member of each thread's work data. Right now,
// these are stored between the work data for each thread, allowing us to
// eliminate GRF conflicts as well as hold the offsets needed to complete the sum
// In other words we have:
// 0....15 16 17....32 33 34....
// Where this first stage updates the intermediate values (so 16=15, 33=32 etc)
int m = (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1);
pos = threadIdx.x * (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1) +
(PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK);
sRadixSum[pos] = sRadixSum[pos - 1];
__syncthreads();
// This stage then performs a parallel prefix sum (ie use powers of 2 to propagate in log n stages)
// to update 17, 34 etc with the totals to that point (so 34 becomes [34] + [17]) and so on.
while (m < PREFIX_NUM_THREADS_PER_BLOCK * (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1))
{
int p = pos - m;
uint t = ((p > 0) ? sRadixSum[p] : 0);
__syncthreads();
sRadixSum[pos] += t;
__syncthreads();
m *= 2;
}
__syncthreads();
// Add internal offsets to each thread's work data.
// So now we take 17 and add it to all values 18 to 33 so all offsets for that block
// are updated.
pos = threadIdx.x * (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1);
end = pos + (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK);
int p = pos - 1;
sum = ((p > 0) ? sRadixSum[p] : 0);
while (pos < end)
{
sRadixSum[pos] += sum;
pos++;
}
__syncthreads();
// Write summed data back out to global memory in the same way as we read it in
// We now have prefix sum values internal to groups
brow = blockIdx.x * (RADICES / PREFIX_NUM_BLOCKS);
drow = threadIdx.x / TOTALRADIXGROUPS;
dcolumn = threadIdx.x % TOTALRADIXGROUPS;
srow = threadIdx.x / (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK);
scolumn = threadIdx.x % (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK);
dpos = (brow + drow) * TOTALRADIXGROUPS + dcolumn + 1;
spos = srow * (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1) + scolumn;
end = ((blockIdx.x + 1) * RADICES / PREFIX_NUM_BLOCKS) * TOTALRADIXGROUPS;
while (dpos < end)
{
dRadixSum[dpos] = sRadixSum[spos];
dpos += (TOTALRADIXGROUPS / PREFIX_NUM_THREADS_PER_BLOCK) * TOTALRADIXGROUPS;
spos += (PREFIX_NUM_THREADS_PER_BLOCK / (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK)) *
(PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1);
}
// Write last element to summation
// Storing block sums in a separate array
if (threadIdx.x == 0) {
dRadixBlockSum[blockIdx.x] = sRadixSum[PREFIX_NUM_THREADS_PER_BLOCK * (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1) - 1];
dRadixSum[blockIdx.x * PREFIX_BLOCKSIZE] = 0;
}
}
////////////////////////////////////////////////////////////////////////////////
//! Initially perform prefix sum of block totals to obtain final set of offsets.
//! Then make use of radix sums to perform a shuffling of the data into the
//! correct bins.
//!
//! @param pSrc input data
//! @param pDst output data
//! @param elements total number of elements
//! @param shift the shift (0 to 24) that we are using to obtain the correct
//! byte
////////////////////////////////////////////////////////////////////////////////
__global__ void RadixAddOffsetsAndShuffle(KeyValuePair* pSrc, KeyValuePair* pDst, uint elements, uint elements_rounded_to_3072, int shift)
{
// Read offsets from previous blocks
if (threadIdx.x == 0)
sRadixSum[SHUFFLE_GRFOFFSET] = 0;
if (threadIdx.x < PREFIX_NUM_BLOCKS - 1)
sRadixSum[SHUFFLE_GRFOFFSET + threadIdx.x + 1] = dRadixBlockSum[threadIdx.x];
__syncthreads();
// Parallel prefix sum over block sums
int pos = threadIdx.x;
int n = 1;
while (n < PREFIX_NUM_BLOCKS)
{
int ppos = pos - n;
uint t0 = ((pos < PREFIX_NUM_BLOCKS) && (ppos >= 0)) ? sRadixSum[SHUFFLE_GRFOFFSET + ppos] : 0;
__syncthreads();
if (pos < PREFIX_NUM_BLOCKS)
sRadixSum[SHUFFLE_GRFOFFSET + pos] += t0;
__syncthreads();
n *= 2;
}
// Read radix count data and add appropriate block offset
// for each radix at the memory location for this thread
// (where the other threads in the block will be reading
// as well, hence the large stride).
// There is one counter box per radix group per radix
// per block (4*256*3)
// We use 64 threads to read the 4 radix groups set of radices
// for the block.
int row = threadIdx.x / RADIXGROUPS;
int column = threadIdx.x % RADIXGROUPS;
int spos = row * RADIXGROUPS + column;
int dpos = row * TOTALRADIXGROUPS + column + blockIdx.x * RADIXGROUPS;
while (spos < SHUFFLE_GRFOFFSET)
{
sRadixSum[spos] = dRadixSum[dpos] + sRadixSum[SHUFFLE_GRFOFFSET + dpos / (TOTALRADIXGROUPS * RADICES / PREFIX_NUM_BLOCKS)];
spos += NUM_THREADS_PER_BLOCK;
dpos += (NUM_THREADS_PER_BLOCK / RADIXGROUPS) * TOTALRADIXGROUPS;
}
__syncthreads();
//int pos;
// Shuffle data
// Each of the subbins for a block should be filled via the counters, properly interleaved
// Then, as we now iterate over each data value, we increment the subbins (each thread in the
// radix group in turn to avoid miss writes due to conflicts) and set locations correctly.
uint element_fraction = elements_rounded_to_3072 / TOTALRADIXGROUPS;
int tmod = threadIdx.x % RADIXTHREADS;
int tpos = threadIdx.x / RADIXTHREADS;
pos = (blockIdx.x * RADIXGROUPS + tpos) * element_fraction;
uint end = pos + element_fraction; //(blockIdx.x * RADIXGROUPS + tpos + 1) * element_fraction;
pos += tmod;
__syncthreads();
while (pos < end )
{
KeyValuePair kvp;
#if 1 // old load
// Read first data element, both items at once as the memory will want to coalesce like that anyway
if (pos < elements)
{
kvp = pSrc[pos];
}
else
kvp.key = 0;
#else // casting to float2 to get it to combine loads
int2 kvpf2;
// Read first data element, both items at once as the memory will want to coalesce like that anyway
if (pos < elements)
{
// kvp = pSrc[pos];
kvpf2 = ((int2*)pSrc)[pos];
// printf("kvp: %f %f kvpf2: %f %f\n", kvp.key, kvp.value, kvpf2.x, kvpf2.y);
}
else
//kvp.key = 0;
kvpf2.x = 0;
kvp.key = kvpf2.x;
kvp.value = kvpf2.y;
#endif
uint index;
// Calculate position of radix counter to increment
uint p = ((kvp.key >> shift) & RADIXMASK) * RADIXGROUPS;
// Move data, keeping counts updated.
// Increment radix counters, relying on hexadecathread
// warp to prevent this code from stepping all over itself.
uint ppos = p + tpos;
if (tmod == 0 && pos < elements)
{
index = sRadixSum[ppos]++;
pDst[index] = kvp;
}
SYNCIT;
if (tmod == 1 && pos < elements)
{
index = sRadixSum[ppos]++;
pDst[index] = kvp;
}
SYNCIT;
if (tmod == 2 && pos < elements)
{
index = sRadixSum[ppos]++;
pDst[index] = kvp;
}
SYNCIT;
if (tmod == 3 && pos < elements)
{
index = sRadixSum[ppos]++;
pDst[index] = kvp;
}
SYNCIT;
if (tmod == 4 && pos < elements)
{
index = sRadixSum[ppos]++;
pDst[index] = kvp;
}
SYNCIT;
if (tmod == 5 && pos < elements)
{
index = sRadixSum[ppos]++;
pDst[index] = kvp;
}
SYNCIT;
if (tmod == 6 && pos < elements)
{
index = sRadixSum[ppos]++;
pDst[index] = kvp;
}
SYNCIT;
if (tmod == 7 && pos < elements)
{
index = sRadixSum[ppos]++;
pDst[index] = kvp;
}
SYNCIT;
if (tmod == 8 && pos < elements)
{
index = sRadixSum[ppos]++;
pDst[index] = kvp;
}
SYNCIT;
if (tmod == 9 && pos < elements)
{
index = sRadixSum[ppos]++;
pDst[index] = kvp;
}
SYNCIT;
if (tmod == 10 && pos < elements)
{
index = sRadixSum[ppos]++;
pDst[index] = kvp;
}
SYNCIT;
if (tmod == 11 && pos < elements)
{
index = sRadixSum[ppos]++;
pDst[index] = kvp;
}
SYNCIT;
if (tmod == 12 && pos < elements)
{
index = sRadixSum[ppos]++;
pDst[index] = kvp;
}
SYNCIT;
if (tmod == 13 && pos < elements)
{
index = sRadixSum[ppos]++;
pDst[index] = kvp;
}
SYNCIT;
if (tmod == 14 && pos < elements)
{
index = sRadixSum[ppos]++;
pDst[index] = kvp;
}
SYNCIT;
if (tmod == 15 && pos < elements)
{
index = sRadixSum[ppos]++;
pDst[index] = kvp;
}
SYNCIT;
pos += RADIXTHREADS;
}
__syncthreads();
}
#endif // #ifndef _RADIXSORT_KERNEL_H_