Added initial Smoothed Particle Hydrodynamics (SPH) implementation, for CPU and CUDA.
This software is contributed under the ZLib license by Rama Hoetzlein, http://www.rchoetzlein.com. We plan to integrate the SPH code into the core Bullet library, including interaction with rigid bodies and soft bodies.
22  Extras/sph/fluids/fluid.cpp  Normal file
@@ -0,0 +1,22 @@
/*
  FLUIDS v.1 - SPH Fluid Simulator for CPU and GPU
  Copyright (C) 2009. Rama Hoetzlein, http://www.rchoetzlein.com

  ZLib license
  This software is provided 'as-is', without any express or implied
  warranty. In no event will the authors be held liable for any damages
  arising from the use of this software.

  Permission is granted to anyone to use this software for any purpose,
  including commercial applications, and to alter it and redistribute it
  freely, subject to the following restrictions:

  1. The origin of this software must not be misrepresented; you must not
     claim that you wrote the original software. If you use this software
     in a product, an acknowledgment in the product documentation would be
     appreciated but is not required.
  2. Altered source versions must be plainly marked as such, and must not be
     misrepresented as being the original software.
  3. This notice may not be removed or altered from any source distribution.
*/

#include "fluid.h"
44  Extras/sph/fluids/fluid.h  Normal file
@@ -0,0 +1,44 @@
/*
  FLUIDS v.1 - SPH Fluid Simulator for CPU and GPU
  Copyright (C) 2008. Rama Hoetzlein, http://www.rchoetzlein.com

  ZLib license
  This software is provided 'as-is', without any express or implied
  warranty. In no event will the authors be held liable for any damages
  arising from the use of this software.

  Permission is granted to anyone to use this software for any purpose,
  including commercial applications, and to alter it and redistribute it
  freely, subject to the following restrictions:

  1. The origin of this software must not be misrepresented; you must not
     claim that you wrote the original software. If you use this software
     in a product, an acknowledgment in the product documentation would be
     appreciated but is not required.
  2. Altered source versions must be plainly marked as such, and must not be
     misrepresented as being the original software.
  3. This notice may not be removed or altered from any source distribution.
*/

#ifndef DEF_FLUID
#define DEF_FLUID

#include "vector.h"
#include "common_defs.h"

struct Fluid {
public:
	Vector3DF		pos;			// Basic particle (must match Particle class)
	DWORD			clr;
	int				next;
	Vector3DF		vel;
	Vector3DF		vel_eval;
	unsigned short	age;

	float			pressure;		// Smoothed Particle Hydrodynamics
	float			density;
	Vector3DF		sph_force;
};

#endif /* DEF_FLUID */
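The GPU kernels in fluid_system_kern.cu address these fields through fixed byte offsets (OFFSET_CLR = 12, OFFSET_NEXT = 16, and so on), so this struct layout is load-bearing. A minimal compile-time check, as a sketch: it assumes a C++11 compiler, 4-byte DWORD/int/float, and a Vector3DF of three packed floats; the static_assert lines and the DEF_FLUID_LAYOUT_CHECK guard are illustrative additions, not part of the original file.

#ifdef DEF_FLUID_LAYOUT_CHECK			// hypothetical opt-in guard, not in the original
#include <cstddef>
// Each offset must match the OFFSET_* constants in fluid_system_kern.cu.
static_assert ( offsetof(Fluid, clr)       == 12, "OFFSET_CLR" );
static_assert ( offsetof(Fluid, next)      == 16, "OFFSET_NEXT" );
static_assert ( offsetof(Fluid, vel)       == 20, "OFFSET_VEL" );
static_assert ( offsetof(Fluid, vel_eval)  == 32, "OFFSET_VEVAL" );
static_assert ( offsetof(Fluid, pressure)  == 48, "OFFSET_PRESS" );	// age (2 bytes) + padding end at 48
static_assert ( offsetof(Fluid, density)   == 52, "OFFSET_DENS" );
static_assert ( offsetof(Fluid, sph_force) == 56, "OFFSET_FORCE" );
#endif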
869  Extras/sph/fluids/fluid_system.cpp  Normal file
@@ -0,0 +1,869 @@
/*
  FLUIDS v.1 - SPH Fluid Simulator for CPU and GPU
  Copyright (C) 2008. Rama Hoetzlein, http://www.rchoetzlein.com

  ZLib license
  This software is provided 'as-is', without any express or implied
  warranty. In no event will the authors be held liable for any damages
  arising from the use of this software.

  Permission is granted to anyone to use this software for any purpose,
  including commercial applications, and to alter it and redistribute it
  freely, subject to the following restrictions:

  1. The origin of this software must not be misrepresented; you must not
     claim that you wrote the original software. If you use this software
     in a product, an acknowledgment in the product documentation would be
     appreciated but is not required.
  2. Altered source versions must be plainly marked as such, and must not be
     misrepresented as being the original software.
  3. This notice may not be removed or altered from any source distribution.
*/

#include <conio.h>

#ifdef _MSC_VER
	#include <gl/glut.h>
#else
	#include <GL/glut.h>
#endif

#include "common_defs.h"
#include "mtime.h"
#include "fluid_system.h"

#ifdef BUILD_CUDA
	#include "fluid_system_host.cuh"
#endif

#define EPSILON			0.00001f		// for collision detection

FluidSystem::FluidSystem ()
{
}

void FluidSystem::Initialize ( int mode, int total )
{
	if ( mode != BFLUID ) {
		printf ( "ERROR: FluidSystem not initialized as BFLUID.\n" );
	}
	PointSet::Initialize ( mode, total );

	FreeBuffers ();
	AddBuffer ( BFLUID, sizeof ( Fluid ), total );
	AddAttribute ( 0, "pos", sizeof ( Vector3DF ), false );
	AddAttribute ( 0, "color", sizeof ( DWORD ), false );
	AddAttribute ( 0, "vel", sizeof ( Vector3DF ), false );
	AddAttribute ( 0, "ndx", sizeof ( unsigned short ), false );
	AddAttribute ( 0, "age", sizeof ( unsigned short ), false );

	AddAttribute ( 0, "pressure", sizeof ( double ), false );
	AddAttribute ( 0, "density", sizeof ( double ), false );
	AddAttribute ( 0, "sph_force", sizeof ( Vector3DF ), false );
	AddAttribute ( 0, "next", sizeof ( Fluid* ), false );
	AddAttribute ( 0, "tag", sizeof ( bool ), false );

	SPH_Setup ();
	Reset ( total );
}

void FluidSystem::Reset ( int nmax )
{
	ResetBuffer ( 0, nmax );

	m_DT = 0.003;						// use 0.001 for point gravity

	// Reset parameters
	m_Param [ MAX_FRAC ] = 1.0;
	m_Param [ POINT_GRAV ] = 0.0;
	m_Param [ PLANE_GRAV ] = 1.0;

	m_Param [ BOUND_ZMIN_SLOPE ] = 0.0;
	m_Param [ FORCE_XMAX_SIN ] = 0.0;
	m_Param [ FORCE_XMIN_SIN ] = 0.0;
	m_Toggle [ WRAP_X ] = false;
	m_Toggle [ WALL_BARRIER ] = false;
	m_Toggle [ LEVY_BARRIER ] = false;
	m_Toggle [ DRAIN_BARRIER ] = false;
	m_Param [ SPH_INTSTIFF ] = 1.00;
	m_Param [ SPH_VISC ] = 0.2;
	m_Param [ SPH_INTSTIFF ] = 0.50;	// note: overrides the 1.00 set above
	m_Param [ SPH_EXTSTIFF ] = 20000;
	m_Param [ SPH_SMOOTHRADIUS ] = 0.01;

	m_Vec [ POINT_GRAV_POS ].Set ( 0, 0, 50 );
	m_Vec [ PLANE_GRAV_DIR ].Set ( 0, 0, -9.8 );
	m_Vec [ EMIT_POS ].Set ( 0, 0, 0 );
	m_Vec [ EMIT_RATE ].Set ( 0, 0, 0 );
	m_Vec [ EMIT_ANG ].Set ( 0, 90, 1.0 );
	m_Vec [ EMIT_DANG ].Set ( 0, 0, 0 );
}

int FluidSystem::AddPoint ()
{
	xref ndx;
	Fluid* f = (Fluid*) AddElem ( 0, ndx );
	f->sph_force.Set ( 0, 0, 0 );
	f->vel.Set ( 0, 0, 0 );
	f->vel_eval.Set ( 0, 0, 0 );
	f->next = 0x0;
	f->pressure = 0;
	f->density = 0;
	return ndx;
}

int FluidSystem::AddPointReuse ()
{
	xref ndx;
	Fluid* f;
	if ( NumPoints() <= mBuf[0].max-2 )
		f = (Fluid*) AddElem ( 0, ndx );
	else
		f = (Fluid*) RandomElem ( 0, ndx );

	f->sph_force.Set ( 0, 0, 0 );
	f->vel.Set ( 0, 0, 0 );
	f->vel_eval.Set ( 0, 0, 0 );
	f->next = 0x0;
	f->pressure = 0;
	f->density = 0;
	return ndx;
}

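// Note: AddPointReuse() differs from AddPoint() only when the buffer is
// nearly full. Instead of growing past mBuf[0].max, it recycles a random
// existing particle (RandomElem), so emitters can run indefinitely on a
// fixed particle budget.
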
void FluidSystem::Run ()
{
	bool bTiming = true;

	mint::Time start, stop;

	float ss = m_Param [ SPH_PDIST ] / m_Param [ SPH_SIMSCALE ];	// particle rest spacing, in world units

	if ( m_Vec[EMIT_RATE].x > 0 && (++m_Frame) % (int) m_Vec[EMIT_RATE].x == 0 ) {
		//m_Frame = 0;
		Emit ( ss );
	}

	#ifdef NOGRID
		// Slow method - O(n^2)
		SPH_ComputePressureSlow ();
		SPH_ComputeForceSlow ();
	#else

	if ( m_Toggle[USE_CUDA] ) {

		#ifdef BUILD_CUDA
		// -- GPU --
		start.SetSystemTime ( ACC_NSEC );
		TransferToCUDA ( mBuf[0].data, (int*) &m_Grid[0], NumPoints() );
		if ( bTiming ) { stop.SetSystemTime ( ACC_NSEC ); stop = stop - start; printf ( "TO: %s\n", stop.GetReadableTime().c_str() ); }

		start.SetSystemTime ( ACC_NSEC );
		Grid_InsertParticlesCUDA ();
		if ( bTiming ) { stop.SetSystemTime ( ACC_NSEC ); stop = stop - start; printf ( "INSERT (CUDA): %s\n", stop.GetReadableTime().c_str() ); }

		start.SetSystemTime ( ACC_NSEC );
		SPH_ComputePressureCUDA ();
		if ( bTiming ) { stop.SetSystemTime ( ACC_NSEC ); stop = stop - start; printf ( "PRESS (CUDA): %s\n", stop.GetReadableTime().c_str() ); }

		start.SetSystemTime ( ACC_NSEC );
		SPH_ComputeForceCUDA ();
		if ( bTiming ) { stop.SetSystemTime ( ACC_NSEC ); stop = stop - start; printf ( "FORCE (CUDA): %s\n", stop.GetReadableTime().c_str() ); }

		// ** The CUDA integrator is incomplete; once it is done, the TransferTo/From steps can be removed.
		/*start.SetSystemTime ( ACC_NSEC );
		SPH_AdvanceCUDA ( m_DT, m_DT/m_Param[SPH_SIMSCALE] );
		if ( bTiming ) { stop.SetSystemTime ( ACC_NSEC ); stop = stop - start; printf ( "ADV (CUDA): %s\n", stop.GetReadableTime().c_str() ); }*/

		start.SetSystemTime ( ACC_NSEC );
		TransferFromCUDA ( mBuf[0].data, (int*) &m_Grid[0], NumPoints() );
		if ( bTiming ) { stop.SetSystemTime ( ACC_NSEC ); stop = stop - start; printf ( "FROM: %s\n", stop.GetReadableTime().c_str() ); }

		// .. so for now, do the advance step on the CPU
		Advance ();
		#endif

	} else {
		// -- CPU only --

		start.SetSystemTime ( ACC_NSEC );
		Grid_InsertParticles ();
		if ( bTiming ) { stop.SetSystemTime ( ACC_NSEC ); stop = stop - start; printf ( "INSERT: %s\n", stop.GetReadableTime().c_str() ); }

		start.SetSystemTime ( ACC_NSEC );
		SPH_ComputePressureGrid ();
		if ( bTiming ) { stop.SetSystemTime ( ACC_NSEC ); stop = stop - start; printf ( "PRESS: %s\n", stop.GetReadableTime().c_str() ); }

		start.SetSystemTime ( ACC_NSEC );
		SPH_ComputeForceGridNC ();
		if ( bTiming ) { stop.SetSystemTime ( ACC_NSEC ); stop = stop - start; printf ( "FORCE: %s\n", stop.GetReadableTime().c_str() ); }

		start.SetSystemTime ( ACC_NSEC );
		Advance ();
		if ( bTiming ) { stop.SetSystemTime ( ACC_NSEC ); stop = stop - start; printf ( "ADV: %s\n", stop.GetReadableTime().c_str() ); }
	}

	#endif
}

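// A minimal driver for the pipeline above (a sketch: Initialize() and
// SPH_CreateExample() are the calls defined in this file, but the frame loop
// and the particle count of 65536 are hypothetical):
//
//     FluidSystem psys;
//     psys.Initialize ( BFLUID, 65536 );		// must be initialized as BFLUID
//     psys.SPH_CreateExample ( 1, 65536 );		// scene 1: dam break
//     for (int frame = 0; frame < 1000; frame++)
//         psys.Run ();							// insert -> pressure -> force -> advance
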
void FluidSystem::SPH_DrawDomain ()
{
	Vector3DF min, max;
	min = m_Vec[SPH_VOLMIN];
	max = m_Vec[SPH_VOLMAX];
	min.z += 0.5;

	glColor3f ( 0.0, 0.0, 1.0 );
	glBegin ( GL_LINES );
	glVertex3f ( min.x, min.y, min.z );	glVertex3f ( max.x, min.y, min.z );
	glVertex3f ( min.x, max.y, min.z );	glVertex3f ( max.x, max.y, min.z );
	glVertex3f ( min.x, min.y, min.z );	glVertex3f ( min.x, max.y, min.z );
	glVertex3f ( max.x, min.y, min.z );	glVertex3f ( max.x, max.y, min.z );
	glEnd ();
}

void FluidSystem::Advance ()
{
	char *dat1, *dat1_end;
	Fluid* p;
	Vector3DF norm, z;
	Vector3DF dir, accel;
	Vector3DF vnext;
	Vector3DF min, max;
	double adj;
	float SL, SL2, ss, radius;
	float stiff, damp, speed, diff;
	SL = m_Param[SPH_LIMIT];
	SL2 = SL*SL;

	stiff = m_Param[SPH_EXTSTIFF];
	damp = m_Param[SPH_EXTDAMP];
	radius = m_Param[SPH_PRADIUS];
	min = m_Vec[SPH_VOLMIN];
	max = m_Vec[SPH_VOLMAX];
	ss = m_Param[SPH_SIMSCALE];

	dat1_end = mBuf[0].data + NumPoints()*mBuf[0].stride;
	for ( dat1 = mBuf[0].data; dat1 < dat1_end; dat1 += mBuf[0].stride ) {
		p = (Fluid*) dat1;

		// Compute acceleration
		accel = p->sph_force;
		accel *= m_Param[SPH_PMASS];

		// Velocity limiting
		speed = accel.x*accel.x + accel.y*accel.y + accel.z*accel.z;
		if ( speed > SL2 ) {
			accel *= SL / sqrt(speed);
		}

		// Boundary conditions

		// Z-axis walls
		diff = 2 * radius - ( p->pos.z - min.z - (p->pos.x - m_Vec[SPH_VOLMIN].x) * m_Param[BOUND_ZMIN_SLOPE] )*ss;
		if ( diff > EPSILON ) {
			norm.Set ( -m_Param[BOUND_ZMIN_SLOPE], 0, 1.0 - m_Param[BOUND_ZMIN_SLOPE] );
			adj = stiff * diff - damp * norm.Dot ( p->vel_eval );
			accel.x += adj * norm.x; accel.y += adj * norm.y; accel.z += adj * norm.z;
		}

		diff = 2 * radius - ( max.z - p->pos.z )*ss;
		if ( diff > EPSILON ) {
			norm.Set ( 0, 0, -1 );
			adj = stiff * diff - damp * norm.Dot ( p->vel_eval );
			accel.x += adj * norm.x; accel.y += adj * norm.y; accel.z += adj * norm.z;
		}

		// X-axis walls
		if ( !m_Toggle[WRAP_X] ) {
			diff = 2 * radius - ( p->pos.x - min.x + (sin(m_Time*10.0)-1+(p->pos.y*0.025)*0.25) * m_Param[FORCE_XMIN_SIN] )*ss;
			//diff = 2 * radius - ( p->pos.x - min.x + (sin(m_Time*10.0)-1) * m_Param[FORCE_XMIN_SIN] )*ss;
			if ( diff > EPSILON ) {
				norm.Set ( 1.0, 0, 0 );
				adj = (m_Param[ FORCE_XMIN_SIN ] + 1) * stiff * diff - damp * norm.Dot ( p->vel_eval );
				accel.x += adj * norm.x; accel.y += adj * norm.y; accel.z += adj * norm.z;
			}

			diff = 2 * radius - ( max.x - p->pos.x + (sin(m_Time*10.0)-1) * m_Param[FORCE_XMAX_SIN] )*ss;
			if ( diff > EPSILON ) {
				norm.Set ( -1, 0, 0 );
				adj = (m_Param[ FORCE_XMAX_SIN ]+1) * stiff * diff - damp * norm.Dot ( p->vel_eval );
				accel.x += adj * norm.x; accel.y += adj * norm.y; accel.z += adj * norm.z;
			}
		}

		// Y-axis walls
		diff = 2 * radius - ( p->pos.y - min.y )*ss;
		if ( diff > EPSILON ) {
			norm.Set ( 0, 1, 0 );
			adj = stiff * diff - damp * norm.Dot ( p->vel_eval );
			accel.x += adj * norm.x; accel.y += adj * norm.y; accel.z += adj * norm.z;
		}
		diff = 2 * radius - ( max.y - p->pos.y )*ss;
		if ( diff > EPSILON ) {
			norm.Set ( 0, -1, 0 );
			adj = stiff * diff - damp * norm.Dot ( p->vel_eval );
			accel.x += adj * norm.x; accel.y += adj * norm.y; accel.z += adj * norm.z;
		}

		// Wall barrier
		if ( m_Toggle[WALL_BARRIER] ) {
			diff = 2 * radius - ( p->pos.x - 0 )*ss;
			if ( diff < 2*radius && diff > EPSILON && fabs(p->pos.y) < 3 && p->pos.z < 10 ) {
				norm.Set ( 1.0, 0, 0 );
				adj = 2*stiff * diff - damp * norm.Dot ( p->vel_eval );
				accel.x += adj * norm.x; accel.y += adj * norm.y; accel.z += adj * norm.z;
			}
		}

		// Levy barrier
		if ( m_Toggle[LEVY_BARRIER] ) {
			diff = 2 * radius - ( p->pos.x - 0 )*ss;
			if ( diff < 2*radius && diff > EPSILON && fabs(p->pos.y) > 5 && p->pos.z < 10 ) {
				norm.Set ( 1.0, 0, 0 );
				adj = 2*stiff * diff - damp * norm.Dot ( p->vel_eval );
				accel.x += adj * norm.x; accel.y += adj * norm.y; accel.z += adj * norm.z;
			}
		}
		// Drain barrier
		if ( m_Toggle[DRAIN_BARRIER] ) {
			diff = 2 * radius - ( p->pos.z - min.z-15 )*ss;
			if ( diff < 2*radius && diff > EPSILON && (fabs(p->pos.x)>3 || fabs(p->pos.y)>3) ) {
				norm.Set ( 0, 0, 1 );
				adj = stiff * diff - damp * norm.Dot ( p->vel_eval );
				accel.x += adj * norm.x; accel.y += adj * norm.y; accel.z += adj * norm.z;
			}
		}

		// Plane gravity
		if ( m_Param[PLANE_GRAV] > 0 )
			accel += m_Vec[PLANE_GRAV_DIR];

		// Point gravity
		if ( m_Param[POINT_GRAV] > 0 ) {
			norm.x = ( p->pos.x - m_Vec[POINT_GRAV_POS].x );
			norm.y = ( p->pos.y - m_Vec[POINT_GRAV_POS].y );
			norm.z = ( p->pos.z - m_Vec[POINT_GRAV_POS].z );
			norm.Normalize ();
			norm *= m_Param[POINT_GRAV];
			accel -= norm;
		}

		// Leapfrog integration ----------------------------
		vnext = accel;
		vnext *= m_DT;
		vnext += p->vel;					// v(t+1/2) = v(t-1/2) + a(t) dt
		p->vel_eval = p->vel;
		p->vel_eval += vnext;
		p->vel_eval *= 0.5;					// v(t+1) = [v(t-1/2) + v(t+1/2)] * 0.5; used to compute forces later
		p->vel = vnext;
		vnext *= m_DT/ss;
		p->pos += vnext;					// p(t+1) = p(t) + v(t+1/2) dt

		if ( m_Param[CLR_MODE]==1.0 ) {
			adj = ( fabs(vnext.x)+fabs(vnext.y)+fabs(vnext.z) ) / 7000.0;
			adj = (adj > 1.0) ? 1.0 : adj;
			p->clr = COLORA( 0, adj, adj, 1 );
		}
		if ( m_Param[CLR_MODE]==2.0 ) {
			float v = 0.5 + ( p->pressure / 1500.0 );
			if ( v < 0.1 ) v = 0.1;
			if ( v > 1.0 ) v = 1.0;
			p->clr = COLORA ( v, 1-v, 0, 1 );
		}

		// Euler integration -------------------------------
		/* accel += m_Gravity;
		accel *= m_DT;
		p->vel += accel;					// v(t+1) = v(t) + a(t) dt
		p->vel_eval += accel;
		p->vel_eval *= m_DT/d;
		p->pos += p->vel_eval;
		p->vel_eval = p->vel; */

		if ( m_Toggle[WRAP_X] ) {
			diff = p->pos.x - (m_Vec[SPH_VOLMIN].x + 2);	// -- simulates an object in the center of the flow
			if ( diff <= 0 ) {
				p->pos.x = (m_Vec[SPH_VOLMAX].x - 2) + diff*2;
				p->pos.z = 10;
			}
		}
	}

	m_Time += m_DT;
}

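// Note on the wall handling in Advance(): boundaries are penalty forces, not
// position clamps. For a penetration depth diff > EPSILON, a spring-damper
// response adj = stiff*diff - damp*(norm . vel_eval) is accumulated along the
// wall normal, with SPH_EXTSTIFF and SPH_EXTDAMP as the spring and damping
// coefficients.
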
//------------------------------------------------------ SPH Setup
//
//  Range = +/- 10.0 * 0.006 (r) = 0.12 m (= 120 mm = 4.7 inch)
//  Container Volume (Vc) =	0.001728 m^3
//  Rest Density (D) =		1000.0 kg / m^3
//  Particle Mass (Pm) =	0.00020543 kg		(mass = vol * density)
//  Number of Particles (N) = 4000.0
//  Water Mass (M) =		0.821 kg (= 821 grams)
//  Water Volume (V) =		0.000821 m^3 (= 3.4 cups, .21 gals)
//  Smoothing Radius (R) =	0.02 m (= 20 mm = ~3/4 inch)
//  Particle Radius (Pr) =	0.00366 m (= 4 mm = ~1/8 inch)
//  Particle Volume (Pv) =	2.054e-7 m^3 (= .268 milliliters)
//  Rest Distance (Pd) =	0.0059 m
//
//  Given: D, Pm, N
//    Pv = Pm / D					0.00020543 kg / 1000 kg/m^3 = 2.054e-7 m^3
//    Pr = cuberoot( Pv * 3/(4pi) )	cuberoot( 2.054e-7 m^3 * 3/(4pi) ) = 0.00366 m		(from Pv = 4/3 pi Pr^3)
//    M  = Pm * N					0.00020543 kg * 4000.0 = 0.821 kg
//    V  = M / D					0.821 kg / 1000 kg/m^3 = 0.000821 m^3
//    V  = Pv * N					2.054e-7 m^3 * 4000 = 0.000821 m^3
//    Pd = cuberoot(Pm/D)			cuberoot(0.00020543/1000) = 0.0059 m
//
//  Ideal grid cell size (gs) = 2 * smoothing radius = 0.02*2 = 0.04
//  Ideal domain size = k*gs/d = k*0.02*2/0.005 = k*8 = {8, 16, 24, 32, 40, 48, ..}
//    (k = number of cells, gs = cell size, d = simulation scale)

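// The numbers in the table above can be reproduced directly (a standalone
// sketch, not part of the simulator; it uses the same pow(x, 1/3.0) cube root
// and 3.141592 constant as the code below):
#if 0
#include <cstdio>
#include <cmath>
int main ()
{
	double D = 1000.0, Pm = 0.00020543, N = 4000.0, pi = 3.141592;
	double Pv = Pm / D;								// 2.054e-7 m^3
	double Pr = pow ( Pv * 3.0 / (4.0*pi), 1/3.0 );	// 0.00366 m
	double Pd = pow ( Pm / D, 1/3.0 );				// 0.0059 m
	printf ( "Pv=%g Pr=%g Pd=%g M=%g kg V=%g m^3\n", Pv, Pr, Pd, Pm*N, Pv*N );
	return 0;
}
#endif
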
void FluidSystem::SPH_Setup ()
{
	m_Param [ SPH_SIMSCALE ] =		0.004;			// unit size
	m_Param [ SPH_VISC ] =			0.2;			// pascal-second (Pa.s) = 1 kg m^-1 s^-1
	m_Param [ SPH_RESTDENSITY ] =	600.0;			// kg / m^3
	m_Param [ SPH_PMASS ] =			0.00020543;		// kg
	m_Param [ SPH_PRADIUS ] =		0.004;			// m
	m_Param [ SPH_PDIST ] =			0.0059;			// m
	m_Param [ SPH_SMOOTHRADIUS ] =	0.01;			// m
	m_Param [ SPH_INTSTIFF ] =		1.00;
	m_Param [ SPH_EXTSTIFF ] =		10000.0;
	m_Param [ SPH_EXTDAMP ] =		256.0;
	m_Param [ SPH_LIMIT ] =			200.0;			// m / s

	m_Toggle [ SPH_GRID ] = false;
	m_Toggle [ SPH_DEBUG ] = false;

	SPH_ComputeKernels ();
}

void FluidSystem::SPH_ComputeKernels ()
{
	m_Param [ SPH_PDIST ] = pow ( m_Param[SPH_PMASS] / m_Param[SPH_RESTDENSITY], 1/3.0 );
	m_R2 = m_Param[SPH_SMOOTHRADIUS] * m_Param[SPH_SMOOTHRADIUS];
	m_Poly6Kern = 315.0f / (64.0f * 3.141592 * pow( m_Param[SPH_SMOOTHRADIUS], 9 ) );	// Wpoly6 kernel (constant factor) - Muller et al. 2003, p.4
	m_SpikyKern = -45.0f / (3.141592 * pow( m_Param[SPH_SMOOTHRADIUS], 6 ) );			// gradient of the spiky kernel (constant factor): -45 / (pi h^6)
	m_LapKern = 45.0f / (3.141592 * pow( m_Param[SPH_SMOOTHRADIUS], 6 ) );				// Laplacian of the viscosity kernel (constant factor): 45 / (pi h^6)
}

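// For reference, the constants above are the normalization factors of the
// Muller et al. 2003 smoothing kernels, valid for 0 <= r <= h where
// h = SPH_SMOOTHRADIUS:
//
//   Wpoly6(r,h)          =  315/(64 pi h^9) * (h^2 - r^2)^3	-> density sums
//   grad Wspiky(r,h)     =  -45/(pi h^6) * (h - r)^2			-> pressure force
//   lapl Wviscosity(r,h) =   45/(pi h^6) * (h - r)				-> viscosity force
//
// Only the 1/h^n constant is precomputed here; the (h^2 - r^2)^3 and (h - r)
// factors are evaluated per particle pair in the compute functions below.
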
void FluidSystem::SPH_CreateExample ( int n, int nmax )
{
	Vector3DF pos;
	Vector3DF min, max;

	Reset ( nmax );

	switch ( n ) {
	case 0:		// Wave pool

		//-- TEST CASE: 2x2x2 grid, 32 particles. NOTE: Set PRADIUS to 0.0004 to reduce wall influence
		//   grid 0:     3*3*2 = 18 particles
		//   grid 1,2:   3*1*2 =  6 particles
		//   grid 3:     1*1*2 =  2 particles
		//   grid 4,5,6:          0 particles
		/*m_Vec [ SPH_VOLMIN ].Set ( -2.5, -2.5, 0 );
		m_Vec [ SPH_VOLMAX ].Set ( 2.5, 2.5, 5.0 );
		m_Vec [ SPH_INITMIN ].Set ( -2.5, -2.5, 0 );
		m_Vec [ SPH_INITMAX ].Set ( 2.5, 2.5, 1.6 );*/

		m_Vec [ SPH_VOLMIN ].Set ( -30, -30, 0 );
		m_Vec [ SPH_VOLMAX ].Set ( 30, 30, 40 );

		//m_Vec [ SPH_INITMIN ].Set ( -5, -5, 10 );
		//m_Vec [ SPH_INITMAX ].Set ( 5, 5, 20 );

		m_Vec [ SPH_INITMIN ].Set ( -20, -26, 10 );
		m_Vec [ SPH_INITMAX ].Set ( 20, 26, 40 );

		m_Param [ FORCE_XMIN_SIN ] = 12.0;
		m_Param [ BOUND_ZMIN_SLOPE ] = 0.05;
		break;
	case 1:		// Dam break
		m_Vec [ SPH_VOLMIN ].Set ( -30, -14, 0 );
		m_Vec [ SPH_VOLMAX ].Set ( 30, 14, 60 );
		m_Vec [ SPH_INITMIN ].Set ( 0, -13, 0 );
		m_Vec [ SPH_INITMAX ].Set ( 29, 13, 30 );
		m_Vec [ PLANE_GRAV_DIR ].Set ( 0.0, 0, -9.8 );
		break;
	case 2:		// Dual-wave pool
		m_Vec [ SPH_VOLMIN ].Set ( -60, -5, 0 );
		m_Vec [ SPH_VOLMAX ].Set ( 60, 5, 50 );
		m_Vec [ SPH_INITMIN ].Set ( -46, -5, 0 );
		m_Vec [ SPH_INITMAX ].Set ( 46, 5, 15 );
		m_Param [ FORCE_XMIN_SIN ] = 8.0;
		m_Param [ FORCE_XMAX_SIN ] = 8.0;
		m_Vec [ PLANE_GRAV_DIR ].Set ( 0.0, 0, -9.8 );
		break;
	case 3:		// Swirl stream
		m_Vec [ SPH_VOLMIN ].Set ( -30, -30, 0 );
		m_Vec [ SPH_VOLMAX ].Set ( 30, 30, 50 );
		m_Vec [ SPH_INITMIN ].Set ( -30, -30, 0 );
		m_Vec [ SPH_INITMAX ].Set ( 30, 30, 40 );
		m_Vec [ EMIT_POS ].Set ( -20, -20, 22 );
		m_Vec [ EMIT_RATE ].Set ( 1, 4, 0 );
		m_Vec [ EMIT_ANG ].Set ( 0, 120, 1.5 );
		m_Vec [ EMIT_DANG ].Set ( 0, 0, 0 );
		m_Vec [ PLANE_GRAV_DIR ].Set ( 0.0, 0, -9.8 );
		break;
	case 4:		// Shockwave
		m_Vec [ SPH_VOLMIN ].Set ( -60, -15, 0 );
		m_Vec [ SPH_VOLMAX ].Set ( 60, 15, 50 );
		m_Vec [ SPH_INITMIN ].Set ( -59, -14, 0 );
		m_Vec [ SPH_INITMAX ].Set ( 59, 14, 30 );
		m_Vec [ PLANE_GRAV_DIR ].Set ( 0.0, 0, -9.8 );
		m_Toggle [ WALL_BARRIER ] = true;
		m_Toggle [ WRAP_X ] = true;
		break;
	case 5:		// Zero gravity
		m_Vec [ SPH_VOLMIN ].Set ( -40, -40, 0 );
		m_Vec [ SPH_VOLMAX ].Set ( 40, 40, 50 );
		m_Vec [ SPH_INITMIN ].Set ( -20, -20, 20 );
		m_Vec [ SPH_INITMAX ].Set ( 20, 20, 40 );
		m_Vec [ EMIT_POS ].Set ( -20, 0, 40 );
		m_Vec [ EMIT_RATE ].Set ( 2, 1, 0 );
		m_Vec [ EMIT_ANG ].Set ( 0, 120, 0.25 );
		m_Vec [ PLANE_GRAV_DIR ].Set ( 0, 0, 0 );
		m_Param [ SPH_INTSTIFF ] = 0.20;
		break;
	case 6:		// Point gravity
		m_Vec [ SPH_VOLMIN ].Set ( -40, -40, 0 );
		m_Vec [ SPH_VOLMAX ].Set ( 40, 40, 50 );
		m_Vec [ SPH_INITMIN ].Set ( -20, -20, 20 );
		m_Vec [ SPH_INITMAX ].Set ( 20, 20, 40 );
		m_Param [ SPH_INTSTIFF ] = 0.50;
		m_Vec [ EMIT_POS ].Set ( -20, 20, 25 );
		m_Vec [ EMIT_RATE ].Set ( 1, 4, 0 );
		m_Vec [ EMIT_ANG ].Set ( -20, 100, 2.0 );
		m_Vec [ POINT_GRAV_POS ].Set ( 0, 0, 25 );
		m_Vec [ PLANE_GRAV_DIR ].Set ( 0, 0, 0 );
		m_Param [ POINT_GRAV ] = 3.5;
		break;
	case 7:		// Levy break
		m_Vec [ SPH_VOLMIN ].Set ( -40, -40, 0 );
		m_Vec [ SPH_VOLMAX ].Set ( 40, 40, 50 );
		m_Vec [ SPH_INITMIN ].Set ( 10, -40, 0 );
		m_Vec [ SPH_INITMAX ].Set ( 40, 40, 50 );
		m_Vec [ EMIT_POS ].Set ( 34, 27, 16.6 );
		m_Vec [ EMIT_RATE ].Set ( 2, 9, 0 );
		m_Vec [ EMIT_ANG ].Set ( 118, 200, 1.0 );
		m_Toggle [ LEVY_BARRIER ] = true;
		m_Param [ BOUND_ZMIN_SLOPE ] = 0.1;
		m_Vec [ PLANE_GRAV_DIR ].Set ( 0.0, 0, -9.8 );
		break;
	case 8:		// Drain
		m_Vec [ SPH_VOLMIN ].Set ( -20, -20, 0 );
		m_Vec [ SPH_VOLMAX ].Set ( 20, 20, 50 );
		m_Vec [ SPH_INITMIN ].Set ( -15, -20, 20 );
		m_Vec [ SPH_INITMAX ].Set ( 20, 20, 50 );
		m_Vec [ EMIT_POS ].Set ( -16, -16, 30 );
		m_Vec [ EMIT_RATE ].Set ( 1, 4, 0 );
		m_Vec [ EMIT_ANG ].Set ( -20, 140, 1.8 );
		m_Toggle [ DRAIN_BARRIER ] = true;
		m_Vec [ PLANE_GRAV_DIR ].Set ( 0.0, 0, -9.8 );
		break;
	case 9:		// Tumbler
		m_Vec [ SPH_VOLMIN ].Set ( -30, -30, 0 );
		m_Vec [ SPH_VOLMAX ].Set ( 30, 30, 50 );
		m_Vec [ SPH_INITMIN ].Set ( 24, -29, 20 );
		m_Vec [ SPH_INITMAX ].Set ( 29, 29, 40 );
		m_Param [ SPH_VISC ] = 0.1;
		m_Param [ SPH_INTSTIFF ] = 0.50;
		m_Param [ SPH_EXTSTIFF ] = 8000;
		//m_Param [ SPH_SMOOTHRADIUS ] = 0.01;
		m_Param [ BOUND_ZMIN_SLOPE ] = 0.4;
		m_Param [ FORCE_XMIN_SIN ] = 12.00;
		m_Vec [ PLANE_GRAV_DIR ].Set ( 0.0, 0, -9.8 );
		break;
	case 10:	// Large sim
		m_Vec [ SPH_VOLMIN ].Set ( -35, -35, 0 );
		m_Vec [ SPH_VOLMAX ].Set ( 35, 35, 60 );
		m_Vec [ SPH_INITMIN ].Set ( -5, -35, 0 );
		m_Vec [ SPH_INITMAX ].Set ( 30, 0, 60 );
		m_Vec [ PLANE_GRAV_DIR ].Set ( 0.0, 0, -9.8 );
		break;
	}

	SPH_ComputeKernels ();

	m_Param [ SPH_SIMSIZE ] = m_Param [ SPH_SIMSCALE ] * (m_Vec[SPH_VOLMAX].z - m_Vec[SPH_VOLMIN].z);
	m_Param [ SPH_PDIST ] = pow ( m_Param[SPH_PMASS] / m_Param[SPH_RESTDENSITY], 1/3.0 );

	float ss = m_Param [ SPH_PDIST ]*0.87 / m_Param[ SPH_SIMSCALE ];
	printf ( "Spacing: %f\n", ss );
	AddVolume ( m_Vec[SPH_INITMIN], m_Vec[SPH_INITMAX], ss );		// Create the particles

	float cell_size = m_Param[SPH_SMOOTHRADIUS]*2.0;				// Grid cell size (2r)
	Grid_Setup ( m_Vec[SPH_VOLMIN], m_Vec[SPH_VOLMAX], m_Param[SPH_SIMSCALE], cell_size, 1.0 );	// Setup grid
	Grid_InsertParticles ();										// Insert particles

	Vector3DF vmin, vmax;
	vmin =  m_Vec[SPH_VOLMIN];
	vmin -= Vector3DF(2,2,2);
	vmax =  m_Vec[SPH_VOLMAX];
	vmax += Vector3DF(2,2,-2);

	#ifdef BUILD_CUDA
	FluidClearCUDA ();
	Sleep ( 500 );

	FluidSetupCUDA ( NumPoints(), sizeof(Fluid), *(float3*)&m_GridMin, *(float3*)&m_GridMax, *(float3*)&m_GridRes, *(float3*)&m_GridSize, (int) m_Vec[EMIT_RATE].x );

	Sleep ( 500 );

	FluidParamCUDA ( m_Param[SPH_SIMSCALE], m_Param[SPH_SMOOTHRADIUS], m_Param[SPH_PMASS], m_Param[SPH_RESTDENSITY], m_Param[SPH_INTSTIFF], m_Param[SPH_VISC] );
	#endif
}

// Compute Pressures - Very slow yet simple. O(n^2)
void FluidSystem::SPH_ComputePressureSlow ()
{
	char *dat1, *dat1_end;
	char *dat2, *dat2_end;
	Fluid *p, *q;
	int cnt = 0;
	double dx, dy, dz, sum, dsq, c;
	double d, d2, mR, mR2;
	d = m_Param[SPH_SIMSCALE];
	d2 = d*d;
	mR = m_Param[SPH_SMOOTHRADIUS];
	mR2 = mR*mR;

	dat1_end = mBuf[0].data + NumPoints()*mBuf[0].stride;
	for ( dat1 = mBuf[0].data; dat1 < dat1_end; dat1 += mBuf[0].stride ) {
		p = (Fluid*) dat1;

		sum = 0.0;
		cnt = 0;

		dat2_end = mBuf[0].data + NumPoints()*mBuf[0].stride;
		for ( dat2 = mBuf[0].data; dat2 < dat2_end; dat2 += mBuf[0].stride ) {
			q = (Fluid*) dat2;

			if ( p == q ) continue;
			dx = ( p->pos.x - q->pos.x )*d;		// dist in cm
			dy = ( p->pos.y - q->pos.y )*d;
			dz = ( p->pos.z - q->pos.z )*d;
			dsq = (dx*dx + dy*dy + dz*dz);
			if ( mR2 > dsq ) {
				c = m_R2 - dsq;
				sum += c * c * c;
				cnt++;
				//if ( p == m_CurrP ) q->tag = true;
			}
		}
		p->density = sum * m_Param[SPH_PMASS] * m_Poly6Kern;
		p->pressure = ( p->density - m_Param[SPH_RESTDENSITY] ) * m_Param[SPH_INTSTIFF];
		p->density = 1.0f / p->density;
	}
}

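// The loop above implements rho_i = Pm * sum_j Wpoly6(|r_ij|, h), followed by
// the stiff equation of state p_i = SPH_INTSTIFF * (rho_i - rho_rest).
// Density is then stored inverted (1/rho) so the force passes can multiply by
// it instead of dividing.
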
// Compute Pressures - Using spatial grid, and also create neighbor table
void FluidSystem::SPH_ComputePressureGrid ()
{
	char *dat1, *dat1_end;
	Fluid* p;
	Fluid* pcurr;
	int pndx;
	int i, cnt = 0;
	float dx, dy, dz, sum, dsq, c;
	float d, d2, mR, mR2;
	float radius = m_Param[SPH_SMOOTHRADIUS] / m_Param[SPH_SIMSCALE];
	d = m_Param[SPH_SIMSCALE];
	d2 = d*d;
	mR = m_Param[SPH_SMOOTHRADIUS];
	mR2 = mR*mR;

	dat1_end = mBuf[0].data + NumPoints()*mBuf[0].stride;
	i = 0;
	for ( dat1 = mBuf[0].data; dat1 < dat1_end; dat1 += mBuf[0].stride, i++ ) {
		p = (Fluid*) dat1;

		sum = 0.0;
		m_NC[i] = 0;

		Grid_FindCells ( p->pos, radius );
		for (int cell=0; cell < 8; cell++) {
			if ( m_GridCell[cell] != -1 ) {
				pndx = m_Grid [ m_GridCell[cell] ];
				while ( pndx != -1 ) {
					pcurr = (Fluid*) (mBuf[0].data + pndx*mBuf[0].stride);
					if ( pcurr == p ) { pndx = pcurr->next; continue; }
					dx = ( p->pos.x - pcurr->pos.x )*d;		// dist in cm
					dy = ( p->pos.y - pcurr->pos.y )*d;
					dz = ( p->pos.z - pcurr->pos.z )*d;
					dsq = (dx*dx + dy*dy + dz*dz);
					if ( mR2 > dsq ) {
						c = m_R2 - dsq;
						sum += c * c * c;
						if ( m_NC[i] < MAX_NEIGHBOR ) {
							m_Neighbor[i][ m_NC[i] ] = pndx;
							m_NDist[i][ m_NC[i] ] = sqrt(dsq);
							m_NC[i]++;
						}
					}
					pndx = pcurr->next;
				}
			}
			m_GridCell[cell] = -1;
		}
		p->density = sum * m_Param[SPH_PMASS] * m_Poly6Kern;
		p->pressure = ( p->density - m_Param[SPH_RESTDENSITY] ) * m_Param[SPH_INTSTIFF];
		p->density = 1.0f / p->density;
	}
}

// Compute Forces - Very slow, but simple. O(n^2)
void FluidSystem::SPH_ComputeForceSlow ()
{
	char *dat1, *dat1_end;
	char *dat2, *dat2_end;
	Fluid *p, *q;
	Vector3DF force, fcurr;
	register double pterm, vterm, dterm;
	double c, r, d, sum, dsq;
	double dx, dy, dz;
	double mR, mR2, visc;

	d = m_Param[SPH_SIMSCALE];
	mR = m_Param[SPH_SMOOTHRADIUS];
	mR2 = (mR*mR);
	visc = m_Param[SPH_VISC];
	vterm = m_LapKern * visc;

	dat1_end = mBuf[0].data + NumPoints()*mBuf[0].stride;
	for ( dat1 = mBuf[0].data; dat1 < dat1_end; dat1 += mBuf[0].stride ) {
		p = (Fluid*) dat1;

		sum = 0.0;
		force.Set ( 0, 0, 0 );

		dat2_end = mBuf[0].data + NumPoints()*mBuf[0].stride;
		for ( dat2 = mBuf[0].data; dat2 < dat2_end; dat2 += mBuf[0].stride ) {
			q = (Fluid*) dat2;

			if ( p == q ) continue;
			dx = ( p->pos.x - q->pos.x )*d;		// dist in cm
			dy = ( p->pos.y - q->pos.y )*d;
			dz = ( p->pos.z - q->pos.z )*d;
			dsq = (dx*dx + dy*dy + dz*dz);
			if ( mR2 > dsq ) {
				r = sqrt ( dsq );
				c = (mR - r);
				pterm = -0.5f * c * m_SpikyKern * ( p->pressure + q->pressure ) / r;
				dterm = c * p->density * q->density;
				force.x += ( pterm * dx + vterm * (q->vel_eval.x - p->vel_eval.x) ) * dterm;
				force.y += ( pterm * dy + vterm * (q->vel_eval.y - p->vel_eval.y) ) * dterm;
				force.z += ( pterm * dz + vterm * (q->vel_eval.z - p->vel_eval.z) ) * dterm;
			}
		}
		p->sph_force = force;
	}
}

// Compute Forces - Using spatial grid. Faster.
void FluidSystem::SPH_ComputeForceGrid ()
{
	char *dat1, *dat1_end;
	Fluid *p;
	Fluid *pcurr;
	int pndx;
	Vector3DF force, fcurr;
	register double pterm, vterm, dterm;
	double c, d, dsq, r;
	double dx, dy, dz;
	double mR, mR2, visc;
	float radius = m_Param[SPH_SMOOTHRADIUS] / m_Param[SPH_SIMSCALE];

	d = m_Param[SPH_SIMSCALE];
	mR = m_Param[SPH_SMOOTHRADIUS];
	mR2 = (mR*mR);
	visc = m_Param[SPH_VISC];

	dat1_end = mBuf[0].data + NumPoints()*mBuf[0].stride;
	for ( dat1 = mBuf[0].data; dat1 < dat1_end; dat1 += mBuf[0].stride ) {
		p = (Fluid*) dat1;

		force.Set ( 0, 0, 0 );

		Grid_FindCells ( p->pos, radius );
		for (int cell=0; cell < 8; cell++) {
			if ( m_GridCell[cell] != -1 ) {
				pndx = m_Grid [ m_GridCell[cell] ];
				while ( pndx != -1 ) {
					pcurr = (Fluid*) (mBuf[0].data + pndx*mBuf[0].stride);
					if ( pcurr == p ) { pndx = pcurr->next; continue; }

					dx = ( p->pos.x - pcurr->pos.x )*d;		// dist in cm
					dy = ( p->pos.y - pcurr->pos.y )*d;
					dz = ( p->pos.z - pcurr->pos.z )*d;
					dsq = (dx*dx + dy*dy + dz*dz);
					if ( mR2 > dsq ) {
						r = sqrt ( dsq );
						c = (mR - r);
						pterm = -0.5f * c * m_SpikyKern * ( p->pressure + pcurr->pressure ) / r;
						dterm = c * p->density * pcurr->density;
						vterm = m_LapKern * visc;
						force.x += ( pterm * dx + vterm * (pcurr->vel_eval.x - p->vel_eval.x) ) * dterm;
						force.y += ( pterm * dy + vterm * (pcurr->vel_eval.y - p->vel_eval.y) ) * dterm;
						force.z += ( pterm * dz + vterm * (pcurr->vel_eval.z - p->vel_eval.z) ) * dterm;
					}
					pndx = pcurr->next;
				}
			}
		}
		p->sph_force = force;
	}
}

// Compute Forces - Using spatial grid with saved neighbor table. Fastest.
void FluidSystem::SPH_ComputeForceGridNC ()
{
	char *dat1, *dat1_end;
	Fluid *p;
	Fluid *pcurr;
	Vector3DF force, fcurr;
	register float pterm, vterm, dterm;
	int i;
	float c, d;
	float dx, dy, dz;
	float mR, mR2, visc;

	d = m_Param[SPH_SIMSCALE];
	mR = m_Param[SPH_SMOOTHRADIUS];
	mR2 = (mR*mR);
	visc = m_Param[SPH_VISC];

	dat1_end = mBuf[0].data + NumPoints()*mBuf[0].stride;
	i = 0;

	for ( dat1 = mBuf[0].data; dat1 < dat1_end; dat1 += mBuf[0].stride, i++ ) {
		p = (Fluid*) dat1;

		force.Set ( 0, 0, 0 );
		for (int j=0; j < m_NC[i]; j++ ) {
			pcurr = (Fluid*) (mBuf[0].data + m_Neighbor[i][j]*mBuf[0].stride);
			dx = ( p->pos.x - pcurr->pos.x )*d;		// dist in cm
			dy = ( p->pos.y - pcurr->pos.y )*d;
			dz = ( p->pos.z - pcurr->pos.z )*d;
			c = ( mR - m_NDist[i][j] );
			pterm = -0.5f * c * m_SpikyKern * ( p->pressure + pcurr->pressure ) / m_NDist[i][j];
			dterm = c * p->density * pcurr->density;
			vterm = m_LapKern * visc;
			force.x += ( pterm * dx + vterm * (pcurr->vel_eval.x - p->vel_eval.x) ) * dterm;
			force.y += ( pterm * dy + vterm * (pcurr->vel_eval.y - p->vel_eval.y) ) * dterm;
			force.z += ( pterm * dz + vterm * (pcurr->vel_eval.z - p->vel_eval.z) ) * dterm;
		}
		p->sph_force = force;
	}
}

71  Extras/sph/fluids/fluid_system.cu  Normal file
@@ -0,0 +1,71 @@
/*
  FLUIDS v.1 - SPH Fluid Simulator for CPU and GPU
  Copyright (C) 2009. Rama Hoetzlein, http://www.rchoetzlein.com

  ZLib license
  This software is provided 'as-is', without any express or implied
  warranty. In no event will the authors be held liable for any damages
  arising from the use of this software.

  Permission is granted to anyone to use this software for any purpose,
  including commercial applications, and to alter it and redistribute it
  freely, subject to the following restrictions:

  1. The origin of this software must not be misrepresented; you must not
     claim that you wrote the original software. If you use this software
     in a product, an acknowledgment in the product documentation would be
     appreciated but is not required.
  2. Altered source versions must be plainly marked as such, and must not be
     misrepresented as being the original software.
  3. This notice may not be removed or altered from any source distribution.
*/

#include <cutil.h>
#include <cstdlib>
#include <cstdio>
#include <string.h>

#if defined(__APPLE__) || defined(MACOSX)
	#include <GLUT/glut.h>
#else
	#include <GL/glut.h>
#endif
#include <cuda_gl_interop.h>

#include "fluid_system_kern.cu"

extern "C"
{

// Compute number of blocks to create
int iDivUp (int a, int b) {
	return (a % b != 0) ? (a / b + 1) : (a / b);
}
void computeNumBlocks (int numPnts, int minThreads, int &numBlocks, int &numThreads)
{
	numThreads = min ( minThreads, numPnts );
	numBlocks = iDivUp ( numPnts, numThreads );
}

void Grid_InsertParticlesCUDA ( uchar* data, uint stride, uint numPoints )
{
	int numThreads, numBlocks;
	computeNumBlocks ( numPoints, 256, numBlocks, numThreads );

	// transfer point data to device
	char* pntData;
	int size = numPoints * stride;
	cudaMalloc ( (void**) &pntData, size );
	cudaMemcpy ( pntData, data, size, cudaMemcpyHostToDevice );

	// execute the kernel
	insertParticles<<< numBlocks, numThreads >>> ( pntData, stride );

	// transfer data back to host, then release the temporary buffer
	cudaMemcpy ( data, pntData, size, cudaMemcpyDeviceToHost );
	cudaFree ( pntData );

	// check if kernel invocation generated an error
	CUT_CHECK_ERROR ( "Kernel execution failed" );
	// CUDA_SAFE_CALL ( cudaGLUnmapBufferObject ( vboPos ) );	// 'vboPos' is not declared in this file; disabled (apparent SDK-sample residue)
}
}

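A worked case of the block-split arithmetic above, for illustration (the numbers are hypothetical): 1000 points at 256 threads per block gives numThreads = min(256, 1000) = 256 and numBlocks = iDivUp(1000, 256) = 4, i.e. 1024 thread slots with the last 24 idle past the final particle.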
106  Extras/sph/fluids/fluid_system.h  Normal file
@@ -0,0 +1,106 @@
/*
  FLUIDS v.1 - SPH Fluid Simulator for CPU and GPU
  Copyright (C) 2008. Rama Hoetzlein, http://www.rchoetzlein.com

  ZLib license
  This software is provided 'as-is', without any express or implied
  warranty. In no event will the authors be held liable for any damages
  arising from the use of this software.

  Permission is granted to anyone to use this software for any purpose,
  including commercial applications, and to alter it and redistribute it
  freely, subject to the following restrictions:

  1. The origin of this software must not be misrepresented; you must not
     claim that you wrote the original software. If you use this software
     in a product, an acknowledgment in the product documentation would be
     appreciated but is not required.
  2. Altered source versions must be plainly marked as such, and must not be
     misrepresented as being the original software.
  3. This notice may not be removed or altered from any source distribution.
*/

#ifndef DEF_FLUID_SYS
#define DEF_FLUID_SYS

#include <iostream>
#include <vector>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#include "point_set.h"
#include "fluid.h"

// Scalar params
#define SPH_SIMSIZE			4
#define SPH_SIMSCALE		5
#define SPH_VISC			6
#define SPH_RESTDENSITY		7
#define SPH_PMASS			8
#define SPH_PRADIUS			9
#define SPH_PDIST			10
#define SPH_SMOOTHRADIUS	11
#define SPH_INTSTIFF		12
#define SPH_EXTSTIFF		13
#define SPH_EXTDAMP			14
#define SPH_LIMIT			15
#define BOUND_ZMIN_SLOPE	16
#define FORCE_XMAX_SIN		17
#define FORCE_XMIN_SIN		18
#define MAX_FRAC			19
#define CLR_MODE			20

// Vector params
#define SPH_VOLMIN			7
#define SPH_VOLMAX			8
#define SPH_INITMIN			9
#define SPH_INITMAX			10

// Toggles
#define SPH_GRID			0
#define SPH_DEBUG			1
#define WRAP_X				2
#define WALL_BARRIER		3
#define LEVY_BARRIER		4
#define DRAIN_BARRIER		5
#define USE_CUDA			6

#define MAX_PARAM			21
#define BFLUID				2

class FluidSystem : public PointSet {
public:
	FluidSystem ();

	// Basic particle system
	virtual void Initialize ( int mode, int nmax );
	virtual void Reset ( int nmax );
	virtual void Run ();
	virtual void Advance ();
	virtual int AddPoint ();
	virtual int AddPointReuse ();
	Fluid* AddFluid ()		{ return (Fluid*) GetElem ( 0, AddPointReuse() ); }
	Fluid* GetFluid (int n)	{ return (Fluid*) GetElem ( 0, n ); }

	// Smoothed Particle Hydrodynamics
	void SPH_Setup ();
	void SPH_CreateExample ( int n, int nmax );
	void SPH_DrawDomain ();
	void SPH_ComputeKernels ();

	void SPH_ComputePressureSlow ();		// O(n^2)
	void SPH_ComputePressureGrid ();		// O(kn) - spatial grid

	void SPH_ComputeForceSlow ();			// O(n^2)
	void SPH_ComputeForceGrid ();			// O(kn) - spatial grid
	void SPH_ComputeForceGridNC ();			// O(cn) - neighbor table

private:
	// Smoothed Particle Hydrodynamics
	double m_R2, m_Poly6Kern, m_LapKern, m_SpikyKern;	// kernel constants
};

#endif
247  Extras/sph/fluids/fluid_system_host.cu  Normal file
@@ -0,0 +1,247 @@
/*
  FLUIDS v.1 - SPH Fluid Simulator for CPU and GPU
  Copyright (C) 2008. Rama Hoetzlein, http://www.rchoetzlein.com

  ZLib license
  This software is provided 'as-is', without any express or implied
  warranty. In no event will the authors be held liable for any damages
  arising from the use of this software.

  Permission is granted to anyone to use this software for any purpose,
  including commercial applications, and to alter it and redistribute it
  freely, subject to the following restrictions:

  1. The origin of this software must not be misrepresented; you must not
     claim that you wrote the original software. If you use this software
     in a product, an acknowledgment in the product documentation would be
     appreciated but is not required.
  2. Altered source versions must be plainly marked as such, and must not be
     misrepresented as being the original software.
  3. This notice may not be removed or altered from any source distribution.
*/

#include "C:\CUDA\common\inc\cutil.h"		// cutil32.lib
#include <string.h>

#if defined(__APPLE__) || defined(MACOSX)
	#include <GLUT/glut.h>
#else
	#include <GL/glut.h>
#endif
#include <cuda_gl_interop.h>

#include "radixsort.cu"
#include "fluid_system_kern.cu"				// build kernel

FluidParams		fcuda;

__device__ char*	bufPnts;				// point data (array of Fluid structs)
__device__ char*	bufPntSort;				// point data (array of Fluid structs)
__device__ uint*	bufHash[2];				// point grid hash
__device__ int*		bufGrid;

extern "C"
{

// Initialize CUDA
void cudaInit (int argc, char **argv)
{
	CUT_DEVICE_INIT (argc, argv);

	cudaDeviceProp p;
	cudaGetDeviceProperties ( &p, 0 );

	printf ( "-- CUDA --\n" );
	printf ( "Name:       %s\n", p.name );
	printf ( "Revision:   %d.%d\n", p.major, p.minor );
	printf ( "Global Mem: %d\n", p.totalGlobalMem );
	printf ( "Shared/Blk: %d\n", p.sharedMemPerBlock );
	printf ( "Regs/Blk:   %d\n", p.regsPerBlock );
	printf ( "Warp Size:  %d\n", p.warpSize );
	printf ( "Mem Pitch:  %d\n", p.memPitch );
	printf ( "Thrds/Blk:  %d\n", p.maxThreadsPerBlock );
	printf ( "Const Mem:  %d\n", p.totalConstMem );
	printf ( "Clock Rate: %d\n", p.clockRate );

	// placeholder allocations; FluidClearCUDA frees these before
	// FluidSetupCUDA reallocates them at their real sizes
	CUDA_SAFE_CALL ( cudaMalloc ( (void**) &bufPnts, 10 ) );
	CUDA_SAFE_CALL ( cudaMalloc ( (void**) &bufPntSort, 10 ) );
	CUDA_SAFE_CALL ( cudaMalloc ( (void**) &bufHash[0], 10 ) );
	CUDA_SAFE_CALL ( cudaMalloc ( (void**) &bufHash[1], 10 ) );
	CUDA_SAFE_CALL ( cudaMalloc ( (void**) &bufGrid, 10 ) );
};

// Compute number of blocks to create
int iDivUp (int a, int b) {
	return (a % b != 0) ? (a / b + 1) : (a / b);
}
void computeNumBlocks (int numPnts, int maxThreads, int &numBlocks, int &numThreads)
{
	numThreads = min ( maxThreads, numPnts );
	numBlocks = iDivUp ( numPnts, numThreads );
}

void FluidClearCUDA ()
{
	CUDA_SAFE_CALL ( cudaFree ( bufPnts ) );
	CUDA_SAFE_CALL ( cudaFree ( bufPntSort ) );
	CUDA_SAFE_CALL ( cudaFree ( bufHash[0] ) );
	CUDA_SAFE_CALL ( cudaFree ( bufHash[1] ) );
	CUDA_SAFE_CALL ( cudaFree ( bufGrid ) );
}

void FluidSetupCUDA ( int num, int stride, float3 min, float3 max, float3 res, float3 size, int chk )
{
	fcuda.min = make_float3 ( min.x, min.y, min.z );
	fcuda.max = make_float3 ( max.x, max.y, max.z );
	fcuda.res = make_float3 ( res.x, res.y, res.z );
	fcuda.size = make_float3 ( size.x, size.y, size.z );
	fcuda.pnts = num;
	fcuda.delta.x = res.x / size.x;
	fcuda.delta.y = res.y / size.y;
	fcuda.delta.z = res.z / size.z;
	fcuda.cells = res.x*res.y*res.z;
	fcuda.chk = chk;

	computeNumBlocks ( fcuda.pnts, 256, fcuda.numBlocks, fcuda.numThreads );		// particles
	computeNumBlocks ( fcuda.cells, 256, fcuda.gridBlocks, fcuda.gridThreads );	// grid cells

	fcuda.szPnts = (fcuda.numBlocks * fcuda.numThreads) * stride;
	fcuda.szHash = (fcuda.numBlocks * fcuda.numThreads) * sizeof(uint2);			// <cell, particle> pairs
	fcuda.szGrid = (fcuda.gridBlocks * fcuda.gridThreads) * sizeof(uint);
	fcuda.stride = stride;
	printf ( "pnts: %d, t:%dx%d=%d, bufPnts:%d, bufHash:%d\n", fcuda.pnts, fcuda.numBlocks, fcuda.numThreads, fcuda.numBlocks*fcuda.numThreads, fcuda.szPnts, fcuda.szHash );
	printf ( "grds: %d, t:%dx%d=%d, bufGrid:%d, Res: %dx%dx%d\n", fcuda.cells, fcuda.gridBlocks, fcuda.gridThreads, fcuda.gridBlocks*fcuda.gridThreads, fcuda.szGrid, (int) fcuda.res.x, (int) fcuda.res.y, (int) fcuda.res.z );

	CUDA_SAFE_CALL ( cudaMalloc ( (void**) &bufPnts, fcuda.szPnts ) );
	CUDA_SAFE_CALL ( cudaMalloc ( (void**) &bufPntSort, fcuda.szPnts ) );
	CUDA_SAFE_CALL ( cudaMalloc ( (void**) &bufHash[0], fcuda.szHash ) );
	CUDA_SAFE_CALL ( cudaMalloc ( (void**) &bufHash[1], fcuda.szHash ) );
	CUDA_SAFE_CALL ( cudaMalloc ( (void**) &bufGrid, fcuda.szGrid ) );

	printf ( "POINTERS\n" );
	printf ( "bufPnts:    %p\n", bufPnts );
	printf ( "bufPntSort: %p\n", bufPntSort );
	printf ( "bufHash0:   %p\n", bufHash[0] );
	printf ( "bufHash1:   %p\n", bufHash[1] );
	printf ( "bufGrid:    %p\n", bufGrid );

	CUDA_SAFE_CALL ( cudaMemcpyToSymbol ( simData, &fcuda, sizeof(FluidParams) ) );
	cudaThreadSynchronize ();
}

void FluidParamCUDA ( float sim_scale, float smooth_rad, float mass, float rest, float stiff, float visc )
{
	fcuda.sim_scale = sim_scale;
	fcuda.smooth_rad = smooth_rad;
	fcuda.r2 = smooth_rad * smooth_rad;
	fcuda.pmass = mass;
	fcuda.rest_dens = rest;
	fcuda.stiffness = stiff;
	fcuda.visc = visc;

	fcuda.pdist = pow ( fcuda.pmass / fcuda.rest_dens, 1/3.0f );
	fcuda.poly6kern = 315.0f / (64.0f * 3.141592 * pow( smooth_rad, 9.0f ) );
	fcuda.spikykern = -45.0f / (3.141592 * pow( smooth_rad, 6.0f ) );
	fcuda.lapkern = 45.0f / (3.141592 * pow( smooth_rad, 6.0f ) );

	CUDA_SAFE_CALL ( cudaMemcpyToSymbol ( simData, &fcuda, sizeof(FluidParams) ) );
	cudaThreadSynchronize ();
}

void TransferToCUDA ( char* data, int* grid, int numPoints )
{
	CUDA_SAFE_CALL ( cudaMemcpy ( bufPnts, data, numPoints * fcuda.stride, cudaMemcpyHostToDevice ) );
	cudaThreadSynchronize ();
}

void TransferFromCUDA ( char* data, int* grid, int numPoints )
{
	CUDA_SAFE_CALL ( cudaMemcpy ( data, bufPntSort, numPoints * fcuda.stride, cudaMemcpyDeviceToHost ) );
	cudaThreadSynchronize ();

	CUDA_SAFE_CALL ( cudaMemcpy ( grid, bufGrid, fcuda.cells * sizeof(uint), cudaMemcpyDeviceToHost ) );
}

void Grid_InsertParticlesCUDA ()
{
	CUDA_SAFE_CALL ( cudaMemset ( bufHash[0], 0, fcuda.szHash ) );

	hashParticles<<< fcuda.numBlocks, fcuda.numThreads >>> ( bufPnts, (uint2*) bufHash[0], fcuda.pnts );
	CUT_CHECK_ERROR ( "Kernel execution failed" );
	cudaThreadSynchronize ();

	//int buf[20000];
	/*printf ( "HASH: %d (%d)\n", fcuda.pnts, fcuda.numBlocks*fcuda.numThreads );
	CUDA_SAFE_CALL( cudaMemcpy ( buf, bufHash[0], fcuda.pnts * 2*sizeof(uint), cudaMemcpyDeviceToHost ) );
	//for (int n=0; n < fcuda.numBlocks*fcuda.numThreads; n++) {
	for (int n=0; n < 100; n++) {
		printf ( "%d: <%d,%d>\n", n, buf[n*2], buf[n*2+1] );
	}*/

	RadixSort ( (KeyValuePair*) bufHash[0], (KeyValuePair*) bufHash[1], fcuda.pnts, 32 );
	CUT_CHECK_ERROR ( "Kernel execution failed" );
	cudaThreadSynchronize ();

	/*printf ( "HASH: %d (%d)\n", fcuda.pnts, fcuda.numBlocks*fcuda.numThreads );
	CUDA_SAFE_CALL( cudaMemcpy ( buf, bufHash[0], fcuda.pnts * 2*sizeof(uint), cudaMemcpyDeviceToHost ) );
	//for (int n=0; n < fcuda.numBlocks*fcuda.numThreads; n++) {
	for (int n=0; n < 100; n++) {
		printf ( "%d: <%d,%d>\n", n, buf[n*2], buf[n*2+1] );
	}*/

	// insertParticles<<< fcuda.gridBlocks, fcuda.gridThreads >>> ( bufPnts, (uint2*) bufHash[0], bufGrid, fcuda.pnts, fcuda.cells );

	CUDA_SAFE_CALL ( cudaMemset ( bufGrid, NULL_HASH, fcuda.cells * sizeof(uint) ) );

	insertParticlesRadix<<< fcuda.numBlocks, fcuda.numThreads >>> ( bufPnts, (uint2*) bufHash[0], bufGrid, bufPntSort, fcuda.pnts, fcuda.cells );
	CUT_CHECK_ERROR ( "Kernel execution failed" );
	cudaThreadSynchronize ();

	/*printf ( "GRID: %d\n", fcuda.cells );
	CUDA_SAFE_CALL( cudaMemcpy ( buf, bufGrid, fcuda.cells * sizeof(uint), cudaMemcpyDeviceToHost ) );
	for (int n=0; n < 100; n++) {
		printf ( "%d: %d\n", n, buf[n] );
	}*/
}

void SPH_ComputePressureCUDA ()
{
	computePressure<<< fcuda.numBlocks, fcuda.numThreads >>> ( bufPntSort, bufGrid, (uint2*) bufHash[0], fcuda.pnts );
	CUT_CHECK_ERROR ( "Kernel execution failed" );
	cudaThreadSynchronize ();
}

void SPH_ComputeForceCUDA ()
{
	//-- standard force
	//computeForce<<< fcuda.numBlocks, fcuda.numThreads >>> ( bufPntSort, bufGrid, (uint2*) bufHash[0], fcuda.pnts );

	// Force using neighbor table
	computeForceNbr<<< fcuda.numBlocks, fcuda.numThreads >>> ( bufPntSort, fcuda.pnts );
	CUT_CHECK_ERROR ( "Kernel execution failed" );
	cudaThreadSynchronize ();
}

void SPH_AdvanceCUDA ( float dt, float ss )
{
	advanceParticles<<< fcuda.numBlocks, fcuda.numThreads >>> ( bufPntSort, fcuda.pnts, dt, ss );
	CUT_CHECK_ERROR ( "Kernel execution failed" );
	cudaThreadSynchronize ();
}

} // extern C


//----------- Per frame: Malloc/Free, Host<->Device
// transfer point data to device
/*char* pntData;
int size = (fcuda.numBlocks*fcuda.numThreads) * stride;
cudaMalloc( (void**) &pntData, size );
cudaMemcpy( pntData, data, numPoints*stride, cudaMemcpyHostToDevice );
insertParticles<<< fcuda.numBlocks, fcuda.numThreads >>> ( pntData, stride, numPoints );
cudaMemcpy( data, pntData, numPoints*stride, cudaMemcpyDeviceToHost );
cudaFree( pntData );*/
63  Extras/sph/fluids/fluid_system_host.cuh  Normal file
@@ -0,0 +1,63 @@
/*
  FLUIDS v.1 - SPH Fluid Simulator for CPU and GPU
  Copyright (C) 2008. Rama Hoetzlein, http://www.rchoetzlein.com

  ZLib license
  This software is provided 'as-is', without any express or implied
  warranty. In no event will the authors be held liable for any damages
  arising from the use of this software.

  Permission is granted to anyone to use this software for any purpose,
  including commercial applications, and to alter it and redistribute it
  freely, subject to the following restrictions:

  1. The origin of this software must not be misrepresented; you must not
     claim that you wrote the original software. If you use this software
     in a product, an acknowledgment in the product documentation would be
     appreciated but is not required.
  2. Altered source versions must be plainly marked as such, and must not be
     misrepresented as being the original software.
  3. This notice may not be removed or altered from any source distribution.
*/

#include <vector_types.h>
#include <driver_types.h>			// for cudaStream_t

typedef unsigned int uint;			// should be 4 bytes on CUDA
typedef unsigned char uchar;		// should be 1 byte on CUDA

struct FluidParams {
	int numThreads, numBlocks;
	int gridThreads, gridBlocks;
	int szPnts, szHash, szGrid;
	int stride, pnts, cells;
	int chk;
	float smooth_rad, r2, sim_scale, visc;
	float3 min, max, res, size, delta;

	float pdist, pmass, rest_dens, stiffness;
	float poly6kern, spikykern, lapkern;
};

extern "C"
{

void cudaInit (int argc, char **argv);

void FluidClearCUDA ();
void FluidSetupCUDA ( int num, int stride, float3 min, float3 max, float3 res, float3 size, int chk );
void FluidParamCUDA ( float sim_scale, float smooth_rad, float mass, float rest, float stiff, float visc );

void TransferToCUDA ( char* data, int* grid, int numPoints );
void TransferFromCUDA ( char* data, int* grid, int numPoints );

void Grid_InsertParticlesCUDA ();
void SPH_ComputePressureCUDA ();
void SPH_ComputeForceCUDA ();
void SPH_AdvanceCUDA ( float dt, float ss );

}
402  Extras/sph/fluids/fluid_system_kern.cu  Normal file
@@ -0,0 +1,402 @@
/*
FLUIDS v.1 - SPH Fluid Simulator for CPU and GPU
Copyright (C) 2008. Rama Hoetzlein, http://www.rchoetzlein.com

ZLib license
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.

Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:

1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/

#ifndef _PARTICLES_KERNEL_H_
#define _PARTICLES_KERNEL_H_

#include <stdio.h>
#include <math.h>

#include "fluid_system_host.cuh"

#define TOTAL_THREADS	65536
#define BLOCK_THREADS	256
#define MAX_NBR		80

__constant__ FluidParams	simData;	// simulation data (on device)

__device__ int		bufNeighbor[ TOTAL_THREADS*MAX_NBR ];
__device__ float	bufNdist[ TOTAL_THREADS*MAX_NBR ];

#define COLOR(r,g,b)	( (uint((r)*255.0f)<<24) | (uint((g)*255.0f)<<16) | (uint((b)*255.0f)<<8) )
#define COLORA(r,g,b,a)	( (uint((r)*255.0f)<<24) | (uint((g)*255.0f)<<16) | (uint((b)*255.0f)<<8) | uint((a)*255.0f) )

#define NULL_HASH	333333

#define OFFSET_CLR	12
#define OFFSET_NEXT	16
#define OFFSET_VEL	20
#define OFFSET_VEVAL	32
#define OFFSET_PRESS	48
#define OFFSET_DENS	52
#define OFFSET_FORCE	56
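The OFFSET_* constants are byte offsets into each interleaved particle record of simData.stride bytes; the record layout is fixed on the CPU side and assumed here. A sketch of how the kernels below address a field with raw pointer arithmetic (the helper name particleDensity is illustrative only):

	// Illustrative only: reading the density of particle i from the raw buffer,
	// exactly the pointer arithmetic the kernels below use inline.
	__device__ float particleDensity ( char* bufPnts, int i )
	{
		char* rec = bufPnts + __mul24( i, simData.stride );	// start of record i
		return *(float*) ( rec + OFFSET_DENS );			// density stored at byte 52
	}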
__global__ void hashParticles ( char* bufPnts, uint2* bufHash, int numPnt )
{
	uint ndx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;		// particle index
	if ( ndx >= numPnt ) {
		bufHash[ndx] = make_uint2( NULL_HASH, ndx );			// padding threads get the null hash
		return;
	}
	float3* pos = (float3*) (bufPnts + __mul24(ndx, simData.stride) );
	int gz = (pos->z - simData.min.z) * simData.delta.z;
	int gy = (pos->y - simData.min.y) * simData.delta.y;
	int gx = (pos->x - simData.min.x) * simData.delta.x;
	if ( gx < 0 || gx > simData.res.x-1 || gy < 0 || gy > simData.res.y-1 || gz < 0 || gz > simData.res.z-1 )
		bufHash[ndx] = make_uint2( NULL_HASH, ndx );			// out-of-bounds particles hash to NULL_HASH
	else
		bufHash[ndx] = make_uint2( __mul24(__mul24(gz, (int) simData.res.y)+gy, (int) simData.res.x) + gx, ndx );
}
__global__ void insertParticles ( char* bufPnts, uint2* bufHash, int* bufGrid, int numPnt, int numGrid )
{
	uint grid_ndx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;		// grid cell index

	// Brute-force binning: one thread per cell scans every particle and
	// threads a singly-linked list through the OFFSET_NEXT field.
	bufPnts += OFFSET_NEXT;
	bufGrid[grid_ndx] = -1;
	for (int n=0; n < numPnt; n++) {
		if ( bufHash[n].x == grid_ndx ) {
			*(int*) (bufPnts + __mul24(bufHash[n].y, simData.stride)) = bufGrid[grid_ndx];
			bufGrid[grid_ndx] = bufHash[n].y;
		}
	}
	__syncthreads ();
}
__global__ void insertParticlesRadix ( char* bufPnts, uint2* bufHash, int* bufGrid, char* bufPntSort, int numPnt, int numGrid )
{
	uint ndx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;		// particle index

	uint2 bufHashSort = bufHash[ndx];

	__shared__ uint sharedHash[257];
	sharedHash[threadIdx.x+1] = bufHashSort.x;
	if ( ndx > 0 && threadIdx.x == 0 ) {
		volatile uint2 prevData = bufHash[ndx-1];
		sharedHash[0] = prevData.x;
	}
	__syncthreads ();

	if ( (ndx == 0 || bufHashSort.x != sharedHash[threadIdx.x]) && bufHashSort.x != NULL_HASH ) {
		bufGrid [ bufHashSort.x ] = ndx;
	}
	if ( ndx < numPnt ) {
		char* src  = bufPnts + __mul24( bufHashSort.y, simData.stride );
		char* dest = bufPntSort + __mul24( ndx, simData.stride );

		*(float3*)(dest)		= *(float3*)(src);
		*(uint*)  (dest + OFFSET_CLR)	= *(uint*)  (src + OFFSET_CLR);
		*(float3*)(dest + OFFSET_VEL)	= *(float3*)(src + OFFSET_VEL);
		*(float3*)(dest + OFFSET_VEVAL)	= *(float3*)(src + OFFSET_VEVAL);

		*(float*) (dest + OFFSET_DENS)	= 0.0;
		*(float*) (dest + OFFSET_PRESS)	= 0.0;
		*(float3*)(dest + OFFSET_FORCE)	= make_float3(0,0,0);
		*(int*)   (dest + OFFSET_NEXT)	= bufHashSort.x;
	}

	__syncthreads ();
}

//__shared__ int ncount [ BLOCK_THREADS ];
__device__ float contributePressure ( int pndx, float3* p, int qndx, int grid_ndx, char* bufPnts, uint2* bufHash )
{
	float3* qpos;
	float3 dist;
	float dsq, c, sum;
	float d = simData.sim_scale;
	int nbr = __mul24(pndx, MAX_NBR);

	sum = 0.0;
	for ( ; qndx < simData.pnts; qndx++ ) {

		if ( bufHash[qndx].x != grid_ndx || qndx == NULL_HASH) break;

		if ( qndx != pndx ) {
			qpos = (float3*) ( bufPnts + __mul24(qndx, simData.stride ));

			dist.x = ( p->x - qpos->x )*d;		// dist in cm
			dist.y = ( p->y - qpos->y )*d;
			dist.z = ( p->z - qpos->z )*d;
			dsq = (dist.x*dist.x + dist.y*dist.y + dist.z*dist.z);
			if ( dsq < simData.r2 ) {
				c = simData.r2 - dsq;
				sum += c * c * c;
				if ( bufNeighbor[nbr] < MAX_NBR ) {
					bufNeighbor[ nbr+bufNeighbor[nbr] ] = qndx;
					bufNdist[ nbr+bufNeighbor[nbr] ] = sqrt(dsq);
					bufNeighbor[nbr]++;
				}
			}
		}
		//curr = *(int*) (bufPnts + __mul24(curr, simData.stride) + OFFSET_NEXT);
	}
	return sum;
}

/*if ( ncount[threadIdx.x] < MAX_NBR ) {
	bufNeighbor [ nbr + ncount[threadIdx.x] ] = curr;
	bufNdist [ nbr + ncount[threadIdx.x] ] = sqrt(dsq);
	ncount[threadIdx.x]++;
}*/
__global__ void computePressure ( char* bufPntSort, int* bufGrid, uint2* bufHash, int numPnt )
{
	uint ndx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;		// particle index
	if ( ndx >= numPnt ) return;

	float3* pos = (float3*) (bufPntSort + __mul24(ndx, simData.stride));

	// Find 2x2x2 grid cells
	// - Use registers only, no arrays (local memory is too slow)
	int3 cell;
	int gc0, gc1, gc2, gc3, gc4, gc5, gc6, gc7;
	float gs = simData.smooth_rad / simData.sim_scale;

	cell.x = max(0, (int)((-gs + pos->x - simData.min.x) * simData.delta.x));
	cell.y = max(0, (int)((-gs + pos->y - simData.min.y) * simData.delta.y));
	cell.z = max(0, (int)((-gs + pos->z - simData.min.z) * simData.delta.z));
	gc0 = __mul24(__mul24(cell.z, (int) simData.res.y) + cell.y, (int) simData.res.x) + cell.x;
	gc1 = gc0 + 1;
	gc2 = gc0 + (int) simData.res.x;
	gc3 = gc2 + 1;
	if ( cell.z+1 < simData.res.z ) {
		gc4 = gc0 + __mul24((int) simData.res.x, (int) simData.res.y);
		gc5 = gc4 + 1;
		gc6 = gc4 + (int) simData.res.x;
		gc7 = gc6 + 1;
	} else {
		gc4 = -1; gc5 = -1;		// top z slab: no layer above
		gc6 = -1; gc7 = -1;
	}
	if ( cell.x+1 >= simData.res.x ) {
		gc1 = -1; gc3 = -1;
		gc5 = -1; gc7 = -1;
	}
	if ( cell.y+1 >= simData.res.y ) {
		gc2 = -1; gc3 = -1;
		gc6 = -1; gc7 = -1;
	}
	// Sum density over the 2x2x2 block of cells
	float sum = 0.0;
	bufNeighbor[ __mul24(ndx, MAX_NBR) ] = 1;		// slot 0 holds the neighbor count (lists are base 1)
	if (gc0 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc0], gc0, bufPntSort, bufHash );
	if (gc1 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc1], gc1, bufPntSort, bufHash );
	if (gc2 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc2], gc2, bufPntSort, bufHash );
	if (gc3 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc3], gc3, bufPntSort, bufHash );
	if (gc4 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc4], gc4, bufPntSort, bufHash );
	if (gc5 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc5], gc5, bufPntSort, bufHash );
	if (gc6 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc6], gc6, bufPntSort, bufHash );
	if (gc7 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc7], gc7, bufPntSort, bufHash );

	// Compute Density & Pressure
	sum = sum * simData.pmass * simData.poly6kern;
	if ( sum == 0.0 ) sum = 1.0;
	*(float*) ((char*)pos + OFFSET_PRESS) = ( sum - simData.rest_dens ) * simData.stiffness;
	*(float*) ((char*)pos + OFFSET_DENS) = 1.0f / sum;
}
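In SPH terms this pass is the standard density estimate with a poly6-style kernel and a linear equation of state. With h = smooth_rad, m = pmass, k = stiffness, and the kernel normalization (presumably the usual 315/(64 pi h^9) factor) precomputed into poly6kern on the host:

	\rho_i = m \, k_{poly6} \sum_{r_{ij} < h} \left( h^2 - r_{ij}^2 \right)^3 ,
	\qquad
	p_i = k \left( \rho_i - \rho_0 \right)

Note the kernel stores the reciprocal 1/\rho_i at OFFSET_DENS, which is exactly the form the force pass below consumes.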
__device__ void contributeForce ( float3& force, int pndx, float3* p, int qndx, int grid_ndx, char* bufPnts, uint2* bufHash )
{
	float press  = *(float*) ((char*)p + OFFSET_PRESS);
	float dens   = *(float*) ((char*)p + OFFSET_DENS);
	float3 veval = *(float3*) ((char*)p + OFFSET_VEVAL );
	float3 qeval, dist;
	float c, ndistj, dsq;
	float pterm, dterm, vterm;
	float3* qpos;
	float d = simData.sim_scale;

	vterm = simData.lapkern * simData.visc;

	for ( ; qndx < simData.pnts; qndx++ ) {

		if ( bufHash[qndx].x != grid_ndx || qndx == NULL_HASH) break;

		if ( qndx != pndx ) {
			qpos = (float3*) ( bufPnts + __mul24(qndx, simData.stride ));

			dist.x = ( p->x - qpos->x )*d;		// dist in cm
			dist.y = ( p->y - qpos->y )*d;
			dist.z = ( p->z - qpos->z )*d;
			dsq = (dist.x*dist.x + dist.y*dist.y + dist.z*dist.z);
			if ( dsq < simData.r2 ) {
				ndistj = sqrt(dsq);
				c = ( simData.smooth_rad - ndistj );
				// pressure term (spiky gradient) and viscosity term (Laplacian)
				pterm = -0.5f * c * simData.spikykern * ( press + *(float*)((char*)qpos+OFFSET_PRESS) ) / ndistj;
				dterm = c * dens * *(float*)((char*)qpos+OFFSET_DENS);
				qeval = *(float3*)((char*)qpos+OFFSET_VEVAL);
				force.x += ( pterm * dist.x + vterm * ( qeval.x - veval.x )) * dterm;
				force.y += ( pterm * dist.y + vterm * ( qeval.y - veval.y )) * dterm;
				force.z += ( pterm * dist.z + vterm * ( qeval.z - veval.z )) * dterm;
			}
		}
	}
}
__global__ void computeForce ( char* bufPntSort, int* bufGrid, uint2* bufHash, int numPnt )
{
	uint ndx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;		// particle index
	if ( ndx >= numPnt ) return;

	float3* pos = (float3*) (bufPntSort + __mul24(ndx, simData.stride));

	// Find 2x2x2 grid cells
	// - Use registers only, no arrays (local memory is too slow)
	int3 cell;
	int gc0, gc1, gc2, gc3, gc4, gc5, gc6, gc7;
	float gs = simData.smooth_rad / simData.sim_scale;

	cell.x = max(0, (int)((-gs + pos->x - simData.min.x) * simData.delta.x));
	cell.y = max(0, (int)((-gs + pos->y - simData.min.y) * simData.delta.y));
	cell.z = max(0, (int)((-gs + pos->z - simData.min.z) * simData.delta.z));
	gc0 = __mul24(__mul24(cell.z, (int) simData.res.y) + cell.y, (int) simData.res.x) + cell.x;
	gc1 = gc0 + 1;
	gc2 = gc0 + (int) simData.res.x;
	gc3 = gc2 + 1;
	if ( cell.z+1 < simData.res.z ) {
		gc4 = gc0 + __mul24((int) simData.res.x, (int) simData.res.y);
		gc5 = gc4 + 1;
		gc6 = gc4 + (int) simData.res.x;
		gc7 = gc6 + 1;
	} else {
		gc4 = -1; gc5 = -1;		// top z slab: no layer above
		gc6 = -1; gc7 = -1;
	}
	if ( cell.x+1 >= simData.res.x ) {
		gc1 = -1; gc3 = -1;
		gc5 = -1; gc7 = -1;
	}
	if ( cell.y+1 >= simData.res.y ) {
		gc2 = -1; gc3 = -1;
		gc6 = -1; gc7 = -1;
	}
	// Sum forces over the 2x2x2 block of cells
	float3 force = make_float3(0,0,0);
	if (gc0 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc0], gc0, bufPntSort, bufHash );
	if (gc1 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc1], gc1, bufPntSort, bufHash );
	if (gc2 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc2], gc2, bufPntSort, bufHash );
	if (gc3 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc3], gc3, bufPntSort, bufHash );
	if (gc4 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc4], gc4, bufPntSort, bufHash );
	if (gc5 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc5], gc5, bufPntSort, bufHash );
	if (gc6 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc6], gc6, bufPntSort, bufHash );
	if (gc7 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc7], gc7, bufPntSort, bufHash );

	// Update Force
	*(float3*) ((char*)pos + OFFSET_FORCE ) = force;
}
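Written out, the accumulation in contributeForce is the usual pressure-gradient (spiky kernel) plus viscosity (Laplacian kernel) force, with the mass and kernel normalizations folded into the host-side constants spikykern and lapkern, and the stored reciprocal densities supplying the 1/(\rho_i \rho_j) factor:

	\mathbf{f}_i = \sum_{r_{ij} < h} \left[ -\tfrac{1}{2}\, k_{spiky} \frac{h - r_{ij}}{r_{ij}}\, ( p_i + p_j )\, \mathbf{r}_{ij} + \mu\, k_{lap}\, ( \mathbf{v}_j - \mathbf{v}_i ) \right] \frac{h - r_{ij}}{\rho_i\, \rho_j}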
__global__ void computeForceNbr ( char* bufPntSort, int numPnt )
{
	uint ndx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;		// particle index

	if ( ndx < numPnt ) {

		float3* pos = (float3*) (bufPntSort + __mul24(ndx, simData.stride));

		float3* qpos;
		float press  = *(float*) ((char*)pos + OFFSET_PRESS);
		float dens   = *(float*) ((char*)pos + OFFSET_DENS);
		float3 veval = *(float3*) ((char*)pos + OFFSET_VEVAL );
		float3 qeval, dist, force;
		float d = simData.sim_scale;
		float c, ndistj;
		float pterm, dterm, vterm;
		vterm = simData.lapkern * simData.visc;
		int nbr = __mul24(ndx, MAX_NBR);

		int ncnt = bufNeighbor[ nbr ];

		force = make_float3(0,0,0);
		for (int j=1; j < ncnt; j++) {			// base 1, n[0] = count
			ndistj = bufNdist[ nbr+j ];
			qpos = (float3*) (bufPntSort + __mul24( bufNeighbor[ nbr+j ], simData.stride) );
			c = ( simData.smooth_rad - ndistj );
			dist.x = ( pos->x - qpos->x )*d;	// dist in cm
			dist.y = ( pos->y - qpos->y )*d;
			dist.z = ( pos->z - qpos->z )*d;
			pterm = -0.5f * c * simData.spikykern * ( press + *(float*)((char*)qpos+OFFSET_PRESS) ) / ndistj;
			dterm = c * dens * *(float*)((char*)qpos+OFFSET_DENS);
			qeval = *(float3*)((char*)qpos+OFFSET_VEVAL);
			force.x += ( pterm * dist.x + vterm * ( qeval.x - veval.x )) * dterm;
			force.y += ( pterm * dist.y + vterm * ( qeval.y - veval.y )) * dterm;
			force.z += ( pterm * dist.z + vterm * ( qeval.z - veval.z )) * dterm;
		}
		*(float3*) ((char*)pos + OFFSET_FORCE ) = force;
	}
}
__global__ void advanceParticles ( char* bufPntSort, int numPnt, float dt, float ss )
{
	uint ndx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;		// particle index

	if ( ndx < numPnt ) {

		// Get particle vars
		float3* pos = (float3*) (bufPntSort + __mul24(ndx, simData.stride));
		float3* vel = (float3*) ((char*)pos + OFFSET_VEL );
		float3* vel_eval = (float3*) ((char*)pos + OFFSET_VEVAL );
		float3 accel = *(float3*) ((char*)pos + OFFSET_FORCE );
		float3 vcurr, vnext;

		// Leapfrog integration
		accel.x *= 0.00020543;		// NOTE - To do: SPH_PMASS should be passed in, not hard-coded
		accel.y *= 0.00020543;
		accel.z *= 0.00020543;
		accel.z -= 9.8;			// gravity

		vcurr = *vel;
		vnext.x = accel.x*dt + vcurr.x;
		vnext.y = accel.y*dt + vcurr.y;
		vnext.z = accel.z*dt + vcurr.z;		// v(t+1/2) = v(t-1/2) + a(t) dt

		// v(t) ~ [v(t-1/2) + v(t+1/2)] * 0.5, stored for use in the force passes
		accel.x = (vcurr.x + vnext.x) * 0.5;
		accel.y = (vcurr.y + vnext.y) * 0.5;
		accel.z = (vcurr.z + vnext.z) * 0.5;

		*vel_eval = accel;
		*vel = vnext;

		dt /= simData.sim_scale;
		vnext.x = pos->x + vnext.x*dt;
		vnext.y = pos->y + vnext.y*dt;
		vnext.z = pos->z + vnext.z*dt;
		*pos = vnext;				// p(t+1) = p(t) + v(t+1/2) dt
	}

	__syncthreads ();
}
#endif
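The integrator above is the standard leapfrog scheme: half-step velocities live at OFFSET_VEL and the centered estimate consumed by the force kernels lives at OFFSET_VEVAL.

	v_{t+1/2} = v_{t-1/2} + a_t\, \Delta t, \qquad
	x_{t+1} = x_t + v_{t+1/2}\, \Delta t, \qquad
	v_t \approx \tfrac{1}{2}\left( v_{t-1/2} + v_{t+1/2} \right)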
45
Extras/sph/fluids/fluid_system_kern.cuh
Normal file
@@ -0,0 +1,45 @@
/*
FLUIDS v.1 - SPH Fluid Simulator for CPU and GPU
Copyright (C) 2009. Rama Hoetzlein, http://www.rchoetzlein.com

ZLib license
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.

Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:

1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/

#ifndef _PARTICLES_KERNEL_H_
#define _PARTICLES_KERNEL_H_

#include <stdio.h>
#include <math.h>
#include "cutil_math.h"
#include "math_constants.h"

// Insert particles in grid
// (calcGridPos, addParticleToCell, gridCounters and gridCells are expected
// to be supplied by the including translation unit; they are not defined here)

__global__ void insertParticles ( char* pntData, uint pntStride )
{
	int index = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
	float4 p = *(float4*) (pntData + index*pntStride);

	// get address in grid
	int3 gridPos = calcGridPos(p);

	addParticleToCell(gridPos, index, gridCounters, gridCells);
}

#endif
79
Extras/sph/fluids/radixsort.cu
Normal file
@@ -0,0 +1,79 @@
/*
 * Copyright 1993-2006 NVIDIA Corporation. All rights reserved.
 *
 * NOTICE TO USER:
 *
 * This source code is subject to NVIDIA ownership rights under U.S. and
 * international Copyright laws.
 *
 * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
 * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
 * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
 * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
 * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
 * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
 * OR PERFORMANCE OF THIS SOURCE CODE.
 *
 * U.S. Government End Users. This source code is a "commercial item" as
 * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
 * "commercial computer software" and "commercial computer software
 * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
 * and is provided to the U.S. Government only as a commercial end item.
 * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
 * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
 * source code with only those rights set forth herein.
 */

/* Radixsort project with key/value and arbitrary dataset size support
 * which demonstrates the use of CUDA in a multi phase sorting
 * computation.
 * Host code.
 */

#include "radixsort.cuh"
#include "radixsort_kernel.cu"

extern "C"
{

////////////////////////////////////////////////////////////////////////////////
//! Perform a radix sort
//! Sorting performed in place on passed arrays.
//!
//! @param pData0       input and output array - data will be sorted
//! @param pData1       additional array to allow ping pong computation
//! @param elements     number of elements to sort
////////////////////////////////////////////////////////////////////////////////
void RadixSort(KeyValuePair *pData0, KeyValuePair *pData1, uint elements, uint bits)
{
	// Round element count up to a multiple of the total thread count for efficiency
	uint elements_rounded_to_3072;
	int modval = elements % 3072;
	if( modval == 0 )
		elements_rounded_to_3072 = elements;
	else
		elements_rounded_to_3072 = elements + (3072 - (modval));

	// Iterate over the bytes of the key, sorting by one RADIX-bit digit per pass
	for (uint shift = 0; shift < bits; shift += RADIX)
	{
		// Perform one round of radix sorting

		// Generate per radix group sums radix counts across a radix group
		RadixSum<<<NUM_BLOCKS, NUM_THREADS_PER_BLOCK, GRFSIZE>>>(pData0, elements, elements_rounded_to_3072, shift);
		// Prefix sum in radix groups, and then between groups throughout a block
		RadixPrefixSum<<<PREFIX_NUM_BLOCKS, PREFIX_NUM_THREADS_PER_BLOCK, PREFIX_GRFSIZE>>>();
		// Sum the block offsets and then shuffle data into bins
		RadixAddOffsetsAndShuffle<<<NUM_BLOCKS, NUM_THREADS_PER_BLOCK, SHUFFLE_GRFSIZE>>>(pData0, pData1, elements, elements_rounded_to_3072, shift);

		// Exchange data pointers
		KeyValuePair* pTemp = pData0;
		pData0 = pData1;
		pData1 = pTemp;
	}
}

}
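A minimal host-side sketch of how this sorter is driven (the buffer names and fill step are illustrative; only RadixSort itself comes from this file). With bits = 32 and RADIX = 8 there are four ping-pong passes, so the sorted result lands back in the first array:

	KeyValuePair *d_data0, *d_data1;
	uint n = 65536;
	cudaMalloc( (void**)&d_data0, n * sizeof(KeyValuePair) );
	cudaMalloc( (void**)&d_data1, n * sizeof(KeyValuePair) );
	// ... fill d_data0 with (cell hash, particle index) pairs on the device ...
	RadixSort( d_data0, d_data1, n, 32 );	// after 4 passes, d_data0 holds the sorted pairs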
63
Extras/sph/fluids/radixsort.cuh
Normal file
@@ -0,0 +1,63 @@
/*
 * Copyright 1993-2006 NVIDIA Corporation. All rights reserved.
 *
 * NOTICE TO USER:
 *
 * This source code is subject to NVIDIA ownership rights under U.S. and
 * international Copyright laws.
 *
 * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
 * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
 * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
 * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
 * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
 * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
 * OR PERFORMANCE OF THIS SOURCE CODE.
 *
 * U.S. Government End Users. This source code is a "commercial item" as
 * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
 * "commercial computer software" and "commercial computer software
 * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
 * and is provided to the U.S. Government only as a commercial end item.
 * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
 * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
 * source code with only those rights set forth herein.
 */

/* Radixsort project which demonstrates the use of CUDA in a multi phase
 * sorting computation.
 * Type definitions.
 */

#ifndef _RADIXSORT_H_
#define _RADIXSORT_H_

#include <host_defines.h>

#define SYNCIT __syncthreads()

// Set to 1 to use 16 bit keys/values
#define SIXTEEN 0

typedef unsigned int uint;
typedef unsigned short ushort;

#if SIXTEEN
typedef struct __align__(4) {
	ushort key;
	ushort value;
#else
typedef struct __align__(8) {
	uint key;
	uint value;
#endif
} KeyValuePair;

extern "C" {
void RadixSort(KeyValuePair *pData0, KeyValuePair *pData1, uint elements, uint bits);
}

#endif // #ifndef _RADIXSORT_H_
577
Extras/sph/fluids/radixsort_kernel.cu
Normal file
@@ -0,0 +1,577 @@
/*
 * Copyright 1993-2006 NVIDIA Corporation. All rights reserved.
 *
 * NOTICE TO USER:
 *
 * This source code is subject to NVIDIA ownership rights under U.S. and
 * international Copyright laws.
 *
 * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
 * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
 * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
 * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
 * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
 * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
 * OR PERFORMANCE OF THIS SOURCE CODE.
 *
 * U.S. Government End Users. This source code is a "commercial item" as
 * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
 * "commercial computer software" and "commercial computer software
 * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
 * and is provided to the U.S. Government only as a commercial end item.
 * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
 * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
 * source code with only those rights set forth herein.
 */

/* Radixsort project with key/value and arbitrary dataset size support
 * which demonstrates the use of CUDA in a multi phase sorting
 * computation.
 * Device code.
 */

#ifndef _RADIXSORT_KERNEL_H_
#define _RADIXSORT_KERNEL_H_

#include <stdio.h>
#include "radixsort.cuh"

#define SYNCIT __syncthreads()
static const int NUM_SMS = 16;
static const int NUM_THREADS_PER_SM = 192;
static const int NUM_THREADS_PER_BLOCK = 64;
//static const int NUM_THREADS = NUM_THREADS_PER_SM * NUM_SMS;
static const int NUM_BLOCKS = (NUM_THREADS_PER_SM / NUM_THREADS_PER_BLOCK) * NUM_SMS;
static const int RADIX = 8;						// Number of bits per radix sort pass
static const int RADICES = 1 << RADIX;					// Number of radices
static const int RADIXMASK = RADICES - 1;				// Mask for each radix sort pass
#if SIXTEEN
static const int RADIXBITS = 16;					// Number of bits to sort over
#else
static const int RADIXBITS = 32;					// Number of bits to sort over
#endif
static const int RADIXTHREADS = 16;					// Number of threads sharing each radix counter
static const int RADIXGROUPS = NUM_THREADS_PER_BLOCK / RADIXTHREADS;	// Number of radix groups per CTA
static const int TOTALRADIXGROUPS = NUM_BLOCKS * RADIXGROUPS;		// Number of radix groups for each radix
static const int SORTRADIXGROUPS = TOTALRADIXGROUPS * RADICES;		// Total radix count
static const int GRFELEMENTS = (NUM_THREADS_PER_BLOCK / RADIXTHREADS) * RADICES;
static const int GRFSIZE = GRFELEMENTS * sizeof(uint);

// Prefix sum variables
static const int PREFIX_NUM_THREADS_PER_SM = NUM_THREADS_PER_SM;
static const int PREFIX_NUM_THREADS_PER_BLOCK = PREFIX_NUM_THREADS_PER_SM;
static const int PREFIX_NUM_BLOCKS = (PREFIX_NUM_THREADS_PER_SM / PREFIX_NUM_THREADS_PER_BLOCK) * NUM_SMS;
static const int PREFIX_BLOCKSIZE = SORTRADIXGROUPS / PREFIX_NUM_BLOCKS;
static const int PREFIX_GRFELEMENTS = PREFIX_BLOCKSIZE + 2 * PREFIX_NUM_THREADS_PER_BLOCK;
static const int PREFIX_GRFSIZE = PREFIX_GRFELEMENTS * sizeof(uint);

// Shuffle variables
static const int SHUFFLE_GRFOFFSET = RADIXGROUPS * RADICES;
static const int SHUFFLE_GRFELEMENTS = SHUFFLE_GRFOFFSET + PREFIX_NUM_BLOCKS;
static const int SHUFFLE_GRFSIZE = SHUFFLE_GRFELEMENTS * sizeof(uint);

#define SDATA( index) CUT_BANK_CHECKER(sdata, index)

// Prefix sum data
uint gRadixSum[TOTALRADIXGROUPS * RADICES];
__device__ uint dRadixSum[TOTALRADIXGROUPS * RADICES];
uint gRadixBlockSum[PREFIX_NUM_BLOCKS];
__device__ uint dRadixBlockSum[PREFIX_NUM_BLOCKS];

extern __shared__ uint sRadixSum[];
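These constants are tuned for a 16-SM (G80-class) part, and they explain the magic 3072 in the host code:

	NUM_BLOCKS       = (192 / 64) * 16 = 48
	total threads    = 48 * 64         = 3072
	TOTALRADIXGROUPS = 48 * (64 / 16)  = 192

Rounding the element count up to a multiple of 3072 therefore hands every one of the 192 radix groups (of RADIXTHREADS = 16 threads each) an equal, non-empty slice of the input.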
////////////////////////////////////////////////////////////////////////////////
//! Perform a radix sum on the list to be sorted. Each SM holds a set of
//! radix counters for each group of RADIXGROUPS threads in the GRF.
//!
//! @param pData       input data
//! @param elements    total number of elements
//! @param elements_rounded_to_3072  total number of elements rounded up to the
//!                    nearest multiple of 3072
//! @param shift       the shift (0 to 24) that we are using to obtain the correct
//!                    byte
////////////////////////////////////////////////////////////////////////////////
__global__ void RadixSum(KeyValuePair *pData, uint elements, uint elements_rounded_to_3072, uint shift)
{
	uint pos = threadIdx.x;

	// Zero radix counts
	while (pos < GRFELEMENTS)
	{
		sRadixSum[pos] = 0;
		pos += NUM_THREADS_PER_BLOCK;
	}

	// Sum up data.
	// Source addresses are computed so that each thread is reading from a block of
	// consecutive addresses, so there are no conflicts between threads.
	// They then loop over their combined region and the next batch works elsewhere.
	// So threads 0 to 16 work on memory 0 to 320,
	// first reading 0,1,2,3...15 then 16,17,18,19...31 and so on,
	// optimising parallel access to shared memory by a thread accessing 16*threadID.
	// The next radix group runs from 320 to 640 and the same applies in that region.
	uint tmod = threadIdx.x % RADIXTHREADS;
	uint tpos = threadIdx.x / RADIXTHREADS;

	// Take the rounded element list size so that all threads have a certain size dataset
	// to work with, and no zero size datasets confusing the issue.
	// By using a multiple of 3072 we ensure that all threads have elements
	// to work with until the last phase, at which point we individually test.
	uint element_fraction = elements_rounded_to_3072 / TOTALRADIXGROUPS;

	// Generate range.
	// Note that it is possible for both pos and end to be past the end of the element set,
	// which will be caught later.
	pos = (blockIdx.x * RADIXGROUPS + tpos) * element_fraction;
	uint end = pos + element_fraction;
	pos += tmod;
	__syncthreads();

	while (pos < end )
	{
		uint key = 0;

		KeyValuePair kvp;
		// Read first data element, both items at once as the memory will want to coalesce like that anyway
		if (pos < elements)
			kvp = pData[pos];
		else
			kvp.key = 0;
		key = kvp.key;

		// Calculate position of radix counter to increment.
		// There are RADICES radices in each pass (256),
		// and hence this many counters for bin grouping.
		// Multiply by RADIXGROUPS (4) to spread through memory
		// and into 4 radix groups.
		uint p = ((key >> shift) & RADIXMASK) * RADIXGROUPS;

		// Increment radix counters.
		// Each radix group has its own set of counters,
		// so we add the thread position [0-3], ie the group index.
		// We slow down here and take at least 16 cycles to write to the summation boxes,
		// but other groups will only conflict with themselves and so can also be writing;
		// 16 cycles here at least avoids retries.
		uint ppos = p + tpos;

		// If we are past the last element we don't want to do anything.
		// We do have to check each time, however, to ensure that all
		// threads arrive at each sync here.
		// (Equivalent to the original fully unrolled tmod == 0..15 chain:
		// the RADIXTHREADS threads of a radix group take turns incrementing
		// their shared counter, one per SYNCIT, so no increment is lost.)
		for (uint t = 0; t < RADIXTHREADS; t++)
		{
			if (tmod == t && pos < elements)
				sRadixSum[ppos]++;
			SYNCIT;
		}

		pos += RADIXTHREADS;
	}

	__syncthreads();

	// Output radix sums into separate memory regions for each radix group.
	// So this memory then is laid out:
	// 0...... 192..... 384 ................ 192*256
	// ie all 256 bins for each radix group.
	// In there:
	// 0.............192
	// 0 4 8 12... - block idx * 4
	// And in the block boxes we see the 4 radix groups for that block.
	// So 0-192 should contain bin 0 for each radix group, and so on.
	uint offset = blockIdx.x * RADIXGROUPS;
	uint row = threadIdx.x / RADIXGROUPS;
	uint column = threadIdx.x % RADIXGROUPS;
	while (row < RADICES)
	{
		dRadixSum[offset + row * TOTALRADIXGROUPS + column] = sRadixSum[row * RADIXGROUPS + column];
		row += NUM_THREADS_PER_BLOCK / RADIXGROUPS;
	}
}
////////////////////////////////////////////////////////////////////////////////
//! Performs first part of parallel prefix sum - individual sums of each radix
//! count. By the end of this we have prefix sums on a block level in dRadixSum
//! and totals for blocks in dRadixBlockSum.
////////////////////////////////////////////////////////////////////////////////
__global__ void RadixPrefixSum()
{
	// Read radix groups in, offset by one in the GRF so a zero can be inserted at the
	// beginning, and the final sum of all radix counts summed here is tacked onto the
	// end for reading by the next stage.
	// Each block in this case is the full number of threads per SM (and hence the total
	// number of radix groups), 192. We should then have the total set of offsets for an
	// entire radix group by the end of this stage.

	// Device mem addressing
	uint brow = blockIdx.x * (RADICES / PREFIX_NUM_BLOCKS);
	uint drow = threadIdx.x / TOTALRADIXGROUPS;	// In default parameterisation this is always 0
	uint dcolumn = threadIdx.x % TOTALRADIXGROUPS;	// And similarly this is always the same as threadIdx.x
	uint dpos = (brow + drow) * TOTALRADIXGROUPS + dcolumn;
	uint end = ((blockIdx.x + 1) * (RADICES / PREFIX_NUM_BLOCKS)) * TOTALRADIXGROUPS;
	// Shared mem addressing
	uint srow = threadIdx.x / (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK);
	uint scolumn = threadIdx.x % (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK);
	uint spos = srow * (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1) + scolumn;

	// Read (RADICES / PREFIX_NUM_BLOCKS) radix counts into the GRF alongside each other
	while (dpos < end)
	{
		sRadixSum[spos] = dRadixSum[dpos];
		spos += (PREFIX_NUM_THREADS_PER_BLOCK / (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK)) *
			(PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1);
		dpos += (TOTALRADIXGROUPS / PREFIX_NUM_THREADS_PER_BLOCK) * TOTALRADIXGROUPS;
	}
	__syncthreads();

	// Perform preliminary sum on each thread's stretch of data,
	// each thread having a block of 16, with spacers between 0...16 18...33 and so on
	int pos = threadIdx.x * (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1);
	end = pos + (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK);
	uint sum = 0;
	while (pos < end)
	{
		sum += sRadixSum[pos];
		sRadixSum[pos] = sum;
		pos++;
	}
	__syncthreads();

	// Calculate internal offsets by performing a more traditional parallel
	// prefix sum of the topmost member of each thread's work data. Right now,
	// these are stored between the work data for each thread, allowing us to
	// eliminate GRF conflicts as well as hold the offsets needed to complete the sum.
	// In other words we have:
	// 0....15 16 17....32 33 34....
	// where this first stage updates the intermediate values (so 16=15, 33=32 etc)
	int m = (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1);
	pos = threadIdx.x * (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1) +
		(PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK);
	sRadixSum[pos] = sRadixSum[pos - 1];
	__syncthreads();
	// This stage then performs a parallel prefix sum (ie use powers of 2 to propagate in log n stages)
	// to update 17, 34 etc with the totals to that point (so 34 becomes [34] + [17]) and so on.
	while (m < PREFIX_NUM_THREADS_PER_BLOCK * (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1))
	{
		int p = pos - m;
		uint t = ((p > 0) ? sRadixSum[p] : 0);
		__syncthreads();
		sRadixSum[pos] += t;
		__syncthreads();
		m *= 2;
	}
	__syncthreads();

	// Add internal offsets to each thread's work data.
	// So now we take 17 and add it to all values 18 to 33 so all offsets for that block
	// are updated.
	pos = threadIdx.x * (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1);
	end = pos + (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK);
	int p = pos - 1;
	sum = ((p > 0) ? sRadixSum[p] : 0);
	while (pos < end)
	{
		sRadixSum[pos] += sum;
		pos++;
	}
	__syncthreads();

	// Write summed data back out to global memory in the same way as we read it in.
	// We now have prefix sum values internal to groups.
	brow = blockIdx.x * (RADICES / PREFIX_NUM_BLOCKS);
	drow = threadIdx.x / TOTALRADIXGROUPS;
	dcolumn = threadIdx.x % TOTALRADIXGROUPS;
	srow = threadIdx.x / (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK);
	scolumn = threadIdx.x % (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK);
	dpos = (brow + drow) * TOTALRADIXGROUPS + dcolumn + 1;
	spos = srow * (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1) + scolumn;
	end = ((blockIdx.x + 1) * RADICES / PREFIX_NUM_BLOCKS) * TOTALRADIXGROUPS;
	while (dpos < end)
	{
		dRadixSum[dpos] = sRadixSum[spos];
		dpos += (TOTALRADIXGROUPS / PREFIX_NUM_THREADS_PER_BLOCK) * TOTALRADIXGROUPS;
		spos += (PREFIX_NUM_THREADS_PER_BLOCK / (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK)) *
			(PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1);
	}

	// Write last element to summation,
	// storing block sums in a separate array
	if (threadIdx.x == 0) {
		dRadixBlockSum[blockIdx.x] = sRadixSum[PREFIX_NUM_THREADS_PER_BLOCK * (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1) - 1];
		dRadixSum[blockIdx.x * PREFIX_BLOCKSIZE] = 0;
	}
}
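Taken together with the zero insertion and the +1 write-back offset, the net effect of this kernel (plus the block-offset pass in the shuffle kernel below) is an exclusive prefix sum over all 192*256 radix counters. A sequential reference, for illustration only:

	// Sequential equivalent (illustrative): each counter becomes the number
	// of keys that land in earlier bin slots, i.e. its final base offset.
	void exclusiveScan ( uint* counts, uint n )
	{
		uint running = 0;
		for ( uint i = 0; i < n; i++ ) {
			uint c = counts[i];
			counts[i] = running;
			running += c;
		}
	}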
////////////////////////////////////////////////////////////////////////////////
//! Initially perform prefix sum of block totals to obtain final set of offsets.
//! Then make use of radix sums to perform a shuffling of the data into the
//! correct bins.
//!
//! @param pSrc        input data
//! @param pDst        output data
//! @param elements    total number of elements
//! @param shift       the shift (0 to 24) that we are using to obtain the correct
//!                    byte
////////////////////////////////////////////////////////////////////////////////
__global__ void RadixAddOffsetsAndShuffle(KeyValuePair* pSrc, KeyValuePair* pDst, uint elements, uint elements_rounded_to_3072, int shift)
{
	// Read offsets from previous blocks
	if (threadIdx.x == 0)
		sRadixSum[SHUFFLE_GRFOFFSET] = 0;

	if (threadIdx.x < PREFIX_NUM_BLOCKS - 1)
		sRadixSum[SHUFFLE_GRFOFFSET + threadIdx.x + 1] = dRadixBlockSum[threadIdx.x];
	__syncthreads();

	// Parallel prefix sum over block sums
	int pos = threadIdx.x;
	int n = 1;
	while (n < PREFIX_NUM_BLOCKS)
	{
		int ppos = pos - n;
		uint t0 = ((pos < PREFIX_NUM_BLOCKS) && (ppos >= 0)) ? sRadixSum[SHUFFLE_GRFOFFSET + ppos] : 0;
		__syncthreads();
		if (pos < PREFIX_NUM_BLOCKS)
			sRadixSum[SHUFFLE_GRFOFFSET + pos] += t0;
		__syncthreads();
		n *= 2;
	}

	// Read radix count data and add the appropriate block offset
	// for each radix at the memory location for this thread
	// (where the other threads in the block will be reading
	// as well, hence the large stride).
	// There is one counter box per radix group per radix
	// per block (4*256*3).
	// We use 64 threads to read the 4 radix groups' set of radices
	// for the block.
	int row = threadIdx.x / RADIXGROUPS;
	int column = threadIdx.x % RADIXGROUPS;
	int spos = row * RADIXGROUPS + column;
	int dpos = row * TOTALRADIXGROUPS + column + blockIdx.x * RADIXGROUPS;
	while (spos < SHUFFLE_GRFOFFSET)
	{
		sRadixSum[spos] = dRadixSum[dpos] + sRadixSum[SHUFFLE_GRFOFFSET + dpos / (TOTALRADIXGROUPS * RADICES / PREFIX_NUM_BLOCKS)];
		spos += NUM_THREADS_PER_BLOCK;
		dpos += (NUM_THREADS_PER_BLOCK / RADIXGROUPS) * TOTALRADIXGROUPS;
	}
	__syncthreads();

	// Shuffle data.
	// Each of the subbins for a block should be filled via the counters, properly interleaved.
	// Then, as we now iterate over each data value, we increment the subbins (each thread in the
	// radix group in turn, to avoid missed writes due to conflicts) and set locations correctly.
	uint element_fraction = elements_rounded_to_3072 / TOTALRADIXGROUPS;
	int tmod = threadIdx.x % RADIXTHREADS;
	int tpos = threadIdx.x / RADIXTHREADS;

	pos = (blockIdx.x * RADIXGROUPS + tpos) * element_fraction;
	uint end = pos + element_fraction;
	pos += tmod;

	__syncthreads();

	while (pos < end )
	{
		KeyValuePair kvp;
#if 1	// old load
		// Read first data element, both items at once as the memory will want to coalesce like that anyway
		if (pos < elements)
		{
			kvp = pSrc[pos];
		}
		else
			kvp.key = 0;
#else	// casting to int2 to get it to combine loads
		int2 kvpf2;

		// Read first data element, both items at once as the memory will want to coalesce like that anyway
		if (pos < elements)
		{
			kvpf2 = ((int2*)pSrc)[pos];
		}
		else
			kvpf2.x = 0;

		kvp.key = kvpf2.x;
		kvp.value = kvpf2.y;
#endif

		uint index;

		// Calculate position of radix counter to increment
		uint p = ((kvp.key >> shift) & RADIXMASK) * RADIXGROUPS;

		// Move data, keeping counts updated.
		// Increment radix counters, relying on the hexadecathread
		// warp to prevent this code from stepping all over itself.
		// (Equivalent to the original fully unrolled tmod == 0..15 chain:
		// the threads of a radix group take turns claiming an output slot.)
		uint ppos = p + tpos;
		for (int t = 0; t < RADIXTHREADS; t++)
		{
			if (tmod == t && pos < elements)
			{
				index = sRadixSum[ppos]++;
				pDst[index] = kvp;
			}
			SYNCIT;
		}

		pos += RADIXTHREADS;
	}

	__syncthreads();
}
#endif // #ifndef _RADIXSORT_KERNEL_H_