Added initial Smoothed Particle Hydrodynamics implementation (SPH), for CPU and CUDA.

This software is contributed under the ZLib license by Rama Hoetzlein, http://www.rchoetzlein.com We plan to integrate the SPH into the core Bullet library, including interaction with rigid bodies and soft bodies.
2009-05-13 22:28:03 +00:00
parent 4616b62686
commit e260bcd1b2
50 changed files with 61309 additions and 0 deletions
--- a/Extras/sph/fluids/fluid.cpp
+++ b/Extras/sph/fluids/fluid.cpp
@@ -0,0 +1,22 @@
+/*
+  FLUIDS v.1 - SPH Fluid Simulator for CPU and GPU
+  Copyright (C) 2009. Rama Hoetzlein, http://www.rchoetzlein.com
+
+  ZLib license
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+*/
+#include "fluid.h"
--- a/Extras/sph/fluids/fluid.h
+++ b/Extras/sph/fluids/fluid.h
@@ -0,0 +1,44 @@
+/*
+  FLUIDS v.1 - SPH Fluid Simulator for CPU and GPU
+  Copyright (C) 2008. Rama Hoetzlein, http://www.rchoetzlein.com
+
+  ZLib license
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef DEF_FLUID
+	#define DEF_FLUID
+	
+	#include "vector.h"
+
+	#include "common_defs.h"
+
+	struct Fluid {
+	public:
+		Vector3DF		pos;			// Basic particle (must match Particle class)
+		DWORD			clr;
+		int				next;
+		Vector3DF		vel;			
+		Vector3DF		vel_eval;		
+		unsigned short	age;
+
+		float			pressure;		// Smoothed Particle Hydrodynamics
+		float			density;	
+		Vector3DF		sph_force;
+	};
+
+#endif /*PARTICLE_H_*/
--- a/Extras/sph/fluids/fluid_system.cpp
+++ b/Extras/sph/fluids/fluid_system.cpp
@@ -0,0 +1,869 @@
+/*
+  FLUIDS v.1 - SPH Fluid Simulator for CPU and GPU
+  Copyright (C) 2008. Rama Hoetzlein, http://www.rchoetzlein.com
+
+  ZLib license
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+
+#include <conio.h>
+
+#ifdef _MSC_VER
+	#include <gl/glut.h>
+#else
+	#include <GL/glut.h>
+#endif
+
+#include "common_defs.h"
+#include "mtime.h"
+#include "fluid_system.h"
+
+#ifdef BUILD_CUDA
+	#include "fluid_system_host.cuh"
+#endif
+
+#define EPSILON			0.00001f			//for collision detection
+
+FluidSystem::FluidSystem ()
+{
+}
+
+void FluidSystem::Initialize ( int mode, int total )
+{
+	if ( mode != BFLUID ) {
+		printf ( "ERROR: FluidSystem not initialized as BFLUID.\n");
+	}
+	PointSet::Initialize ( mode, total );
+	
+	FreeBuffers ();
+	AddBuffer ( BFLUID, sizeof ( Fluid ), total );
+	AddAttribute ( 0, "pos", sizeof ( Vector3DF ), false );	
+	AddAttribute ( 0, "color", sizeof ( DWORD ), false );
+	AddAttribute ( 0, "vel", sizeof ( Vector3DF ), false );
+	AddAttribute ( 0, "ndx", sizeof ( unsigned short ), false );
+	AddAttribute ( 0, "age", sizeof ( unsigned short ), false );
+
+	AddAttribute ( 0, "pressure", sizeof ( double ), false );
+	AddAttribute ( 0, "density", sizeof ( double ), false );
+	AddAttribute ( 0, "sph_force", sizeof ( Vector3DF ), false );
+	AddAttribute ( 0, "next", sizeof ( Fluid* ), false );
+	AddAttribute ( 0, "tag", sizeof ( bool ), false );		
+		
+	SPH_Setup ();
+	Reset ( total );	
+}
+
+void FluidSystem::Reset ( int nmax )
+{
+	ResetBuffer ( 0, nmax );
+
+	m_DT = 0.003; //  0.001;			// .001 = for point grav
+
+	// Reset parameters
+	m_Param [ MAX_FRAC ] = 1.0;
+	m_Param [ POINT_GRAV ] = 0.0;
+	m_Param [ PLANE_GRAV ] = 1.0;
+
+	m_Param [ BOUND_ZMIN_SLOPE ] = 0.0;
+	m_Param [ FORCE_XMAX_SIN ] = 0.0;
+	m_Param [ FORCE_XMIN_SIN ] = 0.0;	
+	m_Toggle [ WRAP_X ] = false;
+	m_Toggle [ WALL_BARRIER ] = false;
+	m_Toggle [ LEVY_BARRIER ] = false;
+	m_Toggle [ DRAIN_BARRIER ] = false;
+	m_Param [ SPH_INTSTIFF ] = 1.00;
+	m_Param [ SPH_VISC ] = 0.2;
+	m_Param [ SPH_INTSTIFF ] = 0.50;
+	m_Param [ SPH_EXTSTIFF ] = 20000;
+	m_Param [ SPH_SMOOTHRADIUS ] = 0.01;
+	
+	m_Vec [ POINT_GRAV_POS ].Set ( 0, 0, 50 );
+	m_Vec [ PLANE_GRAV_DIR ].Set ( 0, 0, -9.8 );
+	m_Vec [ EMIT_POS ].Set ( 0, 0, 0 );
+	m_Vec [ EMIT_RATE ].Set ( 0, 0, 0 );
+	m_Vec [ EMIT_ANG ].Set ( 0, 90, 1.0 );
+	m_Vec [ EMIT_DANG ].Set ( 0, 0, 0 );
+}
+
+int FluidSystem::AddPoint ()
+{
+	xref ndx;	
+	Fluid* f = (Fluid*) AddElem ( 0, ndx );	
+	f->sph_force.Set(0,0,0);
+	f->vel.Set(0,0,0);
+	f->vel_eval.Set(0,0,0);
+	f->next = 0x0;
+	f->pressure = 0;
+	f->density = 0;
+	return ndx;
+}
+
+int FluidSystem::AddPointReuse ()
+{
+	xref ndx;	
+	Fluid* f;
+	if ( NumPoints() <= mBuf[0].max-2 )
+		f = (Fluid*) AddElem ( 0, ndx );
+	else
+		f = (Fluid*) RandomElem ( 0, ndx );
+
+	f->sph_force.Set(0,0,0);
+	f->vel.Set(0,0,0);
+	f->vel_eval.Set(0,0,0);
+	f->next = 0x0;
+	f->pressure = 0;
+	f->density = 0;
+	return ndx;
+}
+
+void FluidSystem::Run ()
+{
+	bool bTiming = true;
+
+	mint::Time start, stop;
+	
+	float ss = m_Param [ SPH_PDIST ] / m_Param[ SPH_SIMSCALE ];		// simulation scale (not Schutzstaffel)
+
+	if ( m_Vec[EMIT_RATE].x > 0 && (++m_Frame) % (int) m_Vec[EMIT_RATE].x == 0 ) {
+		//m_Frame = 0;
+		Emit ( ss ); 
+	}
+	
+	#ifdef NOGRID
+		// Slow method - O(n^2)
+		SPH_ComputePressureSlow ();
+		SPH_ComputeForceSlow ();
+	#else
+
+		if ( m_Toggle[USE_CUDA] ) {
+			
+			#ifdef BUILD_CUDA
+				// -- GPU --
+				start.SetSystemTime ( ACC_NSEC );		
+				TransferToCUDA ( mBuf[0].data, (int*) &m_Grid[0], NumPoints() );
+				if ( bTiming) { stop.SetSystemTime ( ACC_NSEC ); stop = stop - start; printf ( "TO: %s\n", stop.GetReadableTime().c_str() ); }
+			
+				start.SetSystemTime ( ACC_NSEC );		
+				Grid_InsertParticlesCUDA ();
+				if ( bTiming) { stop.SetSystemTime ( ACC_NSEC ); stop = stop - start; printf ( "INSERT (CUDA): %s\n", stop.GetReadableTime().c_str() ); }
+
+				start.SetSystemTime ( ACC_NSEC );
+				SPH_ComputePressureCUDA ();
+				if ( bTiming) { stop.SetSystemTime ( ACC_NSEC ); stop = stop - start; printf ( "PRESS (CUDA): %s\n", stop.GetReadableTime().c_str() ); }
+
+				start.SetSystemTime ( ACC_NSEC );
+				SPH_ComputeForceCUDA (); 
+				if ( bTiming) { stop.SetSystemTime ( ACC_NSEC ); stop = stop - start; printf ( "FORCE (CUDA): %s\n", stop.GetReadableTime().c_str() ); }
+
+				//** CUDA integrator is incomplete..
+				// Once integrator is done, we can remove TransferTo/From steps
+				/*start.SetSystemTime ( ACC_NSEC );
+				SPH_AdvanceCUDA( m_DT, m_DT/m_Param[SPH_SIMSCALE] );
+				if ( bTiming) { stop.SetSystemTime ( ACC_NSEC ); stop = stop - start; printf ( "ADV (CUDA): %s\n", stop.GetReadableTime().c_str() ); }*/
+
+				start.SetSystemTime ( ACC_NSEC );		
+				TransferFromCUDA ( mBuf[0].data, (int*) &m_Grid[0], NumPoints() );
+				if ( bTiming) { stop.SetSystemTime ( ACC_NSEC ); stop = stop - start; printf ( "FROM: %s\n", stop.GetReadableTime().c_str() ); }
+
+				// .. Do advance on CPU 
+				Advance();
+
+			#endif
+			
+		} else {
+			// -- CPU only --
+
+			start.SetSystemTime ( ACC_NSEC );
+			Grid_InsertParticles ();
+			if ( bTiming) { stop.SetSystemTime ( ACC_NSEC ); stop = stop - start; printf ( "INSERT: %s\n", stop.GetReadableTime().c_str() ); }
+		
+			start.SetSystemTime ( ACC_NSEC );
+			SPH_ComputePressureGrid ();
+			if ( bTiming) { stop.SetSystemTime ( ACC_NSEC ); stop = stop - start; printf ( "PRESS: %s\n", stop.GetReadableTime().c_str() ); }
+
+			start.SetSystemTime ( ACC_NSEC );
+			SPH_ComputeForceGridNC ();		
+			if ( bTiming) { stop.SetSystemTime ( ACC_NSEC ); stop = stop - start; printf ( "FORCE: %s\n", stop.GetReadableTime().c_str() ); }
+
+			start.SetSystemTime ( ACC_NSEC );
+			Advance();
+			if ( bTiming) { stop.SetSystemTime ( ACC_NSEC ); stop = stop - start; printf ( "ADV: %s\n", stop.GetReadableTime().c_str() ); }
+		}		
+		
+	#endif
+}
+
+
+
+void FluidSystem::SPH_DrawDomain ()
+{
+	Vector3DF min, max;
+	min = m_Vec[SPH_VOLMIN];
+	max = m_Vec[SPH_VOLMAX];
+	min.z += 0.5;
+
+	glColor3f ( 0.0, 0.0, 1.0 );
+	glBegin ( GL_LINES );
+	glVertex3f ( min.x, min.y, min.z );	glVertex3f ( max.x, min.y, min.z );
+	glVertex3f ( min.x, max.y, min.z );	glVertex3f ( max.x, max.y, min.z );
+	glVertex3f ( min.x, min.y, min.z );	glVertex3f ( min.x, max.y, min.z );
+	glVertex3f ( max.x, min.y, min.z );	glVertex3f ( max.x, max.y, min.z );
+	glEnd ();
+}
+
+void FluidSystem::Advance ()
+{
+	char *dat1, *dat1_end;
+	Fluid* p;
+	Vector3DF norm, z;
+	Vector3DF dir, accel;
+	Vector3DF vnext;
+	Vector3DF min, max;
+	double adj;
+	float SL, SL2, ss, radius;
+	float stiff, damp, speed, diff; 
+	SL = m_Param[SPH_LIMIT];
+	SL2 = SL*SL;
+	
+	stiff = m_Param[SPH_EXTSTIFF];
+	damp = m_Param[SPH_EXTDAMP];
+	radius = m_Param[SPH_PRADIUS];
+	min = m_Vec[SPH_VOLMIN];
+	max = m_Vec[SPH_VOLMAX];
+	ss = m_Param[SPH_SIMSCALE];
+
+	dat1_end = mBuf[0].data + NumPoints()*mBuf[0].stride;
+	for ( dat1 = mBuf[0].data; dat1 < dat1_end; dat1 += mBuf[0].stride ) {
+		p = (Fluid*) dat1;		
+
+		// Compute Acceleration		
+		accel = p->sph_force;
+		accel *= m_Param[SPH_PMASS];
+
+		// Velocity limiting 
+		speed = accel.x*accel.x + accel.y*accel.y + accel.z*accel.z;
+		if ( speed > SL2 ) {
+			accel *= SL / sqrt(speed);
+		}		
+	
+		// Boundary Conditions
+
+		// Z-axis walls
+		diff = 2 * radius - ( p->pos.z - min.z - (p->pos.x - m_Vec[SPH_VOLMIN].x) * m_Param[BOUND_ZMIN_SLOPE] )*ss;
+		if (diff > EPSILON ) {			
+			norm.Set ( -m_Param[BOUND_ZMIN_SLOPE], 0, 1.0 - m_Param[BOUND_ZMIN_SLOPE] );
+			adj = stiff * diff - damp * norm.Dot ( p->vel_eval );
+			accel.x += adj * norm.x; accel.y += adj * norm.y; accel.z += adj * norm.z;
+		}		
+
+		diff = 2 * radius - ( max.z - p->pos.z )*ss;
+		if (diff > EPSILON) {
+			norm.Set ( 0, 0, -1 );
+			adj = stiff * diff - damp * norm.Dot ( p->vel_eval );
+			accel.x += adj * norm.x; accel.y += adj * norm.y; accel.z += adj * norm.z;
+		}
+		
+		// X-axis walls
+		if ( !m_Toggle[WRAP_X] ) {
+			diff = 2 * radius - ( p->pos.x - min.x + (sin(m_Time*10.0)-1+(p->pos.y*0.025)*0.25) * m_Param[FORCE_XMIN_SIN] )*ss;	
+			//diff = 2 * radius - ( p->pos.x - min.x + (sin(m_Time*10.0)-1) * m_Param[FORCE_XMIN_SIN] )*ss;	
+			if (diff > EPSILON ) {
+				norm.Set ( 1.0, 0, 0 );
+				adj = (m_Param[ FORCE_XMIN_SIN ] + 1) * stiff * diff - damp * norm.Dot ( p->vel_eval ) ;
+				accel.x += adj * norm.x; accel.y += adj * norm.y; accel.z += adj * norm.z;					
+			}
+
+			diff = 2 * radius - ( max.x - p->pos.x + (sin(m_Time*10.0)-1) * m_Param[FORCE_XMAX_SIN] )*ss;	
+			if (diff > EPSILON) {
+				norm.Set ( -1, 0, 0 );
+				adj = (m_Param[ FORCE_XMAX_SIN ]+1) * stiff * diff - damp * norm.Dot ( p->vel_eval );
+				accel.x += adj * norm.x; accel.y += adj * norm.y; accel.z += adj * norm.z;
+			}
+		}
+
+		// Y-axis walls
+		diff = 2 * radius - ( p->pos.y - min.y )*ss;			
+		if (diff > EPSILON) {
+			norm.Set ( 0, 1, 0 );
+			adj = stiff * diff - damp * norm.Dot ( p->vel_eval );
+			accel.x += adj * norm.x; accel.y += adj * norm.y; accel.z += adj * norm.z;
+		}
+		diff = 2 * radius - ( max.y - p->pos.y )*ss;
+		if (diff > EPSILON) {
+			norm.Set ( 0, -1, 0 );
+			adj = stiff * diff - damp * norm.Dot ( p->vel_eval );
+			accel.x += adj * norm.x; accel.y += adj * norm.y; accel.z += adj * norm.z;
+		}
+
+		// Wall barrier
+		if ( m_Toggle[WALL_BARRIER] ) {
+			diff = 2 * radius - ( p->pos.x - 0 )*ss;					
+			if (diff < 2*radius && diff > EPSILON && fabs(p->pos.y) < 3 && p->pos.z < 10) {
+				norm.Set ( 1.0, 0, 0 );
+				adj = 2*stiff * diff - damp * norm.Dot ( p->vel_eval ) ;	
+				accel.x += adj * norm.x; accel.y += adj * norm.y; accel.z += adj * norm.z;					
+			}
+		}
+		
+		// Levy barrier
+		if ( m_Toggle[LEVY_BARRIER] ) {
+			diff = 2 * radius - ( p->pos.x - 0 )*ss;					
+			if (diff < 2*radius && diff > EPSILON && fabs(p->pos.y) > 5 && p->pos.z < 10) {
+				norm.Set ( 1.0, 0, 0 );
+				adj = 2*stiff * diff - damp * norm.Dot ( p->vel_eval ) ;	
+				accel.x += adj * norm.x; accel.y += adj * norm.y; accel.z += adj * norm.z;					
+			}
+		}
+		// Drain barrier
+		if ( m_Toggle[DRAIN_BARRIER] ) {
+			diff = 2 * radius - ( p->pos.z - min.z-15 )*ss;
+			if (diff < 2*radius && diff > EPSILON && (fabs(p->pos.x)>3 || fabs(p->pos.y)>3) ) {
+				norm.Set ( 0, 0, 1);
+				adj = stiff * diff - damp * norm.Dot ( p->vel_eval );
+				accel.x += adj * norm.x; accel.y += adj * norm.y; accel.z += adj * norm.z;
+			}
+		}
+
+		// Plane gravity
+		if ( m_Param[PLANE_GRAV] > 0) 
+			accel += m_Vec[PLANE_GRAV_DIR];
+
+		// Point gravity
+		if ( m_Param[POINT_GRAV] > 0 ) {
+			norm.x = ( p->pos.x - m_Vec[POINT_GRAV_POS].x );
+			norm.y = ( p->pos.y - m_Vec[POINT_GRAV_POS].y );
+			norm.z = ( p->pos.z - m_Vec[POINT_GRAV_POS].z );
+			norm.Normalize ();
+			norm *= m_Param[POINT_GRAV];
+			accel -= norm;
+		}
+
+		// Leapfrog Integration ----------------------------
+		vnext = accel;							
+		vnext *= m_DT;
+		vnext += p->vel;						// v(t+1/2) = v(t-1/2) + a(t) dt
+		p->vel_eval = p->vel;
+		p->vel_eval += vnext;
+		p->vel_eval *= 0.5;					// v(t+1) = [v(t-1/2) + v(t+1/2)] * 0.5		used to compute forces later
+		p->vel = vnext;
+		vnext *= m_DT/ss;
+		p->pos += vnext;						// p(t+1) = p(t) + v(t+1/2) dt
+
+		if ( m_Param[CLR_MODE]==1.0 ) {
+			adj = fabs(vnext.x)+fabs(vnext.y)+fabs(vnext.z) / 7000.0;
+			adj = (adj > 1.0) ? 1.0 : adj;
+			p->clr = COLORA( 0, adj, adj, 1 );
+		}
+		if ( m_Param[CLR_MODE]==2.0 ) {
+			float v = 0.5 + ( p->pressure / 1500.0); 
+			if ( v < 0.1 ) v = 0.1;
+			if ( v > 1.0 ) v = 1.0;
+			p->clr = COLORA ( v, 1-v, 0, 1 );
+		}
+
+
+		// Euler integration -------------------------------
+		/* accel += m_Gravity;
+		accel *= m_DT;
+		p->vel += accel;				// v(t+1) = v(t) + a(t) dt
+		p->vel_eval += accel;
+		p->vel_eval *= m_DT/d;
+		p->pos += p->vel_eval;
+		p->vel_eval = p->vel;  */	
+
+
+		if ( m_Toggle[WRAP_X] ) {
+			diff = p->pos.x - (m_Vec[SPH_VOLMIN].x + 2);			// -- Simulates object in center of flow
+			if ( diff <= 0 ) {
+				p->pos.x = (m_Vec[SPH_VOLMAX].x - 2) + diff*2;				
+				p->pos.z = 10;
+			}
+		}	
+	}
+
+	m_Time += m_DT;
+}
+
+//------------------------------------------------------ SPH Setup 
+//
+//  Range = +/- 10.0 * 0.006 (r) =	   0.12			m (= 120 mm = 4.7 inch)
+//  Container Volume (Vc) =			   0.001728		m^3
+//  Rest Density (D) =				1000.0			kg / m^3
+//  Particle Mass (Pm) =			   0.00020543	kg						(mass = vol * density)
+//  Number of Particles (N) =		4000.0
+//  Water Mass (M) =				   0.821		kg (= 821 grams)
+//  Water Volume (V) =				   0.000821     m^3 (= 3.4 cups, .21 gals)
+//  Smoothing Radius (R) =             0.02			m (= 20 mm = ~3/4 inch)
+//  Particle Radius (Pr) =			   0.00366		m (= 4 mm  = ~1/8 inch)
+//  Particle Volume (Pv) =			   2.054e-7		m^3	(= .268 milliliters)
+//  Rest Distance (Pd) =			   0.0059		m
+//
+//  Given: D, Pm, N
+//    Pv = Pm / D			0.00020543 kg / 1000 kg/m^3 = 2.054e-7 m^3	
+//    Pv = 4/3*pi*Pr^3    cuberoot( 2.054e-7 m^3 * 3/(4pi) ) = 0.00366 m
+//     M = Pm * N			0.00020543 kg * 4000.0 = 0.821 kg		
+//     V =  M / D              0.821 kg / 1000 kg/m^3 = 0.000821 m^3
+//     V = Pv * N			 2.054e-7 m^3 * 4000 = 0.000821 m^3
+//    Pd = cuberoot(Pm/D)    cuberoot(0.00020543/1000) = 0.0059 m 
+//
+// Ideal grid cell size (gs) = 2 * smoothing radius = 0.02*2 = 0.04
+// Ideal domain size = k*gs/d = k*0.02*2/0.005 = k*8 = {8, 16, 24, 32, 40, 48, ..}
+//    (k = number of cells, gs = cell size, d = simulation scale)
+
+void FluidSystem::SPH_Setup ()
+{
+	m_Param [ SPH_SIMSCALE ] =		0.004;			// unit size
+	m_Param [ SPH_VISC ] =			0.2;			// pascal-second (Pa.s) = 1 kg m^-1 s^-1  (see wikipedia page on viscosity)
+	m_Param [ SPH_RESTDENSITY ] =	600.0;			// kg / m^3
+	m_Param [ SPH_PMASS ] =			0.00020543;		// kg
+	m_Param [ SPH_PRADIUS ] =		0.004;			// m
+	m_Param [ SPH_PDIST ] =			0.0059;			// m
+	m_Param [ SPH_SMOOTHRADIUS ] =	0.01;			// m 
+	m_Param [ SPH_INTSTIFF ] =		1.00;
+	m_Param [ SPH_EXTSTIFF ] =		10000.0;
+	m_Param [ SPH_EXTDAMP ] =		256.0;
+	m_Param [ SPH_LIMIT ] =			200.0;			// m / s
+
+	m_Toggle [ SPH_GRID ] =		false;
+	m_Toggle [ SPH_DEBUG ] =	false;
+
+	SPH_ComputeKernels ();
+}
+
+void FluidSystem::SPH_ComputeKernels ()
+{
+	m_Param [ SPH_PDIST ] = pow ( m_Param[SPH_PMASS] / m_Param[SPH_RESTDENSITY], 1/3.0 );
+	m_R2 = m_Param [SPH_SMOOTHRADIUS] * m_Param[SPH_SMOOTHRADIUS];
+	m_Poly6Kern = 315.0f / (64.0f * 3.141592 * pow( m_Param[SPH_SMOOTHRADIUS], 9) );	// Wpoly6 kernel (denominator part) - 2003 Muller, p.4
+	m_SpikyKern = -45.0f / (3.141592 * pow( m_Param[SPH_SMOOTHRADIUS], 6) );			// Laplacian of viscocity (denominator): PI h^6
+	m_LapKern = 45.0f / (3.141592 * pow( m_Param[SPH_SMOOTHRADIUS], 6) );
+}
+
+void FluidSystem::SPH_CreateExample ( int n, int nmax )
+{
+	Vector3DF pos;
+	Vector3DF min, max;
+	
+	Reset ( nmax );
+	
+	switch ( n ) {
+	case 0:		// Wave pool
+				
+		//-- TEST CASE: 2x2x2 grid, 32 particles.  NOTE: Set PRADIUS to 0.0004 to reduce wall influence
+		//     grid 0:    3*3*2 = 18 particles
+		//     grid 1,2:  3*1*2 =  6 particles
+		//     grid 3:    1*1*2 =  2 particles
+		//     grid 4,5,6:    0 =  0 particles
+		/*m_Vec [ SPH_VOLMIN ].Set ( -2.5, -2.5, 0 );
+		m_Vec [ SPH_VOLMAX ].Set ( 2.5, 2.5, 5.0 );	
+		m_Vec [ SPH_INITMIN ].Set ( -2.5, -2.5, 0 );	
+		m_Vec [ SPH_INITMAX ].Set ( 2.5, 2.5, 1.6 );*/  
+		
+		m_Vec [ SPH_VOLMIN ].Set ( -30, -30, 0 );
+		m_Vec [ SPH_VOLMAX ].Set ( 30, 30, 40 );		
+
+		//m_Vec [ SPH_INITMIN ].Set ( -5, -5, 10 );
+		//m_Vec [ SPH_INITMAX ].Set ( 5, 5, 20 );
+		
+		m_Vec [ SPH_INITMIN ].Set ( -20, -26, 10 );
+		m_Vec [ SPH_INITMAX ].Set ( 20, 26, 40 );
+
+		m_Param [ FORCE_XMIN_SIN ] = 12.0;
+		m_Param [ BOUND_ZMIN_SLOPE ] = 0.05;
+		break;
+	case 1:		// Dam break
+		m_Vec [ SPH_VOLMIN ].Set ( -30, -14, 0 );
+		m_Vec [ SPH_VOLMAX ].Set ( 30, 14, 60 );
+		m_Vec [ SPH_INITMIN ].Set ( 0, -13, 0 );
+		m_Vec [ SPH_INITMAX ].Set ( 29, 13, 30 );		
+		m_Vec [ PLANE_GRAV_DIR ].Set ( 0.0, 0, -9.8 );
+		break;
+	case 2:		// Dual-Wave pool
+		m_Vec [ SPH_VOLMIN ].Set ( -60, -5, 0 );
+		m_Vec [ SPH_VOLMAX ].Set ( 60, 5, 50 );
+		m_Vec [ SPH_INITMIN ].Set ( -46, -5, 0 );
+		m_Vec [ SPH_INITMAX ].Set ( 46, 5, 15 );
+		m_Param [ FORCE_XMIN_SIN ] = 8.0;
+		m_Param [ FORCE_XMAX_SIN ] = 8.0;
+		m_Vec [ PLANE_GRAV_DIR ].Set ( 0.0, 0, -9.8 );
+		break;
+	case 3:		// Swirl Stream
+		m_Vec [ SPH_VOLMIN ].Set ( -30, -30, 0 );
+		m_Vec [ SPH_VOLMAX ].Set ( 30, 30, 50 );
+		m_Vec [ SPH_INITMIN ].Set ( -30, -30, 0 );
+		m_Vec [ SPH_INITMAX ].Set ( 30, 30, 40 );
+		m_Vec [ EMIT_POS ].Set ( -20, -20, 22 );
+		m_Vec [ EMIT_RATE ].Set ( 1, 4, 0 );
+		m_Vec [ EMIT_ANG ].Set ( 0, 120, 1.5 );
+		m_Vec [ EMIT_DANG ].Set ( 0, 0, 0 );
+		m_Vec [ PLANE_GRAV_DIR ].Set ( 0.0, 0, -9.8 );
+		break;
+	case 4:		// Shockwave
+		m_Vec [ SPH_VOLMIN ].Set ( -60, -15, 0 );
+		m_Vec [ SPH_VOLMAX ].Set ( 60, 15, 50 );
+		m_Vec [ SPH_INITMIN ].Set ( -59, -14, 0 );
+		m_Vec [ SPH_INITMAX ].Set ( 59, 14, 30 );
+		m_Vec [ PLANE_GRAV_DIR ].Set ( 0.0, 0, -9.8 );
+		m_Toggle [ WALL_BARRIER ] = true;
+		m_Toggle [ WRAP_X ] = true;
+		break;
+	case 5:		// Zero gravity
+		m_Vec [ SPH_VOLMIN ].Set ( -40, -40, 0 );
+		m_Vec [ SPH_VOLMAX ].Set ( 40, 40, 50 );
+		m_Vec [ SPH_INITMIN ].Set ( -20, -20, 20 );
+		m_Vec [ SPH_INITMAX ].Set ( 20, 20, 40 );
+		m_Vec [ EMIT_POS ].Set ( -20, 0, 40 );
+		m_Vec [ EMIT_RATE ].Set ( 2, 1, 0 );		
+		m_Vec [ EMIT_ANG ].Set ( 0, 120, 0.25 );
+		m_Vec [ PLANE_GRAV_DIR ].Set ( 0, 0, 0 );
+		m_Param [ SPH_INTSTIFF ] = 0.20;		
+		break;
+	case 6:		// Point gravity		
+		m_Vec [ SPH_VOLMIN ].Set ( -40, -40, 0 );
+		m_Vec [ SPH_VOLMAX ].Set ( 40, 40, 50 );
+		m_Vec [ SPH_INITMIN ].Set ( -20, -20, 20 );
+		m_Vec [ SPH_INITMAX ].Set ( 20, 20, 40 );
+		m_Param [ SPH_INTSTIFF ] = 0.50;		
+		m_Vec [ EMIT_POS ].Set ( -20, 20, 25 );
+		m_Vec [ EMIT_RATE ].Set ( 1, 4, 0 );		
+		m_Vec [ EMIT_ANG ].Set ( -20, 100, 2.0 );
+		m_Vec [ POINT_GRAV_POS ].Set ( 0, 0, 25 );
+		m_Vec [ PLANE_GRAV_DIR ].Set ( 0, 0, 0 );
+		m_Param [ POINT_GRAV ] = 3.5;
+		break;
+	case 7:		// Levy break
+		m_Vec [ SPH_VOLMIN ].Set ( -40, -40, 0 );
+		m_Vec [ SPH_VOLMAX ].Set ( 40, 40, 50 );
+		m_Vec [ SPH_INITMIN ].Set ( 10, -40, 0 );
+		m_Vec [ SPH_INITMAX ].Set ( 40, 40, 50 );
+		m_Vec [ EMIT_POS ].Set ( 34, 27, 16.6 );
+		m_Vec [ EMIT_RATE ].Set ( 2, 9, 0 );		
+		m_Vec [ EMIT_ANG ].Set ( 118, 200, 1.0 );
+		m_Toggle [ LEVY_BARRIER ] = true;
+		m_Param [ BOUND_ZMIN_SLOPE ] = 0.1;
+		m_Vec [ PLANE_GRAV_DIR ].Set ( 0.0, 0, -9.8 );
+		break;
+	case 8:		// Drain
+		m_Vec [ SPH_VOLMIN ].Set ( -20, -20, 0 );
+		m_Vec [ SPH_VOLMAX ].Set ( 20, 20, 50 );
+		m_Vec [ SPH_INITMIN ].Set ( -15, -20, 20 );
+		m_Vec [ SPH_INITMAX ].Set ( 20, 20, 50 );
+		m_Vec [ EMIT_POS ].Set ( -16, -16, 30 );
+		m_Vec [ EMIT_RATE ].Set ( 1, 4, 0 );		
+		m_Vec [ EMIT_ANG ].Set ( -20, 140, 1.8 );
+		m_Toggle [ DRAIN_BARRIER ] = true;
+		m_Vec [ PLANE_GRAV_DIR ].Set ( 0.0, 0, -9.8 );
+		break;
+	case 9:			// Tumbler
+		m_Vec [ SPH_VOLMIN ].Set ( -30, -30, 0 );
+		m_Vec [ SPH_VOLMAX ].Set ( 30, 30, 50 );
+		m_Vec [ SPH_INITMIN ].Set ( 24, -29, 20 );
+		m_Vec [ SPH_INITMAX ].Set ( 29, 29, 40 );
+		m_Param [ SPH_VISC ] = 0.1;		
+		m_Param [ SPH_INTSTIFF ] = 0.50;
+		m_Param [ SPH_EXTSTIFF ] = 8000;
+		//m_Param [ SPH_SMOOTHRADIUS ] = 0.01;
+		m_Param [ BOUND_ZMIN_SLOPE ] = 0.4;
+		m_Param [ FORCE_XMIN_SIN ] = 12.00;
+		m_Vec [ PLANE_GRAV_DIR ].Set ( 0.0, 0, -9.8 );
+		break;	
+	case 10:		// Large sim
+		m_Vec [ SPH_VOLMIN ].Set ( -35, -35, 0 );
+		m_Vec [ SPH_VOLMAX ].Set ( 35, 35, 60 );
+		m_Vec [ SPH_INITMIN ].Set ( -5, -35, 0 );
+		m_Vec [ SPH_INITMAX ].Set ( 30, 0, 60 );
+		m_Vec [ PLANE_GRAV_DIR ].Set ( 0.0, 0, -9.8 );
+		break;
+	}	
+
+	SPH_ComputeKernels ();
+
+	m_Param [ SPH_SIMSIZE ] = m_Param [ SPH_SIMSCALE ] * (m_Vec[SPH_VOLMAX].z - m_Vec[SPH_VOLMIN].z);
+	m_Param [ SPH_PDIST ] = pow ( m_Param[SPH_PMASS] / m_Param[SPH_RESTDENSITY], 1/3.0 );	
+
+	float ss = m_Param [ SPH_PDIST ]*0.87 / m_Param[ SPH_SIMSCALE ];	
+	printf ( "Spacing: %f\n", ss);
+	AddVolume ( m_Vec[SPH_INITMIN], m_Vec[SPH_INITMAX], ss );	// Create the particles
+
+	float cell_size = m_Param[SPH_SMOOTHRADIUS]*2.0;			// Grid cell size (2r)	
+	Grid_Setup ( m_Vec[SPH_VOLMIN], m_Vec[SPH_VOLMAX], m_Param[SPH_SIMSCALE], cell_size, 1.0 );												// Setup grid
+	Grid_InsertParticles ();									// Insert particles
+
+	Vector3DF vmin, vmax;
+	vmin =  m_Vec[SPH_VOLMIN];
+	vmin -= Vector3DF(2,2,2);
+	vmax =  m_Vec[SPH_VOLMAX];
+	vmax += Vector3DF(2,2,-2);
+
+	#ifdef BUILD_CUDA
+		FluidClearCUDA ();
+		Sleep ( 500 );
+
+		FluidSetupCUDA ( NumPoints(), sizeof(Fluid), *(float3*)& m_GridMin, *(float3*)& m_GridMax, *(float3*)& m_GridRes, *(float3*)& m_GridSize, (int) m_Vec[EMIT_RATE].x );
+
+		Sleep ( 500 );
+
+		FluidParamCUDA ( m_Param[SPH_SIMSCALE], m_Param[SPH_SMOOTHRADIUS], m_Param[SPH_PMASS], m_Param[SPH_RESTDENSITY], m_Param[SPH_INTSTIFF], m_Param[SPH_VISC] );
+	#endif
+
+}
+
+// Compute Pressures - Very slow yet simple. O(n^2)
+void FluidSystem::SPH_ComputePressureSlow ()
+{
+	char *dat1, *dat1_end;
+	char *dat2, *dat2_end;
+	Fluid *p, *q;
+	int cnt = 0;
+	double dx, dy, dz, sum, dsq, c;
+	double d, d2, mR, mR2;
+	d = m_Param[SPH_SIMSCALE];
+	d2 = d*d;
+	mR = m_Param[SPH_SMOOTHRADIUS];
+	mR2 = mR*mR;	
+
+	dat1_end = mBuf[0].data + NumPoints()*mBuf[0].stride;
+	for ( dat1 = mBuf[0].data; dat1 < dat1_end; dat1 += mBuf[0].stride ) {
+		p = (Fluid*) dat1;
+
+		sum = 0.0;
+		cnt = 0;
+		
+		dat2_end = mBuf[0].data + NumPoints()*mBuf[0].stride;
+		for ( dat2 = mBuf[0].data; dat2 < dat2_end; dat2 += mBuf[0].stride ) {
+			q = (Fluid*) dat2;
+
+			if ( p==q ) continue;
+			dx = ( p->pos.x - q->pos.x)*d;		// dist in cm
+			dy = ( p->pos.y - q->pos.y)*d;
+			dz = ( p->pos.z - q->pos.z)*d;
+			dsq = (dx*dx + dy*dy + dz*dz);
+			if ( mR2 > dsq ) {
+				c =  m_R2 - dsq;
+				sum += c * c * c;
+				cnt++;
+				//if ( p == m_CurrP ) q->tag = true;
+			}
+		}	
+		p->density = sum * m_Param[SPH_PMASS] * m_Poly6Kern ;	
+		p->pressure = ( p->density - m_Param[SPH_RESTDENSITY] ) * m_Param[SPH_INTSTIFF];
+		p->density = 1.0f / p->density;
+	}
+}
+
+// Compute Pressures - Using spatial grid, and also create neighbor table
+void FluidSystem::SPH_ComputePressureGrid ()
+{
+	char *dat1, *dat1_end;
+	Fluid* p;
+	Fluid* pcurr;
+	int pndx;
+	int i, cnt = 0;
+	float dx, dy, dz, sum, dsq, c;
+	float d, d2, mR, mR2;
+	float radius = m_Param[SPH_SMOOTHRADIUS] / m_Param[SPH_SIMSCALE];
+	d = m_Param[SPH_SIMSCALE];
+	d2 = d*d;
+	mR = m_Param[SPH_SMOOTHRADIUS];
+	mR2 = mR*mR;	
+
+	dat1_end = mBuf[0].data + NumPoints()*mBuf[0].stride;
+	i = 0;
+	for ( dat1 = mBuf[0].data; dat1 < dat1_end; dat1 += mBuf[0].stride, i++ ) {
+		p = (Fluid*) dat1;
+
+		sum = 0.0;	
+		m_NC[i] = 0;
+
+		Grid_FindCells ( p->pos, radius );
+		for (int cell=0; cell < 8; cell++) {
+			if ( m_GridCell[cell] != -1 ) {
+				pndx = m_Grid [ m_GridCell[cell] ];				
+				while ( pndx != -1 ) {					
+					pcurr = (Fluid*) (mBuf[0].data + pndx*mBuf[0].stride);					
+					if ( pcurr == p ) {pndx = pcurr->next; continue; }
+					dx = ( p->pos.x - pcurr->pos.x)*d;		// dist in cm
+					dy = ( p->pos.y - pcurr->pos.y)*d;
+					dz = ( p->pos.z - pcurr->pos.z)*d;
+					dsq = (dx*dx + dy*dy + dz*dz);
+					if ( mR2 > dsq ) {
+						c =  m_R2 - dsq;
+						sum += c * c * c;
+						if ( m_NC[i] < MAX_NEIGHBOR ) {
+							m_Neighbor[i][ m_NC[i] ] = pndx;
+							m_NDist[i][ m_NC[i] ] = sqrt(dsq);
+							m_NC[i]++;
+						}
+					}
+					pndx = pcurr->next;
+				}
+			}
+			m_GridCell[cell] = -1;
+		}
+		p->density = sum * m_Param[SPH_PMASS] * m_Poly6Kern ;	
+		p->pressure = ( p->density - m_Param[SPH_RESTDENSITY] ) * m_Param[SPH_INTSTIFF];		
+		p->density = 1.0f / p->density;		
+	}
+}
+
+// Compute Forces - Very slow, but simple. O(n^2)
+void FluidSystem::SPH_ComputeForceSlow ()
+{
+	char *dat1, *dat1_end;
+	char *dat2, *dat2_end;
+	Fluid *p, *q;
+	Vector3DF force, fcurr;
+	register double pterm, vterm, dterm;
+	double c, r, d, sum, dsq;
+	double dx, dy, dz;
+	double mR, mR2, visc;
+
+	d = m_Param[SPH_SIMSCALE];
+	mR = m_Param[SPH_SMOOTHRADIUS];
+	mR2 = (mR*mR);
+	visc = m_Param[SPH_VISC];
+	vterm = m_LapKern * visc;
+
+	dat1_end = mBuf[0].data + NumPoints()*mBuf[0].stride;
+	for ( dat1 = mBuf[0].data; dat1 < dat1_end; dat1 += mBuf[0].stride ) {
+		p = (Fluid*) dat1;
+
+		sum = 0.0;
+		force.Set ( 0, 0, 0 );
+		
+		dat2_end = mBuf[0].data + NumPoints()*mBuf[0].stride;
+		for ( dat2 = mBuf[0].data; dat2 < dat2_end; dat2 += mBuf[0].stride ) {
+			q = (Fluid*) dat2;
+
+			if ( p == q ) continue;
+			dx = ( p->pos.x - q->pos.x )*d;			// dist in cm
+			dy = ( p->pos.y - q->pos.y )*d;
+			dz = ( p->pos.z - q->pos.z )*d;
+			dsq = (dx*dx + dy*dy + dz*dz);
+			if ( mR2 > dsq ) {
+				r = sqrt ( dsq );
+				c = (mR - r);
+				pterm = -0.5f * c * m_SpikyKern * ( p->pressure + q->pressure) / r;
+				dterm = c * p->density * q->density;
+				force.x += ( pterm * dx + vterm * (q->vel_eval.x - p->vel_eval.x) ) * dterm;
+				force.y += ( pterm * dy + vterm * (q->vel_eval.y - p->vel_eval.y) ) * dterm;
+				force.z += ( pterm * dz + vterm * (q->vel_eval.z - p->vel_eval.z) ) * dterm;
+			}
+		}			
+		p->sph_force = force;		
+	}
+}
+
+// Compute Forces - Using spatial grid. Faster.
+void FluidSystem::SPH_ComputeForceGrid ()
+{
+	char *dat1, *dat1_end;	
+	Fluid *p;
+	Fluid *pcurr;
+	int pndx;
+	Vector3DF force, fcurr;
+	register double pterm, vterm, dterm;
+	double c, d, dsq, r;
+	double dx, dy, dz;
+	double mR, mR2, visc;
+	float radius = m_Param[SPH_SMOOTHRADIUS] / m_Param[SPH_SIMSCALE];
+
+	d = m_Param[SPH_SIMSCALE];
+	mR = m_Param[SPH_SMOOTHRADIUS];
+	mR2 = (mR*mR);
+	visc = m_Param[SPH_VISC];
+
+	dat1_end = mBuf[0].data + NumPoints()*mBuf[0].stride;
+	for ( dat1 = mBuf[0].data; dat1 < dat1_end; dat1 += mBuf[0].stride ) {
+		p = (Fluid*) dat1;
+
+		force.Set ( 0, 0, 0 );
+
+		Grid_FindCells ( p->pos, radius );
+		for (int cell=0; cell < 8; cell++) {
+			if ( m_GridCell[cell] != -1 ) {
+				pndx = m_Grid [ m_GridCell[cell] ];				
+				while ( pndx != -1 ) {					
+					pcurr = (Fluid*) (mBuf[0].data + pndx*mBuf[0].stride);					
+					if ( pcurr == p ) {pndx = pcurr->next; continue; }
+			
+					dx = ( p->pos.x - pcurr->pos.x)*d;		// dist in cm
+					dy = ( p->pos.y - pcurr->pos.y)*d;
+					dz = ( p->pos.z - pcurr->pos.z)*d;
+					dsq = (dx*dx + dy*dy + dz*dz);
+					if ( mR2 > dsq ) {
+						r = sqrt ( dsq );
+						c = (mR - r);
+						pterm = -0.5f * c * m_SpikyKern * ( p->pressure + pcurr->pressure) / r;
+						dterm = c * p->density * pcurr->density;
+						vterm =	m_LapKern * visc;
+						force.x += ( pterm * dx + vterm * (pcurr->vel_eval.x - p->vel_eval.x) ) * dterm;
+						force.y += ( pterm * dy + vterm * (pcurr->vel_eval.y - p->vel_eval.y) ) * dterm;
+						force.z += ( pterm * dz + vterm * (pcurr->vel_eval.z - p->vel_eval.z) ) * dterm;
+					}
+					pndx = pcurr->next;
+				}
+			}
+		}
+		p->sph_force = force;
+	}
+}
+
+// Compute Forces - Using spatial grid with saved neighbor table. Fastest.
+void FluidSystem::SPH_ComputeForceGridNC ()
+{
+	char *dat1, *dat1_end;	
+	Fluid *p;
+	Fluid *pcurr;
+	Vector3DF force, fcurr;
+	register float pterm, vterm, dterm;
+	int i;
+	float c, d;
+	float dx, dy, dz;
+	float mR, mR2, visc;	
+
+	d = m_Param[SPH_SIMSCALE];
+	mR = m_Param[SPH_SMOOTHRADIUS];
+	mR2 = (mR*mR);
+	visc = m_Param[SPH_VISC];
+
+	dat1_end = mBuf[0].data + NumPoints()*mBuf[0].stride;
+	i = 0;
+	
+	for ( dat1 = mBuf[0].data; dat1 < dat1_end; dat1 += mBuf[0].stride, i++ ) {
+		p = (Fluid*) dat1;
+
+		force.Set ( 0, 0, 0 );
+		for (int j=0; j < m_NC[i]; j++ ) {
+			pcurr = (Fluid*) (mBuf[0].data + m_Neighbor[i][j]*mBuf[0].stride);
+			dx = ( p->pos.x - pcurr->pos.x)*d;		// dist in cm
+			dy = ( p->pos.y - pcurr->pos.y)*d;
+			dz = ( p->pos.z - pcurr->pos.z)*d;				
+			c = ( mR - m_NDist[i][j] );
+			pterm = -0.5f * c * m_SpikyKern * ( p->pressure + pcurr->pressure) / m_NDist[i][j];
+			dterm = c * p->density * pcurr->density;
+			vterm = m_LapKern * visc;
+			force.x += ( pterm * dx + vterm * (pcurr->vel_eval.x - p->vel_eval.x) ) * dterm;
+			force.y += ( pterm * dy + vterm * (pcurr->vel_eval.y - p->vel_eval.y) ) * dterm;
+			force.z += ( pterm * dz + vterm * (pcurr->vel_eval.z - p->vel_eval.z) ) * dterm;
+		}			
+		p->sph_force = force;
+	}
+}
+
--- a/Extras/sph/fluids/fluid_system.cu
+++ b/Extras/sph/fluids/fluid_system.cu
@@ -0,0 +1,71 @@
+/*
+  FLUIDS v.1 - SPH Fluid Simulator for CPU and GPU
+  Copyright (C) 2009. Rama Hoetzlein, http://www.rchoetzlein.com
+
+  ZLib license
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include <cutil.h>
+#include <cstdlib>
+#include <cstdio>
+#include <string.h>
+
+#if defined(__APPLE__) || defined(MACOSX)
+	#include <GLUT/glut.h>
+#else
+	#include <GL/glut.h>
+#endif
+#include <cuda_gl_interop.h>
+
+#include "fluid_system_kern.cu"
+
+extern "C"
+{
+
+// Compute number of blocks to create
+int iDivUp (int a, int b) {
+    return (a % b != 0) ? (a / b + 1) : (a / b);
+}
+void computeNumBlocks (int numPnts, int minThreads, int &numBlocks, int &numThreads)
+{
+    numThreads = min( minThreads, numPnts );
+    numBlocks = iDivUp ( numPnts, numThreads );
+}
+
+
+void Grid_InsertParticlesCUDA ( uchar* data, uint stride, uint numPoints )
+{
+    int numThreads, numBlocks;
+    computeNumBlocks (numPoints, 256, numBlocks, numThreads);
+
+	// transfer point data to device
+    char* pntData;
+	size = numPoints * stride;
+	cudaMalloc( (void**) &pntData, size);
+	cudaMemcpy( pntData, data, size, cudaMemcpyHostToDevice);    
+
+    // execute the kernel
+    insertParticles<<< numBlocks, numThreads >>> ( pntData, stride );
+    
+    // transfer data back to host
+    cudaMemcpy( data, pntData, cudaMemcpyDeviceToHost);
+    
+    // check if kernel invocation generated an error
+    CUT_CHECK_ERROR("Kernel execution failed");
+    CUDA_SAFE_CALL(cudaGLUnmapBufferObject(vboPos));
+}
--- a/Extras/sph/fluids/fluid_system.h
+++ b/Extras/sph/fluids/fluid_system.h
@@ -0,0 +1,106 @@
+/*
+  FLUIDS v.1 - SPH Fluid Simulator for CPU and GPU
+  Copyright (C) 2008. Rama Hoetzlein, http://www.rchoetzlein.com
+
+  ZLib license
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#ifndef DEF_FLUID_SYS
+	#define DEF_FLUID_SYS
+
+	#include <iostream>
+	#include <vector>
+	#include <stdio.h>
+	#include <stdlib.h>
+	#include <math.h>
+
+	#include "point_set.h"
+	#include "fluid.h"
+	
+	// Scalar params
+	#define SPH_SIMSIZE			4
+	#define SPH_SIMSCALE		5
+	#define SPH_VISC			6
+	#define SPH_RESTDENSITY		7
+	#define SPH_PMASS			8
+	#define SPH_PRADIUS			9
+	#define SPH_PDIST			10
+	#define SPH_SMOOTHRADIUS	11
+	#define SPH_INTSTIFF		12
+	#define SPH_EXTSTIFF		13
+	#define SPH_EXTDAMP			14
+	#define SPH_LIMIT			15
+	#define BOUND_ZMIN_SLOPE	16
+	#define FORCE_XMAX_SIN		17
+	#define FORCE_XMIN_SIN		18
+	#define MAX_FRAC			19
+	#define CLR_MODE			20
+
+	// Vector params
+	#define SPH_VOLMIN			7
+	#define SPH_VOLMAX			8
+	#define SPH_INITMIN			9
+	#define SPH_INITMAX			10
+
+	// Toggles
+	#define	SPH_GRID			0
+	#define SPH_DEBUG			1
+	#define WRAP_X				2
+	#define WALL_BARRIER		3
+	#define LEVY_BARRIER		4
+	#define DRAIN_BARRIER		5
+	#define USE_CUDA			6
+	
+	#define MAX_PARAM			21
+	#define BFLUID				2
+
+	class FluidSystem : public PointSet {
+	public:
+		FluidSystem ();
+
+		// Basic Particle System
+		virtual void Initialize ( int mode, int nmax );
+		virtual void Reset ( int nmax );
+		virtual void Run ();
+		virtual void Advance ();
+		virtual int AddPoint ();		
+		virtual int AddPointReuse ();
+		Fluid* AddFluid ()			{ return (Fluid*) GetElem(0, AddPointReuse()); }
+		Fluid* GetFluid (int n)		{ return (Fluid*) GetElem(0, n); }
+		
+		// Smoothed Particle Hydrodynamics
+		void SPH_Setup ();
+		void SPH_CreateExample ( int n, int nmax );
+		void SPH_DrawDomain ();
+		void SPH_ComputeKernels ();
+
+		void SPH_ComputePressureSlow ();			// O(n^2)
+		void SPH_ComputePressureGrid ();			// O(kn) - spatial grid
+		
+		void SPH_ComputeForceSlow ();				// O(n^2)
+		void SPH_ComputeForceGrid ();				// O(kn) - spatial grid
+		void SPH_ComputeForceGridNC ();				// O(cn) - neighbor table		
+		
+	private:
+
+		// Smoothed Particle Hydrodynamics
+		double						m_R2, m_Poly6Kern, m_LapKern, m_SpikyKern;		// Kernel functions
+	};
+
+#endif
--- a/Extras/sph/fluids/fluid_system_host.cu
+++ b/Extras/sph/fluids/fluid_system_host.cu
@@ -0,0 +1,247 @@
+/*
+  FLUIDS v.1 - SPH Fluid Simulator for CPU and GPU
+  Copyright (C) 2008. Rama Hoetzlein, http://www.rchoetzlein.com
+
+  ZLib license
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#include "C:\CUDA\common\inc\cutil.h"				// cutil32.lib
+#include <string.h>
+
+
+#if defined(__APPLE__) || defined(MACOSX)
+	#include <GLUT/glut.h>
+#else
+	#include <GL/glut.h>
+#endif
+#include <cuda_gl_interop.h>
+
+#include "radixsort.cu"
+#include "fluid_system_kern.cu"			// build kernel
+
+FluidParams					fcuda;
+
+__device__ char*			bufPnts;		// point data (array of Fluid structs)
+__device__ char*			bufPntSort;		// point data (array of Fluid structs)
+__device__ uint*			bufHash[2];		// point grid hash
+__device__ int*				bufGrid;	
+
+	
+extern "C"
+{
+// Initialize CUDA
+void cudaInit(int argc, char **argv)
+{   
+    CUT_DEVICE_INIT(argc, argv);
+ 
+	cudaDeviceProp p;
+	cudaGetDeviceProperties ( &p, 0);
+	
+	printf ( "-- CUDA --\n" );
+	printf ( "Name:       %s\n", p.name );
+	printf ( "Revision:   %d.%d\n", p.major, p.minor );
+	printf ( "Global Mem: %d\n", p.totalGlobalMem );
+	printf ( "Shared/Blk: %d\n", p.sharedMemPerBlock );
+	printf ( "Regs/Blk:   %d\n", p.regsPerBlock );
+	printf ( "Warp Size:  %d\n", p.warpSize );
+	printf ( "Mem Pitch:  %d\n", p.memPitch );
+	printf ( "Thrds/Blk:  %d\n", p.maxThreadsPerBlock );
+	printf ( "Const Mem:  %d\n", p.totalConstMem );
+	printf ( "Clock Rate: %d\n", p.clockRate );	
+	
+	CUDA_SAFE_CALL ( cudaMalloc ( (void**) &bufPnts, 10 ) );	
+	CUDA_SAFE_CALL ( cudaMalloc ( (void**) &bufPntSort, 10 ) );
+	CUDA_SAFE_CALL ( cudaMalloc ( (void**) &bufHash, 10 ) );	
+	CUDA_SAFE_CALL ( cudaMalloc ( (void**) &bufGrid, 10 ) );	
+};
+	
+// Compute number of blocks to create
+int iDivUp (int a, int b) {
+    return (a % b != 0) ? (a / b + 1) : (a / b);
+}
+void computeNumBlocks (int numPnts, int maxThreads, int &numBlocks, int &numThreads)
+{
+    numThreads = min( maxThreads, numPnts );
+    numBlocks = iDivUp ( numPnts, numThreads );
+}
+
+void FluidClearCUDA ()
+{
+	CUDA_SAFE_CALL ( cudaFree ( bufPnts ) );	
+	CUDA_SAFE_CALL ( cudaFree ( bufPntSort ) );
+	CUDA_SAFE_CALL ( cudaFree ( bufHash[0] ) );	
+	CUDA_SAFE_CALL ( cudaFree ( bufHash[1] ) );	
+	CUDA_SAFE_CALL ( cudaFree ( bufGrid ) );
+}
+
+
+void FluidSetupCUDA ( int num, int stride, float3 min, float3 max, float3 res, float3 size, int chk )
+{	
+	fcuda.min = make_float3(min.x, min.y, min.z);
+	fcuda.max = make_float3(max.x, max.y, max.z);
+	fcuda.res = make_float3(res.x, res.y, res.z);
+	fcuda.size = make_float3(size.x, size.y, size.z);	
+	fcuda.pnts = num;
+	fcuda.delta.x = res.x / size.x;
+	fcuda.delta.y = res.y / size.y;
+	fcuda.delta.z = res.z / size.z;
+	fcuda.cells = res.x*res.y*res.z;
+	fcuda.chk = chk;
+		
+    computeNumBlocks ( fcuda.pnts, 256, fcuda.numBlocks, fcuda.numThreads);			// particles
+    computeNumBlocks ( fcuda.cells, 256, fcuda.gridBlocks, fcuda.gridThreads);		// grid cell
+    
+    fcuda.szPnts = (fcuda.numBlocks * fcuda.numThreads) * stride;        
+    fcuda.szHash = (fcuda.numBlocks * fcuda.numThreads) * sizeof(uint2);		// <cell, particle> pairs
+    fcuda.szGrid = (fcuda.gridBlocks * fcuda.gridThreads) * sizeof(uint);    
+    fcuda.stride = stride;
+    printf ( "pnts: %d, t:%dx%d=%d, bufPnts:%d, bufHash:%d\n", fcuda.pnts, fcuda.numBlocks, fcuda.numThreads, fcuda.numBlocks*fcuda.numThreads, fcuda.szPnts, fcuda.szHash );
+    printf ( "grds: %d, t:%dx%d=%d, bufGrid:%d, Res: %dx%dx%d\n", fcuda.cells, fcuda.gridBlocks, fcuda.gridThreads, fcuda.gridBlocks*fcuda.gridThreads, fcuda.szGrid, (int) fcuda.res.x, (int) fcuda.res.y, (int) fcuda.res.z );	
+
+	CUDA_SAFE_CALL ( cudaMalloc ( (void**) &bufPnts, fcuda.szPnts ) );	
+	CUDA_SAFE_CALL ( cudaMalloc ( (void**) &bufPntSort, fcuda.szPnts ) );	
+	CUDA_SAFE_CALL ( cudaMalloc ( (void**) &bufHash[0], fcuda.szHash ) );	
+	CUDA_SAFE_CALL ( cudaMalloc ( (void**) &bufHash[1], fcuda.szHash ) );	
+	CUDA_SAFE_CALL ( cudaMalloc ( (void**) &bufGrid, fcuda.szGrid ) );
+	
+	printf ( "POINTERS\n");
+	printf ( "bufPnts:    %p\n", bufPnts );
+	printf ( "bufPntSort: %p\n", bufPntSort );
+	printf ( "bufHash0:   %p\n", bufHash[0] );
+	printf ( "bufHash1:   %p\n", bufHash[1] );
+	printf ( "bufGrid:    %p\n", bufGrid );
+	
+	CUDA_SAFE_CALL ( cudaMemcpyToSymbol ( simData, &fcuda, sizeof(FluidParams) ) );
+	cudaThreadSynchronize ();
+}
+
+void FluidParamCUDA ( float sim_scale, float smooth_rad, float mass, float rest, float stiff, float visc )
+{
+	fcuda.sim_scale = sim_scale;
+	fcuda.smooth_rad = smooth_rad;
+	fcuda.r2 = smooth_rad * smooth_rad;
+	fcuda.pmass = mass;
+	fcuda.rest_dens = rest;	
+	fcuda.stiffness = stiff;
+	fcuda.visc = visc;
+	
+	fcuda.pdist = pow ( fcuda.pmass / fcuda.rest_dens, 1/3.0f );
+	fcuda.poly6kern = 315.0f / (64.0f * 3.141592 * pow( smooth_rad, 9.0f) );
+	fcuda.spikykern = -45.0f / (3.141592 * pow( smooth_rad, 6.0f) );
+	fcuda.lapkern = 45.0f / (3.141592 * pow( smooth_rad, 6.0f) );	
+
+	CUDA_SAFE_CALL( cudaMemcpyToSymbol ( simData, &fcuda, sizeof(FluidParams) ) );
+	cudaThreadSynchronize ();
+}
+
+void TransferToCUDA ( char* data, int* grid, int numPoints )
+{
+	CUDA_SAFE_CALL( cudaMemcpy ( bufPnts, data, numPoints * fcuda.stride, cudaMemcpyHostToDevice ) );
+	cudaThreadSynchronize ();
+}
+
+void TransferFromCUDA ( char* data, int* grid, int numPoints )
+{
+	CUDA_SAFE_CALL( cudaMemcpy ( data, bufPntSort, numPoints * fcuda.stride, cudaMemcpyDeviceToHost ) );	
+	cudaThreadSynchronize ();	
+	
+	CUDA_SAFE_CALL( cudaMemcpy ( grid, bufGrid, fcuda.cells * sizeof(uint), cudaMemcpyDeviceToHost ) );			
+}
+
+void Grid_InsertParticlesCUDA ()
+{
+	CUDA_SAFE_CALL( cudaMemset ( bufHash[0], 0, fcuda.szHash ) );
+	
+	hashParticles<<< fcuda.numBlocks, fcuda.numThreads>>> ( bufPnts, (uint2*) bufHash[0], fcuda.pnts );	
+	CUT_CHECK_ERROR( "Kernel execution failed");
+	cudaThreadSynchronize ();
+	
+	//int buf[20000];		
+	/*printf ( "HASH: %d (%d)\n", fcuda.pnts, fcuda.numBlocks*fcuda.numThreads );
+	CUDA_SAFE_CALL( cudaMemcpy ( buf, bufHash[0], fcuda.pnts * 2*sizeof(uint), cudaMemcpyDeviceToHost ) );		
+	//for (int n=0; n < fcuda.numBlocks*fcuda.numThreads; n++) {		
+	for (int n=0; n < 100; n++) {
+		printf ( "%d: <%d,%d>\n", n, buf[n*2], buf[n*2+1] );
+	}*/
+	 
+	RadixSort( (KeyValuePair *) bufHash[0], (KeyValuePair *) bufHash[1], fcuda.pnts, 32);
+	CUT_CHECK_ERROR( "Kernel execution failed");
+	cudaThreadSynchronize ();
+	
+	/*printf ( "HASH: %d (%d)\n", fcuda.pnts, fcuda.numBlocks*fcuda.numThreads );
+	CUDA_SAFE_CALL( cudaMemcpy ( buf, bufHash[0], fcuda.pnts * 2*sizeof(uint), cudaMemcpyDeviceToHost ) );		
+	//for (int n=0; n < fcuda.numBlocks*fcuda.numThreads; n++) {		
+	for (int n=0; n < 100; n++) {
+		printf ( "%d: <%d,%d>\n", n, buf[n*2], buf[n*2+1] );
+	}*/
+	
+	// insertParticles<<< fcuda.gridBlocks, fcuda.gridThreads>>> ( bufPnts, (uint2*) bufHash[0], bufGrid, fcuda.pnts, fcuda.cells );			
+	
+	CUDA_SAFE_CALL( cudaMemset ( bufGrid, NULL_HASH, fcuda.cells * sizeof(uint) ) );
+	
+	insertParticlesRadix<<< fcuda.numBlocks, fcuda.numThreads>>> ( bufPnts, (uint2*) bufHash[0], bufGrid, bufPntSort, fcuda.pnts, fcuda.cells );
+	CUT_CHECK_ERROR( "Kernel execution failed");
+	cudaThreadSynchronize ();	
+    
+    /*printf ( "GRID: %d\n", fcuda.cells );
+	CUDA_SAFE_CALL( cudaMemcpy ( buf, bufGrid, fcuda.cells * sizeof(uint), cudaMemcpyDeviceToHost ) );		
+	*for (int n=0; n < 100; n++) {		
+		printf ( "%d: %d\n", n, buf[n]);
+	}*/
+}
+
+void SPH_ComputePressureCUDA ()
+{
+	computePressure<<< fcuda.numBlocks, fcuda.numThreads>>> ( bufPntSort, bufGrid, (uint2*) bufHash[0], fcuda.pnts );	
+    CUT_CHECK_ERROR( "Kernel execution failed");
+    cudaThreadSynchronize ();	
+}
+
+void SPH_ComputeForceCUDA ()
+{
+	//-- standard force
+	//computeForce<<< fcuda.numBlocks, fcuda.numThreads>>> ( bufPntSort, bufGrid, (uint2*) bufHash[0], fcuda.pnts );	
+	
+	// Force using neighbor table
+	computeForceNbr<<< fcuda.numBlocks, fcuda.numThreads>>> ( bufPntSort, fcuda.pnts );	
+    CUT_CHECK_ERROR( "Kernel execution failed");
+    cudaThreadSynchronize ();	
+}
+
+void SPH_AdvanceCUDA ( float dt, float ss )
+{
+	advanceParticles<<< fcuda.numBlocks, fcuda.numThreads>>> ( bufPntSort, fcuda.pnts, dt, ss );
+    CUT_CHECK_ERROR( "Kernel execution failed");
+    cudaThreadSynchronize ();
+}
+
+}	// extern C
+
+
+
+
+   	//----------- Per frame: Malloc/Free, Host<->Device
+	// transfer point data to device    
+    /*char* pntData;
+	int size = (fcuda.numBlocks*fcuda.numThreads) * stride;
+	cudaMalloc( (void**) &pntData, size);
+	cudaMemcpy( pntData, data, numPoints*stride, cudaMemcpyHostToDevice);  	
+    insertParticles<<< fcuda.numBlocks, fcuda.numThreads >>> ( pntData, stride, numPoints );
+    cudaMemcpy( data, pntData, numPoints*stride, cudaMemcpyDeviceToHost);    
+    cudaFree( pntData );*/
--- a/Extras/sph/fluids/fluid_system_host.cuh
+++ b/Extras/sph/fluids/fluid_system_host.cuh
@@ -0,0 +1,63 @@
+/*
+  FLUIDS v.1 - SPH Fluid Simulator for CPU and GPU
+  Copyright (C) 2008. Rama Hoetzlein, http://www.rchoetzlein.com
+
+  ZLib license
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#include <vector_types.h>	
+#include <driver_types.h>			// for cudaStream_t
+
+typedef unsigned int		uint;		// should be 4-bytes on CUDA
+typedef unsigned char		uchar;		// should be 1-bytes on CUDA
+
+struct FluidParams {
+	int				numThreads, numBlocks;
+	int				gridThreads, gridBlocks;	
+	int				szPnts, szHash, szGrid;
+	int				stride, pnts, cells;
+	int				chk;
+	float			smooth_rad, r2, sim_scale, visc;
+	float3			min, max, res, size, delta;
+	
+	float			pdist, pmass, rest_dens, stiffness;
+	float			poly6kern, spikykern, lapkern;
+
+};
+
+extern "C"
+{
+
+void cudaInit(int argc, char **argv);
+
+void FluidClearCUDA ();
+void FluidSetupCUDA ( int num, int stride, float3 min, float3 max, float3 res, float3 size, int chk );
+void FluidParamCUDA ( float sim_scale, float smooth_rad, float mass, float rest, float stiff, float visc );
+
+void TransferToCUDA ( char* data, int* grid, int numPoints );
+void TransferFromCUDA ( char* data, int* grid, int numPoints );
+
+void Grid_InsertParticlesCUDA ();
+void SPH_ComputePressureCUDA ();
+void SPH_ComputeForceCUDA ();
+void SPH_AdvanceCUDA ( float dt, float ss );
+
+}
+
+
--- a/Extras/sph/fluids/fluid_system_kern.cu
+++ b/Extras/sph/fluids/fluid_system_kern.cu
@@ -0,0 +1,402 @@
+/*
+  FLUIDS v.1 - SPH Fluid Simulator for CPU and GPU
+  Copyright (C) 2008. Rama Hoetzlein, http://www.rchoetzlein.com
+
+  ZLib license
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+
+#ifndef _PARTICLES_KERNEL_H_
+	#define _PARTICLES_KERNEL_H_
+
+	#include <stdio.h>
+	#include <math.h>
+
+	#include "fluid_system_host.cuh"
+
+	#define TOTAL_THREADS		65536
+	#define BLOCK_THREADS		256
+	#define MAX_NBR				80
+	
+	__constant__	FluidParams		simData;		// simulation data (on device)
+	
+	__device__ int				bufNeighbor[ TOTAL_THREADS*MAX_NBR ];
+	__device__ float			bufNdist[ TOTAL_THREADS*MAX_NBR ];	
+
+	#define COLOR(r,g,b)	( (uint((r)*255.0f)<<24) | (uint((g)*255.0f)<<16) | (uint((b)*255.0f)<<8) )
+	#define COLORA(r,g,b,a)	( (uint((r)*255.0f)<<24) | (uint((g)*255.0f)<<16) | (uint((b)*255.0f)<<8) | uint((a)*255.0f) )
+	
+	#define NULL_HASH		333333
+	
+	#define OFFSET_CLR		12
+	#define OFFSET_NEXT		16
+	#define OFFSET_VEL		20
+	#define OFFSET_VEVAL	32
+	#define OFFSET_PRESS	48
+	#define OFFSET_DENS		52
+	#define OFFSET_FORCE	56
+	
+
+	__global__ void hashParticles ( char* bufPnts, uint2* bufHash, int numPnt )
+	{			
+		uint ndx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;	// particle index
+		float3* pos = (float3*) (bufPnts + __mul24(ndx, simData.stride) );
+		int gz = (pos->z - simData.min.z) * simData.delta.z ;
+		int gy = (pos->y - simData.min.y) * simData.delta.y ;
+		int gx = (pos->x - simData.min.x) * simData.delta.x ;
+		if ( ndx >= numPnt || gx < 0 || gz > simData.res.x-1 || gy < 0 || gy > simData.res.y-1 || gz < 0 || gz > simData.res.z-1 ) 
+			bufHash[ndx] = make_uint2( NULL_HASH, ndx );
+		else
+			bufHash[ndx] = make_uint2( __mul24(__mul24(gz, (int) simData.res.y)+gy, (int) simData.res.x) + gx, ndx );		
+		
+		__syncthreads ();
+	}
+	
+	__global__ void insertParticles ( char* bufPnts, uint2* bufHash, int* bufGrid, int numPnt, int numGrid )
+	{
+		uint grid_ndx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;	// grid cell index		
+		
+		bufPnts += OFFSET_NEXT;
+		bufGrid[grid_ndx] = -1;
+		for (int n=0; n < numPnt; n++) {
+			if ( bufHash[n].x == grid_ndx ) {
+				*(int*) (bufPnts + __mul24(bufHash[n].y, simData.stride)) = bufGrid[grid_ndx];
+				bufGrid[grid_ndx] = bufHash[n].y;
+			}
+		}		
+		__syncthreads ();
+	}
+	
+	__global__ void insertParticlesRadix ( char* bufPnts, uint2* bufHash, int* bufGrid, char* bufPntSort, int numPnt, int numGrid )
+	{
+		uint ndx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;		// particle index		
+		
+		uint2 bufHashSort = bufHash[ndx];
+		
+		__shared__ uint sharedHash[257];	
+		sharedHash[threadIdx.x+1] = bufHashSort.x;
+		if ( ndx > 0 && threadIdx.x == 0 ) {
+			volatile uint2 prevData = bufHash[ndx-1];
+			sharedHash[0]  = prevData.x;
+		}
+		__syncthreads ();
+		
+		if ( (ndx == 0 || bufHashSort.x != sharedHash[threadIdx.x]) && bufHashSort.x != NULL_HASH ) {
+			bufGrid [ bufHashSort.x ] = ndx;			
+		}
+		if ( ndx < numPnt ) {
+			char* src = bufPnts + __mul24( bufHashSort.y, simData.stride );
+			char* dest = bufPntSort + __mul24( ndx, simData.stride );
+			
+			*(float3*)(dest)				= *(float3*)(src);
+			*(uint*)  (dest + OFFSET_CLR)	= *(uint*)  (src + OFFSET_CLR);
+			*(float3*)(dest + OFFSET_VEL)	= *(float3*)(src + OFFSET_VEL);
+			*(float3*)(dest + OFFSET_VEVAL)	= *(float3*)(src + OFFSET_VEVAL);				
+			
+			*(float*) (dest + OFFSET_DENS)	= 0.0;
+			*(float*) (dest + OFFSET_PRESS)	= 0.0;				
+			*(float3*) (dest + OFFSET_FORCE)= make_float3(0,0,0);		
+			*(int*)   (dest + OFFSET_NEXT)	= bufHashSort.x;			
+		} 
+		
+		__syncthreads ();
+		
+	}
+	
+	//__shared__ int ncount [ BLOCK_THREADS ];
+	
+	__device__ float contributePressure ( int pndx, float3* p, int qndx, int grid_ndx, char* bufPnts, uint2* bufHash )
+	{			
+		float3* qpos;		
+		float3 dist;
+		float dsq, c, sum;
+		float d = simData.sim_scale;				
+		int nbr = __mul24(pndx, MAX_NBR);
+						
+		sum = 0.0;		
+		for ( ; qndx < simData.pnts; qndx++ ) {
+			
+			if ( bufHash[qndx].x != grid_ndx || qndx == NULL_HASH) break;
+			
+			if ( qndx != pndx ) {
+				qpos = (float3*) ( bufPnts + __mul24(qndx, simData.stride ));	
+					
+				dist.x = ( p->x - qpos->x )*d;		// dist in cm
+				dist.y = ( p->y - qpos->y )*d;
+				dist.z = ( p->z - qpos->z )*d;			
+				dsq = (dist.x*dist.x + dist.y*dist.y + dist.z*dist.z);			
+				if ( dsq < simData.r2 ) {
+					c = simData.r2 - dsq;
+					sum += c * c * c;				
+					if  ( bufNeighbor[nbr] < MAX_NBR ) {
+						bufNeighbor[ nbr+bufNeighbor[nbr] ] = qndx;
+						bufNdist[ nbr+bufNeighbor[nbr] ] = sqrt(dsq);
+						bufNeighbor[nbr]++;
+					}
+				}				
+			}
+			//curr = *(int*) (bufPnts + __mul24(curr, simData.stride) + OFFSET_NEXT);
+		}		
+		return sum;
+	}
+	
+		/*if  ( ncount[threadIdx.x]  < MAX_NBR ) {
+				bufNeighbor [ nbr + ncount[threadIdx.x]  ] = curr;
+				bufNdist [ nbr + ncount[threadIdx.x]  ] = sqrt(dsq);
+				ncount[threadIdx.x]++;
+		}*/	
+		
+	__global__ void computePressure ( char* bufPntSort, int* bufGrid, uint2* bufHash, int numPnt )
+	{
+		uint ndx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;	// particle index		
+
+		//if ( ndx < 1024 ) {
+		
+		float3* pos = (float3*) (bufPntSort + __mul24(ndx, simData.stride));
+
+		// Find 2x2x2 grid cells
+		// - Use registers only, no arrays (local-memory too slow)
+		int3 cell;
+		int gc0, gc1, gc2, gc3, gc4, gc5, gc6, gc7;					
+		float gs = simData.smooth_rad / simData.sim_scale;		
+
+		cell.x = max(0, (int)((-gs + pos->x - simData.min.x) * simData.delta.x));
+		cell.y = max(0, (int)((-gs + pos->y - simData.min.y) * simData.delta.y));
+		cell.z = max(0, (int)((-gs + pos->z - simData.min.z) * simData.delta.z));		
+		gc0 = __mul24(__mul24(cell.z, simData.res.y) + cell.y, simData.res.x) + cell.x;
+		gc1 = gc0 + 1;
+		gc2 = gc0 + simData.res.x;
+		gc3 = gc2 + 1;
+		if ( cell.z+1 < simData.res.z ) {
+			gc4 = gc0 + __mul24(simData.res.x, simData.res.y);
+			gc5 = gc4 + 1;
+			gc6 = gc4 + simData.res.x;
+			gc7 = gc6 + 1;
+		}
+		if ( cell.x+1 >= simData.res.x ) {
+			gc1 = -1; gc3 = -1;
+			gc5 = -1; gc7 = -1;
+		}
+		if ( cell.y+1 >= simData.res.y ) {
+			gc2 = -1; gc3 = -1;
+			gc6 = -1; gc7 = -1;
+		}
+		// Sum Pressure
+		float sum = 0.0;		
+		bufNeighbor[ __mul24(ndx, MAX_NBR) ] = 1;
+		if (gc0 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc0], gc0, bufPntSort, bufHash );
+		if (gc1 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc1], gc1, bufPntSort, bufHash );		
+		if (gc2 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc2], gc2, bufPntSort, bufHash );		
+		if (gc3 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc3], gc3, bufPntSort, bufHash );	
+		if (gc4 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc4], gc4, bufPntSort, bufHash );
+		if (gc5 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc5], gc5, bufPntSort, bufHash );		
+		if (gc6 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc6], gc6, bufPntSort, bufHash );
+		if (gc7 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc7], gc7, bufPntSort, bufHash );
+		
+		// Compute Density & Pressure
+		sum = sum * simData.pmass * simData.poly6kern;
+		if ( sum == 0.0 ) sum = 1.0;
+		*(float*) ((char*)pos + OFFSET_PRESS) = ( sum - simData.rest_dens ) * simData.stiffness;
+		*(float*) ((char*)pos + OFFSET_DENS) = 1.0f / sum;			
+		
+		//}		
+		//__syncthreads ();
+	}
+
+	__device__ void contributeForce ( float3& force, int pndx, float3* p, int qndx, int grid_ndx, char* bufPnts, uint2* bufHash )
+	{
+		float press = *(float*) ((char*)p + OFFSET_PRESS);
+		float dens = *(float*) ((char*)p + OFFSET_DENS);
+		float3 veval = *(float3*) ((char*)p + OFFSET_VEVAL );
+		float3 qeval, dist;				
+		float c, ndistj, dsq;
+		float pterm, dterm, vterm;		
+		float3* qpos;				
+		float d = simData.sim_scale;				
+		
+		vterm = simData.lapkern * simData.visc;		
+						
+		for ( ; qndx < simData.pnts; qndx++ ) {
+			
+			if ( bufHash[qndx].x != grid_ndx || qndx == NULL_HASH) break;
+			
+			if ( qndx != pndx ) {
+				qpos = (float3*) ( bufPnts + __mul24(qndx, simData.stride ));	
+					
+				dist.x = ( p->x - qpos->x )*d;		// dist in cm
+				dist.y = ( p->y - qpos->y )*d;
+				dist.z = ( p->z - qpos->z )*d;			
+				dsq = (dist.x*dist.x + dist.y*dist.y + dist.z*dist.z);			
+				if ( dsq < simData.r2 ) {				
+					ndistj = sqrt(dsq);
+					c = ( simData.smooth_rad - ndistj ); 
+					dist.x = ( p->x - qpos->x )*d;		// dist in cm
+					dist.y = ( p->y - qpos->y )*d;
+					dist.z = ( p->z - qpos->z )*d;			
+					pterm = -0.5f * c * simData.spikykern * ( press + *(float*)((char*)qpos+OFFSET_PRESS) ) / ndistj;
+					dterm = c * dens * *(float*)((char*)qpos+OFFSET_DENS);	
+					qeval = *(float3*)((char*)qpos+OFFSET_VEVAL);
+					force.x += ( pterm * dist.x + vterm * ( qeval.x - veval.x )) * dterm;
+					force.y += ( pterm * dist.y + vterm * ( qeval.y - veval.y )) * dterm;
+					force.z += ( pterm * dist.z + vterm * ( qeval.z - veval.z )) * dterm;							
+				}
+			}
+		}				
+	}
+	
+	
+	
+	__global__ void computeForce ( char* bufPntSort, int* bufGrid, uint2* bufHash, int numPnt )
+	{
+		uint ndx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;	// particle index		
+		
+		//if ( ndx < numPnt ) {
+		
+		float3* pos = (float3*) (bufPntSort + __mul24(ndx, simData.stride));				
+		
+		// Find 2x2x2 grid cells
+		// - Use registers only, no arrays (local-memory too slow)
+		int3 cell;
+		int gc0, gc1, gc2, gc3, gc4, gc5, gc6, gc7;					
+		float gs = simData.smooth_rad / simData.sim_scale;		
+
+		cell.x = max(0, (int)((-gs + pos->x - simData.min.x) * simData.delta.x));
+		cell.y = max(0, (int)((-gs + pos->y - simData.min.y) * simData.delta.y));
+		cell.z = max(0, (int)((-gs + pos->z - simData.min.z) * simData.delta.z));		
+		gc0 = __mul24(__mul24(cell.z, simData.res.y) + cell.y, simData.res.x) + cell.x;
+		gc1 = gc0 + 1;
+		gc2 = gc0 + simData.res.x;
+		gc3 = gc2 + 1;
+		if ( cell.z+1 < simData.res.z ) {
+			gc4 = gc0 + __mul24(simData.res.x, simData.res.y);
+			gc5 = gc4 + 1;
+			gc6 = gc4 + simData.res.x;
+			gc7 = gc6 + 1;
+		}
+		if ( cell.x+1 >= simData.res.x ) {
+			gc1 = -1; gc3 = -1;
+			gc5 = -1; gc7 = -1;
+		}
+		if ( cell.y+1 >= simData.res.y ) {
+			gc2 = -1; gc3 = -1;
+			gc6 = -1; gc7 = -1;
+		}
+		// Sum Pressure
+		float3 force = make_float3(0,0,0);
+		if (gc0 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc0], gc0, bufPntSort, bufHash );
+		if (gc1 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc1], gc1, bufPntSort, bufHash );		
+		if (gc2 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc2], gc2, bufPntSort, bufHash );		
+		if (gc3 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc3], gc3, bufPntSort, bufHash );	
+		if (gc4 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc4], gc4, bufPntSort, bufHash );
+		if (gc5 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc5], gc5, bufPntSort, bufHash );		
+		if (gc6 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc6], gc6, bufPntSort, bufHash );
+		if (gc7 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc7], gc7, bufPntSort, bufHash );
+		
+		// Update Force
+		*(float3*) ((char*)pos + OFFSET_FORCE ) = force;	
+		
+		//}
+		//__syncthreads ();
+	}
+
+	
+	__global__ void computeForceNbr ( char* bufPntSort, int numPnt )
+	{		
+		uint ndx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;	// particle index		
+		
+		if ( ndx < numPnt ) {
+				
+		float3* pos = (float3*) (bufPntSort + __mul24(ndx, simData.stride));			
+		
+		float3* qpos;
+		float press = *(float*) ((char*)pos + OFFSET_PRESS);
+		float dens = *(float*) ((char*)pos + OFFSET_DENS);
+		float3 veval = *(float3*) ((char*)pos + OFFSET_VEVAL );
+		float3 qeval, dist, force;		
+		float d = simData.sim_scale;
+		float c, ndistj;
+		float pterm, dterm, vterm;
+		vterm = simData.lapkern * simData.visc;
+		int nbr = __mul24(ndx, MAX_NBR);
+		
+		int ncnt = bufNeighbor[ nbr ];		
+		
+		force = make_float3(0,0,0);
+		for (int j=1; j < ncnt; j++) {		// base 1, n[0] = count
+			ndistj = bufNdist[ nbr+j ];
+			qpos = (float3*) (bufPntSort + __mul24( bufNeighbor[ nbr+j ], simData.stride) );
+			c = ( simData.smooth_rad - ndistj ); 
+			dist.x = ( pos->x - qpos->x )*d;		// dist in cm
+			dist.y = ( pos->y - qpos->y )*d;
+			dist.z = ( pos->z - qpos->z )*d;			
+			pterm = -0.5f * c * simData.spikykern * ( press + *(float*)((char*)qpos+OFFSET_PRESS) ) / ndistj;
+			dterm = c * dens * *(float*)((char*)qpos+OFFSET_DENS);	
+			qeval = *(float3*)((char*)qpos+OFFSET_VEVAL);
+			force.x += ( pterm * dist.x + vterm * ( qeval.x - veval.x )) * dterm;
+			force.y += ( pterm * dist.y + vterm * ( qeval.y - veval.y )) * dterm;
+			force.z += ( pterm * dist.z + vterm * ( qeval.z - veval.z )) * dterm;			
+		}
+		*(float3*) ((char*)pos + OFFSET_FORCE ) = force;
+		
+		}	
+	
+	}
+		
+	__global__ void advanceParticles ( char* bufPntSort, int numPnt, float dt, float ss )
+	{		
+		uint ndx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;	// particle index		
+		
+		if ( ndx < numPnt ) {
+				
+			// Get particle vars
+			float3* pos = (float3*) (bufPntSort + __mul24(ndx, simData.stride));			
+			float3* vel = (float3*) ((char*)pos + OFFSET_VEL );
+			float3* vel_eval = (float3*) ((char*)pos + OFFSET_VEVAL );
+			float3 accel = *(float3*) ((char*)pos + OFFSET_FORCE );
+			float3 vcurr, vnext;			
+
+			// Leapfrog integration						
+			accel.x *= 0.00020543;			// NOTE - To do: SPH_PMASS should be passed in			
+			accel.y *= 0.00020543;
+			accel.z *= 0.00020543;			
+			accel.z -= 9.8;	
+			
+			vcurr = *vel;
+			vnext.x = accel.x*dt + vcurr.x;	
+			vnext.y = accel.y*dt + vcurr.y;	
+			vnext.z = accel.z*dt + vcurr.z;			// v(t+1/2) = v(t-1/2) + a(t) dt			
+			
+			accel.x = (vcurr.x + vnext.x) * 0.5;		// v(t+1) = [v(t-1/2) + v(t+1/2)] * 0.5		used to compute forces later
+			accel.y = (vcurr.y + vnext.y) * 0.5;		// v(t+1) = [v(t-1/2) + v(t+1/2)] * 0.5		used to compute forces later
+			accel.z = (vcurr.z + vnext.z) * 0.5;		// v(t+1) = [v(t-1/2) + v(t+1/2)] * 0.5		used to compute forces later
+			
+			*vel_eval = accel;			
+			*vel = vnext;
+			
+			dt /= simData.sim_scale;
+			vnext.x = pos->x + vnext.x*dt;
+			vnext.y = pos->y + vnext.y*dt;
+			vnext.z = pos->z + vnext.z*dt;
+			*pos = vnext;						// p(t+1) = p(t) + v(t+1/2) dt			
+		}	
+		
+		__syncthreads ();	
+	}
+
+#endif
--- a/Extras/sph/fluids/fluid_system_kern.cuh
+++ b/Extras/sph/fluids/fluid_system_kern.cuh
@@ -0,0 +1,45 @@
+/*
+  FLUIDS v.1 - SPH Fluid Simulator for CPU and GPU
+  Copyright (C) 2009. Rama Hoetzlein, http://www.rchoetzlein.com
+
+  ZLib license
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef _PARTICLES_KERNEL_H_
+	#define _PARTICLES_KERNEL_H_
+
+	#include <stdio.h>
+	#include <math.h>
+	#include "cutil_math.h"
+	#include "math_constants.h"
+	
+	// Insert particles in grid
+	
+	__global__ void insertParticles ( char* pntData, uint pntStride )
+	{
+		int index = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
+		float4 p = *(float4*) (pntData + index*pntStride);
+
+		// get address in grid
+		int3 gridPos = calcGridPos(p);
+
+		addParticleToCell(gridPos, index, gridCounters, gridCells);
+	}
+
+
+#endif
--- a/Extras/sph/fluids/radixsort.cu
+++ b/Extras/sph/fluids/radixsort.cu
@@ -0,0 +1,79 @@
+/*
+ * Copyright 1993-2006 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO USER:
+ *
+ * This source code is subject to NVIDIA ownership rights under U.S. and
+ * international Copyright laws.
+ *
+ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
+ * CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
+ * IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
+ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+ * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+ * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
+ * OR PERFORMANCE OF THIS SOURCE CODE.
+ *
+ * U.S. Government End Users.  This source code is a "commercial item" as
+ * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting  of
+ * "commercial computer software" and "commercial computer software
+ * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
+ * and is provided to the U.S. Government only as a commercial end item.
+ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
+ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
+ * source code with only those rights set forth herein.
+ */
+
+/* Radixsort project with key/value and arbitrary datset size support
+ * which demonstrates the use of CUDA in a multi phase sorting 
+ * computation.
+ * Host code.
+ */
+
+#include "radixsort.cuh"
+#include "radixsort_kernel.cu"
+
+extern "C"
+{
+
+////////////////////////////////////////////////////////////////////////////////
+//! Perform a radix sort
+//! Sorting performed in place on passed arrays.
+//!
+//! @param pData0       input and output array - data will be sorted
+//! @param pData1       additional array to allow ping pong computation
+//! @param elements     number of elements to sort
+////////////////////////////////////////////////////////////////////////////////
+void RadixSort(KeyValuePair *pData0, KeyValuePair *pData1, uint elements, uint bits)
+{
+    // Round element count to total number of threads for efficiency
+    uint elements_rounded_to_3072;
+    int modval = elements % 3072;
+    if( modval == 0 )
+        elements_rounded_to_3072 = elements;
+    else
+        elements_rounded_to_3072 = elements + (3072 - (modval));
+
+    // Iterate over n bytes of y bit word, using each byte to sort the list in turn
+    for (uint shift = 0; shift < bits; shift += RADIX)
+    {
+        // Perform one round of radix sorting
+
+        // Generate per radix group sums radix counts across a radix group
+        RadixSum<<<NUM_BLOCKS, NUM_THREADS_PER_BLOCK, GRFSIZE>>>(pData0, elements, elements_rounded_to_3072, shift);
+        // Prefix sum in radix groups, and then between groups throughout a block
+        RadixPrefixSum<<<PREFIX_NUM_BLOCKS, PREFIX_NUM_THREADS_PER_BLOCK, PREFIX_GRFSIZE>>>();
+        // Sum the block offsets and then shuffle data into bins
+        RadixAddOffsetsAndShuffle<<<NUM_BLOCKS, NUM_THREADS_PER_BLOCK, SHUFFLE_GRFSIZE>>>(pData0, pData1, elements, elements_rounded_to_3072, shift); 
+
+        // Exchange data pointers
+        KeyValuePair* pTemp = pData0;
+        pData0 = pData1;
+        pData1 = pTemp;
+   }
+}
+
+}
--- a/Extras/sph/fluids/radixsort.cuh
+++ b/Extras/sph/fluids/radixsort.cuh
@@ -0,0 +1,63 @@
+/*
+ * Copyright 1993-2006 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO USER:
+ *
+ * This source code is subject to NVIDIA ownership rights under U.S. and
+ * international Copyright laws.
+ *
+ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
+ * CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
+ * IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
+ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+ * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+ * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
+ * OR PERFORMANCE OF THIS SOURCE CODE.
+ *
+ * U.S. Government End Users.  This source code is a "commercial item" as
+ * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting  of
+ * "commercial computer software" and "commercial computer software
+ * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
+ * and is provided to the U.S. Government only as a commercial end item.
+ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
+ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
+ * source code with only those rights set forth herein.
+ */
+
+/* Radixsort project which demonstrates the use of CUDA in a multi phase
+ * sorting computation.
+ * Type definitions.
+ */
+
+#ifndef _RADIXSORT_H_
+#define _RADIXSORT_H_
+
+#include <host_defines.h>
+
+#define SYNCIT __syncthreads()
+
+// Use 16 bit keys/values
+#define SIXTEEN 0
+
+typedef unsigned int uint;
+typedef unsigned short ushort;
+
+#if SIXTEEN
+typedef struct __align__(4) {
+    ushort key;
+    ushort value;
+#else
+typedef struct __align__(8) {
+    uint key;
+    uint value;
+#endif
+} KeyValuePair;
+
+extern "C" {
+    void RadixSort(KeyValuePair *pData0, KeyValuePair *pData1, uint elements, uint bits);
+}
+
+#endif // #ifndef _RADIXSORT_H_
--- a/Extras/sph/fluids/radixsort_kernel.cu
+++ b/Extras/sph/fluids/radixsort_kernel.cu
@@ -0,0 +1,577 @@
+/*
+ * Copyright 1993-2006 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO USER:   
+ *
+ * This source code is subject to NVIDIA ownership rights under U.S. and 
+ * international Copyright laws.  
+ *
+ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE 
+ * CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR 
+ * IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH 
+ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF 
+ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.   
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, 
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS 
+ * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE 
+ * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE 
+ * OR PERFORMANCE OF THIS SOURCE CODE.  
+ *
+ * U.S. Government End Users.  This source code is a "commercial item" as 
+ * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting  of 
+ * "commercial computer software" and "commercial computer software 
+ * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) 
+ * and is provided to the U.S. Government only as a commercial end item.  
+ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through 
+ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the 
+ * source code with only those rights set forth herein.
+ */
+
+/* Radixsort project with key/value and arbitrary datset size support
+ * which demonstrates the use of CUDA in a multi phase sorting 
+ * computation.
+ * Device code.
+ */
+
+#ifndef _RADIXSORT_KERNEL_H_
+#define _RADIXSORT_KERNEL_H_
+
+#include <stdio.h>
+#include "radixsort.cuh"
+
+#define SYNCIT __syncthreads()
+
+static const int NUM_SMS = 16;
+static const int NUM_THREADS_PER_SM = 192;
+static const int NUM_THREADS_PER_BLOCK = 64;
+//static const int NUM_THREADS = NUM_THREADS_PER_SM * NUM_SMS;
+static const int NUM_BLOCKS = (NUM_THREADS_PER_SM / NUM_THREADS_PER_BLOCK) * NUM_SMS;
+static const int RADIX = 8;                                                        // Number of bits per radix sort pass
+static const int RADICES = 1 << RADIX;                                             // Number of radices
+static const int RADIXMASK = RADICES - 1;                                          // Mask for each radix sort pass
+#if SIXTEEN
+static const int RADIXBITS = 16;                                                   // Number of bits to sort over
+#else
+static const int RADIXBITS = 32;                                                   // Number of bits to sort over
+#endif
+static const int RADIXTHREADS = 16;                                                // Number of threads sharing each radix counter
+static const int RADIXGROUPS = NUM_THREADS_PER_BLOCK / RADIXTHREADS;               // Number of radix groups per CTA
+static const int TOTALRADIXGROUPS = NUM_BLOCKS * RADIXGROUPS;                      // Number of radix groups for each radix
+static const int SORTRADIXGROUPS = TOTALRADIXGROUPS * RADICES;                     // Total radix count
+static const int GRFELEMENTS = (NUM_THREADS_PER_BLOCK / RADIXTHREADS) * RADICES; 
+static const int GRFSIZE = GRFELEMENTS * sizeof(uint); 
+
+// Prefix sum variables
+static const int PREFIX_NUM_THREADS_PER_SM = NUM_THREADS_PER_SM;
+static const int PREFIX_NUM_THREADS_PER_BLOCK = PREFIX_NUM_THREADS_PER_SM;
+static const int PREFIX_NUM_BLOCKS = (PREFIX_NUM_THREADS_PER_SM / PREFIX_NUM_THREADS_PER_BLOCK) * NUM_SMS;
+static const int PREFIX_BLOCKSIZE = SORTRADIXGROUPS / PREFIX_NUM_BLOCKS;
+static const int PREFIX_GRFELEMENTS = PREFIX_BLOCKSIZE + 2 * PREFIX_NUM_THREADS_PER_BLOCK;
+static const int PREFIX_GRFSIZE = PREFIX_GRFELEMENTS * sizeof(uint);
+
+// Shuffle variables
+static const int SHUFFLE_GRFOFFSET = RADIXGROUPS * RADICES;
+static const int SHUFFLE_GRFELEMENTS = SHUFFLE_GRFOFFSET + PREFIX_NUM_BLOCKS; 
+static const int SHUFFLE_GRFSIZE = SHUFFLE_GRFELEMENTS * sizeof(uint); 
+
+
+#define SDATA( index)      CUT_BANK_CHECKER(sdata, index)
+
+// Prefix sum data
+uint gRadixSum[TOTALRADIXGROUPS * RADICES];
+__device__ uint dRadixSum[TOTALRADIXGROUPS * RADICES];
+uint gRadixBlockSum[PREFIX_NUM_BLOCKS];
+__device__ uint dRadixBlockSum[PREFIX_NUM_BLOCKS];
+
+extern __shared__ uint sRadixSum[];
+
+
+
+////////////////////////////////////////////////////////////////////////////////
+//! Perform a radix sum on the list to be sorted.  Each SM holds a set of
+//! radix counters for each group of RADIXGROUPS thread in the GRF. 
+//!
+//! @param pData     input data
+//! @param elements  total number of elements
+//! @param elements_rounded_to_3072  total number of elements rounded up to the 
+//!                                  nearest multiple of 3072
+//! @param shift     the shift (0 to 24) that we are using to obtain the correct 
+//!                  byte
+////////////////////////////////////////////////////////////////////////////////
+__global__ void RadixSum(KeyValuePair *pData, uint elements, uint elements_rounded_to_3072, uint shift)
+{
+    uint pos    = threadIdx.x;
+
+    // Zero radix counts
+    while (pos < GRFELEMENTS)
+    {
+        sRadixSum[pos] = 0;
+        pos += NUM_THREADS_PER_BLOCK;
+    }
+
+    // Sum up data
+    // Source addresses computed so that each thread is reading from a block of 
+    // consecutive addresses so there are no conflicts between threads
+    // They then loop over their combined region and the next batch works elsewhere.
+    // So threads 0 to 16 work on memory 0 to 320.
+    // First reading 0,1,2,3...15 then 16,17,18,19...31 and so on
+    // optimising parallel access to shared memory by a thread accessing 16*threadID
+    // The next radix group runs from 320 to 640 and the same applies in that region
+    uint tmod   =   threadIdx.x % RADIXTHREADS;
+    uint tpos   =   threadIdx.x / RADIXTHREADS;
+
+    // Take the rounded element list size so that all threads have a certain size dataset to work with
+    // and no zero size datasets confusing the issue
+    // By using a multiple of 3072 we ensure that all threads have elements
+    // to work with until the last phase, at which point we individually test
+    uint element_fraction  =   elements_rounded_to_3072 / TOTALRADIXGROUPS;
+
+    // Generate range 
+    // Note that it is possible for both pos and end to be past the end of the element set
+    // which will be caught later.
+    pos       = (blockIdx.x * RADIXGROUPS + tpos) * element_fraction;
+    uint end  = pos + element_fraction;
+    pos      += tmod; 
+    //printf("pos: %d\n", pos);
+    __syncthreads();
+
+    while (pos < end )
+    {
+        uint key = 0;
+
+        // Read first data element if we are in the set of elements
+        //if( pos < elements )
+            //key = pData[pos].key;
+        KeyValuePair kvp;
+        // Read first data element, both items at once as the memory will want to coalesce like that anyway
+        if (pos < elements)
+            kvp = pData[pos];
+        else
+            kvp.key = 0;
+        key = kvp.key;
+
+
+        // Calculate position of radix counter to increment
+        // There are RADICES radices in each pass (256)
+        // and hence this many counters for bin grouping
+        // Multiply by RADIXGROUPS (4) to spread through memory
+        // and into 4 radix groups
+        uint p = ((key >> shift) & RADIXMASK) * RADIXGROUPS;
+       
+        // Increment radix counters
+        // Each radix group has its own set of counters
+        // so we add the thread position [0-3], ie the group index.
+        // We slow down here and take at least 16 cycles to write to the summation boxes
+        // but other groups will only conflict with themselves and so can also be writing
+        // 16 cycles here at least avoids retries.
+        uint ppos = p + tpos;
+
+        // If we are past the last element we don't want to do anything
+        // We do have to check each time, however, to ensure that all
+        // threads sync on each sync here.
+        if (tmod == 0 && pos < elements)
+            sRadixSum[ppos]++;
+            SYNCIT;
+        if (tmod == 1 && pos < elements)
+            sRadixSum[ppos]++;  
+            SYNCIT;          
+        if (tmod == 2 && pos < elements)
+            sRadixSum[ppos]++;   
+            SYNCIT; 
+        if (tmod == 3 && pos < elements)
+            sRadixSum[ppos]++; 
+            SYNCIT;  
+        if (tmod == 4 && pos < elements)
+            sRadixSum[ppos]++;
+            SYNCIT;
+        if (tmod == 5 && pos < elements)
+            sRadixSum[ppos]++; 
+            SYNCIT;           
+        if (tmod == 6 && pos < elements)
+            sRadixSum[ppos]++;
+            SYNCIT;   
+        if (tmod == 7 && pos < elements)
+            sRadixSum[ppos]++;  
+            SYNCIT;   
+        if (tmod == 8 && pos < elements)
+            sRadixSum[ppos]++;
+            SYNCIT;
+        if (tmod == 9 && pos < elements)
+            sRadixSum[ppos]++; 
+            SYNCIT;           
+        if (tmod == 10 && pos < elements)
+            sRadixSum[ppos]++;
+            SYNCIT;    
+        if (tmod == 11 && pos < elements)
+            sRadixSum[ppos]++; 
+            SYNCIT;     
+        if (tmod == 12 && pos < elements)
+            sRadixSum[ppos]++;
+            SYNCIT;
+        if (tmod == 13 && pos < elements)
+            sRadixSum[ppos]++; 
+            SYNCIT;           
+        if (tmod == 14 && pos < elements)
+            sRadixSum[ppos]++; 
+            SYNCIT;   
+        if (tmod == 15 && pos < elements)
+            sRadixSum[ppos]++; 
+            SYNCIT;   
+           
+        pos += RADIXTHREADS;
+                
+    }
+
+    __syncthreads();
+
+    __syncthreads();
+
+    // Output radix sums into separate memory regions for each radix group
+    // So this memory then is layed out:
+    // 0...... 192..... 384 ................ 192*256
+    // ie all 256 bins for each radix group
+    // in there:
+    // 0.............192
+    // 0  4  8  12...     - block idx * 4
+    // And in the block boxes we see the 4 radix groups for that block
+    // So 0-192 should contain bin 0 for each radix group, and so on
+    uint offset = blockIdx.x * RADIXGROUPS;
+    uint row    = threadIdx.x / RADIXGROUPS;
+    uint column = threadIdx.x % RADIXGROUPS;
+    while (row < RADICES)
+    {
+        dRadixSum[offset + row * TOTALRADIXGROUPS + column] = sRadixSum[row * RADIXGROUPS + column];
+        row += NUM_THREADS_PER_BLOCK / RADIXGROUPS;
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Performs first part of parallel prefix sum - individual sums of each radix
+//! count. By the end of this we have prefix sums on a block level in dRadixSum
+//! and totals for blocks in dRadixBlockSum.
+////////////////////////////////////////////////////////////////////////////////
+__global__ void RadixPrefixSum()
+{
+    // Read radix groups in offset by one in the GRF so a zero can be inserted at the beginning
+    // and the final sum of all radix counts summed here is tacked onto the end for reading by
+    // the next stage
+    // Each block in this case is the full number of threads per SM (and hence the total number 
+    // of radix groups), 192. We should then have the total set of offsets for an entire radix 
+    // group by the end of this stage
+    // Device mem addressing
+    
+    uint brow       = blockIdx.x * (RADICES / PREFIX_NUM_BLOCKS);
+    uint drow       = threadIdx.x / TOTALRADIXGROUPS; // In default parameterisation this is always 0
+    uint dcolumn    = threadIdx.x % TOTALRADIXGROUPS; // And similarly this is always the same as threadIdx.x   
+    uint dpos       = (brow + drow) * TOTALRADIXGROUPS + dcolumn;
+    uint end        = ((blockIdx.x + 1) * (RADICES / PREFIX_NUM_BLOCKS)) * TOTALRADIXGROUPS;
+    // Shared mem addressing
+    uint srow       = threadIdx.x / (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK);
+    uint scolumn    = threadIdx.x % (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK);
+    uint spos       = srow * (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1) + scolumn;
+
+    // Read (RADICES / PREFIX_NUM_BLOCKS) radix counts into the GRF alongside each other
+    while (dpos < end)
+    {
+        sRadixSum[spos] = dRadixSum[dpos];
+        spos += (PREFIX_NUM_THREADS_PER_BLOCK / (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK)) * 
+                (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1);
+        dpos += (TOTALRADIXGROUPS / PREFIX_NUM_THREADS_PER_BLOCK) * TOTALRADIXGROUPS;
+    }
+    __syncthreads();
+       
+    // Perform preliminary sum on each thread's stretch of data
+    // Each thread having a block of 16, with spacers between 0...16 18...33 and so on
+    int pos     = threadIdx.x * (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1);
+    end         = pos + (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK);
+    uint sum    = 0;
+    while (pos < end)
+    {
+        sum += sRadixSum[pos];
+        sRadixSum[pos] = sum;
+        pos++;  
+    }
+    __syncthreads();   
+  
+ 
+    // Calculate internal offsets by performing a more traditional parallel
+    // prefix sum of the topmost member of each thread's work data.  Right now,
+    // these are stored between the work data for each thread, allowing us to 
+    // eliminate GRF conflicts as well as hold the offsets needed to complete the sum
+    // In other words we have:
+    // 0....15 16 17....32 33 34....
+    // Where this first stage updates the intermediate values (so 16=15, 33=32 etc)
+    int m           = (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1);
+    pos             = threadIdx.x  * (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1) +
+                      (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK);
+    sRadixSum[pos]  = sRadixSum[pos - 1];
+    __syncthreads();
+    // This stage then performs a parallel prefix sum (ie use powers of 2 to propagate in log n stages)
+    // to update 17, 34 etc with the totals to that point (so 34 becomes [34] + [17]) and so on.
+    while (m < PREFIX_NUM_THREADS_PER_BLOCK * (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1))
+    {
+        int p  = pos - m;
+        uint t = ((p > 0) ? sRadixSum[p] : 0);
+        __syncthreads();
+        sRadixSum[pos] += t;
+        __syncthreads();
+        m *= 2;
+    } 
+    __syncthreads();
+
+  
+  
+    // Add internal offsets to each thread's work data.
+    // So now we take 17 and add it to all values 18 to 33 so all offsets for that block 
+    // are updated.
+    pos     = threadIdx.x * (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1);
+    end     = pos + (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK);
+    int p   = pos - 1;
+    sum     = ((p > 0) ? sRadixSum[p] : 0);
+    while (pos < end)
+    {
+        sRadixSum[pos] += sum;
+        pos++; 
+    }
+    __syncthreads();        
+               
+    // Write summed data back out to global memory in the same way as we read it in
+    // We now have prefix sum values internal to groups
+    brow       = blockIdx.x * (RADICES / PREFIX_NUM_BLOCKS);
+    drow       = threadIdx.x / TOTALRADIXGROUPS;
+    dcolumn    = threadIdx.x % TOTALRADIXGROUPS;    
+    srow       = threadIdx.x / (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK);
+    scolumn    = threadIdx.x % (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK);
+    dpos       = (brow + drow) * TOTALRADIXGROUPS + dcolumn + 1;
+    spos       = srow * (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1) + scolumn;
+    end        = ((blockIdx.x + 1) * RADICES / PREFIX_NUM_BLOCKS) * TOTALRADIXGROUPS;
+    while (dpos < end)
+    {
+        dRadixSum[dpos] = sRadixSum[spos];
+        dpos += (TOTALRADIXGROUPS / PREFIX_NUM_THREADS_PER_BLOCK) * TOTALRADIXGROUPS;        
+        spos += (PREFIX_NUM_THREADS_PER_BLOCK / (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK)) * 
+                (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1);
+    }
+
+    // Write last element to summation
+    // Storing block sums in a separate array
+    if (threadIdx.x == 0) {
+        dRadixBlockSum[blockIdx.x] = sRadixSum[PREFIX_NUM_THREADS_PER_BLOCK * (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1) - 1];
+        dRadixSum[blockIdx.x * PREFIX_BLOCKSIZE] = 0;
+    }
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+//! Initially perform prefix sum of block totals to obtain final set of offsets.
+//! Then make use of radix sums to perform a shuffling of the data into the 
+//! correct bins.
+//!
+//! @param pSrc      input data
+//! @param pDst      output data
+//! @param elements  total number of elements
+//! @param shift     the shift (0 to 24) that we are using to obtain the correct 
+//!                  byte
+////////////////////////////////////////////////////////////////////////////////
+__global__ void RadixAddOffsetsAndShuffle(KeyValuePair* pSrc, KeyValuePair* pDst, uint elements, uint elements_rounded_to_3072, int shift)
+{
+    // Read offsets from previous blocks
+    if (threadIdx.x == 0)
+        sRadixSum[SHUFFLE_GRFOFFSET] = 0;
+        
+    if (threadIdx.x < PREFIX_NUM_BLOCKS - 1)
+        sRadixSum[SHUFFLE_GRFOFFSET + threadIdx.x + 1] = dRadixBlockSum[threadIdx.x];
+    __syncthreads();
+    
+    // Parallel prefix sum over block sums
+    int pos = threadIdx.x;
+    int n = 1;
+    while (n < PREFIX_NUM_BLOCKS)
+    {
+        int ppos = pos - n;
+        uint t0 = ((pos < PREFIX_NUM_BLOCKS) && (ppos >= 0)) ? sRadixSum[SHUFFLE_GRFOFFSET + ppos] : 0;
+        __syncthreads();
+        if (pos < PREFIX_NUM_BLOCKS)
+            sRadixSum[SHUFFLE_GRFOFFSET + pos] += t0;
+        __syncthreads(); 
+        n *= 2;
+    }
+
+    // Read radix count data and add appropriate block offset
+    // for each radix at the memory location for this thread
+    // (where the other threads in the block will be reading
+    // as well, hence the large stride).
+    // There is one counter box per radix group per radix 
+    // per block (4*256*3)
+    // We use 64 threads to read the 4 radix groups set of radices 
+    // for the block. 
+    int row    = threadIdx.x / RADIXGROUPS;
+    int column = threadIdx.x % RADIXGROUPS;
+    int spos   = row * RADIXGROUPS + column;
+    int dpos   = row * TOTALRADIXGROUPS + column + blockIdx.x * RADIXGROUPS;
+    while (spos < SHUFFLE_GRFOFFSET)
+    {
+        sRadixSum[spos] = dRadixSum[dpos] + sRadixSum[SHUFFLE_GRFOFFSET + dpos / (TOTALRADIXGROUPS * RADICES / PREFIX_NUM_BLOCKS)];
+        spos += NUM_THREADS_PER_BLOCK;
+        dpos += (NUM_THREADS_PER_BLOCK / RADIXGROUPS) * TOTALRADIXGROUPS;
+    }
+    __syncthreads();
+
+    //int pos;
+    // Shuffle data
+    // Each of the subbins for a block should be filled via the counters, properly interleaved
+    // Then, as we now iterate over each data value, we increment the subbins (each thread in the 
+    // radix group in turn to avoid miss writes due to conflicts) and set locations correctly.
+    uint element_fraction  =   elements_rounded_to_3072 / TOTALRADIXGROUPS;
+    int tmod   =   threadIdx.x % RADIXTHREADS;
+    int tpos   =   threadIdx.x / RADIXTHREADS;
+
+    pos       = (blockIdx.x * RADIXGROUPS + tpos) * element_fraction;
+    uint end  = pos + element_fraction; //(blockIdx.x * RADIXGROUPS + tpos + 1) * element_fraction;
+    pos      += tmod; 
+
+    __syncthreads();
+
+    while (pos < end )
+    {
+        KeyValuePair kvp;
+#if 1 // old load
+        // Read first data element, both items at once as the memory will want to coalesce like that anyway
+        if (pos < elements)
+        {
+            kvp = pSrc[pos];
+        }
+        else
+            kvp.key = 0;
+
+#else // casting to float2 to get it to combine loads
+        int2 kvpf2;
+
+        // Read first data element, both items at once as the memory will want to coalesce like that anyway
+        if (pos < elements)
+        {
+  //          kvp = pSrc[pos];
+            kvpf2 = ((int2*)pSrc)[pos];
+           // printf("kvp: %f %f  kvpf2: %f %f\n", kvp.key, kvp.value, kvpf2.x, kvpf2.y);
+        }
+        else
+            //kvp.key = 0;
+            kvpf2.x = 0;
+ 
+        kvp.key = kvpf2.x;
+        kvp.value = kvpf2.y;
+#endif  
+
+        uint index;
+                
+        // Calculate position of radix counter to increment
+        uint p = ((kvp.key >> shift) & RADIXMASK) * RADIXGROUPS;
+                
+        // Move data, keeping counts updated.
+        // Increment radix counters, relying on hexadecathread
+        // warp to prevent this code from stepping all over itself.
+        uint ppos = p + tpos;
+        if (tmod == 0 && pos < elements)
+        {
+            index = sRadixSum[ppos]++;
+            pDst[index] = kvp;
+        }
+            SYNCIT;
+        if (tmod == 1 && pos < elements)
+        {
+            index = sRadixSum[ppos]++;
+            pDst[index] = kvp;
+        }
+            SYNCIT;     
+        if (tmod == 2 && pos < elements)
+        {
+            index = sRadixSum[ppos]++;
+            pDst[index] = kvp;
+        }
+            SYNCIT;
+        if (tmod == 3 && pos < elements)
+        {
+            index = sRadixSum[ppos]++;
+            pDst[index] = kvp;
+        }
+            SYNCIT;
+        if (tmod == 4 && pos < elements)
+        {
+            index = sRadixSum[ppos]++;
+            pDst[index] = kvp;
+        }
+            SYNCIT;
+        if (tmod == 5 && pos < elements)
+        {
+            index = sRadixSum[ppos]++;
+            pDst[index] = kvp;
+        }
+            SYNCIT; 
+        if (tmod == 6 && pos < elements)
+        {
+            index = sRadixSum[ppos]++;
+            pDst[index] = kvp;
+        }
+            SYNCIT;
+        if (tmod == 7 && pos < elements)
+        {
+            index = sRadixSum[ppos]++;
+            pDst[index] = kvp;
+        }
+            SYNCIT; 
+        if (tmod == 8 && pos < elements)
+        {
+            index = sRadixSum[ppos]++;
+            pDst[index] = kvp;
+        }
+            SYNCIT;
+        if (tmod == 9 && pos < elements)
+        {
+            index = sRadixSum[ppos]++;
+            pDst[index] = kvp;
+        }
+            SYNCIT;          
+        if (tmod == 10 && pos < elements)
+        {
+            index = sRadixSum[ppos]++;
+            pDst[index] = kvp;
+        }
+            SYNCIT;
+        if (tmod == 11 && pos < elements)
+        {
+            index = sRadixSum[ppos]++;
+            pDst[index] = kvp;
+        }
+            SYNCIT;
+        if (tmod == 12 && pos < elements)
+        {
+            index = sRadixSum[ppos]++;
+            pDst[index] = kvp;
+        }
+            SYNCIT;
+        if (tmod == 13 && pos < elements)
+        {
+            index = sRadixSum[ppos]++;
+            pDst[index] = kvp;
+        }
+            SYNCIT;           
+        if (tmod == 14 && pos < elements)
+        {
+            index = sRadixSum[ppos]++;
+            pDst[index] = kvp;
+        }
+            SYNCIT;
+        if (tmod == 15 && pos < elements)
+        {
+            index = sRadixSum[ppos]++;
+            pDst[index] = kvp;
+        }
+            SYNCIT;                         
+
+        pos += RADIXTHREADS;
+    }
+
+    __syncthreads();
+}
+
+#endif // #ifndef _RADIXSORT_KERNEL_H_