fix: some files didn't have the svn:eol-style native property yet

2010-03-06 15:23:36 +00:00
parent 4fd48ac691
commit 81f04a4d48
641 changed files with 301123 additions and 301123 deletions
--- a/Extras/sph/fluids/fluid.h
+++ b/Extras/sph/fluids/fluid.h
@@ -1,44 +1,44 @@
-/*
-  FLUIDS v.1 - SPH Fluid Simulator for CPU and GPU
-  Copyright (C) 2008. Rama Hoetzlein, http://www.rchoetzlein.com
-
-  ZLib license
-  This software is provided 'as-is', without any express or implied
-  warranty.  In no event will the authors be held liable for any damages
-  arising from the use of this software.
-
-  Permission is granted to anyone to use this software for any purpose,
-  including commercial applications, and to alter it and redistribute it
-  freely, subject to the following restrictions:
-
-  1. The origin of this software must not be misrepresented; you must not
-     claim that you wrote the original software. If you use this software
-     in a product, an acknowledgment in the product documentation would be
-     appreciated but is not required.
-  2. Altered source versions must be plainly marked as such, and must not be
-     misrepresented as being the original software.
-  3. This notice may not be removed or altered from any source distribution.
-*/
-
-#ifndef DEF_FLUID
-	#define DEF_FLUID
-	
-	#include "vector.h"
-
-	#include "common_defs.h"
-
-	struct Fluid {
-	public:
-		Vector3DF		pos;			// Basic particle (must match Particle class)
-		DWORD			clr;
-		int				next;
-		Vector3DF		vel;			
-		Vector3DF		vel_eval;		
-		unsigned short	age;
-
-		float			pressure;		// Smoothed Particle Hydrodynamics
-		float			density;	
-		Vector3DF		sph_force;
-	};
-
-#endif /*PARTICLE_H_*/
+/*
+  FLUIDS v.1 - SPH Fluid Simulator for CPU and GPU
+  Copyright (C) 2008. Rama Hoetzlein, http://www.rchoetzlein.com
+
+  ZLib license
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef DEF_FLUID
+	#define DEF_FLUID
+	
+	#include "vector.h"
+
+	#include "common_defs.h"
+
+	struct Fluid {
+	public:
+		Vector3DF		pos;			// Basic particle (must match Particle class)
+		DWORD			clr;
+		int				next;
+		Vector3DF		vel;			
+		Vector3DF		vel_eval;		
+		unsigned short	age;
+
+		float			pressure;		// Smoothed Particle Hydrodynamics
+		float			density;	
+		Vector3DF		sph_force;
+	};
+
+#endif /*PARTICLE_H_*/
--- a/Extras/sph/fluids/fluid_system.cpp
+++ b/Extras/sph/fluids/fluid_system.cpp
--- a/Extras/sph/fluids/fluid_system.cu
+++ b/Extras/sph/fluids/fluid_system.cu
@@ -1,71 +1,71 @@
-/*
-  FLUIDS v.1 - SPH Fluid Simulator for CPU and GPU
-  Copyright (C) 2009. Rama Hoetzlein, http://www.rchoetzlein.com
-
-  ZLib license
-  This software is provided 'as-is', without any express or implied
-  warranty.  In no event will the authors be held liable for any damages
-  arising from the use of this software.
-
-  Permission is granted to anyone to use this software for any purpose,
-  including commercial applications, and to alter it and redistribute it
-  freely, subject to the following restrictions:
-
-  1. The origin of this software must not be misrepresented; you must not
-     claim that you wrote the original software. If you use this software
-     in a product, an acknowledgment in the product documentation would be
-     appreciated but is not required.
-  2. Altered source versions must be plainly marked as such, and must not be
-     misrepresented as being the original software.
-  3. This notice may not be removed or altered from any source distribution.
-*/
-
-#include <cutil.h>
-#include <cstdlib>
-#include <cstdio>
-#include <string.h>
-
-#if defined(__APPLE__) || defined(MACOSX)
-	#include <GLUT/glut.h>
-#else
-	#include <GL/glut.h>
-#endif
-#include <cuda_gl_interop.h>
-
-#include "fluid_system_kern.cu"
-
-extern "C"
-{
-
-// Compute number of blocks to create
-int iDivUp (int a, int b) {
-    return (a % b != 0) ? (a / b + 1) : (a / b);
-}
-void computeNumBlocks (int numPnts, int minThreads, int &numBlocks, int &numThreads)
-{
-    numThreads = min( minThreads, numPnts );
-    numBlocks = iDivUp ( numPnts, numThreads );
-}
-
-
-void Grid_InsertParticlesCUDA ( uchar* data, uint stride, uint numPoints )
-{
-    int numThreads, numBlocks;
-    computeNumBlocks (numPoints, 256, numBlocks, numThreads);
-
-	// transfer point data to device
-    char* pntData;
-	size = numPoints * stride;
-	cudaMalloc( (void**) &pntData, size);
-	cudaMemcpy( pntData, data, size, cudaMemcpyHostToDevice);    
-
-    // execute the kernel
-    insertParticles<<< numBlocks, numThreads >>> ( pntData, stride );
-    
-    // transfer data back to host
-    cudaMemcpy( data, pntData, cudaMemcpyDeviceToHost);
-    
-    // check if kernel invocation generated an error
-    CUT_CHECK_ERROR("Kernel execution failed");
-    CUDA_SAFE_CALL(cudaGLUnmapBufferObject(vboPos));
+/*
+  FLUIDS v.1 - SPH Fluid Simulator for CPU and GPU
+  Copyright (C) 2009. Rama Hoetzlein, http://www.rchoetzlein.com
+
+  ZLib license
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include <cutil.h>
+#include <cstdlib>
+#include <cstdio>
+#include <string.h>
+
+#if defined(__APPLE__) || defined(MACOSX)
+	#include <GLUT/glut.h>
+#else
+	#include <GL/glut.h>
+#endif
+#include <cuda_gl_interop.h>
+
+#include "fluid_system_kern.cu"
+
+extern "C"
+{
+
+// Compute number of blocks to create
+int iDivUp (int a, int b) {
+    return (a % b != 0) ? (a / b + 1) : (a / b);
+}
+void computeNumBlocks (int numPnts, int minThreads, int &numBlocks, int &numThreads)
+{
+    numThreads = min( minThreads, numPnts );
+    numBlocks = iDivUp ( numPnts, numThreads );
+}
+
+
+void Grid_InsertParticlesCUDA ( uchar* data, uint stride, uint numPoints )
+{
+    int numThreads, numBlocks;
+    computeNumBlocks (numPoints, 256, numBlocks, numThreads);
+
+	// transfer point data to device
+    char* pntData;
+	size = numPoints * stride;
+	cudaMalloc( (void**) &pntData, size);
+	cudaMemcpy( pntData, data, size, cudaMemcpyHostToDevice);    
+
+    // execute the kernel
+    insertParticles<<< numBlocks, numThreads >>> ( pntData, stride );
+    
+    // transfer data back to host
+    cudaMemcpy( data, pntData, cudaMemcpyDeviceToHost);
+    
+    // check if kernel invocation generated an error
+    CUT_CHECK_ERROR("Kernel execution failed");
+    CUDA_SAFE_CALL(cudaGLUnmapBufferObject(vboPos));
 }
--- a/Extras/sph/fluids/fluid_system.h
+++ b/Extras/sph/fluids/fluid_system.h
@@ -1,106 +1,106 @@
-/*
-  FLUIDS v.1 - SPH Fluid Simulator for CPU and GPU
-  Copyright (C) 2008. Rama Hoetzlein, http://www.rchoetzlein.com
-
-  ZLib license
-  This software is provided 'as-is', without any express or implied
-  warranty.  In no event will the authors be held liable for any damages
-  arising from the use of this software.
-
-  Permission is granted to anyone to use this software for any purpose,
-  including commercial applications, and to alter it and redistribute it
-  freely, subject to the following restrictions:
-
-  1. The origin of this software must not be misrepresented; you must not
-     claim that you wrote the original software. If you use this software
-     in a product, an acknowledgment in the product documentation would be
-     appreciated but is not required.
-  2. Altered source versions must be plainly marked as such, and must not be
-     misrepresented as being the original software.
-  3. This notice may not be removed or altered from any source distribution.
-*/
-
-
-#ifndef DEF_FLUID_SYS
-	#define DEF_FLUID_SYS
-
-	#include <iostream>
-	#include <vector>
-	#include <stdio.h>
-	#include <stdlib.h>
-	#include <math.h>
-
-	#include "point_set.h"
-	#include "fluid.h"
-	
-	// Scalar params
-	#define SPH_SIMSIZE			4
-	#define SPH_SIMSCALE		5
-	#define SPH_VISC			6
-	#define SPH_RESTDENSITY		7
-	#define SPH_PMASS			8
-	#define SPH_PRADIUS			9
-	#define SPH_PDIST			10
-	#define SPH_SMOOTHRADIUS	11
-	#define SPH_INTSTIFF		12
-	#define SPH_EXTSTIFF		13
-	#define SPH_EXTDAMP			14
-	#define SPH_LIMIT			15
-	#define BOUND_ZMIN_SLOPE	16
-	#define FORCE_XMAX_SIN		17
-	#define FORCE_XMIN_SIN		18
-	#define MAX_FRAC			19
-	#define CLR_MODE			20
-
-	// Vector params
-	#define SPH_VOLMIN			7
-	#define SPH_VOLMAX			8
-	#define SPH_INITMIN			9
-	#define SPH_INITMAX			10
-
-	// Toggles
-	#define	SPH_GRID			0
-	#define SPH_DEBUG			1
-	#define WRAP_X				2
-	#define WALL_BARRIER		3
-	#define LEVY_BARRIER		4
-	#define DRAIN_BARRIER		5
-	#define USE_CUDA			6
-	
-	#define MAX_PARAM			21
-	#define BFLUID				2
-
-	class FluidSystem : public PointSet {
-	public:
-		FluidSystem ();
-
-		// Basic Particle System
-		virtual void Initialize ( int mode, int nmax );
-		virtual void Reset ( int nmax );
-		virtual void Run ();
-		virtual void Advance ();
-		virtual int AddPoint ();		
-		virtual int AddPointReuse ();
-		Fluid* AddFluid ()			{ return (Fluid*) GetElem(0, AddPointReuse()); }
-		Fluid* GetFluid (int n)		{ return (Fluid*) GetElem(0, n); }
-		
-		// Smoothed Particle Hydrodynamics
-		void SPH_Setup ();
-		void SPH_CreateExample ( int n, int nmax );
-		void SPH_DrawDomain ();
-		void SPH_ComputeKernels ();
-
-		void SPH_ComputePressureSlow ();			// O(n^2)
-		void SPH_ComputePressureGrid ();			// O(kn) - spatial grid
-		
-		void SPH_ComputeForceSlow ();				// O(n^2)
-		void SPH_ComputeForceGrid ();				// O(kn) - spatial grid
-		void SPH_ComputeForceGridNC ();				// O(cn) - neighbor table		
-		
-	private:
-
-		// Smoothed Particle Hydrodynamics
-		double						m_R2, m_Poly6Kern, m_LapKern, m_SpikyKern;		// Kernel functions
-	};
-
-#endif
+/*
+  FLUIDS v.1 - SPH Fluid Simulator for CPU and GPU
+  Copyright (C) 2008. Rama Hoetzlein, http://www.rchoetzlein.com
+
+  ZLib license
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#ifndef DEF_FLUID_SYS
+	#define DEF_FLUID_SYS
+
+	#include <iostream>
+	#include <vector>
+	#include <stdio.h>
+	#include <stdlib.h>
+	#include <math.h>
+
+	#include "point_set.h"
+	#include "fluid.h"
+	
+	// Scalar params
+	#define SPH_SIMSIZE			4
+	#define SPH_SIMSCALE		5
+	#define SPH_VISC			6
+	#define SPH_RESTDENSITY		7
+	#define SPH_PMASS			8
+	#define SPH_PRADIUS			9
+	#define SPH_PDIST			10
+	#define SPH_SMOOTHRADIUS	11
+	#define SPH_INTSTIFF		12
+	#define SPH_EXTSTIFF		13
+	#define SPH_EXTDAMP			14
+	#define SPH_LIMIT			15
+	#define BOUND_ZMIN_SLOPE	16
+	#define FORCE_XMAX_SIN		17
+	#define FORCE_XMIN_SIN		18
+	#define MAX_FRAC			19
+	#define CLR_MODE			20
+
+	// Vector params
+	#define SPH_VOLMIN			7
+	#define SPH_VOLMAX			8
+	#define SPH_INITMIN			9
+	#define SPH_INITMAX			10
+
+	// Toggles
+	#define	SPH_GRID			0
+	#define SPH_DEBUG			1
+	#define WRAP_X				2
+	#define WALL_BARRIER		3
+	#define LEVY_BARRIER		4
+	#define DRAIN_BARRIER		5
+	#define USE_CUDA			6
+	
+	#define MAX_PARAM			21
+	#define BFLUID				2
+
+	class FluidSystem : public PointSet {
+	public:
+		FluidSystem ();
+
+		// Basic Particle System
+		virtual void Initialize ( int mode, int nmax );
+		virtual void Reset ( int nmax );
+		virtual void Run ();
+		virtual void Advance ();
+		virtual int AddPoint ();		
+		virtual int AddPointReuse ();
+		Fluid* AddFluid ()			{ return (Fluid*) GetElem(0, AddPointReuse()); }
+		Fluid* GetFluid (int n)		{ return (Fluid*) GetElem(0, n); }
+		
+		// Smoothed Particle Hydrodynamics
+		void SPH_Setup ();
+		void SPH_CreateExample ( int n, int nmax );
+		void SPH_DrawDomain ();
+		void SPH_ComputeKernels ();
+
+		void SPH_ComputePressureSlow ();			// O(n^2)
+		void SPH_ComputePressureGrid ();			// O(kn) - spatial grid
+		
+		void SPH_ComputeForceSlow ();				// O(n^2)
+		void SPH_ComputeForceGrid ();				// O(kn) - spatial grid
+		void SPH_ComputeForceGridNC ();				// O(cn) - neighbor table		
+		
+	private:
+
+		// Smoothed Particle Hydrodynamics
+		double						m_R2, m_Poly6Kern, m_LapKern, m_SpikyKern;		// Kernel functions
+	};
+
+#endif
--- a/Extras/sph/fluids/fluid_system_host.cu
+++ b/Extras/sph/fluids/fluid_system_host.cu
@@ -1,250 +1,250 @@
-/*
-  FLUIDS v.1 - SPH Fluid Simulator for CPU and GPU
-  Copyright (C) 2008. Rama Hoetzlein, http://www.rchoetzlein.com
-
-  ZLib license
-  This software is provided 'as-is', without any express or implied
-  warranty.  In no event will the authors be held liable for any damages
-  arising from the use of this software.
-
-  Permission is granted to anyone to use this software for any purpose,
-  including commercial applications, and to alter it and redistribute it
-  freely, subject to the following restrictions:
-
-  1. The origin of this software must not be misrepresented; you must not
-     claim that you wrote the original software. If you use this software
-     in a product, an acknowledgment in the product documentation would be
-     appreciated but is not required.
-  2. Altered source versions must be plainly marked as such, and must not be
-     misrepresented as being the original software.
-  3. This notice may not be removed or altered from any source distribution.
-*/
-
-
-//#include "C:\CUDA\common\inc\cutil.h"				// cutil32.lib
-#include <string.h>
-#include "../CUDA/btCudaDefines.h"
-
-
-
-#if defined(__APPLE__) || defined(MACOSX)
-	#include <GLUT/glut.h>
-#else
-	#include <GL/glut.h>
-#endif
-#include <cuda_gl_interop.h>
-
-#include "radixsort.cu"
-#include "fluid_system_kern.cu"			// build kernel
-
-FluidParams					fcuda;
-
-__device__ char*			bufPnts;		// point data (array of Fluid structs)
-__device__ char*			bufPntSort;		// point data (array of Fluid structs)
-__device__ uint*			bufHash[2];		// point grid hash
-__device__ int*				bufGrid;	
-
-	
-
-extern "C"
-{
-// Initialize CUDA
-void cudaInit(int argc, char **argv)
-{   
-    //CUT_DEVICE_INIT(argc, argv);
- 
-	cudaDeviceProp p;
-	cudaGetDeviceProperties ( &p, 0);
-	
-	printf ( "-- CUDA --\n" );
-	printf ( "Name:       %s\n", p.name );
-	printf ( "Revision:   %d.%d\n", p.major, p.minor );
-	printf ( "Global Mem: %d\n", p.totalGlobalMem );
-	printf ( "Shared/Blk: %d\n", p.sharedMemPerBlock );
-	printf ( "Regs/Blk:   %d\n", p.regsPerBlock );
-	printf ( "Warp Size:  %d\n", p.warpSize );
-	printf ( "Mem Pitch:  %d\n", p.memPitch );
-	printf ( "Thrds/Blk:  %d\n", p.maxThreadsPerBlock );
-	printf ( "Const Mem:  %d\n", p.totalConstMem );
-	printf ( "Clock Rate: %d\n", p.clockRate );	
-	
-	BT_GPU_SAFE_CALL ( cudaMalloc ( (void**) &bufPnts, 10 ) );	
-	BT_GPU_SAFE_CALL ( cudaMalloc ( (void**) &bufPntSort, 10 ) );
-	BT_GPU_SAFE_CALL ( cudaMalloc ( (void**) &bufHash, 10 ) );	
-	BT_GPU_SAFE_CALL ( cudaMalloc ( (void**) &bufGrid, 10 ) );	
-};
-	
-// Compute number of blocks to create
-int iDivUp (int a, int b) {
-    return (a % b != 0) ? (a / b + 1) : (a / b);
-}
-void computeNumBlocks (int numPnts, int maxThreads, int &numBlocks, int &numThreads)
-{
-    numThreads = min( maxThreads, numPnts );
-    numBlocks = iDivUp ( numPnts, numThreads );
-}
-
-void FluidClearCUDA ()
-{
-	BT_GPU_SAFE_CALL ( cudaFree ( bufPnts ) );	
-	BT_GPU_SAFE_CALL ( cudaFree ( bufPntSort ) );
-	BT_GPU_SAFE_CALL ( cudaFree ( bufHash[0] ) );	
-	BT_GPU_SAFE_CALL ( cudaFree ( bufHash[1] ) );	
-	BT_GPU_SAFE_CALL ( cudaFree ( bufGrid ) );
-}
-
-
-void FluidSetupCUDA ( int num, int stride, float3 min, float3 max, float3 res, float3 size, int chk )
-{	
-	fcuda.min = make_float3(min.x, min.y, min.z);
-	fcuda.max = make_float3(max.x, max.y, max.z);
-	fcuda.res = make_float3(res.x, res.y, res.z);
-	fcuda.size = make_float3(size.x, size.y, size.z);	
-	fcuda.pnts = num;
-	fcuda.delta.x = res.x / size.x;
-	fcuda.delta.y = res.y / size.y;
-	fcuda.delta.z = res.z / size.z;
-	fcuda.cells = res.x*res.y*res.z;
-	fcuda.chk = chk;
-		
-    computeNumBlocks ( fcuda.pnts, 256, fcuda.numBlocks, fcuda.numThreads);			// particles
-    computeNumBlocks ( fcuda.cells, 256, fcuda.gridBlocks, fcuda.gridThreads);		// grid cell
-    
-    fcuda.szPnts = (fcuda.numBlocks * fcuda.numThreads) * stride;        
-    fcuda.szHash = (fcuda.numBlocks * fcuda.numThreads) * sizeof(uint2);		// <cell, particle> pairs
-    fcuda.szGrid = (fcuda.gridBlocks * fcuda.gridThreads) * sizeof(uint);    
-    fcuda.stride = stride;
-    printf ( "pnts: %d, t:%dx%d=%d, bufPnts:%d, bufHash:%d\n", fcuda.pnts, fcuda.numBlocks, fcuda.numThreads, fcuda.numBlocks*fcuda.numThreads, fcuda.szPnts, fcuda.szHash );
-    printf ( "grds: %d, t:%dx%d=%d, bufGrid:%d, Res: %dx%dx%d\n", fcuda.cells, fcuda.gridBlocks, fcuda.gridThreads, fcuda.gridBlocks*fcuda.gridThreads, fcuda.szGrid, (int) fcuda.res.x, (int) fcuda.res.y, (int) fcuda.res.z );	
-
-	BT_GPU_SAFE_CALL ( cudaMalloc ( (void**) &bufPnts, fcuda.szPnts ) );	
-	BT_GPU_SAFE_CALL ( cudaMalloc ( (void**) &bufPntSort, fcuda.szPnts ) );	
-	BT_GPU_SAFE_CALL ( cudaMalloc ( (void**) &bufHash[0], fcuda.szHash ) );	
-	BT_GPU_SAFE_CALL ( cudaMalloc ( (void**) &bufHash[1], fcuda.szHash ) );	
-	BT_GPU_SAFE_CALL ( cudaMalloc ( (void**) &bufGrid, fcuda.szGrid ) );
-	
-	printf ( "POINTERS\n");
-	printf ( "bufPnts:    %p\n", bufPnts );
-	printf ( "bufPntSort: %p\n", bufPntSort );
-	printf ( "bufHash0:   %p\n", bufHash[0] );
-	printf ( "bufHash1:   %p\n", bufHash[1] );
-	printf ( "bufGrid:    %p\n", bufGrid );
-	
-	BT_GPU_SAFE_CALL ( cudaMemcpyToSymbol ( simData, &fcuda, sizeof(FluidParams) ) );
-	cudaThreadSynchronize ();
-}
-
-void FluidParamCUDA ( float sim_scale, float smooth_rad, float mass, float rest, float stiff, float visc )
-{
-	fcuda.sim_scale = sim_scale;
-	fcuda.smooth_rad = smooth_rad;
-	fcuda.r2 = smooth_rad * smooth_rad;
-	fcuda.pmass = mass;
-	fcuda.rest_dens = rest;	
-	fcuda.stiffness = stiff;
-	fcuda.visc = visc;
-	
-	fcuda.pdist = pow ( fcuda.pmass / fcuda.rest_dens, 1/3.0f );
-	fcuda.poly6kern = 315.0f / (64.0f * 3.141592 * pow( smooth_rad, 9.0f) );
-	fcuda.spikykern = -45.0f / (3.141592 * pow( smooth_rad, 6.0f) );
-	fcuda.lapkern = 45.0f / (3.141592 * pow( smooth_rad, 6.0f) );	
-
-	BT_GPU_SAFE_CALL( cudaMemcpyToSymbol ( simData, &fcuda, sizeof(FluidParams) ) );
-	cudaThreadSynchronize ();
-}
-
-void TransferToCUDA ( char* data, int* grid, int numPoints )
-{
-	BT_GPU_SAFE_CALL( cudaMemcpy ( bufPnts, data, numPoints * fcuda.stride, cudaMemcpyHostToDevice ) );
-	cudaThreadSynchronize ();
-}
-
-void TransferFromCUDA ( char* data, int* grid, int numPoints )
-{
-	BT_GPU_SAFE_CALL( cudaMemcpy ( data, bufPntSort, numPoints * fcuda.stride, cudaMemcpyDeviceToHost ) );	
-	cudaThreadSynchronize ();	
-	
-	BT_GPU_SAFE_CALL( cudaMemcpy ( grid, bufGrid, fcuda.cells * sizeof(uint), cudaMemcpyDeviceToHost ) );			
-}
-
-void Grid_InsertParticlesCUDA ()
-{
-	BT_GPU_SAFE_CALL( cudaMemset ( bufHash[0], 0, fcuda.szHash ) );
-	
-	hashParticles<<< fcuda.numBlocks, fcuda.numThreads>>> ( bufPnts, (uint2*) bufHash[0], fcuda.pnts );	
-	BT_GPU_CHECK_ERROR( "Kernel execution failed");
-	cudaThreadSynchronize ();
-	
-	//int buf[20000];		
-	/*printf ( "HASH: %d (%d)\n", fcuda.pnts, fcuda.numBlocks*fcuda.numThreads );
-	BT_GPU_SAFE_CALL( cudaMemcpy ( buf, bufHash[0], fcuda.pnts * 2*sizeof(uint), cudaMemcpyDeviceToHost ) );		
-	//for (int n=0; n < fcuda.numBlocks*fcuda.numThreads; n++) {		
-	for (int n=0; n < 100; n++) {
-		printf ( "%d: <%d,%d>\n", n, buf[n*2], buf[n*2+1] );
-	}*/
-	 
-	RadixSort( (KeyValuePair *) bufHash[0], (KeyValuePair *) bufHash[1], fcuda.pnts, 32);
-	BT_GPU_CHECK_ERROR( "Kernel execution failed");
-	cudaThreadSynchronize ();
-	
-	/*printf ( "HASH: %d (%d)\n", fcuda.pnts, fcuda.numBlocks*fcuda.numThreads );
-	BT_GPU_SAFE_CALL( cudaMemcpy ( buf, bufHash[0], fcuda.pnts * 2*sizeof(uint), cudaMemcpyDeviceToHost ) );		
-	//for (int n=0; n < fcuda.numBlocks*fcuda.numThreads; n++) {		
-	for (int n=0; n < 100; n++) {
-		printf ( "%d: <%d,%d>\n", n, buf[n*2], buf[n*2+1] );
-	}*/
-	
-	// insertParticles<<< fcuda.gridBlocks, fcuda.gridThreads>>> ( bufPnts, (uint2*) bufHash[0], bufGrid, fcuda.pnts, fcuda.cells );			
-	
-	BT_GPU_SAFE_CALL( cudaMemset ( bufGrid, NULL_HASH, fcuda.cells * sizeof(uint) ) );
-	
-	insertParticlesRadix<<< fcuda.numBlocks, fcuda.numThreads>>> ( bufPnts, (uint2*) bufHash[0], bufGrid, bufPntSort, fcuda.pnts, fcuda.cells );
-	BT_GPU_CHECK_ERROR( "Kernel execution failed");
-	cudaThreadSynchronize ();	
-    
-    /*printf ( "GRID: %d\n", fcuda.cells );
-	BT_GPU_SAFE_CALL( cudaMemcpy ( buf, bufGrid, fcuda.cells * sizeof(uint), cudaMemcpyDeviceToHost ) );		
-	*for (int n=0; n < 100; n++) {		
-		printf ( "%d: %d\n", n, buf[n]);
-	}*/
-}
-
-void SPH_ComputePressureCUDA ()
-{
-	computePressure<<< fcuda.numBlocks, fcuda.numThreads>>> ( bufPntSort, bufGrid, (uint2*) bufHash[0], fcuda.pnts );	
-    BT_GPU_CHECK_ERROR( "Kernel execution failed");
-    cudaThreadSynchronize ();	
-}
-
-void SPH_ComputeForceCUDA ()
-{
-	//-- standard force
-	//computeForce<<< fcuda.numBlocks, fcuda.numThreads>>> ( bufPntSort, bufGrid, (uint2*) bufHash[0], fcuda.pnts );	
-	
-	// Force using neighbor table
-	computeForceNbr<<< fcuda.numBlocks, fcuda.numThreads>>> ( bufPntSort, fcuda.pnts );	
-    BT_GPU_CHECK_ERROR( "Kernel execution failed");
-    cudaThreadSynchronize ();	
-}
-
-void SPH_AdvanceCUDA ( float dt, float ss )
-{
-	advanceParticles<<< fcuda.numBlocks, fcuda.numThreads>>> ( bufPntSort, fcuda.pnts, dt, ss );
-    BT_GPU_CHECK_ERROR( "Kernel execution failed");
-    cudaThreadSynchronize ();
-}
-
-}	// extern C
-
-
-
-
-   	//----------- Per frame: Malloc/Free, Host<->Device
-	// transfer point data to device    
-    /*char* pntData;
-	int size = (fcuda.numBlocks*fcuda.numThreads) * stride;
-	cudaMalloc( (void**) &pntData, size);
-	cudaMemcpy( pntData, data, numPoints*stride, cudaMemcpyHostToDevice);  	
-    insertParticles<<< fcuda.numBlocks, fcuda.numThreads >>> ( pntData, stride, numPoints );
-    cudaMemcpy( data, pntData, numPoints*stride, cudaMemcpyDeviceToHost);    
-    cudaFree( pntData );*/
+/*
+  FLUIDS v.1 - SPH Fluid Simulator for CPU and GPU
+  Copyright (C) 2008. Rama Hoetzlein, http://www.rchoetzlein.com
+
+  ZLib license
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+//#include "C:\CUDA\common\inc\cutil.h"				// cutil32.lib
+#include <string.h>
+#include "../CUDA/btCudaDefines.h"
+
+
+
+#if defined(__APPLE__) || defined(MACOSX)
+	#include <GLUT/glut.h>
+#else
+	#include <GL/glut.h>
+#endif
+#include <cuda_gl_interop.h>
+
+#include "radixsort.cu"
+#include "fluid_system_kern.cu"			// build kernel
+
+FluidParams					fcuda;
+
+__device__ char*			bufPnts;		// point data (array of Fluid structs)
+__device__ char*			bufPntSort;		// point data (array of Fluid structs)
+__device__ uint*			bufHash[2];		// point grid hash
+__device__ int*				bufGrid;	
+
+	
+
+extern "C"
+{
+// Initialize CUDA
+void cudaInit(int argc, char **argv)
+{   
+    //CUT_DEVICE_INIT(argc, argv);
+ 
+	cudaDeviceProp p;
+	cudaGetDeviceProperties ( &p, 0);
+	
+	printf ( "-- CUDA --\n" );
+	printf ( "Name:       %s\n", p.name );
+	printf ( "Revision:   %d.%d\n", p.major, p.minor );
+	printf ( "Global Mem: %d\n", p.totalGlobalMem );
+	printf ( "Shared/Blk: %d\n", p.sharedMemPerBlock );
+	printf ( "Regs/Blk:   %d\n", p.regsPerBlock );
+	printf ( "Warp Size:  %d\n", p.warpSize );
+	printf ( "Mem Pitch:  %d\n", p.memPitch );
+	printf ( "Thrds/Blk:  %d\n", p.maxThreadsPerBlock );
+	printf ( "Const Mem:  %d\n", p.totalConstMem );
+	printf ( "Clock Rate: %d\n", p.clockRate );	
+	
+	BT_GPU_SAFE_CALL ( cudaMalloc ( (void**) &bufPnts, 10 ) );	
+	BT_GPU_SAFE_CALL ( cudaMalloc ( (void**) &bufPntSort, 10 ) );
+	BT_GPU_SAFE_CALL ( cudaMalloc ( (void**) &bufHash, 10 ) );	
+	BT_GPU_SAFE_CALL ( cudaMalloc ( (void**) &bufGrid, 10 ) );	
+};
+	
+// Compute number of blocks to create
+int iDivUp (int a, int b) {
+    return (a % b != 0) ? (a / b + 1) : (a / b);
+}
+void computeNumBlocks (int numPnts, int maxThreads, int &numBlocks, int &numThreads)
+{
+    numThreads = min( maxThreads, numPnts );
+    numBlocks = iDivUp ( numPnts, numThreads );
+}
+
+void FluidClearCUDA ()
+{
+	BT_GPU_SAFE_CALL ( cudaFree ( bufPnts ) );	
+	BT_GPU_SAFE_CALL ( cudaFree ( bufPntSort ) );
+	BT_GPU_SAFE_CALL ( cudaFree ( bufHash[0] ) );	
+	BT_GPU_SAFE_CALL ( cudaFree ( bufHash[1] ) );	
+	BT_GPU_SAFE_CALL ( cudaFree ( bufGrid ) );
+}
+
+
+void FluidSetupCUDA ( int num, int stride, float3 min, float3 max, float3 res, float3 size, int chk )
+{	
+	fcuda.min = make_float3(min.x, min.y, min.z);
+	fcuda.max = make_float3(max.x, max.y, max.z);
+	fcuda.res = make_float3(res.x, res.y, res.z);
+	fcuda.size = make_float3(size.x, size.y, size.z);	
+	fcuda.pnts = num;
+	fcuda.delta.x = res.x / size.x;
+	fcuda.delta.y = res.y / size.y;
+	fcuda.delta.z = res.z / size.z;
+	fcuda.cells = res.x*res.y*res.z;
+	fcuda.chk = chk;
+		
+    computeNumBlocks ( fcuda.pnts, 256, fcuda.numBlocks, fcuda.numThreads);			// particles
+    computeNumBlocks ( fcuda.cells, 256, fcuda.gridBlocks, fcuda.gridThreads);		// grid cell
+    
+    fcuda.szPnts = (fcuda.numBlocks * fcuda.numThreads) * stride;        
+    fcuda.szHash = (fcuda.numBlocks * fcuda.numThreads) * sizeof(uint2);		// <cell, particle> pairs
+    fcuda.szGrid = (fcuda.gridBlocks * fcuda.gridThreads) * sizeof(uint);    
+    fcuda.stride = stride;
+    printf ( "pnts: %d, t:%dx%d=%d, bufPnts:%d, bufHash:%d\n", fcuda.pnts, fcuda.numBlocks, fcuda.numThreads, fcuda.numBlocks*fcuda.numThreads, fcuda.szPnts, fcuda.szHash );
+    printf ( "grds: %d, t:%dx%d=%d, bufGrid:%d, Res: %dx%dx%d\n", fcuda.cells, fcuda.gridBlocks, fcuda.gridThreads, fcuda.gridBlocks*fcuda.gridThreads, fcuda.szGrid, (int) fcuda.res.x, (int) fcuda.res.y, (int) fcuda.res.z );	
+
+	BT_GPU_SAFE_CALL ( cudaMalloc ( (void**) &bufPnts, fcuda.szPnts ) );	
+	BT_GPU_SAFE_CALL ( cudaMalloc ( (void**) &bufPntSort, fcuda.szPnts ) );	
+	BT_GPU_SAFE_CALL ( cudaMalloc ( (void**) &bufHash[0], fcuda.szHash ) );	
+	BT_GPU_SAFE_CALL ( cudaMalloc ( (void**) &bufHash[1], fcuda.szHash ) );	
+	BT_GPU_SAFE_CALL ( cudaMalloc ( (void**) &bufGrid, fcuda.szGrid ) );
+	
+	printf ( "POINTERS\n");
+	printf ( "bufPnts:    %p\n", bufPnts );
+	printf ( "bufPntSort: %p\n", bufPntSort );
+	printf ( "bufHash0:   %p\n", bufHash[0] );
+	printf ( "bufHash1:   %p\n", bufHash[1] );
+	printf ( "bufGrid:    %p\n", bufGrid );
+	
+	BT_GPU_SAFE_CALL ( cudaMemcpyToSymbol ( simData, &fcuda, sizeof(FluidParams) ) );
+	cudaThreadSynchronize ();
+}
+
+void FluidParamCUDA ( float sim_scale, float smooth_rad, float mass, float rest, float stiff, float visc )
+{
+	fcuda.sim_scale = sim_scale;
+	fcuda.smooth_rad = smooth_rad;
+	fcuda.r2 = smooth_rad * smooth_rad;
+	fcuda.pmass = mass;
+	fcuda.rest_dens = rest;	
+	fcuda.stiffness = stiff;
+	fcuda.visc = visc;
+	
+	fcuda.pdist = pow ( fcuda.pmass / fcuda.rest_dens, 1/3.0f );
+	fcuda.poly6kern = 315.0f / (64.0f * 3.141592 * pow( smooth_rad, 9.0f) );
+	fcuda.spikykern = -45.0f / (3.141592 * pow( smooth_rad, 6.0f) );
+	fcuda.lapkern = 45.0f / (3.141592 * pow( smooth_rad, 6.0f) );	
+
+	BT_GPU_SAFE_CALL( cudaMemcpyToSymbol ( simData, &fcuda, sizeof(FluidParams) ) );
+	cudaThreadSynchronize ();
+}
+
+void TransferToCUDA ( char* data, int* grid, int numPoints )
+{
+	BT_GPU_SAFE_CALL( cudaMemcpy ( bufPnts, data, numPoints * fcuda.stride, cudaMemcpyHostToDevice ) );
+	cudaThreadSynchronize ();
+}
+
+void TransferFromCUDA ( char* data, int* grid, int numPoints )
+{
+	BT_GPU_SAFE_CALL( cudaMemcpy ( data, bufPntSort, numPoints * fcuda.stride, cudaMemcpyDeviceToHost ) );	
+	cudaThreadSynchronize ();	
+	
+	BT_GPU_SAFE_CALL( cudaMemcpy ( grid, bufGrid, fcuda.cells * sizeof(uint), cudaMemcpyDeviceToHost ) );			
+}
+
+void Grid_InsertParticlesCUDA ()
+{
+	BT_GPU_SAFE_CALL( cudaMemset ( bufHash[0], 0, fcuda.szHash ) );
+	
+	hashParticles<<< fcuda.numBlocks, fcuda.numThreads>>> ( bufPnts, (uint2*) bufHash[0], fcuda.pnts );	
+	BT_GPU_CHECK_ERROR( "Kernel execution failed");
+	cudaThreadSynchronize ();
+	
+	//int buf[20000];		
+	/*printf ( "HASH: %d (%d)\n", fcuda.pnts, fcuda.numBlocks*fcuda.numThreads );
+	BT_GPU_SAFE_CALL( cudaMemcpy ( buf, bufHash[0], fcuda.pnts * 2*sizeof(uint), cudaMemcpyDeviceToHost ) );		
+	//for (int n=0; n < fcuda.numBlocks*fcuda.numThreads; n++) {		
+	for (int n=0; n < 100; n++) {
+		printf ( "%d: <%d,%d>\n", n, buf[n*2], buf[n*2+1] );
+	}*/
+	 
+	RadixSort( (KeyValuePair *) bufHash[0], (KeyValuePair *) bufHash[1], fcuda.pnts, 32);
+	BT_GPU_CHECK_ERROR( "Kernel execution failed");
+	cudaThreadSynchronize ();
+	
+	/*printf ( "HASH: %d (%d)\n", fcuda.pnts, fcuda.numBlocks*fcuda.numThreads );
+	BT_GPU_SAFE_CALL( cudaMemcpy ( buf, bufHash[0], fcuda.pnts * 2*sizeof(uint), cudaMemcpyDeviceToHost ) );		
+	//for (int n=0; n < fcuda.numBlocks*fcuda.numThreads; n++) {		
+	for (int n=0; n < 100; n++) {
+		printf ( "%d: <%d,%d>\n", n, buf[n*2], buf[n*2+1] );
+	}*/
+	
+	// insertParticles<<< fcuda.gridBlocks, fcuda.gridThreads>>> ( bufPnts, (uint2*) bufHash[0], bufGrid, fcuda.pnts, fcuda.cells );			
+	
+	BT_GPU_SAFE_CALL( cudaMemset ( bufGrid, NULL_HASH, fcuda.cells * sizeof(uint) ) );
+	
+	insertParticlesRadix<<< fcuda.numBlocks, fcuda.numThreads>>> ( bufPnts, (uint2*) bufHash[0], bufGrid, bufPntSort, fcuda.pnts, fcuda.cells );
+	BT_GPU_CHECK_ERROR( "Kernel execution failed");
+	cudaThreadSynchronize ();	
+    
+    /*printf ( "GRID: %d\n", fcuda.cells );
+	BT_GPU_SAFE_CALL( cudaMemcpy ( buf, bufGrid, fcuda.cells * sizeof(uint), cudaMemcpyDeviceToHost ) );		
+	*for (int n=0; n < 100; n++) {		
+		printf ( "%d: %d\n", n, buf[n]);
+	}*/
+}
+
+void SPH_ComputePressureCUDA ()
+{
+	computePressure<<< fcuda.numBlocks, fcuda.numThreads>>> ( bufPntSort, bufGrid, (uint2*) bufHash[0], fcuda.pnts );	
+    BT_GPU_CHECK_ERROR( "Kernel execution failed");
+    cudaThreadSynchronize ();	
+}
+
+void SPH_ComputeForceCUDA ()
+{
+	//-- standard force
+	//computeForce<<< fcuda.numBlocks, fcuda.numThreads>>> ( bufPntSort, bufGrid, (uint2*) bufHash[0], fcuda.pnts );	
+	
+	// Force using neighbor table
+	computeForceNbr<<< fcuda.numBlocks, fcuda.numThreads>>> ( bufPntSort, fcuda.pnts );	
+    BT_GPU_CHECK_ERROR( "Kernel execution failed");
+    cudaThreadSynchronize ();	
+}
+
+void SPH_AdvanceCUDA ( float dt, float ss )
+{
+	advanceParticles<<< fcuda.numBlocks, fcuda.numThreads>>> ( bufPntSort, fcuda.pnts, dt, ss );
+    BT_GPU_CHECK_ERROR( "Kernel execution failed");
+    cudaThreadSynchronize ();
+}
+
+}	// extern C
+
+
+
+
+   	//----------- Per frame: Malloc/Free, Host<->Device
+	// transfer point data to device    
+    /*char* pntData;
+	int size = (fcuda.numBlocks*fcuda.numThreads) * stride;
+	cudaMalloc( (void**) &pntData, size);
+	cudaMemcpy( pntData, data, numPoints*stride, cudaMemcpyHostToDevice);  	
+    insertParticles<<< fcuda.numBlocks, fcuda.numThreads >>> ( pntData, stride, numPoints );
+    cudaMemcpy( data, pntData, numPoints*stride, cudaMemcpyDeviceToHost);    
+    cudaFree( pntData );*/
--- a/Extras/sph/fluids/fluid_system_host.cuh
+++ b/Extras/sph/fluids/fluid_system_host.cuh
@@ -1,63 +1,63 @@
-/*
-  FLUIDS v.1 - SPH Fluid Simulator for CPU and GPU
-  Copyright (C) 2008. Rama Hoetzlein, http://www.rchoetzlein.com
-
-  ZLib license
-  This software is provided 'as-is', without any express or implied
-  warranty.  In no event will the authors be held liable for any damages
-  arising from the use of this software.
-
-  Permission is granted to anyone to use this software for any purpose,
-  including commercial applications, and to alter it and redistribute it
-  freely, subject to the following restrictions:
-
-  1. The origin of this software must not be misrepresented; you must not
-     claim that you wrote the original software. If you use this software
-     in a product, an acknowledgment in the product documentation would be
-     appreciated but is not required.
-  2. Altered source versions must be plainly marked as such, and must not be
-     misrepresented as being the original software.
-  3. This notice may not be removed or altered from any source distribution.
-*/
-
-
-#include <vector_types.h>	
-#include <driver_types.h>			// for cudaStream_t
-
-typedef unsigned int		uint;		// should be 4-bytes on CUDA
-typedef unsigned char		uchar;		// should be 1-bytes on CUDA
-
-struct FluidParams {
-	int				numThreads, numBlocks;
-	int				gridThreads, gridBlocks;	
-	int				szPnts, szHash, szGrid;
-	int				stride, pnts, cells;
-	int				chk;
-	float			smooth_rad, r2, sim_scale, visc;
-	float3			min, max, res, size, delta;
-	
-	float			pdist, pmass, rest_dens, stiffness;
-	float			poly6kern, spikykern, lapkern;
-
-};
-
-extern "C"
-{
-
-void cudaInit(int argc, char **argv);
-
-void FluidClearCUDA ();
-void FluidSetupCUDA ( int num, int stride, float3 min, float3 max, float3 res, float3 size, int chk );
-void FluidParamCUDA ( float sim_scale, float smooth_rad, float mass, float rest, float stiff, float visc );
-
-void TransferToCUDA ( char* data, int* grid, int numPoints );
-void TransferFromCUDA ( char* data, int* grid, int numPoints );
-
-void Grid_InsertParticlesCUDA ();
-void SPH_ComputePressureCUDA ();
-void SPH_ComputeForceCUDA ();
-void SPH_AdvanceCUDA ( float dt, float ss );
-
-}
-
-
+/*
+  FLUIDS v.1 - SPH Fluid Simulator for CPU and GPU
+  Copyright (C) 2008. Rama Hoetzlein, http://www.rchoetzlein.com
+
+  ZLib license
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#include <vector_types.h>	
+#include <driver_types.h>			// for cudaStream_t
+
+typedef unsigned int		uint;		// should be 4-bytes on CUDA
+typedef unsigned char		uchar;		// should be 1-bytes on CUDA
+
+// Simulation parameters shared between the host code and the CUDA kernels.
+// The kernel source declares a __constant__ instance of this struct (simData)
+// and reads it from device code.
+// NOTE(review): the field order defines the memory layout; do not reorder
+// without checking how the host fills and uploads the struct.
+struct FluidParams {
+	// presumably the launch configuration of the per-particle kernels -- TODO confirm against host setup
+	int				numThreads, numBlocks;
+	// presumably the launch configuration of per-grid-cell kernels -- TODO confirm
+	int				gridThreads, gridBlocks;	
+	// presumably sizes of the point, hash and grid buffers -- TODO confirm (bytes?)
+	int				szPnts, szHash, szGrid;
+	// stride: bytes between consecutive particle records (kernels address records as base + index*stride)
+	// pnts:   particle count (upper bound of the neighbor scan loops)
+	// cells:  presumably the number of grid cells -- TODO confirm
+	int				stride, pnts, cells;
+	int				chk;
+	// smooth_rad: radius from which neighbor distances are subtracted in the force terms
+	// r2:         threshold compared against squared, scaled particle distances
+	// sim_scale:  factor applied to position differences before distance tests
+	// visc:       multiplied with lapkern to form the viscosity term
+	float			smooth_rad, r2, sim_scale, visc;
+	// Grid description: kernels compute a cell coordinate as (pos - min) * delta
+	// and bound it by res. max and size are not read by the kernels in this file set.
+	float3			min, max, res, size, delta;
+	
+	// pmass, rest_dens and stiffness feed the density/pressure computation in computePressure.
+	float			pdist, pmass, rest_dens, stiffness;
+	// Smoothing-kernel coefficients: poly6kern scales the density sum,
+	// spikykern and lapkern scale the pressure and viscosity force terms.
+	float			poly6kern, spikykern, lapkern;
+
+};
+
+// Host entry points, declared with C linkage.
+extern "C"
+{
+
+void cudaInit(int argc, char **argv);
+
+// Setup / parameter upload.
+// NOTE(review): implementations are not visible from this header; presumably these
+// fill a FluidParams instance and (re)allocate device buffers -- confirm in the .cu file.
+void FluidClearCUDA ();
+void FluidSetupCUDA ( int num, int stride, float3 min, float3 max, float3 res, float3 size, int chk );
+void FluidParamCUDA ( float sim_scale, float smooth_rad, float mass, float rest, float stiff, float visc );
+
+// Host <-> device particle transfer (presumably numPoints records of `stride` bytes -- TODO confirm).
+void TransferToCUDA ( char* data, int* grid, int numPoints );
+void TransferFromCUDA ( char* data, int* grid, int numPoints );
+
+// Per-step simulation stages. The SPH_* functions each launch one kernel
+// (computePressure, computeForceNbr, advanceParticles) and then synchronize with the device.
+void Grid_InsertParticlesCUDA ();
+void SPH_ComputePressureCUDA ();
+void SPH_ComputeForceCUDA ();
+void SPH_AdvanceCUDA ( float dt, float ss );
+
+}
+
+
--- a/Extras/sph/fluids/fluid_system_kern.cu
+++ b/Extras/sph/fluids/fluid_system_kern.cu
@@ -1,402 +1,402 @@
-/*
-  FLUIDS v.1 - SPH Fluid Simulator for CPU and GPU
-  Copyright (C) 2008. Rama Hoetzlein, http://www.rchoetzlein.com
-
-  ZLib license
-  This software is provided 'as-is', without any express or implied
-  warranty.  In no event will the authors be held liable for any damages
-  arising from the use of this software.
-
-  Permission is granted to anyone to use this software for any purpose,
-  including commercial applications, and to alter it and redistribute it
-  freely, subject to the following restrictions:
-
-  1. The origin of this software must not be misrepresented; you must not
-     claim that you wrote the original software. If you use this software
-     in a product, an acknowledgment in the product documentation would be
-     appreciated but is not required.
-  2. Altered source versions must be plainly marked as such, and must not be
-     misrepresented as being the original software.
-  3. This notice may not be removed or altered from any source distribution.
-*/
-
-
-
-#ifndef _PARTICLES_KERNEL_H_
-	#define _PARTICLES_KERNEL_H_
-
-	#include <stdio.h>
-	#include <math.h>
-
-	#include "fluid_system_host.cuh"
-
-	#define TOTAL_THREADS		65536
-	#define BLOCK_THREADS		256
-	#define MAX_NBR				80
-	
-	__constant__	FluidParams		simData;		// simulation data (on device)
-	
-	__device__ int				bufNeighbor[ TOTAL_THREADS*MAX_NBR ];
-	__device__ float			bufNdist[ TOTAL_THREADS*MAX_NBR ];	
-
-	#define COLOR(r,g,b)	( (uint((r)*255.0f)<<24) | (uint((g)*255.0f)<<16) | (uint((b)*255.0f)<<8) )
-	#define COLORA(r,g,b,a)	( (uint((r)*255.0f)<<24) | (uint((g)*255.0f)<<16) | (uint((b)*255.0f)<<8) | uint((a)*255.0f) )
-	
-	#define NULL_HASH		333333
-	
-	#define OFFSET_CLR		12
-	#define OFFSET_NEXT		16
-	#define OFFSET_VEL		20
-	#define OFFSET_VEVAL	32
-	#define OFFSET_PRESS	48
-	#define OFFSET_DENS		52
-	#define OFFSET_FORCE	56
-	
-
-	__global__ void hashParticles ( char* bufPnts, uint2* bufHash, int numPnt )
-	{			
-		uint ndx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;	// particle index
-		float3* pos = (float3*) (bufPnts + __mul24(ndx, simData.stride) );
-		int gz = (pos->z - simData.min.z) * simData.delta.z ;
-		int gy = (pos->y - simData.min.y) * simData.delta.y ;
-		int gx = (pos->x - simData.min.x) * simData.delta.x ;
-		if ( ndx >= numPnt || gx < 0 || gz > simData.res.x-1 || gy < 0 || gy > simData.res.y-1 || gz < 0 || gz > simData.res.z-1 ) 
-			bufHash[ndx] = make_uint2( NULL_HASH, ndx );
-		else
-			bufHash[ndx] = make_uint2( __mul24(__mul24(gz, (int) simData.res.y)+gy, (int) simData.res.x) + gx, ndx );		
-		
-		__syncthreads ();
-	}
-	
-	__global__ void insertParticles ( char* bufPnts, uint2* bufHash, int* bufGrid, int numPnt, int numGrid )
-	{
-		uint grid_ndx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;	// grid cell index		
-		
-		bufPnts += OFFSET_NEXT;
-		bufGrid[grid_ndx] = -1;
-		for (int n=0; n < numPnt; n++) {
-			if ( bufHash[n].x == grid_ndx ) {
-				*(int*) (bufPnts + __mul24(bufHash[n].y, simData.stride)) = bufGrid[grid_ndx];
-				bufGrid[grid_ndx] = bufHash[n].y;
-			}
-		}		
-		__syncthreads ();
-	}
-	
-	__global__ void insertParticlesRadix ( char* bufPnts, uint2* bufHash, int* bufGrid, char* bufPntSort, int numPnt, int numGrid )
-	{
-		uint ndx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;		// particle index		
-		
-		uint2 bufHashSort = bufHash[ndx];
-		
-		__shared__ uint sharedHash[257];	
-		sharedHash[threadIdx.x+1] = bufHashSort.x;
-		if ( ndx > 0 && threadIdx.x == 0 ) {
-			volatile uint2 prevData = bufHash[ndx-1];
-			sharedHash[0]  = prevData.x;
-		}
-		__syncthreads ();
-		
-		if ( (ndx == 0 || bufHashSort.x != sharedHash[threadIdx.x]) && bufHashSort.x != NULL_HASH ) {
-			bufGrid [ bufHashSort.x ] = ndx;			
-		}
-		if ( ndx < numPnt ) {
-			char* src = bufPnts + __mul24( bufHashSort.y, simData.stride );
-			char* dest = bufPntSort + __mul24( ndx, simData.stride );
-			
-			*(float3*)(dest)				= *(float3*)(src);
-			*(uint*)  (dest + OFFSET_CLR)	= *(uint*)  (src + OFFSET_CLR);
-			*(float3*)(dest + OFFSET_VEL)	= *(float3*)(src + OFFSET_VEL);
-			*(float3*)(dest + OFFSET_VEVAL)	= *(float3*)(src + OFFSET_VEVAL);				
-			
-			*(float*) (dest + OFFSET_DENS)	= 0.0;
-			*(float*) (dest + OFFSET_PRESS)	= 0.0;				
-			*(float3*) (dest + OFFSET_FORCE)= make_float3(0,0,0);		
-			*(int*)   (dest + OFFSET_NEXT)	= bufHashSort.x;			
-		} 
-		
-		__syncthreads ();
-		
-	}
-	
-	//__shared__ int ncount [ BLOCK_THREADS ];
-	
-	__device__ float contributePressure ( int pndx, float3* p, int qndx, int grid_ndx, char* bufPnts, uint2* bufHash )
-	{			
-		float3* qpos;		
-		float3 dist;
-		float dsq, c, sum;
-		float d = simData.sim_scale;				
-		int nbr = __mul24(pndx, MAX_NBR);
-						
-		sum = 0.0;		
-		for ( ; qndx < simData.pnts; qndx++ ) {
-			
-			if ( bufHash[qndx].x != grid_ndx || qndx == NULL_HASH) break;
-			
-			if ( qndx != pndx ) {
-				qpos = (float3*) ( bufPnts + __mul24(qndx, simData.stride ));	
-					
-				dist.x = ( p->x - qpos->x )*d;		// dist in cm
-				dist.y = ( p->y - qpos->y )*d;
-				dist.z = ( p->z - qpos->z )*d;			
-				dsq = (dist.x*dist.x + dist.y*dist.y + dist.z*dist.z);			
-				if ( dsq < simData.r2 ) {
-					c = simData.r2 - dsq;
-					sum += c * c * c;				
-					if  ( bufNeighbor[nbr] < MAX_NBR ) {
-						bufNeighbor[ nbr+bufNeighbor[nbr] ] = qndx;
-						bufNdist[ nbr+bufNeighbor[nbr] ] = sqrt(dsq);
-						bufNeighbor[nbr]++;
-					}
-				}				
-			}
-			//curr = *(int*) (bufPnts + __mul24(curr, simData.stride) + OFFSET_NEXT);
-		}		
-		return sum;
-	}
-	
-		/*if  ( ncount[threadIdx.x]  < MAX_NBR ) {
-				bufNeighbor [ nbr + ncount[threadIdx.x]  ] = curr;
-				bufNdist [ nbr + ncount[threadIdx.x]  ] = sqrt(dsq);
-				ncount[threadIdx.x]++;
-		}*/	
-		
-	__global__ void computePressure ( char* bufPntSort, int* bufGrid, uint2* bufHash, int numPnt )
-	{
-		uint ndx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;	// particle index		
-
-		//if ( ndx < 1024 ) {
-		
-		float3* pos = (float3*) (bufPntSort + __mul24(ndx, simData.stride));
-
-		// Find 2x2x2 grid cells
-		// - Use registers only, no arrays (local-memory too slow)
-		int3 cell;
-		int gc0, gc1, gc2, gc3, gc4, gc5, gc6, gc7;					
-		float gs = simData.smooth_rad / simData.sim_scale;		
-
-		cell.x = max(0, (int)((-gs + pos->x - simData.min.x) * simData.delta.x));
-		cell.y = max(0, (int)((-gs + pos->y - simData.min.y) * simData.delta.y));
-		cell.z = max(0, (int)((-gs + pos->z - simData.min.z) * simData.delta.z));		
-		gc0 = __mul24(__mul24(cell.z, simData.res.y) + cell.y, simData.res.x) + cell.x;
-		gc1 = gc0 + 1;
-		gc2 = gc0 + simData.res.x;
-		gc3 = gc2 + 1;
-		if ( cell.z+1 < simData.res.z ) {
-			gc4 = gc0 + __mul24(simData.res.x, simData.res.y);
-			gc5 = gc4 + 1;
-			gc6 = gc4 + simData.res.x;
-			gc7 = gc6 + 1;
-		}
-		if ( cell.x+1 >= simData.res.x ) {
-			gc1 = -1; gc3 = -1;
-			gc5 = -1; gc7 = -1;
-		}
-		if ( cell.y+1 >= simData.res.y ) {
-			gc2 = -1; gc3 = -1;
-			gc6 = -1; gc7 = -1;
-		}
-		// Sum Pressure
-		float sum = 0.0;		
-		bufNeighbor[ __mul24(ndx, MAX_NBR) ] = 1;
-		if (gc0 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc0], gc0, bufPntSort, bufHash );
-		if (gc1 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc1], gc1, bufPntSort, bufHash );		
-		if (gc2 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc2], gc2, bufPntSort, bufHash );		
-		if (gc3 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc3], gc3, bufPntSort, bufHash );	
-		if (gc4 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc4], gc4, bufPntSort, bufHash );
-		if (gc5 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc5], gc5, bufPntSort, bufHash );		
-		if (gc6 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc6], gc6, bufPntSort, bufHash );
-		if (gc7 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc7], gc7, bufPntSort, bufHash );
-		
-		// Compute Density & Pressure
-		sum = sum * simData.pmass * simData.poly6kern;
-		if ( sum == 0.0 ) sum = 1.0;
-		*(float*) ((char*)pos + OFFSET_PRESS) = ( sum - simData.rest_dens ) * simData.stiffness;
-		*(float*) ((char*)pos + OFFSET_DENS) = 1.0f / sum;			
-		
-		//}		
-		//__syncthreads ();
-	}
-
-	__device__ void contributeForce ( float3& force, int pndx, float3* p, int qndx, int grid_ndx, char* bufPnts, uint2* bufHash )
-	{
-		float press = *(float*) ((char*)p + OFFSET_PRESS);
-		float dens = *(float*) ((char*)p + OFFSET_DENS);
-		float3 veval = *(float3*) ((char*)p + OFFSET_VEVAL );
-		float3 qeval, dist;				
-		float c, ndistj, dsq;
-		float pterm, dterm, vterm;		
-		float3* qpos;				
-		float d = simData.sim_scale;				
-		
-		vterm = simData.lapkern * simData.visc;		
-						
-		for ( ; qndx < simData.pnts; qndx++ ) {
-			
-			if ( bufHash[qndx].x != grid_ndx || qndx == NULL_HASH) break;
-			
-			if ( qndx != pndx ) {
-				qpos = (float3*) ( bufPnts + __mul24(qndx, simData.stride ));	
-					
-				dist.x = ( p->x - qpos->x )*d;		// dist in cm
-				dist.y = ( p->y - qpos->y )*d;
-				dist.z = ( p->z - qpos->z )*d;			
-				dsq = (dist.x*dist.x + dist.y*dist.y + dist.z*dist.z);			
-				if ( dsq < simData.r2 ) {				
-					ndistj = sqrt(dsq);
-					c = ( simData.smooth_rad - ndistj ); 
-					dist.x = ( p->x - qpos->x )*d;		// dist in cm
-					dist.y = ( p->y - qpos->y )*d;
-					dist.z = ( p->z - qpos->z )*d;			
-					pterm = -0.5f * c * simData.spikykern * ( press + *(float*)((char*)qpos+OFFSET_PRESS) ) / ndistj;
-					dterm = c * dens * *(float*)((char*)qpos+OFFSET_DENS);	
-					qeval = *(float3*)((char*)qpos+OFFSET_VEVAL);
-					force.x += ( pterm * dist.x + vterm * ( qeval.x - veval.x )) * dterm;
-					force.y += ( pterm * dist.y + vterm * ( qeval.y - veval.y )) * dterm;
-					force.z += ( pterm * dist.z + vterm * ( qeval.z - veval.z )) * dterm;							
-				}
-			}
-		}				
-	}
-	
-	
-	
-	__global__ void computeForce ( char* bufPntSort, int* bufGrid, uint2* bufHash, int numPnt )
-	{
-		uint ndx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;	// particle index		
-		
-		//if ( ndx < numPnt ) {
-		
-		float3* pos = (float3*) (bufPntSort + __mul24(ndx, simData.stride));				
-		
-		// Find 2x2x2 grid cells
-		// - Use registers only, no arrays (local-memory too slow)
-		int3 cell;
-		int gc0, gc1, gc2, gc3, gc4, gc5, gc6, gc7;					
-		float gs = simData.smooth_rad / simData.sim_scale;		
-
-		cell.x = max(0, (int)((-gs + pos->x - simData.min.x) * simData.delta.x));
-		cell.y = max(0, (int)((-gs + pos->y - simData.min.y) * simData.delta.y));
-		cell.z = max(0, (int)((-gs + pos->z - simData.min.z) * simData.delta.z));		
-		gc0 = __mul24(__mul24(cell.z, simData.res.y) + cell.y, simData.res.x) + cell.x;
-		gc1 = gc0 + 1;
-		gc2 = gc0 + simData.res.x;
-		gc3 = gc2 + 1;
-		if ( cell.z+1 < simData.res.z ) {
-			gc4 = gc0 + __mul24(simData.res.x, simData.res.y);
-			gc5 = gc4 + 1;
-			gc6 = gc4 + simData.res.x;
-			gc7 = gc6 + 1;
-		}
-		if ( cell.x+1 >= simData.res.x ) {
-			gc1 = -1; gc3 = -1;
-			gc5 = -1; gc7 = -1;
-		}
-		if ( cell.y+1 >= simData.res.y ) {
-			gc2 = -1; gc3 = -1;
-			gc6 = -1; gc7 = -1;
-		}
-		// Sum Pressure
-		float3 force = make_float3(0,0,0);
-		if (gc0 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc0], gc0, bufPntSort, bufHash );
-		if (gc1 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc1], gc1, bufPntSort, bufHash );		
-		if (gc2 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc2], gc2, bufPntSort, bufHash );		
-		if (gc3 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc3], gc3, bufPntSort, bufHash );	
-		if (gc4 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc4], gc4, bufPntSort, bufHash );
-		if (gc5 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc5], gc5, bufPntSort, bufHash );		
-		if (gc6 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc6], gc6, bufPntSort, bufHash );
-		if (gc7 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc7], gc7, bufPntSort, bufHash );
-		
-		// Update Force
-		*(float3*) ((char*)pos + OFFSET_FORCE ) = force;	
-		
-		//}
-		//__syncthreads ();
-	}
-
-	
-	__global__ void computeForceNbr ( char* bufPntSort, int numPnt )
-	{		
-		uint ndx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;	// particle index		
-		
-		if ( ndx < numPnt ) {
-				
-		float3* pos = (float3*) (bufPntSort + __mul24(ndx, simData.stride));			
-		
-		float3* qpos;
-		float press = *(float*) ((char*)pos + OFFSET_PRESS);
-		float dens = *(float*) ((char*)pos + OFFSET_DENS);
-		float3 veval = *(float3*) ((char*)pos + OFFSET_VEVAL );
-		float3 qeval, dist, force;		
-		float d = simData.sim_scale;
-		float c, ndistj;
-		float pterm, dterm, vterm;
-		vterm = simData.lapkern * simData.visc;
-		int nbr = __mul24(ndx, MAX_NBR);
-		
-		int ncnt = bufNeighbor[ nbr ];		
-		
-		force = make_float3(0,0,0);
-		for (int j=1; j < ncnt; j++) {		// base 1, n[0] = count
-			ndistj = bufNdist[ nbr+j ];
-			qpos = (float3*) (bufPntSort + __mul24( bufNeighbor[ nbr+j ], simData.stride) );
-			c = ( simData.smooth_rad - ndistj ); 
-			dist.x = ( pos->x - qpos->x )*d;		// dist in cm
-			dist.y = ( pos->y - qpos->y )*d;
-			dist.z = ( pos->z - qpos->z )*d;			
-			pterm = -0.5f * c * simData.spikykern * ( press + *(float*)((char*)qpos+OFFSET_PRESS) ) / ndistj;
-			dterm = c * dens * *(float*)((char*)qpos+OFFSET_DENS);	
-			qeval = *(float3*)((char*)qpos+OFFSET_VEVAL);
-			force.x += ( pterm * dist.x + vterm * ( qeval.x - veval.x )) * dterm;
-			force.y += ( pterm * dist.y + vterm * ( qeval.y - veval.y )) * dterm;
-			force.z += ( pterm * dist.z + vterm * ( qeval.z - veval.z )) * dterm;			
-		}
-		*(float3*) ((char*)pos + OFFSET_FORCE ) = force;
-		
-		}	
-	
-	}
-		
-	__global__ void advanceParticles ( char* bufPntSort, int numPnt, float dt, float ss )
-	{		
-		uint ndx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;	// particle index		
-		
-		if ( ndx < numPnt ) {
-				
-			// Get particle vars
-			float3* pos = (float3*) (bufPntSort + __mul24(ndx, simData.stride));			
-			float3* vel = (float3*) ((char*)pos + OFFSET_VEL );
-			float3* vel_eval = (float3*) ((char*)pos + OFFSET_VEVAL );
-			float3 accel = *(float3*) ((char*)pos + OFFSET_FORCE );
-			float3 vcurr, vnext;			
-
-			// Leapfrog integration						
-			accel.x *= 0.00020543;			// NOTE - To do: SPH_PMASS should be passed in			
-			accel.y *= 0.00020543;
-			accel.z *= 0.00020543;			
-			accel.z -= 9.8;	
-			
-			vcurr = *vel;
-			vnext.x = accel.x*dt + vcurr.x;	
-			vnext.y = accel.y*dt + vcurr.y;	
-			vnext.z = accel.z*dt + vcurr.z;			// v(t+1/2) = v(t-1/2) + a(t) dt			
-			
-			accel.x = (vcurr.x + vnext.x) * 0.5;		// v(t+1) = [v(t-1/2) + v(t+1/2)] * 0.5		used to compute forces later
-			accel.y = (vcurr.y + vnext.y) * 0.5;		// v(t+1) = [v(t-1/2) + v(t+1/2)] * 0.5		used to compute forces later
-			accel.z = (vcurr.z + vnext.z) * 0.5;		// v(t+1) = [v(t-1/2) + v(t+1/2)] * 0.5		used to compute forces later
-			
-			*vel_eval = accel;			
-			*vel = vnext;
-			
-			dt /= simData.sim_scale;
-			vnext.x = pos->x + vnext.x*dt;
-			vnext.y = pos->y + vnext.y*dt;
-			vnext.z = pos->z + vnext.z*dt;
-			*pos = vnext;						// p(t+1) = p(t) + v(t+1/2) dt			
-		}	
-		
-		__syncthreads ();	
-	}
-
-#endif
+/*
+  FLUIDS v.1 - SPH Fluid Simulator for CPU and GPU
+  Copyright (C) 2008. Rama Hoetzlein, http://www.rchoetzlein.com
+
+  ZLib license
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+
+#ifndef _PARTICLES_KERNEL_H_
+	#define _PARTICLES_KERNEL_H_
+
+	#include <stdio.h>
+	#include <math.h>
+
+	#include "fluid_system_host.cuh"
+
+	#define TOTAL_THREADS		65536
+	#define BLOCK_THREADS		256
+	#define MAX_NBR				80
+	
+	__constant__	FluidParams		simData;		// simulation data (on device)
+	
+	__device__ int				bufNeighbor[ TOTAL_THREADS*MAX_NBR ];
+	__device__ float			bufNdist[ TOTAL_THREADS*MAX_NBR ];	
+
+	#define COLOR(r,g,b)	( (uint((r)*255.0f)<<24) | (uint((g)*255.0f)<<16) | (uint((b)*255.0f)<<8) )
+	#define COLORA(r,g,b,a)	( (uint((r)*255.0f)<<24) | (uint((g)*255.0f)<<16) | (uint((b)*255.0f)<<8) | uint((a)*255.0f) )
+	
+	#define NULL_HASH		333333
+	
+	#define OFFSET_CLR		12
+	#define OFFSET_NEXT		16
+	#define OFFSET_VEL		20
+	#define OFFSET_VEVAL	32
+	#define OFFSET_PRESS	48
+	#define OFFSET_DENS		52
+	#define OFFSET_FORCE	56
+	
+
+	// Computes the grid-cell hash of every particle and stores (hash, index) pairs.
+	//   bufPnts - particle records, simData.stride bytes apart, position (float3) at offset 0
+	//   bufHash - output, one uint2 per launched thread: x = cell hash, y = thread/particle index
+	//   numPnt  - number of valid particles
+	// Threads with ndx >= numPnt and particles whose cell lies outside [0, res-1]
+	// on any axis are given NULL_HASH.
+	// Precondition (unchanged from the original): bufHash holds one entry per launched thread.
+	__global__ void hashParticles ( char* bufPnts, uint2* bufHash, int numPnt )
+	{			
+		uint ndx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;	// particle index
+		uint hash = NULL_HASH;
+		
+		// Only valid particles touch particle memory; padding threads previously
+		// read a position from beyond the last record before being rejected.
+		if ( ndx < numPnt ) {
+			float3* pos = (float3*) (bufPnts + __mul24(ndx, simData.stride) );
+			int gz = (pos->z - simData.min.z) * simData.delta.z ;
+			int gy = (pos->y - simData.min.y) * simData.delta.y ;
+			int gx = (pos->x - simData.min.x) * simData.delta.x ;
+			
+			// FIX: the upper x bound used to test gz against res.x, so particles past the
+			// x edge of the grid were hashed into a wrong (possibly out-of-range) cell.
+			bool inside = gx >= 0 && gx <= simData.res.x-1
+			           && gy >= 0 && gy <= simData.res.y-1
+			           && gz >= 0 && gz <= simData.res.z-1;
+			if ( inside )
+				hash = __mul24(__mul24(gz, (int) simData.res.y)+gy, (int) simData.res.x) + gx;
+		}
+		bufHash[ndx] = make_uint2( hash, ndx );
+		
+		__syncthreads ();
+	}
+	
+	// Builds, for one grid cell per thread, a linked list of the particles hashed to that cell.
+	//   bufPnts - particle records; the int at OFFSET_NEXT receives the next particle in the list
+	//   bufHash - (cell hash, particle index) pairs for numPnt particles
+	//   bufGrid - output; bufGrid[cell] ends up holding the list head (-1 when the cell is empty)
+	//   numGrid - not used by this kernel
+	__global__ void insertParticles ( char* bufPnts, uint2* bufHash, int* bufGrid, int numPnt, int numGrid )
+	{
+		const uint cellIdx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;	// grid cell handled by this thread
+		char* nextField = bufPnts + OFFSET_NEXT;
+		int* cellHead = bufGrid + cellIdx;
+		
+		*cellHead = -1;
+		int n = 0;
+		while ( n < numPnt ) {
+			const uint2 entry = bufHash[n];
+			if ( entry.x == cellIdx ) {
+				// push the particle at the front of this cell's list
+				*(int*) (nextField + __mul24(entry.y, simData.stride)) = *cellHead;
+				*cellHead = entry.y;
+			}
+			n++;
+		}
+		__syncthreads ();
+	}
+	
+	// Records the first entry of each grid cell's run in bufHash and copies particles
+	// into bufPntSort in bufHash order.
+	//   bufPnts    - source particle records (simData.stride bytes apart)
+	//   bufHash    - (cell hash, source particle index) pairs; presumably already sorted by
+	//                hash by an earlier radix sort -- confirm against the host code
+	//   bufGrid    - output; bufGrid[hash] receives the index of the first entry with that hash
+	//   bufPntSort - output particle records, written at position ndx
+	//   numPnt     - number of valid particles; numGrid is not used
+	// sharedHash has 257 slots and is indexed by threadIdx.x+1, so blockDim.x must not exceed 256.
+	__global__ void insertParticlesRadix ( char* bufPnts, uint2* bufHash, int* bufGrid, char* bufPntSort, int numPnt, int numGrid )
+	{
+		uint ndx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;		// particle index		
+		
+		uint2 bufHashSort = bufHash[ndx];
+		
+		// Slot t+1 holds thread t's hash; slot 0 holds the hash of the entry just before this block.
+		__shared__ uint sharedHash[257];	
+		sharedHash[threadIdx.x+1] = bufHashSort.x;
+		if ( ndx > 0 && threadIdx.x == 0 ) {
+			volatile uint2 prevData = bufHash[ndx-1];
+			sharedHash[0]  = prevData.x;
+		}
+		__syncthreads ();
+		
+		// An entry starts a cell's run when its hash differs from its predecessor's.
+		// For ndx == 0 the short-circuit avoids reading sharedHash[0], which was never written.
+		if ( (ndx == 0 || bufHashSort.x != sharedHash[threadIdx.x]) && bufHashSort.x != NULL_HASH ) {
+			bufGrid [ bufHashSort.x ] = ndx;			
+		}
+		if ( ndx < numPnt ) {
+			char* src = bufPnts + __mul24( bufHashSort.y, simData.stride );
+			char* dest = bufPntSort + __mul24( ndx, simData.stride );
+			
+			// Copy position, color, velocity and evaluation velocity from the source record.
+			*(float3*)(dest)				= *(float3*)(src);
+			*(uint*)  (dest + OFFSET_CLR)	= *(uint*)  (src + OFFSET_CLR);
+			*(float3*)(dest + OFFSET_VEL)	= *(float3*)(src + OFFSET_VEL);
+			*(float3*)(dest + OFFSET_VEVAL)	= *(float3*)(src + OFFSET_VEVAL);				
+			
+			// Reset the derived quantities; the OFFSET_NEXT field stores the cell hash here.
+			*(float*) (dest + OFFSET_DENS)	= 0.0;
+			*(float*) (dest + OFFSET_PRESS)	= 0.0;				
+			*(float3*) (dest + OFFSET_FORCE)= make_float3(0,0,0);		
+			*(int*)   (dest + OFFSET_NEXT)	= bufHashSort.x;			
+		} 
+		
+		__syncthreads ();
+		
+	}
+	
+	//__shared__ int ncount [ BLOCK_THREADS ];
+	
+	// Scans the consecutive bufHash entries starting at qndx that carry the hash grid_ndx and
+	// accumulates (r2 - dsq)^3 for every other particle whose scaled squared distance to p is
+	// below simData.r2. Each such particle is also appended to pndx's slice of the neighbor
+	// table (bufNeighbor / bufNdist) while the slice count is below MAX_NBR.
+	//   pndx     - index of the particle being evaluated
+	//   p        - position of that particle
+	//   qndx     - index of the first candidate particle
+	//   grid_ndx - cell hash the candidates must carry
+	//   bufPnts  - particle records, simData.stride bytes apart
+	//   bufHash  - per-particle (hash, index) pairs; only .x is read
+	// Returns the accumulated, un-normalized sum.
+	__device__ float contributePressure ( int pndx, float3* p, int qndx, int grid_ndx, char* bufPnts, uint2* bufHash )
+	{			
+		const float scale = simData.sim_scale;
+		const int base = __mul24(pndx, MAX_NBR);		// bufNeighbor[base] holds the slice's count
+		float total = 0.0;
+		
+		for ( ; qndx < simData.pnts; qndx++ ) {
+			
+			// stop at the end of this cell's run
+			if ( bufHash[qndx].x != grid_ndx || qndx == NULL_HASH) break;
+			
+			if ( qndx == pndx ) continue;			// skip self
+			
+			const float3* other = (float3*) ( bufPnts + __mul24(qndx, simData.stride ));
+			float3 diff;
+			diff.x = ( p->x - other->x )*scale;
+			diff.y = ( p->y - other->y )*scale;
+			diff.z = ( p->z - other->z )*scale;
+			const float distSq = (diff.x*diff.x + diff.y*diff.y + diff.z*diff.z);
+			
+			// written as a negated '<' so NaN distances are rejected exactly as before
+			if ( !( distSq < simData.r2 ) ) continue;
+			
+			const float gap = simData.r2 - distSq;
+			total += gap * gap * gap;
+			if  ( bufNeighbor[base] < MAX_NBR ) {
+				bufNeighbor[ base+bufNeighbor[base] ] = qndx;
+				bufNdist[ base+bufNeighbor[base] ] = sqrt(distSq);
+				bufNeighbor[base]++;
+			}
+		}
+		return total;
+	}
+	
+		/*if  ( ncount[threadIdx.x]  < MAX_NBR ) {
+				bufNeighbor [ nbr + ncount[threadIdx.x]  ] = curr;
+				bufNdist [ nbr + ncount[threadIdx.x]  ] = sqrt(dsq);
+				ncount[threadIdx.x]++;
+		}*/	
+		
+	// Computes pressure and inverse density for each particle and fills its neighbor table slice.
+	//   bufPntSort - particle records in hash order (simData.stride bytes apart)
+	//   bufGrid    - per cell: index of the first particle of that cell in bufPntSort
+	//   bufHash    - per-particle (hash, index) pairs, forwarded to contributePressure
+	//   numPnt     - number of valid particles
+	// Writes OFFSET_PRESS = (density - rest_dens) * stiffness and OFFSET_DENS = 1 / density,
+	// and resets bufNeighbor[ndx*MAX_NBR] (the neighbor count, base 1) before scanning.
+	// NOTE(review): cell coordinates are clamped at 0 but not at the upper grid bound; a particle
+	// beyond simData.max can still index bufGrid out of range -- confirm positions are kept in bounds.
+	__global__ void computePressure ( char* bufPntSort, int* bufGrid, uint2* bufHash, int numPnt )
+	{
+		uint ndx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;	// particle index		
+
+		// FIX: padding threads (ndx >= numPnt) used to process records that were never
+		// written by the sort pass. This kernel has no barrier, so returning early is safe.
+		if ( ndx >= numPnt ) return;
+		
+		float3* pos = (float3*) (bufPntSort + __mul24(ndx, simData.stride));
+
+		// Find 2x2x2 grid cells
+		// - Use registers only, no arrays (local-memory too slow)
+		int3 cell;
+		int gc0, gc1, gc2, gc3;
+		// FIX: the upper z layer defaults to "absent" (-1). These were left uninitialized
+		// when cell.z+1 >= res.z and were then used to index bufGrid.
+		int gc4 = -1, gc5 = -1, gc6 = -1, gc7 = -1;
+		float gs = simData.smooth_rad / simData.sim_scale;		
+
+		cell.x = max(0, (int)((-gs + pos->x - simData.min.x) * simData.delta.x));
+		cell.y = max(0, (int)((-gs + pos->y - simData.min.y) * simData.delta.y));
+		cell.z = max(0, (int)((-gs + pos->z - simData.min.z) * simData.delta.z));		
+		gc0 = __mul24(__mul24(cell.z, simData.res.y) + cell.y, simData.res.x) + cell.x;
+		gc1 = gc0 + 1;
+		gc2 = gc0 + simData.res.x;
+		gc3 = gc2 + 1;
+		if ( cell.z+1 < simData.res.z ) {
+			gc4 = gc0 + __mul24(simData.res.x, simData.res.y);
+			gc5 = gc4 + 1;
+			gc6 = gc4 + simData.res.x;
+			gc7 = gc6 + 1;
+		}
+		// Drop the +x / +y neighbors when the base cell sits on the grid's upper edge.
+		if ( cell.x+1 >= simData.res.x ) {
+			gc1 = -1; gc3 = -1;
+			gc5 = -1; gc7 = -1;
+		}
+		if ( cell.y+1 >= simData.res.y ) {
+			gc2 = -1; gc3 = -1;
+			gc6 = -1; gc7 = -1;
+		}
+		// Sum Pressure
+		float sum = 0.0;		
+		bufNeighbor[ __mul24(ndx, MAX_NBR) ] = 1;		// neighbor count is base 1: slot 0 is the count itself
+		if (gc0 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc0], gc0, bufPntSort, bufHash );
+		if (gc1 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc1], gc1, bufPntSort, bufHash );		
+		if (gc2 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc2], gc2, bufPntSort, bufHash );		
+		if (gc3 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc3], gc3, bufPntSort, bufHash );	
+		if (gc4 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc4], gc4, bufPntSort, bufHash );
+		if (gc5 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc5], gc5, bufPntSort, bufHash );		
+		if (gc6 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc6], gc6, bufPntSort, bufHash );
+		if (gc7 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc7], gc7, bufPntSort, bufHash );
+		
+		// Compute Density & Pressure
+		sum = sum * simData.pmass * simData.poly6kern;
+		if ( sum == 0.0 ) sum = 1.0;					// avoid dividing by zero for isolated particles
+		*(float*) ((char*)pos + OFFSET_PRESS) = ( sum - simData.rest_dens ) * simData.stiffness;
+		*(float*) ((char*)pos + OFFSET_DENS) = 1.0f / sum;			
+	}
+
+	// Adds to `force` the pressure and viscosity contributions of the particles stored
+	// consecutively from qndx that carry the hash grid_ndx and whose scaled squared distance
+	// to p is below simData.r2.
+	//   force    - accumulator, updated in place
+	//   pndx     - index of the particle being evaluated (skipped in the scan)
+	//   p        - start of that particle's record; pressure, density field and evaluation
+	//              velocity are read at OFFSET_PRESS / OFFSET_DENS / OFFSET_VEVAL
+	//   qndx     - index of the first candidate particle
+	//   grid_ndx - cell hash the candidates must carry
+	//   bufPnts  - particle records, simData.stride bytes apart
+	//   bufHash  - per-particle (hash, index) pairs; only .x is read
+	__device__ void contributeForce ( float3& force, int pndx, float3* p, int qndx, int grid_ndx, char* bufPnts, uint2* bufHash )
+	{
+		const char* self = (char*) p;
+		const float selfPress = *(float*) (self + OFFSET_PRESS);
+		const float selfDens = *(float*) (self + OFFSET_DENS);
+		const float3 selfVel = *(float3*) (self + OFFSET_VEVAL );
+		const float scale = simData.sim_scale;
+		const float viscTerm = simData.lapkern * simData.visc;
+		
+		for ( ; qndx < simData.pnts; qndx++ ) {
+			
+			// stop at the end of this cell's run
+			if ( bufHash[qndx].x != grid_ndx || qndx == NULL_HASH) break;
+			
+			if ( qndx == pndx ) continue;			// skip self
+			
+			char* other = bufPnts + __mul24(qndx, simData.stride );
+			const float3* otherPos = (float3*) other;
+			
+			float3 diff;
+			diff.x = ( p->x - otherPos->x )*scale;
+			diff.y = ( p->y - otherPos->y )*scale;
+			diff.z = ( p->z - otherPos->z )*scale;
+			const float distSq = (diff.x*diff.x + diff.y*diff.y + diff.z*diff.z);
+			
+			// written as a negated '<' so NaN distances are rejected exactly as before
+			if ( !( distSq < simData.r2 ) ) continue;
+			
+			const float dist = sqrt(distSq);
+			const float gap = ( simData.smooth_rad - dist );
+			const float pressTerm = -0.5f * gap * simData.spikykern * ( selfPress + *(float*)(other+OFFSET_PRESS) ) / dist;
+			const float densTerm = gap * selfDens * *(float*)(other+OFFSET_DENS);
+			const float3 otherVel = *(float3*)(other+OFFSET_VEVAL);
+			force.x += ( pressTerm * diff.x + viscTerm * ( otherVel.x - selfVel.x )) * densTerm;
+			force.y += ( pressTerm * diff.y + viscTerm * ( otherVel.y - selfVel.y )) * densTerm;
+			force.z += ( pressTerm * diff.z + viscTerm * ( otherVel.z - selfVel.z )) * densTerm;
+		}				
+	}
+	
+	
+	
+	// Grid-based force evaluation: for each particle, accumulates contributeForce over the
+	// 2x2x2 block of cells around it and stores the result at OFFSET_FORCE.
+	//   bufPntSort - particle records in hash order (simData.stride bytes apart)
+	//   bufGrid    - per cell: index of the first particle of that cell in bufPntSort
+	//   bufHash    - per-particle (hash, index) pairs, forwarded to contributeForce
+	//   numPnt     - number of valid particles
+	// NOTE(review): cell coordinates are clamped at 0 but not at the upper grid bound; a particle
+	// beyond simData.max can still index bufGrid out of range -- confirm positions are kept in bounds.
+	__global__ void computeForce ( char* bufPntSort, int* bufGrid, uint2* bufHash, int numPnt )
+	{
+		uint ndx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;	// particle index		
+		
+		// FIX: restore the bounds guard that was commented out. Padding threads must not
+		// process records past the last particle. No barrier follows, so returning is safe.
+		if ( ndx >= numPnt ) return;
+		
+		float3* pos = (float3*) (bufPntSort + __mul24(ndx, simData.stride));				
+		
+		// Find 2x2x2 grid cells
+		// - Use registers only, no arrays (local-memory too slow)
+		int3 cell;
+		int gc0, gc1, gc2, gc3;
+		// FIX: the upper z layer defaults to "absent" (-1). These were left uninitialized
+		// when cell.z+1 >= res.z and were then used to index bufGrid.
+		int gc4 = -1, gc5 = -1, gc6 = -1, gc7 = -1;
+		float gs = simData.smooth_rad / simData.sim_scale;		
+
+		cell.x = max(0, (int)((-gs + pos->x - simData.min.x) * simData.delta.x));
+		cell.y = max(0, (int)((-gs + pos->y - simData.min.y) * simData.delta.y));
+		cell.z = max(0, (int)((-gs + pos->z - simData.min.z) * simData.delta.z));		
+		gc0 = __mul24(__mul24(cell.z, simData.res.y) + cell.y, simData.res.x) + cell.x;
+		gc1 = gc0 + 1;
+		gc2 = gc0 + simData.res.x;
+		gc3 = gc2 + 1;
+		if ( cell.z+1 < simData.res.z ) {
+			gc4 = gc0 + __mul24(simData.res.x, simData.res.y);
+			gc5 = gc4 + 1;
+			gc6 = gc4 + simData.res.x;
+			gc7 = gc6 + 1;
+		}
+		// Drop the +x / +y neighbors when the base cell sits on the grid's upper edge.
+		if ( cell.x+1 >= simData.res.x ) {
+			gc1 = -1; gc3 = -1;
+			gc5 = -1; gc7 = -1;
+		}
+		if ( cell.y+1 >= simData.res.y ) {
+			gc2 = -1; gc3 = -1;
+			gc6 = -1; gc7 = -1;
+		}
+		// Sum forces over the neighboring cells
+		float3 force = make_float3(0,0,0);
+		if (gc0 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc0], gc0, bufPntSort, bufHash );
+		if (gc1 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc1], gc1, bufPntSort, bufHash );		
+		if (gc2 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc2], gc2, bufPntSort, bufHash );		
+		if (gc3 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc3], gc3, bufPntSort, bufHash );	
+		if (gc4 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc4], gc4, bufPntSort, bufHash );
+		if (gc5 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc5], gc5, bufPntSort, bufHash );		
+		if (gc6 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc6], gc6, bufPntSort, bufHash );
+		if (gc7 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc7], gc7, bufPntSort, bufHash );
+		
+		// Update Force
+		*(float3*) ((char*)pos + OFFSET_FORCE ) = force;	
+	}
+
+	
+	// Neighbor-table force evaluation: for each valid particle, walks its slice of
+	// bufNeighbor / bufNdist and stores the accumulated pressure + viscosity force at OFFSET_FORCE.
+	//   bufPntSort - particle records (simData.stride bytes apart)
+	//   numPnt     - number of valid particles; threads beyond it do nothing
+	// Slot 0 of a particle's slice holds the count (base 1), so entries 1..count-1 are neighbors.
+	__global__ void computeForceNbr ( char* bufPntSort, int numPnt )
+	{		
+		const uint ndx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;	// particle index
+		
+		if ( ndx >= numPnt ) return;		// no barrier below, so leaving early is safe
+		
+		char* self = bufPntSort + __mul24(ndx, simData.stride);
+		const float3* pos = (float3*) self;
+		
+		const float selfPress = *(float*) (self + OFFSET_PRESS);
+		const float selfDens = *(float*) (self + OFFSET_DENS);
+		const float3 selfVel = *(float3*) (self + OFFSET_VEVAL );
+		const float scale = simData.sim_scale;
+		const float viscTerm = simData.lapkern * simData.visc;
+		
+		const int base = __mul24(ndx, MAX_NBR);
+		const int count = bufNeighbor[ base ];
+		
+		float3 force = make_float3(0,0,0);
+		for (int j=1; j < count; j++) {
+			const float dist = bufNdist[ base+j ];			// distance stored by the pressure pass
+			char* other = bufPntSort + __mul24( bufNeighbor[ base+j ], simData.stride);
+			const float3* otherPos = (float3*) other;
+			const float gap = ( simData.smooth_rad - dist );
+			
+			float3 diff;
+			diff.x = ( pos->x - otherPos->x )*scale;
+			diff.y = ( pos->y - otherPos->y )*scale;
+			diff.z = ( pos->z - otherPos->z )*scale;
+			
+			const float pressTerm = -0.5f * gap * simData.spikykern * ( selfPress + *(float*)(other+OFFSET_PRESS) ) / dist;
+			const float densTerm = gap * selfDens * *(float*)(other+OFFSET_DENS);
+			const float3 otherVel = *(float3*)(other+OFFSET_VEVAL);
+			force.x += ( pressTerm * diff.x + viscTerm * ( otherVel.x - selfVel.x )) * densTerm;
+			force.y += ( pressTerm * diff.y + viscTerm * ( otherVel.y - selfVel.y )) * densTerm;
+			force.z += ( pressTerm * diff.z + viscTerm * ( otherVel.z - selfVel.z )) * densTerm;
+		}
+		*(float3*) (self + OFFSET_FORCE ) = force;
+	}
+		
+	// Integration step: turns the stored force into an acceleration, updates the velocity,
+	// the evaluation velocity (average of old and new velocity) and the position of each particle.
+	//   bufPntSort - particle records (simData.stride bytes apart), updated in place
+	//   numPnt     - number of valid particles; threads beyond it only reach the barrier
+	//   dt         - time step
+	//   ss         - not used by this kernel
+	__global__ void advanceParticles ( char* bufPntSort, int numPnt, float dt, float ss )
+	{		
+		const uint ndx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;	// particle index
+		
+		if ( ndx < numPnt ) {
+			
+			// Locate this particle's fields
+			char* rec = bufPntSort + __mul24(ndx, simData.stride);
+			float3* pos = (float3*) rec;
+			float3* vel = (float3*) (rec + OFFSET_VEL );
+			float3* velEval = (float3*) (rec + OFFSET_VEVAL );
+			float3 accel = *(float3*) (rec + OFFSET_FORCE );
+			
+			// Leapfrog integration
+			accel.x *= 0.00020543;			// NOTE - To do: SPH_PMASS should be passed in
+			accel.y *= 0.00020543;
+			accel.z *= 0.00020543;
+			accel.z -= 9.8;					// constant pull along -z
+			
+			const float3 vPrev = *vel;
+			float3 vNew;						// v(t+1/2) = v(t-1/2) + a(t) dt
+			vNew.x = accel.x*dt + vPrev.x;
+			vNew.y = accel.y*dt + vPrev.y;
+			vNew.z = accel.z*dt + vPrev.z;
+			
+			float3 vMid;						// v(t+1) = [v(t-1/2) + v(t+1/2)] * 0.5, used to compute forces later
+			vMid.x = (vPrev.x + vNew.x) * 0.5;
+			vMid.y = (vPrev.y + vNew.y) * 0.5;
+			vMid.z = (vPrev.z + vNew.z) * 0.5;
+			
+			*velEval = vMid;
+			*vel = vNew;
+			
+			const float step = dt / simData.sim_scale;
+			float3 pNew;						// p(t+1) = p(t) + v(t+1/2) dt
+			pNew.x = pos->x + vNew.x*step;
+			pNew.y = pos->y + vNew.y*step;
+			pNew.z = pos->z + vNew.z*step;
+			*pos = pNew;
+		}
+		
+		__syncthreads ();
+	}
+
+#endif
--- a/Extras/sph/fluids/fluid_system_kern.cuh
+++ b/Extras/sph/fluids/fluid_system_kern.cuh
@@ -1,45 +1,45 @@
-/*
-  FLUIDS v.1 - SPH Fluid Simulator for CPU and GPU
-  Copyright (C) 2009. Rama Hoetzlein, http://www.rchoetzlein.com
-
-  ZLib license
-  This software is provided 'as-is', without any express or implied
-  warranty.  In no event will the authors be held liable for any damages
-  arising from the use of this software.
-
-  Permission is granted to anyone to use this software for any purpose,
-  including commercial applications, and to alter it and redistribute it
-  freely, subject to the following restrictions:
-
-  1. The origin of this software must not be misrepresented; you must not
-     claim that you wrote the original software. If you use this software
-     in a product, an acknowledgment in the product documentation would be
-     appreciated but is not required.
-  2. Altered source versions must be plainly marked as such, and must not be
-     misrepresented as being the original software.
-  3. This notice may not be removed or altered from any source distribution.
-*/
-
-#ifndef _PARTICLES_KERNEL_H_
-	#define _PARTICLES_KERNEL_H_
-
-	#include <stdio.h>
-	#include <math.h>
-	#include "cutil_math.h"
-	#include "math_constants.h"
-	
-	// Insert particles in grid
-	
-	__global__ void insertParticles ( char* pntData, uint pntStride )
-	{
-		int index = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
-		float4 p = *(float4*) (pntData + index*pntStride);
-
-		// get address in grid
-		int3 gridPos = calcGridPos(p);
-
-		addParticleToCell(gridPos, index, gridCounters, gridCells);
-	}
-
-
-#endif
+/*
+  FLUIDS v.1 - SPH Fluid Simulator for CPU and GPU
+  Copyright (C) 2009. Rama Hoetzlein, http://www.rchoetzlein.com
+
+  ZLib license
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef _PARTICLES_KERNEL_H_
+	#define _PARTICLES_KERNEL_H_
+
+	#include <stdio.h>
+	#include <math.h>
+	#include "cutil_math.h"
+	#include "math_constants.h"
+	
+	// Insert particles in grid
+	
+	// One thread per particle: reads the float4 at the start of the particle's record and hands it to the grid.
+	// NOTE(review): there is no bounds guard - assumes the launch creates exactly one thread per particle; confirm.
+	__global__ void insertParticles ( char* pntData, uint pntStride )
+	{
+		int tid = threadIdx.x + __mul24(blockIdx.x,blockDim.x);		// flat thread index, doubles as particle index
+		char* rec = pntData + tid*pntStride;						// start of this particle's record
+		float4 p = *(float4*) rec;
+		int3 cell = calcGridPos(p);									// get address in grid
+		addParticleToCell(cell, tid, gridCounters, gridCells);
+	}
+
+
+#endif
--- a/Extras/sph/fluids/radixsort.cu
+++ b/Extras/sph/fluids/radixsort.cu
@@ -1,79 +1,79 @@
-/*
- * Copyright 1993-2006 NVIDIA Corporation.  All rights reserved.
- *
- * NOTICE TO USER:
- *
- * This source code is subject to NVIDIA ownership rights under U.S. and
- * international Copyright laws.
- *
- * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
- * CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
- * IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
- * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
- * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
- * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
- * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
- * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
- * OR PERFORMANCE OF THIS SOURCE CODE.
- *
- * U.S. Government End Users.  This source code is a "commercial item" as
- * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting  of
- * "commercial computer software" and "commercial computer software
- * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
- * and is provided to the U.S. Government only as a commercial end item.
- * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
- * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
- * source code with only those rights set forth herein.
- */
-
-/* Radixsort project with key/value and arbitrary datset size support
- * which demonstrates the use of CUDA in a multi phase sorting 
- * computation.
- * Host code.
- */
-
-#include "radixsort.cuh"
-#include "radixsort_kernel.cu"
-
-extern "C"
-{
-
-////////////////////////////////////////////////////////////////////////////////
-//! Perform a radix sort
-//! Sorting performed in place on passed arrays.
-//!
-//! @param pData0       input and output array - data will be sorted
-//! @param pData1       additional array to allow ping pong computation
-//! @param elements     number of elements to sort
-////////////////////////////////////////////////////////////////////////////////
-void RadixSort(KeyValuePair *pData0, KeyValuePair *pData1, uint elements, uint bits)
-{
-    // Round element count to total number of threads for efficiency
-    uint elements_rounded_to_3072;
-    int modval = elements % 3072;
-    if( modval == 0 )
-        elements_rounded_to_3072 = elements;
-    else
-        elements_rounded_to_3072 = elements + (3072 - (modval));
-
-    // Iterate over n bytes of y bit word, using each byte to sort the list in turn
-    for (uint shift = 0; shift < bits; shift += RADIX)
-    {
-        // Perform one round of radix sorting
-
-        // Generate per radix group sums radix counts across a radix group
-        RadixSum<<<NUM_BLOCKS, NUM_THREADS_PER_BLOCK, GRFSIZE>>>(pData0, elements, elements_rounded_to_3072, shift);
-        // Prefix sum in radix groups, and then between groups throughout a block
-        RadixPrefixSum<<<PREFIX_NUM_BLOCKS, PREFIX_NUM_THREADS_PER_BLOCK, PREFIX_GRFSIZE>>>();
-        // Sum the block offsets and then shuffle data into bins
-        RadixAddOffsetsAndShuffle<<<NUM_BLOCKS, NUM_THREADS_PER_BLOCK, SHUFFLE_GRFSIZE>>>(pData0, pData1, elements, elements_rounded_to_3072, shift); 
-
-        // Exchange data pointers
-        KeyValuePair* pTemp = pData0;
-        pData0 = pData1;
-        pData1 = pTemp;
-   }
-}
-
-}
+/*
+ * Copyright 1993-2006 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO USER:
+ *
+ * This source code is subject to NVIDIA ownership rights under U.S. and
+ * international Copyright laws.
+ *
+ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
+ * CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
+ * IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
+ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+ * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+ * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
+ * OR PERFORMANCE OF THIS SOURCE CODE.
+ *
+ * U.S. Government End Users.  This source code is a "commercial item" as
+ * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting  of
+ * "commercial computer software" and "commercial computer software
+ * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
+ * and is provided to the U.S. Government only as a commercial end item.
+ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
+ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
+ * source code with only those rights set forth herein.
+ */
+
+/* Radixsort project with key/value and arbitrary dataset size support
+ * which demonstrates the use of CUDA in a multi phase sorting 
+ * computation.
+ * Host code.
+ */
+
+#include "radixsort.cuh"
+#include "radixsort_kernel.cu"
+
+extern "C"
+{
+
+////////////////////////////////////////////////////////////////////////////////
+//! Perform a radix sort with one pass per RADIX bits of shift.
+//! The sorted data is always left in pData0; pData1 is used as scratch.
+//!
+//! @param pData0       input and output device array - data will be sorted
+//! @param pData1       additional device array to allow ping pong computation
+//! @param elements     number of elements to sort
+//! @param bits         a pass runs for each shift = 0, RADIX, 2*RADIX, ... below bits
+////////////////////////////////////////////////////////////////////////////////
+void RadixSort(KeyValuePair *pData0, KeyValuePair *pData1, uint elements, uint bits)
+{
+    if (elements == 0) return;                  // nothing to sort, skip the launches
+    KeyValuePair* const pOut = pData0;          // buffer the caller reads the result from
+    // Round element count to total number of threads for efficiency
+    const uint modval = elements % 3072;
+    const uint elements_rounded_to_3072 = (modval == 0) ? elements : elements + (3072 - modval);
+
+    for (uint shift = 0; shift < bits; shift += RADIX)
+    {
+        // Generate per radix group sums radix counts across a radix group
+        RadixSum<<<NUM_BLOCKS, NUM_THREADS_PER_BLOCK, GRFSIZE>>>(pData0, elements, elements_rounded_to_3072, shift);
+        // Prefix sum in radix groups, and then between groups throughout a block
+        RadixPrefixSum<<<PREFIX_NUM_BLOCKS, PREFIX_NUM_THREADS_PER_BLOCK, PREFIX_GRFSIZE>>>();
+        // Sum the block offsets and then shuffle data into bins
+        RadixAddOffsetsAndShuffle<<<NUM_BLOCKS, NUM_THREADS_PER_BLOCK, SHUFFLE_GRFSIZE>>>(pData0, pData1, elements, elements_rounded_to_3072, shift); 
+
+        // Exchange data pointers: pData0 now names the buffer this pass wrote
+        KeyValuePair* pTemp = pData0;
+        pData0 = pData1;
+        pData1 = pTemp;
+    }
+    // The swaps above are local: after an odd number of passes the result sits in the caller's pData1,
+    // so copy it back to pData0. The return code is not reported because this function returns void.
+    if (pData0 != pOut)
+        cudaMemcpy(pOut, pData0, elements * sizeof(KeyValuePair), cudaMemcpyDeviceToDevice);
+}
+
+}
--- a/Extras/sph/fluids/radixsort.cuh
+++ b/Extras/sph/fluids/radixsort.cuh
@@ -1,63 +1,63 @@
-/*
- * Copyright 1993-2006 NVIDIA Corporation.  All rights reserved.
- *
- * NOTICE TO USER:
- *
- * This source code is subject to NVIDIA ownership rights under U.S. and
- * international Copyright laws.
- *
- * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
- * CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
- * IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
- * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
- * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
- * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
- * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
- * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
- * OR PERFORMANCE OF THIS SOURCE CODE.
- *
- * U.S. Government End Users.  This source code is a "commercial item" as
- * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting  of
- * "commercial computer software" and "commercial computer software
- * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
- * and is provided to the U.S. Government only as a commercial end item.
- * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
- * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
- * source code with only those rights set forth herein.
- */
-
-/* Radixsort project which demonstrates the use of CUDA in a multi phase
- * sorting computation.
- * Type definitions.
- */
-
-#ifndef _RADIXSORT_H_
-#define _RADIXSORT_H_
-
-#include <host_defines.h>
-
-#define SYNCIT __syncthreads()
-
-// Use 16 bit keys/values
-#define SIXTEEN 0
-
-typedef unsigned int uint;
-typedef unsigned short ushort;
-
-#if SIXTEEN
-typedef struct __align__(4) {
-    ushort key;
-    ushort value;
-#else
-typedef struct __align__(8) {
-    uint key;
-    uint value;
-#endif
-} KeyValuePair;
-
-extern "C" {
-    void RadixSort(KeyValuePair *pData0, KeyValuePair *pData1, uint elements, uint bits);
-}
-
-#endif // #ifndef _RADIXSORT_H_
+/*
+ * Copyright 1993-2006 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO USER:
+ *
+ * This source code is subject to NVIDIA ownership rights under U.S. and
+ * international Copyright laws.
+ *
+ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
+ * CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
+ * IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
+ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+ * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+ * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
+ * OR PERFORMANCE OF THIS SOURCE CODE.
+ *
+ * U.S. Government End Users.  This source code is a "commercial item" as
+ * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting  of
+ * "commercial computer software" and "commercial computer software
+ * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
+ * and is provided to the U.S. Government only as a commercial end item.
+ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
+ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
+ * source code with only those rights set forth herein.
+ */
+
+/* Radixsort project which demonstrates the use of CUDA in a multi phase
+ * sorting computation.
+ * Type definitions.
+ */
+
+#ifndef _RADIXSORT_H_
+#define _RADIXSORT_H_
+
+#include <host_defines.h>
+
+#define SYNCIT __syncthreads()
+
+// Use 16 bit keys/values
+#define SIXTEEN 0
+
+typedef unsigned int uint;
+typedef unsigned short ushort;
+
+// Key/value record passed to RadixSort. SIXTEEN selects the layout:
+//   SIXTEEN != 0 : ushort key and value, struct aligned to 4 bytes
+//   SIXTEEN == 0 : uint key and value, struct aligned to 8 bytes
+//   (SIXTEEN is defined as 0 above, so the uint layout is the one compiled)
+// In both layouts the key member is declared first, then the value member.
+#if SIXTEEN
+typedef struct __align__(4) { ushort key; ushort value; } KeyValuePair;
+#else
+typedef struct __align__(8) { uint key; uint value; } KeyValuePair;
+#endif
+
+extern "C" {
+    void RadixSort(KeyValuePair *pData0, KeyValuePair *pData1, uint elements, uint bits);
+}
+
+#endif // #ifndef _RADIXSORT_H_
--- a/Extras/sph/fluids/radixsort_kernel.cu
+++ b/Extras/sph/fluids/radixsort_kernel.cu