fix: some file didn't have the svn:eol-style native yet

2010-03-06 15:23:36 +00:00
parent 4fd48ac691
commit 81f04a4d48
641 changed files with 301123 additions and 301123 deletions
--- a/Extras/sph/fluids/fluid_system_kern.cu
+++ b/Extras/sph/fluids/fluid_system_kern.cu
@@ -1,402 +1,402 @@
-/*
-  FLUIDS v.1 - SPH Fluid Simulator for CPU and GPU
-  Copyright (C) 2008. Rama Hoetzlein, http://www.rchoetzlein.com
-
-  ZLib license
-  This software is provided 'as-is', without any express or implied
-  warranty.  In no event will the authors be held liable for any damages
-  arising from the use of this software.
-
-  Permission is granted to anyone to use this software for any purpose,
-  including commercial applications, and to alter it and redistribute it
-  freely, subject to the following restrictions:
-
-  1. The origin of this software must not be misrepresented; you must not
-     claim that you wrote the original software. If you use this software
-     in a product, an acknowledgment in the product documentation would be
-     appreciated but is not required.
-  2. Altered source versions must be plainly marked as such, and must not be
-     misrepresented as being the original software.
-  3. This notice may not be removed or altered from any source distribution.
-*/
-
-
-
-#ifndef _PARTICLES_KERNEL_H_
-	#define _PARTICLES_KERNEL_H_
-
-	#include <stdio.h>
-	#include <math.h>
-
-	#include "fluid_system_host.cuh"
-
-	#define TOTAL_THREADS		65536
-	#define BLOCK_THREADS		256
-	#define MAX_NBR				80
-	
-	__constant__	FluidParams		simData;		// simulation data (on device)
-	
-	__device__ int				bufNeighbor[ TOTAL_THREADS*MAX_NBR ];
-	__device__ float			bufNdist[ TOTAL_THREADS*MAX_NBR ];	
-
-	#define COLOR(r,g,b)	( (uint((r)*255.0f)<<24) | (uint((g)*255.0f)<<16) | (uint((b)*255.0f)<<8) )
-	#define COLORA(r,g,b,a)	( (uint((r)*255.0f)<<24) | (uint((g)*255.0f)<<16) | (uint((b)*255.0f)<<8) | uint((a)*255.0f) )
-	
-	#define NULL_HASH		333333
-	
-	#define OFFSET_CLR		12
-	#define OFFSET_NEXT		16
-	#define OFFSET_VEL		20
-	#define OFFSET_VEVAL	32
-	#define OFFSET_PRESS	48
-	#define OFFSET_DENS		52
-	#define OFFSET_FORCE	56
-	
-
-	__global__ void hashParticles ( char* bufPnts, uint2* bufHash, int numPnt )
-	{			
-		uint ndx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;	// particle index
-		float3* pos = (float3*) (bufPnts + __mul24(ndx, simData.stride) );
-		int gz = (pos->z - simData.min.z) * simData.delta.z ;
-		int gy = (pos->y - simData.min.y) * simData.delta.y ;
-		int gx = (pos->x - simData.min.x) * simData.delta.x ;
-		if ( ndx >= numPnt || gx < 0 || gz > simData.res.x-1 || gy < 0 || gy > simData.res.y-1 || gz < 0 || gz > simData.res.z-1 ) 
-			bufHash[ndx] = make_uint2( NULL_HASH, ndx );
-		else
-			bufHash[ndx] = make_uint2( __mul24(__mul24(gz, (int) simData.res.y)+gy, (int) simData.res.x) + gx, ndx );		
-		
-		__syncthreads ();
-	}
-	
-	__global__ void insertParticles ( char* bufPnts, uint2* bufHash, int* bufGrid, int numPnt, int numGrid )
-	{
-		uint grid_ndx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;	// grid cell index		
-		
-		bufPnts += OFFSET_NEXT;
-		bufGrid[grid_ndx] = -1;
-		for (int n=0; n < numPnt; n++) {
-			if ( bufHash[n].x == grid_ndx ) {
-				*(int*) (bufPnts + __mul24(bufHash[n].y, simData.stride)) = bufGrid[grid_ndx];
-				bufGrid[grid_ndx] = bufHash[n].y;
-			}
-		}		
-		__syncthreads ();
-	}
-	
-	__global__ void insertParticlesRadix ( char* bufPnts, uint2* bufHash, int* bufGrid, char* bufPntSort, int numPnt, int numGrid )
-	{
-		uint ndx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;		// particle index		
-		
-		uint2 bufHashSort = bufHash[ndx];
-		
-		__shared__ uint sharedHash[257];	
-		sharedHash[threadIdx.x+1] = bufHashSort.x;
-		if ( ndx > 0 && threadIdx.x == 0 ) {
-			volatile uint2 prevData = bufHash[ndx-1];
-			sharedHash[0]  = prevData.x;
-		}
-		__syncthreads ();
-		
-		if ( (ndx == 0 || bufHashSort.x != sharedHash[threadIdx.x]) && bufHashSort.x != NULL_HASH ) {
-			bufGrid [ bufHashSort.x ] = ndx;			
-		}
-		if ( ndx < numPnt ) {
-			char* src = bufPnts + __mul24( bufHashSort.y, simData.stride );
-			char* dest = bufPntSort + __mul24( ndx, simData.stride );
-			
-			*(float3*)(dest)				= *(float3*)(src);
-			*(uint*)  (dest + OFFSET_CLR)	= *(uint*)  (src + OFFSET_CLR);
-			*(float3*)(dest + OFFSET_VEL)	= *(float3*)(src + OFFSET_VEL);
-			*(float3*)(dest + OFFSET_VEVAL)	= *(float3*)(src + OFFSET_VEVAL);				
-			
-			*(float*) (dest + OFFSET_DENS)	= 0.0;
-			*(float*) (dest + OFFSET_PRESS)	= 0.0;				
-			*(float3*) (dest + OFFSET_FORCE)= make_float3(0,0,0);		
-			*(int*)   (dest + OFFSET_NEXT)	= bufHashSort.x;			
-		} 
-		
-		__syncthreads ();
-		
-	}
-	
-	//__shared__ int ncount [ BLOCK_THREADS ];
-	
-	__device__ float contributePressure ( int pndx, float3* p, int qndx, int grid_ndx, char* bufPnts, uint2* bufHash )
-	{			
-		float3* qpos;		
-		float3 dist;
-		float dsq, c, sum;
-		float d = simData.sim_scale;				
-		int nbr = __mul24(pndx, MAX_NBR);
-						
-		sum = 0.0;		
-		for ( ; qndx < simData.pnts; qndx++ ) {
-			
-			if ( bufHash[qndx].x != grid_ndx || qndx == NULL_HASH) break;
-			
-			if ( qndx != pndx ) {
-				qpos = (float3*) ( bufPnts + __mul24(qndx, simData.stride ));	
-					
-				dist.x = ( p->x - qpos->x )*d;		// dist in cm
-				dist.y = ( p->y - qpos->y )*d;
-				dist.z = ( p->z - qpos->z )*d;			
-				dsq = (dist.x*dist.x + dist.y*dist.y + dist.z*dist.z);			
-				if ( dsq < simData.r2 ) {
-					c = simData.r2 - dsq;
-					sum += c * c * c;				
-					if  ( bufNeighbor[nbr] < MAX_NBR ) {
-						bufNeighbor[ nbr+bufNeighbor[nbr] ] = qndx;
-						bufNdist[ nbr+bufNeighbor[nbr] ] = sqrt(dsq);
-						bufNeighbor[nbr]++;
-					}
-				}				
-			}
-			//curr = *(int*) (bufPnts + __mul24(curr, simData.stride) + OFFSET_NEXT);
-		}		
-		return sum;
-	}
-	
-		/*if  ( ncount[threadIdx.x]  < MAX_NBR ) {
-				bufNeighbor [ nbr + ncount[threadIdx.x]  ] = curr;
-				bufNdist [ nbr + ncount[threadIdx.x]  ] = sqrt(dsq);
-				ncount[threadIdx.x]++;
-		}*/	
-		
-	__global__ void computePressure ( char* bufPntSort, int* bufGrid, uint2* bufHash, int numPnt )
-	{
-		uint ndx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;	// particle index		
-
-		//if ( ndx < 1024 ) {
-		
-		float3* pos = (float3*) (bufPntSort + __mul24(ndx, simData.stride));
-
-		// Find 2x2x2 grid cells
-		// - Use registers only, no arrays (local-memory too slow)
-		int3 cell;
-		int gc0, gc1, gc2, gc3, gc4, gc5, gc6, gc7;					
-		float gs = simData.smooth_rad / simData.sim_scale;		
-
-		cell.x = max(0, (int)((-gs + pos->x - simData.min.x) * simData.delta.x));
-		cell.y = max(0, (int)((-gs + pos->y - simData.min.y) * simData.delta.y));
-		cell.z = max(0, (int)((-gs + pos->z - simData.min.z) * simData.delta.z));		
-		gc0 = __mul24(__mul24(cell.z, simData.res.y) + cell.y, simData.res.x) + cell.x;
-		gc1 = gc0 + 1;
-		gc2 = gc0 + simData.res.x;
-		gc3 = gc2 + 1;
-		if ( cell.z+1 < simData.res.z ) {
-			gc4 = gc0 + __mul24(simData.res.x, simData.res.y);
-			gc5 = gc4 + 1;
-			gc6 = gc4 + simData.res.x;
-			gc7 = gc6 + 1;
-		}
-		if ( cell.x+1 >= simData.res.x ) {
-			gc1 = -1; gc3 = -1;
-			gc5 = -1; gc7 = -1;
-		}
-		if ( cell.y+1 >= simData.res.y ) {
-			gc2 = -1; gc3 = -1;
-			gc6 = -1; gc7 = -1;
-		}
-		// Sum Pressure
-		float sum = 0.0;		
-		bufNeighbor[ __mul24(ndx, MAX_NBR) ] = 1;
-		if (gc0 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc0], gc0, bufPntSort, bufHash );
-		if (gc1 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc1], gc1, bufPntSort, bufHash );		
-		if (gc2 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc2], gc2, bufPntSort, bufHash );		
-		if (gc3 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc3], gc3, bufPntSort, bufHash );	
-		if (gc4 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc4], gc4, bufPntSort, bufHash );
-		if (gc5 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc5], gc5, bufPntSort, bufHash );		
-		if (gc6 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc6], gc6, bufPntSort, bufHash );
-		if (gc7 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc7], gc7, bufPntSort, bufHash );
-		
-		// Compute Density & Pressure
-		sum = sum * simData.pmass * simData.poly6kern;
-		if ( sum == 0.0 ) sum = 1.0;
-		*(float*) ((char*)pos + OFFSET_PRESS) = ( sum - simData.rest_dens ) * simData.stiffness;
-		*(float*) ((char*)pos + OFFSET_DENS) = 1.0f / sum;			
-		
-		//}		
-		//__syncthreads ();
-	}
-
-	__device__ void contributeForce ( float3& force, int pndx, float3* p, int qndx, int grid_ndx, char* bufPnts, uint2* bufHash )
-	{
-		float press = *(float*) ((char*)p + OFFSET_PRESS);
-		float dens = *(float*) ((char*)p + OFFSET_DENS);
-		float3 veval = *(float3*) ((char*)p + OFFSET_VEVAL );
-		float3 qeval, dist;				
-		float c, ndistj, dsq;
-		float pterm, dterm, vterm;		
-		float3* qpos;				
-		float d = simData.sim_scale;				
-		
-		vterm = simData.lapkern * simData.visc;		
-						
-		for ( ; qndx < simData.pnts; qndx++ ) {
-			
-			if ( bufHash[qndx].x != grid_ndx || qndx == NULL_HASH) break;
-			
-			if ( qndx != pndx ) {
-				qpos = (float3*) ( bufPnts + __mul24(qndx, simData.stride ));	
-					
-				dist.x = ( p->x - qpos->x )*d;		// dist in cm
-				dist.y = ( p->y - qpos->y )*d;
-				dist.z = ( p->z - qpos->z )*d;			
-				dsq = (dist.x*dist.x + dist.y*dist.y + dist.z*dist.z);			
-				if ( dsq < simData.r2 ) {				
-					ndistj = sqrt(dsq);
-					c = ( simData.smooth_rad - ndistj ); 
-					dist.x = ( p->x - qpos->x )*d;		// dist in cm
-					dist.y = ( p->y - qpos->y )*d;
-					dist.z = ( p->z - qpos->z )*d;			
-					pterm = -0.5f * c * simData.spikykern * ( press + *(float*)((char*)qpos+OFFSET_PRESS) ) / ndistj;
-					dterm = c * dens * *(float*)((char*)qpos+OFFSET_DENS);	
-					qeval = *(float3*)((char*)qpos+OFFSET_VEVAL);
-					force.x += ( pterm * dist.x + vterm * ( qeval.x - veval.x )) * dterm;
-					force.y += ( pterm * dist.y + vterm * ( qeval.y - veval.y )) * dterm;
-					force.z += ( pterm * dist.z + vterm * ( qeval.z - veval.z )) * dterm;							
-				}
-			}
-		}				
-	}
-	
-	
-	
-	__global__ void computeForce ( char* bufPntSort, int* bufGrid, uint2* bufHash, int numPnt )
-	{
-		uint ndx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;	// particle index		
-		
-		//if ( ndx < numPnt ) {
-		
-		float3* pos = (float3*) (bufPntSort + __mul24(ndx, simData.stride));				
-		
-		// Find 2x2x2 grid cells
-		// - Use registers only, no arrays (local-memory too slow)
-		int3 cell;
-		int gc0, gc1, gc2, gc3, gc4, gc5, gc6, gc7;					
-		float gs = simData.smooth_rad / simData.sim_scale;		
-
-		cell.x = max(0, (int)((-gs + pos->x - simData.min.x) * simData.delta.x));
-		cell.y = max(0, (int)((-gs + pos->y - simData.min.y) * simData.delta.y));
-		cell.z = max(0, (int)((-gs + pos->z - simData.min.z) * simData.delta.z));		
-		gc0 = __mul24(__mul24(cell.z, simData.res.y) + cell.y, simData.res.x) + cell.x;
-		gc1 = gc0 + 1;
-		gc2 = gc0 + simData.res.x;
-		gc3 = gc2 + 1;
-		if ( cell.z+1 < simData.res.z ) {
-			gc4 = gc0 + __mul24(simData.res.x, simData.res.y);
-			gc5 = gc4 + 1;
-			gc6 = gc4 + simData.res.x;
-			gc7 = gc6 + 1;
-		}
-		if ( cell.x+1 >= simData.res.x ) {
-			gc1 = -1; gc3 = -1;
-			gc5 = -1; gc7 = -1;
-		}
-		if ( cell.y+1 >= simData.res.y ) {
-			gc2 = -1; gc3 = -1;
-			gc6 = -1; gc7 = -1;
-		}
-		// Sum Pressure
-		float3 force = make_float3(0,0,0);
-		if (gc0 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc0], gc0, bufPntSort, bufHash );
-		if (gc1 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc1], gc1, bufPntSort, bufHash );		
-		if (gc2 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc2], gc2, bufPntSort, bufHash );		
-		if (gc3 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc3], gc3, bufPntSort, bufHash );	
-		if (gc4 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc4], gc4, bufPntSort, bufHash );
-		if (gc5 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc5], gc5, bufPntSort, bufHash );		
-		if (gc6 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc6], gc6, bufPntSort, bufHash );
-		if (gc7 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc7], gc7, bufPntSort, bufHash );
-		
-		// Update Force
-		*(float3*) ((char*)pos + OFFSET_FORCE ) = force;	
-		
-		//}
-		//__syncthreads ();
-	}
-
-	
-	__global__ void computeForceNbr ( char* bufPntSort, int numPnt )
-	{		
-		uint ndx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;	// particle index		
-		
-		if ( ndx < numPnt ) {
-				
-		float3* pos = (float3*) (bufPntSort + __mul24(ndx, simData.stride));			
-		
-		float3* qpos;
-		float press = *(float*) ((char*)pos + OFFSET_PRESS);
-		float dens = *(float*) ((char*)pos + OFFSET_DENS);
-		float3 veval = *(float3*) ((char*)pos + OFFSET_VEVAL );
-		float3 qeval, dist, force;		
-		float d = simData.sim_scale;
-		float c, ndistj;
-		float pterm, dterm, vterm;
-		vterm = simData.lapkern * simData.visc;
-		int nbr = __mul24(ndx, MAX_NBR);
-		
-		int ncnt = bufNeighbor[ nbr ];		
-		
-		force = make_float3(0,0,0);
-		for (int j=1; j < ncnt; j++) {		// base 1, n[0] = count
-			ndistj = bufNdist[ nbr+j ];
-			qpos = (float3*) (bufPntSort + __mul24( bufNeighbor[ nbr+j ], simData.stride) );
-			c = ( simData.smooth_rad - ndistj ); 
-			dist.x = ( pos->x - qpos->x )*d;		// dist in cm
-			dist.y = ( pos->y - qpos->y )*d;
-			dist.z = ( pos->z - qpos->z )*d;			
-			pterm = -0.5f * c * simData.spikykern * ( press + *(float*)((char*)qpos+OFFSET_PRESS) ) / ndistj;
-			dterm = c * dens * *(float*)((char*)qpos+OFFSET_DENS);	
-			qeval = *(float3*)((char*)qpos+OFFSET_VEVAL);
-			force.x += ( pterm * dist.x + vterm * ( qeval.x - veval.x )) * dterm;
-			force.y += ( pterm * dist.y + vterm * ( qeval.y - veval.y )) * dterm;
-			force.z += ( pterm * dist.z + vterm * ( qeval.z - veval.z )) * dterm;			
-		}
-		*(float3*) ((char*)pos + OFFSET_FORCE ) = force;
-		
-		}	
-	
-	}
-		
-	__global__ void advanceParticles ( char* bufPntSort, int numPnt, float dt, float ss )
-	{		
-		uint ndx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;	// particle index		
-		
-		if ( ndx < numPnt ) {
-				
-			// Get particle vars
-			float3* pos = (float3*) (bufPntSort + __mul24(ndx, simData.stride));			
-			float3* vel = (float3*) ((char*)pos + OFFSET_VEL );
-			float3* vel_eval = (float3*) ((char*)pos + OFFSET_VEVAL );
-			float3 accel = *(float3*) ((char*)pos + OFFSET_FORCE );
-			float3 vcurr, vnext;			
-
-			// Leapfrog integration						
-			accel.x *= 0.00020543;			// NOTE - To do: SPH_PMASS should be passed in			
-			accel.y *= 0.00020543;
-			accel.z *= 0.00020543;			
-			accel.z -= 9.8;	
-			
-			vcurr = *vel;
-			vnext.x = accel.x*dt + vcurr.x;	
-			vnext.y = accel.y*dt + vcurr.y;	
-			vnext.z = accel.z*dt + vcurr.z;			// v(t+1/2) = v(t-1/2) + a(t) dt			
-			
-			accel.x = (vcurr.x + vnext.x) * 0.5;		// v(t+1) = [v(t-1/2) + v(t+1/2)] * 0.5		used to compute forces later
-			accel.y = (vcurr.y + vnext.y) * 0.5;		// v(t+1) = [v(t-1/2) + v(t+1/2)] * 0.5		used to compute forces later
-			accel.z = (vcurr.z + vnext.z) * 0.5;		// v(t+1) = [v(t-1/2) + v(t+1/2)] * 0.5		used to compute forces later
-			
-			*vel_eval = accel;			
-			*vel = vnext;
-			
-			dt /= simData.sim_scale;
-			vnext.x = pos->x + vnext.x*dt;
-			vnext.y = pos->y + vnext.y*dt;
-			vnext.z = pos->z + vnext.z*dt;
-			*pos = vnext;						// p(t+1) = p(t) + v(t+1/2) dt			
-		}	
-		
-		__syncthreads ();	
-	}
-
-#endif
+/*
+  FLUIDS v.1 - SPH Fluid Simulator for CPU and GPU
+  Copyright (C) 2008. Rama Hoetzlein, http://www.rchoetzlein.com
+
+  ZLib license
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+
+#ifndef _PARTICLES_KERNEL_H_
+	#define _PARTICLES_KERNEL_H_
+
+	#include <stdio.h>
+	#include <math.h>
+
+	#include "fluid_system_host.cuh"
+
+	#define TOTAL_THREADS		65536
+	#define BLOCK_THREADS		256
+	#define MAX_NBR				80
+	
+	__constant__	FluidParams		simData;		// simulation data (on device)
+	
+	__device__ int				bufNeighbor[ TOTAL_THREADS*MAX_NBR ];
+	__device__ float			bufNdist[ TOTAL_THREADS*MAX_NBR ];	
+
+	#define COLOR(r,g,b)	( (uint((r)*255.0f)<<24) | (uint((g)*255.0f)<<16) | (uint((b)*255.0f)<<8) )
+	#define COLORA(r,g,b,a)	( (uint((r)*255.0f)<<24) | (uint((g)*255.0f)<<16) | (uint((b)*255.0f)<<8) | uint((a)*255.0f) )
+	
+	#define NULL_HASH		333333
+	
+	#define OFFSET_CLR		12
+	#define OFFSET_NEXT		16
+	#define OFFSET_VEL		20
+	#define OFFSET_VEVAL	32
+	#define OFFSET_PRESS	48
+	#define OFFSET_DENS		52
+	#define OFFSET_FORCE	56
+	
+
+	__global__ void hashParticles ( char* bufPnts, uint2* bufHash, int numPnt )
+	{			
+		uint ndx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;	// particle index
+		float3* pos = (float3*) (bufPnts + __mul24(ndx, simData.stride) );
+		int gz = (pos->z - simData.min.z) * simData.delta.z ;
+		int gy = (pos->y - simData.min.y) * simData.delta.y ;
+		int gx = (pos->x - simData.min.x) * simData.delta.x ;
+		if ( ndx >= numPnt || gx < 0 || gz > simData.res.x-1 || gy < 0 || gy > simData.res.y-1 || gz < 0 || gz > simData.res.z-1 ) 
+			bufHash[ndx] = make_uint2( NULL_HASH, ndx );
+		else
+			bufHash[ndx] = make_uint2( __mul24(__mul24(gz, (int) simData.res.y)+gy, (int) simData.res.x) + gx, ndx );		
+		
+		__syncthreads ();
+	}
+	
+	__global__ void insertParticles ( char* bufPnts, uint2* bufHash, int* bufGrid, int numPnt, int numGrid )
+	{
+		uint grid_ndx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;	// grid cell index		
+		
+		bufPnts += OFFSET_NEXT;
+		bufGrid[grid_ndx] = -1;
+		for (int n=0; n < numPnt; n++) {
+			if ( bufHash[n].x == grid_ndx ) {
+				*(int*) (bufPnts + __mul24(bufHash[n].y, simData.stride)) = bufGrid[grid_ndx];
+				bufGrid[grid_ndx] = bufHash[n].y;
+			}
+		}		
+		__syncthreads ();
+	}
+	
+	__global__ void insertParticlesRadix ( char* bufPnts, uint2* bufHash, int* bufGrid, char* bufPntSort, int numPnt, int numGrid )
+	{
+		uint ndx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;		// particle index		
+		
+		uint2 bufHashSort = bufHash[ndx];
+		
+		__shared__ uint sharedHash[257];	
+		sharedHash[threadIdx.x+1] = bufHashSort.x;
+		if ( ndx > 0 && threadIdx.x == 0 ) {
+			volatile uint2 prevData = bufHash[ndx-1];
+			sharedHash[0]  = prevData.x;
+		}
+		__syncthreads ();
+		
+		if ( (ndx == 0 || bufHashSort.x != sharedHash[threadIdx.x]) && bufHashSort.x != NULL_HASH ) {
+			bufGrid [ bufHashSort.x ] = ndx;			
+		}
+		if ( ndx < numPnt ) {
+			char* src = bufPnts + __mul24( bufHashSort.y, simData.stride );
+			char* dest = bufPntSort + __mul24( ndx, simData.stride );
+			
+			*(float3*)(dest)				= *(float3*)(src);
+			*(uint*)  (dest + OFFSET_CLR)	= *(uint*)  (src + OFFSET_CLR);
+			*(float3*)(dest + OFFSET_VEL)	= *(float3*)(src + OFFSET_VEL);
+			*(float3*)(dest + OFFSET_VEVAL)	= *(float3*)(src + OFFSET_VEVAL);				
+			
+			*(float*) (dest + OFFSET_DENS)	= 0.0;
+			*(float*) (dest + OFFSET_PRESS)	= 0.0;				
+			*(float3*) (dest + OFFSET_FORCE)= make_float3(0,0,0);		
+			*(int*)   (dest + OFFSET_NEXT)	= bufHashSort.x;			
+		} 
+		
+		__syncthreads ();
+		
+	}
+	
+	//__shared__ int ncount [ BLOCK_THREADS ];
+	
+	__device__ float contributePressure ( int pndx, float3* p, int qndx, int grid_ndx, char* bufPnts, uint2* bufHash )
+	{			
+		float3* qpos;		
+		float3 dist;
+		float dsq, c, sum;
+		float d = simData.sim_scale;				
+		int nbr = __mul24(pndx, MAX_NBR);
+						
+		sum = 0.0;		
+		for ( ; qndx < simData.pnts; qndx++ ) {
+			
+			if ( bufHash[qndx].x != grid_ndx || qndx == NULL_HASH) break;
+			
+			if ( qndx != pndx ) {
+				qpos = (float3*) ( bufPnts + __mul24(qndx, simData.stride ));	
+					
+				dist.x = ( p->x - qpos->x )*d;		// dist in cm
+				dist.y = ( p->y - qpos->y )*d;
+				dist.z = ( p->z - qpos->z )*d;			
+				dsq = (dist.x*dist.x + dist.y*dist.y + dist.z*dist.z);			
+				if ( dsq < simData.r2 ) {
+					c = simData.r2 - dsq;
+					sum += c * c * c;				
+					if  ( bufNeighbor[nbr] < MAX_NBR ) {
+						bufNeighbor[ nbr+bufNeighbor[nbr] ] = qndx;
+						bufNdist[ nbr+bufNeighbor[nbr] ] = sqrt(dsq);
+						bufNeighbor[nbr]++;
+					}
+				}				
+			}
+			//curr = *(int*) (bufPnts + __mul24(curr, simData.stride) + OFFSET_NEXT);
+		}		
+		return sum;
+	}
+	
+		/*if  ( ncount[threadIdx.x]  < MAX_NBR ) {
+				bufNeighbor [ nbr + ncount[threadIdx.x]  ] = curr;
+				bufNdist [ nbr + ncount[threadIdx.x]  ] = sqrt(dsq);
+				ncount[threadIdx.x]++;
+		}*/	
+		
+	__global__ void computePressure ( char* bufPntSort, int* bufGrid, uint2* bufHash, int numPnt )
+	{
+		uint ndx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;	// particle index		
+
+		//if ( ndx < 1024 ) {
+		
+		float3* pos = (float3*) (bufPntSort + __mul24(ndx, simData.stride));
+
+		// Find 2x2x2 grid cells
+		// - Use registers only, no arrays (local-memory too slow)
+		int3 cell;
+		int gc0, gc1, gc2, gc3, gc4, gc5, gc6, gc7;					
+		float gs = simData.smooth_rad / simData.sim_scale;		
+
+		cell.x = max(0, (int)((-gs + pos->x - simData.min.x) * simData.delta.x));
+		cell.y = max(0, (int)((-gs + pos->y - simData.min.y) * simData.delta.y));
+		cell.z = max(0, (int)((-gs + pos->z - simData.min.z) * simData.delta.z));		
+		gc0 = __mul24(__mul24(cell.z, simData.res.y) + cell.y, simData.res.x) + cell.x;
+		gc1 = gc0 + 1;
+		gc2 = gc0 + simData.res.x;
+		gc3 = gc2 + 1;
+		if ( cell.z+1 < simData.res.z ) {
+			gc4 = gc0 + __mul24(simData.res.x, simData.res.y);
+			gc5 = gc4 + 1;
+			gc6 = gc4 + simData.res.x;
+			gc7 = gc6 + 1;
+		}
+		if ( cell.x+1 >= simData.res.x ) {
+			gc1 = -1; gc3 = -1;
+			gc5 = -1; gc7 = -1;
+		}
+		if ( cell.y+1 >= simData.res.y ) {
+			gc2 = -1; gc3 = -1;
+			gc6 = -1; gc7 = -1;
+		}
+		// Sum Pressure
+		float sum = 0.0;		
+		bufNeighbor[ __mul24(ndx, MAX_NBR) ] = 1;
+		if (gc0 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc0], gc0, bufPntSort, bufHash );
+		if (gc1 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc1], gc1, bufPntSort, bufHash );		
+		if (gc2 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc2], gc2, bufPntSort, bufHash );		
+		if (gc3 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc3], gc3, bufPntSort, bufHash );	
+		if (gc4 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc4], gc4, bufPntSort, bufHash );
+		if (gc5 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc5], gc5, bufPntSort, bufHash );		
+		if (gc6 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc6], gc6, bufPntSort, bufHash );
+		if (gc7 != -1 ) sum += contributePressure ( ndx, pos, bufGrid[gc7], gc7, bufPntSort, bufHash );
+		
+		// Compute Density & Pressure
+		sum = sum * simData.pmass * simData.poly6kern;
+		if ( sum == 0.0 ) sum = 1.0;
+		*(float*) ((char*)pos + OFFSET_PRESS) = ( sum - simData.rest_dens ) * simData.stiffness;
+		*(float*) ((char*)pos + OFFSET_DENS) = 1.0f / sum;			
+		
+		//}		
+		//__syncthreads ();
+	}
+
+	__device__ void contributeForce ( float3& force, int pndx, float3* p, int qndx, int grid_ndx, char* bufPnts, uint2* bufHash )
+	{
+		float press = *(float*) ((char*)p + OFFSET_PRESS);
+		float dens = *(float*) ((char*)p + OFFSET_DENS);
+		float3 veval = *(float3*) ((char*)p + OFFSET_VEVAL );
+		float3 qeval, dist;				
+		float c, ndistj, dsq;
+		float pterm, dterm, vterm;		
+		float3* qpos;				
+		float d = simData.sim_scale;				
+		
+		vterm = simData.lapkern * simData.visc;		
+						
+		for ( ; qndx < simData.pnts; qndx++ ) {
+			
+			if ( bufHash[qndx].x != grid_ndx || qndx == NULL_HASH) break;
+			
+			if ( qndx != pndx ) {
+				qpos = (float3*) ( bufPnts + __mul24(qndx, simData.stride ));	
+					
+				dist.x = ( p->x - qpos->x )*d;		// dist in cm
+				dist.y = ( p->y - qpos->y )*d;
+				dist.z = ( p->z - qpos->z )*d;			
+				dsq = (dist.x*dist.x + dist.y*dist.y + dist.z*dist.z);			
+				if ( dsq < simData.r2 ) {				
+					ndistj = sqrt(dsq);
+					c = ( simData.smooth_rad - ndistj ); 
+					dist.x = ( p->x - qpos->x )*d;		// dist in cm
+					dist.y = ( p->y - qpos->y )*d;
+					dist.z = ( p->z - qpos->z )*d;			
+					pterm = -0.5f * c * simData.spikykern * ( press + *(float*)((char*)qpos+OFFSET_PRESS) ) / ndistj;
+					dterm = c * dens * *(float*)((char*)qpos+OFFSET_DENS);	
+					qeval = *(float3*)((char*)qpos+OFFSET_VEVAL);
+					force.x += ( pterm * dist.x + vterm * ( qeval.x - veval.x )) * dterm;
+					force.y += ( pterm * dist.y + vterm * ( qeval.y - veval.y )) * dterm;
+					force.z += ( pterm * dist.z + vterm * ( qeval.z - veval.z )) * dterm;							
+				}
+			}
+		}				
+	}
+	
+	
+	
+	__global__ void computeForce ( char* bufPntSort, int* bufGrid, uint2* bufHash, int numPnt )
+	{
+		uint ndx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;	// particle index		
+		
+		//if ( ndx < numPnt ) {
+		
+		float3* pos = (float3*) (bufPntSort + __mul24(ndx, simData.stride));				
+		
+		// Find 2x2x2 grid cells
+		// - Use registers only, no arrays (local-memory too slow)
+		int3 cell;
+		int gc0, gc1, gc2, gc3, gc4, gc5, gc6, gc7;					
+		float gs = simData.smooth_rad / simData.sim_scale;		
+
+		cell.x = max(0, (int)((-gs + pos->x - simData.min.x) * simData.delta.x));
+		cell.y = max(0, (int)((-gs + pos->y - simData.min.y) * simData.delta.y));
+		cell.z = max(0, (int)((-gs + pos->z - simData.min.z) * simData.delta.z));		
+		gc0 = __mul24(__mul24(cell.z, simData.res.y) + cell.y, simData.res.x) + cell.x;
+		gc1 = gc0 + 1;
+		gc2 = gc0 + simData.res.x;
+		gc3 = gc2 + 1;
+		if ( cell.z+1 < simData.res.z ) {
+			gc4 = gc0 + __mul24(simData.res.x, simData.res.y);
+			gc5 = gc4 + 1;
+			gc6 = gc4 + simData.res.x;
+			gc7 = gc6 + 1;
+		}
+		if ( cell.x+1 >= simData.res.x ) {
+			gc1 = -1; gc3 = -1;
+			gc5 = -1; gc7 = -1;
+		}
+		if ( cell.y+1 >= simData.res.y ) {
+			gc2 = -1; gc3 = -1;
+			gc6 = -1; gc7 = -1;
+		}
+		// Sum Pressure
+		float3 force = make_float3(0,0,0);
+		if (gc0 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc0], gc0, bufPntSort, bufHash );
+		if (gc1 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc1], gc1, bufPntSort, bufHash );		
+		if (gc2 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc2], gc2, bufPntSort, bufHash );		
+		if (gc3 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc3], gc3, bufPntSort, bufHash );	
+		if (gc4 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc4], gc4, bufPntSort, bufHash );
+		if (gc5 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc5], gc5, bufPntSort, bufHash );		
+		if (gc6 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc6], gc6, bufPntSort, bufHash );
+		if (gc7 != -1 ) contributeForce ( force, ndx, pos, bufGrid[gc7], gc7, bufPntSort, bufHash );
+		
+		// Update Force
+		*(float3*) ((char*)pos + OFFSET_FORCE ) = force;	
+		
+		//}
+		//__syncthreads ();
+	}
+
+	
+	__global__ void computeForceNbr ( char* bufPntSort, int numPnt )
+	{		
+		uint ndx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;	// particle index		
+		
+		if ( ndx < numPnt ) {
+				
+		float3* pos = (float3*) (bufPntSort + __mul24(ndx, simData.stride));			
+		
+		float3* qpos;
+		float press = *(float*) ((char*)pos + OFFSET_PRESS);
+		float dens = *(float*) ((char*)pos + OFFSET_DENS);
+		float3 veval = *(float3*) ((char*)pos + OFFSET_VEVAL );
+		float3 qeval, dist, force;		
+		float d = simData.sim_scale;
+		float c, ndistj;
+		float pterm, dterm, vterm;
+		vterm = simData.lapkern * simData.visc;
+		int nbr = __mul24(ndx, MAX_NBR);
+		
+		int ncnt = bufNeighbor[ nbr ];		
+		
+		force = make_float3(0,0,0);
+		for (int j=1; j < ncnt; j++) {		// base 1, n[0] = count
+			ndistj = bufNdist[ nbr+j ];
+			qpos = (float3*) (bufPntSort + __mul24( bufNeighbor[ nbr+j ], simData.stride) );
+			c = ( simData.smooth_rad - ndistj ); 
+			dist.x = ( pos->x - qpos->x )*d;		// dist in cm
+			dist.y = ( pos->y - qpos->y )*d;
+			dist.z = ( pos->z - qpos->z )*d;			
+			pterm = -0.5f * c * simData.spikykern * ( press + *(float*)((char*)qpos+OFFSET_PRESS) ) / ndistj;
+			dterm = c * dens * *(float*)((char*)qpos+OFFSET_DENS);	
+			qeval = *(float3*)((char*)qpos+OFFSET_VEVAL);
+			force.x += ( pterm * dist.x + vterm * ( qeval.x - veval.x )) * dterm;
+			force.y += ( pterm * dist.y + vterm * ( qeval.y - veval.y )) * dterm;
+			force.z += ( pterm * dist.z + vterm * ( qeval.z - veval.z )) * dterm;			
+		}
+		*(float3*) ((char*)pos + OFFSET_FORCE ) = force;
+		
+		}	
+	
+	}
+		
+	__global__ void advanceParticles ( char* bufPntSort, int numPnt, float dt, float ss )
+	{		
+		uint ndx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;	// particle index		
+		
+		if ( ndx < numPnt ) {
+				
+			// Get particle vars
+			float3* pos = (float3*) (bufPntSort + __mul24(ndx, simData.stride));			
+			float3* vel = (float3*) ((char*)pos + OFFSET_VEL );
+			float3* vel_eval = (float3*) ((char*)pos + OFFSET_VEVAL );
+			float3 accel = *(float3*) ((char*)pos + OFFSET_FORCE );
+			float3 vcurr, vnext;			
+
+			// Leapfrog integration						
+			accel.x *= 0.00020543;			// NOTE - To do: SPH_PMASS should be passed in			
+			accel.y *= 0.00020543;
+			accel.z *= 0.00020543;			
+			accel.z -= 9.8;	
+			
+			vcurr = *vel;
+			vnext.x = accel.x*dt + vcurr.x;	
+			vnext.y = accel.y*dt + vcurr.y;	
+			vnext.z = accel.z*dt + vcurr.z;			// v(t+1/2) = v(t-1/2) + a(t) dt			
+			
+			accel.x = (vcurr.x + vnext.x) * 0.5;		// v(t+1) = [v(t-1/2) + v(t+1/2)] * 0.5		used to compute forces later
+			accel.y = (vcurr.y + vnext.y) * 0.5;		// v(t+1) = [v(t-1/2) + v(t+1/2)] * 0.5		used to compute forces later
+			accel.z = (vcurr.z + vnext.z) * 0.5;		// v(t+1) = [v(t-1/2) + v(t+1/2)] * 0.5		used to compute forces later
+			
+			*vel_eval = accel;			
+			*vel = vnext;
+			
+			dt /= simData.sim_scale;
+			vnext.x = pos->x + vnext.x*dt;
+			vnext.y = pos->y + vnext.y*dt;
+			vnext.z = pos->z + vnext.z*dt;
+			*pos = vnext;						// p(t+1) = p(t) + v(t+1/2) dt			
+		}	
+		
+		__syncthreads ();	
+	}
+
+#endif