Added btCudaBroadphase, some early research & development work to accelerate Bullet using CUDA

Re-uses the NVidia particle demo.
erwin.coumans
2008-09-04 23:24:11 +00:00
parent d8a5bf2c9c
commit aef74321d7
25 changed files with 6570 additions and 0 deletions

Extras/CUDA/README.txt Normal file

@@ -0,0 +1,7 @@
btCudaBroadphase is early research and development work: a first attempt to use CUDA in Bullet.
It re-uses the uniform grid broadphase from the NVidia CUDA particle demo as the Bullet broadphase.
Press 's' to toggle between the original CUDA particle demo and Bullet+btCudaBroadphase.
See some related discussion here:
http://www.bulletphysics.com/Bullet/phpBB3/viewtopic.php?f=9&t=500&start=105
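
A minimal usage sketch (mirroring ParticleSystem::initializeBullet from this commit; filling in SimParams is assumed to happen elsewhere in the demo):

  SimParams params;  // grid size, numBodies, gravity, ... set up by the demo
  btDefaultCollisionConfiguration* config = new btDefaultCollisionConfiguration();
  btCollisionDispatcher* dispatcher = new btCollisionDispatcher(config);
  btCudaBroadphase* broadphase = new btCudaBroadphase(params, params.numBodies + 6);
  btSequentialImpulseConstraintSolver* solver = new btSequentialImpulseConstraintSolver();
  btDiscreteDynamicsWorld* world = new btDiscreteDynamicsWorld(dispatcher, broadphase, solver, config);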

File diff suppressed because it is too large.


@@ -0,0 +1,124 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2003-2008 Erwin Coumans http://continuousphysics.com/Bullet/
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#ifndef CUDA_BROADPHASE_H
#define CUDA_BROADPHASE_H
#include "BulletCollision/BroadphaseCollision/btSimpleBroadphase.h"
///btCudaBroadphase uses CUDA to compute overlapping pairs on the GPU.
class btCudaBroadphase : public btSimpleBroadphase
{
bool m_bInitialized;
int m_numParticles;
// CPU data
float* m_hPos;
float* m_hVel;
float* m_hSortedPos;
unsigned int* m_hGridCounters;
unsigned int* m_hGridCells;
unsigned int* m_hParticleHash;
unsigned int* m_hCellStart;
// GPU data
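// Position and velocity live in double-buffered (ping-pong) device arrays;
// m_currentPosRead/m_currentPosWrite and m_currentVelRead/m_currentVelWrite
// below select which half is read and which is written each step.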
float* m_dPos[2];
float* m_dVel[2];
float* m_dSortedPos;
float* m_dSortedVel;
// uniform grid data
unsigned int* m_dGridCounters; // counts number of entries per grid cell
unsigned int* m_dGridCells; // contains indices of up to "m_maxParticlesPerCell" particles per cell
unsigned int* m_dParticleHash[2];
unsigned int* m_dCellStart;
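// Sorted-grid pipeline per step (see the accompanying .cu code in this commit):
// calcHash emits a (cellHash, particleIndex) pair per particle, a radix sort
// groups particles by cell, and reorderDataAndFindCellStart records where each
// cell begins in m_dCellStart, so collision tests only visit neighbouring cells.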
unsigned int m_posVbo[2];
unsigned int m_colorVBO;
unsigned int m_currentPosRead, m_currentVelRead;
unsigned int m_currentPosWrite, m_currentVelWrite;
// params
struct SimParams& m_simParams;
unsigned int m_maxParticlesPerCell;
protected:
unsigned int createVBO(unsigned int size);
void _initialize(int numParticles);
void _finalize();
public:
enum ParticleArray
{
POSITION,
VELOCITY,
};
enum ParticleConfig
{
CONFIG_RANDOM,
CONFIG_GRID,
_NUM_CONFIGS
};
btCudaBroadphase(SimParams& simParams,int maxProxies);
virtual ~btCudaBroadphase();
void initGrid(unsigned int* size, float spacing, float jitter, unsigned int numParticles);
void reset(ParticleConfig config);
void setArray(ParticleArray array, const float* data, int start, int count);
float* getArray(ParticleArray array);
void addSphere(int start, float *pos, float *vel, int r, float spacing);
virtual void calculateOverlappingPairs(btDispatcher* dispatcher);
unsigned int getCurrentReadBuffer() const { return m_posVbo[m_currentPosRead]; }
unsigned int getColorBuffer() const { return m_colorVBO; }
void dumpParticles(unsigned int start, unsigned int count);
void dumpGrid();
float* copyBuffersFromDeviceToHost();
void copyBuffersFromHostToDevice();
float* getHvelPtr();
float* getHposPtr();
void quickHack(float deltaTime);
void quickHack2();
void integrate();
};
#endif //CUDA_BROADPHASE_H


@@ -0,0 +1,113 @@

Microsoft Visual Studio Solution File, Format Version 9.00
# Visual Studio 2005
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "btCudaBroadphase", "btCudaBroadphase.vcproj", "{AF7F45C7-8545-4CA2-B835-FDE8823B7B09}"
ProjectSection(ProjectDependencies) = postProject
{6ADA430D-009C-2ED4-A787-2AC2D6FEB8CE} = {6ADA430D-009C-2ED4-A787-2AC2D6FEB8CE}
{61BD1097-CF2E-B296-DAA9-73A6FE135319} = {61BD1097-CF2E-B296-DAA9-73A6FE135319}
{7D6E339F-9C2C-31DA-FDB0-5EE50973CF2A} = {7D6E339F-9C2C-31DA-FDB0-5EE50973CF2A}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "libbulletmath", "..\..\msvc\8\libbulletmath.vcproj", "{7D6E339F-9C2C-31DA-FDB0-5EE50973CF2A}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "libbulletdynamics", "..\..\msvc\8\libbulletdynamics.vcproj", "{61BD1097-CF2E-B296-DAA9-73A6FE135319}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "libbulletcollision", "..\..\msvc\8\libbulletcollision.vcproj", "{6ADA430D-009C-2ED4-A787-2AC2D6FEB8CE}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Win32 = Debug|Win32
Debug|x64 = Debug|x64
DebugDoublePrecision|Win32 = DebugDoublePrecision|Win32
DebugDoublePrecision|x64 = DebugDoublePrecision|x64
EmuDebug|Win32 = EmuDebug|Win32
EmuDebug|x64 = EmuDebug|x64
EmuRelease|Win32 = EmuRelease|Win32
EmuRelease|x64 = EmuRelease|x64
Release|Win32 = Release|Win32
Release|x64 = Release|x64
ReleaseDoublePrecision|Win32 = ReleaseDoublePrecision|Win32
ReleaseDoublePrecision|x64 = ReleaseDoublePrecision|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{AF7F45C7-8545-4CA2-B835-FDE8823B7B09}.Debug|Win32.ActiveCfg = Debug|Win32
{AF7F45C7-8545-4CA2-B835-FDE8823B7B09}.Debug|Win32.Build.0 = Debug|Win32
{AF7F45C7-8545-4CA2-B835-FDE8823B7B09}.Debug|x64.ActiveCfg = Debug|x64
{AF7F45C7-8545-4CA2-B835-FDE8823B7B09}.Debug|x64.Build.0 = Debug|x64
{AF7F45C7-8545-4CA2-B835-FDE8823B7B09}.DebugDoublePrecision|Win32.ActiveCfg = Debug|x64
{AF7F45C7-8545-4CA2-B835-FDE8823B7B09}.DebugDoublePrecision|x64.ActiveCfg = Debug|x64
{AF7F45C7-8545-4CA2-B835-FDE8823B7B09}.DebugDoublePrecision|x64.Build.0 = Debug|x64
{AF7F45C7-8545-4CA2-B835-FDE8823B7B09}.EmuDebug|Win32.ActiveCfg = EmuDebug|Win32
{AF7F45C7-8545-4CA2-B835-FDE8823B7B09}.EmuDebug|Win32.Build.0 = EmuDebug|Win32
{AF7F45C7-8545-4CA2-B835-FDE8823B7B09}.EmuDebug|x64.ActiveCfg = EmuDebug|x64
{AF7F45C7-8545-4CA2-B835-FDE8823B7B09}.EmuDebug|x64.Build.0 = EmuDebug|x64
{AF7F45C7-8545-4CA2-B835-FDE8823B7B09}.EmuRelease|Win32.ActiveCfg = EmuRelease|Win32
{AF7F45C7-8545-4CA2-B835-FDE8823B7B09}.EmuRelease|Win32.Build.0 = EmuRelease|Win32
{AF7F45C7-8545-4CA2-B835-FDE8823B7B09}.EmuRelease|x64.ActiveCfg = EmuRelease|x64
{AF7F45C7-8545-4CA2-B835-FDE8823B7B09}.EmuRelease|x64.Build.0 = EmuRelease|x64
{AF7F45C7-8545-4CA2-B835-FDE8823B7B09}.Release|Win32.ActiveCfg = Release|Win32
{AF7F45C7-8545-4CA2-B835-FDE8823B7B09}.Release|Win32.Build.0 = Release|Win32
{AF7F45C7-8545-4CA2-B835-FDE8823B7B09}.Release|x64.ActiveCfg = Release|x64
{AF7F45C7-8545-4CA2-B835-FDE8823B7B09}.Release|x64.Build.0 = Release|x64
{AF7F45C7-8545-4CA2-B835-FDE8823B7B09}.ReleaseDoublePrecision|Win32.ActiveCfg = Release|x64
{AF7F45C7-8545-4CA2-B835-FDE8823B7B09}.ReleaseDoublePrecision|x64.ActiveCfg = Release|x64
{AF7F45C7-8545-4CA2-B835-FDE8823B7B09}.ReleaseDoublePrecision|x64.Build.0 = Release|x64
{7D6E339F-9C2C-31DA-FDB0-5EE50973CF2A}.Debug|Win32.ActiveCfg = Debug|Win32
{7D6E339F-9C2C-31DA-FDB0-5EE50973CF2A}.Debug|Win32.Build.0 = Debug|Win32
{7D6E339F-9C2C-31DA-FDB0-5EE50973CF2A}.Debug|x64.ActiveCfg = Debug|Win32
{7D6E339F-9C2C-31DA-FDB0-5EE50973CF2A}.DebugDoublePrecision|Win32.ActiveCfg = DebugDoublePrecision|Win32
{7D6E339F-9C2C-31DA-FDB0-5EE50973CF2A}.DebugDoublePrecision|Win32.Build.0 = DebugDoublePrecision|Win32
{7D6E339F-9C2C-31DA-FDB0-5EE50973CF2A}.DebugDoublePrecision|x64.ActiveCfg = DebugDoublePrecision|Win32
{7D6E339F-9C2C-31DA-FDB0-5EE50973CF2A}.EmuDebug|Win32.ActiveCfg = Debug|Win32
{7D6E339F-9C2C-31DA-FDB0-5EE50973CF2A}.EmuDebug|Win32.Build.0 = Debug|Win32
{7D6E339F-9C2C-31DA-FDB0-5EE50973CF2A}.EmuDebug|x64.ActiveCfg = Debug|Win32
{7D6E339F-9C2C-31DA-FDB0-5EE50973CF2A}.EmuRelease|Win32.ActiveCfg = Release|Win32
{7D6E339F-9C2C-31DA-FDB0-5EE50973CF2A}.EmuRelease|Win32.Build.0 = Release|Win32
{7D6E339F-9C2C-31DA-FDB0-5EE50973CF2A}.EmuRelease|x64.ActiveCfg = Release|Win32
{7D6E339F-9C2C-31DA-FDB0-5EE50973CF2A}.Release|Win32.ActiveCfg = Release|Win32
{7D6E339F-9C2C-31DA-FDB0-5EE50973CF2A}.Release|Win32.Build.0 = Release|Win32
{7D6E339F-9C2C-31DA-FDB0-5EE50973CF2A}.Release|x64.ActiveCfg = Release|Win32
{7D6E339F-9C2C-31DA-FDB0-5EE50973CF2A}.ReleaseDoublePrecision|Win32.ActiveCfg = ReleaseDoublePrecision|Win32
{7D6E339F-9C2C-31DA-FDB0-5EE50973CF2A}.ReleaseDoublePrecision|Win32.Build.0 = ReleaseDoublePrecision|Win32
{7D6E339F-9C2C-31DA-FDB0-5EE50973CF2A}.ReleaseDoublePrecision|x64.ActiveCfg = ReleaseDoublePrecision|Win32
{61BD1097-CF2E-B296-DAA9-73A6FE135319}.Debug|Win32.ActiveCfg = Debug|Win32
{61BD1097-CF2E-B296-DAA9-73A6FE135319}.Debug|Win32.Build.0 = Debug|Win32
{61BD1097-CF2E-B296-DAA9-73A6FE135319}.Debug|x64.ActiveCfg = Debug|Win32
{61BD1097-CF2E-B296-DAA9-73A6FE135319}.DebugDoublePrecision|Win32.ActiveCfg = DebugDoublePrecision|Win32
{61BD1097-CF2E-B296-DAA9-73A6FE135319}.DebugDoublePrecision|Win32.Build.0 = DebugDoublePrecision|Win32
{61BD1097-CF2E-B296-DAA9-73A6FE135319}.DebugDoublePrecision|x64.ActiveCfg = DebugDoublePrecision|Win32
{61BD1097-CF2E-B296-DAA9-73A6FE135319}.EmuDebug|Win32.ActiveCfg = Debug|Win32
{61BD1097-CF2E-B296-DAA9-73A6FE135319}.EmuDebug|Win32.Build.0 = Debug|Win32
{61BD1097-CF2E-B296-DAA9-73A6FE135319}.EmuDebug|x64.ActiveCfg = Debug|Win32
{61BD1097-CF2E-B296-DAA9-73A6FE135319}.EmuRelease|Win32.ActiveCfg = Release|Win32
{61BD1097-CF2E-B296-DAA9-73A6FE135319}.EmuRelease|Win32.Build.0 = Release|Win32
{61BD1097-CF2E-B296-DAA9-73A6FE135319}.EmuRelease|x64.ActiveCfg = Release|Win32
{61BD1097-CF2E-B296-DAA9-73A6FE135319}.Release|Win32.ActiveCfg = Release|Win32
{61BD1097-CF2E-B296-DAA9-73A6FE135319}.Release|Win32.Build.0 = Release|Win32
{61BD1097-CF2E-B296-DAA9-73A6FE135319}.Release|x64.ActiveCfg = Release|Win32
{61BD1097-CF2E-B296-DAA9-73A6FE135319}.ReleaseDoublePrecision|Win32.ActiveCfg = ReleaseDoublePrecision|Win32
{61BD1097-CF2E-B296-DAA9-73A6FE135319}.ReleaseDoublePrecision|Win32.Build.0 = ReleaseDoublePrecision|Win32
{61BD1097-CF2E-B296-DAA9-73A6FE135319}.ReleaseDoublePrecision|x64.ActiveCfg = ReleaseDoublePrecision|Win32
{6ADA430D-009C-2ED4-A787-2AC2D6FEB8CE}.Debug|Win32.ActiveCfg = Debug|Win32
{6ADA430D-009C-2ED4-A787-2AC2D6FEB8CE}.Debug|Win32.Build.0 = Debug|Win32
{6ADA430D-009C-2ED4-A787-2AC2D6FEB8CE}.Debug|x64.ActiveCfg = Debug|Win32
{6ADA430D-009C-2ED4-A787-2AC2D6FEB8CE}.DebugDoublePrecision|Win32.ActiveCfg = DebugDoublePrecision|Win32
{6ADA430D-009C-2ED4-A787-2AC2D6FEB8CE}.DebugDoublePrecision|Win32.Build.0 = DebugDoublePrecision|Win32
{6ADA430D-009C-2ED4-A787-2AC2D6FEB8CE}.DebugDoublePrecision|x64.ActiveCfg = DebugDoublePrecision|Win32
{6ADA430D-009C-2ED4-A787-2AC2D6FEB8CE}.EmuDebug|Win32.ActiveCfg = Debug|Win32
{6ADA430D-009C-2ED4-A787-2AC2D6FEB8CE}.EmuDebug|Win32.Build.0 = Debug|Win32
{6ADA430D-009C-2ED4-A787-2AC2D6FEB8CE}.EmuDebug|x64.ActiveCfg = Debug|Win32
{6ADA430D-009C-2ED4-A787-2AC2D6FEB8CE}.EmuRelease|Win32.ActiveCfg = Release|Win32
{6ADA430D-009C-2ED4-A787-2AC2D6FEB8CE}.EmuRelease|Win32.Build.0 = Release|Win32
{6ADA430D-009C-2ED4-A787-2AC2D6FEB8CE}.EmuRelease|x64.ActiveCfg = Release|Win32
{6ADA430D-009C-2ED4-A787-2AC2D6FEB8CE}.Release|Win32.ActiveCfg = Release|Win32
{6ADA430D-009C-2ED4-A787-2AC2D6FEB8CE}.Release|Win32.Build.0 = Release|Win32
{6ADA430D-009C-2ED4-A787-2AC2D6FEB8CE}.Release|x64.ActiveCfg = Release|Win32
{6ADA430D-009C-2ED4-A787-2AC2D6FEB8CE}.ReleaseDoublePrecision|Win32.ActiveCfg = ReleaseDoublePrecision|Win32
{6ADA430D-009C-2ED4-A787-2AC2D6FEB8CE}.ReleaseDoublePrecision|Win32.Build.0 = ReleaseDoublePrecision|Win32
{6ADA430D-009C-2ED4-A787-2AC2D6FEB8CE}.ReleaseDoublePrecision|x64.ActiveCfg = ReleaseDoublePrecision|Win32
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

File diff suppressed because it is too large.


@@ -0,0 +1,86 @@
/*
* Copyright 1993-2006 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO USER:
*
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws.
*
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
* OR PERFORMANCE OF THIS SOURCE CODE.
*
* U.S. Government End Users. This source code is a "commercial item" as
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
* "commercial computer software" and "commercial computer software
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
* and is provided to the U.S. Government only as a commercial end item.
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
* source code with only those rights set forth herein.
*/
#ifndef CUTIL_GL_ERROR
#define CUTIL_GL_ERROR
/* CUda UTility Library */
// includes, system
#include <stdio.h>
#include <stdlib.h>
#ifdef _WIN32
# define WINDOWS_LEAN_AND_MEAN
# include <windows.h>
# include <stdlib.h>
# undef min
# undef max
#endif
// includes, graphics
#if defined (__APPLE__) || defined(MACOSX)
#include <OpenGL/gl.h>
#include <OpenGL/glu.h>
#else
#include <GL/gl.h>
#include <GL/glu.h>
#endif
////////////////////////////////////////////////////////////////////////////
//! Check for OpenGL error
//! @return CUTTrue if no GL error has been encountered, otherwise CUTFalse
//! @param file __FILE__ macro
//! @param line __LINE__ macro
//! @note The GL error is listed on stderr
//! @note This function should be used via the CUT_CHECK_ERROR_GL() macro
////////////////////////////////////////////////////////////////////////////
CUTBoolean CUTIL_API
cutCheckErrorGL( const char* file, const int line)
{
CUTBoolean ret_val = CUTTrue;
// check for error
GLenum gl_error = glGetError();
if (gl_error != GL_NO_ERROR)
{
fprintf(stderr, "GL Error in file '%s' in line %d :\n", file, line);
fprintf(stderr, "%s\n", gluErrorString(gl_error));
ret_val = CUTFalse;
}
return ret_val;
}
#ifdef _DEBUG
#define CUT_CHECK_ERROR_GL() \
if( CUTFalse == cutCheckErrorGL( __FILE__, __LINE__)) { \
exit(EXIT_FAILURE); \
}
#endif // _DEBUG
#endif // CUTIL_GL_ERROR

Extras/CUDA/cutil_math.h Normal file

@@ -0,0 +1,767 @@
/*
* Copyright 1993-2007 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO USER:
*
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software.
*
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
* OR PERFORMANCE OF THIS SOURCE CODE.
*
* U.S. Government End Users. This source code is a "commercial item" as
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
* "commercial computer software" and "commercial computer software
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
* and is provided to the U.S. Government only as a commercial end item.
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
* source code with only those rights set forth herein.
*
* Any use of this source code in individual and commercial software must
* include, in the user documentation and internal comments to the code,
* the above Disclaimer and U.S. Government End Users Notice.
*/
/*
This file implements common mathematical operations on vector types
(float3, float4 etc.) since these are not provided as standard by CUDA.
The syntax is modelled on the Cg standard library.
*/
#ifndef CUTIL_MATH_H
#define CUTIL_MATH_H
#include "cuda_runtime.h"
////////////////////////////////////////////////////////////////////////////////
typedef unsigned int uint;
typedef unsigned short ushort;
#ifndef __CUDACC__
#include <math.h>
inline float fminf(float a, float b)
{
return a < b ? a : b;
}
inline float fmaxf(float a, float b)
{
return a > b ? a : b;
}
inline int max(int a, int b)
{
return a > b ? a : b;
}
inline int min(int a, int b)
{
return a < b ? a : b;
}
#endif
// float functions
////////////////////////////////////////////////////////////////////////////////
// lerp
inline __device__ __host__ float lerp(float a, float b, float t)
{
return a + t*(b-a);
}
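// e.g. lerp(2.0f, 4.0f, 0.25f) = 2.0f + 0.25f*(4.0f - 2.0f) = 2.5f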
// clamp
inline __device__ __host__ float clamp(float f, float a, float b)
{
return fmaxf(a, fminf(f, b));
}
// int2 functions
////////////////////////////////////////////////////////////////////////////////
// negate
inline __host__ __device__ int2 operator-(int2 &a)
{
return make_int2(-a.x, -a.y);
}
// addition
inline __host__ __device__ int2 operator+(int2 a, int2 b)
{
return make_int2(a.x + b.x, a.y + b.y);
}
inline __host__ __device__ void operator+=(int2 &a, int2 b)
{
a.x += b.x; a.y += b.y;
}
// subtract
inline __host__ __device__ int2 operator-(int2 a, int2 b)
{
return make_int2(a.x - b.x, a.y - b.y);
}
inline __host__ __device__ void operator-=(int2 &a, int2 b)
{
a.x -= b.x; a.y -= b.y;
}
// multiply
inline __host__ __device__ int2 operator*(int2 a, int2 b)
{
return make_int2(a.x * b.x, a.y * b.y);
}
inline __host__ __device__ int2 operator*(int2 a, int s)
{
return make_int2(a.x * s, a.y * s);
}
inline __host__ __device__ int2 operator*(int s, int2 a)
{
return make_int2(a.x * s, a.y * s);
}
inline __host__ __device__ void operator*=(int2 &a, int s)
{
a.x *= s; a.y *= s;
}
// float2 functions
////////////////////////////////////////////////////////////////////////////////
// additional constructors
inline __host__ __device__ float2 make_float2(float s)
{
return make_float2(s, s);
}
inline __host__ __device__ float2 make_float2(int2 a)
{
return make_float2(float(a.x), float(a.y));
}
// negate
inline __host__ __device__ float2 operator-(float2 &a)
{
return make_float2(-a.x, -a.y);
}
// addition
inline __host__ __device__ float2 operator+(float2 a, float2 b)
{
return make_float2(a.x + b.x, a.y + b.y);
}
inline __host__ __device__ void operator+=(float2 &a, float2 b)
{
a.x += b.x; a.y += b.y;
}
// subtract
inline __host__ __device__ float2 operator-(float2 a, float2 b)
{
return make_float2(a.x - b.x, a.y - b.y);
}
inline __host__ __device__ void operator-=(float2 &a, float2 b)
{
a.x -= b.x; a.y -= b.y;
}
// multiply
inline __host__ __device__ float2 operator*(float2 a, float2 b)
{
return make_float2(a.x * b.x, a.y * b.y);
}
inline __host__ __device__ float2 operator*(float2 a, float s)
{
return make_float2(a.x * s, a.y * s);
}
inline __host__ __device__ float2 operator*(float s, float2 a)
{
return make_float2(a.x * s, a.y * s);
}
inline __host__ __device__ void operator*=(float2 &a, float s)
{
a.x *= s; a.y *= s;
}
// divide
inline __host__ __device__ float2 operator/(float2 a, float2 b)
{
return make_float2(a.x / b.x, a.y / b.y);
}
inline __host__ __device__ float2 operator/(float2 a, float s)
{
float inv = 1.0f / s;
return a * inv;
}
inline __host__ __device__ float2 operator/(float s, float2 a)
{
float inv = 1.0f / s;
return a * inv;
}
inline __host__ __device__ void operator/=(float2 &a, float s)
{
float inv = 1.0f / s;
a *= inv;
}
// lerp
inline __device__ __host__ float2 lerp(float2 a, float2 b, float t)
{
return a + t*(b-a);
}
// clamp
inline __device__ __host__ float2 clamp(float2 v, float a, float b)
{
return make_float2(clamp(v.x, a, b), clamp(v.y, a, b));
}
inline __device__ __host__ float2 clamp(float2 v, float2 a, float2 b)
{
return make_float2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y));
}
// dot product
inline __host__ __device__ float dot(float2 a, float2 b)
{
return a.x * b.x + a.y * b.y;
}
// length
inline __host__ __device__ float length(float2 v)
{
return sqrtf(dot(v, v));
}
// normalize
inline __host__ __device__ float2 normalize(float2 v)
{
float invLen = 1.0f / sqrtf(dot(v, v));
return v * invLen;
}
// floor
inline __host__ __device__ float2 floor(const float2 v)
{
return make_float2(floor(v.x), floor(v.y));
}
// reflect
inline __host__ __device__ float2 reflect(float2 i, float2 n)
{
return i - 2.0f * n * dot(n,i);
}
// float3 functions
////////////////////////////////////////////////////////////////////////////////
// additional constructors
inline __host__ __device__ float3 make_float3(float s)
{
return make_float3(s, s, s);
}
inline __host__ __device__ float3 make_float3(float2 a)
{
return make_float3(a.x, a.y, 0.0f);
}
inline __host__ __device__ float3 make_float3(float2 a, float s)
{
return make_float3(a.x, a.y, s);
}
inline __host__ __device__ float3 make_float3(float4 a)
{
return make_float3(a.x, a.y, a.z); // discards w
}
inline __host__ __device__ float3 make_float3(int3 a)
{
return make_float3(float(a.x), float(a.y), float(a.z));
}
// negate
inline __host__ __device__ float3 operator-(float3 &a)
{
return make_float3(-a.x, -a.y, -a.z);
}
// min
static __inline__ __host__ __device__ float3 fminf(float3 a, float3 b)
{
return make_float3(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z));
}
// max
static __inline__ __host__ __device__ float3 fmaxf(float3 a, float3 b)
{
return make_float3(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z));
}
// addition
inline __host__ __device__ float3 operator+(float3 a, float3 b)
{
return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
}
inline __host__ __device__ float3 operator+(float3 a, float b)
{
return make_float3(a.x + b, a.y + b, a.z + b);
}
inline __host__ __device__ void operator+=(float3 &a, float3 b)
{
a.x += b.x; a.y += b.y; a.z += b.z;
}
// subtract
inline __host__ __device__ float3 operator-(float3 a, float3 b)
{
return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
}
inline __host__ __device__ float3 operator-(float3 a, float b)
{
return make_float3(a.x - b, a.y - b, a.z - b);
}
inline __host__ __device__ void operator-=(float3 &a, float3 b)
{
a.x -= b.x; a.y -= b.y; a.z -= b.z;
}
// multiply
inline __host__ __device__ float3 operator*(float3 a, float3 b)
{
return make_float3(a.x * b.x, a.y * b.y, a.z * b.z);
}
inline __host__ __device__ float3 operator*(float3 a, float s)
{
return make_float3(a.x * s, a.y * s, a.z * s);
}
inline __host__ __device__ float3 operator*(float s, float3 a)
{
return make_float3(a.x * s, a.y * s, a.z * s);
}
inline __host__ __device__ void operator*=(float3 &a, float s)
{
a.x *= s; a.y *= s; a.z *= s;
}
// divide
inline __host__ __device__ float3 operator/(float3 a, float3 b)
{
return make_float3(a.x / b.x, a.y / b.y, a.z / b.z);
}
inline __host__ __device__ float3 operator/(float3 a, float s)
{
float inv = 1.0f / s;
return a * inv;
}
inline __host__ __device__ float3 operator/(float s, float3 a)
{
float inv = 1.0f / s;
return a * inv;
}
inline __host__ __device__ void operator/=(float3 &a, float s)
{
float inv = 1.0f / s;
a *= inv;
}
// lerp
inline __device__ __host__ float3 lerp(float3 a, float3 b, float t)
{
return a + t*(b-a);
}
// clamp
inline __device__ __host__ float3 clamp(float3 v, float a, float b)
{
return make_float3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
}
inline __device__ __host__ float3 clamp(float3 v, float3 a, float3 b)
{
return make_float3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z));
}
// dot product
inline __host__ __device__ float dot(float3 a, float3 b)
{
return a.x * b.x + a.y * b.y + a.z * b.z;
}
// cross product
inline __host__ __device__ float3 cross(float3 a, float3 b)
{
return make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x);
}
// length
inline __host__ __device__ float length(float3 v)
{
return sqrtf(dot(v, v));
}
// normalize
inline __host__ __device__ float3 normalize(float3 v)
{
float invLen = 1.0f / sqrtf(dot(v, v));
return v * invLen;
}
// floor
inline __host__ __device__ float3 floor(const float3 v)
{
return make_float3(floor(v.x), floor(v.y), floor(v.z));
}
// reflect
inline __host__ __device__ float3 reflect(float3 i, float3 n)
{
return i - 2.0f * n * dot(n,i);
}
// float4 functions
////////////////////////////////////////////////////////////////////////////////
// additional constructors
inline __host__ __device__ float4 make_float4(float s)
{
return make_float4(s, s, s, s);
}
inline __host__ __device__ float4 make_float4(float3 a)
{
return make_float4(a.x, a.y, a.z, 0.0f);
}
inline __host__ __device__ float4 make_float4(float3 a, float w)
{
return make_float4(a.x, a.y, a.z, w);
}
inline __host__ __device__ float4 make_float4(int4 a)
{
return make_float4(float(a.x), float(a.y), float(a.z), float(a.w));
}
// negate
inline __host__ __device__ float4 operator-(float4 &a)
{
return make_float4(-a.x, -a.y, -a.z, -a.w);
}
// min
static __inline__ __host__ __device__ float4 fminf(float4 a, float4 b)
{
return make_float4(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z), fminf(a.w,b.w));
}
// max
static __inline__ __host__ __device__ float4 fmaxf(float4 a, float4 b)
{
return make_float4(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z), fmaxf(a.w,b.w));
}
// addition
inline __host__ __device__ float4 operator+(float4 a, float4 b)
{
return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
}
inline __host__ __device__ void operator+=(float4 &a, float4 b)
{
a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w;
}
// subtract
inline __host__ __device__ float4 operator-(float4 a, float4 b)
{
return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
}
inline __host__ __device__ void operator-=(float4 &a, float4 b)
{
a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w;
}
// multiply
inline __host__ __device__ float4 operator*(float4 a, float s)
{
return make_float4(a.x * s, a.y * s, a.z * s, a.w * s);
}
inline __host__ __device__ float4 operator*(float s, float4 a)
{
return make_float4(a.x * s, a.y * s, a.z * s, a.w * s);
}
inline __host__ __device__ void operator*=(float4 &a, float s)
{
a.x *= s; a.y *= s; a.z *= s; a.w *= s;
}
// divide
inline __host__ __device__ float4 operator/(float4 a, float4 b)
{
return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
}
inline __host__ __device__ float4 operator/(float4 a, float s)
{
float inv = 1.0f / s;
return a * inv;
}
inline __host__ __device__ float4 operator/(float s, float4 a)
{
float inv = 1.0f / s;
return a * inv;
}
inline __host__ __device__ void operator/=(float4 &a, float s)
{
float inv = 1.0f / s;
a *= inv;
}
// lerp
inline __device__ __host__ float4 lerp(float4 a, float4 b, float t)
{
return a + t*(b-a);
}
// clamp
inline __device__ __host__ float4 clamp(float4 v, float a, float b)
{
return make_float4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b));
}
inline __device__ __host__ float4 clamp(float4 v, float4 a, float4 b)
{
return make_float4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w));
}
// dot product
inline __host__ __device__ float dot(float4 a, float4 b)
{
return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
}
// length
inline __host__ __device__ float length(float4 r)
{
return sqrtf(dot(r, r));
}
// normalize
inline __host__ __device__ float4 normalize(float4 v)
{
float invLen = 1.0f / sqrtf(dot(v, v));
return v * invLen;
}
// floor
inline __host__ __device__ float4 floor(const float4 v)
{
return make_float4(floor(v.x), floor(v.y), floor(v.z), floor(v.w));
}
// int3 functions
////////////////////////////////////////////////////////////////////////////////
// additional constructors
inline __host__ __device__ int3 make_int3(int s)
{
return make_int3(s, s, s);
}
inline __host__ __device__ int3 make_int3(float3 a)
{
return make_int3(int(a.x), int(a.y), int(a.z));
}
// negate
inline __host__ __device__ int3 operator-(int3 &a)
{
return make_int3(-a.x, -a.y, -a.z);
}
// min
inline __host__ __device__ int3 min(int3 a, int3 b)
{
return make_int3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z));
}
// max
inline __host__ __device__ int3 max(int3 a, int3 b)
{
return make_int3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z));
}
// addition
inline __host__ __device__ int3 operator+(int3 a, int3 b)
{
return make_int3(a.x + b.x, a.y + b.y, a.z + b.z);
}
inline __host__ __device__ void operator+=(int3 &a, int3 b)
{
a.x += b.x; a.y += b.y; a.z += b.z;
}
// subtract
inline __host__ __device__ int3 operator-(int3 a, int3 b)
{
return make_int3(a.x - b.x, a.y - b.y, a.z - b.z);
}
inline __host__ __device__ void operator-=(int3 &a, int3 b)
{
a.x -= b.x; a.y -= b.y; a.z -= b.z;
}
// multiply
inline __host__ __device__ int3 operator*(int3 a, int3 b)
{
return make_int3(a.x * b.x, a.y * b.y, a.z * b.z);
}
inline __host__ __device__ int3 operator*(int3 a, int s)
{
return make_int3(a.x * s, a.y * s, a.z * s);
}
inline __host__ __device__ int3 operator*(int s, int3 a)
{
return make_int3(a.x * s, a.y * s, a.z * s);
}
inline __host__ __device__ void operator*=(int3 &a, int s)
{
a.x *= s; a.y *= s; a.z *= s;
}
// divide
inline __host__ __device__ int3 operator/(int3 a, int3 b)
{
return make_int3(a.x / b.x, a.y / b.y, a.z / b.z);
}
inline __host__ __device__ int3 operator/(int3 a, int s)
{
return make_int3(a.x / s, a.y / s, a.z / s);
}
inline __host__ __device__ int3 operator/(int s, int3 a)
{
return make_int3(a.x / s, a.y / s, a.z / s);
}
inline __host__ __device__ void operator/=(int3 &a, int s)
{
a.x /= s; a.y /= s; a.z /= s;
}
// clamp
inline __device__ __host__ int clamp(int f, int a, int b)
{
return max(a, min(f, b));
}
inline __device__ __host__ int3 clamp(int3 v, int a, int b)
{
return make_int3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
}
inline __device__ __host__ int3 clamp(int3 v, int3 a, int3 b)
{
return make_int3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z));
}
// uint3 functions
////////////////////////////////////////////////////////////////////////////////
// additional constructors
inline __host__ __device__ uint3 make_uint3(uint s)
{
return make_uint3(s, s, s);
}
inline __host__ __device__ uint3 make_uint3(float3 a)
{
return make_uint3(uint(a.x), uint(a.y), uint(a.z));
}
// min
inline __host__ __device__ uint3 min(uint3 a, uint3 b)
{
return make_uint3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z));
}
// max
inline __host__ __device__ uint3 max(uint3 a, uint3 b)
{
return make_uint3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z));
}
// addition
inline __host__ __device__ uint3 operator+(uint3 a, uint3 b)
{
return make_uint3(a.x + b.x, a.y + b.y, a.z + b.z);
}
inline __host__ __device__ void operator+=(uint3 &a, uint3 b)
{
a.x += b.x; a.y += b.y; a.z += b.z;
}
// subtract
inline __host__ __device__ uint3 operator-(uint3 a, uint3 b)
{
return make_uint3(a.x - b.x, a.y - b.y, a.z - b.z);
}
inline __host__ __device__ void operator-=(uint3 &a, uint3 b)
{
a.x -= b.x; a.y -= b.y; a.z -= b.z;
}
// multiply
inline __host__ __device__ uint3 operator*(uint3 a, uint3 b)
{
return make_uint3(a.x * b.x, a.y * b.y, a.z * b.z);
}
inline __host__ __device__ uint3 operator*(uint3 a, uint s)
{
return make_uint3(a.x * s, a.y * s, a.z * s);
}
inline __host__ __device__ uint3 operator*(uint s, uint3 a)
{
return make_uint3(a.x * s, a.y * s, a.z * s);
}
inline __host__ __device__ void operator*=(uint3 &a, uint s)
{
a.x *= s; a.y *= s; a.z *= s;
}
// divide
inline __host__ __device__ uint3 operator/(uint3 a, uint3 b)
{
return make_uint3(a.x / b.x, a.y / b.y, a.z / b.z);
}
inline __host__ __device__ uint3 operator/(uint3 a, uint s)
{
return make_uint3(a.x / s, a.y / s, a.z / s);
}
inline __host__ __device__ uint3 operator/(uint s, uint3 a)
{
return make_uint3(a.x / s, a.y / s, a.z / s);
}
inline __host__ __device__ void operator/=(uint3 &a, uint s)
{
a.x /= s; a.y /= s; a.z /= s;
}
// clamp
inline __device__ __host__ uint clamp(uint f, uint a, uint b)
{
return max(a, min(f, b));
}
inline __device__ __host__ uint3 clamp(uint3 v, uint a, uint b)
{
return make_uint3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
}
inline __device__ __host__ uint3 clamp(uint3 v, uint3 a, uint3 b)
{
return make_uint3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z));
}
#endif

Extras/CUDA/param.cpp Normal file

@@ -0,0 +1,3 @@
#include <param.h>
const Param<int> dummy("error");

Extras/CUDA/param.h Normal file

@@ -0,0 +1,226 @@
/*
Simple parameter system
sgreen@nvidia.com 4/2001
*/
#ifndef PARAM_H
#define PARAM_H
#ifdef _WIN32
# pragma warning(disable:4786) // No stupid debug warnings
#endif
#include <string>
#include <vector>
#include <map>
#include <iostream>
#include <sstream>
#include <iomanip>
// base class for named parameter
class ParamBase {
public:
ParamBase(char *name) { m_name = name; }
virtual ~ParamBase() { }
std::string *GetName() { return &m_name; }
virtual float GetFloatValue() = 0;
virtual int GetIntValue() = 0;
virtual std::string GetValueString() = 0;
virtual void Reset() = 0;
virtual void Increment() = 0;
virtual void Decrement() = 0;
virtual float GetPercentage() = 0;
virtual void SetPercentage(float p) = 0;
virtual void Write(std::ostream &stream) = 0;
virtual void Read(std::istream &stream) = 0;
virtual bool IsList() = 0;
protected:
std::string m_name;
};
// derived class for single-valued parameter
template<class T> class Param : public ParamBase {
public:
Param(char *name, T value = 0, T min = 0, T max = 10000, T step = 1, T* ptr = 0)
: ParamBase(name)
{
if (ptr) {
m_ptr = ptr;
} else {
m_ptr = &m_value;
}
*m_ptr = value;
m_default = value;
m_min = min;
m_max = max;
m_step = step;
}
virtual ~Param() { }
virtual float GetFloatValue() { return (float) *m_ptr; }
virtual int GetIntValue() { return (int) *m_ptr; }
T GetValue() const { return *m_ptr; }
void SetValue(const T value) { *m_ptr = value; }
// inherited functions
std::string GetValueString()
{
std::ostringstream ost;
ost<<std::setprecision(3)<<*m_ptr;
return ost.str();
}
float GetPercentage()
{
return (*m_ptr - m_min) / (float) (m_max - m_min);
}
void SetPercentage(float p)
{
*m_ptr = (T)(m_min + p * (m_max - m_min));
}
void Reset() { *m_ptr = m_default; }
void Increment()
{
*m_ptr += m_step;
if (*m_ptr > m_max)
*m_ptr = m_max;
}
void Decrement()
{
*m_ptr -= m_step;
if (*m_ptr < m_min)
*m_ptr = m_min;
}
void Write(std::ostream &stream) { stream << m_name << " " << *m_ptr << '\n'; }
void Read(std::istream &stream) { stream >> m_name >> *m_ptr; }
bool IsList() { return false; }
private:
T m_value;
T *m_ptr; // pointer to value declared elsewhere
T m_default, m_min, m_max;
T m_step;
};
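// Example usage (hypothetical names): expose a float with range [0,20] and
// step 0.1, backed by a variable declared elsewhere:
//   float gravity = 9.8f;
//   Param<float> gravityParam("gravity", 9.8f, 0.0f, 20.0f, 0.1f, &gravity);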
extern const Param<int> dummy;
// list of parameters
class ParamList : public ParamBase {
public:
ParamList(char *name = "")
: ParamBase(name)
{
active = true;
}
virtual ~ParamList() { }
virtual float GetFloatValue() { return 0.0f; }
virtual int GetIntValue() { return 0; }
void AddParam(ParamBase *param)
{
m_params.push_back(param);
m_map[*param->GetName()] = param;
m_current = m_params.begin();
}
// look-up parameter based on name
ParamBase *GetParam(char *name)
{
ParamBase *p = m_map[name];
if (p)
return p;
else
return (ParamBase *) &dummy;
}
ParamBase *GetParam(int i)
{
return m_params[i];
}
ParamBase *GetCurrent()
{
return *m_current;
}
int GetSize() { return (int)m_params.size(); }
// inherited functions
std::string GetValueString()
{
// return m_name;
return "list";
}
void Reset()
{
m_current = m_params.begin();
}
void Increment()
{
m_current++;
if (m_current == m_params.end())
m_current = m_params.begin();
}
void Decrement()
{
if (m_current == m_params.begin())
m_current = m_params.end()-1;
else
m_current--;
}
float GetPercentage() { return 0.0f; }
void SetPercentage(float /*p*/) { ; }
void Write(std::ostream &stream)
{
stream << m_name << '\n';
for(std::vector<ParamBase *>::const_iterator p = m_params.begin(); p != m_params.end(); ++p) {
(*p)->Write(stream);
}
}
void Read(std::istream &stream)
{
stream >> m_name;
for(std::vector<ParamBase *>::const_iterator p = m_params.begin(); p != m_params.end(); ++p) {
(*p)->Read(stream);
}
}
bool IsList() { return true; }
protected:
bool active;
std::vector<ParamBase *> m_params;
std::map<std::string, ParamBase *> m_map;
std::vector<ParamBase *>::const_iterator m_current;
};
#endif

Extras/CUDA/paramgl.cpp Normal file

@@ -0,0 +1,209 @@
/*
ParamListGL
- class derived from ParamList to do simple OpenGL rendering of a parameter list
sgg 8/2001
*/
#include <string.h>
#include <param.h>
#include <paramgl.h>
void beginWinCoords(void)
{
glMatrixMode(GL_MODELVIEW);
glPushMatrix();
glLoadIdentity();
glTranslatef(0.0, glutGet(GLUT_WINDOW_HEIGHT) - 1, 0.0);
glScalef(1.0, -1.0, 1.0);
glMatrixMode(GL_PROJECTION);
glPushMatrix();
glLoadIdentity();
glOrtho(0, glutGet(GLUT_WINDOW_WIDTH), 0, glutGet(GLUT_WINDOW_HEIGHT), -1, 1);
glMatrixMode(GL_MODELVIEW);
}
void endWinCoords(void)
{
glMatrixMode(GL_PROJECTION);
glPopMatrix();
glMatrixMode(GL_MODELVIEW);
glPopMatrix();
}
void glPrint(int x, int y, const char *s, void *font)
{
int i, len;
glRasterPos2f(x, y);
len = (int) strlen(s);
for (i = 0; i < len; i++) {
glutBitmapCharacter(font, s[i]);
}
}
void glPrintShadowed(int x, int y, const char *s, void *font, float *color)
{
glColor3f(0.0, 0.0, 0.0);
glPrint(x-1, y-1, s, font);
glColor3fv((GLfloat *) color);
glPrint(x, y, s, font);
}
ParamListGL::ParamListGL(char *name) : ParamList(name)
{
font = (void *) GLUT_BITMAP_9_BY_15;
// font = (void *) GLUT_BITMAP_8_BY_13;
bar_x = 250;
bar_w = 250;
bar_h = 10;
bar_offset = 5;
text_x = 5;
separation = 15;
value_x = 200;
font_h = 15;
start_x = 0;
start_y = 0;
text_col_selected[0] = 1.0;
text_col_selected[1] = 1.0;
text_col_selected[2] = 1.0;
text_col_unselected[0] = 0.75;
text_col_unselected[1] = 0.75;
text_col_unselected[2] = 0.75;
bar_col_outer[0] = 0.0;
bar_col_outer[1] = 0.0;
bar_col_outer[2] = 0.0;
bar_col_inner[0] = 0.0;
bar_col_inner[1] = 0.0;
bar_col_inner[2] = 0.0;
text_col_shadow[0] = 0.0;
text_col_shadow[1] = 0.0;
text_col_shadow[2] = 0.0;
}
void
ParamListGL::Render(int x, int y, bool shadow)
{
beginWinCoords();
start_x = x; start_y = y;
for(std::vector<ParamBase *>::const_iterator p = m_params.begin(); p != m_params.end(); ++p) {
if ((*p)->IsList()) {
ParamListGL *list = (ParamListGL *) (*p);
list->Render(x+10, y);
y += separation*list->GetSize();
} else {
if (p == m_current)
glColor3fv(text_col_selected);
else
glColor3fv(text_col_unselected);
if (shadow) {
glPrintShadowed(x + text_x, y + font_h, (*p)->GetName()->c_str(), font, (p == m_current) ? text_col_selected : text_col_unselected);
glPrintShadowed(x + value_x, y + font_h, (*p)->GetValueString().c_str(), font, (p == m_current) ? text_col_selected : text_col_unselected);
}
else {
glPrint(x + text_x, y + font_h, (*p)->GetName()->c_str(), font);
glPrint(x + value_x, y + font_h, (*p)->GetValueString().c_str(), font);
}
// glColor3fv((GLfloat *) &bar_col_outer);
glBegin(GL_LINE_LOOP);
glVertex2f(x + bar_x, y + bar_offset);
glVertex2f(x + bar_x + bar_w, y + bar_offset);
glVertex2f(x + bar_x + bar_w, y + bar_offset + bar_h);
glVertex2f(x + bar_x, y + bar_offset + bar_h);
glEnd();
// glColor3fv((GLfloat *) &bar_col_inner);
glRectf(x + bar_x, y + bar_offset + bar_h, x + bar_x + (bar_w*(*p)->GetPercentage()), y + bar_offset);
y += separation;
}
}
endWinCoords();
}
bool
ParamListGL::Mouse(int x, int y, int button, int state)
{
if ((y < start_y) || (y > (int)(start_y + (separation * m_params.size()) - 1)))
return false;
int i = (y - start_y) / separation;
if ((button==GLUT_LEFT_BUTTON) && (state==GLUT_DOWN)) {
#if defined(__GNUC__) && (__GNUC__ < 3)
m_current = &m_params[i];
#else
// MJH: workaround since the version of vector::at used here is non-standard
for (m_current = m_params.begin(); m_current != m_params.end() && i > 0; m_current++, i--);
//m_current = (std::vector<ParamBase *>::const_iterator)&m_params.at(i);
#endif
if ((x > bar_x) && (x < bar_x + bar_w)) {
Motion(x, y);
}
}
return true;
}
bool
ParamListGL::Motion(int x, int y)
{
if ((y < start_y) || (y > start_y + (separation * (int)m_params.size()) - 1) )
return false;
if (x < bar_x) {
(*m_current)->SetPercentage(0.0);
return true;
}
if (x > bar_x + bar_w) {
(*m_current)->SetPercentage(1.0);
return true;
}
(*m_current)->SetPercentage((x-bar_x) / (float) bar_w);
return true;
}
void
ParamListGL::Special(int key, int /*x*/, int /*y*/)
{
switch(key) {
case GLUT_KEY_DOWN:
Increment();
break;
case GLUT_KEY_UP:
Decrement();
break;
case GLUT_KEY_RIGHT:
GetCurrent()->Increment();
break;
case GLUT_KEY_LEFT:
GetCurrent()->Decrement();
break;
case GLUT_KEY_HOME:
GetCurrent()->Reset();
break;
case GLUT_KEY_END:
GetCurrent()->SetPercentage(1.0);
break;
}
glutPostRedisplay();
}

Extras/CUDA/paramgl.h Normal file

@@ -0,0 +1,54 @@
/*
ParamListGL
- class derived from ParamList to do simple OpenGL rendering of a parameter list
sgg 8/2001
*/
#ifndef PARAMGL_H
#define PARAMGL_H
#if defined(__APPLE__) || defined(MACOSX)
#include <GLUT/glut.h>
#else
#include <GL/glut.h>
#endif
#include <param.h>
void beginWinCoords();
void endWinCoords();
void glPrint(int x, int y, const char *s, void *font);
void glPrintShadowed(int x, int y, const char *s, void *font, float *color);
class ParamListGL : public ParamList {
public:
ParamListGL(char *name = "");
void Render(int x, int y, bool shadow = false);
bool Mouse(int x, int y, int button=GLUT_LEFT_BUTTON, int state=GLUT_DOWN);
bool Motion(int x, int y);
void Special(int key, int x, int y);
void SetSelectedColor(float r, float g, float b) { text_col_selected[0] = r; text_col_selected[1] = g; text_col_selected[2] = b; }
void SetUnSelectedColor(float r, float g, float b) { text_col_unselected[0] = r; text_col_unselected[1] = g; text_col_unselected[2] = b; }
int bar_x;
int bar_w;
int bar_h;
int text_x;
int separation;
int value_x;
int font_h;
int start_x, start_y;
int bar_offset;
float text_col_selected[3];
float text_col_unselected[3];
float text_col_shadow[3];
float bar_col_outer[3];
float bar_col_inner[3];
void *font;
};
#endif


@@ -0,0 +1,280 @@
/*
* Copyright 1993-2006 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO USER:
*
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws.
*
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
* OR PERFORMANCE OF THIS SOURCE CODE.
*
* U.S. Government End Users. This source code is a "commercial item" as
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
* "commercial computer software" and "commercial computer software
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
* and is provided to the U.S. Government only as a commercial end item.
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
* source code with only those rights set forth herein.
*/
#include "particleSystem.h"
#include "particleSystem.cuh"
#include "radixsort.cuh"
#include "particles_kernel.cuh"
#include <assert.h>
#include <math.h>
#include <memory.h>
#include <cstdio>
#include <cstdlib>
#include <algorithm>
#include <GL/glew.h>
#include <btBulletDynamicsCommon.h>
#include "btCudaBroadphase.h"
#ifndef CUDART_PI_F
#define CUDART_PI_F 3.141592654f
#endif
ParticleSystem::ParticleSystem(uint numParticles, uint3 gridSize) :
m_simulationMode(SIMULATION_BULLET_CPU) // alternative: SIMULATION_CUDA
{
this->m_params.numBodies = numParticles;
this->m_params.m_gridSize = gridSize;
initializeBullet();
}
ParticleSystem::~ParticleSystem()
{
finalizeBullet();
}
#include "../../Demos/OpenGL/GLDebugDrawer.h"
GLDebugDrawer debugDrawer;
void ParticleSystem::initializeBullet()
{
m_collisionConfiguration = new btDefaultCollisionConfiguration();
m_dispatcher = new btCollisionDispatcher(m_collisionConfiguration);
//m_broadphase = new btDbvtBroadphase();
//m_broadphase = new btAxisSweep3(btVector3(-3,-3,-3),btVector3(3,3,3));
m_broadphase = new btCudaBroadphase(m_params,m_params.numBodies+6);
m_constraintSolver=new btSequentialImpulseConstraintSolver();
m_dynamicsWorld = new btDiscreteDynamicsWorld(m_dispatcher,m_broadphase,m_constraintSolver,m_collisionConfiguration);
m_dynamicsWorld->setDebugDrawer(&debugDrawer);
//debugDrawer.setDebugMode(btIDebugDraw::DBG_DrawPairs);
m_dynamicsWorld->setGravity(100*btVector3(m_params.gravity.x,m_params.gravity.y,m_params.gravity.z));
m_dynamicsWorld->getSolverInfo().m_numIterations=1;
btBoxShape* worldBox = new btBoxShape(btVector3(m_params.worldSize.x/2,m_params.worldSize.y/2,m_params.worldSize.z/2));
worldBox->setMargin(0.f);
//create 6 static planes for the world cube
btStaticPlaneShape* planeShape;
btRigidBody* body;
btVector3 worldSize;
int i;
btSphereShape* particleSphere = new btSphereShape(m_params.particleRadius);
particleSphere->setMargin(0.0);
btVector3 localInertia;
particleSphere->calculateLocalInertia(1,localInertia);
float* m_hPos = m_broadphase->getHposPtr();
for (i=0;i<m_params.numBodies;i++)
{
btRigidBody::btRigidBodyConstructionInfo rbci(1.,0,particleSphere,localInertia);
rbci.m_startWorldTransform.setOrigin(btVector3(m_hPos[i*4],m_hPos[i*4+1],m_hPos[i*4+2]));
body = new btRigidBody(rbci);
body->setActivationState(DISABLE_DEACTIVATION);
m_bulletParticles.push_back(body);
m_dynamicsWorld->addRigidBody(body);
}
reset(CONFIG_GRID);
/* for (i=0;i<6;i++)
{
btVector4 planeEq;
worldBox->getPlaneEquation(planeEq,i);
planeShape = new btStaticPlaneShape(-planeEq,planeEq.getW());
planeShape->setMargin(0.f);
btRigidBody::btRigidBodyConstructionInfo rbci(0.f,0,planeShape);
body = new btRigidBody(rbci);
m_dynamicsWorld->addRigidBody(body);
}
*/
}
void ParticleSystem::finalizeBullet()
{
delete m_dynamicsWorld;
delete m_constraintSolver;
delete m_broadphase;
delete m_dispatcher ;
delete m_collisionConfiguration;
}
void
ParticleSystem::update(float deltaTime)
{
assert(m_bInitialized);
switch (m_simulationMode)
{
case SIMULATION_CUDA:
{
m_broadphase->quickHack(deltaTime);
//todo
break;
}
case SIMULATION_BULLET_CPU:
{
m_broadphase->integrate();
///copy particles from device to main memory
{
float* hPosData = m_broadphase->copyBuffersFromDeviceToHost();
float* m_hVel = m_broadphase->getHvelPtr();
m_broadphase->copyBuffersFromHostToDevice();
//sync transform and velocity from particle system to Bullet
for (int i=0;i<m_params.numBodies;i++)
{
btTransform& trans = m_bulletParticles[i]->getWorldTransform();
trans.setOrigin(btVector3(hPosData[i*4],hPosData[i*4+1],hPosData[i*4+2]));
m_bulletParticles[i]->setLinearVelocity(btVector3(m_hVel[i*4],m_hVel[i*4+1],m_hVel[i*4+2])*10.);
}
}
m_dynamicsWorld->stepSimulation(deltaTime);
/* for (int i=0;i<m_numParticles;i++)
{
data[i*4+1] -= 0.001f;
m_hVel[i*4]=0;
m_hVel[i*4+1]=0;
m_hVel[i*4+2]=0;
}
*/
{
float* hPosData = m_broadphase->copyBuffersFromDeviceToHost();
float* m_hVel = m_broadphase->getHvelPtr();
//sync transform and velocity from Bullet to particle system
for (int i=0;i<m_params.numBodies;i++)
{
btTransform& trans = m_bulletParticles[i]->getWorldTransform();
hPosData[i*4] = trans.getOrigin().getX();
hPosData[i*4+1] = trans.getOrigin().getY();
hPosData[i*4+2] = trans.getOrigin().getZ();
m_hVel[i*4] = m_bulletParticles[i]->getLinearVelocity().getX()/10.;
m_hVel[i*4+1] = m_bulletParticles[i]->getLinearVelocity().getY()/10.;
m_hVel[i*4+2] = m_bulletParticles[i]->getLinearVelocity().getZ()/10.;
}
m_broadphase->copyBuffersFromHostToDevice();
}
break;
}
default:
{
printf("unknown simulation method\n");
}
}
}
float* ParticleSystem::getArray(ParticleArray array)
{
return m_broadphase->getArray((btCudaBroadphase::ParticleArray)array);
}
void ParticleSystem::debugDraw()
{
glDisable(GL_DEPTH_TEST);
m_dynamicsWorld->debugDrawWorld();
glEnable(GL_DEPTH_TEST);
}
void ParticleSystem::reset(ParticleConfig config)
{
m_broadphase->reset((btCudaBroadphase::ParticleConfig)config);
for (int i=0;i<m_bulletParticles.size();i++)
{
m_bulletParticles[i]->setAngularVelocity(btVector3(0,0,0));
}
}
void ParticleSystem::addSphere(int start, float *pos, float *vel, int r, float spacing)
{
m_broadphase->addSphere(start,pos,vel,r,spacing);
}
unsigned int ParticleSystem::getCurrentReadBuffer() const
{
return m_broadphase->getCurrentReadBuffer();
}
unsigned int ParticleSystem::getColorBuffer() const
{
return m_broadphase->getColorBuffer();
}
void ParticleSystem::dumpGrid()
{
return m_broadphase->dumpGrid();
}
void ParticleSystem::dumpParticles(uint start, uint count)
{
m_broadphase->dumpParticles(start,count);
}
int ParticleSystem::getNumParticles() const
{
return m_params.numBodies;
}


@@ -0,0 +1,331 @@
/*
* Copyright 1993-2006 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO USER:
*
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws.
*
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
* OR PERFORMANCE OF THIS SOURCE CODE.
*
* U.S. Government End Users. This source code is a "commercial item" as
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
* "commercial computer software" and "commercial computer software
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
* and is provided to the U.S. Government only as a commercial end item.
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
* source code with only those rights set forth herein.
*/
//#include <cutil.h>
#include <cstdlib>
#include <cstdio>
#include <string.h>
#if defined(__APPLE__) || defined(MACOSX)
#include <GLUT/glut.h>
#else
#include <GL/glut.h>
#endif
#include <cuda_gl_interop.h>
#include "particles_kernel.cu"
#include "radixsort.cu"
//! Check for CUDA error
# define CUT_CHECK_ERROR(errorMessage) do { \
cudaError_t err = cudaGetLastError(); \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
exit(EXIT_FAILURE); \
} \
err = cudaThreadSynchronize(); \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
exit(EXIT_FAILURE); \
} } while (0)
# define MY_CUDA_SAFE_CALL_NO_SYNC( call) do { \
cudaError err = call; \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
__FILE__, __LINE__, cudaGetErrorString( err) ); \
exit(EXIT_FAILURE); \
} } while (0)
# define MY_CUDA_SAFE_CALL( call) do { \
MY_CUDA_SAFE_CALL_NO_SYNC(call); \
cudaError err = cudaThreadSynchronize(); \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
__FILE__, __LINE__, cudaGetErrorString( err) ); \
exit(EXIT_FAILURE); \
} } while (0)
extern "C"
{
void cudaInit(int argc, char **argv)
{
//CUT_DEVICE_INIT(argc, argv);
}
void allocateArray(void **devPtr, size_t size)
{
MY_CUDA_SAFE_CALL(cudaMalloc(devPtr, size));
}
void freeArray(void *devPtr)
{
MY_CUDA_SAFE_CALL(cudaFree(devPtr));
}
void threadSync()
{
MY_CUDA_SAFE_CALL(cudaThreadSynchronize());
}
void copyArrayFromDevice(void* host, const void* device, unsigned int vbo, int size)
{
if (vbo)
MY_CUDA_SAFE_CALL(cudaGLMapBufferObject((void**)&device, vbo));
MY_CUDA_SAFE_CALL(cudaMemcpy(host, device, size, cudaMemcpyDeviceToHost));
if (vbo)
MY_CUDA_SAFE_CALL(cudaGLUnmapBufferObject(vbo));
}
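// Note: when 'vbo' is non-zero the 'device' pointer passed in is ignored and
// the copy source is the mapped VBO instead; vbo == 0 copies straight from
// the given device pointer.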
void copyArrayToDevice(void* device, const void* host, int offset, int size)
{
MY_CUDA_SAFE_CALL(cudaMemcpy((char *) device + offset, host, size, cudaMemcpyHostToDevice));
}
void registerGLBufferObject(uint vbo)
{
MY_CUDA_SAFE_CALL(cudaGLRegisterBufferObject(vbo));
}
void unregisterGLBufferObject(uint vbo)
{
MY_CUDA_SAFE_CALL(cudaGLUnregisterBufferObject(vbo));
}
void setParameters(SimParams *hostParams)
{
// copy parameters to constant memory
MY_CUDA_SAFE_CALL( cudaMemcpyToSymbol(params, hostParams, sizeof(SimParams)) );
}
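// 'params' here is assumed to be the __constant__ SimParams symbol declared
// in particles_kernel.cu (included above), so kernels read the same values.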
//Round a / b to nearest higher integer value
int iDivUp(int a, int b){
return (a % b != 0) ? (a / b + 1) : (a / b);
}
// compute grid and thread block size for a given number of elements
void computeGridSize(int n, int blockSize, int &numBlocks, int &numThreads)
{
numThreads = min(blockSize, n);
numBlocks = iDivUp(n, numThreads);
}
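// e.g. n = 1000, blockSize = 256 gives numThreads = 256 and
// numBlocks = iDivUp(1000, 256) = 4, so 4*256 = 1024 threads cover 1000 elements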
void
integrateSystem(uint vboOldPos, uint vboNewPos,
float* oldVel, float* newVel,
float deltaTime,
int numBodies)
{
int numThreads, numBlocks;
computeGridSize(numBodies, 256, numBlocks, numThreads);
float *oldPos, *newPos;
MY_CUDA_SAFE_CALL(cudaGLMapBufferObject((void**)&oldPos, vboOldPos));
MY_CUDA_SAFE_CALL(cudaGLMapBufferObject((void**)&newPos, vboNewPos));
// execute the kernel
integrate<<< numBlocks, numThreads >>>((float4*)newPos, (float4*)newVel,
(float4*)oldPos, (float4*)oldVel,
deltaTime);
// check if kernel invocation generated an error
CUT_CHECK_ERROR("integrate kernel execution failed");
MY_CUDA_SAFE_CALL(cudaGLUnmapBufferObject(vboOldPos));
MY_CUDA_SAFE_CALL(cudaGLUnmapBufferObject(vboNewPos));
}
void
updateGrid(uint vboPos,
uint* gridCounters,
uint* gridCells,
uint numBodies,
uint numCells)
{
int numThreads, numBlocks;
computeGridSize(numBodies, 256, numBlocks, numThreads);
float *pos;
MY_CUDA_SAFE_CALL(cudaGLMapBufferObject((void**)&pos, vboPos));
MY_CUDA_SAFE_CALL(cudaMemset(gridCounters, 0, numCells*sizeof(uint)));
// execute the kernel
updateGridD<<< numBlocks, numThreads >>>((float4 *) pos,
gridCounters,
gridCells);
// check if kernel invocation generated an error
CUT_CHECK_ERROR("Kernel execution failed");
MY_CUDA_SAFE_CALL(cudaGLUnmapBufferObject(vboPos));
}
void
calcHash(uint vboPos,
uint* particleHash,
int numBodies)
{
int numThreads, numBlocks;
computeGridSize(numBodies, 256, numBlocks, numThreads);
float *pos;
MY_CUDA_SAFE_CALL(cudaGLMapBufferObject((void**)&pos, vboPos));
// execute the kernel
calcHashD<<< numBlocks, numThreads >>>((float4 *) pos,
(uint2 *) particleHash);
// check if kernel invocation generated an error
CUT_CHECK_ERROR("Kernel execution failed");
MY_CUDA_SAFE_CALL(cudaGLUnmapBufferObject(vboPos));
}
void
reorderDataAndFindCellStart(uint* particleHash,
uint vboOldPos,
float* oldVel,
float* sortedPos,
float* sortedVel,
uint* cellStart,
uint numBodies,
uint numCells)
{
int numThreads, numBlocks;
computeGridSize(numBodies, 256, numBlocks, numThreads);
MY_CUDA_SAFE_CALL(cudaMemset(cellStart, 0xffffffff, numCells*sizeof(uint)));
float *oldPos;
MY_CUDA_SAFE_CALL(cudaGLMapBufferObject((void**)&oldPos, vboOldPos));
#if USE_TEX
MY_CUDA_SAFE_CALL(cudaBindTexture(0, oldPosTex, oldPos, numBodies*sizeof(float4)));
MY_CUDA_SAFE_CALL(cudaBindTexture(0, oldVelTex, oldVel, numBodies*sizeof(float4)));
#endif
reorderDataAndFindCellStartD<<< numBlocks, numThreads >>>(
(uint2 *) particleHash,
(float4 *) oldPos,
(float4 *) oldVel,
(float4 *) sortedPos,
(float4 *) sortedVel,
(uint *) cellStart);
CUT_CHECK_ERROR("Kernel execution failed: reorderDataAndFindCellStartD");
#if USE_TEX
MY_CUDA_SAFE_CALL(cudaUnbindTexture(oldPosTex));
MY_CUDA_SAFE_CALL(cudaUnbindTexture(oldVelTex));
#endif
MY_CUDA_SAFE_CALL(cudaGLUnmapBufferObject(vboOldPos));
}
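// For reference, a CPU sketch of what reorderDataAndFindCellStartD computes
// from the sorted (cellHash, particleIndex) pairs; this is an illustrative
// helper (hPos/hVel are hypothetical host arrays), not part of the demo:
//   for (uint i = 0; i < numBodies; i++) {
//       if (i == 0 || particleHash[i].x != particleHash[i-1].x)
//           cellStart[particleHash[i].x] = i;      // first entry of this cell
//       sortedPos[i] = hPos[particleHash[i].y];    // gather into sorted order
//       sortedVel[i] = hVel[particleHash[i].y];
//   }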
void
collide(uint vboOldPos, uint vboNewPos,
float* sortedPos, float* sortedVel,
float* oldVel, float* newVel,
uint* gridCounters,
uint* gridCells,
uint* particleHash,
uint* cellStart,
uint numBodies,
uint numCells,
uint maxParticlesPerCell)
{
float4 *oldPos, *newPos;
MY_CUDA_SAFE_CALL(cudaGLMapBufferObject((void**)&oldPos, vboOldPos));
MY_CUDA_SAFE_CALL(cudaGLMapBufferObject((void**)&newPos, vboNewPos));
#if USE_TEX
#if USE_SORT
// use sorted arrays
MY_CUDA_SAFE_CALL(cudaBindTexture(0, oldPosTex, sortedPos, numBodies*sizeof(float4)));
MY_CUDA_SAFE_CALL(cudaBindTexture(0, oldVelTex, sortedVel, numBodies*sizeof(float4)));
MY_CUDA_SAFE_CALL(cudaBindTexture(0, particleHashTex, particleHash, numBodies*sizeof(uint2)));
MY_CUDA_SAFE_CALL(cudaBindTexture(0, cellStartTex, cellStart, numCells*sizeof(uint)));
#else
MY_CUDA_SAFE_CALL(cudaBindTexture(0, oldPosTex, oldPos, numBodies*sizeof(float4)));
MY_CUDA_SAFE_CALL(cudaBindTexture(0, oldVelTex, oldVel, numBodies*sizeof(float4)));
MY_CUDA_SAFE_CALL(cudaBindTexture(0, gridCountersTex, gridCounters,numCells*sizeof(uint)));
MY_CUDA_SAFE_CALL(cudaBindTexture(0, gridCellsTex, gridCells, numCells*maxParticlesPerCell*sizeof(uint)));
#endif
#endif
// thread per particle
int numThreads, numBlocks;
computeGridSize(numBodies, BLOCKDIM, numBlocks, numThreads);
// execute the kernel
collideD<<< numBlocks, numThreads >>>((float4*)newPos, (float4*)newVel,
#if USE_SORT
(float4*)sortedPos, (float4*)sortedVel,
(uint2 *) particleHash,
cellStart
#else
(float4*)oldPos, (float4*)oldVel,
gridCounters,
gridCells
#endif
);
// check if kernel invocation generated an error
CUT_CHECK_ERROR("Kernel execution failed");
MY_CUDA_SAFE_CALL(cudaGLUnmapBufferObject(vboNewPos));
MY_CUDA_SAFE_CALL(cudaGLUnmapBufferObject(vboOldPos));
#if USE_TEX
MY_CUDA_SAFE_CALL(cudaUnbindTexture(oldPosTex));
MY_CUDA_SAFE_CALL(cudaUnbindTexture(oldVelTex));
#if USE_SORT
MY_CUDA_SAFE_CALL(cudaUnbindTexture(particleHashTex));
MY_CUDA_SAFE_CALL(cudaUnbindTexture(cellStartTex));
#else
MY_CUDA_SAFE_CALL(cudaUnbindTexture(gridCountersTex));
MY_CUDA_SAFE_CALL(cudaUnbindTexture(gridCellsTex));
#endif
#endif
}
} // extern "C"


@@ -0,0 +1,57 @@
extern "C"
{
void cudaInit(int argc, char **argv);
void allocateArray(void **devPtr, int size);
void freeArray(void *devPtr);
void threadSync();
void copyArrayFromDevice(void* host, const void* device, unsigned int vbo, int size);
void copyArrayToDevice(void* device, const void* host, int offset, int size);
void registerGLBufferObject(unsigned int vbo);
void unregisterGLBufferObject(unsigned int vbo);
void setParameters(SimParams *hostParams);
void
integrateSystem(uint vboOldPos, uint vboNewPos,
float* oldVel, float* newVel,
float deltaTime,
int numBodies);
void
updateGrid(uint vboPos,
uint* gridCounters,
uint* gridCells,
uint numBodies,
uint numCells);
void
calcHash(uint vboPos,
uint* particleHash,
int numBodies);
void
reorderDataAndFindCellStart(uint* particleHash,
uint vboOldPos,
float* oldVel,
float* sortedPos,
float* sortedVel,
uint* cellStart,
uint numBodies,
uint numCells);
void
collide(uint vboOldPos, uint vboNewPos,
float* sortedPos, float* sortedVel,
float* oldVel, float* newVel,
uint* gridCounters,
uint* gridCells,
uint* particleHash,
uint* cellStart,
uint numBodies,
uint numCells,
uint maxParticlesPerCell);
}
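// Expected per-frame call order for the sorted-grid path (USE_SORT), as a
// sketch of how the particle system drives these entry points:
//   integrateSystem(...);             // advance positions and velocities
//   calcHash(...);                    // one (cellHash, index) pair per particle
//   RadixSort(...);                   // sort the pairs by cell hash (radixsort.cu)
//   reorderDataAndFindCellStart(...); // gather data + build cell start table
//   collide(...);                     // resolve collisions against neighbours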


@@ -0,0 +1,153 @@
/*
* Copyright 1993-2006 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO USER:
*
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws.
*
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
* OR PERFORMANCE OF THIS SOURCE CODE.
*
* U.S. Government End Users. This source code is a "commercial item" as
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
* "commercial computer software" and "commercial computer software
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
* and is provided to the U.S. Government only as a commercial end item.
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
* source code with only those rights set forth herein.
*/
#ifndef __BODYSYSTEMCUDA_H__
#define __BODYSYSTEMCUDA_H__
#define DEBUG_GRID 0
#define DO_TIMING 0
#include "particles_kernel.cuh"
#include "vector_functions.h"
#include "LinearMath/btAlignedObjectArray.h"
// CUDA BodySystem: runs on the GPU
class ParticleSystem
{
public:
ParticleSystem(uint numParticles, uint3 gridSize);
~ParticleSystem();
enum ParticleArray
{
POSITION,
VELOCITY,
};
enum ParticleConfig
{
CONFIG_RANDOM,
CONFIG_GRID,
_NUM_CONFIGS
};
enum SimulationMode
{
SIMULATION_CUDA,
SIMULATION_BULLET_CPU,
SIMULATION_NUM_MODES
};
void reset(ParticleConfig config);
void debugDraw();
///
///Bullet data
///
void initializeBullet();
void finalizeBullet();
class btDiscreteDynamicsWorld* m_dynamicsWorld;
class btDefaultCollisionConfiguration* m_collisionConfiguration;
class btCollisionDispatcher* m_dispatcher;
class btCudaBroadphase* m_broadphase;
class btSequentialImpulseConstraintSolver* m_constraintSolver;
btAlignedObjectArray<class btRigidBody*> m_bulletParticles;
void update(float deltaTime);
float* getArray(ParticleArray array);
int getNumParticles() const;
unsigned int getCurrentReadBuffer() const;
unsigned int getColorBuffer() const;
void dumpGrid();
void dumpParticles(uint start, uint count);
void setIterations(int i) { m_solverIterations = i; }
void setDamping(float x) { m_params.globalDamping = x; }
void setGravity(float x) { m_params.gravity = make_float3(0.0f, x, 0.0f); }
void setCollideSpring(float x) { m_params.spring = x; }
void setCollideDamping(float x) { m_params.damping = x; }
void setCollideShear(float x) { m_params.shear = x; }
void setCollideAttraction(float x) { m_params.attraction = x; }
void setColliderPos(float4 x) { m_params.colliderPos = x; }
float getParticleRadius() { return m_params.particleRadius; }
float4 getColliderPos() { return m_params.colliderPos; }
float getColliderRadius() { return m_params.colliderRadius; }
uint3 getGridSize() { return m_params.gridSize; }
float3 getWorldOrigin() { return m_params.worldOrigin; }
float3 getCellSize() { return m_params.cellSize; }
void addSphere(int index, float *pos, float *vel, int r, float spacing);
SimulationMode getSimulationMode() const
{
return m_simulationMode;
}
void setSimulationMode(SimulationMode mode)
{
m_simulationMode=mode;
}
protected: // methods
ParticleSystem()
: m_simulationMode(SIMULATION_CUDA)
{}
void initGrid(uint *size, float spacing, float jitter, uint numParticles);
protected: // data
bool m_bInitialized;
// params
SimParams m_params;
uint m_maxParticlesPerCell;
uint m_timer;
uint m_solverIterations;
SimulationMode m_simulationMode;
};
#endif // __BODYSYSTEMCUDA_H__
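// Minimal usage sketch, mirroring main() in particles.cpp below:
//   uint3 gridSize = make_uint3(64, 64, 64);
//   ParticleSystem* psystem = new ParticleSystem(1024, gridSize);
//   psystem->reset(ParticleSystem::CONFIG_GRID);
//   psystem->setSimulationMode(ParticleSystem::SIMULATION_BULLET_CPU);
//   psystem->update(0.5f);   // one step at the demo's default timestep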

544
Extras/CUDA/particles.cpp Normal file

@@ -0,0 +1,544 @@
/*
* Copyright 1993-2007 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO USER:
*
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws.
*
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
* OR PERFORMANCE OF THIS SOURCE CODE.
*
* U.S. Government End Users. This source code is a "commercial item" as
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
* "commercial computer software" and "commercial computer software
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
* and is provided to the U.S. Government only as a commercial end item.
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
* source code with only those rights set forth herein.
*/
/*
Particle system example with collisions using uniform grid
*/
#include <cstdlib>
#include <cstdio>
#include <algorithm>
#include <math.h>
//#include <cutil.h>
#include <GL/glew.h>
#if defined(__APPLE__) || defined(MACOSX)
#include <GLUT/glut.h>
#else
#include <GL/glut.h>
#endif
#include "particleSystem.h"
#include "render_particles.h"
#include "paramgl.h"
// view params
int ox, oy;
int buttonState = 0;
float camera_trans[] = {0, 0, -3};
float camera_rot[] = {0, 0, 0};
float camera_trans_lag[] = {0, 0, -3};
float camera_rot_lag[] = {0, 0, 0};
const float inertia = 0.1;
ParticleRenderer::DisplayMode displayMode = ParticleRenderer::PARTICLE_SPHERES;
int mode = 0;
bool displayEnabled = true;
bool bPause = false;
bool displaySliders = false;
bool wireframe = false;
enum { M_VIEW = 0, M_MOVE };
uint numParticles = 0;
uint3 gridSize;
int numIterations = 0; // run until exit
// simulation parameters
float timestep = 0.5f;
float damping = 1.0f;
float gravity = 0.0003f;
int iterations = 1;
int ballr = 10;
float collideSpring = 0.5f;
float collideDamping = 0.02f;
float collideShear = 0.1f;
float collideAttraction = 0.0f;
ParticleSystem *psystem = 0;
// fps
ParticleRenderer *renderer = 0;
float modelView[16];
ParamListGL *params;
extern "C" void cudaInit(int argc, char **argv);
void init(int numParticles, uint3 gridSize)
{
psystem = new ParticleSystem(numParticles, gridSize);
psystem->reset(ParticleSystem::CONFIG_GRID);
renderer = new ParticleRenderer;
renderer->setParticleRadius(psystem->getParticleRadius());
renderer->setColorBuffer(psystem->getColorBuffer());
}
void initGL()
{
glewInit();
if (!glewIsSupported("GL_VERSION_2_0 GL_VERSION_1_5 GL_ARB_multitexture GL_ARB_vertex_buffer_object")) {
fprintf(stderr, "Required OpenGL extensions missing.\n");
exit(-1);
}
glEnable(GL_DEPTH_TEST);
glClearColor(0.25, 0.25, 0.25, 1.0);
glutReportErrors();
}
void display()
{
// update the simulation
if (!bPause)
{
psystem->setIterations(iterations);
psystem->setDamping(damping);
psystem->setGravity(-gravity);
psystem->setCollideSpring(collideSpring);
psystem->setCollideDamping(collideDamping);
psystem->setCollideShear(collideShear);
psystem->setCollideAttraction(collideAttraction);
psystem->update(timestep);
renderer->setVertexBuffer(psystem->getCurrentReadBuffer(), psystem->getNumParticles());
float* posArray = psystem->getArray(ParticleSystem::POSITION);
renderer->setPositions(posArray,psystem->getNumParticles());
}
// render
glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
// view transform
glMatrixMode(GL_MODELVIEW);
glLoadIdentity();
for (int c = 0; c < 3; ++c)
{
camera_trans_lag[c] += (camera_trans[c] - camera_trans_lag[c]) * inertia;
camera_rot_lag[c] += (camera_rot[c] - camera_rot_lag[c]) * inertia;
}
glTranslatef(camera_trans_lag[0], camera_trans_lag[1], camera_trans_lag[2]);
glRotatef(camera_rot_lag[0], 1.0, 0.0, 0.0);
glRotatef(camera_rot_lag[1], 0.0, 1.0, 0.0);
glGetFloatv(GL_MODELVIEW_MATRIX, modelView);
// cube
glColor3f(1.0, 1.0, 1.0);
glutWireCube(2.0);
// collider
glPushMatrix();
float4 p = psystem->getColliderPos();
glTranslatef(p.x, p.y, p.z);
glColor3f(1.0, 0.0, 0.0);
glutSolidSphere(psystem->getColliderRadius(), 20, 10);
glPopMatrix();
if (displayEnabled)
{
renderer->display(displayMode);
}
if (displaySliders) {
glDisable(GL_DEPTH_TEST);
glBlendFunc(GL_ONE_MINUS_DST_COLOR, GL_ZERO); // invert color
glEnable(GL_BLEND);
params->Render(0, 0);
glDisable(GL_BLEND);
glEnable(GL_DEPTH_TEST);
}
psystem->debugDraw();
glutSwapBuffers();
{
char fps[256];
//float ifps = 1.f / (cutGetAverageTimerValue(timer) / 1000.f);
switch (psystem->getSimulationMode())
{
case ParticleSystem::SIMULATION_CUDA:
{
sprintf(fps, "CUDA particles (%d particles)", numParticles);
break;
}
case ParticleSystem::SIMULATION_BULLET_CPU:
{
sprintf(fps, "Bullet btCudaBroadphase (%d btSphereShapes)", numParticles);
break;
}
default:
{
sprintf(fps, "Unknown simulation mode");
}
}
glutSetWindowTitle(fps);
}
glutReportErrors();
}
void reshape(int w, int h)
{
glMatrixMode(GL_PROJECTION);
glLoadIdentity();
gluPerspective(60.0, (float) w / (float) h, 0.1, 10.0);
glMatrixMode(GL_MODELVIEW);
glViewport(0, 0, w, h);
renderer->setWindowSize(w, h);
renderer->setFOV(60.0);
}
void mouse(int button, int state, int x, int y)
{
int mods;
if (state == GLUT_DOWN)
buttonState |= 1<<button;
else if (state == GLUT_UP)
buttonState = 0;
mods = glutGetModifiers();
if (mods & GLUT_ACTIVE_SHIFT) {
buttonState = 2;
} else if (mods & GLUT_ACTIVE_CTRL) {
buttonState = 3;
}
ox = x; oy = y;
if (displaySliders) {
if (params->Mouse(x, y, button, state)) {
glutPostRedisplay();
return;
}
}
glutPostRedisplay();
}
// transform vector by matrix
void xform(float *v, float *r, GLfloat *m)
{
r[0] = v[0]*m[0] + v[1]*m[4] + v[2]*m[8] + m[12];
r[1] = v[0]*m[1] + v[1]*m[5] + v[2]*m[9] + m[13];
r[2] = v[0]*m[2] + v[1]*m[6] + v[2]*m[10] + m[14];
}
// transform vector by transpose of matrix
void ixform(float *v, float *r, GLfloat *m)
{
r[0] = v[0]*m[0] + v[1]*m[1] + v[2]*m[2];
r[1] = v[0]*m[4] + v[1]*m[5] + v[2]*m[6];
r[2] = v[0]*m[8] + v[1]*m[9] + v[2]*m[10];
}
void ixformPoint(float *v, float *r, GLfloat *m)
{
float x[4];
x[0] = v[0] - m[12];
x[1] = v[1] - m[13];
x[2] = v[2] - m[14];
x[3] = 1.0f;
ixform(x, r, m);
}
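// Note: with OpenGL's column-major modelView M = [R | t], xform computes
// r = R*v + t, ixform computes r = R^T*v, and ixformPoint computes
// r = R^T*(v - t), i.e. the inverse of a rigid transform. That is why the
// demo uses ixform/ixformPoint to map camera-space vectors into world space.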
void motion(int x, int y)
{
float dx, dy;
dx = x - ox;
dy = y - oy;
if (displaySliders) {
if (params->Motion(x, y)) {
ox = x; oy = y;
glutPostRedisplay();
return;
}
}
switch(mode)
{
case M_VIEW:
if (buttonState == 3) {
// left+middle = zoom
camera_trans[2] += (dy / 100.0) * 0.5 * fabs(camera_trans[2]);
}
else if (buttonState & 2) {
// middle = translate
camera_trans[0] += dx / 100.0;
camera_trans[1] -= dy / 100.0;
}
else if (buttonState & 1) {
// left = rotate
camera_rot[0] += dy / 5.0;
camera_rot[1] += dx / 5.0;
}
break;
case M_MOVE:
{
float translateSpeed = 0.003f;
float4 p = psystem->getColliderPos();
if (buttonState==1) {
float v[3], r[3];
v[0] = dx*translateSpeed;
v[1] = -dy*translateSpeed;
v[2] = 0.0f;
ixform(v, r, modelView);
p.x += r[0];
p.y += r[1];
p.z += r[2];
} else if (buttonState==2) {
float v[3], r[3];
v[0] = 0.0f;
v[1] = 0.0f;
v[2] = dy*translateSpeed;
ixform(v, r, modelView);
p.x += r[0];
p.y += r[1];
p.z += r[2];
}
psystem->setColliderPos(p);
}
break;
}
ox = x; oy = y;
glutPostRedisplay();
}
inline float frand()
{
return rand() / (float) RAND_MAX;
}
// x/y parameter names commented out to remove unused-parameter warnings on Linux
void key(unsigned char key, int /*x*/, int /*y*/)
{
switch (key)
{
case ' ':
bPause = !bPause;
break;
case 13:
psystem->update(timestep);
renderer->setVertexBuffer(psystem->getCurrentReadBuffer(), psystem->getNumParticles());
break;
case '\033':
case 'q':
exit(0);
break;
case 'v':
mode = M_VIEW;
break;
case 'm':
mode = M_MOVE;
break;
case 's':
psystem->setSimulationMode((ParticleSystem::SimulationMode) ((psystem->getSimulationMode() + 1) % ParticleSystem::SIMULATION_NUM_MODES));
break;
case 'p':
displayMode = (ParticleRenderer::DisplayMode) ((displayMode + 1) % ParticleRenderer::PARTICLE_NUM_MODES);
break;
case 'd':
psystem->dumpGrid();
break;
case 'u':
psystem->dumpParticles(0, 1);
break;
case 'r':
displayEnabled = !displayEnabled;
break;
case '1':
psystem->reset(ParticleSystem::CONFIG_GRID);
break;
case '2':
psystem->reset(ParticleSystem::CONFIG_RANDOM);
break;
case '3':
{
// inject a sphere of particles
float pr = psystem->getParticleRadius();
float tr = pr+(pr*2.0f)*ballr;
float pos[4], vel[4];
pos[0] = -1.0 + tr + frand()*(2.0f - tr*2.0f);
pos[1] = 1.0f - tr;
pos[2] = -1.0 + tr + frand()*(2.0f - tr*2.0f);
pos[3] = 0.0f;
vel[0] = vel[1] = vel[2] = vel[3] = 0.0f;
psystem->addSphere(0, pos, vel, ballr, pr*2.0f);
}
break;
case '4':
{
// shoot ball from camera
float pr = psystem->getParticleRadius();
float vel[4], velw[4], pos[4], posw[4];
vel[0] = 0.0f;
vel[1] = 0.0f;
vel[2] = -0.05f;
vel[3] = 0.0f;
ixform(vel, velw, modelView);
pos[0] = 0.0f;
pos[1] = 0.0f;
pos[2] = -2.5f;
pos[3] = 1.0;
ixformPoint(pos, posw, modelView);
posw[3] = 0.0f;
psystem->addSphere(0, posw, velw, ballr, pr*2.0f);
}
break;
case 'w':
wireframe = !wireframe;
break;
case 'h':
displaySliders = !displaySliders;
break;
}
glutPostRedisplay();
}
void special(int k, int x, int y)
{
if (displaySliders) {
params->Special(k, x, y);
}
}
void idle(void)
{
glutPostRedisplay();
}
void initParams()
{
// create a new parameter list
params = new ParamListGL("misc");
params->AddParam(new Param<float>("time step", timestep, 0.0, 1.0, 0.01, &timestep));
params->AddParam(new Param<int>("iterations", iterations, 0, 10, 1, &iterations));
params->AddParam(new Param<float>("damping", damping, 0.0, 1.0, 0.001, &damping));
params->AddParam(new Param<float>("gravity", gravity, 0.0, 0.001, 0.0001, &gravity));
params->AddParam(new Param<int>("ball r", ballr, 1, 20, 1, &ballr));
params->AddParam(new Param<float>("collide spring", collideSpring, 0.0, 1.0, 0.001, &collideSpring));
params->AddParam(new Param<float>("collide damping", collideDamping, 0.0, 0.1, 0.001, &collideDamping));
params->AddParam(new Param<float>("collide shear", collideShear, 0.0, 0.1, 0.001, &collideShear));
params->AddParam(new Param<float>("collide attract", collideAttraction, 0.0, 0.1, 0.001, &collideAttraction));
}
void mainMenu(int i)
{
key((unsigned char) i, 0, 0);
}
void initMenus()
{
glutCreateMenu(mainMenu);
glutAddMenuEntry("Reset block [1]", '1');
glutAddMenuEntry("Reset random [2]", '2');
glutAddMenuEntry("Add sphere [3]", '3');
glutAddMenuEntry("View mode [v]", 'v');
glutAddMenuEntry("Move cursor mode [m]", 'm');
glutAddMenuEntry("Toggle point rendering [p]", 'p');
glutAddMenuEntry("Toggle Bullet simulation[s]", 's');
glutAddMenuEntry("Toggle animation [ ]", ' ');
glutAddMenuEntry("Step animation [ret]", 13);
glutAddMenuEntry("Toggle sliders [h]", 'h');
glutAddMenuEntry("Quit (esc)", '\033');
glutAttachMenu(GLUT_RIGHT_BUTTON);
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char** argv)
{
numParticles = 1024; //1024;//64;//16380;//32768;
uint gridDim = 64;
numIterations = 0;
gridSize.x = gridSize.y = gridSize.z = gridDim;
printf("grid: %d x %d x %d = %d cells\n", gridSize.x, gridSize.y, gridSize.z, gridSize.x*gridSize.y*gridSize.z);
bool benchmark = false;
cudaInit(argc, argv);
glutInit(&argc, argv);
glutInitDisplayMode(GLUT_RGB | GLUT_DEPTH | GLUT_DOUBLE);
glutInitWindowSize(640, 480);
glutCreateWindow("CUDA particles");
initGL();
init(numParticles, gridSize);
initParams();
initMenus();
glutDisplayFunc(display);
glutReshapeFunc(reshape);
glutMouseFunc(mouse);
glutMotionFunc(motion);
glutKeyboardFunc(key);
glutSpecialFunc(special);
glutIdleFunc(idle);
glutMainLoop();
if (psystem)
delete psystem;
return 0;
}


@@ -0,0 +1,381 @@
/*
* Copyright 1993-2006 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO USER:
*
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws.
*
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
* OR PERFORMANCE OF THIS SOURCE CODE.
*
* U.S. Government End Users. This source code is a "commercial item" as
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
* "commercial computer software" and "commercial computer software
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
* and is provided to the U.S. Government only as a commercial end item.
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
* source code with only those rights set forth herein.
*/
/*
* Device code.
*/
#ifndef _PARTICLES_KERNEL_H_
#define _PARTICLES_KERNEL_H_
#include <stdio.h>
#include <math.h>
#include "cutil_math.h"
#include "math_constants.h"
#include "particles_kernel.cuh"
#if USE_TEX
// textures for particle position and velocity
texture<float4, 1, cudaReadModeElementType> oldPosTex;
texture<float4, 1, cudaReadModeElementType> oldVelTex;
texture<uint2, 1, cudaReadModeElementType> particleHashTex;
texture<uint, 1, cudaReadModeElementType> cellStartTex;
texture<uint, 1, cudaReadModeElementType> gridCountersTex;
texture<uint, 1, cudaReadModeElementType> gridCellsTex;
#endif
__constant__ SimParams params;
// integrate particle attributes
__global__ void
integrate(float4* newPos, float4* newVel,
float4* oldPos, float4* oldVel,
float deltaTime)
{
int index = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
float4 pos4 = oldPos[index];
float4 vel4 = oldVel[index];
float3 pos = make_float3(pos4);
float3 vel = make_float3(vel4);
vel += params.gravity * deltaTime;
vel *= params.globalDamping;
// new position = old position + velocity * deltaTime
pos += vel * deltaTime;
// bounce off cube sides
if (pos.x > 1.0f - params.particleRadius) { pos.x = 1.0f - params.particleRadius; vel.x *= params.boundaryDamping; }
if (pos.x < -1.0f + params.particleRadius) { pos.x = -1.0f + params.particleRadius; vel.x *= params.boundaryDamping;}
if (pos.y > 1.0f - params.particleRadius) { pos.y = 1.0f - params.particleRadius; vel.y *= params.boundaryDamping; }
if (pos.y < -1.0f + params.particleRadius) { pos.y = -1.0f + params.particleRadius; vel.y *= params.boundaryDamping;}
if (pos.z > 1.0f - params.particleRadius) { pos.z = 1.0f - params.particleRadius; vel.z *= params.boundaryDamping; }
if (pos.z < -1.0f + params.particleRadius) { pos.z = -1.0f + params.particleRadius; vel.z *= params.boundaryDamping;}
// store new position and velocity
newPos[index] = make_float4(pos, pos4.w);
newVel[index] = make_float4(vel, vel4.w);
}
// calculate position in uniform grid
__device__ int3 calcGridPos(float4 p)
{
int3 gridPos;
gridPos.x = floor((p.x - params.worldOrigin.x) / params.cellSize.x);
gridPos.y = floor((p.y - params.worldOrigin.y) / params.cellSize.y);
gridPos.z = floor((p.z - params.worldOrigin.z) / params.cellSize.z);
return gridPos;
}
// calculate address in grid from position (clamping to edges)
__device__ uint calcGridHash(int3 gridPos)
{
gridPos.x = max(0, min(gridPos.x, params.gridSize.x-1));
gridPos.y = max(0, min(gridPos.y, params.gridSize.y-1));
gridPos.z = max(0, min(gridPos.z, params.gridSize.z-1));
return __mul24(__mul24(gridPos.z, params.gridSize.y), params.gridSize.x) + __mul24(gridPos.y, params.gridSize.x) + gridPos.x;
}
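// Example: with gridSize = (64,64,64), cell (x,y,z) = (1,2,3) hashes to
// (3*64 + 2)*64 + 1 = 12417. The clamping above pins out-of-range positions
// to the boundary cells instead of wrapping around the grid.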
// add particle to cell using atomics
__device__ void addParticleToCell(int3 gridPos,
uint index,
uint* gridCounters,
uint* gridCells)
{
// calculate grid hash
uint gridHash = calcGridHash(gridPos);
// increment cell counter using atomics
#if defined CUDA_NO_SM_11_ATOMIC_INTRINSICS
int counter = 0;
#else
int counter = atomicAdd(&gridCounters[gridHash], 1); // returns previous value
counter = min(counter, params.maxParticlesPerCell-1);
#endif
// write particle index into this cell (very uncoalesced!)
gridCells[gridHash*params.maxParticlesPerCell + counter] = index;
}
// update uniform grid
__global__ void
updateGridD(float4* pos,
uint* gridCounters,
uint* gridCells)
{
int index = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
float4 p = pos[index];
// get address in grid
int3 gridPos = calcGridPos(p);
addParticleToCell(gridPos, index, gridCounters, gridCells);
}
// calculate grid hash value for each particle
__global__ void
calcHashD(float4* pos,
uint2* particleHash)
{
int index = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
float4 p = pos[index];
// get address in grid
int3 gridPos = calcGridPos(p);
uint gridHash = calcGridHash(gridPos);
// store grid hash and particle index
particleHash[index] = make_uint2(gridHash, index);
}
// rearrange particle data into sorted order, and find the start of each cell in the
// sorted hash array
__global__ void
reorderDataAndFindCellStartD(uint2* particleHash, // particle id sorted by hash
float4* oldPos,
float4* oldVel,
float4* sortedPos,
float4* sortedVel,
uint* cellStart)
{
int index = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
uint2 sortedData = particleHash[index];
// Load hash data into shared memory so that we can look
// at neighboring particle's hash value without loading
// two hash values per thread
__shared__ uint sharedHash[257];
sharedHash[threadIdx.x+1] = sortedData.x;
if (index > 0 && threadIdx.x == 0)
{
// first thread in block must load neighbor particle hash
volatile uint2 prevData = particleHash[index-1];
sharedHash[0] = prevData.x;
}
__syncthreads();
if (index == 0 || sortedData.x != sharedHash[threadIdx.x])
{
cellStart[sortedData.x] = index;
}
// Now use the sorted index to reorder the pos and vel data
float4 pos = FETCH(oldPos, sortedData.y); // macro does either global read or texture fetch
float4 vel = FETCH(oldVel, sortedData.y); // see particles_kernel.cuh
sortedPos[index] = pos;
sortedVel[index] = vel;
}
// collide two spheres using DEM method
__device__ float3 collideSpheres(float4 posA, float4 posB,
float4 velA, float4 velB,
float radiusA, float radiusB,
float attraction)
{
// calculate relative position
float3 relPos;
relPos.x = posB.x - posA.x;
relPos.y = posB.y - posA.y;
relPos.z = posB.z - posA.z;
float dist = length(relPos);
float collideDist = radiusA + radiusB;
float3 force = make_float3(0.0f);
if (dist < collideDist) {
float3 norm = relPos / dist;
// relative velocity
float3 relVel;
relVel.x = velB.x - velA.x;
relVel.y = velB.y - velA.y;
relVel.z = velB.z - velA.z;
// relative tangential velocity
float3 tanVel = relVel - (dot(relVel, norm) * norm);
// spring force
force = -params.spring*(collideDist - dist) * norm;
// dashpot (damping) force
force += params.damping*relVel;
// tangential shear force
force += params.shear*tanVel;
// attraction
force += attraction*relPos;
}
return force;
}
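// In equation form, the DEM contact model above is
//   F = -k_spring * (collideDist - |relPos|) * n  +  k_damp * relVel
//       +  k_shear * tanVel  +  attraction * relPos,   with n = relPos / |relPos|.
// The spring term pushes overlapping spheres apart; the dashpot and shear
// terms dissipate energy.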
// collide particle with all particles in a given cell
// version using grid built with atomics
__device__
float3 collideCell(int3 gridPos,
uint index,
float4 pos,
float4 vel,
float4* oldPos,
float4* oldVel,
uint* gridCounters,
uint* gridCells)
{
float3 force = make_float3(0.0f);
if ((gridPos.x < 0) || (gridPos.x > params.gridSize.x-1) ||
(gridPos.y < 0) || (gridPos.y > params.gridSize.y-1) ||
(gridPos.z < 0) || (gridPos.z > params.gridSize.z-1)) {
return force;
}
uint gridHash = calcGridHash(gridPos);
// iterate over particles in this cell
uint particlesInCell = FETCH(gridCounters, gridHash);
particlesInCell = min(particlesInCell, params.maxParticlesPerCell-1);
for(uint i=0; i<particlesInCell; i++) {
uint index2 = FETCH(gridCells, gridHash*params.maxParticlesPerCell + i);
if (index2 != index) { // check not colliding with self
float4 pos2 = FETCH(oldPos, index2);
float4 vel2 = FETCH(oldVel, index2);
// collide two spheres
float3 projVec = collideSpheres(pos, pos2, vel, vel2, params.particleRadius, params.particleRadius, params.attraction);
force += projVec;
}
}
return force;
}
// version using sorted grid
__device__
float3 collideCell2(int3 gridPos,
uint index,
float4 pos,
float4 vel,
float4* oldPos,
float4* oldVel,
uint2* particleHash,
uint* cellStart)
{
float3 force = make_float3(0.0f);
if ((gridPos.x < 0) || (gridPos.x > params.gridSize.x-1) ||
(gridPos.y < 0) || (gridPos.y > params.gridSize.y-1) ||
(gridPos.z < 0) || (gridPos.z > params.gridSize.z-1)) {
return force;
}
uint gridHash = calcGridHash(gridPos);
// get start of bucket for this cell
uint bucketStart = FETCH(cellStart, gridHash);
if (bucketStart == 0xffffffff)
return force; // cell empty
// iterate over particles in this cell
for(uint i=0; i<params.maxParticlesPerCell; i++) {
uint index2 = bucketStart + i;
uint2 cellData = FETCH(particleHash, index2);
if (cellData.x != gridHash) break; // no longer in same bucket
if (index2 != index) { // check not colliding with self
float4 pos2 = FETCH(oldPos, index2);
float4 vel2 = FETCH(oldVel, index2);
// collide two spheres
float3 projVec = collideSpheres(pos, pos2, vel, vel2, params.particleRadius, params.particleRadius, params.attraction);
force += projVec;
}
}
return force;
}
__global__ void
collideD(float4* newPos, float4* newVel,
float4* oldPos, float4* oldVel,
#if USE_SORT
uint2* particleHash,
uint* cellStart
#else
uint* gridCounters,
uint* gridCells
#endif
)
{
int index = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
// read particle data from sorted arrays
float4 pos = FETCH(oldPos, index);
float4 vel = FETCH(oldVel, index);
// get address in grid
int3 gridPos = calcGridPos(pos);
float3 force = make_float3(0.0f);
// examine only neighbouring cells
for(int z=-1; z<=1; z++) {
for(int y=-1; y<=1; y++) {
for(int x=-1; x<=1; x++) {
#if USE_SORT
force += collideCell2(gridPos + make_int3(x, y, z), index, pos, vel, oldPos, oldVel, particleHash, cellStart);
#else
force += collideCell(gridPos + make_int3(x, y, z), index, pos, vel, oldPos, oldVel, gridCounters, gridCells);
#endif
}
}
}
float3 projVec = collideSpheres(pos, params.colliderPos, vel, make_float4(0.0f, 0.0f, 0.0f, 0.0f), params.particleRadius, params.colliderRadius, 0.0f);
force += projVec;
#if USE_SORT
// write new velocity back to original unsorted location
volatile uint2 sortedData = particleHash[index];
newVel[sortedData.y] = vel + make_float4(force, 0.0f);
#else
newVel[index] = vel + make_float4(force, 0.0f);
#endif
}
#endif


@@ -0,0 +1,47 @@
#ifndef PARTICLES_KERNEL_H
#define PARTICLES_KERNEL_H
#define BLOCKDIM 64
#define USE_SORT 1
#ifndef __DEVICE_EMULATION__
#define USE_TEX 1
#endif
#ifdef USE_TEX
#define FETCH(t, i) tex1Dfetch(t##Tex, i)
#else
#define FETCH(t, i) t[i]
#endif
#include "vector_types.h"
typedef unsigned int uint;
struct SimParams {
float4 colliderPos;
float colliderRadius;
float3 gravity;
float globalDamping;
float particleRadius;
uint3 gridSize;
uint numCells;
float3 worldOrigin;
float3 cellSize;
float3 worldSize;
uint3 m_gridSize;
uint numBodies;
uint maxParticlesPerCell;
float spring;
float damping;
float shear;
float attraction;
float boundaryDamping;
};
#endif

79
Extras/CUDA/radixsort.cu Normal file

@@ -0,0 +1,79 @@
/*
* Copyright 1993-2006 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO USER:
*
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws.
*
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
* OR PERFORMANCE OF THIS SOURCE CODE.
*
* U.S. Government End Users. This source code is a "commercial item" as
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
* "commercial computer software" and "commercial computer software
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
* and is provided to the U.S. Government only as a commercial end item.
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
* source code with only those rights set forth herein.
*/
/* Radixsort project with key/value and arbitrary dataset size support
* which demonstrates the use of CUDA in a multi phase sorting
* computation.
* Host code.
*/
#include "radixsort.cuh"
#include "radixsort_kernel.cu"
extern "C"
{
////////////////////////////////////////////////////////////////////////////////
//! Perform a radix sort
//! Sorting performed in place on passed arrays.
//!
//! @param pData0 input and output array - data will be sorted
//! @param pData1 additional array to allow ping pong computation
//! @param elements number of elements to sort
////////////////////////////////////////////////////////////////////////////////
void RadixSort(KeyValuePair *pData0, KeyValuePair *pData1, uint elements, uint bits)
{
// Round element count to total number of threads for efficiency
uint elements_rounded_to_3072;
int modval = elements % 3072;
if( modval == 0 )
elements_rounded_to_3072 = elements;
else
elements_rounded_to_3072 = elements + (3072 - (modval));
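// Example: elements = 5000 gives modval = 1928, so
// elements_rounded_to_3072 = 5000 + (3072 - 1928) = 6144; 3072 is the
// total thread count (NUM_SMS * NUM_THREADS_PER_SM in radixsort_kernel.cu).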
// Iterate over the bytes of the key, using each byte to sort the list in turn
for (uint shift = 0; shift < bits; shift += RADIX)
{
// Perform one round of radix sorting
// Generate per radix group sums radix counts across a radix group
RadixSum<<<NUM_BLOCKS, NUM_THREADS_PER_BLOCK, GRFSIZE>>>(pData0, elements, elements_rounded_to_3072, shift);
// Prefix sum in radix groups, and then between groups throughout a block
RadixPrefixSum<<<PREFIX_NUM_BLOCKS, PREFIX_NUM_THREADS_PER_BLOCK, PREFIX_GRFSIZE>>>();
// Sum the block offsets and then shuffle data into bins
RadixAddOffsetsAndShuffle<<<NUM_BLOCKS, NUM_THREADS_PER_BLOCK, SHUFFLE_GRFSIZE>>>(pData0, pData1, elements, elements_rounded_to_3072, shift);
// Exchange data pointers
KeyValuePair* pTemp = pData0;
pData0 = pData1;
pData1 = pTemp;
}
}
}
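// Usage sketch (an assumption about the caller, not shown in this file):
// uint2 (cellHash, particleIndex) pairs are layout-compatible with
// KeyValuePair, so the broadphase can sort its grid hashes on the GPU:
//   RadixSort((KeyValuePair*)dParticleHash[0],
//             (KeyValuePair*)dParticleHash[1],
//             numParticles, 32 /* sort on the full 32-bit key */);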

63
Extras/CUDA/radixsort.cuh Normal file

@@ -0,0 +1,63 @@
/*
* Copyright 1993-2006 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO USER:
*
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws.
*
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
* OR PERFORMANCE OF THIS SOURCE CODE.
*
* U.S. Government End Users. This source code is a "commercial item" as
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
* "commercial computer software" and "commercial computer software
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
* and is provided to the U.S. Government only as a commercial end item.
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
* source code with only those rights set forth herein.
*/
/* Radixsort project which demonstrates the use of CUDA in a multi phase
* sorting computation.
* Type definitions.
*/
#ifndef _RADIXSORT_H_
#define _RADIXSORT_H_
#include <host_defines.h>
#define SYNCIT __syncthreads()
// Set SIXTEEN to 1 to use 16 bit keys/values
#define SIXTEEN 0
typedef unsigned int uint;
typedef unsigned short ushort;
#if SIXTEEN
typedef struct __align__(4) {
ushort key;
ushort value;
#else
typedef struct __align__(8) {
uint key;
uint value;
#endif
} KeyValuePair;
extern "C" {
void RadixSort(KeyValuePair *pData0, KeyValuePair *pData1, uint elements, uint bits);
}
#endif // #ifndef _RADIXSORT_H_


@@ -0,0 +1,577 @@
/*
* Copyright 1993-2006 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO USER:
*
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws.
*
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
* OR PERFORMANCE OF THIS SOURCE CODE.
*
* U.S. Government End Users. This source code is a "commercial item" as
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
* "commercial computer software" and "commercial computer software
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
* and is provided to the U.S. Government only as a commercial end item.
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
* source code with only those rights set forth herein.
*/
/* Radixsort project with key/value and arbitrary dataset size support
* which demonstrates the use of CUDA in a multi phase sorting
* computation.
* Device code.
*/
#ifndef _RADIXSORT_KERNEL_H_
#define _RADIXSORT_KERNEL_H_
#include <stdio.h>
#include "radixsort.cuh"
#define SYNCIT __syncthreads()
static const int NUM_SMS = 16;
static const int NUM_THREADS_PER_SM = 192;
static const int NUM_THREADS_PER_BLOCK = 64;
//static const int NUM_THREADS = NUM_THREADS_PER_SM * NUM_SMS;
static const int NUM_BLOCKS = (NUM_THREADS_PER_SM / NUM_THREADS_PER_BLOCK) * NUM_SMS;
static const int RADIX = 8; // Number of bits per radix sort pass
static const int RADICES = 1 << RADIX; // Number of radices
static const int RADIXMASK = RADICES - 1; // Mask for each radix sort pass
#if SIXTEEN
static const int RADIXBITS = 16; // Number of bits to sort over
#else
static const int RADIXBITS = 32; // Number of bits to sort over
#endif
static const int RADIXTHREADS = 16; // Number of threads sharing each radix counter
static const int RADIXGROUPS = NUM_THREADS_PER_BLOCK / RADIXTHREADS; // Number of radix groups per CTA
static const int TOTALRADIXGROUPS = NUM_BLOCKS * RADIXGROUPS; // Number of radix groups for each radix
static const int SORTRADIXGROUPS = TOTALRADIXGROUPS * RADICES; // Total radix count
static const int GRFELEMENTS = (NUM_THREADS_PER_BLOCK / RADIXTHREADS) * RADICES;
static const int GRFSIZE = GRFELEMENTS * sizeof(uint);
// Prefix sum variables
static const int PREFIX_NUM_THREADS_PER_SM = NUM_THREADS_PER_SM;
static const int PREFIX_NUM_THREADS_PER_BLOCK = PREFIX_NUM_THREADS_PER_SM;
static const int PREFIX_NUM_BLOCKS = (PREFIX_NUM_THREADS_PER_SM / PREFIX_NUM_THREADS_PER_BLOCK) * NUM_SMS;
static const int PREFIX_BLOCKSIZE = SORTRADIXGROUPS / PREFIX_NUM_BLOCKS;
static const int PREFIX_GRFELEMENTS = PREFIX_BLOCKSIZE + 2 * PREFIX_NUM_THREADS_PER_BLOCK;
static const int PREFIX_GRFSIZE = PREFIX_GRFELEMENTS * sizeof(uint);
// Shuffle variables
static const int SHUFFLE_GRFOFFSET = RADIXGROUPS * RADICES;
static const int SHUFFLE_GRFELEMENTS = SHUFFLE_GRFOFFSET + PREFIX_NUM_BLOCKS;
static const int SHUFFLE_GRFSIZE = SHUFFLE_GRFELEMENTS * sizeof(uint);
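// With the defaults above: NUM_BLOCKS = (192/64)*16 = 48, RADIXGROUPS =
// 64/16 = 4 radix groups per block, TOTALRADIXGROUPS = 192, and
// GRFELEMENTS = 4*256 = 1024 shared counters (4 KB) per block.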
#define SDATA( index) CUT_BANK_CHECKER(sdata, index)
// Prefix sum data
uint gRadixSum[TOTALRADIXGROUPS * RADICES];
__device__ uint dRadixSum[TOTALRADIXGROUPS * RADICES];
uint gRadixBlockSum[PREFIX_NUM_BLOCKS];
__device__ uint dRadixBlockSum[PREFIX_NUM_BLOCKS];
extern __shared__ uint sRadixSum[];
////////////////////////////////////////////////////////////////////////////////
//! Perform a radix sum on the list to be sorted. Each block holds a set of
//! radix counters in the GRF, one set for each of its RADIXGROUPS thread groups.
//!
//! @param pData input data
//! @param elements total number of elements
//! @param elements_rounded_to_3072 total number of elements rounded up to the
//! nearest multiple of 3072
//! @param shift the shift (0 to 24) that we are using to obtain the correct
//! byte
////////////////////////////////////////////////////////////////////////////////
__global__ void RadixSum(KeyValuePair *pData, uint elements, uint elements_rounded_to_3072, uint shift)
{
uint pos = threadIdx.x;
// Zero radix counts
while (pos < GRFELEMENTS)
{
sRadixSum[pos] = 0;
pos += NUM_THREADS_PER_BLOCK;
}
// Sum up data
// Source addresses computed so that each thread is reading from a block of
// consecutive addresses so there are no conflicts between threads
// They then loop over their combined region and the next batch works elsewhere.
// So threads 0 to 15 (one radix group) work on memory 0 to 320.
// First reading 0,1,2,3...15 then 16,17,18,19...31 and so on
// optimising parallel access to shared memory by a thread accessing 16*threadID
// The next radix group runs from 320 to 640 and the same applies in that region
uint tmod = threadIdx.x % RADIXTHREADS;
uint tpos = threadIdx.x / RADIXTHREADS;
// Take the rounded element list size so that all threads have a certain size dataset to work with
// and no zero size datasets confusing the issue
// By using a multiple of 3072 we ensure that all threads have elements
// to work with until the last phase, at which point we individually test
uint element_fraction = elements_rounded_to_3072 / TOTALRADIXGROUPS;
// Generate range
// Note that it is possible for both pos and end to be past the end of the element set
// which will be caught later.
pos = (blockIdx.x * RADIXGROUPS + tpos) * element_fraction;
uint end = pos + element_fraction;
pos += tmod;
//printf("pos: %d\n", pos);
__syncthreads();
while (pos < end )
{
uint key = 0;
// Read first data element if we are in the set of elements
//if( pos < elements )
//key = pData[pos].key;
KeyValuePair kvp;
// Read first data element, both items at once as the memory will want to coalesce like that anyway
if (pos < elements)
kvp = pData[pos];
else
kvp.key = 0;
key = kvp.key;
// Calculate position of radix counter to increment
// There are RADICES radices in each pass (256)
// and hence this many counters for bin grouping
// Multiply by RADIXGROUPS (4) to spread through memory
// and into 4 radix groups
uint p = ((key >> shift) & RADIXMASK) * RADIXGROUPS;
// Increment radix counters
// Each radix group has its own set of counters
// so we add the thread position [0-3], ie the group index.
// We slow down here and take at least 16 cycles to write to the summation boxes
// but other groups will only conflict with themselves and so can also be writing
// 16 cycles here at least avoids retries.
uint ppos = p + tpos;
// If we are past the last element we don't want to do anything
// We do have to check each time, however, to ensure that all
// threads sync on each sync here.
if (tmod == 0 && pos < elements)
sRadixSum[ppos]++;
SYNCIT;
if (tmod == 1 && pos < elements)
sRadixSum[ppos]++;
SYNCIT;
if (tmod == 2 && pos < elements)
sRadixSum[ppos]++;
SYNCIT;
if (tmod == 3 && pos < elements)
sRadixSum[ppos]++;
SYNCIT;
if (tmod == 4 && pos < elements)
sRadixSum[ppos]++;
SYNCIT;
if (tmod == 5 && pos < elements)
sRadixSum[ppos]++;
SYNCIT;
if (tmod == 6 && pos < elements)
sRadixSum[ppos]++;
SYNCIT;
if (tmod == 7 && pos < elements)
sRadixSum[ppos]++;
SYNCIT;
if (tmod == 8 && pos < elements)
sRadixSum[ppos]++;
SYNCIT;
if (tmod == 9 && pos < elements)
sRadixSum[ppos]++;
SYNCIT;
if (tmod == 10 && pos < elements)
sRadixSum[ppos]++;
SYNCIT;
if (tmod == 11 && pos < elements)
sRadixSum[ppos]++;
SYNCIT;
if (tmod == 12 && pos < elements)
sRadixSum[ppos]++;
SYNCIT;
if (tmod == 13 && pos < elements)
sRadixSum[ppos]++;
SYNCIT;
if (tmod == 14 && pos < elements)
sRadixSum[ppos]++;
SYNCIT;
if (tmod == 15 && pos < elements)
sRadixSum[ppos]++;
SYNCIT;
pos += RADIXTHREADS;
}
__syncthreads();
__syncthreads();
// Output radix sums into separate memory regions for each radix group
// So this memory then is laid out:
// 0...... 192..... 384 ................ 192*256
// ie all 256 bins for each radix group
// in there:
// 0.............192
// 0 4 8 12... - block idx * 4
// And in the block boxes we see the 4 radix groups for that block
// So 0-192 should contain bin 0 for each radix group, and so on
uint offset = blockIdx.x * RADIXGROUPS;
uint row = threadIdx.x / RADIXGROUPS;
uint column = threadIdx.x % RADIXGROUPS;
while (row < RADICES)
{
dRadixSum[offset + row * TOTALRADIXGROUPS + column] = sRadixSum[row * RADIXGROUPS + column];
row += NUM_THREADS_PER_BLOCK / RADIXGROUPS;
}
}
////////////////////////////////////////////////////////////////////////////////
//! Performs first part of parallel prefix sum - individual sums of each radix
//! count. By the end of this we have prefix sums on a block level in dRadixSum
//! and totals for blocks in dRadixBlockSum.
////////////////////////////////////////////////////////////////////////////////
__global__ void RadixPrefixSum()
{
// Read radix groups in offset by one in the GRF so a zero can be inserted at the beginning
// and the final sum of all radix counts summed here is tacked onto the end for reading by
// the next stage
// Each block in this case is the full number of threads per SM (and hence the total number
// of radix groups), 192. We should then have the total set of offsets for an entire radix
// group by the end of this stage
// Device mem addressing
uint brow = blockIdx.x * (RADICES / PREFIX_NUM_BLOCKS);
uint drow = threadIdx.x / TOTALRADIXGROUPS; // In default parameterisation this is always 0
uint dcolumn = threadIdx.x % TOTALRADIXGROUPS; // And similarly this is always the same as threadIdx.x
uint dpos = (brow + drow) * TOTALRADIXGROUPS + dcolumn;
uint end = ((blockIdx.x + 1) * (RADICES / PREFIX_NUM_BLOCKS)) * TOTALRADIXGROUPS;
// Shared mem addressing
uint srow = threadIdx.x / (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK);
uint scolumn = threadIdx.x % (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK);
uint spos = srow * (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1) + scolumn;
// Read (RADICES / PREFIX_NUM_BLOCKS) radix counts into the GRF alongside each other
while (dpos < end)
{
sRadixSum[spos] = dRadixSum[dpos];
spos += (PREFIX_NUM_THREADS_PER_BLOCK / (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK)) *
(PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1);
dpos += (TOTALRADIXGROUPS / PREFIX_NUM_THREADS_PER_BLOCK) * TOTALRADIXGROUPS;
}
__syncthreads();
// Perform preliminary sum on each thread's stretch of data
// Each thread having a block of 16, with spacers between 0...16 18...33 and so on
int pos = threadIdx.x * (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1);
end = pos + (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK);
uint sum = 0;
while (pos < end)
{
sum += sRadixSum[pos];
sRadixSum[pos] = sum;
pos++;
}
__syncthreads();
// Calculate internal offsets by performing a more traditional parallel
// prefix sum of the topmost member of each thread's work data. Right now,
// these are stored between the work data for each thread, allowing us to
// eliminate GRF conflicts as well as hold the offsets needed to complete the sum
// In other words we have:
// 0....15 16 17....32 33 34....
// Where this first stage updates the intermediate values (so 16=15, 33=32 etc)
int m = (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1);
pos = threadIdx.x * (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1) +
(PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK);
sRadixSum[pos] = sRadixSum[pos - 1];
__syncthreads();
// This stage then performs a parallel prefix sum (ie use powers of 2 to propagate in log n stages)
// to update 17, 34 etc with the totals to that point (so 34 becomes [34] + [17]) and so on.
while (m < PREFIX_NUM_THREADS_PER_BLOCK * (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1))
{
int p = pos - m;
uint t = ((p > 0) ? sRadixSum[p] : 0);
__syncthreads();
sRadixSum[pos] += t;
__syncthreads();
m *= 2;
}
__syncthreads();
// Add internal offsets to each thread's work data.
// So now we take 17 and add it to all values 18 to 33 so all offsets for that block
// are updated.
pos = threadIdx.x * (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1);
end = pos + (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK);
int p = pos - 1;
sum = ((p > 0) ? sRadixSum[p] : 0);
while (pos < end)
{
sRadixSum[pos] += sum;
pos++;
}
__syncthreads();
// Write summed data back out to global memory in the same way as we read it in
// We now have prefix sum values internal to groups
brow = blockIdx.x * (RADICES / PREFIX_NUM_BLOCKS);
drow = threadIdx.x / TOTALRADIXGROUPS;
dcolumn = threadIdx.x % TOTALRADIXGROUPS;
srow = threadIdx.x / (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK);
scolumn = threadIdx.x % (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK);
dpos = (brow + drow) * TOTALRADIXGROUPS + dcolumn + 1;
spos = srow * (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1) + scolumn;
end = ((blockIdx.x + 1) * RADICES / PREFIX_NUM_BLOCKS) * TOTALRADIXGROUPS;
while (dpos < end)
{
dRadixSum[dpos] = sRadixSum[spos];
dpos += (TOTALRADIXGROUPS / PREFIX_NUM_THREADS_PER_BLOCK) * TOTALRADIXGROUPS;
spos += (PREFIX_NUM_THREADS_PER_BLOCK / (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK)) *
(PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1);
}
// Write last element to summation
// Storing block sums in a separate array
if (threadIdx.x == 0) {
dRadixBlockSum[blockIdx.x] = sRadixSum[PREFIX_NUM_THREADS_PER_BLOCK * (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1) - 1];
dRadixSum[blockIdx.x * PREFIX_BLOCKSIZE] = 0;
}
}
////////////////////////////////////////////////////////////////////////////////
//! Initially perform prefix sum of block totals to obtain final set of offsets.
//! Then make use of radix sums to perform a shuffling of the data into the
//! correct bins.
//!
//! @param pSrc input data
//! @param pDst output data
//! @param elements total number of elements
//! @param shift the shift (0 to 24) that we are using to obtain the correct
//! byte
////////////////////////////////////////////////////////////////////////////////
__global__ void RadixAddOffsetsAndShuffle(KeyValuePair* pSrc, KeyValuePair* pDst, uint elements, uint elements_rounded_to_3072, int shift)
{
// Read offsets from previous blocks
if (threadIdx.x == 0)
sRadixSum[SHUFFLE_GRFOFFSET] = 0;
if (threadIdx.x < PREFIX_NUM_BLOCKS - 1)
sRadixSum[SHUFFLE_GRFOFFSET + threadIdx.x + 1] = dRadixBlockSum[threadIdx.x];
__syncthreads();
// Parallel prefix sum over block sums
int pos = threadIdx.x;
int n = 1;
while (n < PREFIX_NUM_BLOCKS)
{
int ppos = pos - n;
uint t0 = ((pos < PREFIX_NUM_BLOCKS) && (ppos >= 0)) ? sRadixSum[SHUFFLE_GRFOFFSET + ppos] : 0;
__syncthreads();
if (pos < PREFIX_NUM_BLOCKS)
sRadixSum[SHUFFLE_GRFOFFSET + pos] += t0;
__syncthreads();
n *= 2;
}
// Read radix count data and add appropriate block offset
// for each radix at the memory location for this thread
// (where the other threads in the block will be reading
// as well, hence the large stride).
// There is one counter box per radix group per radix
// per block (4*256*3)
// We use 64 threads to read the 4 radix groups set of radices
// for the block.
int row = threadIdx.x / RADIXGROUPS;
int column = threadIdx.x % RADIXGROUPS;
int spos = row * RADIXGROUPS + column;
int dpos = row * TOTALRADIXGROUPS + column + blockIdx.x * RADIXGROUPS;
while (spos < SHUFFLE_GRFOFFSET)
{
sRadixSum[spos] = dRadixSum[dpos] + sRadixSum[SHUFFLE_GRFOFFSET + dpos / (TOTALRADIXGROUPS * RADICES / PREFIX_NUM_BLOCKS)];
spos += NUM_THREADS_PER_BLOCK;
dpos += (NUM_THREADS_PER_BLOCK / RADIXGROUPS) * TOTALRADIXGROUPS;
}
__syncthreads();
//int pos;
// Shuffle data
// Each of the subbins for a block should be filled via the counters, properly interleaved
// Then, as we now iterate over each data value, we increment the subbins (each thread in the
// radix group in turn to avoid miss writes due to conflicts) and set locations correctly.
uint element_fraction = elements_rounded_to_3072 / TOTALRADIXGROUPS;
int tmod = threadIdx.x % RADIXTHREADS;
int tpos = threadIdx.x / RADIXTHREADS;
pos = (blockIdx.x * RADIXGROUPS + tpos) * element_fraction;
uint end = pos + element_fraction; //(blockIdx.x * RADIXGROUPS + tpos + 1) * element_fraction;
pos += tmod;
__syncthreads();
while (pos < end )
{
KeyValuePair kvp;
#if 1 // old load
// Read first data element, both items at once as the memory will want to coalesce like that anyway
if (pos < elements)
{
kvp = pSrc[pos];
}
else
kvp.key = 0;
#else // casting to float2 to get it to combine loads
int2 kvpf2;
// Read first data element, both items at once as the memory will want to coalesce like that anyway
if (pos < elements)
{
// kvp = pSrc[pos];
kvpf2 = ((int2*)pSrc)[pos];
// printf("kvp: %f %f kvpf2: %f %f\n", kvp.key, kvp.value, kvpf2.x, kvpf2.y);
}
else
//kvp.key = 0;
kvpf2.x = 0;
kvp.key = kvpf2.x;
kvp.value = kvpf2.y;
#endif
uint index;
// Calculate position of radix counter to increment
uint p = ((kvp.key >> shift) & RADIXMASK) * RADIXGROUPS;
// Move data, keeping counts updated.
// Increment radix counters, relying on hexadecathread
// warp to prevent this code from stepping all over itself.
uint ppos = p + tpos;
if (tmod == 0 && pos < elements)
{
index = sRadixSum[ppos]++;
pDst[index] = kvp;
}
SYNCIT;
if (tmod == 1 && pos < elements)
{
index = sRadixSum[ppos]++;
pDst[index] = kvp;
}
SYNCIT;
if (tmod == 2 && pos < elements)
{
index = sRadixSum[ppos]++;
pDst[index] = kvp;
}
SYNCIT;
if (tmod == 3 && pos < elements)
{
index = sRadixSum[ppos]++;
pDst[index] = kvp;
}
SYNCIT;
if (tmod == 4 && pos < elements)
{
index = sRadixSum[ppos]++;
pDst[index] = kvp;
}
SYNCIT;
if (tmod == 5 && pos < elements)
{
index = sRadixSum[ppos]++;
pDst[index] = kvp;
}
SYNCIT;
if (tmod == 6 && pos < elements)
{
index = sRadixSum[ppos]++;
pDst[index] = kvp;
}
SYNCIT;
if (tmod == 7 && pos < elements)
{
index = sRadixSum[ppos]++;
pDst[index] = kvp;
}
SYNCIT;
if (tmod == 8 && pos < elements)
{
index = sRadixSum[ppos]++;
pDst[index] = kvp;
}
SYNCIT;
if (tmod == 9 && pos < elements)
{
index = sRadixSum[ppos]++;
pDst[index] = kvp;
}
SYNCIT;
if (tmod == 10 && pos < elements)
{
index = sRadixSum[ppos]++;
pDst[index] = kvp;
}
SYNCIT;
if (tmod == 11 && pos < elements)
{
index = sRadixSum[ppos]++;
pDst[index] = kvp;
}
SYNCIT;
if (tmod == 12 && pos < elements)
{
index = sRadixSum[ppos]++;
pDst[index] = kvp;
}
SYNCIT;
if (tmod == 13 && pos < elements)
{
index = sRadixSum[ppos]++;
pDst[index] = kvp;
}
SYNCIT;
if (tmod == 14 && pos < elements)
{
index = sRadixSum[ppos]++;
pDst[index] = kvp;
}
SYNCIT;
if (tmod == 15 && pos < elements)
{
index = sRadixSum[ppos]++;
pDst[index] = kvp;
}
SYNCIT;
pos += RADIXTHREADS;
}
__syncthreads();
}
#endif // #ifndef _RADIXSORT_KERNEL_H_


@@ -0,0 +1,180 @@
/*
* Copyright 1993-2006 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO USER:
*
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws.
*
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
* OR PERFORMANCE OF THIS SOURCE CODE.
*
* U.S. Government End Users. This source code is a "commercial item" as
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
* "commercial computer software" and "commercial computer software
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
* and is provided to the U.S. Government only as a commercial end item.
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
* source code with only those rights set forth herein.
*/
#include <GL/glew.h>
#include <math.h>
#include <assert.h>
#include <stdio.h>
#include "render_particles.h"
#include "shaders.h"
#ifndef M_PI
#define M_PI 3.1415926535897932384626433832795
#endif
ParticleRenderer::ParticleRenderer()
: m_pos(0),
m_numParticles(0),
m_pointSize(1.0f),
m_particleRadius(0.125f * 0.5f),
m_program(0),
m_vbo(0),
m_colorVBO(0)
{
_initGL();
}
ParticleRenderer::~ParticleRenderer()
{
// m_pos points at externally owned data, so there is nothing to free here;
// note that the GL program created in _initGL() is not deleted.
m_pos = 0;
}
void ParticleRenderer::setPositions(float *pos, int numParticles)
{
m_pos = pos;
m_numParticles = numParticles;
}
void ParticleRenderer::setVertexBuffer(unsigned int vbo, int numParticles)
{
m_vbo = vbo;
m_numParticles = numParticles;
}
void ParticleRenderer::_drawPoints()
{
if (!m_vbo)
{
// No VBO bound: submit positions from host memory in immediate mode.
glBegin(GL_POINTS);
{
int k = 0;
for (int i = 0; i < m_numParticles; ++i)
{
glVertex3fv(&m_pos[k]);
k += 4; // positions are stored as float4 (x, y, z, w)
}
}
glEnd();
}
else
{
glBindBufferARB(GL_ARRAY_BUFFER_ARB, m_vbo);
glVertexPointer(4, GL_FLOAT, 0, 0);
glEnableClientState(GL_VERTEX_ARRAY);
if (m_colorVBO) {
glBindBufferARB(GL_ARRAY_BUFFER_ARB, m_colorVBO);
glColorPointer(4, GL_FLOAT, 0, 0);
glEnableClientState(GL_COLOR_ARRAY);
}
glDrawArrays(GL_POINTS, 0, m_numParticles);
glBindBufferARB(GL_ARRAY_BUFFER_ARB, 0);
glDisableClientState(GL_VERTEX_ARRAY);
glDisableClientState(GL_COLOR_ARRAY);
}
}
void ParticleRenderer::display(DisplayMode mode /* = PARTICLE_POINTS */)
{
switch (mode)
{
case PARTICLE_POINTS:
glColor3f(1, 1, 1);
glPointSize(m_pointSize);
_drawPoints();
break;
default: // fall through: unknown modes render as spheres
case PARTICLE_SPHERES:
glEnable(GL_POINT_SPRITE_ARB);
glTexEnvi(GL_POINT_SPRITE_ARB, GL_COORD_REPLACE_ARB, GL_TRUE);
glEnable(GL_VERTEX_PROGRAM_POINT_SIZE_NV);
glDepthMask(GL_TRUE);
glEnable(GL_DEPTH_TEST);
glUseProgram(m_program);
glUniform1f( glGetUniformLocation(m_program, "pointScale"), m_window_h / tanf(m_fov*0.5f*(float)M_PI/180.0f) );
glUniform1f( glGetUniformLocation(m_program, "pointRadius"), m_particleRadius );
glColor3f(1, 1, 1);
_drawPoints();
glUseProgram(0);
glDisable(GL_POINT_SPRITE_ARB);
break;
}
}
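// Note on the pointScale uniform above, a worked derivation rather than new
// behavior: for a perspective projection with vertical field of view fov and a
// viewport h pixels tall, a sphere of world radius r at eye-space distance d
// projects to roughly
//     pointSize = r * (h / tan(fov / 2)) / d
// The host passes r and h / tan(fov/2) as pointRadius and pointScale, and the
// vertex shader in shaders.cpp divides by the per-vertex eye-space distance.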
GLuint
ParticleRenderer::_compileProgram(const char *vsource, const char *fsource)
{
GLuint vertexShader = glCreateShader(GL_VERTEX_SHADER);
GLuint fragmentShader = glCreateShader(GL_FRAGMENT_SHADER);
glShaderSource(vertexShader, 1, &vsource, 0);
glShaderSource(fragmentShader, 1, &fsource, 0);
glCompileShader(vertexShader);
glCompileShader(fragmentShader);
GLuint program = glCreateProgram();
glAttachShader(program, vertexShader);
glAttachShader(program, fragmentShader);
glLinkProgram(program);
// check if program linked
GLint success = 0;
glGetProgramiv(program, GL_LINK_STATUS, &success);
if (!success) {
char temp[256];
glGetProgramInfoLog(program, 256, 0, temp);
printf("Failed to link program:\n%s\n", temp);
glDeleteProgram(program);
program = 0;
}
return program;
}
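// A hedged sketch, not part of this commit: _compileProgram() above only checks
// the link status, so GLSL compile errors surface indirectly. A hypothetical
// helper using only standard GL 2.0 entry points could report them directly:
static bool checkShaderCompiled(GLuint shader)
{
GLint compiled = 0;
glGetShaderiv(shader, GL_COMPILE_STATUS, &compiled);
if (!compiled)
{
char log[256];
glGetShaderInfoLog(shader, sizeof(log), 0, log);
printf("Failed to compile shader:\n%s\n", log);
}
return compiled != 0;
}
// Calling checkShaderCompiled(vertexShader) and checkShaderCompiled(fragmentShader)
// right after the glCompileShader() calls would pinpoint which stage failed.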
void ParticleRenderer::_initGL()
{
m_program = _compileProgram(vertexShader, spherePixelShader);
#if !defined(__APPLE__) && !defined(MACOSX)
// Disable vertex/fragment color clamping so color values outside [0,1] survive.
glClampColorARB(GL_CLAMP_VERTEX_COLOR_ARB, GL_FALSE);
glClampColorARB(GL_CLAMP_FRAGMENT_COLOR_ARB, GL_FALSE);
#endif
}

78
Extras/CUDA/render_particles.h Normal file
View File

@@ -0,0 +1,78 @@
/*
* Copyright 1993-2006 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO USER:
*
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws.
*
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
* OR PERFORMANCE OF THIS SOURCE CODE.
*
* U.S. Government End Users. This source code is a "commercial item" as
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
* "commercial computer software" and "commercial computer software
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
* and is provided to the U.S. Government only as a commercial end item.
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
* source code with only those rights set forth herein.
*/
#ifndef __RENDER_PARTICLES__
#define __RENDER_PARTICLES__
class ParticleRenderer
{
public:
ParticleRenderer();
~ParticleRenderer();
void setPositions(float *pos, int numParticles);
void setVertexBuffer(unsigned int vbo, int numParticles);
void setColorBuffer(unsigned int vbo) { m_colorVBO = vbo; }
enum DisplayMode
{
PARTICLE_POINTS,
PARTICLE_SPHERES,
PARTICLE_NUM_MODES
};
void display(DisplayMode mode = PARTICLE_POINTS);
void displayGrid();
void setPointSize(float size) { m_pointSize = size; }
void setParticleRadius(float r) { m_particleRadius = r; }
void setFOV(float fov) { m_fov = fov; }
void setWindowSize(int w, int h) { m_window_w = w; m_window_h = h; }
protected: // methods
void _initGL();
void _drawPoints();
GLuint _compileProgram(const char *vsource, const char *fsource);
protected: // data
float *m_pos;
int m_numParticles;
float m_pointSize;
float m_particleRadius;
float m_fov;
int m_window_w, m_window_h;
GLuint m_program;
GLuint m_vbo;
GLuint m_colorVBO;
};
#endif //__RENDER_PARTICLES__
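A minimal usage sketch of the ParticleRenderer interface above (posVbo and n are
hypothetical names for an existing position VBO and its particle count, not part
of this commit):

    ParticleRenderer renderer;
    renderer.setWindowSize(800, 600);
    renderer.setFOV(60.0f);
    renderer.setParticleRadius(0.0625f);
    renderer.setVertexBuffer(posVbo, n); // render straight from the GPU buffer
    renderer.display(ParticleRenderer::PARTICLE_SPHERES);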

41
Extras/CUDA/shaders.cpp Normal file
View File

@@ -0,0 +1,41 @@
#define STRINGIFY(A) #A
// vertex shader
const char *vertexShader = STRINGIFY(
uniform float pointRadius; // point size in world space
uniform float pointScale; // scale to calculate size in pixels
uniform float densityScale;
uniform float densityOffset;
void main()
{
// calculate window-space point size
vec3 posEye = vec3(gl_ModelViewMatrix * vec4(gl_Vertex.xyz, 1.0));
float dist = length(posEye);
gl_PointSize = pointRadius * (pointScale / dist);
gl_TexCoord[0] = gl_MultiTexCoord0;
gl_Position = gl_ModelViewProjectionMatrix * vec4(gl_Vertex.xyz, 1.0);
gl_FrontColor = gl_Color;
}
);
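// Note: STRINGIFY turns the block above into a C string at preprocessing time,
// so the embedded GLSL cannot contain preprocessor directives, and any commas
// must sit inside parentheses (as in vec2(2.0, -2.0)); the fixed-function-era
// GLSL used here fits within those limits.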
// pixel shader for rendering points as shaded spheres
const char *spherePixelShader = STRINGIFY(
void main()
{
const vec3 lightDir = vec3(0.577, 0.577, 0.577);
// calculate normal from texture coordinates
vec3 N;
N.xy = gl_TexCoord[0].xy*vec2(2.0, -2.0) + vec2(-1.0, 1.0);
float mag = dot(N.xy, N.xy);
if (mag > 1.0) discard; // kill pixels outside circle
N.z = sqrt(1.0-mag);
// calculate lighting
float diffuse = max(0.0, dot(lightDir, N));
gl_FragColor = gl_Color * diffuse;
}
);

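The sphere pixel shader above reconstructs an eye-space normal from the point-sprite
texture coordinate t in [0,1]^2: N.xy = t * (2, -2) + (-1, 1) remaps the sprite onto
[-1,1]^2 with y flipped, fragments with N.x^2 + N.y^2 > 1 lie outside the silhouette
and are discarded, and N.z = sqrt(1 - N.x^2 - N.y^2) completes the unit normal on the
front-facing hemisphere. The final color is a single Lambert term,
max(0, dot(lightDir, N)), with a fixed directional light.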
2
Extras/CUDA/shaders.h Normal file
View File

@@ -0,0 +1,2 @@
extern const char *vertexShader;
extern const char *spherePixelShader;